forked from REMitchell/python-scraping
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path4-solveCaptcha.py
More file actions
48 lines (43 loc) · 1.98 KB
/
4-solveCaptcha.py
File metadata and controls
48 lines (43 loc) · 1.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
import subprocess
import requests
from PIL import Image
from PIL import ImageOps
def cleanImage(imagePath):
image = Image.open(imagePath)
image = image.point(lambda x: 0 if x<143 else 255)
borderImage = ImageOps.expand(image,border=20,fill='white')
borderImage.save(imagePath)
html = urlopen("http://www.pythonscraping.com/humans-only")
bsObj = BeautifulSoup(html)
#Gather prepopulated form values
imageLocation = bsObj.find("img", {"title": "Image CAPTCHA"})["src"]
formBuildId = bsObj.find("input", {"name":"form_build_id"})["value"]
captchaSid = bsObj.find("input", {"name":"captcha_sid"})["value"]
captchaToken = bsObj.find("input", {"name":"captcha_token"})["value"]
captchaUrl = "http://pythonscraping.com"+imageLocation
urlretrieve(captchaUrl, "captcha.jpg")
cleanImage("captcha.jpg")
p = subprocess.Popen(["tesseract", "captcha.jpg", "captcha"], stdout=
subprocess.PIPE,stderr=subprocess.PIPE)
p.wait()
f = open("captcha.txt", "r")
#Clean any whitespace characters
captchaResponse = f.read().replace(" ", "").replace("\n", "")
print("Captcha solution attempt: "+captchaResponse)
if len(captchaResponse) == 5:
params = {"captcha_token":captchaToken, "captcha_sid":captchaSid,
"form_id":"comment_node_page_form", "form_build_id": formBuildId,
"captcha_response":captchaResponse, "name":"Ryan Mitchell",
"subject": "I come to seek the Grail",
"comment_body[und][0][value]":
"...and I am definitely not a bot"}
r = requests.post("http://www.pythonscraping.com/comment/reply/10",
data=params)
responseObj = BeautifulSoup(r.text)
if responseObj.find("div", {"class":"messages"}) is not None:
print(responseObj.find("div", {"class":"messages"}).get_text())
else:
print("There was a problem reading the CAPTCHA correctly!")