Skip to content

Commit 0688ea5

Browse files
committed
新增赶集网抓取验证码处理
1 parent 6af6cd8 commit 0688ea5

1 file changed

Lines changed: 99 additions & 0 deletions

File tree

fuck/ganji_callback.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
4+
"""
5+
@author: zhanghe
6+
@software: PyCharm
7+
@file: ganji_callback.py
8+
@time: 2017/8/18 下午4:18
9+
"""
10+
11+
12+
import time
13+
import lxml.html
14+
import requests
15+
from urlparse import urljoin
16+
17+
from requests.exceptions import Timeout
18+
19+
20+
# Request headers imitating a desktop Chrome browser talking to the captcha host.
# NOTE(review): this dict is not referenced anywhere in the visible code --
# confirm whether it was meant to be passed to the session's requests.
header = {
21+
'Host': 'callback.ganji.com',
22+
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
23+
}
24+
25+
26+
# Single shared HTTP session: the captcha page, the captcha image and the
# answer submission all go through it, so they share the same cookies.
s = requests.session()
27+
28+
29+
# Firewall captcha page for one blocked list page (the `url` query parameter).
# The numeric path segment (1709685058) is the client's egress IPv4 address
# written as a 32-bit integer -- see the worked example in the notes at the
# end of this file.
callback_url = 'http://callback.ganji.com/firewall/valid/1709685058.do?namespace=ganji_hy_list_pc&url=http%3A%2F%2Fanshan.ganji.com%2Fbanjia%2F'
30+
31+
32+
def fuck():
    """Walk through the Ganji firewall captcha once, with a human solving it.

    Steps:
      1. GET ``callback_url`` through the shared session ``s`` and parse the
         returned HTML.
      2. Read the hidden form inputs ``uuid``, ``url``, ``namespace`` and
         ``ip`` from the page.
      3. Download the captcha image and save it as ``ganji_<uuid>.jpg`` in
         the current working directory.
      4. Prompt the operator on stdin for the captcha text.
      5. POST ``namespace``, ``uuid``, ``url`` and ``verify_code`` back to
         ``callback_url`` and print the outcome.

    Returns:
        None. All results are printed to stdout.

    Raises:
        IndexError: if any of the expected ``<input>``/``<img>`` elements is
            missing from the page (each XPath result is indexed with ``[0]``).
        Exceptions from ``requests`` on network failure, and whatever
        ``Response.json()`` raises when the POST reply is not valid JSON,
        propagate unchanged.
    """
    r_g = s.get(callback_url)
    html = r_g.text
    doc = lxml.html.fromstring(html)

    # Hidden form fields embedded in the captcha page.
    uuid = doc.xpath('//input[@id="uuid"]/@value')[0].strip()
    url = doc.xpath('//input[@id="url"]/@value')[0].strip()
    namespace = doc.xpath('//input[@id="namespace"]/@value')[0].strip()
    ip = doc.xpath('//input[@id="ip"]/@value')[0].strip()

    # Captcha image: the src attribute may be relative, so resolve it
    # against the page URL.
    img_url_text = doc.xpath('//img[@id="verify_img"]/@src')[0].strip()
    img_url = urljoin(callback_url, img_url_text)
    print(img_url)

    # Save the captcha image so the operator can open it.
    img_name = 'ganji_%s.jpg' % uuid
    img_content = s.get(img_url).content
    # The image is raw bytes: it must be written in binary mode, otherwise
    # newline translation can corrupt the JPEG on some platforms.
    with open(img_name, 'wb') as f:
        f.write(img_content)
    time.sleep(5)

    verify_code = raw_input('verify_code')

    # Debug output of everything scraped from the page.
    print(uuid)
    print(url)
    print(namespace)
    print(ip)
    print(r_g.cookies.__dict__)

    # NOTE: `ip` is scraped and printed above but is not part of the
    # submitted form data.
    data = {
        'namespace': namespace,
        'uuid': uuid,
        'url': url,
        'verify_code': verify_code
    }
    r_p = s.post(callback_url, data=data)
    # Example failure reply: {"msg": "<captcha expired>", "code": -1}
    r_p_json = r_p.json()
    print(r_p_json)
    if r_p_json.get('code') == 0:
        print(u'识别成功')
    else:
        print(r_p_json.get('msg'))
75+
76+
77+
# Run the interactive captcha flow only when this file is executed as a script.
if __name__ == '__main__':
78+
fuck()
79+
80+
81+
"""
82+
# 获取页面隐藏域表单
83+
<input type="hidden" id="uuid" value="25f77df09ff249a0942c78e46c79dc89" />
84+
<input type="hidden" id="url" value="http://anshan.ganji.com/banjia/" />
85+
<input type="hidden" id="namespace" value="ganji_hy_list_pc" />
86+
<input type="hidden" id="ip" value="1709685058" />
87+
88+
# 查看出口IP
89+
➜ ~ curl ifconfig.me
90+
101.231.185.66
91+
92+
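# 验证IP (the number in the callback URL and in the hidden `ip` field is the egress IPv4 address packed into a 32-bit integer: a*256^3 + b*256^2 + c*256 + d)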
93+
In [1]: 101*256*256*256 + 231*256*256 + 185*256 + 66
94+
Out[1]: 1709685058
95+
96+
买一送一,童叟无欺
97+
58也是一样的:
98+
http://callback.58.com/firewall/valid/920593415.do?namespace=huangyedetailpc&url=http%3A%2F%2Finfodetail1.58.com%2Fsz%2Fjisuanji%2F27978971970226x.shtml
99+
"""

0 commit comments

Comments
 (0)