Skip to content

Commit bb23bc7

Browse files
committed
新增百姓网页面信息提取
1 parent f27d2e6 commit bb23bc7

1 file changed

Lines changed: 100 additions & 0 deletions

File tree

fuck/baixing.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
4+
"""
5+
@author: zhanghe
6+
@software: PyCharm
7+
@file: baixing.py
8+
@time: 2017/2/10 下午6:08
9+
"""
10+
11+
12+
import requests
13+
import re
14+
import time
15+
import lxml.html
16+
17+
18+
# Browser identification string (desktop Chrome 56 on macOS 10.12.3) that
# get_city() and get_area() send as the 'User-Agent' request header.
UserAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
19+
20+
21+
def get_city(timeout=10):
    """
    Fetch the Baixing city-selection page and yield every city link on it.

    The request is only sent when the generator is first iterated.

    :param timeout: seconds to wait for the HTTP response before giving up
    :return: generator of ``(city_code, city_name)`` tuples; ``city_code`` is
        the sub-domain captured from ``//<city_code>.baixing.com/`` and
        ``city_name`` is the markup found between the anchor's tags
    :raises requests.exceptions.Timeout: if the server does not answer in time
    """
    header = {
        'Host': 'www.baixing.com',
        'User-Agent': UserAgent
    }
    city_url = 'http://www.baixing.com/?changeLocation=yes'
    # Compiled once instead of per link; the dots are escaped so that only the
    # literal ".baixing.com" host suffix matches.
    link_pattern = re.compile(r'<a href="//(.*?)\.baixing\.com/">(.*?)</a>', re.S)

    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(city_url, headers=header, timeout=timeout)
    doc = lxml.html.fromstring(response.text)
    for link in doc.xpath('//ul/li/a'):
        # Serialize the anchor back to markup so one pattern can capture both
        # the href sub-domain and the label.
        link_html = lxml.html.tostring(link, encoding='utf-8').decode('utf-8')
        for city in link_pattern.findall(link_html):
            yield city
44+
45+
46+
def get_area(city_code, timeout=10):
    """
    Fetch a city's ``/baomu/`` listing page and yield the areas linked from it.

    The request is only sent when the generator is first iterated.

    :param city_code: Baixing sub-domain of the city, e.g. ``'taian'``
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: generator of ``(area_code, area_name)`` tuples; ``area_code`` is
        the path segment captured from ``/baomu/<area_code>/`` and
        ``area_name`` is the markup found between the anchor's tags
    :raises requests.exceptions.Timeout: if the server does not answer in time
    """
    header = {
        'Host': '%s.baixing.com' % city_code,
        'Referer': 'http://%s.baixing.com/' % city_code,
        'User-Agent': UserAgent
    }
    city_url = 'http://%s.baixing.com/baomu/' % city_code
    # Compiled once instead of once per matching <div>.
    area_pattern = re.compile(u'<a href="/baomu/(.*?)/">(.*?)</a>', re.S)

    # Without a timeout a stalled connection would block the scraper forever.
    response = requests.get(city_url, headers=header, timeout=timeout)
    doc = lxml.html.fromstring(response.text)
    # Every <div> whose class attribute contains "area" is searched for links.
    for block in doc.xpath('//div[contains(@class,"area")]'):
        block_html = lxml.html.tostring(block, encoding='utf-8').decode('utf-8')
        for area in area_pattern.findall(block_html):
            yield area
69+
70+
71+
def output_area():
    """
    Print every city and its areas, formatted as entries of a dict literal.

    For each city yielded by get_city() the output is a ``# <city name>``
    comment line, a ``'<city code>': [`` opener, one tab-indented
    ``'<area code>', # <area name>`` line per area from get_area(), and a
    closing ``],``.

    :return: None
    """
    for city in get_city():
        # Parenthesized single-argument print works on Python 2 and Python 3.
        print('# %s' % city[1])
        print('\'%s\': [' % city[0])
        for area in get_area(city[0]):
            print('\t\'%s\', # %s' % (area[0], area[1]))
        # Trailing comma (as test_area prints) so consecutive city entries can
        # be pasted into one dict literal.
        print('],')
82+
83+
84+
def test_area(city_code):
    """
    Print the areas of a single city as one dict-literal entry.

    Emits the same layout as output_area() for one city, except that the
    leading comment line carries no city name.

    :param city_code: Baixing sub-domain of the city, e.g. ``'taian'``
    :return: None
    """
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print('# %s' % '')
    print('\'%s\': [' % city_code)
    for area in get_area(city_code):
        print('\t\'%s\', # %s' % (area[0], area[1]))
    print('],')
90+
91+
92+
def test_city():
    """
    Print one ``<city code> <city name>`` line per city from get_city().

    :return: None
    """
    for i in get_city():
        # One formatted string prints the same text as the Python 2 statement
        # ``print a, b`` while also being valid on Python 3.
        print('%s %s' % (i[0], i[1]))
95+
96+
97+
if __name__ == '__main__':
    # Alternative entry points, left disabled; uncomment one for a manual run.
    # output_area()
    # test_area('taian')
    test_city()

0 commit comments

Comments
 (0)