Skip to content

Commit 705a33e

Browse files
committed
更新58分类页面信息抓取解析
1 parent 7a9f73c commit 705a33e

1 file changed

Lines changed: 76 additions & 2 deletions

File tree

fuck/58.py

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import requests
66
import re
77
import json
8+
import lxml.html
89

910

1011
UserAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
@@ -31,6 +32,77 @@ def get_city_list():
3132
print json.dumps(city, indent=4).decode('raw_unicode_escape')
3233

3334

35+
def parse_city_list():
36+
"""
37+
解析城市列表(去除海外城市)
38+
"""
39+
# 入口页的url
40+
url = 'http://www.58.com/changecity.aspx'
41+
header = {
42+
'Host': 'www.58.com',
43+
'Referer': 'http://sh.58.com/',
44+
'User-Agent': UserAgent
45+
}
46+
response = requests.get(url, headers=header)
47+
html = response.text
48+
doc = lxml.html.fromstring(html)
49+
50+
# 省份
51+
province_list = doc.xpath('//dl[@id="clist"]//dt[not(@class)]/text()')[:-1]
52+
# for i in province_list:
53+
# print i
54+
55+
# 城市
56+
city_rule = '<a href="http://.*?.58.com/" onclick="co\(\'(.*?)\'\)">(.*?)</a>'
57+
city_list = doc.xpath('//dl[@id="clist"]//dd[not(@class)]')[:-1]
58+
59+
for index, city_item in enumerate(city_list):
60+
city_link_list = city_item.xpath('./a')
61+
for city_link in city_link_list:
62+
city_link_html = lxml.html.tostring(city_link, encoding='utf-8')
63+
city_result = re.compile(city_rule, re.S).findall(city_link_html)
64+
print city_result[0][0], city_result[0][1], province_list[index]
65+
66+
# 校验省份城市数量
67+
print len(province_list), len(city_list)
68+
69+
70+
def get_cate_list():
71+
"""
72+
获取分类列表
73+
"""
74+
# 入口页的url
75+
url = 'http://sh.58.com/shenghuo.shtml'
76+
77+
header = {
78+
'Host': 'sh.58.com',
79+
'User-Agent': UserAgent
80+
}
81+
response = requests.get(url, headers=header)
82+
html = response.text
83+
doc = lxml.html.fromstring(html)
84+
85+
cate_list = doc.xpath('//div[@class="sublist"]//dl[@class="catecss-item"]')
86+
87+
cate_title_rule = '<dt><a href="http://sh.58.com/(.*?)(.shtml|/)" target="_blank".*?>(.*?)</a>'
88+
cate_item_rule = '<a href="http://sh.58.com/(.*?)/" target="_blank".*?>(.*?)</a>'
89+
90+
for i in cate_list:
91+
cate_title_html = lxml.html.tostring(i.xpath('./dt')[0], encoding='utf-8')
92+
cate_item_html = lxml.html.tostring(i.xpath('./dd')[0], encoding='utf-8')
93+
# 标题
94+
cate_title_result = re.compile(cate_title_rule, re.S).findall(cate_title_html)
95+
for cate_title_list in cate_title_result:
96+
print '#', cate_title_list[0], cate_title_list[2]
97+
98+
# 明细
99+
cate_item_result = re.compile(cate_item_rule, re.S).findall(cate_item_html)
100+
cate = {}
101+
for cate_item_list in cate_item_result:
102+
cate[cate_item_list[0]] = cate_item_list[1].strip()
103+
print json.dumps(cate, indent=4).decode('raw_unicode_escape')
104+
105+
34106
def get_contacts():
35107
"""
36108
获取联系方式
@@ -78,5 +150,7 @@ def get_promotion_info():
78150

79151
if __name__ == '__main__':
80152
# get_city_list()
81-
get_contacts()
82-
get_promotion_info()
153+
parse_city_list()
154+
# get_cate_list()
155+
# get_contacts()
156+
# get_promotion_info()

0 commit comments

Comments
 (0)