55import requests
66import re
77import json
8+ import lxml .html
89
910
# Desktop-browser User-Agent string sent in the headers of every request below.
UserAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
@@ -31,6 +32,77 @@ def get_city_list():
3132 print json .dumps (city , indent = 4 ).decode ('raw_unicode_escape' )
3233
3334
35+ def parse_city_list ():
36+ """
37+ 解析城市列表(去除海外城市)
38+ """
39+ # 入口页的url
40+ url = 'http://www.58.com/changecity.aspx'
41+ header = {
42+ 'Host' : 'www.58.com' ,
43+ 'Referer' : 'http://sh.58.com/' ,
44+ 'User-Agent' : UserAgent
45+ }
46+ response = requests .get (url , headers = header )
47+ html = response .text
48+ doc = lxml .html .fromstring (html )
49+
50+ # 省份
51+ province_list = doc .xpath ('//dl[@id="clist"]//dt[not(@class)]/text()' )[:- 1 ]
52+ # for i in province_list:
53+ # print i
54+
55+ # 城市
56+ city_rule = '<a href="http://.*?.58.com/" onclick="co\(\' (.*?)\' \)">(.*?)</a>'
57+ city_list = doc .xpath ('//dl[@id="clist"]//dd[not(@class)]' )[:- 1 ]
58+
59+ for index , city_item in enumerate (city_list ):
60+ city_link_list = city_item .xpath ('./a' )
61+ for city_link in city_link_list :
62+ city_link_html = lxml .html .tostring (city_link , encoding = 'utf-8' )
63+ city_result = re .compile (city_rule , re .S ).findall (city_link_html )
64+ print city_result [0 ][0 ], city_result [0 ][1 ], province_list [index ]
65+
66+ # 校验省份城市数量
67+ print len (province_list ), len (city_list )
68+
69+
70+ def get_cate_list ():
71+ """
72+ 获取分类列表
73+ """
74+ # 入口页的url
75+ url = 'http://sh.58.com/shenghuo.shtml'
76+
77+ header = {
78+ 'Host' : 'sh.58.com' ,
79+ 'User-Agent' : UserAgent
80+ }
81+ response = requests .get (url , headers = header )
82+ html = response .text
83+ doc = lxml .html .fromstring (html )
84+
85+ cate_list = doc .xpath ('//div[@class="sublist"]//dl[@class="catecss-item"]' )
86+
87+ cate_title_rule = '<dt><a href="http://sh.58.com/(.*?)(.shtml|/)" target="_blank".*?>(.*?)</a>'
88+ cate_item_rule = '<a href="http://sh.58.com/(.*?)/" target="_blank".*?>(.*?)</a>'
89+
90+ for i in cate_list :
91+ cate_title_html = lxml .html .tostring (i .xpath ('./dt' )[0 ], encoding = 'utf-8' )
92+ cate_item_html = lxml .html .tostring (i .xpath ('./dd' )[0 ], encoding = 'utf-8' )
93+ # 标题
94+ cate_title_result = re .compile (cate_title_rule , re .S ).findall (cate_title_html )
95+ for cate_title_list in cate_title_result :
96+ print '#' , cate_title_list [0 ], cate_title_list [2 ]
97+
98+ # 明细
99+ cate_item_result = re .compile (cate_item_rule , re .S ).findall (cate_item_html )
100+ cate = {}
101+ for cate_item_list in cate_item_result :
102+ cate [cate_item_list [0 ]] = cate_item_list [1 ].strip ()
103+ print json .dumps (cate , indent = 4 ).decode ('raw_unicode_escape' )
104+
105+
34106def get_contacts ():
35107 """
36108 获取联系方式
@@ -78,5 +150,7 @@ def get_promotion_info():
78150
if __name__ == '__main__':
    # Only the city-list parser runs by default; uncomment a line
    # below to exercise one of the other scrapers instead.
    # get_city_list()
    parse_city_list()
    # get_cate_list()
    # get_contacts()
    # get_promotion_info()
0 commit comments