Skip to content

Commit 54a4c03

Browse files
committed
新增职友集分类抓取
1 parent 0f3f360 commit 54a4c03

2 files changed

Lines changed: 65 additions & 1 deletion

File tree

fuck/58.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,4 +34,4 @@ def get_city_list():
3434

3535

3636
if __name__ == '__main__':
37-
get_city_list()
37+
get_city_list()

fuck/jobui.py

Lines changed: 64 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
4+
"""
5+
@author: zhanghe
6+
@software: PyCharm
7+
@file: jobui.py
8+
@time: 16-5-20 下午4:38
9+
"""
10+
11+
12+
import requests
13+
import re
14+
import json
15+
16+
17+
# Request headers that make the scraper look like a desktop browser.
header = {
    'Host': 'www.jobui.com',
    'Referer': 'http://www.jobui.com',
    # Adjacent literals are concatenated by the parser into one string.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/30.0.1599.101 Safari/537.36'
    ),
}
23+
24+
# Module-level HTTP session; the fetch functions below send their GET requests through it.
s = requests.session()
25+
26+
27+
def get_city_list():
    """
    Fetch the jobui.com city-selection page and print the distinct city names.

    The names are read from the ``data_city`` attribute of every
    ``changeCity`` link on the page, de-duplicated in first-seen order, and
    printed as an indented JSON array with non-ASCII characters left readable.

    Returns:
        list: the de-duplicated city names, in page order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Entry page that lists the selectable cities.
    url = 'http://www.jobui.com/changecity/'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = s.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    response.raise_for_status()
    html = response.text
    # Raw string so the regex escapes (\( and \)) are not read as string escapes.
    rule = r'<a onclick="changeCity\(this\);" href=".*?" data_city="(.*?)" data_url=".*?">.*?</a>'
    seen = set()
    city_name_list = []
    for item in re.compile(rule, re.S).findall(html):
        # Set membership keeps de-duplication linear while the list keeps page order.
        if item not in seen:
            seen.add(item)
            city_name_list.append(item)
    # ensure_ascii=False emits the characters directly; it replaces the former
    # decode('raw_unicode_escape') round-trip, which only works on Python 2
    # byte strings. print(...) with one argument is valid on Python 2 and 3.
    print(json.dumps(city_name_list, indent=4, ensure_ascii=False))
    return city_name_list
42+
43+
44+
def get_industry_list():
    """
    Fetch the jobui.com company index page and print the distinct industry names.

    The names are the link texts of every ``/cmp?industry=...`` anchor on the
    page, de-duplicated in first-seen order, and printed as an indented JSON
    array with non-ASCII characters left readable.

    Returns:
        list: the de-duplicated industry names, in page order.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Entry page that carries the industry filter links.
    url = 'http://www.jobui.com/cmp'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = s.get(url, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    response.raise_for_status()
    html = response.text
    # Raw string so the regex escape (\?) is not read as a string escape.
    rule = r'<a href="/cmp\?industry=.*?" >(.*?)</a>'
    seen = set()
    industry_name_list = []
    for item in re.compile(rule, re.S).findall(html):
        # Set membership keeps de-duplication linear while the list keeps page order.
        if item not in seen:
            seen.add(item)
            industry_name_list.append(item)
    # ensure_ascii=False emits the characters directly; it replaces the former
    # decode('raw_unicode_escape') round-trip, which only works on Python 2
    # byte strings. print(...) with one argument is valid on Python 2 and 3.
    print(json.dumps(industry_name_list, indent=4, ensure_ascii=False))
    return industry_name_list
59+
60+
61+
if __name__ == '__main__':
    # Run each scraper in turn when the file is executed as a script.
    for task in (get_city_list, get_industry_list):
        task()
64+

0 commit comments

Comments
 (0)