Skip to content

Commit cbfd5fb

Browse files
committed
www_ycit_cn爬虫示例
1 parent fb3387d commit cbfd5fb

1 file changed

Lines changed: 16 additions & 21 deletions

File tree

www_ycit_cn.py

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# encoding: utf-8
22
__author__ = 'zhanghe'
33

4-
54
import requests
65
from pyquery import PyQuery as Pq
76
import json
@@ -40,12 +39,18 @@ def url_filter(url_str, domain):
4039

4140

4241
def save(result_list, file_name):
    """
    Serialize result_list to JSON and write it under static/url_list/.

    :param result_list: JSON-serializable list of URL entries
    :param file_name: bare file name to create inside static/url_list/
    :return: None
    """
    import os

    file_path = 'static/url_list/'
    # makedirs(..., exist_ok=True) replaces the original isdir/mkdir pair:
    # it avoids the check-then-create race and also creates the missing
    # intermediate 'static/' directory, where os.mkdir would raise
    # FileNotFoundError.
    os.makedirs(file_path, exist_ok=True)
    filename = file_path + file_name
    # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable in
    # the output file; the result is explicitly encoded as UTF-8 bytes.
    result_json = json.dumps(result_list, indent=4, ensure_ascii=False)
    with open(filename, 'wb') as f:
        f.write(result_json.encode('utf-8'))
@@ -70,23 +75,13 @@ def web_crawler(url_node=None):
7075
save(url_visited_list, 'url_visited_list.json')
7176

7277

73-
7478
if __name__ == "__main__":
    # Drain the pending-URL queue front-to-back; each crawl may append
    # newly discovered URLs, so loop until the queue is empty.
    while url_list:
        next_url = url_list.pop(0)
        web_crawler(next_url)
77-
# for url_item in url_list:
78-
# web_crawler(url_item)
79-
80-
# 中心主进程
81-
# 分配url(待访问列表减少)
82-
83-
# 工作进程
84-
# 请求url(已访问列表增加)
85-
# 补充url(增加待访问列表)
86-
#
87-
# url_list = [] # 爬虫待访问url列表
88-
# url_visited_list = [] # 爬虫已访问url列表
89-
90-
9181

9282

83+
"""
To inspect the crawl results:
$ tail -f ~/code/python/static/url_list/url_list.json
$ tail -f ~/code/python/static/url_list/url_visited_list.json
"""

0 commit comments

Comments
 (0)