11# encoding: utf-8
22__author__ = 'zhanghe'
33
4-
54import requests
65from pyquery import PyQuery as Pq
76import json
@@ -40,12 +39,18 @@ def url_filter(url_str, domain):
4039
4140
def save(result_list, file_name):
    """
    Persist a list of results as a pretty-printed, UTF-8 encoded JSON file.

    The file is written under the relative directory 'static/url_list/',
    which is created (including parents) if it does not yet exist.

    :param result_list: JSON-serializable list of results (e.g. crawled URLs)
    :param file_name: bare file name to create inside 'static/url_list/'
    :return: None
    """
    import os

    file_path = 'static/url_list/'
    # Use makedirs (not mkdir): on a fresh checkout the intermediate
    # 'static/' directory is missing and os.mkdir would raise OSError.
    if not os.path.isdir(file_path):
        os.makedirs(file_path)
    filename = file_path + file_name
    result_json = json.dumps(result_list, indent=4, ensure_ascii=False)
    # Write encoded bytes explicitly so the output is UTF-8 regardless of
    # the platform's default encoding (works on both Python 2 and 3).
    with open(filename, 'wb') as f:
        f.write(result_json.encode('utf-8'))
@@ -70,23 +75,13 @@ def web_crawler(url_node=None):
7075 save (url_visited_list , 'url_visited_list.json' )
7176
7277
73-
if __name__ == "__main__":
    # Drain the pending-URL queue in FIFO order until nothing is left;
    # each crawl may append newly discovered URLs back onto url_list.
    while url_list:
        web_crawler(url_list.pop(0))
77- # for url_item in url_list:
78- # web_crawler(url_item)
79-
80- # 中心主进程
81- # 分配url(待访问列表减少)
82-
83- # 工作进程
84- # 请求url(已访问列表增加)
85- # 补充url(增加待访问列表)
86- #
87- # url_list = [] # 爬虫待访问url列表
88- # url_visited_list = [] # 爬虫已访问url列表
89-
90-
9181
9282
"""
查看测试结果 (inspect the crawl output):
$ tail -f ~/code/python/static/url_list/url_list.json
$ tail -f ~/code/python/static/url_list/url_visited_list.json
"""
0 commit comments