1+ # encoding: utf-8
2+ __author__ = 'zhanghe'
3+
import functools
import re
import time

import requests
7+
8+ root_url = 'http://www.ycit.cn/' # 爬虫入口
9+ web_host = 'http://www.ycit.cn/'
10+ web_domain = 'ycit.cn'
11+ url_list = [root_url ] # 爬虫待访问url列表
12+ url_visited_list = [] # 爬虫已访问url列表
13+
14+
15+ def url_join (url_str , host ):
16+ """
17+ url拼接
18+ :param url_str:
19+ :param host:
20+ :return:
21+ """
22+ if url_str is not None :
23+ if url_str .startswith (host ) or url_str .startswith ('http://' ):
24+ return url_str
25+ return host .rstrip ('/' ) + '/' + url_str .lstrip ('/' )
26+
27+
28+ def url_filter (url_str , domain ):
29+ """
30+ 过滤其它域名
31+ :param url_str:
32+ :param domain:
33+ :return:
34+ """
35+ if url_str is not None :
36+ if domain in url_str :
37+ return url_str
38+
39+
40+ def routine (func ):
41+ def ret ():
42+ f = func ()
43+ f .next ()
44+ return f
45+ return ret
46+
47+
48+ @routine
49+ def hit ():
50+ while 1 :
51+ url_node = (yield )
52+ if url_node is None :
53+ print '待抓取列表为空'
54+ response = requests .get (url_node )
55+ html = response .text
56+ reg = '<a .*?href="(.+?)".*?>'
57+ tags = re .compile (reg , re .I ).findall (html )
58+ for tag in tags :
59+ if tag != '#' and tag is not None : # 过滤掉错误地址
60+ url = url_filter (url_join (tag , web_host ), web_domain )
61+ if url is not None and url not in url_list and url not in url_visited_list : # 去重
62+ url_list .append (url .rstrip ('/' ))
63+ # print "访问 %s" % url_node
64+ print "待访问节点:%s" % len (url_list )
65+ url_visited_list .append (url_node )
66+ print "已访问节点:%s" % len (url_visited_list )
67+ end_time = time .time ()
68+ print "耗时:%0.2f S" % (end_time - start_time )
69+ print '--------------'
70+
71+
72+ def get ():
73+ c = hit ()
74+ while len (url_list ) > 0 :
75+ c .send (url_list .pop (0 ))
76+
77+
78+ if __name__ == "__main__" :
79+ start_time = time .time ()
80+ get ()