Skip to content

Commit 2e0775f

Browse files
committed
新增通过yield实现协程抓取
1 parent c02dbe2 commit 2e0775f

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

yield.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# encoding: utf-8
2+
__author__ = 'zhanghe'
3+
4+
import time
5+
import requests
6+
import re
7+
8+
# Crawl entry point.
root_url = 'http://www.ycit.cn/'
# Base URL that relative links are resolved against.
web_host = 'http://www.ycit.cn/'
# Domain used to filter out links to other sites.
web_domain = 'ycit.cn'
# URLs waiting to be visited, seeded with the entry point.
url_list = [root_url]
# URLs that have already been visited.
url_visited_list = []
13+
14+
15+
def url_join(url_str, host):
    """
    Resolve a link against the site host.

    :param url_str: link as found in a page (absolute or relative), or None
    :param host: base URL of the site
    :return: ``url_str`` unchanged when it is already absolute (it starts
        with ``host``, ``http://`` or ``https://``); otherwise ``host`` and
        ``url_str`` joined with exactly one ``/`` between them; None when
        ``url_str`` is None
    """
    if url_str is None:
        return None
    # Schemes are case-insensitive, and https links must be recognised as
    # absolute too; otherwise they get glued onto the host as a path.
    if url_str.startswith(host) or url_str.lower().startswith(('http://', 'https://')):
        return url_str
    return host.rstrip('/') + '/' + url_str.lstrip('/')
26+
27+
28+
def url_filter(url_str, domain):
    """
    Keep a URL only when its host belongs to ``domain``.

    The host must equal ``domain`` or be a subdomain of it. A plain substring
    test is not enough: it would also accept foreign hosts that merely
    mention the domain in their path or query string, as well as look-alike
    hosts that end with the same characters.

    :param url_str: URL to check (may be None)
    :param domain: domain to accept; an empty value disables filtering
    :return: ``url_str`` when it is accepted, otherwise None
    """
    # Imported here so the function works on both Python 2 and Python 3
    # without touching the module-level imports.
    try:
        from urllib.parse import urlparse  # Python 3
    except ImportError:
        from urlparse import urlparse  # Python 2

    if url_str is None:
        return None
    wanted = domain.lower().lstrip('.')
    if not wanted:
        return url_str
    try:
        parts = urlparse(url_str)
        if not parts.scheme and not parts.netloc:
            # Scheme-less input such as "www.example.com/a": parse it as a
            # network location instead of a path.
            parts = urlparse('//' + url_str)
        hostname = parts.hostname  # already lower-cased, without port/userinfo
    except ValueError:
        # Malformed URL: treat it as foreign.
        return None
    if not hostname:
        return None
    if hostname == wanted or hostname.endswith('.' + wanted):
        return url_str
    return None
38+
39+
40+
def routine(func):
    """
    Decorator that turns a generator function into a primed-coroutine factory.

    Calling the decorated function creates the generator and advances it to
    its first ``yield``, so the caller can use ``send()`` immediately.

    :param func: generator function to wrap
    :return: wrapper that forwards its arguments to ``func`` and returns the
        primed generator
    """
    import functools  # local import: not among this module's top-level imports

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def ret(*args, **kwargs):
        f = func(*args, **kwargs)
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # generator method .next() exists only on Python 2.
        next(f)
        return f
    return ret
46+
47+
48+
@routine
def hit():
    """
    Coroutine that fetches every URL sent to it and queues the links it finds.

    For each URL received through ``send()`` the page is downloaded, its
    ``<a href="...">`` targets are resolved with ``url_join`` and restricted
    with ``url_filter``, and every target that has not been seen is appended
    to the module-level ``url_list``. The fetched URL is then appended to
    ``url_visited_list`` and progress statistics are printed.

    URLs are compared with trailing slashes removed, which is also the form
    in which they are queued. A URL whose download fails is reported and
    recorded as visited so that it is not queued again. Elapsed time is
    measured from the moment the coroutine is created.
    """
    # Compiled once and reused for every page.
    link_pattern = re.compile('<a .*?href="(.+?)".*?>', re.I)
    request_timeout = 30  # seconds; without a timeout one stalled server hangs the crawl
    # Own start timestamp, so the coroutine does not depend on a global that
    # only exists when the module is run as a script.
    started = time.time()
    while 1:
        url_node = (yield)
        if url_node is None:
            print('待抓取列表为空')
            continue  # nothing to fetch; wait for the next send()
        try:
            response = requests.get(url_node, timeout=request_timeout)
        except requests.RequestException:
            # A single unreachable page must not abort the whole crawl.
            print("fetch failed: %s" % url_node)
            url_visited_list.append(url_node)
            continue
        html = response.text
        # Everything already queued, visited or currently being processed,
        # in normalised form; built once per page for O(1) lookups per link.
        seen = set(u.rstrip('/') for u in url_list)
        seen.update(u.rstrip('/') for u in url_visited_list)
        seen.add(url_node.rstrip('/'))
        for tag in link_pattern.findall(html):
            if tag == '#':  # skip placeholder links
                continue
            url = url_filter(url_join(tag, web_host), web_domain)
            if url is None:
                continue
            # Normalise before the duplicate check so that "x/" and "x" are
            # treated as the same page.
            url = url.rstrip('/')
            if url not in seen:
                seen.add(url)
                url_list.append(url)
        print("待访问节点:%s" % len(url_list))
        url_visited_list.append(url_node)
        print("已访问节点:%s" % len(url_visited_list))
        end_time = time.time()
        print("耗时:%0.2f S" % (end_time - started))
        print('--------------')
70+
71+
72+
def get():
    """
    Drive the crawl.

    Creates the ``hit`` coroutine and sends it the URLs from ``url_list``
    one at a time, oldest first, until the list is empty. The condition is
    re-evaluated on every pass because ``hit`` appends newly found URLs to
    the same list.
    """
    crawler = hit()
    while url_list:
        next_url = url_list.pop(0)
        crawler.send(next_url)
76+
77+
78+
if __name__ == '__main__':
    # Record the time at which the crawl starts, then run it.
    start_time = time.time()
    get()

0 commit comments

Comments
 (0)