#! /usr/bin/env python
#encoding=utf-8
from bs4 import BeautifulSoup
import socket
import urllib2
import re
import zlib
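
# A simple depth-limited crawler: MyCrawler pops urls from a linkQuence FIFO
# queue, fetches each page, extracts its <a href> links with BeautifulSoup,
# and re-enqueues the new links until crawl_deepth is reached.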
class MyCrawler:
    def __init__(self, seeds):
        # Initialise the current crawl depth
        self.current_deepth = 1
        # Seed the url queue
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print "Add the seed urls \"%s\" to the unvisited url list" % str(self.linkQuence.unVisited)

    # Main crawl loop
    def crawling(self, seeds, crawl_deepth):
        # Keep going while the crawl depth does not exceed crawl_deepth
        while self.current_deepth <= crawl_deepth:
            # Keep going while there are still unvisited links
            while not self.linkQuence.unVisitedUrlsEnmpy():
                # Pop the url at the head of the queue
                visitUrl = self.linkQuence.unVisitedUrlDeQuence()
                print "Pop out one url \"%s\" from unvisited url list" % visitUrl
                if visitUrl is None or visitUrl == "":
                    continue
                # Extract the hyperlinks from the page
                links = self.getHyperLinks(visitUrl)
                print "Get %d new links" % len(links)
                # Move the url into the visited set
                self.linkQuence.addVisitedUrl(visitUrl)
                print "Visited url count: " + str(self.linkQuence.getVisitedUrlCount())
                print "Visited deepth: " + str(self.current_deepth)
                # Enqueue the links that have not been visited yet
                for link in links:
                    self.linkQuence.addUnvisitedUrl(link)
                print "%d unvisited links:" % len(self.linkQuence.getUnvisitedUrl())
            self.current_deepth += 1

    # Extract the hyperlinks from the page source
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)
        if data[0] == "200":
            soup = BeautifulSoup(data[1])
            a = soup.findAll("a", {"href": re.compile('^http|^/')})
            for i in a:
                if i["href"].find("http://") != -1:
                    links.append(i["href"])
        return links

    # Fetch the page source
    def getPageSource(self, url, timeout=100, coding=None):
        try:
            socket.setdefaulttimeout(timeout)
            req = urllib2.Request(url)
            req.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')
            response = urllib2.urlopen(req)
            page = response.read()
            # Transparently decompress gzip-encoded responses
            if response.headers.get('Content-Encoding') == 'gzip':
                page = zlib.decompress(page, 16 + zlib.MAX_WBITS)
            # Fall back to the charset declared in the response headers
            if coding is None:
                coding = response.headers.getparam("charset")
            # If a charset is known, convert the page to utf-8;
            # otherwise return the raw bytes unchanged
            if coding is not None:
                page = page.decode(coding).encode('utf-8')
            return ["200", page]
        except Exception, e:
            print str(e)
            return [str(e), None]


class linkQuence:
    def __init__(self):
        # Urls that have already been visited
        self.visted = []
        # Urls waiting to be visited
        self.unVisited = []

    # Return the visited url queue
    def getVisitedUrl(self):
        return self.visted

    # Return the unvisited url queue
    def getUnvisitedUrl(self):
        return self.unVisited

    # Add a url to the visited queue
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # Remove a url from the visited queue
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # Pop the next unvisited url off the queue
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except IndexError:
            return None

    # Enqueue a url, ensuring each url is visited at most once
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # Number of visited urls
    def getVisitedUrlCount(self):
        return len(self.visted)

    # Number of unvisited urls
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # Whether the unvisited url queue is empty
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0


def main(seeds, crawl_deepth):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_deepth)


if __name__ == "__main__":
    main(["http://www.baidu.com", "http://www.google.com.hk", "http://www.sina.com.cn"], 10)