-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmywebcrawler.py
More file actions
executable file
·47 lines (42 loc) · 1.49 KB
/
mywebcrawler.py
File metadata and controls
executable file
·47 lines (42 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
import os
import re
import sys
import urllib.error
import urllib.request  # Python 3 home of the retired urllib2 module
# One avatar anchor: captures (img src, alt text).  Non-greedy quantifiers so
# that several anchors on one physical line each match separately (the
# original greedy ".*" merged them into a single bogus match).
_AVATAR_RE = re.compile(r'\stitle.*?img\ssrc="(.*?)"\salt="(.*?)"/></a>')

def parse_avatar_links(content):
    """Return [(avatar_url, username), ...] scraped from an HTML page."""
    return _AVATAR_RE.findall(content)

def get_links(url):
    """Fetch *url* and return a list of (avatar_url, username) tuples.

    On any network/HTTP failure an explanatory message is printed and the
    process exits with status 2 (behaviour kept from the original script).
    """
    try:
        # Context manager guarantees the socket is closed (the original
        # leaked the response object).
        with urllib.request.urlopen(url, timeout=3) as f:
            # Page assumed to be UTF-8 -- the original hard-coded the same
            # assumption; TODO confirm against the server's Content-Type.
            content = f.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        # The server answered, but with an error status code.
        print("The server couldn't fulfill the request.")
        print("Error code: %s" % e.code)
        sys.exit(2)
    except urllib.error.URLError as e:
        # We never reached a server at all.
        print("We failed to reach a server. Please check your url and read the Reason")
        print("Reason: %s" % e.reason)
        sys.exit(2)
    print("Read webpage successfully.")
    return parse_avatar_links(content)
def file_save(avatar, count):
    """Download one avatar image and save it under <script_dir>/imgs/.

    Parameters:
        avatar: a (url_avatar, username) tuple, as produced by get_links().
                (The original used Python 2 tuple-parameter unpacking in the
                signature, which was removed in Python 3 -- PEP 3113; the
                call site passes a single tuple, so this stays compatible.)
        count:  running counter.

    Returns the counter advanced by one for every avatar processed, even
    when the file already exists, so the caller's "while count < 100" loop
    always makes progress.  (The scrape lost the original indentation;
    NOTE(review): confirm the increment was not meant to count only newly
    written files.)
    """
    url_avatar, username = avatar
    # Store images next to the script, in an "imgs" subdirectory.
    dir_imgs = os.path.join(os.path.dirname(sys.argv[0]), "imgs")
    if not os.path.exists(dir_imgs):
        os.mkdir(dir_imgs)
    filename = os.path.join(dir_imgs, username + ".jpg")
    if not os.path.exists(filename):
        # Fetch only when the file is missing (the original downloaded the
        # image unconditionally and then sometimes discarded it).  "wb", not
        # "w": the payload is binary JPEG data and text mode would corrupt
        # it under newline translation.  Both the HTTP response and the
        # output file are closed via context managers (the original leaked
        # the response).
        with urllib.request.urlopen(url_avatar) as img:
            with open(filename, "wb") as f:
                f.write(img.read())
    count += 1
    return count
if __name__ == "__main__":
    # Crawl successive pages of douban's interest listing until 100 avatars
    # have been processed.
    count = 0
    page_num = 0
    while count < 100:
        page_num += 1
        website = "http://www.douban.com/interest/1/1/?p=%d" % page_num
        links = get_links(website)
        # De-duplicate while preserving first-seen order (the original used
        # list(set(...)), which dropped duplicates but shuffled the order).
        links = list(dict.fromkeys(links))
        if not links:
            # Safety stop: an empty page would otherwise loop forever
            # re-fetching ever-higher page numbers.
            break
        for item in links:
            count = file_save(item, count)
    print("Download %d avatars successfully." % count)