-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdownloader.py
More file actions
92 lines (81 loc) · 3.15 KB
/
downloader.py
File metadata and controls
92 lines (81 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import urlparse
import urllib2
import random
import time
from datetime import datetime, timedelta
import socket
DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60
class Downloader:
def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
socket.setdefaulttimeout(timeout)
self.throttle = Throttle(delay)
self.user_agent = user_agent
self.proxies = proxies
self.num_retries = num_retries
self.opener = opener
self.cache = cache
def __call__(self, url):
result = None
if self.cache:
try:
result = self.cache[url]
except KeyError:
# url is not available in cache
pass
else:
if self.num_retries > 0 and 500 <= result['code'] < 600:
# server error so ignore result from cache and re-download
result = None
if result is None:
# result was not loaded from cache so still need to download
self.throttle.wait(url)
proxy = random.choice(self.proxies) if self.proxies else None
headers = {'User-agent': self.user_agent}
result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
if self.cache:
# save result to cache
self.cache[url] = result
return result['html']
def download(self, url, headers, proxy, num_retries, data=None):
print 'Downloading:', url
request = urllib2.Request(url, data, headers or {})
opener = self.opener or urllib2.build_opener()
if proxy:
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
try:
response = opener.open(request)
html = response.read()
code = response.code
except Exception as e:
print 'Download error:', str(e)
html = ''
if hasattr(e, 'code'):
code = e.code
if num_retries > 0 and 500 <= code < 600:
# retry 5XX HTTP errors
return self._get(url, headers, proxy, num_retries-1, data)
else:
code = None
return {'html': html, 'code': code}
class Throttle:
"""Throttle downloading by sleeping between requests to same domain
"""
def __init__(self, delay):
# amount of delay between downloads for each domain
self.delay = delay
# timestamp of when a domain was last accessed
self.domains = {}
def wait(self, url):
"""Delay if have accessed this domain recently
"""
domain = urlparse.urlsplit(url).netloc
last_accessed = self.domains.get(domain)
if self.delay > 0 and last_accessed is not None:
sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
if sleep_secs > 0:
time.sleep(sleep_secs)
self.domains[domain] = datetime.now()