
Commit 8f2d0cb

Merge add-encoding-headers into master

Author: proxycrawl
2 parents 15ed508 + 8922a3e

2 files changed: 40 additions & 10 deletions

proxycrawl/proxycrawl_api.py

Lines changed: 34 additions & 9 deletions
@@ -1,16 +1,24 @@
+import json
+import gzip
+import ssl
 try:
     # For Python 3.0 and later
-    from urllib.request import urlopen, HTTPError
+    from urllib.request import urlopen, HTTPError, Request
 except ImportError:
     # Fall back to Python 2's
-    from urllib2 import urlopen, HTTPError
+    from urllib2 import urlopen, HTTPError, Request
 try:
     # For Python 3.0 and later
     from urllib.parse import urlencode, quote_plus
 except ImportError:
     # Fall back to Python 2's
     from urllib import urlencode, quote_plus
-import json
+try:
+    # For Python 3.0 and later
+    from io import BytesIO
+except ImportError:
+    # Fall back to Python 2's
+    from StringIO import StringIO as BytesIO  # no BytesIO module exists in Python 2
 
 #
 # A Python class that acts as wrapper for ProxyCrawl API.
@@ -24,6 +32,7 @@
 
 class ProxyCrawlAPI:
     timeout = 30000
+    headers = { 'Accept-Encoding': 'gzip' }
 
     def __init__(self, options):
         if options['token'] is None or options['token'] == '':
@@ -42,21 +51,25 @@ def post(self, url, data, options = None):
 
     def request(self, url, data = None, options = None):
         self.response = {}
+        self.response['headers'] = {}
         url = self.buildURL(url, options)
+        req = Request(url, headers=self.headers)
+        ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
 
         try:
-            handler = urlopen(url, data, self.timeout)
+            self.handler = urlopen(req, data, self.timeout, context=ssl_context)
         except HTTPError as error:
-            self.response['headers'] = {}
             self.response['body'] = ''
             self.response['status_code'] = error.code
             return self.response
 
-        self.response['status_code'] = handler.getcode()
-        self.response['headers'] = handler.headers
-        self.response['body'] = handler.read()
+        self.response['status_code'] = self.handler.getcode()
+        self.response['body'] = self.decompressBody()
+
         if not options.get('callback') and options.get('format') == 'json':
             self.parseJsonResponse()
+        else:
+            self.parseRegularResponse()
 
         return self.response
 
@@ -67,8 +80,20 @@ def buildURL(self, url, options):
 
         return url
 
+    def decompressBody(self):
+        body_stream = BytesIO(self.handler.read())
+        body_gzip = gzip.GzipFile(fileobj=body_stream)
+
+        return body_gzip.read()
+
     def parseJsonResponse(self):
         parsed_json = json.loads(self.response['body'])
         self.response['headers']['original_status'] = str(parsed_json['original_status'])
         self.response['headers']['pc_status'] = str(parsed_json['pc_status'])
-        self.response['headers']['url'] = parsed_json['url']
+        self.response['headers']['url'] = str(parsed_json['url'])
+
+    def parseRegularResponse(self):
+        headers = self.handler.headers
+        self.response['headers']['original_status'] = str(headers.get('original_status'))
+        self.response['headers']['pc_status'] = str(headers.get('pc_status'))
+        self.response['headers']['url'] = str(headers.get('url'))
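
The decompressBody helper added above assumes the API always honors the Accept-Encoding: gzip request header; GzipFile cannot read from the socket response directly, so the body is first buffered into a seekable BytesIO stream. A minimal, self-contained sketch of that round trip, where the payload and the decompress_body name are illustrative only:

import gzip
from io import BytesIO

def decompress_body(raw_bytes):
    # Same approach as decompressBody: buffer the raw response bytes in a
    # seekable in-memory stream, then let GzipFile inflate them.
    body_stream = BytesIO(raw_bytes)
    body_gzip = gzip.GzipFile(fileobj=body_stream)
    return body_gzip.read()

# Simulate a gzip-encoded HTTP body without touching the network.
payload = b'<html>hello</html>'
buffer = BytesIO()
with gzip.GzipFile(fileobj=buffer, mode='wb') as gz:
    gz.write(payload)

assert decompress_body(buffer.getvalue()) == payload

One caveat: if the server ever replies without gzip encoding, GzipFile raises on read, so checking the response's Content-Encoding header before inflating would make the helper more robust.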

test.py

Lines changed: 6 additions & 1 deletion
@@ -1,6 +1,11 @@
 import sys
 import json
-from proxycrawl import ProxyCrawlAPI
+try:
+    # For Python 3.0 and later
+    from proxycrawl.proxycrawl_api import ProxyCrawlAPI
+except ImportError:
+    # Fall back to Python 2's
+    from proxycrawl import ProxyCrawlAPI
 
 normal_token = ''
 javascript_token = ''
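
With the import fallback in place, a minimal smoke test of the new response handling might look like the following. The token and URL are placeholders, and the options keys that buildURL understands are an assumption; request(), the response dict keys, and parseRegularResponse() come from the diff above:

try:
    from proxycrawl.proxycrawl_api import ProxyCrawlAPI
except ImportError:
    from proxycrawl import ProxyCrawlAPI

api = ProxyCrawlAPI({'token': 'YOUR_NORMAL_TOKEN'})  # placeholder token

# A non-JSON request goes through parseRegularResponse(), so the
# headers dict is filled from the ProxyCrawl response headers.
response = api.request('https://example.com', options={'format': 'html'})

print(response['status_code'])
print(response['headers'].get('pc_status'))
print(response['body'][:200])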
