1+ import json
2+ import gzip
3+ import ssl
14try :
25 # For Python 3.0 and later
3- from urllib .request import urlopen , HTTPError
6+ from urllib .request import urlopen , HTTPError , Request
47except ImportError :
58 # Fall back to Python 2's
6- from urllib2 import urlopen , HTTPError
9+ from urllib2 import urlopen , HTTPError , Request
710try :
811 # For Python 3.0 and later
912 from urllib .parse import urlencode , quote_plus
1013except ImportError :
1114 # Fall back to Python 2's
1215 from urllib import urlencode , quote_plus
13- import json
16+ try :
17+ # For Python 3.0 and later
18+ from io import BytesIO
19+ except ImportError :
20+ # Fall back to Python 2's
21+ from BytesIO import BytesIO
1422
1523#
1624# A Python class that acts as wrapper for ProxyCrawl API.
2432
2533class ProxyCrawlAPI :
2634 timeout = 30000
35+ headers = { 'Accept-Encoding' : 'gzip' }
2736
2837 def __init__ (self , options ):
2938 if options ['token' ] is None or options ['token' ] == '' :
@@ -42,21 +51,25 @@ def post(self, url, data, options = None):
4251
def request(self, url, data=None, options=None):
    """Perform an HTTP request and return the populated response dict.

    The final URL is obtained from ``self.buildURL(url, options)`` and is
    sent with the instance's ``self.headers``. The result is stored on
    ``self.response`` and also returned.

    Args:
        url: Target URL handed to ``self.buildURL``.
        data: Optional request body forwarded to ``urlopen``.
        options: Optional dict of request options. ``callback`` and
            ``format`` are read here; the dict is also forwarded to
            ``self.buildURL`` unchanged.

    Returns:
        dict: ``self.response`` with the keys ``headers``, ``body`` and
        ``status_code``. On ``HTTPError`` the body is ``''``, the headers
        are empty and ``status_code`` is the error code.
    """
    self.response = {'headers': {}}
    url = self.buildURL(url, options)
    req = Request(url, headers=self.headers)
    # create_default_context() turns on certificate and hostname
    # verification; a bare SSLContext(PROTOCOL_TLS) performs neither.
    ssl_context = ssl.create_default_context()

    try:
        # NOTE(review): urlopen's timeout is expressed in seconds; confirm
        # the unit intended for self.timeout.
        self.handler = urlopen(req, data, self.timeout, context=ssl_context)
    except HTTPError as error:
        self.response['body'] = ''
        self.response['status_code'] = error.code
        return self.response

    try:
        self.response['status_code'] = self.handler.getcode()
        self.response['body'] = self.decompressBody()

        # options defaults to None, so guard the lookups instead of
        # raising AttributeError on .get().
        opts = options or {}
        if not opts.get('callback') and opts.get('format') == 'json':
            self.parseJsonResponse()
        else:
            self.parseRegularResponse()
    finally:
        # The body has been fully read at this point; release the
        # underlying connection rather than leaking it.
        self.handler.close()

    return self.response
6275
@@ -67,8 +80,20 @@ def buildURL(self, url, options):
6780
6881 return url
6982
def decompressBody(self):
    """Read the response body from ``self.handler`` and return it as bytes.

    The body is gunzipped only when the response declares a gzip
    ``Content-Encoding``. A server may ignore the request's
    ``Accept-Encoding`` header and reply uncompressed, in which case the
    raw bytes are returned as-is instead of failing inside ``GzipFile``.

    Returns:
        bytes: The (decompressed, if needed) response body.
    """
    raw_body = self.handler.read()
    content_encoding = self.handler.headers.get('Content-Encoding') or ''
    # Substring match also accepts the legacy "x-gzip" token.
    if 'gzip' not in content_encoding.lower():
        return raw_body

    with gzip.GzipFile(fileobj=BytesIO(raw_body)) as body_gzip:
        return body_gzip.read()
88+
def parseJsonResponse(self):
    """Copy the status fields of a JSON body into ``self.response['headers']``.

    ``self.response['body']`` is parsed as JSON and its ``original_status``,
    ``pc_status`` and ``url`` members are stored, stringified, under the
    same names in ``self.response['headers']``.

    Raises:
        KeyError: If the parsed JSON lacks one of the three members.
        ValueError: If the body is not valid JSON.
    """
    body = self.response['body']
    # The body arrives as bytes; json.loads only accepts bytes from
    # Python 3.6 onwards, so decode explicitly first.
    if isinstance(body, bytes):
        body = body.decode('utf-8')
    parsed_json = json.loads(body)

    headers = self.response['headers']
    for field in ('original_status', 'pc_status', 'url'):
        headers[field] = str(parsed_json[field])
94+
def parseRegularResponse(self):
    """Mirror the status headers of the HTTP response into the response dict.

    The ``original_status``, ``pc_status`` and ``url`` headers are read
    from ``self.handler.headers`` and stored, passed through ``str()``,
    in ``self.response['headers']``. A header that is absent is therefore
    stored as the string ``'None'``.
    """
    received = self.handler.headers
    for name in ('original_status', 'pc_status', 'url'):
        self.response['headers'][name] = str(received.get(name))
0 commit comments