code linting

adbar · adbar · commit bd8293eb637e · 2020-01-30T16:56:38.000+01:00
diff --git a/readability/cleaners.py b/readability/cleaners.py
@@ -14,18 +14,21 @@
     ">"        # end
 , re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
     if not s:
         return ''
     """replace any sequence of whitespace
     characters with a single space"""
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                   style=True, links=True, meta=False, add_nofollow=False,
                   page_structure=False, processing_instructions=True, embedded=False,
diff --git a/readability/debug.py b/readability/debug.py
@@ -49,5 +49,3 @@ def text_content(elem, length=40):
     if len(content) < length:
         return content
     return content[:length] + '...'
-
-
diff --git a/readability/htmls.py b/readability/htmls.py
@@ -1,29 +1,31 @@
 from lxml.html import tostring
-import logging
 import lxml.html
-import re, sys
+import re
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
 from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
         decoded_page = page
     else:
         encoding = get_encoding(page) or 'utf-8'
         decoded_page = page.decode(encoding, 'replace')
-    
+
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
     doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
     return doc, encoding
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
         u'\u2014':'-',
@@ -41,26 +43,31 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
         return '[no-title]'
 
     return norm_title(title.text)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
                         '.news_title', '.title', '.head', '.heading',
                         '.contentheading', '.small_header_red']
 
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -109,6 +116,8 @@ def shorten_title(doc):
 
     return title
 
+
+# TODO: is get_body() still necessary? The lxml Cleaner is already configured in cleaners.py
 def get_body(doc):
     for elem in doc.xpath('.//script | .//link | .//style'):
         elem.drop_tree()
diff --git a/readability/readability.py b/readability/readability.py
@@ -4,7 +4,6 @@
 import re
 import sys
 
-from collections import defaultdict
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
@@ -56,7 +55,6 @@ def to_int(x):
 def clean(text):
     # Many spaces make the following regexes run forever
     text = re.sub(r'\s{255,}', ' ' * 255, text)
-
     text = re.sub(r'\s*\n\s*', '\n', text)
     text = re.sub(r'\t|[ \t]{2,}', ' ', text)
     return text.strip()
@@ -65,12 +63,11 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
-regexp_type = type(re.compile('hello, world'))
 
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, regexp_type):
+    elif isinstance(elements, re._pattern_type):
         return elements
     elif isinstance(elements, (str_, bytes_)):
         if isinstance(elements, bytes_):
@@ -82,6 +79,7 @@ def compile_pattern(elements):
         raise Exception("Unknown type for the pattern: {}".format(type(elements)))
         # assume string or string like object
 
+
 class Document:
     """Class to build a etree document out of html."""
 
@@ -98,9 +96,9 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
         :param xpath: If set to True, adds x="..." attribute to each HTML node,
         containing xpath path pointing to original document path (allows to
         reconstruct selected summary in original document).
-        :param handle_failures: Parameter passed to `lxml` for handling failure during exception. 
+        :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
         Support options = ["discard", "ignore", None]
-        
+
         Examples:
             positive_keywords=["news-item", "block"]
             positive_keywords=["news-item, block"]
@@ -290,7 +288,7 @@ def select_best_candidate(self, candidates):
             return None
 
         sorted_candidates = sorted(
-            candidates.values(), 
+            candidates.values(),
             key=lambda x: x['content_score'],
             reverse=True
         )
@@ -517,10 +515,10 @@ def sanitize(self, node, candidates):
 
                 #if el.tag == 'div' and counts["img"] >= 1:
                 #    continue
-                if counts["p"] and counts["img"] > 1+counts["p"]*1.3:
+                if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):