44import re
55import sys
66
7- from collections import defaultdict
87from lxml .etree import tostring
98from lxml .etree import tounicode
109from lxml .html import document_fromstring
@@ -56,7 +55,6 @@ def to_int(x):
def clean(text):
    """Collapse whitespace in ``text`` and return the stripped result.

    The substitutions run in this order:

    1. Any run of 255 or more whitespace characters is cut down to exactly
       255 spaces. Very long runs make the following regexes extremely slow,
       so they are capped first.
    2. Every newline, together with the whitespace around it, becomes a
       single newline followed by one space.
    3. Every tab, and every run of two or more spaces/tabs, becomes a
       single space.

    :param text: the string to normalize.
    :return: the normalized string with leading and trailing whitespace
        removed.
    """
    # Many spaces make the following regexes run forever
    text = re.sub(r'\s{255,}', ' ' * 255, text)
    text = re.sub(r'\s*\n\s*', '\n ', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()
@@ -65,12 +63,11 @@ def clean(text):
def text_length(i):
    """Return the length of the cleaned text content of ``i``.

    ``i.text_content()`` is called to obtain the text; a falsy result
    (for example ``None`` or an empty string) is treated as ``""``. The
    text is passed through ``clean`` before measuring, so the length is
    that of the whitespace-normalized text.

    :param i: an object exposing a ``text_content()`` method
        (presumably an lxml HTML element -- confirm against callers).
    :return: number of characters in the cleaned text.
    """
    return len(clean(i.text_content() or ""))
6765
68- regexp_type = type (re .compile ('hello, world' ))
6966
7067def compile_pattern (elements ):
7168 if not elements :
7269 return None
73- elif isinstance (elements , regexp_type ):
70+ elif isinstance (elements , re . _pattern_type ):
7471 return elements
7572 elif isinstance (elements , (str_ , bytes_ )):
7673 if isinstance (elements , bytes_ ):
@@ -82,6 +79,7 @@ def compile_pattern(elements):
8279 raise Exception ("Unknown type for the pattern: {}" .format (type (elements )))
8380 # assume string or string like object
8481
82+
8583class Document :
8684 """Class to build a etree document out of html."""
8785
@@ -98,9 +96,9 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
9896 :param xpath: If set to True, adds x="..." attribute to each HTML node,
9997 containing xpath path pointing to original document path (allows to
10098 reconstruct selected summary in original document).
101- :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
99+ :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
102100 Support options = ["discard", "ignore", None]
103-
101+
104102 Examples:
105103 positive_keywords=["news-item", "block"]
106104 positive_keywords=["news-item, block"]
@@ -290,7 +288,7 @@ def select_best_candidate(self, candidates):
290288 return None
291289
292290 sorted_candidates = sorted (
293- candidates .values (),
291+ candidates .values (),
294292 key = lambda x : x ['content_score' ],
295293 reverse = True
296294 )
@@ -517,10 +515,10 @@ def sanitize(self, node, candidates):
517515
518516 #if el.tag == 'div' and counts["img"] >= 1:
519517 # continue
520- if counts ["p" ] and counts ["img" ] > 1 + counts ["p" ]* 1.3 :
518+ if counts ["p" ] and counts ["img" ] > 1 + counts ["p" ]* 1.3 :
521519 reason = "too many images (%s)" % counts ["img" ]
522520 to_remove = True
523- elif counts ["li" ] > counts ["p" ] and tag != "ul" and tag != "ol" :
521+ elif counts ["li" ] > counts ["p" ] and tag not in ( "ol" , "ul" ) :
524522 reason = "more <li>s than <p>s"
525523 to_remove = True
526524 elif counts ["input" ] > (counts ["p" ] / 3 ):
0 commit comments