1616from .htmls import get_body
1717from .htmls import get_title
1818from .htmls import shorten_title
19- from .compat import str_
19+ from .compat import str_ , bytes_
2020from .debug import describe , text_content
2121
2222
@@ -67,14 +67,18 @@ def text_length(i):
def compile_pattern(elements):
    """Normalise a keyword specification into something usable for matching.

    :param elements: one of

        * a falsy value (``None``, ``""``, ``[]`` ...) -- nothing to match;
        * a comma-separated text string, or a UTF-8 encoded byte string;
        * a list or tuple of patterns;
        * an already compiled regular expression.
    :returns: ``None`` when there is nothing to match; a compiled regular
        expression for string input (each keyword is stripped, lower-cased
        and escaped, so it is matched literally); a new list for list/tuple
        input; the very same object for a compiled regular expression.
    :raises Exception: if ``elements`` is of any other type.
    """
    if not elements:
        return None
    elif isinstance(elements, (str_, bytes_)):
        if isinstance(elements, bytes_):
            elements = str_(elements, 'utf-8')
        # Strip surrounding whitespace so that "news-item, block" yields the
        # keyword "block" rather than " block", which could never match a
        # class or id token.
        keywords = [x.strip().lower() for x in elements.split(u',')]
        # Drop empty items ("a,,b", trailing comma): an empty alternative in
        # the regex would match every string.
        keywords = [re.escape(x) for x in keywords if x]
        if not keywords:
            return None
        return re.compile(u'|'.join(keywords), re.U)
    elif isinstance(elements, (list, tuple)):
        return list(elements)
    elif isinstance(elements, regexp_type):
        return elements
    else:
        raise Exception("Unknown format for the pattern")
7882
7983class Document :
8084 """Class to build an etree document out of html."""
@@ -84,16 +88,18 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
8488 """Generate the document
8589
8690 :param input: string of the html content.
87- :param positive_keywords: regex or list of patterns in classes and ids
88- :param negative_keywords: regex or list of patterns in classes and ids
91+ :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
92+ :param negative_keywords: regex, list or comma-separated string of patterns in classes and ids
8993 :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
9094 :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
9195 :param xpath: If set to True, adds x="..." attribute to each HTML node,
9296 containing xpath path pointing to original document path (allows to
9397 reconstruct selected summary in original document).
9498
95- Example :
99+ Examples :
96100 positive_keywords=["news-item", "block"]
101+ positive_keywords="news-item, block"
102+ positive_keywords=re.compile("news|block")
97103 negative_keywords=["mysidebar", "related", "ads"]
98104
99105 The Document class is not re-enterable.
0 commit comments