@@ -76,13 +76,23 @@ def clean(text):
def text_length(i):
    """Return the length of element ``i``'s text after ``clean()``.

    A falsy ``text_content()`` result is treated as the empty string.
    """
    raw_text = i.text_content() or ""
    cleaned = clean(raw_text)
    return len(cleaned)
7878
regexp_type = type(re.compile('hello, world'))

try:
    _string_types = basestring  # Python 2: covers both str and unicode
except NameError:
    _string_types = str  # Python 3 has no basestring


def compile_pattern(elements):
    """Build one regular expression that matches any of the given keywords.

    :param elements: ``None`` or an empty value, an already compiled
        pattern, a comma-separated string of keywords, or an iterable of
        keyword strings.
    :returns: ``None`` when there is no usable keyword, ``elements`` itself
        when it is already a compiled pattern, otherwise a compiled
        alternation of the escaped, lower-cased keywords.
    """
    if not elements:
        return None
    if isinstance(elements, regexp_type):
        return elements
    if isinstance(elements, _string_types):
        elements = elements.split(',')

    # Trim whitespace and drop blank entries: an empty alternative (as
    # produced by "ads," or "a,,b") would match every string and silently
    # apply the keyword weight to every element.
    keywords = [x.strip().lower() for x in elements]
    keywords = [x for x in keywords if x]
    if not keywords:
        return None

    # NOTE(review): keywords are lower-cased but the pattern is compiled
    # without re.I, so mixed-case class/id values will not match -- confirm
    # whether case-insensitive matching was intended.
    return re.compile(u'|'.join([re.escape(x) for x in keywords]), re.U)
7989
8090class Document :
8191 """Class to build a etree document out of html."""
8292 TEXT_LENGTH_THRESHOLD = 25
8393 RETRY_LENGTH = 250
8494
85- def __init__ (self , input , ** options ):
95+ def __init__ (self , input , positive_keywords = None , negative_keywords = None , ** options ):
8696 """Generate the document
8797
8898 :param input: string of the html content.
@@ -93,19 +103,24 @@ def __init__(self, input, **options):
93103 - min_text_length:
94104 - retry_length:
95105 - url: will allow adjusting links to be absolute
96-
106+ - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
107+ - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
108+ Also positive_keywords and negative_keywords could be a regexp.
97109 """
98110 self .input = input
99111 self .options = options
100112 self .html = None
113+ self .encoding = None
114+ self .positive_keywords = compile_pattern (positive_keywords )
115+ self .negative_keywords = compile_pattern (negative_keywords )
101116
102117 def _html (self , force = False ):
103118 if force or self .html is None :
104119 self .html = self ._parse (self .input )
105120 return self .html
106121
107122 def _parse (self , input ):
108- doc = build_doc (input )
123+ doc , self . encoding = build_doc (input )
109124 doc = html_cleaner .clean_html (doc )
110125 base_href = self .options .get ('url' , None )
111126 if base_href :
@@ -311,19 +326,25 @@ def score_paragraphs(self, ):
311326
def class_weight(self, e):
    """Score element ``e`` from its ``class``/``id`` attributes and its tag.

    For each non-empty ``class`` and ``id`` value, 25 is subtracted when
    ``REGEXES['negativeRe']`` or ``self.negative_keywords`` finds a match
    and 25 is added when ``REGEXES['positiveRe']`` or
    ``self.positive_keywords`` does. The string ``'tag-' + e.tag`` is then
    tested against the keyword patterns with ``match`` (anchored at the
    start of the string) for a further +25 / -25.

    :param e: element exposing ``get(name, default)`` and ``tag``.
    :returns: the accumulated integer weight.
    """
    weight = 0
    for feature in (e.get('class', None), e.get('id', None)):
        if not feature:
            continue

        # Built-in heuristics first, then the user-supplied keywords.
        if REGEXES['negativeRe'].search(feature):
            weight -= 25
        if REGEXES['positiveRe'].search(feature):
            weight += 25
        if self.positive_keywords and self.positive_keywords.search(feature):
            weight += 25
        if self.negative_keywords and self.negative_keywords.search(feature):
            weight -= 25

    # Keywords of the form "tag-<name>" target the element type itself; the
    # label is only built when a keyword pattern is actually configured.
    if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag):
        weight += 25
    if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag):
        weight -= 25

    return weight
329350
@@ -569,6 +590,8 @@ def main():
569590 parser = OptionParser (usage = "%prog: [options] [file]" )
570591 parser .add_option ('-v' , '--verbose' , action = 'store_true' )
571592 parser .add_option ('-u' , '--url' , default = None , help = "use URL instead of a local file" )
593+ parser .add_option ('-p' , '--positive-keywords' , default = None , help = "positive keywords (separated with comma)" , action = 'store' )
594+ parser .add_option ('-n' , '--negative-keywords' , default = None , help = "negative keywords (separated with comma)" , action = 'store' )
572595 (options , args ) = parser .parse_args ()
573596
574597 if not (len (args ) == 1 or options .url ):
@@ -581,11 +604,14 @@ def main():
581604 file = urllib .urlopen (options .url )
582605 else :
583606 file = open (args [0 ], 'rt' )
584- enc = sys .__stdout__ .encoding or 'utf-8'
607+ enc = sys .__stdout__ .encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
585608 try :
586609 print Document (file .read (),
587610 debug = options .verbose ,
588- url = options .url ).summary ().encode (enc , 'replace' )
611+ url = options .url ,
612+ positive_keywords = options .positive_keywords ,
613+ negative_keywords = options .negative_keywords ,
614+ ).summary ().encode (enc , 'replace' )
589615 finally :
590616 file .close ()
591617
0 commit comments