11#!/usr/bin/env python
22from BeautifulSoup import NavigableString
3- from page_parser import parse
3+ from page_parser import parse , get_title , get_body
44import logging
55import re
66
@@ -33,24 +33,38 @@ def __init__(self, input, **options):
3333 self .options = defaultdict (lambda : None )
3434 for k , v in options .items ():
3535 self .options [k ] = v
36+ self .html = None
3637
37- def make_html (self ):
38- self .html = parse (self .input , self .options ['url' ])
39-
38+ def _html (self , force = False ):
39+ if force or self .html is None :
40+ self .html = parse (self .input , self .options ['url' ])
41+ return self .html
42+
4043 def content (self ):
44+ return get_body (self ._html ())
45+
46+ def title (self ):
47+ return get_title (self ._html ())
48+
49+ def summary (self ):
4150 ruthless = True
4251 while True :
43- self .make_html ( )
52+ self ._html ( True )
4453 [i .extract () for i in self .tags (self .html , 'script' , 'style' )]
4554
4655 if ruthless : self .remove_unlikely_candidates ()
4756 self .transform_misused_divs_into_paragraphs ()
4857 candidates = self .score_paragraphs (self .options .get ('min_text_length' , self .TEXT_LENGTH_THRESHOLD ))
4958 best_candidate = self .select_best_candidate (candidates )
50- if ruthless and best_candidate is None :
51- ruthless = False
52- continue
53- article = self .get_article (candidates , best_candidate )
59+ if best_candidate :
60+ article = self .get_article (candidates , best_candidate )
61+ else :
62+ if ruthless :
63+ ruthless = False
64+ # try again
65+ continue
66+ else :
67+ article = self .html .find ('body' ) or self .html
5468
5569 cleaned_article = self .sanitize (article , candidates )
5670 of_acceptable_length = len (cleaned_article or '' ) >= (self .options ['retry_length' ] or self .RETRY_LENGTH )
@@ -88,16 +102,19 @@ def get_article(self, candidates, best_candidate):
88102 if append :
89103 output .append (sibling )
90104
105+ if not output : output .append (best_candidate )
91106 return output
92107
93108 def select_best_candidate (self , candidates ):
94109 sorted_candidates = sorted (candidates .values (), key = lambda x : x ['content_score' ], reverse = True )
95- self .debug ("Top 5 canidates :" )
110+ self .debug ("Top 5 candidates :" )
96111 for candidate in sorted_candidates [:5 ]:
97112 elem = candidate ['elem' ]
98113 self .debug ("Candidate %s with score %s" % (describe (elem ), candidate ['content_score' ]))
99114
100- best_candidate = sorted_candidates [0 ] if len (sorted_candidates ) > 1 else { 'elem' : self .html .find ("body" ), 'content_score' : 0 }
115+ if len (sorted_candidates ) == 0 :
116+ return None
117+ best_candidate = sorted_candidates [0 ]
101118 self .debug ("Best candidate %s with score %s" % (describe (best_candidate ['elem' ]), best_candidate ['content_score' ]))
102119 return best_candidate
103120
@@ -108,7 +125,7 @@ def get_link_density(self, elem):
108125
109126 def score_paragraphs (self , min_text_length ):
110127 candidates = {}
111- elems = self .html . findAll ( "p" ) + self .html . findAll ( "td" )
128+ elems = self .tags ( self .html , "p" , "td" )
112129
113130 for elem in elems :
114131 parent_node = elem .parent
@@ -201,7 +218,7 @@ def sanitize(self, node, candidates):
201218 for header in self .tags (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" ):
202219 if self .class_weight (header ) < 0 or self .get_link_density (header ) > 0.33 : header .extract ()
203220
204- for elem in self .tags (node , "form" , "object" , " iframe" , "embed " ):
221+ for elem in self .tags (node , "form" , "iframe" ):
205222 elem .extract ()
206223
207224 # remove empty <p> tags
@@ -265,7 +282,7 @@ def sanitize(self, node, candidates):
265282 if not (self .options ['attributes' ]):
266283 el .attrMap = {}
267284
268- return str (node )
285+ return unicode (node )
269286
270287class HashableElement ():
271288 def __init__ (self , node ):
@@ -312,7 +329,7 @@ def main():
312329 else :
313330 file = open (args [0 ])
314331 try :
315- print Document (file .read (), debug = options .verbose ).content ( )
332+ print Document (file .read (), debug = options .verbose ).summary (). encode ( 'ascii' , 'ignore' )
316333 finally :
317334 file .close ()
318335
0 commit comments