@@ -86,12 +86,24 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
8686 :param input: string of the html content.
8787 :param positive_keywords: regex or list of patterns in classes and ids
8888 :param negative_keywords: regex or list of patterns in classes and ids
89- :param min_text_length:
90- :param retry_length:
89+ :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
90+ :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
91+ :param xpath: If set to True, adds an x="..." attribute to each HTML node,
92+ containing the XPath of that node in the original document (allows
93+ reconstructing the selected summary in the original document).
9194
9295 Example:
9396 positive_keywords=["news-item", "block"]
9497 negative_keywords=["mysidebar", "related", "ads"]
98+
99+ A Document instance is not reusable.
100+ You need to create a new Document() for each HTML file to process.
101+
102+ Provides four API methods:
103+ .title()
104+ .short_title()
105+ .content()
106+ .summary()
95107 """
96108 self .input = input
97109 self .html = None
@@ -131,23 +143,33 @@ def _parse(self, input):
131143 return doc
132144
133145 def content (self ):
146+ """Returns full document body"""
134147 return get_body (self ._html (True ))
135148
136149 def title (self ):
150+ """Returns document title"""
137151 return get_title (self ._html (True ))
138152
139153 def short_title (self ):
154+ """Returns cleaned up document title"""
140155 return shorten_title (self ._html (True ))
141156
142157 def get_clean_html (self ):
143- return clean_attributes (tounicode (self .html ))
158+ """
159+ An internal method, which can be overridden in subclasses, for example,
160+ to disable or to improve DOM-to-text conversion in .summary() method
161+ """
162+ return clean_attributes (tounicode (self .html ))
144163
145164 def summary (self , html_partial = False ):
146- """Generate the summary of the html docuemnt
165+ """
166+ Given an HTML file, extracts the text of the article.
147167
148168 :param html_partial: return only the div of the document, don't wrap
149169 in html and body tags.
150170
171+ Warning: It mangles the internal DOM representation of the HTML document,
172+ so always call the other API methods before this one.
151173 """
152174 try :
153175 ruthless = True
@@ -278,7 +300,7 @@ def get_link_density(self, elem):
278300 total_length = text_length (elem )
279301 return float (link_length ) / max (total_length , 1 )
280302
281- def score_paragraphs (self , ):
303+ def score_paragraphs (self ):
282304 MIN_LEN = self .min_text_length
283305 candidates = {}
284306 ordered = []
@@ -373,6 +395,7 @@ def score_node(self, elem):
373395 }
374396
375397 def remove_unlikely_candidates (self ):
398+ """Utility method"""
376399 for elem in self .html .iter ():
377400 s = "%s %s" % (elem .get ('class' , '' ), elem .get ('id' , '' ))
378401 if len (s ) < 2 :
@@ -382,6 +405,7 @@ def remove_unlikely_candidates(self):
382405 elem .drop_tree ()
383406
384407 def transform_misused_divs_into_paragraphs (self ):
408+ """Utility method"""
385409 for elem in self .tags (self .html , 'div' ):
386410 # transform <div>s that do not contain other block elements into
387411 # <p>s
0 commit comments