@@ -67,6 +67,8 @@ def text_length(i):
6767def compile_pattern (elements ):
6868 if not elements :
6969 return None
70+ elif isinstance (elements , (list , tuple )):
71+ return list (elements )
7072 elif isinstance (elements , regexp_type ):
7173 return elements
7274 else :
@@ -78,7 +80,7 @@ class Document:
7880 """Class to build a etree document out of html."""
7981
8082 def __init__ (self , input , positive_keywords = None , negative_keywords = None ,
81- url = None , min_text_length = 25 , retry_length = 250 , ):
83+ url = None , min_text_length = 25 , retry_length = 250 , xpath = False ):
8284 """Generate the document
8385
8486 :param input: string of the html content.
@@ -99,10 +101,16 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
99101 self .url = url
100102 self .min_text_length = min_text_length
101103 self .retry_length = retry_length
104+ self .xpath = xpath
102105
103106 def _html (self , force = False ):
104107 if force or self .html is None :
105108 self .html = self ._parse (self .input )
109+ if self .xpath :
110+ root = self .html .getroottree ()
111+ for i in self .html .getiterator ():
112+ #print root.getpath(i)
113+ i .attrib ['x' ] = root .getpath (i )
106114 return self .html
107115
108116 def _parse (self , input ):
0 commit comments