Skip to content

Commit 0c8f040

Browse files
committed
Updated docs for positive_keywords and negative_keywords, cleaner implementation.
1 parent 0e50b53 commit 0c8f040

3 files changed

Lines changed: 16 additions & 6 deletions

File tree

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ clean_venv:
3131
rm -rf .venv
3232

3333
develop: .venv/lib/python*/site-packages/readability-lxml.egg-link
34+
3435
.venv/lib/python*/site-packages/readability-lxml.egg-link:
3536
$(PY) setup.py develop
3637

readability/compat/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
"""
77
import sys
88
if sys.version_info[0] == 2:
9+
bytes_ = str
910
str_ = unicode
11+
1012
elif sys.version_info[0] == 3:
13+
bytes_ = bytes
1114
str_ = str

readability/readability.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from .htmls import get_body
1717
from .htmls import get_title
1818
from .htmls import shorten_title
19-
from .compat import str_
19+
from .compat import str_, bytes_
2020
from .debug import describe, text_content
2121

2222

@@ -67,14 +67,18 @@ def text_length(i):
6767
def compile_pattern(elements):
6868
if not elements:
6969
return None
70+
elif isinstance(elements, (str_, bytes_)):
71+
if isinstance(elements, bytes_):
72+
elements = str_(elements, 'utf-8')
73+
elements = elements.split(u',')
74+
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
7075
elif isinstance(elements, (list, tuple)):
7176
return list(elements)
7277
elif isinstance(elements, regexp_type):
7378
return elements
7479
else:
80+
raise Exception("Unknown format for the pattern")
7581
# assume string or string like object
76-
elements = elements.split(',')
77-
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
7882

7983
class Document:
8084
"""Class to build a etree document out of html."""
@@ -84,16 +88,18 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
8488
"""Generate the document
8589
8690
:param input: string of the html content.
87-
:param positive_keywords: regex or list of patterns in classes and ids
88-
:param negative_keywords: regex or list of patterns in classes and ids
91+
:param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
92+
:param negative_keywords: regex, list or comma-separated string of patterns in classes and ids
8993
:param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
9094
:param retry_length: Tunable. Set to a lower value for better detection of very small texts.
9195
:param xpath: If set to True, adds x="..." attribute to each HTML node,
9296
containing xpath path pointing to original document path (allows to
9397
reconstruct selected summary in original document).
9498
95-
Example:
99+
Examples:
96100
positive_keywords=["news-item", "block"]
101+
positive_keywords="news-item,block"
102+
positive_keywords=re.compile("news|block")
97103
negative_keywords=["mysidebar", "related", "ads"]
98104
99105
The Document class is not reentrant.

0 commit comments

Comments
 (0)