forked from buriy/python-readability
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path test.py
More file actions
97 lines (82 loc) · 2.12 KB
/
test.py
File metadata and controls
97 lines (82 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
import readability
from bs4 import BeautifulSoup
from textacy.preprocess import preprocess_text
import json
def cleaning(text):
    """Repair mojibake/unicode glitches in *text* and return the result."""
    return preprocess_text(text, fix_unicode=True)
def cleaning_text(text):
    """Aggressively normalize *text* for matching: fix unicode, lowercase,
    drop digits and punctuation, then remove the literal "number" tokens
    (presumably textacy's digit placeholder — confirm against the textacy
    version in use) and collapse all whitespace runs to single spaces.
    """
    normalized = preprocess_text(
        text,
        no_numbers=True,
        fix_unicode=True,
        lowercase=True,
        no_punct=True,
    )
    without_placeholders = normalized.replace("number", "")
    return " ".join(without_placeholders.split())
# Article under test; the URL is kept verbatim, tracking parameters included.
article_url = 'https://www.medcom.id/olahraga/sports-lainnya/GKdWa2dk-dua-emas-jadi-target-timnas-bridge-di-asian-games-2018?utm_source=all&utm_medium=allfeed&utm_campaign=allpartnership'
response = requests.get(article_url)

# class/id fragments that should lower a node's readability score
negative_keywords = ["sum", "strong", "credits", "header"]

# class/id fragments that should raise a node's readability score
positive_keywords = ["detail", "page", "content-article"]

# class/id fragments marking nodes unlikely to contain article content
unlikely_candidates = [
    "sum", "related", "baca", "juga", "video",
    "inner-link-baca-juga", "iframe", "caption",
]

# Boilerplate markers, pre-lowercased with whitespace removed; paragraphs
# matching any of these are dropped from the extracted article body.
p_exclude = [
    'saksikantayanganvideomenarikberikutini:',
    'simakvideomenarikberikutdibawah:',
    'simakjugavideoberikutini:',
    'bacaselengkapnya:',
    'bacajuga',
    'baca:',
    'videopilihan',
    'laporanwartawan',
    'editor:',
    'copyright',
    'tags',
    'sumber:',
    'penulis:',
    'pewarta:',
    'followinstagramkami',
]
# Compile the boilerplate phrases into one regex object for repeated matching.
p_exclude = readability.compile_pattern(p_exclude)

# Build the readability Document over the fetched HTML, passing the tuned
# scoring hints defined above.
doc = readability.Document(
    response.text,
    negative_keywords=negative_keywords,
    positive_keywords=positive_keywords,
    unlikely_candidates=unlikely_candidates,
)
soup = BeautifulSoup(doc.summary(), "html5lib")

# Flatten every <p> element into individual cleaned lines of text.
all_p = []
for p in soup.select("p"):
    text_p = cleaning(p.get_text())
    # BUGFIX: the original check `text_p.find("\n") > 0` treated a newline at
    # position 0 as "no newline" (find returns 0, which fails `> 0`), leaving
    # such paragraphs unsplit. Membership test covers every position.
    if "\n" in text_p:
        all_p.extend(text_p.split("\n"))
    else:
        all_p.append(text_p)

# Keep non-empty lines that do not match the boilerplate pattern.
content = []
for text_p in all_p:
    if not text_p:
        continue
    try:
        # Match against a lowercased, whitespace-free copy so spacing
        # differences cannot dodge the exclusion regex.
        if not p_exclude.search(text_p.lower().replace(" ", "")):
            content.append(text_p)
    except Exception:
        # Best effort: if the pattern lookup fails, keep the paragraph
        # rather than silently losing article text.
        content.append(text_p)

content = "\n\n".join(content)

result = {
    "title": doc.title(),
    "short_title": doc.short_title(),
    "content": content,
    "content_lower": cleaning_text(content),
}
print(json.dumps(result))