-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathformpharses.py
More file actions
38 lines (25 loc) · 969 Bytes
/
formpharses.py
File metadata and controls
38 lines (25 loc) · 969 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
# Matches any single one of the characters , ; ! : ( ) |
# generate_phrases() splits the cleaned text on these before grouping words.
char_splitter = re.compile("[,;!:()|]")
def remove_special_characters(text):
    """Return *text* with a fixed set of special characters deleted.

    The characters removed are: curly braces, the at-sign, double and
    single quotes, the dollar sign, the percent sign, the backslash and
    the asterisk. Every other character (including whitespace and the
    punctuation used to split phrases) is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the characters listed above removed.
    """
    # Raw string on purpose: a non-raw literal would need "\*", which is an
    # invalid string escape (warned about by recent Python versions). Inside
    # the character class, "\\" is the regex escape for one literal backslash.
    return re.sub(r"""[{}@"$%\\*']""", "", text)
def generate_phrases(text, stopwords):
    """Split *text* into candidate phrases delimited by stopwords and punctuation.

    Processing steps:

    1. Special characters are stripped with ``remove_special_characters``.
    2. Each whitespace-separated token is lower-cased, except tokens that
       are entirely upper-case and longer than one character, which are
       kept as written.
    3. The text is cut into segments on the characters matched by
       ``char_splitter``.
    4. Within each segment, every maximal run of consecutive non-stopword
       tokens becomes one phrase (tokens joined by a single space).

    Args:
        text: The input string.
        stopwords: A container supporting ``in``; tokens found in it act
            as phrase boundaries and never appear in the output. Matching
            is done on the case-normalised token from step 2.

    Returns:
        A list of non-empty phrase strings in order of appearance. Empty
        phrases (which the join-then-split approach would otherwise produce
        for adjacent boundaries and at the end of the text) are not emitted.
    """
    text = remove_special_characters(text)
    # Normalise case, but leave multi-character all-caps tokens untouched.
    text = " ".join(
        t if t.isupper() and len(t) > 1 else t.lower() for t in text.split()
    )

    phrases = []
    for segment in char_splitter.split(text):
        current = []  # words of the phrase currently being built
        for word in segment.split():
            if word in stopwords:
                # A stopword closes the current phrase; consecutive
                # stopwords simply find nothing to close.
                if current:
                    phrases.append(" ".join(current))
                    current = []
            else:
                current.append(word)
        # The end of a segment is also a phrase boundary.
        if current:
            phrases.append(" ".join(current))
    return phrases