-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathobo.py
More file actions
32 lines (24 loc) · 752 Bytes
/
obo.py
File metadata and controls
32 lines (24 loc) · 752 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# obo.py
def stripTags(pageContents):
    """Return the text of *pageContents* with markup tags removed.

    Only the region from the first ``<p>`` up to (not including) the last
    ``<br/>`` is considered.  If either marker is absent, the corresponding
    end of the string is used instead, so input without those markers is
    still processed in full rather than being truncated.

    Everything from a ``<`` up to and including the next ``>`` is dropped;
    a ``>`` that appears outside of a tag is kept as ordinary text.

    Args:
        pageContents: The page source as a string.

    Returns:
        The tag-free text as a string (empty if the selected region is empty).
    """
    startLoc = pageContents.find("<p>")
    endLoc = pageContents.rfind("<br/>")
    # find()/rfind() return -1 when the marker is missing.  Used directly as
    # a slice index, -1 would start at the last character or silently drop
    # it, so fall back to the real boundaries of the string.
    if startLoc == -1:
        startLoc = 0
    if endLoc == -1:
        endLoc = len(pageContents)
    pageContents = pageContents[startLoc:endLoc]

    inside = False
    kept = []  # collect characters and join once instead of repeated +=
    for char in pageContents:
        if char == '<':
            inside = True
        elif inside:
            # Still within a tag: discard the character, and leave the tag
            # once its closing '>' is reached.
            if char == '>':
                inside = False
        else:
            kept.append(char)
    return ''.join(kept)
# Given a text string, split it on runs of non-word characters (anything
# other than letters, digits, or underscore, using the Unicode definitions)
# and return the resulting list of pieces.
def stripNonAlphaNum(text):
    """Split *text* on runs of non-word characters and return the pieces.

    The separator is the regular expression ``\\W+`` compiled with
    ``re.UNICODE``.  When *text* starts or ends with a separator, the
    returned list contains an empty string at that end; an empty *text*
    yields ``['']``.

    Args:
        text: The string to split.

    Returns:
        A list of the substrings found between separators.
    """
    import re

    pieces = re.split(r'\W+', text, flags=re.UNICODE)
    return pieces
def removeStopwords(wordlist, stopwords):
    """Return the items of *wordlist* that are not contained in *stopwords*.

    Order and duplicates of the remaining items are preserved, and
    *wordlist* itself is not modified.

    Args:
        wordlist: Iterable of words to filter.
        stopwords: Container tested with ``in`` for each word.

    Returns:
        A new list holding the words that were kept.
    """
    kept = []
    for word in wordlist:
        if word in stopwords:
            continue
        kept.append(word)
    return kept