-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.txt
More file actions
20 lines (17 loc) · 753 Bytes
/
tokenizer.txt
File metadata and controls
20 lines (17 loc) · 753 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
def get_bigrams(myString):
tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize(myString)
stemmer = PorterStemmer()
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
for bigram_tuple in bigrams:
x = "%s %s" % bigram_tuple
tokens.append(x)
result = [' '.join([stemmer.stem(w).lower() for w in x.split()]) for x in tokens if x.lower() not in stopwords.words('english') and len(x) > 8]
return result