-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTP002.py
More file actions
104 lines (73 loc) · 2.51 KB
/
TP002.py
File metadata and controls
104 lines (73 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import obo
import pandas as pd
#from stemming.porter2 import stem
import csv
import nltk as nltk
from nltk.stem import WordNetLemmatizer
#stemmer = PorterStemmer()
#stemmer.stem('identified')
# Build one big lowercase string from the "IPIC Comment" column of the 2015
# and 2016 exports, tokenize it, strip stop tokens, and print frequency stats.

ipic_comments_2015 = pd.read_csv('2015-1.csv', index_col=0)
ipic_comments_2016 = pd.read_csv('2016-1.csv', index_col=0)

# Collect the comment cells into a list and join once: the original code
# concatenated with `+=` inside two duplicated loops, which is quadratic in
# total text size. pd.notna() replaces the fragile str(...) == 'nan' check
# (that check would also drop a genuine comment whose text is "nan").
comment_parts = []
for frame in (ipic_comments_2015, ipic_comments_2016):
    for lab, row in frame.iterrows():
        comment = row['IPIC Comment']
        if pd.notna(comment):
            comment_parts.append(str(comment))
fullwordstring = " ".join(comment_parts)

lower_fullwordstring = fullwordstring.lower()
# obo is a project-local helper module: myTokenizer normalizes the NLTK
# tokens and removeMyStopTokens filters the project's stop-token list.
tokens = obo.myTokenizer(nltk.word_tokenize(lower_fullwordstring))
keyword_tokens = obo.removeMyStopTokens(tokens, obo.myStopTokens)
fulltext = nltk.Text(keyword_tokens)
print(fulltext)

# Frequency distribution over the filtered tokens.
fdist1 = nltk.FreqDist(fulltext)
print(fdist1)

print("most common 100 words")
for word_dist in fdist1.most_common(100):
    print(word_dist)
print('\n')
# (Optional cumulative plot: fdist1.plot(50, cumulative=True))

print("-----------------------------------")
print("hapaxes")
# Hapaxes = tokens that occur exactly once in the corpus.
print(fdist1.hapaxes())