forked from azk0019/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCode.py
More file actions
306 lines (231 loc) · 9.33 KB
/
Code.py
File metadata and controls
306 lines (231 loc) · 9.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
import numpy as np
import nltk
nltk.download('vader_lexicon')
from nltk.stem import PorterStemmer
import re
import operator
import random
import scipy.stats
import numpy as np
from scipy.stats import multivariate_normal
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Number of hotel aspects rated in the data set (Value, Rooms, Location, ...).
numberAspects = 7
# Overall (first) rating of each parsed review; filled as a side effect of
# parseReviews and later read by assignAspectWeight.
oratings = []
def getStopWords(filename):
    """Read a stop-word list from *filename*, one word per line.

    Blank lines are skipped and duplicates are dropped while preserving
    first-seen order.

    Returns the stop words as a list of strings.
    """
    words = []
    seen = set()  # O(1) membership test instead of scanning the list each time
    # 'with' guarantees the handle is closed (the original never closed it)
    with open(filename, "r") as file:
        for line in file:
            line = line.strip()
            if line and line not in seen:
                seen.add(line)
                words.append(line)
    return words
def parseFeatureWords(filename):
    """Parse the aspect seed-word file.

    The file has one line per aspect: the aspect name followed by its
    feature words.  Each word is stemmed and lowercased so it matches the
    normalization done in parseReviews.

    Returns a list of `numberAspects` lists of normalized feature words.
    """
    featureWords = []
    ps = PorterStemmer()  # hoisted: one stemmer reused (was rebuilt per aspect)
    with open(filename) as file:
        for _ in range(numberAspects):
            words = file.readline().split()
            # drop the leading aspect name
            words.pop(0)
            featureWords.append([ps.stem(word).lower() for word in words])
    return featureWords
# preprocess data set
# remove reviews with missing aspect rating or doc length less than 50 words
# convert all words to lowercase
# remove punctuations and stop words, and terms occurring in less than 10 reviews
# use stemming
def parseReviews(filename, stopWords):
    """Parse a review file into tokenized sentences plus aspect ratings.

    The file is a sequence of 5-line records: author, content, date,
    ratings, blank.  The payload of a line is everything after the first
    '>'.  Reviews shorter than 50 words, or with any missing (-1) rating,
    are skipped.

    Side effect: appends each *kept* review's overall rating to the
    module-level `oratings` list.  (The original appended it before the
    -1 skip check, so `oratings` drifted out of alignment with the
    returned `ratings`; that is fixed here.)

    Returns (reviews, ratings, vocab):
      reviews - list of reviews; a review is a list of sentences; a
                sentence is a list of stemmed, lowercased, stop-word
                filtered tokens
      ratings - per-review lists of aspect rating ints (overall removed)
      vocab   - unique tokens in first-seen order

    NOTE(review): the header comment also promises removal of terms
    occurring in fewer than 10 reviews; that filter is not implemented.
    """
    reviews = []  # list of parsed reviews, each review is a list of sentences
    ratings = []
    vocab = []
    seenVocab = set()     # O(1) membership; vocab list preserves order
    ps = PorterStemmer()  # hoisted out of the per-sentence loop
    punctuations = '''!()-[]{};:'"\,./?@#$%^&*_~'''
    stripPunct = str.maketrans('', '', punctuations)
    with open(filename) as file:
        while True:
            authorLine = file.readline()
            if len(authorLine) == 0:
                # reached end of file
                break
            contentLine = file.readline()
            dateLine = file.readline()
            ratingLine = file.readline()
            blankLine = file.readline()
            contentWords = contentLine.split('>')[1]
            # skip short documents; the spec says 50 *words*
            # (the original compared the character count)
            if len(contentWords.split()) < 50:
                continue
            ratingsList = ratingLine.split('>')[1].split()
            # skip the review if any rating is missing
            if '-1' in ratingsList:
                continue
            ratingsInts = [int(r) for r in ratingsList]
            # first value is the overall rating; recorded only for kept
            # reviews so oratings stays aligned with ratings
            oratings.append(ratingsInts.pop(0))
            ratings.append(ratingsInts)
            # split by sentence, then normalize each one
            sentences = re.split(r'[!.?]+', contentWords)
            finalContentWords = []
            for sentence in sentences:
                # remove punctuation in one pass, then tokenize
                words = sentence.translate(stripPunct).split()
                wordArray = []
                for word in words:
                    word = ps.stem(word).lower()
                    if word not in stopWords:
                        wordArray.append(word)
                        if word not in seenVocab:
                            seenVocab.add(word)
                            vocab.append(word)
                if wordArray:
                    finalContentWords.append(wordArray)
            reviews.append(finalContentWords)
    return reviews, ratings, vocab
# go sentence by sentence
# classify a sentence as describing the topic whose feature words it has the most of
def assignTopics(reviews, featureWords):
    """Label every sentence of every review with an aspect/topic number.

    A sentence is assigned the topic whose feature-word list it hits most
    often; ties go to the lowest topic index.

    Returns a 2-D list: one topic number per sentence, per review.
    """
    topicAssignments = []
    numTopics = len(featureWords)
    for review in reviews:
        sentenceTopics = []
        for sentence in review:
            # count feature-word hits per topic
            hits = [0] * numTopics
            for token in sentence:
                for topicIdx in range(numTopics):
                    if token in featureWords[topicIdx]:
                        hits[topicIdx] += 1
            # index of the first maximum == lowest-numbered best topic
            sentenceTopics.append(hits.index(max(hits)))
        topicAssignments.append(sentenceTopics)
    return topicAssignments
# returns ratings of each topic for each review by sentiment analysis
def assignTopicRatings(reviews, topicAssignments, aspects):
    """Score each aspect of each review on a 1-5 scale via VADER sentiment.

    For every sentence, the mean compound sentiment of its words (in
    [-1, 1]) is credited to the sentence's assigned topic.  Per topic the
    sentence scores are *averaged* (the original summed them, so a topic
    with several sentences could leave the documented 1-5 range), then
    mapped linearly: rating = avg * 2 + 3, i.e. [-1, 1] -> [1, 5].
    A topic with no sentences gets the neutral rating 3.

    Returns a list (per review) of per-aspect ratings.
    """
    topicRatings = []
    sentimentAnalyzer = SentimentIntensityAnalyzer()
    numAspects = len(aspects)
    for i in range(len(reviews)):
        review = reviews[i]
        sums = [0.0] * numAspects   # summed sentence sentiments per topic
        counts = [0] * numAspects   # sentences assigned to each topic
        for j in range(len(review)):
            sentence = review[j]
            topic = topicAssignments[i][j]
            total = 0.0
            for word in sentence:
                total += sentimentAnalyzer.polarity_scores(word)['compound']
            # mean word sentiment, in [-1, 1]; sentences are non-empty
            # because parseReviews drops empty ones
            sums[topic] += total / len(sentence)
            counts[topic] += 1
        ratings = []
        for k in range(numAspects):
            avg = sums[k] / counts[k] if counts[k] else 0.0
            ratings.append(avg * 2 + 3)  # [-1, 1] -> [1, 5]
        topicRatings.append(ratings)
    return topicRatings
def assignAspectWeight(topicRatings):
    """Estimate per-review aspect weights by random search.

    For each review, 100 random weight vectors are drawn; the one whose
    weighted aspect-rating sum is most likely under a normal distribution
    centred on that review's actual overall rating (module global
    `oratings`, spread estimated from all overall ratings) is kept.

    Fixes over the original: the random draw and normalisation iterated
    over `aspect_weight` (empty on the first pass, so nothing was ever
    drawn) instead of `tmp_weight`; the stored best weight aliased
    `tmp_weight`, so later draws clobbered it; the prediction always used
    `topicRatings[0]` and the mean always `oratings[0]` rather than the
    current review's values; `multivariate_normal(mean, stdev, 0.6)`
    passed the std-dev as a covariance and 0.6 as `allow_singular`.

    Returns (topic_weights, pred_oratings): the per-review weight vectors
    and the resulting predicted overall ratings.
    """
    numAspects = len(topicRatings[0]) if topicRatings else 0
    # spread of the actual overall ratings, shared by every review
    mean_all = sum(oratings) / len(oratings)
    variance = sum((x - mean_all) ** 2 for x in oratings) / len(oratings)
    stdev = variance ** 0.5 or 1.0  # guard: zero spread degenerates the pdf
    topic_weights = []
    for idx, review in enumerate(topicRatings):
        # NOTE(review): assumes oratings is index-aligned with topicRatings
        # (true once parseReviews appends only for kept reviews) — confirm.
        mean = oratings[idx]
        best_weight = [1.0 / numAspects] * numAspects  # uniform fallback
        best_prob = 0
        for _ in range(100):
            # draw a random weight vector and normalize it to sum to 1
            tmp = [round(random.uniform(0, 1), 3) for _ in range(numAspects)]
            total = sum(tmp)
            if total == 0:
                continue  # astronomically unlikely all-zero draw
            tmp = [round(w / total, 4) for w in tmp]
            pred = sum(a * b for a, b in zip(review, tmp))
            # likelihood of this predicted overall rating
            prob = scipy.stats.norm(mean, stdev).pdf(pred)
            if prob > best_prob:
                best_prob = prob
                best_weight = tmp[:]  # copy; the original kept an alias
        topic_weights.append(best_weight)
    # predicted overall rating for every review under its best weights
    pred_oratings = [
        sum(a * b for a, b in zip(topicRatings[i], topic_weights[i]))
        for i in range(len(topicRatings))
    ]
    return topic_weights, pred_oratings
# calculate mean square error between actual ratings and the topic ratings found
def findMSE(topicRatings, ratings):
    """Mean squared error between predicted and actual aspect ratings.

    Averages the squared per-aspect differences over every aspect of
    every review.  Returns 0.0 for empty input (the original raised
    IndexError on `topicRatings[0]`).
    """
    if not topicRatings:
        return 0.0
    total = 0
    for predicted, actual in zip(topicRatings, ratings):
        total += sum((a - b) ** 2 for a, b in zip(actual, predicted))
    # normalize by (aspects per review) * (number of reviews)
    return total / (len(topicRatings[0]) * len(topicRatings))
def main():
    """Run the aspect-rating pipeline end to end: parse the data, assign
    topics, rate aspects via sentiment, learn aspect weights, and report
    the MSE against the true ratings."""
    aspects = ["Value", "Rooms", "Location", "Cleanliness",
               "Check in/front desk", "Service", "Business service"]
    stopWords = getStopWords("Data/StopWords.txt")
    featureWords = parseFeatureWords("Data/FeatureWords.txt")
    # reviews: review -> sentences -> words (3-D); ratings: per-review ints
    reviews, ratings, vocab = parseReviews("Data/Reviews/hotel_72572_parsed.txt", stopWords)
    # one topic label per sentence, then sentiment-based per-aspect ratings
    topicAssignments = assignTopics(reviews, featureWords)
    topicRatings = assignTopicRatings(reviews, topicAssignments, aspects)
    topicWeights, pred_oratings = assignAspectWeight(topicRatings)
    mse_val = findMSE(topicRatings, ratings)
    print("This is MSE: ", mse_val)
    print("\n")
    for ratingRow, weightRow in zip(topicRatings, topicWeights):
        print("This is topic Rating:")
        print(ratingRow)
        print("This is topic weight:")
        print(weightRow)
        print("\n")
# Run the pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()