forked from azk0019/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze.py
More file actions
74 lines (58 loc) · 2.13 KB
/
analyze.py
File metadata and controls
74 lines (58 loc) · 2.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
import string
# Load the CSV file into a dataframe
df = pd.read_csv('scraper4.csv')

# Numeric sentiment label per review:
#   score > 3  -> positive (1)
#   score <= 3 -> negative (-1)
is_low_score = df['Score'].le(3)
df['Sentiment'] = np.where(is_low_score, -1, 1)

# Subsets of the reviews by sentiment label
positive = df.loc[df['Sentiment'].eq(1)]
negative = df.loc[df['Sentiment'].eq(-1)]

# Text version of the label, used as the histogram's x axis
is_negative = df['Sentiment'].eq(-1)
df['Sentiments'] = np.where(is_negative, 'negative', 'positive')

# Display histogram of positive and negative sentiment counts
fig = px.histogram(df, x="Sentiments")
fig.update_layout(title_text='Apartments Sentiments in Chicago')
fig.show()
# Remove all rows whose Summary is blank. Take a copy so the column
# assignment below writes to an independent frame, not to a filtered view.
df = df.dropna(subset=['Summary']).copy()
# Remove all punctuation from the summaries.
# In Python 3, str.translate takes a single translation table (the
# two-argument form translate(None, chars) only existed in Python 2), and
# the cleaned text has to be assigned back to the column: rebinding a loop
# variable leaves the dataframe unchanged.
punctuation_table = str.maketrans('', '', string.punctuation)
# astype(str) guards against non-string cells, for which the .str accessor
# would otherwise fail or yield NaN.
df['Summary'] = df['Summary'].astype(str).str.translate(punctuation_table)
# Split dataframe into test and train sets: ~80% for training, ~20% for testing.
# Every row gets a uniform random draw in [0, 1); rows whose draw is at or
# below `percentage` go to the training set, so `percentage` is the expected
# fraction of rows used for training. The draw must be uniform
# (np.random.rand): with a standard-normal draw (np.random.randn) the
# threshold would not correspond to that fraction.
percentage = 0.8
index = df.index
# assign() returns a new frame, so this never writes into a filtered view
df = df.assign(rand_num=np.random.rand(len(index)))
train = df[df['rand_num'] <= percentage]
test = df[df['rand_num'] > percentage]
# Bag of words: the vocabulary is learned from the training summaries, and
# the test summaries are only transformed with that fitted vocabulary
vectorizer = CountVectorizer()
train_matrix = vectorizer.fit_transform(train['Summary'])
test_matrix = vectorizer.transform(test['Summary'])

# Features (word-count matrices) and labels (sentiment) for each split
x_train, y_train = train_matrix, train['Sentiment']
x_test, y_test = test_matrix, test['Sentiment']

# Fit a logistic regression classifier on the training split
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

# Predict sentiment labels for the test split
predictions = log_reg.predict(x_test)
# Test model's accuracy.
# scikit-learn metric functions take the true labels first and the predicted
# labels second. With the arguments reversed the confusion matrix comes out
# transposed and precision and recall are swapped in the report.
# Confusion matrix: rows are true labels, columns are predicted labels
print(confusion_matrix(y_test, predictions))
# Classification report: per-class precision, recall and F1 score
print(classification_report(y_test, predictions))