forked from abhishek-ch/MachineLearning-using-R
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTwitterBasics.R
More file actions
238 lines (174 loc) · 7.4 KB
/
TwitterBasics.R
File metadata and controls
238 lines (174 loc) · 7.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#https://sites.google.com/site/miningtwitter/questions/talking-about/wordclouds/wordcloud1
#http://www.rdatamining.com/examples/text-mining
#https://sites.google.com/site/miningtwitter/basics/getting-data/by-twitter
#http://www.r-bloggers.com/r-text-mining-on-twitter-prayformh370-malaysia-airlines/
#twitter authentication - http://thinktostart.com/twitter-authentification-with-r/
#my twitter developer App
#https://apps.twitter.com/app/6536937/keys
library(devtools)
#install_github is a function of the devtools package
# NOTE(review): install_github() re-installs twitteR from GitHub on EVERY run
# of this script; consider commenting it out after the first successful
# install, or guarding it with a requireNamespace() check.
install_github("geoffjentry/twitteR", username="geoffjentry")
library(RCurl)
library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(devtools)
library(stringr)
# OAuth endpoints for the (legacy) Twitter v1.1 API.
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# SECURITY: API credentials are hard-coded and committed to a public
# repository. They should be revoked immediately and loaded from the
# environment instead, e.g. Sys.getenv("TWITTER_API_KEY").
apiKey <- "MIgAEnO0XHTPKdMv3qiGKr6nu"
apiSecret <- "CMYO2quM7fUzcVuvx8JjALiKjC9cnpXeJFqQLtv2pnECJCCZKz"
access_token <- "69009666-XkI1bcxXtE4qXfOtbRYCgkiJJvpCfsmS0fq4OSq9d"
access_token_secret <- "w89WtxJDAwakPToMqoFtpQYJIfht6YS3a8136hpcyW7eG"
# Register the OAuth credentials with the twitteR session.
setup_twitter_oauth(apiKey,apiSecret,access_token,access_token_secret)
# Pull up to 2000 tweets tagged #websummit (returns a list of status objects).
mach_tweets = searchTwitter("#websummit", n=2000)
#extract the text from tweets in a vector as R easily understands that
#later I will store the same in a file so that I could do the analysis in Java/Python :)
#as its easy over there ..hopefully
mach_text = sapply(mach_tweets, function(x) x$getText())
#to avoid the error http://stackoverflow.com/questions/9637278/r-tm-package-invalid-input-in-utf8towcs
# Replace every non-printable / non-graphical character with a space.
mach_text=str_replace_all(mach_text,"[^[:graph:]]", " ")
#convTweets <- iconv(mach_text, to = "utf-8")
# create a corpus (one document per tweet)
mach_corpus = Corpus(VectorSource(mach_text))
#tm_map(mach_corpus, function(x) iconv(x, to='UTF-8-MAC', sub='byte'))
#tm_map(mach_corpus, function(x) iconv(enc2utf8(x), sub = "byte"))
options(mc.cores=2)
# create document term matrix applying some transformations:
# strip punctuation/numbers, lower-case, and drop English stopwords plus a
# hand-picked list of extra high-frequency filler words.
# NOTE(review): minWordLength was deprecated in later tm releases in favour
# of wordLengths = c(1, Inf) -- confirm the installed tm still accepts it.
tdm = TermDocumentMatrix(mach_corpus,
control = list(removePunctuation = TRUE,
stopwords = c("for", "all", "this","your","try","from","its","off",
"has","well","are","will","hey","let","the","but",
"that","can","make","open","get","out",stopwords("english")),
removeNumbers = TRUE, tolower = TRUE,
minWordLength=1)
)
# Explore the matrix: terms appearing at least 7 times, and terms that
# correlate with "awesome" at r >= 0.30.
findFreqTerms(tdm, lowfreq=7)
findAssocs(tdm, 'awesome', 0.30)
# define tdm as matrix
m = as.matrix(tdm)
# get word counts in decreasing order
word_freqs = sort(rowSums(m), decreasing=TRUE)
# create a data frame with words and their frequencies
dm = data.frame(word=names(word_freqs), freq=word_freqs)
#http://www.r-bloggers.com/word-cloud-in-r/
# plot wordcloud
# FIX: the original call passed freq=6, but 'freq' is wordcloud()'s second
# positional argument (already supplied as dm$freq), so naming it caused an
# argument-matching error. The intended frequency cutoff is min.freq.
wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"), min.freq=6)
# Reuse the Dark2 palette but drop the two lightest colours for readability.
pal <- brewer.pal(8, "Dark2")
pal <- pal[-(1:2)]
wordcloud(dm$word, dm$freq,scale=c(1,2),min.freq=10,max.words=100, random.order=F, rot.per=.15, colors=pal, vfont=c("sans serif","plain"))
# save the image in png format (12x8 inches at 300 dpi)
png("summit.png", width=12, height=8, units="in", res=300)
wordcloud(dm$word, dm$freq, random.order=FALSE, colors=brewer.pal(8, "Dark2"))
dev.off()
##################################Sentiment Analysis################################
#https://sites.google.com/site/miningtwitter/questions/sentiment/sentiment
#http://stackoverflow.com/questions/15194436/is-there-any-other-package-other-than-sentiment-to-do-sentiment-analysis-in-r
# One-time installation of the archived 'sentiment' package and its 'Rstem'
# dependency (neither is available on CRAN any more).
# NOTE(review): the first install.packages() points at a local Windows path on
# one specific machine and will fail everywhere else; the omegahat line below
# is the portable alternative -- keep one of the two, not both.
install.packages("C:/Users/achoudhary/Downloads/Rstem_0.4-1.zip", repos = NULL, type="source")
install.packages("Rstem", repos = "http://www.omegahat.org/R", type="source")
download.file("http://cran.r-project.org/src/contrib/Archive/sentiment/sentiment_0.2.tar.gz", "sentiment.tar.gz")
install.packages("sentiment.tar.gz", repos=NULL, type="source")
library(sentiment)
library(twitteR)
library(plyr)
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
library(stringr)
# Duplicate of the OAuth setup at the top of the file, repeated so this
# section can be run standalone.
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# SECURITY: hard-coded credentials committed to a public repo -- revoke them
# and load from the environment instead.
apiKey <- "MIgAEnO0XHTPKdMv3qiGKr6nu"
apiSecret <- "CMYO2quM7fUzcVuvx8JjALiKjC9cnpXeJFqQLtv2pnECJCCZKz"
access_token <- "69009666-XkI1bcxXtE4qXfOtbRYCgkiJJvpCfsmS0fq4OSq9d"
access_token_secret <- "w89WtxJDAwakPToMqoFtpQYJIfht6YS3a8136hpcyW7eG"
setup_twitter_oauth(apiKey,apiSecret,access_token,access_token_secret)
#fetch tweets
nexus = searchTwitter("#websummit", n=2500)
# get the text out of each status object
nexus = sapply(nexus, function(x) x$getText())
#Avoid the non utf-8 characters: replace anything non-printable with a space
nexus=str_replace_all(nexus,"[^[:graph:]]", " ")
# Generic tweet cleanup, adapted from a common text-mining snippet.
# FIX: the snippet this was copied from operated on a variable named
# 'some_txt', which does not exist in this script -- the first two gsub()
# calls therefore errored out (and would have discarded the cleanup above).
# All steps must operate on 'nexus'.
# remove retweet entities
nexus = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", nexus)
# remove at people
nexus = gsub("@\\w+", "", nexus)
# remove punctuation
nexus = gsub("[[:punct:]]", "", nexus)
# remove numbers
nexus = gsub("[[:digit:]]", "", nexus)
# remove html links
nexus = gsub("http\\w+", "", nexus)
# remove unnecessary spaces: collapse runs of 2+ blanks/tabs, then trim ends
nexus = gsub("[ \t]{2,}", "", nexus)
nexus = gsub("^\\s+|\\s+$", "", nexus)
# Lower-case a character value, returning NA instead of raising an error
# when tolower() fails (e.g. on invalid multi-byte input).
try.error <- function(x)
{
  tryCatch(tolower(x), error = function(e) NA)
}
# Lower-case every tweet via try.error (element-wise with sapply).
lowered = sapply(nexus, try.error)
# Drop tweets whose conversion failed (NA), then strip the element names
# that sapply attached from the input vector.
lowered = lowered[!is.na(lowered)]
nexus = unname(lowered)
# classify emotion
# classify_emotion() comes from the archived 'sentiment' package; with
# algorithm="bayes" it scores each document against each emotion class.
class_emo = classify_emotion(nexus, algorithm="bayes", prior=1.0)
# get emotion best fit
# NOTE(review): assumes column 7 of the result matrix is the BEST_FIT label
# -- confirm against the installed sentiment version.
emotion = class_emo[,7]
# substitute NA's by "unknown" (documents with no winning emotion class)
emotion[is.na(emotion)] = "unknown"
# classify polarity (positive / negative / neutral)
class_pol = classify_polarity(nexus, algorithm="bayes")
# get polarity best fit
# NOTE(review): assumes column 4 is the BEST_FIT polarity label.
polarity = class_pol[,4]
# data frame with results
sent_df = data.frame(text=nexus, emotion=emotion,
polarity=polarity, stringsAsFactors=FALSE)
# sort data frame:
# re-level the emotion factor so ggplot orders bars by descending frequency
sent_df = within(sent_df,
emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))
# plot distribution of emotions (one bar per emotion, most frequent first)
# NOTE(review): ..count.. is the pre-ggplot2-3.4 spelling of after_stat(count)
# -- fine for the era of this script, update if ggplot2 is upgraded.
ggplot(sent_df, aes(x=emotion)) +
  geom_bar(aes(y=..count.., fill=emotion)) +
  scale_fill_brewer(palette="Dark2") +
  labs(x="emotion categories", y="number of tweets") +
  # FIX: title said "Google Neus 6 Tweets" -- a typo'd leftover from the
  # Nexus-6 analysis this snippet was copied from; these tweets are #websummit.
  ggtitle("Sentiment Analysis of Tweets about WebSummit\n(classification by emotion)"
)
# plot distribution of polarity
ggplot(sent_df, aes(x=polarity)) +
  geom_bar(aes(y=..count.., fill=polarity)) +
  scale_fill_brewer(palette="RdGy") +
  labs(x="polarity categories", y="number of tweets") +
  ggtitle("Sentiment Analysis of Tweets about WebSummit\n(Dublin WebSummit)")
# separating text by emotion: build one concatenated "document" per emotion
# level so the comparison cloud has one column per emotion.
emos = levels(factor(sent_df$emotion))
nemo = length(emos)
emo.docs = rep("", nemo)
# FIX: use seq_len(nemo) instead of 1:nemo -- 1:nemo iterates over c(1, 0)
# when there are no emotion levels, indexing out of bounds.
for (i in seq_len(nemo))
{
  tmp = nexus[emotion == emos[i]]
  emo.docs[i] = paste(tmp, collapse=" ")
}
# remove stopwords
emo.docs = removeWords(emo.docs, stopwords("english"))
# create corpus (one document per emotion)
corpus = Corpus(VectorSource(emo.docs))
tdm = TermDocumentMatrix(corpus)
tdm = as.matrix(tdm)
colnames(tdm) = emos
# comparison word cloud
# NOTE(review): brewer.pal() requires n >= 3; with fewer than 3 emotion
# levels it warns and returns a 3-colour palette -- verify if that matters.
comparison.cloud(tdm, colors = brewer.pal(nemo, "Dark2"),
scale = c(3,.5), random.order = FALSE, title.size = 1.5)