# Source: MachineLearning-using-R/ExploredSentimentAnalysis.R at master · Pranav2134/MachineLearning-using-R
#
# 217 lines (169 loc) · 7.75 KB
#https://sites.google.com/site/miningtwitter/questions/sentiment
#https://sites.google.com/site/miningtwitter/questions/sentiment/analysis
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)
#The order of the columns in the CSV matters for the smoot_spline method to work properly
# Choose the directory that holds the sentiment word lists, based on the
# user running the script. `path` must end with "/" because file names are
# appended to it directly further down.
# identical() on [["user"]] also copes with Sys.info() returning NULL, where
# the `==` comparison would produce a zero-length condition and fail.
if (identical(Sys.info()[["user"]], "achoudhary")) {
  path <- "D:/Work/RWorkSpace/Git/"
} else {
  path <- "/Users/abhishekchoudhary/MachineLearning/"
}
# function score.sentiment
#
# Scores every sentence as
#   (number of words found in pos.words) - (number of words found in neg.words)
#
# Parameters
#   sentences: vector of text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#   .progress: passed to laply() to control the progress bar
# Returns
#   data frame with one row per sentence: `text` (the input as given) and
#   `score` (the integer-valued sentiment score)
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none') {
  # tolower() can raise an error on text it cannot process (e.g. invalid
  # multibyte strings); in that case use a missing value so that one bad
  # sentence does not abort the whole run. It then simply scores 0.
  tryTolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # Clean one sentence, split it into words and count lexicon hits.
  score.one <- function(sentence, pos.words, neg.words) {
    # remove punctuation
    sentence <- gsub("[[:punct:]]", "", sentence)
    # remove control characters
    sentence <- gsub("[[:cntrl:]]", "", sentence)
    # remove digits
    sentence <- gsub('\\d+', '', sentence)
    # remove html links (punctuation is already gone, so a URL is now a
    # single "http..." token)
    sentence <- gsub("http\\w+", "", sentence)
    # collapse runs of spaces/tabs to ONE space; replacing them with ""
    # would glue the neighbouring words together and hide them from the
    # lexicon lookup below
    sentence <- gsub("[ \t]{2,}", " ", sentence)
    # trim leading/trailing whitespace
    sentence <- gsub("^\\s+|\\s+$", "", sentence)
    # lower-case, tolerating tolower() failures
    sentence <- tryTolower(sentence)
    # split sentence into words with str_split (stringr package)
    words <- unlist(str_split(sentence, "\\s+"))
    # match() gives the position of the matched term or NA;
    # we only need to know whether each word matched
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))
    # final score
    sum(pos.matches) - sum(neg.matches)
  }

  # create simple array of scores with laply
  scores <- laply(sentences, score.one, pos.words, neg.words,
                  .progress = .progress)
  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}
# Build the full paths of the two sentiment lexicons stored under `path`
posText = paste0(path, "positive_words.txt")
negText = paste0(path, "negative_words.txt")
# Read each lexicon into a character vector, one element per line of the file
pos = readLines(posText)
neg = readLines(negText)
#Twitter API details
# OAuth endpoints. They are not referenced anywhere else in the visible part
# of this script; kept so existing references keep working.
reqURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
# Credentials are read from environment variables instead of being hard-coded:
# secrets committed to source control are exposed to everyone who can read the
# repository. Set them in the shell or in ~/.Renviron before running, e.g.
#   TWITTER_API_KEY=...
#   TWITTER_API_SECRET=...
apiKey <- Sys.getenv("TWITTER_API_KEY")
apiSecret <- Sys.getenv("TWITTER_API_SECRET")
# Only needed for the four-argument call that is commented out below.
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
if (!nzchar(apiKey) || !nzchar(apiSecret)) {
  stop("Set the TWITTER_API_KEY and TWITTER_API_SECRET environment variables ",
       "before running this script.", call. = FALSE)
}
#setup_twitter_oauth(apiKey,apiSecret,access_token,access_token_secret)
# Same two-argument call as before, now using the variables rather than
# repeating the key and secret as literals.
setup_twitter_oauth(apiKey, apiSecret)
#tweets for phone
# NOTE: the variable names below are historical and do NOT describe the search
# terms (e.g. `iphone` holds the "oneplus2" results). They are left unchanged
# in case other code refers to them; the labels attached to the scores further
# down are what identify each group.
iphone <- searchTwitter("oneplus2", n=2000, lang="en")
nexus6 <- searchTwitter("iphone6s", n=2000, lang="en")
samsung <- searchTwitter("S6", n=2000, lang="en")
#total tweets of each item (a search may return fewer than the requested n)
totaltweets <- c(length(iphone), length(nexus6), length(samsung))
#join all tweets, keeping the order oneplus2, iphone6s, S6
allcontents <- c(iphone, nexus6, samsung)
# vapply() guarantees a character vector even when no tweets came back
allcontents <- vapply(allcontents, function(x) x$getText(), character(1))
#to avoid the error http://stackoverflow.com/questions/9637278/r-tm-package-invalid-input-in-utf8towcs
allcontents <- str_replace_all(allcontents, "[^[:graph:]]", " ")
# Corpus() and VectorSource() belong to the tm package, which this script does
# not attach with library(); qualify the calls so they can be found.
all_corpus <- tm::Corpus(tm::VectorSource(allcontents))
# apply function score.sentiment
scores <- score.sentiment(allcontents, pos, neg, .progress='text')
# add variables to data frame
# Labels follow the search terms actually used above, in the same order as
# `totaltweets`, so each tweet is attributed to the phone it was fetched for.
scores$phones <- factor(rep(c("OnePlus2", "iPhone6s", "SamsungS6"), totaltweets))
# flag strongly positive / strongly negative tweets (1 = yes, 0 = no)
scores$very.pos <- as.numeric(scores$score >= 2)
scores$very.neg <- as.numeric(scores$score <= -2)
# how many very positives and very negatives
numpos <- sum(scores$very.pos)
numneg <- sum(scores$very.neg)
# global score: percentage of "very" tweets that are very positive
# (NaN when there are no very positive and no very negative tweets)
global_score <- round( 100 * numpos / (numpos + numneg) )
#http://stackoverflow.com/questions/15651084/ggplot2-mapping-variable-to-y-and-using-stat-bin?lq=1
# Fill palette, one colour per phone. It is used by all three plots but is not
# defined anywhere in the visible script, so provide a default unless a
# character vector called `cols` already exists.
if (!exists("cols", mode = "character")) {
  cols <- c("#7CAE00", "#00BFC4", "#F8766D")
}
# barplot of average score
meanscore <- tapply(scores$score, scores$phones, mean)
df <- data.frame(phones=names(meanscore), meanscore=meanscore)
# Order the phones by their mean score. The reordered factor has to be stored
# in `phones` (the column the plot maps to x and fill); otherwise the bars stay
# in alphabetical order while the palette below is indexed by score order, and
# the colours no longer agree with the other two plots.
df$phones <- reorder(df$phones, df$meanscore)
g1 <- ggplot(df, aes(y=meanscore)) +
  geom_bar(data=df, aes(x=phones, fill=phones), stat = "identity") +
  scale_fill_manual(values=cols[order(df$meanscore)]) +
  ggtitle("Average Sentiment Score")
# barplot of average very positive (share of tweets with score >= 2)
phone_pos <- ddply(scores, .(phones), summarise, mean_pos=mean(very.pos))
phone_pos$phones <- reorder(phone_pos$phones, phone_pos$mean_pos)
g2 <- ggplot(phone_pos, aes(y=mean_pos)) +
  geom_bar(data=phone_pos, aes(x=phones, fill=phones), stat = "identity") +
  scale_fill_manual(values=cols[order(phone_pos$mean_pos)]) +
  ggtitle("Average Very Positive Sentiment Score")
# barplot of average very negative (share of tweets with score <= -2)
phone_neg <- ddply(scores, .(phones), summarise, mean_neg=mean(very.neg))
phone_neg$phones <- reorder(phone_neg$phones, phone_neg$mean_neg)
g3 <- ggplot(phone_neg, aes(y=mean_neg)) +
  geom_bar(data=phone_neg, aes(x=phones, fill=phones), stat = "identity") +
  scale_fill_manual(values=cols[order(phone_neg$mean_neg)]) +
  ggtitle("Average Very Negative Sentiment Score")
# Multiple plot function
#
# Draws several plots on one page using a grid layout.
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - file:   Not used by the function body; kept so the signature is unchanged.
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# The function is defined BEFORE it is called below: in a script that runs top
# to bottom, calling it first fails because the function does not exist yet.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots == 1) {
    # A single plot needs no layout
    print(plots[[1]])
  } else {
    # Set up the page (grid functions are called with grid:: so the function
    # does not depend on the package being attached)
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )
    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = grid::viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
    }
  }
}

# Show the three sentiment bar charts together, two per row
multiplot(g1, g2, g3, cols=2)
# (GitHub page residue below; not part of the script)
# Provide feedback
#
# Saved searches
#
# Use saved searches to filter your results more quickly
#
# Files / Expand file tree
#
# ExploredSentimentAnalysis.R
#
# Latest commit
#
# History
#
# ExploredSentimentAnalysis.R
#
# File metadata and controls