## ----cache=FALSE, echo=FALSE,include=FALSE-------------------------------
# Chunk set-up: run the shared knitr hooks and set the figure path prefix.
# NOTE(review): later chunks call mutate()/filter()/glimpse(), ggplot() and the
# formula form of tally() without any visible library() call for them;
# presumably hooks.R (or mdsr) attaches those packages -- confirm.
source("hooks.R", echo = TRUE)
fig.path <- "figures/text-"

## ----echo=FALSE,eval=TRUE------------------------------------------------
options(continue = " ")

## ----message=FALSE, echo=TRUE, eval=FALSE--------------------------------
## library(mdsr)
## macbeth_url <- "http://www.gutenberg.org/cache/epub/1129/pg1129.txt"
## Macbeth_raw <- RCurl::getURL(macbeth_url)

## ------------------------------------------------------------------------
data(Macbeth_raw)

## ------------------------------------------------------------------------
# strsplit returns a list: we only want the first element
macbeth <- strsplit(Macbeth_raw, "\r\n")[[1]]
length(macbeth)

## ------------------------------------------------------------------------
macbeth[300:310]

## ------------------------------------------------------------------------
# value = TRUE returns the matching lines themselves instead of their indices.
macbeth_lines <- grep(" MACBETH", macbeth, value = TRUE)
length(macbeth_lines)
head(macbeth_lines)

## ------------------------------------------------------------------------
length(grep(" MACDUFF", macbeth))

## ------------------------------------------------------------------------
# grep() returns the indices of the matches, so its length is the match count.
length(grep(" MACBETH", macbeth))
# grepl() returns one logical per input element, so this is length(macbeth),
# not the number of matches.
length(grepl(" MACBETH", macbeth))

## ------------------------------------------------------------------------
# Integer indexing (grep) and logical indexing (grepl) select the same lines.
identical(macbeth[grep(" MACBETH", macbeth)],
          macbeth[grepl(" MACBETH", macbeth)])

## ------------------------------------------------------------------------
library(stringr)
pattern <- " MACBETH"
grep(pattern, macbeth, value = TRUE) %>%
  str_extract(pattern) %>%
  head()

## ------------------------------------------------------------------------
# An unescaped "." matches any single character; "\\." matches a literal dot.
head(grep("MAC.", macbeth, value = TRUE))
head(grep("MACBETH\\.", macbeth, value = TRUE))

## ------------------------------------------------------------------------
head(grep("MAC[B-Z]", macbeth, value = TRUE))

## ------------------------------------------------------------------------
head(grep("MAC(B|D)", macbeth, value = TRUE))

## ------------------------------------------------------------------------
# NOTE(review): the number of spaces inside these patterns is significant once
# the pattern is anchored with "^"; verify it against the raw text.
head(grep("^ MAC[B-Z]", macbeth, value = TRUE))

## ------------------------------------------------------------------------
# Quantifiers on the leading space: "?" = zero or one, "*" = zero or more,
# "+" = one or more.
head(grep("^ ?MAC[B-Z]", macbeth, value = TRUE))
head(grep("^ *MAC[B-Z]", macbeth, value = TRUE))
head(grep("^ +MAC[B-Z]", macbeth, value = TRUE))

## ------------------------------------------------------------------------
# One logical vector per character, TRUE on lines containing the speaker tag.
# NOTE(review): " MACBETH\\." is a substring of " LADY MACBETH.", so as written
# the Macbeth vector is also TRUE on Lady Macbeth's lines -- confirm whether
# the pattern should be anchored or carry more leading spaces.
Macbeth <- grepl(" MACBETH\\.", macbeth)
LadyMacbeth <- grepl(" LADY MACBETH\\.", macbeth)
Banquo <- grepl(" BANQUO\\.", macbeth)
Duncan <- grepl(" DUNCAN\\.", macbeth)

## ----message=FALSE-------------------------------------------------------
library(tidyr)
# Long format: one row per (line, character) with speak = 0/1, restricted to
# lines strictly between 218 and 3172.
speaker_freq <- data.frame(Macbeth, LadyMacbeth, Banquo, Duncan) %>%
  mutate(line = seq_along(macbeth)) %>%
  gather(key = "character", value = "speak", -line) %>%
  mutate(speak = as.numeric(speak)) %>%
  filter(line > 218 & line < 3172)
glimpse(speaker_freq)

## ------------------------------------------------------------------------
# Lines that open with "ACT" followed by a run of I/V characters. The bracket
# expression is [IV]: a "|" inside brackets is a literal pipe, not alternation.
acts_idx <- grep("^ACT [IV]+", macbeth)
acts_labels <- str_extract(macbeth[acts_idx], "^ACT [IV]+")
acts <- data.frame(line = acts_idx, labels = acts_labels)

## ----macbeth, warning=FALSE, fig.cap="Speaking parts in \\textit{Macbeth} for four major characters. Duncan is killed early in the play and never speaks again. "----
# Smoothed speaking indicator per character, with a dotted vertical line and a
# text label at each act boundary.
ggplot(data = speaker_freq, aes(x = line, y = speak)) +
  geom_smooth(aes(color = character), method = "loess", se = 0, span = 0.4) +
  geom_vline(xintercept = acts_idx, color = "darkgray", lty = 3) +
  geom_text(data = acts, aes(y = 0.085, label = labels),
            hjust = "left", color = "darkgray") +
  ylim(c(0, NA)) +
  xlab("Line Number") +
  ylab("Proportion of Speeches")

## ----eval=FALSE, message=FALSE-------------------------------------------
## library(aRxiv)
## DataSciencePapers <- arxiv_search(query = '"Data Science"', limit = 200)

## ----eval=TRUE, message=FALSE--------------------------------------------
data(DataSciencePapers)

## ------------------------------------------------------------------------
head(DataSciencePapers)

## ----eval=TRUE, message=FALSE--------------------------------------------
library(lubridate)
# Parse the two timestamp columns with ymd_hms().
DataSciencePapers <- DataSciencePapers %>%
  mutate(submitted = ymd_hms(submitted), updated = ymd_hms(updated))
glimpse(DataSciencePapers)

## ----warning=FALSE,message=FALSE-----------------------------------------
tally(~ year(submitted), data = DataSciencePapers)

## ------------------------------------------------------------------------
DataSciencePapers %>%
  filter(year(submitted) == 2007) %>%
  glimpse()

## ------------------------------------------------------------------------
tally(~ primary_category, data = DataSciencePapers)

## ------------------------------------------------------------------------
# field = leading run of lowercase letters and hyphens in primary_category.
# The bracket expression no longer admits a comma.
DataSciencePapers %>%
  mutate(field = str_extract(primary_category, "^[a-z-]+")) %>%
  tally(x = ~field) %>%
  sort()

## ----message=FALSE-------------------------------------------------------
library(tm)
# One corpus document per abstract.
Corpus <- with(DataSciencePapers, VCorpus(VectorSource(abstract)))
Corpus[[1]] %>%
  as.character() %>%
  strwrap()

## ------------------------------------------------------------------------
# Clean-up pipeline, applied in this order: whitespace, numbers, punctuation,
# lower-casing, English stop words.
Corpus <- Corpus %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english"))
strwrap(as.character(Corpus[[1]]))

## ----wordcloud1,message=FALSE, warning=FALSE, fig.cap="A word cloud of terms that appear in the abstracts of arXiv papers on data science."----
library(wordcloud)
wordcloud(Corpus, max.words = 30, scale = c(8, 1),
          colors = topo.colors(n = 30), random.color = TRUE)

## ------------------------------------------------------------------------
# Document-term matrix with tf-idf weighting.
DTM <- DocumentTermMatrix(Corpus, control = list(weighting = weightTfIdf))
DTM

## ------------------------------------------------------------------------
findFreqTerms(DTM, lowfreq = 0.8)

## ----wordcount-head------------------------------------------------------
# Column sums of the weighted matrix, largest nine terms first.
DTM %>%
  as.matrix() %>%
  apply(MARGIN = 2, sum) %>%
  sort(decreasing = TRUE) %>%
  head(9)

## ------------------------------------------------------------------------
findAssocs(DTM, terms = "statistics", corlimit = 0.5)
findAssocs(DTM, terms = "mathematics", corlimit = 0.5)

## ----message=FALSE-------------------------------------------------------
library(rvest)
library(tidyr)
library(methods)
url <- "http://en.wikipedia.org/wiki/List_of_songs_recorded_by_the_Beatles"
tables <- url %>%
  read_html() %>%
  html_nodes(css = "table")
# NOTE(review): the fifth <table> on the page is taken positionally; this will
# silently pick a different table if the page layout changes.
songs <- html_table(tables[[5]])
glimpse(songs)

## ----warning=FALSE-------------------------------------------------------
# Strip double quotes from titles, coerce Year to numeric, and give the
# songwriter column a syntactic name.
songs <- songs %>%
  mutate(Title = gsub('\\"', "", Title), Year = as.numeric(Year)) %>%
  rename(songwriters = `Songwriter(s)`)

## ------------------------------------------------------------------------
tally(~songwriters, data = songs) %>%
  sort(decreasing = TRUE) %>%
  head()

## ------------------------------------------------------------------------
length(grep("McCartney", songs$songwriters))
length(grep("Lennon", songs$songwriters))

## ------------------------------------------------------------------------
# Songs credited to at least one of the two.
length(grep("(McCartney|Lennon)", songs$songwriters))

## ------------------------------------------------------------------------
# Songs whose credit mentions a McCartney/Lennon name twice, i.e. both names
# (or one of them repeated) appear in the songwriter string.
length(grep("(McCartney|Lennon).*(McCartney|Lennon)", songs$songwriters))

## ------------------------------------------------------------------------
songs %>%
  filter(grepl("(McCartney|Lennon).*(McCartney|Lennon)", songwriters)) %>%
  select(Title) %>%
  head()

## ------------------------------------------------------------------------
# tf-idf weighted document-term matrix over the song titles, stop words removed.
song_titles <- VCorpus(VectorSource(songs$Title)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  DocumentTermMatrix(control = list(weighting = weightTfIdf))
findFreqTerms(song_titles, 25)

## ----message=FALSE-------------------------------------------------------
library(twitteR)
# SECURITY: the OAuth key/secret/token values used to be hard-coded here.
# Credentials must not live in source control; they are now read from the
# environment (e.g. set in ~/.Renviron). Any values that were previously
# committed should be treated as compromised and revoked.
twitter_creds <- Sys.getenv(c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
))
# Sys.getenv() returns "" for unset variables; fail loudly rather than
# attempting to authenticate with empty strings.
missing_creds <- names(twitter_creds)[!nzchar(twitter_creds)]
if (length(missing_creds) > 0) {
  stop(
    "Missing Twitter credential environment variable(s): ",
    paste(missing_creds, collapse = ", "),
    call. = FALSE
  )
}
setup_twitter_oauth(
  consumer_key = twitter_creds[["TWITTER_CONSUMER_KEY"]],
  consumer_secret = twitter_creds[["TWITTER_CONSUMER_SECRET"]],
  access_token = twitter_creds[["TWITTER_ACCESS_TOKEN"]],
  access_secret = twitter_creds[["TWITTER_ACCESS_SECRET"]]
)

## ----eval=FALSE----------------------------------------------------------
## tweets <- searchTwitter("#datascience", lang = "en", n = 1000,
##                         retryOnRateLimit = 100)
## class(tweets)
## class(tweets[[1]])

## ----echo=FALSE,eval=FALSE-----------------------------------------------
## save(tweets, file="tweets.Rda")

## ----echo=FALSE----------------------------------------------------------
# The live search above is not evaluated; a saved copy of `tweets` is loaded.
load("tweets.Rda")

## ------------------------------------------------------------------------
tweet_df <- twListToDF(tweets) %>%
  as.tbl()
tweet_df %>%
  select(text) %>%
  head()

## ----nchar-tweet, message=FALSE, fig.keep="last", fig.cap="Distribution of the number of characters in a sample of tweets."----
# Density of tweet length, with a reference line at 140 characters.
ggplot(data = tweet_df, aes(x = nchar(text))) +
  geom_density(size = 2) +
  geom_vline(xintercept = 140) +
  scale_x_continuous("Number of Characters")

## ------------------------------------------------------------------------
tweet_df %>%
  filter(nchar(text) > 140) %>%
  select(text)

## ----retweetCount, message=FALSE, fig.keep="last", fig.cap="Distribution of the number of retweets in a sample of tweets."----
# Density of the retweet counts in the sampled tweets.
ggplot(data = tweet_df, aes(x = retweetCount)) +
  geom_density(size = 2)

## ------------------------------------------------------------------------
# Tweets that carry a longitude value.
tweet_df %>%
  filter(!is.na(longitude))

## ----message=FALSE-------------------------------------------------------
# Store the tweets in a SQLite database backed by a temporary file.
tweet_db <- tempfile()
register_sqlite_backend(tweet_db)
store_tweets_db(tweets)

## ------------------------------------------------------------------------
# Re-open the same file through dplyr and reference its "tweets" table.
tweets_src <- src_sqlite(tweet_db)
old_tweets <- tweets_src %>%
  tbl("tweets")
glimpse(old_tweets)

## ----warning=FALSE-------------------------------------------------------
# Pull the table into memory once and reuse it for both the filter and the
# denominator (previously the table was collected twice).
all_tweets <- collect(old_tweets)
big_data_tweets <- all_tweets %>%
  filter(grepl("#bigdata", text))
# Share of stored tweets whose text contains "#bigdata".
nrow(big_data_tweets) / nrow(all_tweets)

## ----message=FALSE-------------------------------------------------------
library(ggmap)
# Geocode a street address; the result's lat/lon columns are used below.
smith <- geocode("44 College Lane, 01063")
smith

## ----new-haven-----------------------------------------------------------
with(smith, closestTrendLocations(lat = lat, long = lon))

## ----nh-trends-----------------------------------------------------------
# NOTE(review): 2458410 is presumably the location id reported by the previous
# chunk (chunk names suggest New Haven) -- confirm before changing the address.
head(getTrends(2458410))