## ----cache=FALSE, echo=FALSE,include=FALSE------------------------------- source('hooks.R', echo=TRUE) fig.path='figures/learningII-' ## ----echo=FALSE,eval=TRUE------------------------------------------------ options(continue=" ") ## ----message=FALSE, eval=FALSE------------------------------------------- ## download.file("https://www.fueleconomy.gov/feg/epadata/16data.zip", ## destfile = "data/fueleconomy.zip") ## unzip("data/fueleconomy.zip", exdir = "data/fueleconomy/") ## ----message=FALSE, warning=FALSE---------------------------------------- library(mdsr) library(readxl) filename <- list.files("data/fueleconomy", pattern = "public\\.xlsx")[1] cars <- read_excel(paste0("data/fueleconomy/", filename)) %>% data.frame() cars <- cars %>% rename(make = Mfr.Name, model = Carline, displacement = Eng.Displ, cylinders = X..Cyl, city_mpg = City.FE..Guide....Conventional.Fuel, hwy_mpg = Hwy.FE..Guide....Conventional.Fuel, gears = X..Gears) %>% select(make, model, displacement, cylinders, gears, city_mpg, hwy_mpg) %>% distinct(model, .keep_all = TRUE) %>% filter(make == "Toyota") rownames(cars) <- cars$model glimpse(cars) ## ----warning=FALSE------------------------------------------------------- car_diffs <- dist(cars) str(car_diffs) car_mat <- car_diffs %>% as.matrix() car_mat[1:6, 1:6] %>% round(digits = 2) ## ----cars-tree, fig.height=14, fig.cap="A dendrogram constructed by hierarchical clustering from car-to-car distances implied by the Toyota fuel economy data."---- library(ape) car_diffs %>% hclust() %>% as.phylo() %>% plot(cex = 0.9, label.offset = 1) ## ----message=FALSE------------------------------------------------------- BigCities <- WorldCities %>% arrange(desc(population)) %>% head(4000) %>% select(longitude, latitude) glimpse(BigCities) ## ----cluster-cities, fig.cap="The world's 4,000 largest cities, clustered by the 6-means clustering algorithm.", message=FALSE---- set.seed(15) library(mclust) city_clusts <- BigCities %>% kmeans(centers = 6) %>% fitted("classes") %>% as.character() BigCities <- BigCities %>% mutate(cluster = city_clusts) BigCities %>% ggplot(aes(x = longitude, y = latitude)) + geom_point(aes(color = cluster), alpha = 0.5) ## ----echo=FALSE, results='asis'------------------------------------------ library(xtable) Votes_wide <- Votes %>% tidyr::spread(key = bill, value = vote) Votes_wide %>% select(1:5) %>% head(10) %>% xtable(caption = "Sample voting records data from the Scottish Parliament.", label = "tab:scot-votes-small") %>% print(include.rownames = FALSE) ## ----ballot-grid, fig.cap="Visualization of the Scottish Parliament votes."---- Votes %>% mutate(Vote = factor(vote, labels = c("Nay","Abstain","Aye"))) %>% ggplot(aes(x = bill, y = name, fill = Vote)) + geom_tile() + xlab("Ballot") + ylab("Member of Parliament") + scale_fill_manual(values = c("darkgray", "white", "goldenrod")) + scale_x_discrete(breaks = NULL, labels = NULL) + scale_y_discrete(breaks = NULL, labels = NULL) ## ----two-ballots, fig.cap="Scottish Parliament votes for two ballots."---- Votes %>% filter(bill %in% c("S1M-240.2", "S1M-639.1")) %>% tidyr::spread(key = bill, value = vote) %>% ggplot(aes(x = `S1M-240.2`, y = `S1M-639.1`)) + geom_point(alpha = 0.7, position = position_jitter(width = 0.1, height = 0.1)) + geom_point(alpha = 0.01, size = 10, color = "red" ) ## ----many-ballots, fig.cap="Scatterplot showing the correlation between Scottish Parliament votes in two arbitrary collections of ballots."---- Votes %>% mutate(set_num = as.numeric(factor(bill)), set = ifelse(set_num < max(set_num) / 2, "First_Half", "Second_Half")) %>% group_by(name, set) %>% summarize(Ayes = sum(vote)) %>% tidyr::spread(key = set, value = Ayes) %>% ggplot(aes(x = First_Half, y = Second_Half)) + geom_point(alpha = 0.7, size = 5) ## ----ballot-PCA, fig.cap="Clustering members of Scottish Parliament based on SVD along the members."---- Votes_wide <- Votes %>% tidyr::spread(key = bill, value = vote) vote_svd <- Votes_wide %>% select(-name) %>% svd() voters <- vote_svd$u[ , 1:5] %>% as.data.frame() clusts <- voters %>% kmeans(centers = 6) voters <- voters %>% mutate(cluster = as.factor(clusts$cluster)) ggplot(data = voters, aes(x = V1, y = V2)) + geom_point(aes(x = 0, y = 0), color = "red", shape = 1, size = 7) + geom_point(size = 5, alpha = 0.6, aes(color = cluster)) + xlab("Best Vector from SVD") + ylab("Second Best Vector from SVD") + ggtitle("Political Positions of Members of Parliament") ## ------------------------------------------------------------------------ voters <- voters %>% mutate(name = Votes_wide$name) %>% left_join(Parties, by = c("name" = "name")) tally(party ~ cluster, data = voters) ## ------------------------------------------------------------------------ ballots <- vote_svd$v[ , 1:5] %>% as.data.frame() clust_ballots <- kmeans(ballots, centers = 16) ballots <- ballots %>% mutate(cluster = as.factor(clust_ballots$cluster), bill = names(Votes_wide)[-1]) ## ----issue-clusters, fig.cap="Clustering of Scottish Parliament ballots based on SVD along the ballots."---- ggplot(data = ballots, aes(x = V1, y = V2)) + geom_point(aes(x = 0, y = 0), color = "red", shape = 1, size = 7) + geom_point(size = 5, alpha = 0.6, aes(color = cluster)) + xlab("Best Vector from SVD") + ylab("Second Best Vector from SVD") + ggtitle("Influential Ballots") ## ----SVD-ballots, fig.cap="Illustration of the Scottish Parliament votes when ordered by the primary vector of the SVD.", warning=FALSE---- Votes_svd <- Votes %>% mutate(Vote = factor(vote, labels = c("Nay", "Abstain", "Aye"))) %>% inner_join(ballots, by = "bill") %>% inner_join(voters, by = "name") ggplot(data = Votes_svd, aes(x = reorder(bill, V1.x), y = reorder(name, V1.y), fill = Vote)) + geom_tile() + xlab("Ballot") + ylab("Member of Parliament") + scale_fill_manual(values = c("darkgray", "white", "goldenrod")) + scale_x_discrete(breaks = NULL, labels = NULL) + scale_y_discrete(breaks = NULL, labels = NULL) ## ------------------------------------------------------------------------ Votes_svd %>% arrange(V1.y) %>% head(1) ## ------------------------------------------------------------------------ Votes_svd %>% arrange(V1.y) %>% tail(1)