diff --git a/.RData b/.RData index 1477de8..cf0ebc4 100644 Binary files a/.RData and b/.RData differ diff --git a/.Rhistory b/.Rhistory index fda2fd8..affce0f 100644 --- a/.Rhistory +++ b/.Rhistory @@ -1,400 +1,512 @@ -12/89 -2/46 -10/43 -66/85 -65/81 -View(loan50) -loan50 <- read.csv("https://www.openintro.org/stat/data/csv/loan50.csv") -View(loan50) -?loan50 -??loan50 -install.packages("shiny") -install github("StatsWithR/statsr") -install_github("StatsWithR/statsr", force =TRUE) -install github("StatsWithR/statsr", force =TRUE) -library(devtools) -install_github("StatsWithR/statsr") -library("tidyverse") -library("shiny") -data(rbuthnot) -library(dplyr) -library(ggplot2) -library(statsr) -data(rbuthnot) -data(arbuthnot) -arbuthnot -tail(arbuthnot) -View(arbuthnot) -arbuthnot <- arbuthnot %>% -mutate(total = boys + girls) -arbuthnot$total -ggplot(arbuthnot)+ -mapping= aes(x = total, y = years) -ggplot(data = arbuthnot, aes(x = year, y = ottal))+ +geom_box() +library(tidyverse) +ggplot(data = age, aes(x = X_ageg5yr)) + +geom_bar() +brfss_sl_chl <- brfss_clean %>% +select(sleptim1, children) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +summary(brfss_sl_chl) +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +brfss_sl_chl <- brfss_clean %>% +select(sleptim1, children) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +summary(brfss_sl_chl) +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_histogram(binwidth = 1) +brfss_sl_chl <- brfss_clean %>% +select(sleptim1, children) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +summary(brfss_sl_chl) +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_histogram() +?geom_point() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +summary(brfss_sl_chl) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point(color = factor(X_ageg5yr)) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +summary(brfss_sl_chl) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point(color = X_ageg5yr) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +View(brfss_sl_chl) +summary(brfss_sl_chl) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + geom_point() -ggplot(data = arbuthnot, aes(x = year, y = total))+ +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + geom_point() -ggplot(data = arbuthnot, aes(x = year, y = total)) + -geom_line() + +ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) + +geom_boxplot() + coord_flip() +?geom_boxplot() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summerise(mean = mean(children), median = median(children)) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + geom_point() -arbuthnot <- arbuthnot %>% -mutate(boyprop = boys / total) -ggplot(data = arbuthot, aes(x = year, y = boyprop))+ -geom_line() -ggplot(data = arbuthnot, aes(x = year, y = boyprop))+ -geom_line() -arbuthnot <- arbuthnot %>% -mutate(more_boys = boys > girls) -data(present) -data(present) -force(present) -View(present) -devtools::install_github('rstudio/rmarkdown') -# type your code for Question 6 here, and Knit -present <- present %>% -mutate(more_boys = boys > girls) -ggplot(data = present, aes(x = year, y = more_boys))+ -geom_line() -View(arbuthnot) -# type your code for Question 7 here, and Knit -present <- present %>% -mutate(prop_boy_girl = boys / girls) %>% -ggplot(data = present, aes(x = year, y = more_boys))+ -geom_line() -# type your code for Question 7 here, and Knit -present <- present %>% -mutate(prop_boy_girl = boys / girls) %>% -ggplot(data = present, aes(x = year, y = more_boys))+ -geom_line() -# type your code for Question 7 here, and Knit -present <- present %>% -mutate(prop_boy_girl = boys / girls) -ggplot(data = present, aes(x = year, y = more_boys))+ -geom_line() -# type your code for Question 7 here, and Knit -present <- present %>% -mutate(prop_boy_girl = boys / girls) -ggplot(data = present, aes(x = year, y = prop_boy_girl))+ -geom_line() -# type your code for Question 8 here -# sample code is provided below, edit as necessary, uncomment, and then Knit -#present %>% -# mutate(total = ?) %>% -# arrange(desc(total)) -present %>% -arrange(desc(total)) -# type your code for Question 8 here -# sample code is provided below, edit as necessary, uncomment, and then Knit -#present %>% -# mutate(total = ?) %>% -# arrange(desc(total)) -present %>% -mutate(total = boys + girls) %>% -arrange(desc(total)) -library(statsr) -library(dplyr) -library(ggplot2) -data(nycflights) -force(nycflights) -names(nycflights) -str(nycflights) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram() -ggplot(data = mycflights, aes(x = dep_delay))+ -geom_histogram(binwidth = 15) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 15) -ggplot(data = nycflights, aes(x = dep_delay))+ -geom_histogram(binwidth = 150) -rdu_flights <- nycflights %>% -filter(dest == "RDU") -ggplot(data = rdu_flights, aes(x = dest_delay))+ -geom_histogram() -ggplot(data = rdu_flights, aes(x = dep_delay))+ -geom_histogram() -rdu_flights %>% -summarise(mean_dd = mean(rdu_flights), sd_dd = sd(rdu_flights), n = n()) -str(rdu_flights) -rdu_flights %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -sfo_feb_flights <- nyc_flights %>% -filter(dest == "SFO", month == 2) -sfo_feb_flights <- nycflights %>% -filter(dest == "SFO", month == 2) -ggplot(data = sfo_feb_flights, aes(x = dep_delay))+ -geom_histogram(()) -ggplot(data = sfo_feb_flights, aes(x = dep_delay))+ -geom_histogram() -str(sfo_feb_flights) -?summarise -# type your code for Question 4 here, and Knit -"July" -# type your code for Question 5 here, and Knit -nycflights %>% -group_by(month) %>% -summarise(median(dep_delay)) -# type your code for Question 5 here, and Knit -nycflights %>% -group_by(month) %>% -summarise(median_dd = median(dep_delay)) %>% -arrange(desc(median_dd)) -nycflights <- nycflights %>% -mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) -View(nycflights) -nycflights %>% -group_by(origin) %>% -summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>% -arrange(desc(ot_dep_rate)) -ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + -geom_bar() -?nycflights -# type your code for Question 9 here, and Knit -ggplot(data = nycflights, aes(x = avg_speed, y = distance))+ +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) + +ag_ch + geom_boxplot() + coord_flip() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + geom_point() -library(statsr) -library(dplyr) -library(ggplot2) -data(nycflights) -names(nycflights) -str(nycflights) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram() -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 15) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 150) -rdu_flights <- nycflights %>% -filter(dest == "RDU") -ggplot(data = rdu_flights, aes(x = dep_delay)) + -geom_histogram() -rdu_flights %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -sfo_feb_flights <- nycflights %>% -filter(dest == "SFO", month == 2) -# type your code for Question 1 here, and Knit -68 -# type your code for Question 2 here, and Knit -ggplot(data = sfo_feb_flights, aes(x = arr_delay))+ -geom_histogram() -sfo_feb_flights %>% -summarise(median(arr_delay)) -rdu_flights %>% -group_by(origin) %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -# type your code for Question 3 here, and Knit -sfo_feb_flights %>% -group_by(carrier) %>% -summarise(median(arr_delay), sd(arr_delay)) -nycflights %>% -group_by(month) %>% -summarise(mean_dd = mean(dep_delay)) %>% -arrange(desc(mean_dd)) -# type your code for Question 4 here, and Knit -"July" -# type your code for Question 5 here, and Knit -nycflights %>% -group_by(month) %>% -summarise(median_dd = median(dep_delay)) %>% -arrange(desc(median_dd)) -ggplot(nycflights, aes(x = factor(month), y = dep_delay)) + -geom_boxplot() -nycflights <- nycflights %>% -mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) -nycflights %>% -group_by(origin) %>% -summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>% -arrange(desc(ot_dep_rate)) -# type your code for Question 7 here, and Knit -"LGA" -ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + -geom_bar() -# type your code for Question 8 here, and Knit -nycflights %>% -nycflights <- mutate(avg_speed = (distance / (air_time / 60))) %>% -arrange(desc(avg_speed)) %>% -select(avg_speed, tailnum) -library(statsr) -library(dplyr) -library(ggplot2) -data(nycflights) -names(nycflights) -str(nycflights) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram() -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 15) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 150) -rdu_flights <- nycflights %>% -filter(dest == "RDU") -ggplot(data = rdu_flights, aes(x = dep_delay)) + -geom_histogram() -rdu_flights %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -sfo_feb_flights <- nycflights %>% -filter(dest == "SFO", month == 2) -# type your code for Question 1 here, and Knit -68 -# type your code for Question 2 here, and Knit -ggplot(data = sfo_feb_flights, aes(x = arr_delay))+ -geom_histogram() -sfo_feb_flights %>% -summarise(median(arr_delay)) -rdu_flights %>% -group_by(origin) %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -# type your code for Question 3 here, and Knit -sfo_feb_flights %>% -group_by(carrier) %>% -summarise(median(arr_delay), sd(arr_delay)) -nycflights %>% -group_by(month) %>% -summarise(mean_dd = mean(dep_delay)) %>% -arrange(desc(mean_dd)) -# type your code for Question 4 here, and Knit -"July" -# type your code for Question 5 here, and Knit -nycflights %>% -group_by(month) %>% -summarise(median_dd = median(dep_delay)) %>% -arrange(desc(median_dd)) -ggplot(nycflights, aes(x = factor(month), y = dep_delay)) + -geom_boxplot() -nycflights <- nycflights %>% -mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) -nycflights %>% -group_by(origin) %>% -summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>% -arrange(desc(ot_dep_rate)) -# type your code for Question 7 here, and Knit -"LGA" -ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + -geom_bar() -# type your code for Question 8 here, and Knit -nycflights <- nycflights %>% -mutate(avg_speed = (distance / (air_time / 60))) %>% -arrange(desc(avg_speed)) %>% -select(avg_speed, tailnum) -# type your code for Question 9 here, and Knit -nycflights <- nycflights %>% -mutate(avg_speed = (distance / (air_time / 60))) -# type your code for Question 8 here, and Knit -nycflights %>% -mutate(avg_speed = (distance / (air_time / 60))) %>% -arrange(desc(avg_speed)) %>% -select(avg_speed, tailnum) -?nycflights -nycflights$distance -names(nycflights) -View(nycflights) -data(nycflights) -force(nycflights) -library(statsr) -library(dplyr) -library(ggplot2) -data(nycflights) -names(nycflights) -str(nycflights) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram() -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 15) -ggplot(data = nycflights, aes(x = dep_delay)) + -geom_histogram(binwidth = 150) -rdu_flights <- nycflights %>% -filter(dest == "RDU") -ggplot(data = rdu_flights, aes(x = dep_delay)) + -geom_histogram() -rdu_flights %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -sfo_feb_flights <- nycflights %>% -filter(dest == "SFO", month == 2) -# type your code for Question 1 here, and Knit -68 -# type your code for Question 2 here, and Knit -ggplot(data = sfo_feb_flights, aes(x = arr_delay))+ -geom_histogram() -sfo_feb_flights %>% -summarise(median(arr_delay)) -rdu_flights %>% -group_by(origin) %>% -summarise(mean_dd = mean(dep_delay), sd_dd = sd(dep_delay), n = n()) -# type your code for Question 3 here, and Knit -sfo_feb_flights %>% -group_by(carrier) %>% -summarise(median(arr_delay), sd(arr_delay)) -nycflights %>% -group_by(month) %>% -summarise(mean_dd = mean(dep_delay)) %>% -arrange(desc(mean_dd)) -# type your code for Question 4 here, and Knit -"July" -# type your code for Question 5 here, and Knit -nycflights %>% -group_by(month) %>% -summarise(median_dd = median(dep_delay)) %>% -arrange(desc(median_dd)) -ggplot(nycflights, aes(x = factor(month), y = dep_delay)) + -geom_boxplot() -nycflights <- nycflights %>% -mutate(dep_type = ifelse(dep_delay < 5, "on time", "delayed")) -nycflights %>% -group_by(origin) %>% -summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>% -arrange(desc(ot_dep_rate)) -# type your code for Question 7 here, and Knit -"LGA" -ggplot(data = nycflights, aes(x = origin, fill = dep_type)) + -geom_bar() -# type your code for Question 8 here, and Knit -nycflights %>% -mutate(avg_speed = (distance / (air_time / 60))) %>% -arrange(desc(avg_speed)) %>% -select(avg_speed, tailnum) -# type your code for Question 9 here, and Knit -ggplot(data = nycflights, aes(x = avg_speed, y = distance))+ +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +ag_ch + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(width = 0.2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() + geom_smooth() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + geom_point() -View(nycflights) -# type your code for Question 10 here, and Knit -nycflights <- nycflights %>% -mutate(arr_type = ifelse(arr_delay <= 0, "on time", "delayed")) -View(nycflights) -library(statsr) -library(dplyr) -library(ggplot2) -data(kobe_basket) -View(kobe_basket) -?calc_streak() -kobe_streak <- calc_streak(kobe_basket$shot) -ggplot(data = kobe_streak, aes(x = length)) + -geom_histogram(binwidth = 1) -summarise(kobe_streak, IQR(), median(), max(), min()) -?summarise -kobe_streak %>% -summarise(median(length), IQR(length), min(length), max(length())) -summary(length()) -kobe_streak %>% -summarise(median(length), IQR(length), min(length), max(length)) -coin_outcomes <- c("heads", "tails") -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sample(coin_outcomes, size = 1, replace = TRUE) -sim_fair_coin <- sample(coin_outcomes, size = 100, replace = TRUE) -View(sim_fair_coin) -sim_fair_coin -table(sim_fair_coin) -sim_unfair_coin <- sample(coin_outcomes, size = 100, replace = TRUE, prob = c(.2, .8)) -sim_unfair_coin -table(sim_unfair_coin) -?sample -shot_outcomes <- c("H", "M") -sim_basket <- sample(shot_outcomes, size = 1, replace = TRUE) -# type your code for the Exercise here, and Knit -sim_basket <- sample(shot_outcomes, size = 133, replace = TRUE, prob = c(.45)) +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier.high = children > quantile(children, .75) + 1.50*IQR(children), +outlier.low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier.color = case_when(outlier.high ~ "red", +outlier.low ~ "steelblue")) +a_c + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier.color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier.high = children > quantile(children, .75) + 1.50*IQR(children), +outlier.low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier.color = case_when(outlier.high ~ "red", +outlier.low ~ "steelblue")) +head(a_c) +a_c + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = outlier.color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), + +outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier.color = case_when(outlier.high ~ "red", outlier.low ~ "steelblue")) +head(a_c) +?quantile() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +a_c + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = outlier.color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +a_c + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = outlier_color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = outlier_color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) %>% +ungroup() +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) %>% +ungroup() +a_c %>% +group_by(X_ageg5yr) %>% +mutate(outlier = children > median(children) + IQR(children) * 1.5) %>% +ungroup() +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(outlier, width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) %>% +ungroup() +a_c %>% +group_by(X_ageg5yr) %>% +mutate(outlier = children > median(children) + IQR(children) * 1.5) %>% +ungroup() +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + +geom_jitter(data = filter(a_c, outlier ==T), width = .2) +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% +select(sleptim1, children, X_ageg5yr) %>% +drop_na() %>% +filter(children < 20 & sleptim1 < 24) +#look at the data with tables and numerical summary +head(brfss_sl_chl) +summary(brfss_sl_chl) +brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +summarise(mean = mean(children), median = median(children)) +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +p_chl + geom_point() +p_chl + geom_jitter() + coord_flip() +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + +geom_point() +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) +ag_ch + geom_boxplot() + coord_flip() +a_c <- brfss_sl_chl %>% +group_by(X_ageg5yr) %>% +mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) %>% +ungroup() +a_c <- a_c %>% +group_by(X_ageg5yr) %>% +mutate(outlier = children > median(children) + IQR(children) * 1.5) %>% +ungroup() +a_c <- a_c %>% +mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) +head(a_c) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + +geom_boxplot(outlier.shape = NA) + coord_flip() + +geom_jitter(data = filter(a_c, outlier ==T), width = .2) +source('~/.active-rstudio-document', echo=TRUE) +?quantile +url <- https://d18ky98rnyall9.cloudfront.net/_384b2d9eda4b29131fb681b243a7767d_brfss2013.RData?Expires=1569456000&Signature=d33tSiWG1HTbziLf5jLwq~7TgwKdv8JOinx5DL0XR927wK1gP7rh5uAltlxt4ZEBqVUQVnPVtn7LcdsxDK1mo7joxQkO0SqgpHbfexLWU1AOz~KY0S-PHzrcjDKE5fAffUdHQg6mVAah9ujDurpbREp6~q~dWLe4xSYEEXxQ1yY_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A +url <- "https://d18ky98rnyall9.cloudfront.net/_384b2d9eda4b29131fb681b243a7767d_brfss2013.RData" +destfile <- "./brfss2013.RData" +if (!file.exists(destfile)) { +setInternet2(TRUE) +download.file(url ,destfile,method="auto") +} +load(destfile) +source('~/.active-rstudio-document', echo=TRUE) diff --git a/.Rproj.user/F65EB65E/pcs/workbench-pane.pper b/.Rproj.user/F65EB65E/pcs/workbench-pane.pper index 92c5223..0e24b84 100644 --- a/.Rproj.user/F65EB65E/pcs/workbench-pane.pper +++ b/.Rproj.user/F65EB65E/pcs/workbench-pane.pper @@ -1,6 +1,6 @@ { "TabSet1" : 0, - "TabSet2" : 3, + "TabSet2" : 0, "TabZoom" : { } } \ No newline at end of file diff --git a/.Rproj.user/F65EB65E/rmd-outputs b/.Rproj.user/F65EB65E/rmd-outputs index 635a08a..58bbb46 100644 --- a/.Rproj.user/F65EB65E/rmd-outputs +++ b/.Rproj.user/F65EB65E/rmd-outputs @@ -1,8 +1,8 @@ -~/Data/OpenIntro Stats/week-3-lab.html +~/Data/OpenIntro Stats/intro_data_prob_project.html +~/Data/OpenIntro Stats/intro_data_prob_project.html +~/Data/OpenIntro Stats/intro_data_prob_project.html +~/Data/OpenIntro Stats/intro_data_prob_project.html -~/Data/OpenIntro Stats/week-3-lab.html -~/Data/OpenIntro Stats/week-3-lab.html -~/Data/OpenIntro Stats/week-3-lab.html diff --git a/.Rproj.user/F65EB65E/sources/prop/A8E8C529 b/.Rproj.user/F65EB65E/sources/prop/A8E8C529 index f52c169..043b0c6 100644 --- a/.Rproj.user/F65EB65E/sources/prop/A8E8C529 +++ b/.Rproj.user/F65EB65E/sources/prop/A8E8C529 @@ -1,4 +1,4 @@ { - "cursorPosition" : "99,21", - "scrollLine" : "71" + "cursorPosition" : "56,134", + "scrollLine" : "41" } \ No newline at end of file diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/1A524EAD-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/1A524EAD-contents deleted file mode 100644 index 855653b..0000000 --- a/.Rproj.user/F65EB65E/sources/s-6CF1D576/1A524EAD-contents +++ /dev/null @@ -1,317 +0,0 @@ ---- -title: "Probability" -output: statsr:::statswithr_lab ---- - -
-Complete all **Exercises**, and submit answers to **Questions** on the Coursera -platform. -
- -## Hot Hands - -Basketball players who make several baskets in succession are described as -having a *hot hand*. Fans and players have long believed in the hot hand -phenomenon, which refutes the assumption that each shot is independent of the -next. However, [a 1985 paper](http://www.sciencedirect.com/science/article/pii/0010028585900106) by Gilovich, Vallone, and Tversky collected evidence -that contradicted this belief and showed that successive shots are independent -events. This paper started a great controversy that continues to this day, as you can -see by Googling *hot hand basketball*. - -We do not expect to resolve this controversy today. However, in this lab we'll -apply one approach to answering questions like this. The goals for this lab are -to (1) think about the effects of independent and dependent events, (2) learn -how to simulate shooting streaks in R, and (3) to compare a simulation to actual -data in order to determine if the hot hand phenomenon appears to be real. - -## Getting Started - -### Load packages - -In this lab we will explore the data using the `dplyr` package and visualize it -using the `ggplot2` package for data visualization. The data can be found in the -companion package for this course, `statsr`. - -Let's load the packages. - -```{r load-packages, message=FALSE} -library(statsr) -library(dplyr) -library(ggplot2) -``` - -### Data - -Our investigation will focus on the performance of one player: Kobe Bryant of -the Los Angeles Lakers. His performance against the Orlando Magic in the 2009 -NBA finals earned him the title *Most Valuable Player* and many spectators -commented on how he appeared to show a hot hand. Let's load some necessary files -that we will need for this lab. - -```{r load-data} -data(kobe_basket) -``` - -This data frame contains 133 observations and 6 variables, where every -row records a shot taken by Kobe Bryant. The `shot` variable in this dataset -indicates whether the shot was a hit (`H`) or a miss (`M`). - -Just looking at the string of hits and misses, it can be difficult to gauge -whether or not it seems like Kobe was shooting with a hot hand. One way we can -approach this is by considering the belief that hot hand shooters tend to go on -shooting streaks. For this lab, we define the length of a shooting streak to be -the *number of consecutive baskets made until a miss occurs*. - -For example, in Game 1 Kobe had the following sequence of hits and misses from -his nine shot attempts in the first quarter: - -\[ \textrm{H M | M | H H M | M | M | M} \] - -You can verify this by viewing the first 8 rows of the data in the data viewer. - -Within the nine shot attempts, there are six streaks, which are separated by a -"|" above. Their lengths are one, zero, two, zero, zero, zero (in order of -occurrence). - -1. Fill in the blank: A streak length of 1 means one \_\_\_ followed by one miss. -
    -
  1. hit
  2. -
  3. miss
  4. -
- - -2. Fill in the blank: A streak length of 0 means one \_\_\_ which must occur after a -miss that ended the preceeding streak. -
    -
  1. hit
  2. -
  3. miss
  4. -
- -Counting streak lengths manually for all 133 shots would get tedious, so we'll -use the custom function `calc_streak` to calculate them, and store the results -in a data frame called `kobe_streak` as the `length` variable. - -```{r calc-streak-kobe} -kobe_streak <- calc_streak(kobe_basket$shot) -``` - -We can then take a look at the distribution of these streak lengths. - -```{r plot-streak-kobe} -ggplot(data = kobe_streak, aes(x = length)) + - geom_histogram(binwidth = 1) -``` - -3. Which of the following is false about the distribution of Kobe's streak lengths -from the 2009 NBA finals. -
    -
  1. The distribution of Kobe's streaks is unimodal and right skewed.
  2. -
  3. The typical length of a streak is 0 since the median of the distribution is at 0.
  4. -
  5. The IQR of the distribution is 1. -
  6. The longest streak of baskets is of length 4.
  7. -
  8. The shortest streak is of length 1.
  9. -
- -## Compared to What? - -We've shown that Kobe had some long shooting streaks, but are they long enough -to support the belief that he had hot hands? What can we compare them to? - -To answer these questions, let's return to the idea of *independence*. Two -processes are independent if the outcome of one process doesn't effect the outcome -of the second. If each shot that a player takes is an independent process, -having made or missed your first shot will not affect the probability that you -will make or miss your second shot. - -A shooter with a hot hand will have shots that are *not* independent of one -another. Specifically, if the shooter makes his first shot, the hot hand model -says he will have a *higher* probability of making his second shot. - -Let's suppose for a moment that the hot hand model is valid for Kobe. During his -career, the percentage of time Kobe makes a basket (i.e. his shooting -percentage) is about 45%, or in probability notation, - -\[ P(\textrm{shot 1 = H}) = 0.45 \] - -If he makes the first shot and has a hot hand (*not* independent shots), then -the probability that he makes his second shot would go up to, let's say, 60%, - -\[ P(\textrm{shot 2 = H} \, | \, \textrm{shot 1 = H}) = 0.60 \] - -As a result of these increased probabilites, you'd expect Kobe to have longer -streaks. Compare this to the skeptical perspective where Kobe does *not* have a -hot hand, where each shot is independent of the next. If he hit his first shot, -the probability that he makes the second is still 0.45. - -\[ P(\textrm{shot 2 = H} \, | \, \textrm{shot 1 = H}) = 0.45 \] - -In other words, making the first shot did nothing to effect the probability that -he'd make his second shot. If Kobe's shots are independent, then he'd have the -same probability of hitting every shot regardless of his past shots: 45%. - -Now that we've phrased the situation in terms of independent shots, let's return -to the question: how do we tell if Kobe's shooting streaks are long enough to -indicate that he has hot hands? We can compare his streak lengths to someone -without hot hands: an independent shooter. - -## Simulations in R - -While we don't have any data from a shooter we know to have independent shots, -that sort of data is very easy to simulate in R. In a simulation, you set the -ground rules of a random process and then the computer uses random numbers to -generate an outcome that adheres to those rules. As a simple example, you can -simulate flipping a fair coin with the following. - -```{r head-tail} -coin_outcomes <- c("heads", "tails") -sample(coin_outcomes, size = 1, replace = TRUE) -``` - -The vector `outcomes` can be thought of as a hat with two slips of paper in it: -one slip says `heads` and the other says `tails`. The function `sample` draws -one slip from the hat and tells us if it was a head or a tail. - -Run the second command listed above several times. Just like when flipping a -coin, sometimes you'll get a heads, sometimes you'll get a tails, but in the -long run, you'd expect to get roughly equal numbers of each. - -If you wanted to simulate flipping a fair coin 100 times, you could either run -the function 100 times or, more simply, adjust the `size` argument, which -governs how many samples to draw (the `replace = TRUE` argument indicates we put -the slip of paper back in the hat before drawing again). Save the resulting -vector of heads and tails in a new object called `sim_fair_coin`. - -```{r sim-fair-coin} -sim_fair_coin <- sample(coin_outcomes, size = 100, replace = TRUE) -``` - -To view the results of this simulation, type the name of the object and then use -`table` to count up the number of heads and tails. - -```{r table-sim-fair-coin} -sim_fair_coin -table(sim_fair_coin) -``` - -Since there are only two elements in `outcomes`, the probability that we "flip" -a coin and it lands heads is 0.5. Say we're trying to simulate an unfair coin -that we know only lands heads 20% of the time. We can adjust for this by adding -an argument called `prob`, which provides a vector of two probability weights. - -```{r sim-unfair-coin} -sim_unfair_coin <- sample(coin_outcomes, size = 100, replace = TRUE, - prob = c(0.2, 0.8)) -``` - -`prob = c(0.2, 0.8)` indicates that for the two elements in the `outcomes` vector, -we want to select the first one, `heads`, with probability 0.2 and the second -one, `tails` with probability 0.8. Another way of thinking about this is to -think of the outcome space as a bag of 10 chips, where 2 chips are labeled -"head" and 8 chips "tail". Therefore at each draw, the probability of drawing a -chip that says "head"" is 20%, and "tail" is 80%. - -
-**Exercise**: In your simulation of flipping the unfair coin 100 times, how many flips came up heads? -
- -In a sense, we've shrunken the size of the slip of paper that says "heads", -making it less likely to be drawn and we've increased the size of the slip of -paper saying "tails", making it more likely to be drawn. When we simulated the -fair coin, both slips of paper were the same size. This happens by default if -you don't provide a `prob` argument; all elements in the `outcomes` vector have -an equal probability of being drawn. - -If you want to learn more about `sample` or any other function, recall that you -can always check out its help file with `?sample`. - - -## Simulating the Independent Shooter - -Simulating a basketball player who has independent shots uses the same mechanism -that we use to simulate a coin flip. To simulate a single shot from an -independent shooter with a shooting percentage of 50% we type, - -```{r sim-basket} -shot_outcomes <- c("H", "M") -sim_basket <- sample(shot_outcomes, size = 1, replace = TRUE) -``` - -To make a valid comparison between Kobe and our simulated independent shooter, -we need to align both their shooting percentage and the number of attempted shots. - - -
-**Exercise**: What change needs to be made to the `sample` function so that it reflects a shooting percentage of 45%? Make this adjustment, then run a simulation to sample 133 shots. Assign the output of this simulation to a new object called `sim_basket`. -
-```{r} -# type your code for the Exercise here, and Knit -sim_basket <- sample(shot_outcomes, size = 133, replace = TRUE, prob = c(.45, 1-.45)) - -table(sim_basket) -``` - - -Note that we've named the new vector `sim_basket`, the same name that we gave to -the previous vector reflecting a shooting percentage of 50%. In this situation, -R overwrites the old object with the new one, so always make sure that you don't -need the information in an old vector before reassigning its name. - -With the results of the simulation saved as `sim_basket`, we have the data -necessary to compare Kobe to our independent shooter. - -Both data sets represent the results of 133 shot attempts, each with the same -shooting percentage of 45%. We know that our simulated data is from a shooter -that has independent shots. That is, we know the simulated shooter does not have -a hot hand. - -### Comparing Kobe Bryant to the Independent Shooter - -
-**Exercise**: Using `calc_streak`, compute the streak lengths of `sim_basket`, and save the results in a data frame called `sim_streak`. Note that since the `sim_streak` object is just a vector and not a variable in a data frame, we don't need to first select it from a data frame like we did earlier when we calculated the streak lengths for Kobe's shots. -
-```{r sim-streak-lengths} -# type your code for the Exercise here, and Knit -sim_streak <- calc_streak(sim_basket) -``` - -
-**Exercise**: Make a plot of the distribution of simulated streak lengths of the independent shooter. What is the typical streak length for this simulated independent shooter with a 45% shooting percentage? How long is the player's longest streak of baskets in 133 shots? -
-```{r plot-sim-streaks} -# type your code for the Exercise here, and Knit -ggplot(sim_streak, aes(x = length)) + - geom_histogram(binwidth = 1) - -sim_streak %>% - summarise(max(length)) -``` - -4. If you were to run the simulation of the independent shooter a second time, how -would you expect its streak distribution to compare to the distribution from the -exercise above? -
    -
  1. Exactly the same
  2. -
  3. Somewhat similar
  4. -
  5. Totally different
  6. -
- - -5. How does Kobe Bryant's distribution of streak lengths compare to the distribution -of streak lengths for the simulated shooter? Using this comparison, do you have -evidence that the hot hand model fits Kobe's shooting patterns? -
    -
  1. The distributions look very similar. Therefore, there doesn't appear to be evidence for Kobe Bryant's hot hand.
  2. -
  3. The distributions look very similar. Therefore, there appears to be evidence for Kobe Bryant's hot hand.
  4. -
  5. The distributions look very different. Therefore, there doesn't appear to be evidence for Kobe Bryant's hot hand.
  6. -
  7. The distributions look very different. Therefore, there appears to be evidence for Kobe Bryant's hot hand.
  8. -
- -
-**Exercise**: What concepts from the course videos are covered in this lab? What -concepts, if any, are not covered in the videos? Have you seen these concepts -elsewhere, e.g. textbook, previous labs, or practice problems? -
- -
-This is a derivative of an [OpenIntro](https://www.openintro.org/stat/labs.php) lab, and is released under a [Attribution-NonCommercial-ShareAlike 3.0 United States](https://creativecommons.org/licenses/by-nc-sa/3.0/us/) license. -
\ No newline at end of file diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/370D4C4C-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/370D4C4C-contents deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/5F26E372-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/5F26E372-contents deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/86D16AD0 b/.Rproj.user/F65EB65E/sources/s-6CF1D576/86D16AD0 deleted file mode 100644 index a2d3f32..0000000 --- a/.Rproj.user/F65EB65E/sources/s-6CF1D576/86D16AD0 +++ /dev/null @@ -1,33 +0,0 @@ -{ - "collab_server" : "", - "contents" : "", - "created" : 1569016556701.000, - "dirty" : false, - "encoding" : "", - "folds" : "", - "hash" : "0", - "id" : "86D16AD0", - "lastKnownWriteTime" : 2484833459456, - "last_content_update" : 1569016556701, - "path" : null, - "project_path" : null, - "properties" : { - "cacheKey" : "8C046545", - "caption" : "brfss_sl_al", - "contentUrl" : "grid_resource/gridviewer.html?env=&obj=brfss_sl_al&cache_key=8C046545", - "displayedObservations" : 229672, - "environment" : "", - "expression" : "brfss_sl_al", - "object" : "brfss_sl_al", - "preview" : 0, - "totalObservations" : 229672, - "variables" : 2 - }, - "read_only" : false, - "read_only_alternatives" : [ - ], - "relative_order" : 3, - "source_on_save" : false, - "source_window" : "", - "type" : "r_dataframe" -} \ No newline at end of file diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/86D16AD0-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/86D16AD0-contents deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/A0FB69B5 b/.Rproj.user/F65EB65E/sources/s-6CF1D576/A0FB69B5 deleted file mode 100644 index 6cba5f1..0000000 --- a/.Rproj.user/F65EB65E/sources/s-6CF1D576/A0FB69B5 +++ /dev/null @@ -1,33 +0,0 @@ -{ - "collab_server" : "", - "contents" : "", - "created" : 1569016010177.000, - "dirty" : false, - "encoding" : "", - "folds" : "", - "hash" : "0", - "id" : "A0FB69B5", - "lastKnownWriteTime" : 2484693520344, - "last_content_update" : 1569016010177, - "path" : null, - "project_path" : null, - "properties" : { - "cacheKey" : "B523010", - "caption" : "brfss_clean", - "contentUrl" : "grid_resource/gridviewer.html?env=&obj=brfss_clean&cache_key=B523010", - "displayedObservations" : 491775, - "environment" : "", - "expression" : "brfss_clean", - "object" : "brfss_clean", - "preview" : 0, - "totalObservations" : 491775, - "variables" : 4 - }, - "read_only" : false, - "read_only_alternatives" : [ - ], - "relative_order" : 2, - "source_on_save" : false, - "source_window" : "", - "type" : "r_dataframe" -} \ No newline at end of file diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/A0FB69B5-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/A0FB69B5-contents deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB b/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB deleted file mode 100644 index 58d2115..0000000 --- a/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB +++ /dev/null @@ -1,25 +0,0 @@ -{ - "collab_server" : "", - "contents" : "", - "created" : 1569011028597.000, - "dirty" : false, - "encoding" : "UTF-8", - "folds" : "", - "hash" : "0", - "id" : "BA08E5BB", - "lastKnownWriteTime" : 1569018618, - "last_content_update" : 1569018624652, - "path" : "~/Data/OpenIntro Stats/intro_data_prob_project.Rmd", - "project_path" : "intro_data_prob_project.Rmd", - "properties" : { - "cursorPosition" : "99,21", - "scrollLine" : "71" - }, - "read_only" : false, - "read_only_alternatives" : [ - ], - "relative_order" : 4, - "source_on_save" : false, - "source_window" : "", - "type" : "r_markdown" -} \ No newline at end of file diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB-contents b/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB-contents deleted file mode 100644 index 60bf5f2..0000000 --- a/.Rproj.user/F65EB65E/sources/s-6CF1D576/BA08E5BB-contents +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: "Exploring the BRFSS data" -output: - html_document: - fig_height: 4 - highlight: pygments - theme: spacelab ---- - -## Setup - -### Load packages - -```{r load-packages, message = FALSE} -library(ggplot2) -library(dplyr) -library(tidyverse) -``` - -### Load data - -```{r load-data} -load("brfss2013.RData") -``` - - - -* * * - -## Part 1: Data - -BRFSS conducts both landline telephone and cellular telephone based surveys. In conducting the BRFSS landline telephone survey, interviewers collect data from a randomly selected adult in a household. In conducting the cellular telephone version of the BRFSS questionnaire, interviewers collect data from an adult who participates by using a cellular telephone and resides in a private residence or college housing. The telephone numbers dialed are generated using random number generation. - -Health characteristics estimated from the BRFSS pertain to the non-institutionalized adult population, aged 18 years or older, who reside in the US. - -Using this methodology helps control selection bias. However the survey is somewhat lengthy (often around 45 minutes to complete), and additionally pertains to potentaially sensitive topics which may introduce non-response bias. - -This is an observational study over a limited point in time. - -* * * - -## Part 2: Research questions -The following research questions pertain to sleep amount. - -**Research quesion 1:** -Is there a relationship between time slept and alcohol consumption? - -**Research quesion 2:** -Is there a relationship between time slept and number of children in the household? - -**Research quesion 3:** -Is there a relationship between time slept and the type of payment for work? - -* * * - -## Part 3: Exploratory data analysis - -Create dataset with relavent variables -```{r} -brfss_clean <- brfss2013 %>% - select(sleptim1, avedrnk2, children, scntlpad) -``` -**Research quesion 1:** - -```{r} -brfss_sl_alc <- brfss_clean %>% - select(sleptim1, avedrnk2) %>% - drop_na() - -summary(brfss_sl_alc) - -ggplot(data = brfss_sl_alc, aes(x = sleptim1, y = avedrnk2)) + - geom_point() -``` - - - -**Research quesion 2:** - -```{r} -brfss_sl_chl <- brfss_clean %>% - select(sleptim1, children) %>% - drop_na() - -summary(brfss_sl_chl) - -ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) + - geom_point() -``` - - - -**Research quesion 3:** - -```{r} -brfss_sl_pay <- brfss_clean %>% - select(sleptim1, scntlpad) %>% - drop_na() - -summary(brfss_sl_pay) - -ggplot(data = brfss_sl_pay, aes(x = scntlpad, y = sleptim1)) + - geom_boxplot() -``` - diff --git a/.Rproj.user/F65EB65E/sources/s-6CF1D576/lock_file b/.Rproj.user/F65EB65E/sources/s-6CF1D576/lock_file deleted file mode 100644 index e69de29..0000000 diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/F65EB65E6CF1D576/chunks.json b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/F65EB65E6CF1D576/chunks.json index 7661453..8c1abe8 100644 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/F65EB65E6CF1D576/chunks.json +++ b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/F65EB65E6CF1D576/chunks.json @@ -1 +1 @@ -{"chunk_definitions":[{"chunk_id":"cnzitlqf52tib","chunk_label":"load-packages","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"load-packages","message":false},"row":17,"row_count":1,"visible":true},{"chunk_id":"c2g7quw7eem19","chunk_label":"load-data","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"load-data"},"row":23,"row_count":1,"visible":true},{"chunk_id":"cs7yetebuhqr9","chunk_label":"unnamed-chunk-1","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-11"},"row":61,"row_count":1,"visible":true},{"chunk_id":"c82pakogfn737","chunk_label":"unnamed-chunk-4","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-27"},"row":103,"row_count":1,"visible":true}],"doc_write_time":1569018618} \ No newline at end of file +{"chunk_definitions":[{"chunk_id":"cbwr2c6ltg73x","chunk_label":"load-packages","document_id":"6DC06A30","expansion_state":0,"options":{"engine":"r","label":"load-packages","message":false},"row":17,"row_count":1,"visible":true},{"chunk_id":"cf0rgtv18rg39","chunk_label":"unnamed-chunk-1","document_id":"6DC06A30","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-2"},"row":32,"row_count":1,"visible":true}],"doc_write_time":1569053411} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c2g7quw7eem19/000002.csv b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c2g7quw7eem19/000002.csv deleted file mode 100644 index d3c1af1..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c2g7quw7eem19/000002.csv +++ /dev/null @@ -1 +0,0 @@ -"0","load(""brfss2013.RData"")" diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000002.csv b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000002.csv deleted file mode 100644 index e7fa889..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000002.csv +++ /dev/null @@ -1,36 +0,0 @@ -"0","brfss_sl_pay <- brfss_clean %>%" -"0"," select(sleptim1, scntlpad) %>%" -"0"," drop_na()" -"0","" -"0","summary(brfss_sl_pay)" -"1","" -"1"," sleptim1 " -"1"," scntlpad " -"1"," -" -"1"," Min. : 1.000 " -"1"," Paid by salary :10099 " -"1"," -" -"1"," 1st Qu.: 6.000 " -"1"," Paid by the hour : 9089 " -"1"," -" -"1"," Median : 7.000 " -"1"," Paid by the job / task: 1246 " -"1"," -" -"1"," Mean : 7.288 " -"1"," Paid some other way : 1122 " -"1"," -" -"1"," 3rd Qu.: 8.000 " -"1"," " -"1"," -" -"1"," Max. :24.000 " -"1"," " -"1"," -" -"0","ggplot(data = brfss_sl_pay, aes(x = scntlpad, y = sleptim1)) +" -"0"," geom_boxplot()" diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.metadata b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.metadata deleted file mode 100644 index f5c61eb..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.metadata +++ /dev/null @@ -1 +0,0 @@ -{"conditions":[],"height":432.6328800988875,"size_behavior":0,"width":700.0000000000000} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.png b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.png deleted file mode 100644 index d5244fd..0000000 Binary files a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.png and /dev/null differ diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.snapshot b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.snapshot deleted file mode 100644 index 42eb047..0000000 Binary files a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/c82pakogfn737/000003.snapshot and /dev/null differ diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000002.csv b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000002.csv deleted file mode 100644 index 58345c9..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000002.csv +++ /dev/null @@ -1,38 +0,0 @@ -"0","brfss_sl_chl <- brfss_clean %>%" -"0"," select(sleptim1, children) %>%" -"0"," drop_na()" -"0","" -"0","summary(brfss_sl_chl)" -"1","" -"1"," sleptim1 " -"1"," children " -"1"," -" -"1"," Min. : 1.000 " -"1"," Min. : 0.0000 " -"1"," -" -"1"," 1st Qu.: 6.000 " -"1"," 1st Qu.: 0.0000 " -"1"," -" -"1"," Median : 7.000 " -"1"," Median : 0.0000 " -"1"," -" -"1"," Mean : 7.051 " -"1"," Mean : 0.5202 " -"1"," -" -"1"," 3rd Qu.: 8.000 " -"1"," 3rd Qu.: 1.0000 " -"1"," -" -"1"," Max. :103.000 " -"1"," Max. :47.0000 " -"1"," -" -"0","ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) +" -"0"," geom_histogram()" -"2","Error: stat_bin() must not be used with a y aesthetic. -" diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000003.error b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000003.error deleted file mode 100644 index 36c6ba0..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000003.error +++ /dev/null @@ -1 +0,0 @@ -{"frames":[{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"(function (x, ...) \nUseMethod(\"print\"))(x)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"print.ggplot(x)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"ggplot_build(x)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"ggplot_build.ggplot(x)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"by_layer(function(l, d) l$compute_statistic(d, layout))","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"f(l = layers[[i]], d = data[[i]])","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"l$compute_statistic(d, layout)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"f(..., self = self)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"self$stat$setup_params(data, self$stat_params)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"f(...)","line_number":0},{"character_number":0,"end_character_number":0,"end_line_number":0,"file":"","func":"stop(\"stat_bin() must not be used with a y aesthetic.\", call. = FALSE)","line_number":0}],"message":"Error: stat_bin() must not be used with a y aesthetic.\n"} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.metadata b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.metadata deleted file mode 100644 index f5c61eb..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.metadata +++ /dev/null @@ -1 +0,0 @@ -{"conditions":[],"height":432.6328800988875,"size_behavior":0,"width":700.0000000000000} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.png b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.png deleted file mode 100644 index 6848a51..0000000 Binary files a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cg9jvw9j4jst0/000004.png and /dev/null differ diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/chunks.json b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/chunks.json index 507cd14..aef21c9 100644 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/chunks.json +++ b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/chunks.json @@ -1 +1 @@ -{"chunk_definitions":[{"chunk_id":"cnzitlqf52tib","chunk_label":"load-packages","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"load-packages","message":false},"row":17,"row_count":1,"visible":true},{"chunk_id":"c2g7quw7eem19","chunk_label":"load-data","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"load-data"},"row":23,"row_count":1,"visible":true},{"chunk_id":"cs7yetebuhqr9","chunk_label":"unnamed-chunk-1","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-11"},"row":61,"row_count":1,"visible":true},{"chunk_id":"cg9jvw9j4jst0","chunk_label":"unnamed-chunk-3","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-28"},"row":88,"row_count":1,"visible":true},{"chunk_id":"c82pakogfn737","chunk_label":"unnamed-chunk-4","document_id":"BA08E5BB","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-27"},"row":103,"row_count":1,"visible":true}],"doc_write_time":1569018385} \ No newline at end of file +{"chunk_definitions":[{"chunk_id":"cbwr2c6ltg73x","chunk_label":"load-packages","document_id":"6DC06A30","expansion_state":0,"options":{"engine":"r","label":"load-packages","message":false},"row":15,"row_count":1,"visible":true},{"chunk_id":"cf0rgtv18rg39","chunk_label":"load-data","document_id":"6DC06A30","expansion_state":0,"options":{"engine":"r","label":"unnamed-chunk-2"},"row":30,"row_count":1,"visible":true}],"doc_write_time":1569366200} \ No newline at end of file diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cnzitlqf52tib/000002.csv b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cnzitlqf52tib/000002.csv deleted file mode 100644 index fda055e..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cnzitlqf52tib/000002.csv +++ /dev/null @@ -1,3 +0,0 @@ -"0","library(ggplot2)" -"0","library(dplyr)" -"0","library(tidyverse)" diff --git a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cs7yetebuhqr9/000004.csv b/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cs7yetebuhqr9/000004.csv deleted file mode 100644 index f1a2345..0000000 --- a/.Rproj.user/shared/notebooks/30DB09BC-intro_data_prob_project/1/s/cs7yetebuhqr9/000004.csv +++ /dev/null @@ -1,2 +0,0 @@ -"0","brfss_clean <- brfss2013 %>%" -"0"," select(sleptim1, avedrnk2, children, scntlpad)" diff --git a/intro_data_prob_project.Rmd b/intro_data_prob_project.Rmd index 60bf5f2..06875b9 100644 --- a/intro_data_prob_project.Rmd +++ b/intro_data_prob_project.Rmd @@ -1,5 +1,5 @@ --- -title: "Exploring the BRFSS data" +title: "Exploring the BRFSS 2013 data" output: html_document: fig_height: 4 @@ -12,15 +12,22 @@ output: ### Load packages ```{r load-packages, message = FALSE} -library(ggplot2) -library(dplyr) library(tidyverse) ``` ### Load data +Get data file if it doesn't exist ```{r load-data} -load("brfss2013.RData") +url <- "https://d18ky98rnyall9.cloudfront.net/_384b2d9eda4b29131fb681b243a7767d_brfss2013.RData" + + destfile <- "./brfss2013.RData" + if (!file.exists(destfile)) { + setInternet2(TRUE) + download.file(url ,destfile,method="auto") + } + load(destfile) + ``` @@ -40,7 +47,14 @@ This is an observational study over a limited point in time. * * * ## Part 2: Research questions -The following research questions pertain to sleep amount. + +Sleep is a critical indicator of overall health and well-being. The exact sleep needs for individuals vary, however the National Sleep Foundation has provided the following sleep duration recommendations.[1](#myfootnote1) + +- Younger adults (18-25): 7-9 hours +- Adults (26-64): 7-9 hours +- Older adults (65+): 7-8 hours + +Using the BRFSS 2013 dataset we will examine potential relations between sleep duration and a variatey of health and social variables. **Research quesion 1:** Is there a relationship between time slept and alcohol consumption? @@ -51,6 +65,9 @@ Is there a relationship between time slept and number of children in the househo **Research quesion 3:** Is there a relationship between time slept and the type of payment for work? +**Research question 4:** +What percentage of the population meets suggested sleep guidlines? + * * * ## Part 3: Exploratory data analysis @@ -69,8 +86,9 @@ brfss_sl_alc <- brfss_clean %>% summary(brfss_sl_alc) -ggplot(data = brfss_sl_alc, aes(x = sleptim1, y = avedrnk2)) + - geom_point() +p_alc <- ggplot(data = brfss_sl_alc, aes(x = sleptim1, y = avedrnk2)) + +p_alc + geom_point() + geom_smooth() ``` @@ -78,14 +96,58 @@ ggplot(data = brfss_sl_alc, aes(x = sleptim1, y = avedrnk2)) + **Research quesion 2:** ```{r} -brfss_sl_chl <- brfss_clean %>% - select(sleptim1, children) %>% - drop_na() +#prepare dataset, remove obvious outliers and na +brfss_sl_chl <- brfss2013 %>% + select(sleptim1, children, X_ageg5yr) %>% + drop_na() %>% + filter(children < 20 & sleptim1 < 24) + +#look at the data with tables and numerical summary +head(brfss_sl_chl) summary(brfss_sl_chl) -ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) + - geom_point() +brfss_sl_chl %>% + group_by(X_ageg5yr) %>% + summarise(mean = mean(children), median = median(children)) + +#look at the data visually +p_chl <- ggplot(data = brfss_sl_chl, aes(x = children, y = sleptim1)) + +p_chl + geom_point() + +p_chl + geom_jitter() + coord_flip() + +ggplot(data = brfss_sl_chl, aes(x = sleptim1, y = children)) + + geom_point() + +ag_ch <- ggplot(data = brfss_sl_chl, aes(x = X_ageg5yr, y = children)) + +ag_ch + geom_boxplot() + coord_flip() + +a_c <- brfss_sl_chl %>% + group_by(X_ageg5yr) %>% + mutate(outlier_high = children > quantile(children, .75) + 1.50*IQR(children), outlier_low = children < quantile(children, .25) - 1.50*IQR(children)) %>% + ungroup() + +a_c <- a_c %>% + group_by(X_ageg5yr) %>% + mutate(outlier = children > median(children) + IQR(children) * 1.5) %>% + ungroup() + +a_c <- a_c %>% + mutate(outlier_color = case_when(outlier_high ~ "red", outlier_low ~ "steelblue")) + +head(a_c) + + +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + + geom_boxplot(outlier.shape = NA) + coord_flip() + geom_jitter(color = a_c$outlier_color, width = .2) + +ggplot(data = a_c, aes(x = X_ageg5yr, y = children)) + + geom_boxplot(outlier.shape = NA) + coord_flip() + + geom_jitter(data = filter(a_c, outlier ==T), width = .2) + ``` @@ -97,9 +159,20 @@ brfss_sl_pay <- brfss_clean %>% select(sleptim1, scntlpad) %>% drop_na() -summary(brfss_sl_pay) +brfss_sl_pay %>% + group_by(scntlpad) %>% + summarise(median(sleptim1), mean(sleptim1), IQR(sleptim1)) + +p_pay <- ggplot(data = brfss_sl_pay, aes(x = scntlpad, y = sleptim1)) + +p_pay + geom_boxplot() +``` + +**Research question 4:** + +```{r} -ggplot(data = brfss_sl_pay, aes(x = scntlpad, y = sleptim1)) + - geom_boxplot() ``` +References: +1: National Sleep Foundation's sleep time duration recommendations: methodology and results summary. Sleep Health. 2015 Mar;1(1):40-43. doi: 10.1016/j.sleh.2014.12.010. Epub 2015 Jan 8. \ No newline at end of file diff --git a/intro_data_prob_project.html b/intro_data_prob_project.html index 4650901..f5cea50 100644 --- a/intro_data_prob_project.html +++ b/intro_data_prob_project.html @@ -12,7 +12,7 @@ -Exploring the BRFSS data +Exploring the BRFSS 2013 data