## ----message=FALSE------------------------------------------------------------
library(tidyverse)
library(infer)
library(moderndive)
library(nycflights13)
library(ggplot2movies)






## ----echo=FALSE---------------------------------------------------------------
set.seed(2102)


## -----------------------------------------------------------------------------
promotions %>% 
  sample_n(size = 6) %>% 
  arrange(id)


## ----eval=FALSE---------------------------------------------------------------
# ggplot(promotions, aes(x = gender, fill = decision)) +
#   geom_bar() +
#   labs(x = "Gender of name on résumé")




## -----------------------------------------------------------------------------
promotions %>% 
  group_by(gender, decision) %>% 
  tally()












## ----eval=FALSE---------------------------------------------------------------
# ggplot(promotions_shuffled,
#        aes(x = gender, fill = decision)) +
#   geom_bar() +
#   labs(x = "Gender of résumé name")



## -----------------------------------------------------------------------------
promotions_shuffled %>% 
  group_by(gender, decision) %>% 
  tally() # Same as summarize(n = n())











## ----eval=FALSE---------------------------------------------------------------
# obs_diff_prop <- promotions %>%
#   specify(decision ~ gender, success = "promoted") %>%
#   calculate(stat = "diff in props", order = c("male", "female"))
# obs_diff_prop


## ----echo=FALSE, eval=FALSE---------------------------------------------------
# set.seed(2019)
# tactile_permutes <- promotions %>%
#   specify(decision ~ gender, success = "promoted") %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 33, type = "permute") %>%
#   calculate(stat = "diff in props", order = c("male", "female"))
# ggplot(data = tactile_permutes, aes(x = stat)) +
#   geom_histogram(binwidth = 0.05, boundary = -0.2, color = "white") +
#   geom_vline(xintercept = pull(obs_diff_prop), color = "blue", size = 2) +
#   scale_y_continuous(breaks = 0:10)














## -----------------------------------------------------------------------------
promotions %>% 
  specify(formula = decision ~ gender, success = "promoted") 


## -----------------------------------------------------------------------------
promotions %>% 
  specify(formula = decision ~ gender, success = "promoted") %>% 
  hypothesize(null = "independence")




## ----eval=FALSE---------------------------------------------------------------
# promotions_generate <- promotions %>%
#   specify(formula = decision ~ gender, success = "promoted") %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute")
# nrow(promotions_generate)




## ----eval=FALSE---------------------------------------------------------------
# null_distribution <- promotions %>%
#   specify(formula = decision ~ gender, success = "promoted") %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   calculate(stat = "diff in props", order = c("male", "female"))
# null_distribution




## -----------------------------------------------------------------------------
obs_diff_prop <- promotions %>% 
  specify(decision ~ gender, success = "promoted") %>% 
  calculate(stat = "diff in props", order = c("male", "female"))
obs_diff_prop


## ----null-distribution-infer, fig.show="hold", fig.cap="Null distribution.", fig.height=1.8----
visualize(null_distribution, bins = 10)


## ----null-distribution-infer-2, fig.cap="Shaded histogram to show $p$-value."----
visualize(null_distribution, bins = 10) + 
  shade_p_value(obs_stat = obs_diff_prop, direction = "right")


## -----------------------------------------------------------------------------
null_distribution %>% 
  get_p_value(obs_stat = obs_diff_prop, direction = "right")



## ----eval=FALSE---------------------------------------------------------------
# null_distribution <- promotions %>%
#   specify(formula = decision ~ gender, success = "promoted") %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   calculate(stat = "diff in props", order = c("male", "female"))


## ----eval=FALSE---------------------------------------------------------------
# bootstrap_distribution <- promotions %>%
#   specify(formula = decision ~ gender, success = "promoted") %>%
#   # Change 1 - Remove hypothesize():
#   # hypothesize(null = "independence") %>%
#   # Change 2 - Switch type from "permute" to "bootstrap":
#   generate(reps = 1000, type = "bootstrap") %>%
#   calculate(stat = "diff in props", order = c("male", "female"))




## -----------------------------------------------------------------------------
percentile_ci <- bootstrap_distribution %>% 
  get_confidence_interval(level = 0.95, type = "percentile")
percentile_ci


## ----eval=FALSE---------------------------------------------------------------
# visualize(bootstrap_distribution) +
#   shade_confidence_interval(endpoints = percentile_ci)



## -----------------------------------------------------------------------------
se_ci <- bootstrap_distribution %>% 
  get_confidence_interval(level = 0.95, type = "se", 
                          point_estimate = obs_diff_prop)
se_ci


## ----eval=FALSE---------------------------------------------------------------
# visualize(bootstrap_distribution) +
#   shade_confidence_interval(endpoints = se_ci)





## ----eval=FALSE---------------------------------------------------------------
# library(moderndive)
# library(infer)
# null_distribution_mean <- promotions %>%
#   specify(formula = decision ~ gender, success = "promoted") %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   calculate(stat = "diff in means", order = c("male", "female"))






















## -----------------------------------------------------------------------------
movies




## -----------------------------------------------------------------------------
movies_sample


## ----action-romance-boxplot, fig.cap="Boxplot of IMDb rating vs. genre.", fig.height=2.7----
ggplot(data = movies_sample, aes(x = genre, y = rating)) +
  geom_boxplot() +
  labs(y = "IMDb rating")


## -----------------------------------------------------------------------------
movies_sample %>% 
  group_by(genre) %>% 
  summarize(n = n(), mean_rating = mean(rating), std_dev = sd(rating))





## -----------------------------------------------------------------------------
movies_sample %>% 
  specify(formula = rating ~ genre)


## -----------------------------------------------------------------------------
movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  hypothesize(null = "independence")


## ----eval=FALSE---------------------------------------------------------------
# movies_sample %>%
#   specify(formula = rating ~ genre) %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   View()




## ----eval=FALSE---------------------------------------------------------------
# null_distribution_movies <- movies_sample %>%
#   specify(formula = rating ~ genre) %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   calculate(stat = "diff in means", order = c("Action", "Romance"))
# null_distribution_movies




## -----------------------------------------------------------------------------
obs_diff_means <- movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  calculate(stat = "diff in means", order = c("Action", "Romance"))
obs_diff_means


## ----eval=FALSE---------------------------------------------------------------
# visualize(null_distribution_movies, bins = 10) +
#   shade_p_value(obs_stat = obs_diff_means, direction = "both")




## -----------------------------------------------------------------------------
null_distribution_movies %>% 
  get_p_value(obs_stat = obs_diff_means, direction = "both")

## ----echo=FALSE---------------------------------------------------------------
p_value_movies <- null_distribution_movies %>%
  get_p_value(obs_stat = obs_diff_means, direction = "both") %>%
  mutate(p_value = round(p_value, 3))










## -----------------------------------------------------------------------------
movies_sample %>% 
  group_by(genre) %>% 
  summarize(n = n(), mean_rating = mean(rating), std_dev = sd(rating))



## ----eval=FALSE---------------------------------------------------------------
# # Construct null distribution of xbar_a - xbar_r:
# null_distribution_movies <- movies_sample %>%
#   specify(formula = rating ~ genre) %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   calculate(stat = "diff in means", order = c("Action", "Romance"))
# visualize(null_distribution_movies, bins = 10)


## ----eval=FALSE---------------------------------------------------------------
# # Construct null distribution of t:
# null_distribution_movies_t <- movies_sample %>%
#   specify(formula = rating ~ genre) %>%
#   hypothesize(null = "independence") %>%
#   generate(reps = 1000, type = "permute") %>%
#   # Notice we switched stat from "diff in means" to "t"
#   calculate(stat = "t", order = c("Action", "Romance"))
# visualize(null_distribution_movies_t, bins = 10)






## ----t-stat-3, fig.cap="Null distribution using t-statistic and t-distribution.", fig.height=2.2----
visualize(null_distribution_movies_t, bins = 10, method = "both")


## -----------------------------------------------------------------------------
obs_two_sample_t <- movies_sample %>% 
  specify(formula = rating ~ genre) %>% 
  calculate(stat = "t", order = c("Action", "Romance"))
obs_two_sample_t


## ----t-stat-4, fig.cap="Null distribution using t-statistic and t-distribution with $p$-value shaded.", warning=TRUE, fig.height=1.7----
visualize(null_distribution_movies_t, method = "both") +
  shade_p_value(obs_stat = obs_two_sample_t, direction = "both")


## -----------------------------------------------------------------------------
null_distribution_movies_t %>% 
  get_p_value(obs_stat = obs_two_sample_t, direction = "both")


## -----------------------------------------------------------------------------
flights_sample <- flights %>% 
  filter(carrier %in% c("HA", "AS"))


## ----ha-as-flights-boxplot, fig.cap="Air time for Hawaiian and Alaska Airlines flights departing NYC in 2013.", fig.height=2.8----
ggplot(data = flights_sample, mapping = aes(x = carrier, y = air_time)) +
  geom_boxplot() +
  labs(x = "Carrier", y = "Air Time")


## -----------------------------------------------------------------------------
flights_sample %>% 
  group_by(carrier, dest) %>% 
  summarize(n = n(), mean_time = mean(air_time, na.rm = TRUE))








## ----eval=FALSE---------------------------------------------------------------
# # Fit regression model:
# score_model <- lm(score ~ bty_avg, data = evals)
# 
# # Get regression table:
# get_regression_table(score_model)

