# Examples using mtcars data

#### 2018-01-05

Note: The type argument in generate() is filled in automatically based on the entries for specify() and hypothesize(), so it can be omitted throughout the examples that follow. It is left in here to reiterate the type of generation process being performed.

## Data preparation

library(infer)
library(dplyr)
# Convert cyl, vs, am, gear and carb to factors
mtcars <- mtcars %>%
  mutate(
    cyl = factor(cyl),
    vs = factor(vs),
    am = factor(am),
    gear = factor(gear),
    carb = factor(carb)
  )
# Fix the RNG seed so the resampled results are reproducible
set.seed(2018)

One numerical variable (mean)

# 100 bootstrap replicates of the mean of mpg under the point null mu = 25
mtcars %>%
  specify(response = mpg) %>% # or, as a formula: mpg ~ NULL
  hypothesize(null = "point", mu = 25) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  24.7
##  2         2  23.1
##  3         3  26.9
##  4         4  24.8
##  5         5  25.6
##  6         6  23.2
##  7         7  24.2
##  8         8  24.9
##  9         9  23.3
## 10        10  26.5
## # … with 90 more rows

One numerical variable (median)

# 100 bootstrap replicates of the median of mpg under the point null med = 26
mtcars %>%
  specify(response = mpg) %>% # or, as a formula: mpg ~ NULL
  hypothesize(null = "point", med = 26) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "median")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  26.5
##  2         2  26.5
##  3         3  24.6
##  4         4  25.0
##  5         5  26
##  6         6  26
##  7         7  25.0
##  8         8  27.2
##  9         9  25.2
## 10        10  28.2
## # … with 90 more rows

One categorical (2 level) variable

# 100 simulated replicates of the proportion of am == "1" under the point
# null p = .25
mtcars %>%
  specify(response = am, success = "1") %>% # or, as a formula: am ~ NULL
  hypothesize(null = "point", p = .25) %>%
  generate(reps = 100, type = "simulate") %>%
  calculate(stat = "prop")
## # A tibble: 100 x 2
##    replicate   stat
##    <fct>      <dbl>
##  1 1         0.375
##  2 2         0.0625
##  3 3         0.125
##  4 4         0.25
##  5 5         0.188
##  6 6         0.406
##  7 7         0.219
##  8 8         0.375
##  9 9         0.344
## 10 10        0.188
## # … with 90 more rows

Two categorical (2 level) variables

# 100 permutation replicates of the difference in proportions of am == "1"
# between the vs groups (ordered "0", "1") under the independence null
mtcars %>%
  specify(am ~ vs, success = "1") %>% # or: response = am, explanatory = vs
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0397
##  2         2  0.0873
##  3         3  0.214
##  4         4 -0.167
##  5         5 -0.167
##  6         6 -0.0397
##  7         7  0.0873
##  8         8 -0.0397
##  9         9 -0.0397
## 10        10 -0.294
## # … with 90 more rows

One categorical (>2 level) - GoF

# 100 simulated replicates of the Chisq statistic for cyl under the point
# null with the hypothesized proportions given for levels "4", "6" and "8"
mtcars %>%
  specify(cyl ~ NULL) %>% # or: response = cyl
  hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>%
  generate(reps = 100, type = "simulate") %>%
  calculate(stat = "Chisq")
## # A tibble: 100 x 2
##    replicate   stat
##    <fct>      <dbl>
##  1 1          0.688
##  2 2          1.69
##  3 3          1.69
##  4 4          1.69
##  5 5         10.2
##  6 6          4.5
##  7 7          3
##  8 8          2.69
##  9 9          0.5
## 10 10         1.5
## # … with 90 more rows

Two categorical (>2 level) variables

# 100 permutation replicates of the Chisq statistic for cyl versus am under
# the independence null
mtcars %>%
  specify(cyl ~ am) %>% # or: response = cyl, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "Chisq")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 3.90
##  2         2 3.68
##  3         3 1.01
##  4         4 0.557
##  5         5 1.34
##  6         6 2.93
##  7         7 1.45
##  8         8 0.557
##  9         9 0.557
## 10        10 1.01
## # … with 90 more rows

One numerical variable, one categorical variable (2 levels) (diff in means)

# 100 permutation replicates of the difference in mean mpg between the am
# groups (ordered "0", "1") under the independence null
mtcars %>%
  specify(mpg ~ am) %>% # or: response = mpg, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate     stat
##        <int>    <dbl>
##  1         1  3.12
##  2         2 -1.01
##  3         3  0.813
##  4         4  1.46
##  5         5  0.0101
##  6         6  1.94
##  7         7 -0.00283
##  8         8 -1.84
##  9         9 -2.24
## 10        10 -3.59
## # … with 90 more rows

One numerical variable, one categorical variable (2 levels) (diff in medians)

# 100 permutation replicates of the difference in median mpg between the am
# groups (ordered "0", "1") under the independence null
mtcars %>%
  specify(mpg ~ am) %>% # or: response = mpg, explanatory = am
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 -5
##  2         2 -2.3
##  3         3  4.10
##  4         4  0
##  5         5  0
##  6         6  1
##  7         7  1.90
##  8         8 -0.5
##  9         9  2.90
## 10        10  1.90
## # … with 90 more rows

One numerical variable, one categorical variable (>2 levels) - ANOVA

# 100 permutation replicates of the F statistic for mpg across the cyl
# groups under the independence null
mtcars %>%
  specify(mpg ~ cyl) %>% # or: response = mpg, explanatory = cyl
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "F")
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 0.842
##  2         2 0.800
##  3         3 0.232
##  4         4 0.0158
##  5         5 0.0488
##  6         6 0.466
##  7         7 1.26
##  8         8 5.13
##  9         9 1.67
## 10        10 0.469
## # … with 90 more rows

Two numerical vars - SLR

# 100 permutation replicates of the slope of mpg on hp under the
# independence null
mtcars %>%
  # The named-argument alternative must name hp, the variable on the right of
  # the formula, as the explanatory variable.
  specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = hp
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "slope")
## # A tibble: 100 x 2
##    replicate       stat
##        <int>      <dbl>
##  1         1 -0.0158
##  2         2 -0.0104
##  3         3  0.00876
##  4         4  0.0291
##  5         5 -0.0000981
##  6         6 -0.0206
##  7         7 -0.00727
##  8         8  0.0167
##  9         9  0.00682
## 10        10  0.0116
## # … with 90 more rows

One numerical variable (standard deviation)

Not currently implemented

# Bootstrap replicates of the standard deviation of mpg under the point
# null sigma = 5
mtcars %>%
  specify(response = mpg) %>% # or, as a formula: mpg ~ NULL
  hypothesize(null = "point", sigma = 5) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "sd")

### Confidence intervals

One numerical (one mean)

# 100 bootstrap replicates of the mean of mpg (no null hypothesis specified)
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  19.2
##  2         2  18.9
##  3         3  18.7
##  4         4  19.3
##  5         5  20.9
##  6         6  18.5
##  7         7  20.5
##  8         8  18.8
##  9         9  23.1
## 10        10  18.6
## # … with 90 more rows

One numerical (one median)

# 100 bootstrap replicates of the median of mpg (no null hypothesis specified)
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "median")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  17.0
##  2         2  19.2
##  3         3  19.4
##  4         4  19.4
##  5         5  17.1
##  6         6  18.2
##  7         7  20.4
##  8         8  22.8
##  9         9  19.0
## 10        10  21
## # … with 90 more rows

One numerical (standard deviation)

# 100 bootstrap replicates of the standard deviation of mpg (no null
# hypothesis specified)
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "sd")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1  6.11
##  2         2  5.27
##  3         3  4.82
##  4         4  4.35
##  5         5  5.55
##  6         6  7.83
##  7         7  6.28
##  8         8  5.68
##  9         9  7.19
## 10        10  5.67
## # … with 90 more rows

One categorical (one proportion)

# 100 bootstrap replicates of the proportion of am == "1" (no null
# hypothesis specified)
mtcars %>%
  specify(response = am, success = "1") %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "prop")
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 0.25
##  2         2 0.5
##  3         3 0.344
##  4         4 0.531
##  5         5 0.438
##  6         6 0.5
##  7         7 0.312
##  8         8 0.438
##  9         9 0.656
## 10        10 0.406
## # … with 90 more rows

One numerical variable, one categorical variable (2 levels) (diff in means)

# 100 bootstrap replicates of the difference in mean mpg between the am
# groups (ordered "0", "1")
mtcars %>%
  specify(mpg ~ am) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate  stat
##        <int> <dbl>
##  1         1 -4.36
##  2         2 -5.64
##  3         3 -8.54
##  4         4 -9.26
##  5         5 -5.24
##  6         6 -5.55
##  7         7 -7.71
##  8         8 -7.68
##  9         9 -9.21
## 10        10 -7.17
## # … with 90 more rows

Two categorical variables (diff in proportions)

# 100 bootstrap replicates of the difference in proportions of am == "1"
# between the vs groups (ordered "0", "1")
mtcars %>%
  specify(am ~ vs, success = "1") %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("0", "1"))
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0648
##  2         2 -0.189
##  3         3 -0.208
##  4         4 -0.0952
##  5         5 -0.317
##  6         6  0.0317
##  7         7  0.143
##  8         8 -0.453
##  9         9 -0.212
## 10        10 -0.312
## # … with 90 more rows

Two numerical vars - SLR

# 100 bootstrap replicates of the slope of mpg on hp
mtcars %>%
  specify(mpg ~ hp) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "slope")
## # A tibble: 100 x 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0878
##  2         2 -0.0691
##  3         3 -0.0866
##  4         4 -0.0518
##  5         5 -0.0593
##  6         6 -0.0711
##  7         7 -0.0588
##  8         8 -0.0776
##  9         9 -0.0615
## 10        10 -0.0464
## # … with 90 more rows

Two numerical vars - correlation

# 100 bootstrap replicates of the correlation between mpg and hp
mtcars %>%
  specify(mpg ~ hp) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "correlation")
## # A tibble: 100 x 2
##    replicate   stat
##        <int>  <dbl>
##  1         1 -0.765
##  2         2 -0.846
##  3         3 -0.789
##  4         4 -0.718
##  5         5 -0.748
##  6         6 -0.800
##  7         7 -0.744
##  8         8 -0.832
##  9         9 -0.752
## 10        10 -0.824
## # … with 90 more rows