<https://speakerdeck.com/romainfrancois/n-cool-number-dplyr-things>

`group_hug()`

Split data in groups

Apply something for each group

Combine

`group_modify()`

    library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

  # Regress Petal.Length on Sepal.Length using the rows of a single group
  # and return the coefficient table as a tidy data frame.
  # `slice` is the group's data; `keys` is accepted so the function has the
  # two-argument shape group_modify() calls it with, but it is not used.
  fun <- function(slice, keys) {
      fit <- lm(Petal.Length ~ Sepal.Length, data = slice)
      broom::tidy(fit)
  }
  
  # Split iris by Species, apply fun() to each group, and combine the
  # per-group coefficient tables; the output below has Species as the
  # first column and stays grouped.
  iris %>% 
      group_by(Species) %>%
      group_modify(fun)

## # A tibble: 6 x 6
## # Groups:   Species [3]
##   Species    term         estimate std.error statistic  p.value
##   <fct>      <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 setosa     (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 setosa     Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 versicolor (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 versicolor Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 virginica  (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 virginica  Sepal.Length    0.750    0.0630    11.9   6.30e-16

# Same computation with an inline formula lambda instead of a named
# function; inside the formula, .x is the data of the current group.
iris %>% 
    group_by(Species) %>%
    group_modify(
        ~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))
    )

## # A tibble: 6 x 6
## # Groups:   Species [3]
##   Species    term         estimate std.error statistic  p.value
##   <fct>      <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 setosa     (Intercept)     0.803    0.344      2.34  2.38e- 2
## 2 setosa     Sepal.Length    0.132    0.0685     1.92  6.07e- 2
## 3 versicolor (Intercept)     0.185    0.514      0.360 7.20e- 1
## 4 versicolor Sepal.Length    0.686    0.0863     7.95  2.59e-10
## 5 virginica  (Intercept)     0.610    0.417      1.46  1.50e- 1
## 6 virginica  Sepal.Length    0.750    0.0630    11.9   6.30e-16

`group_map`

# group_map() does not combine the results: as the output below shows, it
# returns a plain list with one element per group (here, one lm fit each).
iris %>%
    group_by(Species) %>%
    group_map( ~ lm(Petal.Length ~ Sepal.Length, data = .x))

## [[1]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.8031        0.1316  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.1851        0.6865  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
## 
## Coefficients:
##  (Intercept)  Sepal.Length  
##       0.6105        0.7501

`group_modify()` diy with `group_map()`

# Re-create group_modify() by hand: tidy each group's fit, re-attach the
# group key taken from .y, stack the pieces, and regroup by Species.
# Unlike group_modify(), Species ends up as the last column (see output).
iris %>%
    group_by(Species) %>%
    group_map( ~ {
        broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)) %>%
            tibble::add_column(Species = .y$Species)
    }) %>%
    bind_rows() %>%
    group_by(Species)

## # A tibble: 6 x 6
## # Groups:   Species [3]
##   term         estimate std.error statistic  p.value Species   
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl> <fct>     
## 1 (Intercept)     0.803    0.344      2.34  2.38e- 2 setosa    
## 2 Sepal.Length    0.132    0.0685     1.92  6.07e- 2 setosa    
## 3 (Intercept)     0.185    0.514      0.360 7.20e- 1 versicolor
## 4 Sepal.Length    0.686    0.0863     7.95  2.59e-10 versicolor
## 5 (Intercept)     0.610    0.417      1.46  1.50e- 1 virginica 
## 6 Sepal.Length    0.750    0.0630    11.9   6.30e-16 virginica

# Variant of the hand-rolled group_modify(): instead of naming the key
# column, !!!.y splices every column of the key table .y into add_column(),
# so the key column(s) do not have to be spelled out.
iris %>%
    group_by(Species) %>%
    group_map( ~{
        broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x)) %>%
            tibble::add_column(!!!.y)
    }) %>%
    bind_rows() %>%
    group_by(Species)

## # A tibble: 6 x 6
## # Groups:   Species [3]
##   term         estimate std.error statistic  p.value Species   
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl> <fct>     
## 1 (Intercept)     0.803    0.344      2.34  2.38e- 2 setosa    
## 2 Sepal.Length    0.132    0.0685     1.92  6.07e- 2 setosa    
## 3 (Intercept)     0.185    0.514      0.360 7.20e- 1 versicolor
## 4 Sepal.Length    0.686    0.0863     7.95  2.59e-10 versicolor
## 5 (Intercept)     0.610    0.417      1.46  1.50e- 1 virginica 
## 6 Sepal.Length    0.750    0.0630    11.9   6.30e-16 virginica

`group_split()`

# Break the grouped data into a list of tibbles, one per Species
# (50 rows each in the output below).
iris %>%
    group_by(Species) %>%
    group_split()

## [[1]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # … with 40 more rows
## 
## [[2]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>     
##  1          7           3.2          4.7         1.4 versicolor
##  2          6.4         3.2          4.5         1.5 versicolor
##  3          6.9         3.1          4.9         1.5 versicolor
##  4          5.5         2.3          4           1.3 versicolor
##  5          6.5         2.8          4.6         1.5 versicolor
##  6          5.7         2.8          4.5         1.3 versicolor
##  7          6.3         3.3          4.7         1.6 versicolor
##  8          4.9         2.4          3.3         1   versicolor
##  9          6.6         2.9          4.6         1.3 versicolor
## 10          5.2         2.7          3.9         1.4 versicolor
## # … with 40 more rows
## 
## [[3]]
## # A tibble: 50 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species  
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>    
##  1          6.3         3.3          6           2.5 virginica
##  2          5.8         2.7          5.1         1.9 virginica
##  3          7.1         3            5.9         2.1 virginica
##  4          6.3         2.9          5.6         1.8 virginica
##  5          6.5         3            5.8         2.2 virginica
##  6          7.6         3            6.6         2.1 virginica
##  7          4.9         2.5          4.5         1.7 virginica
##  8          7.3         2.9          6.3         1.8 virginica
##  9          6.7         2.5          5.8         1.8 virginica
## 10          7.2         3.6          6.1         2.5 virginica
## # … with 40 more rows

`group_data()`

# Show the grouping structure itself: one row per group, with the key
# column(s) plus a list column .rows holding that group's row indices.
iris %>%
    group_by(Species) %>%
    group_data()

## # A tibble: 3 x 2
##   Species    .rows     
##   <fct>      <list>    
## 1 setosa     <int [50]>
## 2 versicolor <int [50]>
## 3 virginica  <int [50]>

`group_keys()`

# Only the key columns of the grouping structure: one row per group.
iris %>%
    group_by(Species) %>%
    group_keys()

## # A tibble: 3 x 1
##   Species   
##   <fct>     
## 1 setosa    
## 2 versicolor
## 3 virginica

`group_rows()`

# Only the row indices: a list with one integer vector per group.
iris %>%
    group_by(Species) %>%
    group_rows()

## [[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50
## 
## [[2]]
##  [1]  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
## [18]  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84
## [35]  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
## 
## [[3]]
##  [1] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [18] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
## [35] 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150

columns wise: select columns, Act on each

`summarise`

# Column-wise summary written out by hand: every output column repeats the
# same "select a column, act on it" pattern.
iris %>%
    group_by(Species) %>%
    summarise(
        Petal.Width  = mean(Petal.Width), # select one column, act on it (here: take its mean)
        Petal.Length = mean(Petal.Length),
        Sepal.Width  = mean(Sepal.Width),
        Sepal.Length = mean(Sepal.Length)
    )

## # A tibble: 3 x 5
##   Species    Petal.Width Petal.Length Sepal.Width Sepal.Length
##   <fct>            <dbl>        <dbl>       <dbl>        <dbl>
## 1 setosa           0.246         1.46        3.43         5.01
## 2 versicolor       1.33          4.26        2.77         5.94
## 3 virginica        2.03          5.55        2.97         6.59

`summarise_at`

# Same summary without the repetition: vars() selects the columns and the
# single function `mean` is applied to each of them.
# NOTE(review): dplyr >= 1.0 marks the scoped *_at() verbs as superseded by
# across(); confirm which dplyr version this document should target.
iris %>% 
    group_by(Species) %>%
    summarise_at(
        vars(contains("Petal"), contains("Sepal")),
        mean
    )

## # A tibble: 3 x 5
##   Species    Petal.Length Petal.Width Sepal.Length Sepal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             1.46       0.246         5.01        3.43
## 2 versicolor         4.26       1.33          5.94        2.77
## 3 virginica          5.55       2.03          6.59        2.97

Custom function

# Trimmed mean: drops the lowest and highest 20% of the observations in
# `.x` before averaging (see the `trim` argument of mean()).
trim_mean <- function(.x) {
    mean(.x, trim = 0.2)
}

# contains(".") matches the literal dot in the column names, i.e. the four
# measurement columns of iris; each one is summarised with trim_mean().
iris %>%
    group_by(Species) %>%
    summarise_at(vars(contains(".")),
                 trim_mean
    )

## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5           3.41         1.46        0.22
## 2 versicolor         5.91        2.80         4.31        1.34
## 3 virginica          6.55        2.96         5.49        2.02

Lambda action

# Same trimmed mean, written inline as a formula lambda (.x is the column)
# instead of a separately defined function.
iris %>%
    group_by(Species) %>%
    summarise_at(
        vars(contains(".")),
        ~ mean(.x, trim = .2)
    )

## # A tibble: 3 x 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5           3.41         1.46        0.22
## 2 versicolor         5.91        2.80         4.31        1.34
## 3 virginica          6.55        2.96         5.49        2.02

function(s)

# Apply several functions to each selected column in one call.
iris %>%
    group_by(Species) %>%
    summarise_at(
        vars(starts_with("Sepal")),
        list(mean = mean, median = median) # several actions per column; the list names become the _mean / _median suffixes
    )

## # A tibble: 3 x 5
##   Species Sepal.Length_me… Sepal.Width_mean Sepal.Length_me…
##   <fct>              <dbl>            <dbl>            <dbl>
## 1 setosa              5.01             3.43              5  
## 2 versic…             5.94             2.77              5.9
## 3 virgin…             6.59             2.97              6.5
## # … with 1 more variable: Sepal.Width_median <dbl>

functions(s) + Lambda(s)

# The named list of actions may mix formula lambdas and plain functions;
# here "mean" is a 20%-trimmed mean and "median" is the ordinary median.
iris %>%
    group_by(Species) %>%
    summarise_at(
        vars(starts_with("Sepal")),
        list(
            mean   = ~ mean(.x, trim = .2),
            median = median
        )
    )

## # A tibble: 3 x 5
##   Species Sepal.Length_me… Sepal.Width_mean Sepal.Length_me…
##   <fct>              <dbl>            <dbl>            <dbl>
## 1 setosa              5                3.41              5  
## 2 versic…             5.91             2.80              5.9
## 3 virgin…             6.55             2.96              6.5
## # … with 1 more variable: Sepal.Width_median <dbl>

Actions for Petal

# Build a named list of unevaluated calls, mean(<column>), one for each
# iris column whose name starts with "Petal" (printed below).
# NOTE(review): newer tidyselect releases steer users from vars_select()
# towards eval_select() -- confirm before upgrading the package.
Petal_exprs <- tidyselect::vars_select(names(iris), starts_with("Petal")) %>%
    purrr::map(~ expr(mean(!!sym(.))))
Petal_exprs

## $Petal.Length
## mean(Petal.Length)
## 
## $Petal.Width
## mean(Petal.Width)

Actions for Sepal

# Same construction for the "Sepal" columns, this time with median():
# a named list of unevaluated median(<column>) calls (printed below).
Sepal_exprs <- tidyselect::vars_select(names(iris), starts_with("Sepal")) %>%
    purrr::map(~ expr(median(!!sym(.))))
Sepal_exprs

## $Sepal.Length
## median(Sepal.Length)
## 
## $Sepal.Width
## median(Sepal.Width)

# !!! splices both named lists of expressions into summarise() as if each
# had been typed as a separate argument; the list names become the output
# column names (see output).
iris %>%
    group_by(Species) %>%
    summarise(!!!Petal_exprs, !!!Sepal_exprs)

## # A tibble: 3 x 5
##   Species    Petal.Length Petal.Width Sepal.Length Sepal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             1.46       0.246          5           3.4
## 2 versicolor         4.26       1.33           5.9         2.8
## 3 virginica          5.55       2.03           6.5         3

`dance`

library(dance)
# dance is developed at https://github.com/romainfrancois/dance.
# NOTE(review): install.packages("dance") only works if the package is on
# CRAN -- otherwise install it from GitHub, e.g.
# remotes::install_github("romainfrancois/dance"). Confirm before relying on it.
# One swing() per action: mean of the Petal columns, median of the Sepal
# columns, giving the same table as the summarise(!!!...) version.
iris %>%
    group_by(Species) %>%
    tango(
        swing(mean, starts_with("Petal")),
        swing(median, starts_with("Sepal"))
    )

## # A tibble: 3 x 5
##   Species    Petal.Length Petal.Width Sepal.Length Sepal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             1.46       0.246          5           3.4
## 2 versicolor         4.26       1.33           5.9         2.8
## 3 virginica          5.55       2.03           6.5         3

https://github.com/romainfrancois/dance

library(dance)
# Grouped iris, reused as the input of the dance examples that follow.
g <- iris %>% group_by(Species)

waltz(), polka(), tango(), charleston()

These are in the neighborhood of dplyr::summarise()

waltz() takes a grouped tibble and a list of formulas and returns a tibble with as many columns as supplied formulas and one row per group. It does not prepend the grouping variables (see tango() for that).

# One named formula per output column; the result below has one row per
# group and no Species column.
g %>%
    waltz(Sepal.Length = ~mean(Sepal.Length),
          Sepal.Width  = ~mean(Sepal.Width)
          )

## # A tibble: 3 x 2
##   Sepal.Length Sepal.Width
##          <dbl>       <dbl>
## 1         5.01        3.43
## 2         5.94        2.77
## 3         6.59        2.97

polka() deals with peeling off one layer of grouping:

# With a single grouping variable, the result below is just the table of
# group keys (one row per Species).
g %>%
    polka()

## # A tibble: 3 x 1
##   Species   
##   <fct>     
## 1 setosa    
## 2 versicolor
## 3 virginica

tango() binds the results of polka() and waltz() so is the closest to dplyr::summarise()

# Same formulas as the waltz() example, but the output now starts with the
# Species key column.
g %>%
    tango(
        Sepal.Length = ~mean(Sepal.Length),
        Sepal.Width  = ~mean(Sepal.Width)
    )

## # A tibble: 3 x 3
##   Species    Sepal.Length Sepal.Width
##   <fct>             <dbl>       <dbl>
## 1 setosa             5.01        3.43
## 2 versicolor         5.94        2.77
## 3 virginica          6.59        2.97

# charleston(): judging by the output below (3 x 2, with headers
# data$Sepal.Length and $Sepal.Width), the summaries are packed into a
# single data-frame column named `data` next to the Species key.
g %>%
    charleston(
        Sepal.Length = ~mean(Sepal.Length),
        Sepal.Width  = ~mean(Sepal.Width)
    )

## # A tibble: 3 x 2
##   Species    data$Sepal.Length $Sepal.Width
##   <fct>                  <dbl>        <dbl>
## 1 setosa                  5.01         3.43
## 2 versicolor              5.94         2.77
## 3 virginica               6.59         2.97

swing, twist

There is no waltz_at(), tango_at(), etc… but instead we can use either the same function on a set of columns or a set of functions on the same column.

For this, we need to learn new dance moves:

swing() and twist() are for applying the same function to a set of columns:

library(tidyselect)
# swing(): the function comes first, followed by a tidyselect selection of
# the columns it is applied to; output columns keep the selected names.
g %>%
    tango(swing(mean, starts_with("Petal")))

## # A tibble: 3 x 3
##   Species    Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>
## 1 setosa             1.46       0.246
## 2 versicolor         4.26       1.33 
## 3 virginica          5.55       2.03

#g %>% 
#    tango(data = twist(mean, starts_with("Petal")))

They differ in the type of column that is created and in how the new columns are named:

swing() makes as many new columns as are selected by the tidy selection, and the columns are named using a .name glue pattern, this way we might swing() several times.

# {var} in the .name glue pattern is replaced by each selected column name,
# so the two swing() calls over the same columns produce distinct output
# names (mean_Petal.Length, median_Petal.Width, ...).
g %>% 
  tango(
    swing(mean, starts_with("Petal"), .name = "mean_{var}"), 
    swing(median, starts_with("Petal"), .name = "median_{var}") 
  )

## # A tibble: 3 x 5
##   Species mean_Petal.Leng… mean_Petal.Width median_Petal.Le…
##   <fct>              <dbl>            <dbl>            <dbl>
## 1 setosa              1.46            0.246             1.5 
## 2 versic…             4.26            1.33              4.35
## 3 virgin…             5.55            2.03              5.55
## # … with 1 more variable: median_Petal.Width <dbl>

twist() instead creates a single data frame column.

#g %>% 
#  tango(
#    mean   = twist(mean, starts_with("Petal")), 
#    median = twist(median, starts_with("Petal"))
#  )

The first arguments of swing() and twist() are either a function or a formula that uses . as a placeholder. Subsequent arguments are tidyselect selections.

You can combine swing() and twist() in the same tango() or waltz():

#g %>% 
#  tango(
#    swing(mean, starts_with("Petal"), .name = "mean_{var}"), 
#    median = twist(median, contains("."))
#  )

rumba, zumba

Similarly, rumba() can be used to apply several functions to a single column. rumba() creates one column per function, whereas zumba() packs them into a single data frame column.

#g %>% 
#    tango(
#        rumba(Sepal.Width, mean = mean, median = median, .name = "Sepal_{fun}"),
#        Petal = zumba(Petal.Width, mean = mean, median = median)
#    )

Benchmark

# Build the benchmark input: iris stacked 1001 times (the original table
# plus 1000 appended copies), i.e. 150150 rows and 5 columns.
# The row indices are replicated in a single step; calling rbind() inside a
# loop would re-copy the ever-growing data frame on each of the 1000 passes.
big_iris <- iris[rep(seq_len(nrow(iris)), times = 1001L), ]
# Indexing with repeated rows yields names such as "1.1"; reset them to the
# plain 1..n sequence that stacking with rbind() produces.
rownames(big_iris) <- NULL
dim(big_iris)

## [1] 150150      5

library(microbenchmark)
# Time the dance version: mean of the Petal columns and median of the Sepal
# columns per Species, on the 150150-row table.
microbenchmark(big_iris %>%
    group_by(Species) %>%
    tango(
        swing(mean, starts_with("Petal")),
        swing(median, starts_with("Sepal"))
    )) # an earlier run gave mean 54.63 ms; the run printed below gives 77.71 ms

## Unit: milliseconds
##                                                                                                                   expr
##  big_iris %>% group_by(Species) %>% tango(swing(mean, starts_with("Petal")),      swing(median, starts_with("Sepal")))
##       min      lq     mean   median       uq      max neval
##  38.51598 64.9358 77.71282 77.27815 89.67084 133.7088   100

# Time the summarise_at() version.
# NOTE(review): this call computes a 20%-trimmed mean and a median of the
# Sepal columns only, whereas the tango() and summarise(!!!...) benchmarks
# in this section compute the mean of the Petal columns and the median of
# the Sepal columns -- the timings are therefore not like-for-like.
microbenchmark(big_iris %>%
    group_by(Species) %>%
    summarise_at(
        vars(starts_with("Sepal")),
        list(
            mean   = ~ mean(.x, trim = .2),
            median = median
        )
    )) # an earlier run gave mean 72.13 ms; the run printed below gives 68.64 ms

## Unit: milliseconds
##                                                                                                                                   expr
##  big_iris %>% group_by(Species) %>% summarise_at(vars(starts_with("Sepal")),      list(mean = ~mean(.x, trim = 0.2), median = median))
##       min       lq     mean   median       uq      max neval
##  44.01962 54.72252 68.64215 65.31614 74.98087 267.4052   100

# Rebuild the list of mean(<Petal column>) expressions for the benchmark;
# this repeats the definition from the "Actions for Petal" section.
Petal_exprs <- tidyselect::vars_select(names(iris), starts_with("Petal")) %>%
    purrr::map(~ expr(mean(!!sym(.))))
Petal_exprs

## $Petal.Length
## mean(Petal.Length)
## 
## $Petal.Width
## mean(Petal.Width)

# Rebuild the list of median(<Sepal column>) expressions, repeating the
# definition from the "Actions for Sepal" section.
Sepal_exprs <- tidyselect::vars_select(names(iris), starts_with("Sepal")) %>%
    purrr::map(~ expr(median(!!sym(.))))

# Time plain summarise() with both expression lists spliced in.
microbenchmark(big_iris %>%
    group_by(Species) %>%
    summarise(!!!Petal_exprs, !!!Sepal_exprs)) # an earlier run gave mean 40.17 ms; the run printed below gives 41.37 ms

## Unit: milliseconds
##                                                                               expr
##  big_iris %>% group_by(Species) %>% summarise(!!!Petal_exprs,      !!!Sepal_exprs)
##       min       lq     mean   median       uq      max neval
##  18.47395 30.13195 41.37051 41.40793 52.01668 71.00264   100

salsa, chacha, samba, madison

Now we enter the realms of dplyr::mutate() with:

salsa(): to create new columns
chacha(): to reorganize a grouped tibble so that data for each group is contiguous
samba(): chacha() + salsa()

# salsa() returns only the newly created columns: the output below has all
# 150 rows but neither the original measurements nor Species.
g %>%
    salsa(
        Sepal = ~Sepal.Length * Sepal.Width,
        Petal = ~Petal.Length * Petal.Width
    )

## # A tibble: 150 x 2
##    Sepal Petal
##    <dbl> <dbl>
##  1  17.8 0.280
##  2  14.7 0.280
##  3  15.0 0.26 
##  4  14.3 0.3  
##  5  18   0.280
##  6  21.1 0.68 
##  7  15.6 0.42 
##  8  17   0.3  
##  9  12.8 0.280
## 10  15.2 0.15 
## # … with 140 more rows

You can swing(), twist(), rumba() and zumba() here too, and if you want the original data, you can use samba() instead of salsa():

#g %>% 
#  samba(centered = twist(~ . - mean(.), everything(), -Species))

madison() packs the columns salsa() would have created

#g %>% 
#  madison(swing(~ . - mean(.), starts_with("Sepal")))

bolero and mambo

bolero() is similar to dplyr::filter. The formulas may be made by mambo() if you want to apply the same predicate to a tidyselection of columns:

# Keep the rows for which the predicate formula holds; per the output
# below, the result keeps all columns and stays grouped by Species.
g %>% 
  bolero(~Sepal.Width > 4)

## # A tibble: 3 x 5
## # Groups:   Species [3]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.7         4.4          1.5         0.4 setosa 
## 2          5.2         4.1          1.5         0.1 setosa 
## 3          5.5         4.2          1.4         0.2 setosa

microbenchmark(big_iris %>% group_by(Species) %>% bolero(~Sepal.Width > 4)) # an earlier run gave mean 23.65 ms; the run printed below gives 32.39 ms

## Unit: milliseconds
##                                                         expr      min
##  big_iris %>% group_by(Species) %>% bolero(~Sepal.Width > 4) 11.83451
##        lq     mean   median       uq      max neval
##  22.17752 32.38803 28.74377 40.96771 124.8217   100

microbenchmark(big_iris %>% group_by(Species) %>% filter(Sepal.Width > 4)) # an earlier run gave mean 19.63 ms; the run printed below gives 18.66 ms

## Unit: milliseconds
##                                                        expr      min
##  big_iris %>% group_by(Species) %>% filter(Sepal.Width > 4) 6.906776
##        lq     mean   median       uq      max neval
##  13.05112 18.65715 15.38857 22.11206 58.20996   100

#g %>% 
#  bolero(mambo(~. > 4, starts_with("Sepal")))

#g %>% 
#  bolero(mambo(~. > 4, starts_with("Sepal"), .op = or))

dplyr

Jing Liu

July 19, 2019

`group_hug()`

`group_modify()`

`group_map`

`group_modify()` diy with `group_map()`

`group_split()`

`group_data()`

`group_keys()`

`group_rows()`

columns wise: select columns, Act on each

`summarise`

`summarise_at`

Custom function

Lambda action

function(s)

functions(s) + Lambda(s)

Actions for Petal

Actions for Sepal

`dance`

Benchmark

dplyr

Jing Liu

July 19, 2019

group_hug()

group_modify()

group_map

group_modify() diy with group_map()

group_split()

group_data()

group_keys()

group_rows()

columns wise: select columns, Act on each

summarise

summarise_at

Custom function

Lambda action

function(s)

functions(s) + Lambda(s)

Actions for Petal

Actions for Sepal

dance

Benchmark

`group_hug()`

`group_modify()`

`group_map`

`group_modify()` diy with `group_map()`

`group_split()`

`group_data()`

`group_keys()`

`group_rows()`

`summarise`

`summarise_at`

`dance`