[https://speakerdeck.com/romainfrancois/n-cool-number-dplyr-things]
group_hug()
Split data in groups
Apply something for each group
Combine
group_modify()
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Per-group callback for group_modify(): fits Petal.Length ~ Sepal.Length on
# the rows of one group and returns the tidied coefficient table.
#   slice: data for the current group
#   keys:  one-row tibble of group keys; accepted to match the two-argument
#          callback shape but not used here
fun <- function(slice, keys) {
  fit <- lm(Petal.Length ~ Sepal.Length, data = slice)
  broom::tidy(fit)
}

# Run the callback once per species; the grouping column is prepended to the
# combined result.
iris %>%
  group_by(Species) %>%
  group_modify(fun)
## # A tibble: 6 x 6
## # Groups: Species [3]
## Species term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa (Intercept) 0.803 0.344 2.34 2.38e- 2
## 2 setosa Sepal.Length 0.132 0.0685 1.92 6.07e- 2
## 3 versicolor (Intercept) 0.185 0.514 0.360 7.20e- 1
## 4 versicolor Sepal.Length 0.686 0.0863 7.95 2.59e-10
## 5 virginica (Intercept) 0.610 0.417 1.46 1.50e- 1
## 6 virginica Sepal.Length 0.750 0.0630 11.9 6.30e-16
# group_modify() also accepts a formula; `.x` is the current group's data.
iris %>%
  group_by(Species) %>%
  group_modify(
    ~ broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))
  )
## # A tibble: 6 x 6
## # Groups: Species [3]
## Species term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa (Intercept) 0.803 0.344 2.34 2.38e- 2
## 2 setosa Sepal.Length 0.132 0.0685 1.92 6.07e- 2
## 3 versicolor (Intercept) 0.185 0.514 0.360 7.20e- 1
## 4 versicolor Sepal.Length 0.686 0.0863 7.95 2.59e-10
## 5 virginica (Intercept) 0.610 0.417 1.46 1.50e- 1
## 6 virginica Sepal.Length 0.750 0.0630 11.9 6.30e-16
group_map
# group_map() returns a plain list with one element per group (here one
# fitted lm object per species) rather than a combined tibble.
iris %>%
  group_by(Species) %>%
  group_map(~ lm(Petal.Length ~ Sepal.Length, data = .x))
## [[1]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
## (Intercept) Sepal.Length
## 0.8031 0.1316
##
##
## [[2]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
## (Intercept) Sepal.Length
## 0.1851 0.6865
##
##
## [[3]]
##
## Call:
## lm(formula = Petal.Length ~ Sepal.Length, data = .x)
##
## Coefficients:
## (Intercept) Sepal.Length
## 0.6105 0.7501
group_modify()
diy with group_map()
# Rebuild group_modify() by hand: tidy each group's model, tag the rows with
# the group key taken from `.y`, stack the pieces, then regroup.
iris %>%
  group_by(Species) %>%
  group_map(~ {
    coefs <- broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))
    tibble::add_column(coefs, Species = .y$Species)
  }) %>%
  bind_rows() %>%
  group_by(Species)
## # A tibble: 6 x 6
## # Groups: Species [3]
## term estimate std.error statistic p.value Species
## <chr> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 (Intercept) 0.803 0.344 2.34 2.38e- 2 setosa
## 2 Sepal.Length 0.132 0.0685 1.92 6.07e- 2 setosa
## 3 (Intercept) 0.185 0.514 0.360 7.20e- 1 versicolor
## 4 Sepal.Length 0.686 0.0863 7.95 2.59e-10 versicolor
## 5 (Intercept) 0.610 0.417 1.46 1.50e- 1 virginica
## 6 Sepal.Length 0.750 0.0630 11.9 6.30e-16 virginica
# Same hand-rolled group_modify(), but the key columns are spliced in with
# `!!!.y` so they do not have to be named one by one.
iris %>%
  group_by(Species) %>%
  group_map(~ {
    coefs <- broom::tidy(lm(Petal.Length ~ Sepal.Length, data = .x))
    tibble::add_column(coefs, !!!.y)
  }) %>%
  bind_rows() %>%
  group_by(Species)
## # A tibble: 6 x 6
## # Groups: Species [3]
## term estimate std.error statistic p.value Species
## <chr> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 (Intercept) 0.803 0.344 2.34 2.38e- 2 setosa
## 2 Sepal.Length 0.132 0.0685 1.92 6.07e- 2 setosa
## 3 (Intercept) 0.185 0.514 0.360 7.20e- 1 versicolor
## 4 Sepal.Length 0.686 0.0863 7.95 2.59e-10 versicolor
## 5 (Intercept) 0.610 0.417 1.46 1.50e- 1 virginica
## 6 Sepal.Length 0.750 0.0630 11.9 6.30e-16 virginica
group_split()
# Split the grouped data into a list of tibbles, one per species.
iris %>%
  group_by(Species) %>%
  group_split()
## [[1]]
## # A tibble: 50 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 40 more rows
##
## [[2]]
## # A tibble: 50 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 7 3.2 4.7 1.4 versicolor
## 2 6.4 3.2 4.5 1.5 versicolor
## 3 6.9 3.1 4.9 1.5 versicolor
## 4 5.5 2.3 4 1.3 versicolor
## 5 6.5 2.8 4.6 1.5 versicolor
## 6 5.7 2.8 4.5 1.3 versicolor
## 7 6.3 3.3 4.7 1.6 versicolor
## 8 4.9 2.4 3.3 1 versicolor
## 9 6.6 2.9 4.6 1.3 versicolor
## 10 5.2 2.7 3.9 1.4 versicolor
## # … with 40 more rows
##
## [[3]]
## # A tibble: 50 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 6.3 3.3 6 2.5 virginica
## 2 5.8 2.7 5.1 1.9 virginica
## 3 7.1 3 5.9 2.1 virginica
## 4 6.3 2.9 5.6 1.8 virginica
## 5 6.5 3 5.8 2.2 virginica
## 6 7.6 3 6.6 2.1 virginica
## 7 4.9 2.5 4.5 1.7 virginica
## 8 7.3 2.9 6.3 1.8 virginica
## 9 6.7 2.5 5.8 1.8 virginica
## 10 7.2 3.6 6.1 2.5 virginica
## # … with 40 more rows
group_data()
# One row per group: the key column(s) plus a `.rows` list column holding
# the integer row positions that belong to each group.
iris %>%
  group_by(Species) %>%
  group_data()
## # A tibble: 3 x 2
## Species .rows
## <fct> <list>
## 1 setosa <int [50]>
## 2 versicolor <int [50]>
## 3 virginica <int [50]>
group_keys()
# Only the grouping keys, one row per group.
iris %>%
  group_by(Species) %>%
  group_keys()
## # A tibble: 3 x 1
## Species
## <fct>
## 1 setosa
## 2 versicolor
## 3 virginica
group_rows()
# Only the row positions, as a list with one integer vector per group.
iris %>%
  group_by(Species) %>%
  group_rows()
## [[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50
##
## [[2]]
## [1] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [18] 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
## [35] 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
##
## [[3]]
## [1] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [18] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
## [35] 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
summarise
# Plain summarise(): every output column is spelled out by hand, pairing a
# column name with the action (here: the mean) applied to it.
iris %>%
  group_by(Species) %>%
  summarise(
    Petal.Width = mean(Petal.Width),
    Petal.Length = mean(Petal.Length),
    Sepal.Width = mean(Sepal.Width),
    Sepal.Length = mean(Sepal.Length)
  )
## # A tibble: 3 x 5
## Species Petal.Width Petal.Length Sepal.Width Sepal.Length
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 0.246 1.46 3.43 5.01
## 2 versicolor 1.33 4.26 2.77 5.94
## 3 virginica 2.03 5.55 2.97 6.59
summarise_at
# summarise_at(): choose the columns with tidyselect helpers and apply one
# function to all of them.
iris %>%
  group_by(Species) %>%
  summarise_at(vars(contains("Petal"), contains("Sepal")), mean)
## # A tibble: 3 x 5
## Species Petal.Length Petal.Width Sepal.Length Sepal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 1.46 0.246 5.01 3.43
## 2 versicolor 4.26 1.33 5.94 2.77
## 3 virginica 5.55 2.03 6.59 2.97
# Trimmed mean of `.x`: a fraction of 0.2 of the observations is dropped
# from each end before averaging (see the `trim` argument of mean()).
trim_mean <- function(.x) {
  mean(.x, trim = 0.2)
}
# Apply the named helper to every column whose name contains a dot.
iris %>%
  group_by(Species) %>%
  summarise_at(vars(contains(".")), trim_mean)
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5 3.41 1.46 0.22
## 2 versicolor 5.91 2.80 4.31 1.34
## 3 virginica 6.55 2.96 5.49 2.02
# The same trimmed mean written inline as a formula; `.x` stands for the
# column being summarised.
iris %>%
  group_by(Species) %>%
  summarise_at(vars(contains(".")), ~ mean(.x, trim = 0.2))
## # A tibble: 3 x 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5 3.41 1.46 0.22
## 2 versicolor 5.91 2.80 4.31 1.34
## 3 virginica 6.55 2.96 5.49 2.02
# A named list applies several functions to each selected column; the list
# names become the `_mean` / `_median` suffixes of the output columns.
iris %>%
  group_by(Species) %>%
  summarise_at(
    vars(starts_with("Sepal")),
    list(mean = mean, median = median)
  )
## # A tibble: 3 x 5
## Species Sepal.Length_me… Sepal.Width_mean Sepal.Length_me…
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 5.01 3.43 5
## 2 versic… 5.94 2.77 5.9
## 3 virgin… 6.59 2.97 6.5
## # … with 1 more variable: Sepal.Width_median <dbl>
# Functions and formulas can be mixed inside the named list.
iris %>%
  group_by(Species) %>%
  summarise_at(
    vars(starts_with("Sepal")),
    list(mean = ~ mean(.x, trim = 0.2), median = median)
  )
## # A tibble: 3 x 5
## Species Sepal.Length_me… Sepal.Width_mean Sepal.Length_me…
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 5 3.41 5
## 2 versic… 5.91 2.80 5.9
## 3 virgin… 6.55 2.96 6.5
## # … with 1 more variable: Sepal.Width_median <dbl>
# Named list of unevaluated `mean(<column>)` calls, one per column whose
# name starts with "Petal".
Petal_exprs <- names(iris) %>%
  tidyselect::vars_select(starts_with("Petal")) %>%
  purrr::map(~ expr(mean(!!sym(.))))
Petal_exprs
## $Petal.Length
## mean(Petal.Length)
##
## $Petal.Width
## mean(Petal.Width)
# Named list of unevaluated `median(<column>)` calls, one per column whose
# name starts with "Sepal".
Sepal_exprs <- names(iris) %>%
  tidyselect::vars_select(starts_with("Sepal")) %>%
  purrr::map(~ expr(median(!!sym(.))))
Sepal_exprs
## $Sepal.Length
## median(Sepal.Length)
##
## $Sepal.Width
## median(Sepal.Width)
# Splice both expression lists into a single summarise() call: means for the
# Petal columns, medians for the Sepal columns.
iris %>%
  group_by(Species) %>%
  summarise(
    !!!Petal_exprs,
    !!!Sepal_exprs
  )
## # A tibble: 3 x 5
## Species Petal.Length Petal.Width Sepal.Length Sepal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 1.46 0.246 5 3.4
## 2 versicolor 4.26 1.33 5.9 2.8
## 3 virginica 5.55 2.03 6.5 3
dance
# Install dance before loading it. install.packages("dance") only works if
# the package is available on CRAN.
# NOTE(review): the package appears to be distributed via GitHub
# (https://github.com/romainfrancois/dance); if so, use
# remotes::install_github("romainfrancois/dance") instead - confirm.
library(dance)
# Mean of the Petal columns and median of the Sepal columns per species,
# each expressed as one swing() over a tidy selection.
iris %>%
group_by(Species) %>%
tango(
swing(mean, starts_with("Petal")),
swing(median, starts_with("Sepal"))
)
## # A tibble: 3 x 5
## Species Petal.Length Petal.Width Sepal.Length Sepal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 1.46 0.246 5 3.4
## 2 versicolor 4.26 1.33 5.9 2.8
## 3 virginica 5.55 2.03 6.5 3
https://github.com/romainfrancois/dance
library(dance)
# Grouped copy of iris reused by the dance examples that follow.
g <- group_by(iris, Species)
waltz(), polka(), tango(), charleston()
These are in the neighborhood of dplyr::summarise()
waltz()
takes a grouped tibble and a list of formulas and returns a tibble with as many columns as supplied formulas and one row per group. It does not prepend the grouping variables (see tango()
for that)
# One output column per named formula, one row per group, no key columns.
g %>%
  waltz(
    Sepal.Length = ~ mean(Sepal.Length),
    Sepal.Width = ~ mean(Sepal.Width)
  )
## # A tibble: 3 x 2
## Sepal.Length Sepal.Width
## <dbl> <dbl>
## 1 5.01 3.43
## 2 5.94 2.77
## 3 6.59 2.97
polka()
deals with peeling off one layer of grouping:
# Returns the grouping keys only, one row per group.
g %>% polka()
## # A tibble: 3 x 1
## Species
## <fct>
## 1 setosa
## 2 versicolor
## 3 virginica
tango()
binds the results of polka()
and waltz()
so it is the closest to dplyr::summarise()
# Key columns followed by one summary column per named formula.
g %>%
  tango(
    Sepal.Length = ~ mean(Sepal.Length),
    Sepal.Width = ~ mean(Sepal.Width)
  )
## # A tibble: 3 x 3
## Species Sepal.Length Sepal.Width
## <fct> <dbl> <dbl>
## 1 setosa 5.01 3.43
## 2 versicolor 5.94 2.77
## 3 virginica 6.59 2.97
# Same summaries as with tango(), but the printed result has two columns:
# the key and a single `data` column that holds both summaries.
g %>%
  charleston(
    Sepal.Length = ~ mean(Sepal.Length),
    Sepal.Width = ~ mean(Sepal.Width)
  )
## # A tibble: 3 x 2
## Species data$Sepal.Length $Sepal.Width
## <fct> <dbl> <dbl>
## 1 setosa 5.01 3.43
## 2 versicolor 5.94 2.77
## 3 virginica 6.59 2.97
swing, twist
There is no waltz_at()
, tango_at()
, etc… but instead we can use either the same function on a set of columns or a set of functions on the same column.
For this, we need to learn new dance moves:
swing()
and twist()
are for applying the same function to a set of columns:
library(tidyselect)
# swing() applies one function to every column in the tidy selection and
# creates one output column per selected column.
g %>%
  tango(
    swing(mean, starts_with("Petal"))
  )
## # A tibble: 3 x 3
## Species Petal.Length Petal.Width
## <fct> <dbl> <dbl>
## 1 setosa 1.46 0.246
## 2 versicolor 4.26 1.33
## 3 virginica 5.55 2.03
#g %>%
# tango(data = twist(mean, starts_with("Petal")))
They differ in the type of column that is created and in how the columns are named:
swing()
makes as many new columns as are selected by the tidy selection, and the columns are named using a .name
glue pattern, this way we might swing()
several times.
g %>%
tango(
swing(mean, starts_with("Petal"), .name = "mean_{var}"),
swing(median, starts_with("Petal"), .name = "median_{var}")
)
## # A tibble: 3 x 5
## Species mean_Petal.Leng… mean_Petal.Width median_Petal.Le…
## <fct> <dbl> <dbl> <dbl>
## 1 setosa 1.46 0.246 1.5
## 2 versic… 4.26 1.33 4.35
## 3 virgin… 5.55 2.03 5.55
## # … with 1 more variable: median_Petal.Width <dbl>
twist()
instead creates a single data frame column.
#g %>%
# tango(
# mean = twist(mean, starts_with("Petal")),
# median = twist(median, starts_with("Petal"))
# )
The first argument of swing()
and twist()
is either a function or a formula that uses .
as a placeholder. Subsequent arguments are tidyselect selections.
You can combine swing()
and twist()
in the same tango()
or waltz()
:
#g %>%
# tango(
# swing(mean, starts_with("Petal"), .name = "mean_{var}"),
# median = twist(median, contains("."))
# )
rumba, zumba
Similarly rumba
can be used to apply several functions to a single column. rumba
creates single columns and zumba
packs them into a data frame column.
#g %>%
# tango(
# rumba(Sepal.Width, mean = mean, median = median, .name = "Sepal_{fun}"),
# Petal = zumba(Petal.Width, mean = mean, median = median)
# )
# Build a 1001-fold copy of iris (the original rows plus 1000 repeats) to
# use as benchmark input. Indexing with a repeated row vector allocates the
# result once, instead of re-copying an ever-growing data frame on each of
# 1000 rbind() calls.
big_iris <- iris[rep(seq_len(nrow(iris)), times = 1001L), ]
# Repeated row indices produce row names such as "1.1"; reset them to the
# plain 1..n sequence that the rbind() version produced.
rownames(big_iris) <- NULL
dim(big_iris)
## [1] 150150 5
library(microbenchmark)
# Time the dance version on the enlarged data: mean of the Petal columns and
# median of the Sepal columns per species.
# NOTE(review): the trailing figure does not match the mean printed in the
# output below; presumably it was recorded from a different run.
microbenchmark(big_iris %>%
group_by(Species) %>%
tango(
swing(mean, starts_with("Petal")),
swing(median, starts_with("Sepal"))
)) # mean: 54.63046
## Unit: milliseconds
## expr
## big_iris %>% group_by(Species) %>% tango(swing(mean, starts_with("Petal")), swing(median, starts_with("Sepal")))
## min lq mean median uq max neval
## 38.51598 64.9358 77.71282 77.27815 89.67084 133.7088 100
# Time summarise_at() on the enlarged data.
# NOTE(review): this computes a trimmed mean and a median of the Sepal
# columns only, which is not the same summary as the tango() benchmark
# (mean of Petal, median of Sepal), so the two timings are not directly
# comparable. The trailing figure also differs from the mean printed below;
# presumably it was recorded from a different run.
microbenchmark(big_iris %>%
group_by(Species) %>%
summarise_at(
vars(starts_with("Sepal")),
list(
mean = ~ mean(.x, trim = .2),
median = median
)
)) # mean: 72.13311
## Unit: milliseconds
## expr
## big_iris %>% group_by(Species) %>% summarise_at(vars(starts_with("Sepal")), list(mean = ~mean(.x, trim = 0.2), median = median))
## min lq mean median uq max neval
## 44.01962 54.72252 68.64215 65.31614 74.98087 267.4052 100
# Named list of unevaluated `mean(<column>)` calls, one per column whose
# name starts with "Petal" (rebuilt here for the benchmark).
Petal_exprs <- names(iris) %>%
  tidyselect::vars_select(starts_with("Petal")) %>%
  purrr::map(~ expr(mean(!!sym(.))))
Petal_exprs
## $Petal.Length
## mean(Petal.Length)
##
## $Petal.Width
## mean(Petal.Width)
# Named list of unevaluated `median(<column>)` calls, one per column whose
# name starts with "Sepal".
Sepal_exprs <- tidyselect::vars_select(names(iris), starts_with("Sepal")) %>%
purrr::map(~ expr(median(!!sym(.))))
# Time plain summarise() with both expression lists spliced in: mean of the
# Petal columns and median of the Sepal columns, on the enlarged data.
# NOTE(review): the trailing figure differs slightly from the mean printed
# below; presumably it was recorded from a different run.
microbenchmark(big_iris %>%
group_by(Species) %>%
summarise(!!!Petal_exprs, !!!Sepal_exprs)) # mean: 40.16754
## Unit: milliseconds
## expr
## big_iris %>% group_by(Species) %>% summarise(!!!Petal_exprs, !!!Sepal_exprs)
## min lq mean median uq max neval
## 18.47395 30.13195 41.37051 41.40793 52.01668 71.00264 100
salsa, chacha, samba, madison
Now we enter the realms of dplyr::mutate()
with:
salsa()
: to create new columns
chacha()
: to reorganize a grouped tibble so that data for each group is contiguous
samba()
: chacha()
+ salsa()
# salsa() returns only the newly created columns, one row per input row.
g %>%
  salsa(
    Sepal = ~ Sepal.Length * Sepal.Width,
    Petal = ~ Petal.Length * Petal.Width
  )
## # A tibble: 150 x 2
## Sepal Petal
## <dbl> <dbl>
## 1 17.8 0.280
## 2 14.7 0.280
## 3 15.0 0.26
## 4 14.3 0.3
## 5 18 0.280
## 6 21.1 0.68
## 7 15.6 0.42
## 8 17 0.3
## 9 12.8 0.280
## 10 15.2 0.15
## # … with 140 more rows
You can swing()
, twist()
, rumba()
and zumba()
here too, and if you want the original data, you can use samba()
instead of salsa()
:
#g %>%
# samba(centered = twist(~ . - mean(.), everything(), -Species))
madison()
packs the columns salsa()
would have created
#g %>%
# madison(swing(~ . - mean(.), starts_with("Sepal")))
bolero and mambo
bolero()
is similar to dplyr::filter
. The formulas may be made by mambo()
if you want to apply the same predicate to a tidyselection of columns:
# Keep the rows for which the predicate formula is TRUE; the result stays
# grouped by Species.
g %>%
  bolero(~ Sepal.Width > 4)
## # A tibble: 3 x 5
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.7 4.4 1.5 0.4 setosa
## 2 5.2 4.1 1.5 0.1 setosa
## 3 5.5 4.2 1.4 0.2 setosa
# Time bolero() on the enlarged data.
# NOTE(review): the trailing figure differs from the mean printed below;
# presumably it was recorded from a different run.
microbenchmark(big_iris %>% group_by(Species) %>% bolero(~Sepal.Width > 4)) # mean: 23.64931
## Unit: milliseconds
## expr min
## big_iris %>% group_by(Species) %>% bolero(~Sepal.Width > 4) 11.83451
## lq mean median uq max neval
## 22.17752 32.38803 28.74377 40.96771 124.8217 100
# Time the equivalent dplyr::filter() call on the enlarged data.
# NOTE(review): the trailing figure differs from the mean printed below;
# presumably it was recorded from a different run.
microbenchmark(big_iris %>% group_by(Species) %>% filter(Sepal.Width > 4)) # mean: 19.62825
## Unit: milliseconds
## expr min
## big_iris %>% group_by(Species) %>% filter(Sepal.Width > 4) 6.906776
## lq mean median uq max neval
## 13.05112 18.65715 15.38857 22.11206 58.20996 100
#g %>%
# bolero(mambo(~. > 4, starts_with("Sepal")))
#g %>%
# bolero(mambo(~. > 4, starts_with("Sepal"), .op = or))