Reputation: 8494
I am interested in the difference in the mean of some variable according to a binary covariate.
I am computing the confidence interval of this difference by bootstrapping:
library(tidyverse)
df = mtcars %>%
  select(disp, vs) %>%
  mutate(vs = factor(vs, labels = c("vshaped", "straight")))
by1 = "straight"
by2 = "vshaped"
R = 1000
set.seed(1)
beffect = numeric(length = R)
for (i in 1:R) {
  ib = sample(1:nrow(df), replace = TRUE)
  xi = df$disp[ib]
  byi = df$vs[ib]
  beffect[i] = mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
Created on 2021-06-13 by the reprex package (v2.0.0)
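For completeness, a percentile confidence interval can then be read off the bootstrap distribution. A minimal sketch (the 95% level is an assumption, not something fixed above):
# percentile bootstrap CI for the difference in means (95% level assumed)
quantile(beffect, probs = c(0.025, 0.975))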
This works, but I find it quite unreadable, and I wonder about its efficiency, as for loops are often considered bad design in R.
Being a heavy user of the tidyverse, I would like to rewrite this using that framework.
Is there a fast and readable way to do so?
PS: Here is the closest I could get, but it is far from being more readable and it is 250 times slower:
beffect2 = replicate(R, {
  df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp)) %>%
    pivot_wider(names_from = "vs", values_from = "m") %>%
    transmute(x = !!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
  map_dbl(identity)
EDIT: here are the benchmarks of all methods so far:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
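(For reference, each f_*(R) above is a small wrapper that takes the number of bootstrap replicates as its argument. A minimal sketch of f_forloop, assuming the other wrappers are built the same way around their respective pipelines:)
f_forloop = function(R) {
  # the original for loop, parameterised by the number of replicates
  beffect = numeric(length = R)
  for (i in 1:R) {
    ib = sample(1:nrow(df), replace = TRUE)
    xi = df$disp[ib]
    byi = df$vs[ib]
    beffect[i] = mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
  }
  beffect
}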
Upvotes: 0
Views: 356
Reputation: 887223
We could use map and avoid the multiple pivot_wider steps:
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
  ib <- sample(1:nrow(df), replace = TRUE)
  df %>%
    slice(ib) %>%
    summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                mean(disp[vs == by1], na.rm = TRUE))
})
Checking:
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
Or we may use diff instead of pivot_wider:
set.seed(1)
out2 <- replicate(R, df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp), .groups = 'drop') %>%
    summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
  bind_rows
Checking:
mean(out2$beffect)
#[1] 175.9203
Or another option would be to do the sample, bind the samples together with a group identifier, use that identifier to extract the values of the columns, group by the identifier and 'vs', and get the mean:
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
    as_tibble, simplify = FALSE) %>%
  bind_rows(.id = 'grp') %>%
  mutate(vs = df$vs[value], disp = df$disp[value]) %>%
  group_by(grp, vs) %>%
  summarise(beffect = mean(disp), .groups = 'drop_last') %>%
  group_by(grp) %>%
  summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
Checking:
mean(out3$beffect)
#[1] 175.9203
Timings:
system.time({
  set.seed(1)
  out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
      as_tibble, simplify = FALSE) %>%
    bind_rows(.id = 'grp') %>%
    mutate(vs = df$vs[value], disp = df$disp[value]) %>%
    group_by(grp, vs) %>%
    summarise(beffect = mean(disp), .groups = 'drop_last') %>%
    group_by(grp) %>%
    summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
})
# user system elapsed
# 0.202 0.007 0.208
Or with map
system.time({
  set.seed(1)
  out <- map_dfr(seq_len(R), ~ {
    ib <- sample(1:nrow(df), replace = TRUE)
    df %>%
      slice(ib) %>%
      summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
                  mean(disp[vs == by1], na.rm = TRUE))
  })
})
# user system elapsed
# 1.329 0.013 1.338
Or instead of pivot_wider, take the diff:
system.time({
  set.seed(1)
  out2 <- replicate(R, df %>%
      slice_sample(prop = 1, replace = TRUE) %>%
      group_by(vs) %>%
      summarise(m = mean(disp), .groups = 'drop') %>%
      summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
    bind_rows
})
# user system elapsed
# 3.753 0.027 3.758
Or a similar approach in data.table:
library(data.table)
system.time({
  setDT(df)
  set.seed(1)
  out3 <- rbindlist(
    replicate(R,
      df[df[, .I[sample(seq_len(.N), replace = TRUE)]]][,
        .(m = mean(disp)), vs][, .(beffect = m[2] - m[1])],
      simplify = FALSE)
  )
})
# user system elapsed
# 1.181 0.055 1.230
OP's method:
system.time({replicate(R, {
  df %>%
    slice_sample(prop = 1, replace = TRUE) %>%
    group_by(vs) %>%
    summarise(m = mean(disp)) %>%
    pivot_wider(names_from = "vs", values_from = "m") %>%
    transmute(x = !!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
#   user  system elapsed
#  6.991   0.063   7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a
Upvotes: 3
Reputation: 11255
This may be overlooking the obvious, but the tidyverse equivalent of a for loop would involve something like purrr::map(). The simplest conversion would be to use purrr::map_dbl(1:R, ...), such as:
library(purrr)

## better for memory and performance to extract vectors ahead of the loop
disp = df$disp
vs = df$vs

map_dbl(1:R,
        ~ {
          ib = sample(nrow(df), replace = TRUE)
          xi = disp[ib]
          byi = vs[ib]
          mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
        })
Also, since by is binary, you may be able to improve performance by translating this into Rcpp.
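A rough sketch of that idea, assuming Rcpp is installed (the helper name and its interface are hypothetical, and NA handling is omitted since disp has no missing values here):
library(Rcpp)

# hypothetical helper: mean of group 2 minus mean of group 1, computed in one pass
cppFunction('
double diff_means(NumericVector x, LogicalVector in_grp2) {
  double s1 = 0, s2 = 0;
  int n1 = 0, n2 = 0;
  for (int i = 0; i < x.size(); ++i) {
    if (in_grp2[i]) { s2 += x[i]; ++n2; } else { s1 += x[i]; ++n1; }
  }
  return s2 / n2 - s1 / n1;
}')

# usage inside the bootstrap loop, e.g.:
# beffect[i] = diff_means(disp[ib], vs[ib] == by2)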
Upvotes: 3