Make bootstrap function more efficient with lapply

Question

I have a data frame with numeric columns and a character column with labels. See example:

library(tidyverse)

a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10",  "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" ,  "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
  transform(a = as.numeric(a)) %>%
  transform(b = as.numeric(b)) %>%
  transform(c = as.numeric(c))

I have written a function that takes a sample of rows for each label (number of rows in sample = number of rows for the specific label) and as output gives the average of the samples. Example: in the source data (dataframe) there are 3 rows of the label "fa05". Lets call them fa05_1, fa05_2, fa05_3 (just for explaining it). The function takes a sample of these three rows that each consist of 3 columns (a,b and c). The number of fa05 in the sample equals the number fa05 in the source data, so 3 in this case. The function takes a sample with replacement so it could fx be fa05_3, fa05_1, fa05_1. Then it takes the average of those three samples for each of the three columns a,b and c and gives the output. It looks like this:

samp <- function(df, col1, var){
  df %>% 
    group_by(!!col1) %>% 
    nest() %>%
    ungroup() %>% 
    mutate(n = !!var) %>%
    mutate(samp = map2(data, n, sample_n, replace=T)) %>%
    select(-data) %>%
    unnest(samp) %>%
    group_by(!!col1) %>%
    dplyr::summarise(across("a":"c", mean))
}

list <- c(3,3,2,2,2,6,4,3) # The number of times each label occur in the data 

samp(dataframe, quo(label), quo(list))

  label        a       b       c
            
1 fa05   0.00439 0.00416 0.00395
2 fa10   0.00894 0.00820 0.00752
3 fa20   0.00672 0.00634 0.00597
4 faflat 0.00908 0.00859 0.00811
5 sa05   0.0552  0.0395  0.0281 
6 sa10   0.00715 0.00657 0.00603
7 sa20   0.0101  0.00956 0.00903
8 saflat 0.0250  0.0211  0.0177

I would like to use this function on some data and repeat it 1000 times efficiently. At first it was not a function and I used rerun() but that was very inefficient. I read that I could write it as a function and the use lapply which should be more efficient, but it does not work when I do like this:

lapply(dataframe, samp, col1=quo(Pattern), var=quo(list))

Error in UseMethod("group_by_") : 
  no applicable method for 'group_by_' applied to an object of class "c('double', 'numeric')"

How do I make this work with lapply? And how to I tell lapply to rerun the function 1000 times? I hope you can help.

ekoam · Accepted Answer

You can just do this

replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)

However, this is really slow.

> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
   user  system elapsed 
  33.83    0.03   33.87

To make it faster, we need to rewrite your samp function. Here is a tidyverse approach

group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)

prep <- function(df, grp_var, sample_size) {
  df %>% 
    mutate(size = sample_size[.data[[grp_var]]]) %>% 
    group_by(across(!!grp_var))
}

rep_sample <- function(df, n) {
  replicate(
    n,
    df %>% 
      slice(sample.int(n(), size[[1L]], replace = TRUE)) %>% 
      summarise(across(a:c, mean), .groups = "drop"), 
    simplify = FALSE
  )
}

dataframe %>% 
  prep("label", group_sample_size) %>% 
  rep_sample(1000)

Performance has improved significantly but is still suboptimal IMO. It takes about 5-6 seconds to finish the simulation.

> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
   user  system elapsed 
   5.80    0.01    5.81

For efficiency, I think the following data.table approach would be better.

library(data.table)

fsamp <- function(df, grp_var, size, nsim) {
  df <- as.data.table(df)
  group_info <- table(df[[grp_var]], dnn = list(grp_var))
  simu_pool <- df[, -grp_var, with = FALSE]
  simu_vars <- names(simu_pool)
  simu_pool <- split(simu_pool, df[[grp_var]])
  
  out <- data.table(
    simu = rep(seq_len(nsim), each = length(group_info)), 
    group_info
  )
  
  out[
    , size := size[out[[grp_var]]]
  ][
    , (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
    by = c("simu", grp_var)
  ][]
}

This one is about four times faster than the optimised tidyverse approach.

> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
   user  system elapsed 
   1.47    0.04    1.50

All three approaches produce the same set of results

> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
      simu  label N size           a           b           c
   1:    1   fa05 3    3 0.015383909 0.013350778 0.011561460
   2:    1   fa10 3    3 0.763161377 0.371405971 0.160972865
   3:    1   fa20 2    2 0.006717308 0.006340109 0.005970452
   4:    1 faflat 2    2 0.009076331 0.008589699 0.008110854
   5:    1   sa05 2    2 0.055184818 0.039548290 0.028102024
  ---                                                       
7996: 1000 faflat 2    2 0.007826754 0.007378527 0.006940039
7997: 1000   sa05 2    2 0.099406840 0.068956516 0.046847336
7998: 1000   sa10 6    6 0.006648513 0.006118159 0.005626362
7999: 1000   sa20 4    4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3    3 0.008411304 0.007960328 0.007516569

> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

Make bootstrap function more efficient with lapply

Answers (1)

Related Questions