MC09
MC09

Reputation: 95

R: Apply function over nested lists with varying length

I have the following dataframe:

df <- data.frame(id = paste0('id', sample(c(1:4),80000, replace = TRUE)), date = as.Date(rbeta(80000, 0.7, 10) * 100, origin = "2016-01-01"), 
             variant = sample(c(0:1), 80000, replace = TRUE), type = sample(paste0(LETTERS[1:3],LETTERS[1]), 80000, TRUE), code = sample(letters[1:2], 80000, TRUE), 
             level = sample(LETTERS[1:8], 80000, TRUE), number = sample(c(1:100), 80000, replace = TRUE) )

Next, I split the dataframe several times and combine them (plus the original df) in a list:

dfs <- split(df,df$id)
df2 <- lapply(dfs, function(x) split(x,x$type))

df3 <- lapply(dfs, function(x) split(x,x$code))

df4 <- lapply(dfs, function(x) split(x,x$level))

df_all <- list(dfs,df2,df3,df4)

Thus, I first split the dataframe by Id, after which their are splitted on several conditions: none,type,code and level. Where "none" means that I don't split it any further.

My first question: is there a faster/cleaner way to achieve this?

Second question: how do I apply a function to each element of this list? It will probably will have something to do with lapply, but I can't figure out how, as the number of nested lists varies. Thus, to make it more clear, I would like to know how to apply my function to:

df_all[[1]]$id1
df_all[[1]]$id2
df_all[[1]]$id3
df_all[[1]]$id4
df_all[[2]]$id1$AA
df_all[[2]]$id1$BA
df_all[[2]]$id1$CA
df_all[[2]]$id2$AA
etc.

My function is as follows:

func <- function(x){
x <- x %>%
  group_by(variant) %>%
  summarise(H = sum(number)) %>%
  ungroup()

Upvotes: 2

Views: 789

Answers (1)

acylam
acylam

Reputation: 18681

If all you wanted to do is group by different combination of variables and summarize, then splitting the groups is probably not a good idea, just modify the function so that you can input different combinations of group by variables like the following:

library(dplyr)

func2 <- function(x, ...){
  group_quo = quos(...)
  x %>%
    group_by(!!!group_quo) %>%
    summarize(H = sum(number)) 
}

Result:

> func2(df, id, variant)
# A tibble: 8 x 3
# Groups:   id [?]
  id    variant      H
  <fct>   <int>  <int>
1 id1         0 500192
2 id1         1 508282
3 id2         0 505829
4 id2         1 511855
5 id3         0 502280
6 id3         1 510854
7 id4         0 502621
8 id4         1 510372

> func2(df, id, type, variant)
# A tibble: 24 x 4
# Groups:   id, type [?]
   id    type  variant      H
   <fct> <fct>   <int>  <int>
 1 id1   AA          0 167757
 2 id1   AA          1 169025
 3 id1   BA          0 166225
 4 id1   BA          1 168208
 5 id1   CA          0 166210
 6 id1   CA          1 171049
 7 id2   AA          0 169277
 8 id2   AA          1 172240
 9 id2   BA          0 168596
10 id2   BA          1 169396
# ... with 14 more rows

etc.

If you're trying to apply something more complex or you want to keep the hierarchical structure of the lists, you can try to use nested data.frames:

library(dplyr)
library(tidyr)
library(purrr)

func <- function(x){
  x %>%
    group_by(variant) %>%
    summarize(H = sum(number)) 
}

df_nested = df %>% 
  group_by(id) %>%
  nest() %>%
  mutate(df1 = data %>% map(func),
         df2 = data %>% map(~group_by(., type) %>% nest()),
         df3 = data %>% map(~group_by(., code) %>% nest()),
         df4 = data %>% map(~group_by(., level) %>% nest())) %>%
  mutate_at(vars(df2:df4), 
            funs(map(., function(x) mutate(x, data = map(data, func)) %>% unnest))) 

Result:

> df_nested
# A tibble: 4 x 6
  id    data                  df1              df2              df3              df4              
  <fct> <list>                <list>           <list>           <list>           <list>           
1 id1   <tibble [19,963 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
2 id3   <tibble [19,946 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
3 id2   <tibble [20,114 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>
4 id4   <tibble [19,977 x 6]> <tibble [2 x 2]> <tibble [6 x 3]> <tibble [4 x 3]> <tibble [16 x 3]>

> df_nested %>% 
+   select(id, data) %>%
+   unnest()
# A tibble: 80,000 x 7
   id    date       variant type  code  level number
   <fct> <date>       <int> <fct> <fct> <fct>  <int>
 1 id1   2016-01-05       1 AA    b     H         71
 2 id1   2016-01-01       0 CA    a     G         85
 3 id1   2016-01-03       0 CA    a     E         98
 4 id1   2016-01-01       1 BA    b     E         78
 5 id1   2016-01-01       1 BA    b     G         64
 6 id1   2016-01-18       1 AA    a     E         69
 7 id1   2016-01-04       1 BA    b     E         12
 8 id1   2016-01-02       0 CA    b     B         32
 9 id1   2016-01-01       1 CA    a     B         44
10 id1   2016-01-02       0 BA    a     F         89
# ... with 79,990 more rows

> df_nested %>% 
+   select(id, df1) %>%
+   unnest()
# A tibble: 8 x 3
  id    variant      H
  <fct>   <int>  <int>
1 id1         0 500192
2 id1         1 508282
3 id3         0 502280
4 id3         1 510854
5 id2         0 505829
6 id2         1 511855
7 id4         0 502621
8 id4         1 510372

Upvotes: 2

Related Questions