Reputation: 121
I have a deeply nested list I would like to convert to a data frame. Here's what the structure looks like:
ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8))),
                       '456' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)))),
           '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8))),
                       '456' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)))))
> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298
$G2.val
[1] -0.2761515
$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532 0.3503852 -0.7367241 0.3024992 -0.4931045
$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769 0.8733201 -0.4126742 1.8689940 0.1576750 -0.2184344
Each sublist name is the value of a different variable; in this example the levels might be:
ls[[i]]           = time: 10 or 20
ls[[i]][[j]]      = seed: 123 or 456
ls[[i]][[j]][[k]] = treatment: 0.1 or 0.2
Ideally, I'd like the names of the sublists to be used as values in their own columns, so that the data frame looks like this:
# time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#1 10 123 0.1 0.1972457 -0.1224265 0.06121407 1.5102516
#2 10 123 0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3 10 123 0.1 0.1972457 -0.1224265 0.06648820 -0.2477285
#4 10 123 0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5 10 123 0.1 0.1972457 -0.1224265 0.90828911 -1.0710828
#6 10 123 0.1 0.1972457 -0.1224265 0.56427976 1.5086222
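To make the target shape concrete, the rows for a single branch could be built by hand like this (illustration only; the goal is of course to do it automatically for every time/seed/treatment combination):
# hand-built rows for one branch, just to show the desired mapping of the
# list names (10 / 123 / 0.1) to columns; the length-1 values are recycled
with(ls[['10']][['123']][['0.1']],
     data.frame(time = 10, seed = 123, treatment = 0.1,
                Gmax.val = Gmax.val, G2.val = G2.val,
                Gmax.vec = Gmax.vec, G2.vec = G2.vec))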
Thanks for the help.
Upvotes: 11
Views: 2853
Reputation: 5798
Base R solution:
# Flatten the nested list: flat_long_df => data.frame
flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
  do.call(c, x)}, ls), data.frame))))

# Derive the key columns from the stacked names: long_df => data.frame
long_df <- cbind(within(flat_long_df, rm(ind)),
                 do.call("rbind",
                         lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
                           data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
                                            seed = as.numeric(gsub("X", "", x[2])),
                                            treatment = as.numeric(paste(x[3], x[4], sep = ".")),
                                            var = paste(x[5], x[6], sep = ".")))
                         })))

# Split data by the var vector: df_list => list of data.frames
df_list <- split(long_df, long_df$var)

# Reshape from long to wide: wide_df => data.frame
wide_df <- do.call("cbind", lapply(seq_along(df_list), function(i){
  setNames(within(df_list[[i]], rm(var)),
           c(names(df_list)[i], "time", "seed", "treatment"))
}))
Upvotes: 3
Reputation: 4841
Here is a base R solution which is 5 to 94 times faster than the other answers.
You can use a function like the one below, which will work even if you change the names of the innermost lists:
# turns a deeply nested list into a data.frame.
#
# Args:
#   x: list of nested lists. All need to have an identical setup and
#      names.
#   cnam: character vector with column names for the columns which are
#         from the non-terminal lists.
deep_nested_to_df <- function(x, cnam)
  .deep_nested_to_df(x, cnam)

# do not call this function directly
.deep_nested_to_df <- function(x, cnam, idx = 1L){
  # check if all elements are lists
  is_all_lists <- all(sapply(x, is.list))
  if(is_all_lists){
    # create data.frames out of the elements
    out <- lapply(x, .deep_nested_to_df, cnam = cnam, idx = idx + 1L)

    # check that all column names match
    my_cnam <- colnames(out[[1L]])
    stopifnot(all(length(out[[1L]]) == sapply(out, length)),
              all(sapply(out, function(x) all(colnames(x) == my_cnam))))

    # create the new column
    new_col <- c(mapply(rep, x = names(x), times = sapply(out, NROW)))

    # combine into one data.frame
    out <- do.call(rbind, out)

    # add the new column
    out <- do.call(cbind, list(
      as.data.frame(new_col, stringsAsFactors = FALSE), out))
    colnames(out)[1L] <- cnam[idx]

    if(idx == 1L)
      rownames(out) <- 1:NROW(out)
    return(out)
  }

  as.data.frame(x, stringsAsFactors = FALSE)
}
# use the function
res <- deep_nested_to_df(ls, c("time", "seed", "treatment"))
head(res, 16)
#R> time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#R> 1 10 123 0.1 -0.63 0.18 -0.836 1.512
#R> 2 10 123 0.1 -0.63 0.18 1.595 0.390
#R> 3 10 123 0.1 -0.63 0.18 0.330 -0.621
#R> 4 10 123 0.1 -0.63 0.18 -0.820 -2.215
#R> 5 10 123 0.1 -0.63 0.18 0.487 1.125
#R> 6 10 123 0.1 -0.63 0.18 0.738 -0.045
#R> 7 10 123 0.1 -0.63 0.18 0.576 -0.016
#R> 8 10 123 0.1 -0.63 0.18 -0.305 0.944
#R> 9 10 123 0.2 0.82 0.59 0.919 -0.478
#R> 10 10 123 0.2 0.82 0.59 0.782 0.418
#R> 11 10 123 0.2 0.82 0.59 0.075 1.359
#R> 12 10 123 0.2 0.82 0.59 -1.989 -0.103
#R> 13 10 123 0.2 0.82 0.59 0.620 0.388
#R> 14 10 123 0.2 0.82 0.59 -0.056 -0.054
#R> 15 10 123 0.2 0.82 0.59 -0.156 -1.377
#R> 16 10 123 0.2 0.82 0.59 -1.471 -0.415
str(res)
#R> 'data.frame': 64 obs. of 7 variables:
#R> $ time : chr "10" "10" "10" "10" ...
#R> $ seed : chr "123" "123" "123" "123" ...
#R> $ treatment: chr "0.1" "0.1" "0.1" "0.1" ...
#R> $ Gmax.val : num -0.626 -0.626 -0.626 -0.626 -0.626 ...
#R> $ G2.val : num 0.184 0.184 0.184 0.184 0.184 ...
#R> $ Gmax.vec : num -0.836 1.595 0.33 -0.82 0.487 ...
#R> $ G2.vec : num 1.512 0.39 -0.621 -2.215 1.125 ...
The function could be much faster. I suspect that the calls to as.data.frame and rbind slow it down quite a bit. Nevertheless, it works.
A faster version looks something like this:
deep_nested_to_df_fast <- function(x, cnam)
  .deep_nested_to_df_fast(x, cnam)

.deep_nested_to_df_fast <- function(x, cnam, idx = 1L){
  # check if all elements are lists
  is_all_lists <- all(sapply(x, is.list))
  if(is_all_lists){
    # create data.frames out of the elements
    out <- lapply(x, .deep_nested_to_df_fast, cnam = cnam, idx = idx + 1L)

    # check that all column names match
    my_cnam <- colnames(out[[1L]])
    stopifnot(all(length(out[[1L]]) == sapply(out, length)),
              all(sapply(out, function(x) all(colnames(x) == my_cnam))))

    # create the new column
    new_col <- mapply(
      rep, x = names(x), times = sapply(out, function(x) length(x[[1L]])),
      SIMPLIFY = FALSE)
    new_col <- do.call(c, new_col)

    # combine into a list instead of a data.frame
    out <- do.call(mapply, c(list(FUN = c, SIMPLIFY = FALSE), out))

    # add the new column
    out <- c(list(new_col), out)
    names(out)[1L] <- cnam[idx]

    if(idx == 1L)
      # turn it into a data.frame
      out <- structure(
        lapply(out, unname), names = names(out),
        row.names = 1:length(out[[1L]]), class = "data.frame")
    return(out)
  }

  # get the length of each element
  ele_length <- sapply(x, length)

  # check that all elements have either the maximum number of elements or
  # one element
  max_len <- max(ele_length)
  stopifnot(all(ele_length %in% c(1L, max_len)))

  # return a data.frame-like list where all elements have equal length
  lapply(x, rep, length.out = max_len)
}
It gives the same result and is faster in this case:
# gives the same
res_fast <- deep_nested_to_df_fast(ls, c("time", "seed", "treatment"))
all.equal(res, res_fast)
#R> [1] TRUE
# check computation time. We also compare with other answers
library(rrapply)
library(tidyverse)
library(data.table)
Joris_ans <- function()
  rrapply(ls, how = "melt") %>%
    pivot_wider(names_from = "L4") %>%
    unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
    rename(time = L1, seed = L2, treatment = L3)
Andrew_ans <- function(data = ls)
  tibble(time = names(data), data = data) %>%
    unnest_wider(data) %>%
    pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
    unnest_wider(treatment) %>%
    pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
    unnest_wider(g_data) %>%
    mutate(row_n = row_number()) %>%
    pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
    unnest_longer(g_val) %>%
    group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
    mutate(sub_n = row_number()) %>%
    pivot_wider(names_from = g, values_from = g_val) %>%
    ungroup() %>%
    select(-row_n, -sub_n)
hello_friend_ans <- function(){
  flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
    do.call(c, x)}, ls), data.frame))))
  long_df <- cbind(within(flat_long_df, rm(ind)),
                   do.call("rbind", lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
                     data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
                                      seed = as.numeric(gsub("X", "", x[2])),
                                      treatment = as.numeric(paste(x[3], x[4], sep = ".")),
                                      var = paste(x[5], x[6], sep = ".")))
                   })))
  wide_df <- setNames(reshape(long_df,
                              idvar = c("time", "seed", "treatment"),
                              timevar = "var", direction = "wide"),
                      c("time", "seed", "treatment", "Gmax.val", "G2.val", "Gmax.vec", "G2.vec"))
  wide_df
}
det_ans <- function(){
  nested_list <- function(ls, list_names = NULL){
    if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
      dt <- as.data.table(ls)
      dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
      return(dt)
    }
    ls %>% imap(~nested_list(.x, c(list_names, .y)))
  }
  NestedList <- function(ls, new_names){
    dt <- nested_list(ls) %>%
      {do.call(c, unlist(., recursive = FALSE))} %>%
      rbindlist()
    setnames(dt, str_subset(names(dt), "NAME_"), new_names)
    dt
  }
  NestedList(ls, c("time", "seed", "treatment"))
}
bench::mark(
  first = deep_nested_to_df     (ls, c("time", "seed", "treatment")),
  fast  = deep_nested_to_df_fast(ls, c("time", "seed", "treatment")),
  Joris_ans(), Andrew_ans(), hello_friend_ans(), det_ans(),
  min_time = 1, check = FALSE, relative = TRUE)
#R> # A tibble: 6 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#R> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
#R> 1 first 4.44 4.39 20.1 1.19 1 207 7 925.3ms <NULL> <Rpro… <bch… <tib…
#R> 2 fast 1 1 89.2 1 1.28 923 9 930.3ms <NULL> <Rpro… <bch… <tib…
#R> 3 Joris_ans() 9.90 9.85 9.01 1.92 1.03 90 7 897.9ms <NULL> <Rpro… <bch… <tib…
#R> 4 Andrew_ans() 60.2 59.1 1.62 10.9 1.51 11 7 611.5ms <NULL> <Rpro… <bch… <tib…
#R> 5 hello_friend_ans() 103. 94.9 1 203. 14.7 1 10 89.9ms <NULL> <Rpro… <bch… <tib…
#R> 6 det_ans() 5.87 5.73 15.4 106. 1.01 157 7 914ms <NULL> <Rpro… <bch… <tib…
It is about 4 times faster than the first version and 5 to 94 times faster than the other answers.
We can make this much faster by using a C++/C implementation which preallocates the final vectors that we will write to. This way, we avoid the repeated memory allocation we make above.
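As a rough plain-R sketch of that preallocation idea (not the compiled C++/C version referred to above; it assumes every innermost list has the same layout, with the .val entries of length one and the .vec entries of equal length, and the function name is just for illustration), one could write:
# sketch only: preallocate every output column once, then fill them in place
# while walking the fixed four levels (time / seed / treatment / leaf values)
nested_to_df_prealloc <- function(x){
  n_leaf <- length(x[[1L]][[1L]][[1L]]$Gmax.vec)   # rows per innermost list
  n_grp  <- length(x) * length(x[[1L]]) * length(x[[1L]][[1L]])
  n      <- n_grp * n_leaf

  out <- list(time      = character(n), seed     = character(n),
              treatment = character(n), Gmax.val = numeric(n),
              G2.val    = numeric(n),   Gmax.vec = numeric(n),
              G2.vec    = numeric(n))

  pos <- 1L
  for(t in names(x)){
    for(s in names(x[[t]])){
      for(tr in names(x[[t]][[s]])){
        leaf <- x[[t]][[s]][[tr]]
        idx  <- pos:(pos + n_leaf - 1L)
        out$time[idx]      <- t
        out$seed[idx]      <- s
        out$treatment[idx] <- tr
        out$Gmax.val[idx]  <- leaf$Gmax.val   # length one, recycled over the group
        out$G2.val[idx]    <- leaf$G2.val
        out$Gmax.vec[idx]  <- leaf$Gmax.vec
        out$G2.vec[idx]    <- leaf$G2.vec
        pos <- pos + n_leaf
      }
    }
  }

  structure(out, row.names = 1:n, class = "data.frame")
}

res_prealloc <- nested_to_df_prealloc(ls)
For the example data this should give the same columns and row order as res above, since the loops follow the list order. For completeness, the data used in the outputs above was generated with: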
set.seed(1L)
ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8))),
                       '456' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)))),
           '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8))),
                       '456' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val   = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec   = rnorm(8)))))
You might want to use a variable name other than ls, since there is already an ls function in base R.
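For instance (the name sim_list is just a placeholder):
sim_list <- ls  # copy to a less confusable name
rm(ls)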
Upvotes: 3
Reputation: 6244
Another approach is to:
1. melt the nested list to a long data.frame with rrapply() in the rrapply-package (or similarly with reshape2::melt());
2. reshape the long data.frame to a wide format with pivot_wider() and unnest().
library(rrapply)
library(tidyverse)
rrapply(ls, how = "melt") %>%                        ## melt to long df
  pivot_wider(names_from = "L4") %>%                 ## reshape to wide df
  unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%  ## unnest list columns
  rename(time = L1, seed = L2, treatment = L3)       ## rename columns
#> # A tibble: 64 x 7
#> time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 10 123 0.1 -0.626 0.184 -0.836 1.51
#> 2 10 123 0.1 -0.626 0.184 1.60 0.390
#> 3 10 123 0.1 -0.626 0.184 0.330 -0.621
#> 4 10 123 0.1 -0.626 0.184 -0.820 -2.21
#> 5 10 123 0.1 -0.626 0.184 0.487 1.12
#> 6 10 123 0.1 -0.626 0.184 0.738 -0.0449
#> 7 10 123 0.1 -0.626 0.184 0.576 -0.0162
#> 8 10 123 0.1 -0.626 0.184 -0.305 0.944
#> 9 10 123 0.2 0.821 0.594 0.919 -0.478
#> 10 10 123 0.2 0.821 0.594 0.782 0.418
#> # … with 54 more rows
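For reference, the melt step mentioned above can also be done with reshape2 instead of rrapply; a rough sketch (note that reshape2::melt() fully unnests the numeric vectors into one row per value, so the reshape to wide format would then need a within-group index rather than the unnest() step used here):
# alternative melt step with reshape2 instead of rrapply
library(reshape2)
long_df_melt <- melt(ls)
str(long_df_melt)  # expect a value column plus id columns L1..L4 (innermost names in L4)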
Or using data.table's dcast() to reshape the long table into wide format:
library(data.table)
long_dt <- as.data.table(rrapply(ls, how = "melt"))
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4)
wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
Some benchmarks
microbenchmark::microbenchmark(
  tidyr = {
    rrapply(ls, how = "melt") %>%
      pivot_wider(names_from = "L4") %>%
      unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
      rename(time = L1, seed = L2, treatment = L3)
  },
  data.table = {
    wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
    wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
    setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
    wide_dt
  },
  times = 25
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022 25
#> data.table 2.061861 2.655782 2.966581 2.784425 2.988044 5.032524 25
Upvotes: 11
Reputation: 5232
With recursion:
library(data.table)
library(tidyverse)  # for map_chr(), imap(), str_c(), str_subset() and the pipe

nested_list <- function(ls, list_names = NULL){
  if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
    dt <- as.data.table(ls)
    dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
    return(dt)
  }
  ls %>% imap(~nested_list(.x, c(list_names, .y)))
}

NestedList <- function(ls, new_names){
  dt <- nested_list(ls) %>%
    {do.call(c, unlist(., recursive = FALSE))} %>%
    rbindlist()
  setnames(dt, str_subset(names(dt), "NAME_"), new_names)
  dt
}

NestedList(ls, c("time", "seed", "treatment"))
Upvotes: 1
Reputation: 21284
Here's a solution that uses some of the newer "rectangling" methods in tidyr. I'm posting this mainly as an exercise to gain and share some familiarity with these functions - my sense is that this approach could definitely be, well, tidied up a bit. Still, it's a nice way to play with swinging back and forth between wide/long list unpacking.
library(tidyverse)
set.seed(1L)
data <- ls  # the nested list from the question

tibble(time = names(data), data = data) %>%
  unnest_wider(data) %>%
  pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
  unnest_wider(treatment) %>%
  pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
  unnest_wider(g_data) %>%
  mutate(row_n = row_number()) %>%
  pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
  unnest_longer(g_val) %>%
  group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
  mutate(sub_n = row_number()) %>%
  pivot_wider(names_from = g, values_from = g_val) %>%
  ungroup() %>%
  select(-row_n, -sub_n)
# A tibble: 64 x 7
time seed treatment Gmax.val G2.val Gmax.vec G2.vec
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 10 123 0.1 -0.626 0.184 -0.836 1.51
2 10 123 0.1 -0.626 0.184 1.60 0.390
3 10 123 0.1 -0.626 0.184 0.330 -0.621
4 10 123 0.1 -0.626 0.184 -0.820 -2.21
5 10 123 0.1 -0.626 0.184 0.487 1.12
6 10 123 0.1 -0.626 0.184 0.738 -0.0449
7 10 123 0.1 -0.626 0.184 0.576 -0.0162
8 10 123 0.1 -0.626 0.184 -0.305 0.944
9 10 123 0.2 0.821 0.594 0.919 -0.478
10 10 123 0.2 0.821 0.594 0.782 0.418
# … with 54 more rows
Upvotes: 5