Reputation: 125
Edited to add more details and clarify. Basically, I have a list of data frames, they have the same row numbers but various column numbers, so the dimension of each data frame is different. What I want to do now is to select the first row of each data frame, put them into a new data frame and use it as the first element of a new list, then do the same things for second rows, third rows...
I have contemplated to use 2 for loops to reassign rows, however that seems to be a very bad way to go about it given that nested for loop is pretty slow and the data I have is huge. Would truly appreciate sound insight and help.
myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
print(myList)
Current example data -
> print(myList)
[[1]]
V1 V2
1 1 4
2 2 5
3 3 6
[[2]]
V1 V2 V3
1 7 10 13
2 8 11 14
3 9 12 15
Desired Outcome
> print(myList2)
[[1]]
V1 V2 V3
1 1 4 0
2 7 10 13
[[2]]
V1 V2 V3
1 2 5 0
2 8 11 14
[[3]]
V1 V2 V3
1 3 6 0
2 9 12 15
The different dimensions of the current data frames makes it tricky.
Upvotes: 3
Views: 95
Reputation: 11255
Here's a base method of:
aperm
to match your intended outputapply
.myListBase <- myList #added because we modify the original list
#get all of the unique names from the list of dataframes
##default ordering is by ordering in list
all_cols <- Reduce(base::union, lapply(myListBase, names))
#loop, add new columns, and then re-order them so all data.frames
# have the same order
myListBase <- lapply(myListBase,
function(DF){
DF[, base::setdiff(all_cols, names(DF))] <- 0 #initialze columns
DF[, all_cols] #reorder columns
}
)
#create 3D array - could be simplified using abind::abind(abind(myListBase, along = 3))
myArrayBase <- array(unlist(myListBase, use.names = F),
dim = c(nrow(myListBase[[1]]), #rows
length(all_cols), #columns
length(myListBase) #3rd dimension
),
dimnames = list(NULL, all_cols, NULL))
#rows and 3rd dimension are transposed
myPermBase <- aperm(myArrayBase, c(3,2,1))
myPermBase
#, , 1
#
# V1 V2 V3
#[1,] 1 4 0
#[2,] 7 10 13
#
#, , 2
#
# V1 V2 V3
#[1,] 2 5 0
#[2,] 8 11 14
#
#, , 3
#
# V1 V2 V3
#[1,] 3 6 0
#[2,] 9 12 15
#make list of dataframes - likely not necessary
apply(myPermBase, 3, data.frame)
#[[1]]
# V1 V2 V3
#1 1 4 0
#2 7 10 13
#
#[[2]]
# V1 V2 V3
#1 2 5 0
#2 8 11 14
#
#[[3]]
# V1 V2 V3
#1 3 6 0
#2 9 12 15
Performance
The first version of the answer had a data.table
and abind
method but I've removed it - the base
version is much faster and there's not much additional clarity gained.
Unit: microseconds
expr min lq mean median uq max neval
camille_purrr_dplyr 7910.9 8139.25 8614.956 8246.30 8387.20 60159.5 1000
cole_DT_abind 2555.8 2804.75 3012.671 2917.95 3061.55 6602.3 1000
cole_base 600.3 634.40 697.987 663.00 733.10 3761.6 1000
Complete code for reference:
library(dplyr)
library(purrr)
library(data.table)
library(abind)
library(microbenchmark)
myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
microbenchmark(
camille_purrr_dplyr = {
myList %>%
map_dfr(tibble::rownames_to_column, var = "id") %>%
mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
split(.$id) %>%
map(select, -id)
}
,
cole_DT_abind = {
myListDT <- copy(myList)
all_cols <- Reduce(base::union, lapply(myListDT, names))
# data.table used for side effects of updating-by-reference in lapply
lapply(myListDT, setDT)
# add non-existing columns
lapply(myListDT,
function(DT) {
DT[, base::setdiff(all_cols, names(DT)) := 0]
setorderv(DT, all_cols)
})
# abind is used to make an array
myArray <- abind(myListDT, along = 3)
# aperm is used to transpose the array to the preferred route
myPermArray <- aperm(myArray, c(3,2,1))
# myPermArray
#or as a list of data.frames
apply(myPermArray, 3, data.frame)
}
,
cole_base = {
myListBase <- myList
all_cols <- Reduce(base::union, lapply(myListBase, names))
myListBase <- lapply(myListBase,
function(DF){
DF[, base::setdiff(all_cols, names(DF))] <- 0
DF[, all_cols]
}
)
myArrayBase <- array(unlist(myListBase, use.names = F),
dim = c(nrow(myListBase[[1]]), length(all_cols), length(myListBase)),
dimnames = list(NULL, all_cols, NULL))
myPermBase <- aperm(myArrayBase, c(3,2,1))
apply(myPermBase, 3, data.frame)
}
# ,
# cole_base_aperm = {
# myListBase <- myList
#
# all_cols <- Reduce(base::union, lapply(myListBase, names))
#
# myListBase <- lapply(myListBase,
# function(DF){
# DF[, base::setdiff(all_cols, names(DF))] <- 0
# DF[, all_cols]
# }
# )
#
# myArrayABind <- abind(myListBase, along = 3)
#
# myPermBase <- aperm(myArrayABind, c(3,2,1))
# apply(myPermBase, 3, data.frame)
# }
, times = 1000
)
Upvotes: 3
Reputation: 16832
One way with a few dplyr
& purrr
functions is to add an ID column to each row in each data frame, bind them all, then split by that ID. The base rbind
would throw an error because of the mismatched column names, but dplyr::bind_rows
takes a list of any number of data frames and adds NA
columns for anything missing.
First step gets you one data frame:
library(dplyr)
library(purrr)
myList %>%
map_dfr(tibble::rownames_to_column, var = "id")
#> id V1 V2 V3
#> 1 1 1 4 NA
#> 2 2 2 5 NA
#> 3 3 3 6 NA
#> 4 1 7 10 13
#> 5 2 8 11 14
#> 6 3 9 12 15
Fill in the NA
s with 0 in all columns except the ID—this could also be adjusted if need be. Split by ID, and drop the ID column since you no longer need it.
myList %>%
map_dfr(tibble::rownames_to_column, var = "id") %>%
mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
split(.$id) %>%
map(select, -id)
#> $`1`
#> V1 V2 V3
#> 1 1 4 0
#> 4 7 10 13
#>
#> $`2`
#> V1 V2 V3
#> 2 2 5 0
#> 5 8 11 14
#>
#> $`3`
#> V1 V2 V3
#> 3 3 6 0
#> 6 9 12 15
Upvotes: 1