R: Reorganizing rows of each dataframe within a list into a new list of new dataframes

Question

Edited to add more details and clarify. Basically, I have a list of data frames, they have the same row numbers but various column numbers, so the dimension of each data frame is different. What I want to do now is to select the first row of each data frame, put them into a new data frame and use it as the first element of a new list, then do the same things for second rows, third rows...

I have contemplated to use 2 for loops to reassign rows, however that seems to be a very bad way to go about it given that nested for loop is pretty slow and the data I have is huge. Would truly appreciate sound insight and help.

myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
print(myList)

Current example data -

> print(myList)
[[1]]
  V1 V2
1  1  4
2  2  5
3  3  6

[[2]]
  V1 V2 V3
1  7 10 13
2  8 11 14
3  9 12 15

Desired Outcome

> print(myList2)
[[1]]
  V1 V2 V3
1  1  4  0
2  7 10 13

[[2]]
  V1 V2 V3
1  2  5  0
2  8 11 14

[[3]]
  V1 V2 V3
1  3  6  0
2  9 12 15

The different dimensions of the current data frames makes it tricky.

Cole · Accepted Answer

Here's a base method of:

Adding all the column names to each list item
Converting the list to an array.
Transposing the array using aperm to match your intended output
Optional turning the array into a list using apply.

myListBase <- myList #added because we modify the original list

#get all of the unique names from the list of dataframes
##default ordering is by ordering in list
all_cols <- Reduce(base::union, lapply(myListBase, names))

#loop, add new columns, and then re-order them so all data.frames
# have the same order
myListBase <- lapply(myListBase,
                     function(DF){
                       DF[, base::setdiff(all_cols, names(DF))] <- 0 #initialze columns
                       DF[, all_cols] #reorder columns
                       }
                     )

#create 3D array - could be simplified using abind::abind(abind(myListBase, along = 3))
myArrayBase <- array(unlist(myListBase, use.names = F),
                     dim = c(nrow(myListBase[[1]]), #rows
                             length(all_cols), #columns
                             length(myListBase) #3rd dimension
                             ),
                     dimnames = list(NULL, all_cols, NULL))

#rows and 3rd dimension are transposed
myPermBase <- aperm(myArrayBase, c(3,2,1))
myPermBase

#, , 1
#
#     V1 V2 V3
#[1,]  1  4  0
#[2,]  7 10 13
#
#, , 2
#
#     V1 V2 V3
#[1,]  2  5  0
#[2,]  8 11 14
#
#, , 3
#
#     V1 V2 V3
#[1,]  3  6  0
#[2,]  9 12 15

#make list of dataframes - likely not necessary
apply(myPermBase, 3, data.frame)

#[[1]]
#  V1 V2 V3
#1  1  4  0
#2  7 10 13
#
#[[2]]
#  V1 V2 V3
#1  2  5  0
#2  8 11 14
#
#[[3]]
#  V1 V2 V3
#1  3  6  0
#2  9 12 15

Performance

The first version of the answer had a data.table and abind method but I've removed it - the base version is much faster and there's not much additional clarity gained.

Unit: microseconds
                expr    min      lq     mean  median      uq     max neval
 camille_purrr_dplyr 7910.9 8139.25 8614.956 8246.30 8387.20 60159.5  1000
       cole_DT_abind 2555.8 2804.75 3012.671 2917.95 3061.55  6602.3  1000
           cole_base  600.3  634.40  697.987  663.00  733.10  3761.6  1000

Complete code for reference:

library(dplyr)
library(purrr)
library(data.table)
library(abind)
library(microbenchmark)

myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2

microbenchmark(
  camille_purrr_dplyr = {
    myList %>%
      map_dfr(tibble::rownames_to_column, var = "id") %>%
      mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
      split(.$id) %>%
      map(select, -id)
  }
  ,
  cole_DT_abind = {
  myListDT <- copy(myList)
  all_cols <- Reduce(base::union, lapply(myListDT, names))

  # data.table used for side effects of updating-by-reference in lapply
  lapply(myListDT, setDT)

  # add non-existing columns
  lapply(myListDT,
         function(DT) {
           DT[, base::setdiff(all_cols, names(DT)) := 0]
           setorderv(DT, all_cols)
         })

  # abind is used to make an array
  myArray <- abind(myListDT, along = 3)

  # aperm is used to transpose the array to the preferred route
  myPermArray <- aperm(myArray, c(3,2,1))
  # myPermArray

  #or as a list of data.frames
  apply(myPermArray, 3, data.frame)
  }
  ,
  cole_base = {
    myListBase <- myList

    all_cols <- Reduce(base::union, lapply(myListBase, names))

    myListBase <- lapply(myListBase, 
                         function(DF){
                           DF[, base::setdiff(all_cols, names(DF))] <- 0
                           DF[, all_cols]
                         }
                         )

    myArrayBase <- array(unlist(myListBase, use.names = F),
                         dim = c(nrow(myListBase[[1]]), length(all_cols), length(myListBase)),
                         dimnames = list(NULL, all_cols, NULL))

    myPermBase <- aperm(myArrayBase, c(3,2,1))
    apply(myPermBase, 3, data.frame)
  }
  # ,
  # cole_base_aperm = {
  #   myListBase <- myList
  #   
  #   all_cols <- Reduce(base::union, lapply(myListBase, names))
  #   
  #   myListBase <- lapply(myListBase, 
  #                        function(DF){
  #                          DF[, base::setdiff(all_cols, names(DF))] <- 0
  #                          DF[, all_cols]
  #                        }
  #   )
  #   
  #   myArrayABind <- abind(myListBase, along = 3)
  #   
  #   myPermBase <- aperm(myArrayABind, c(3,2,1))
  #   apply(myPermBase, 3, data.frame)
  # }
, times = 1000
)

R: Reorganizing rows of each dataframe within a list into a new list of new dataframes

Answers (2)

Related Questions