brlockwood
brlockwood

Reputation: 135

Combine list of data frames of differing length by row names in R

I have a list (df) of data frames of differing lengths, indexed by years such that a proxy of the data looks like:

df

  $df1          
         X..i..
  1999     10
  1998     13 
  1997     14

 $df2
         X..i..
 1999      20
 1998      11

 $df3
         X..i..
 1999      17
 1998       8
 1997       9
 1996      19

I would like to combine these data frames to a single data frame using and preserving the index/rownames

So that:

df_all
 Index     df1   df2   df3
  1999      10    20    17
  1998      13    11     8
  1997      14    n/a    9
  1996      n/a   n/a   19

Edit:

smalldflist <- lapply(bai_df, function(i) head(i, 10))
dput(smalldflist)

Produces the following output:

structure(list(IN_DonaldsonWoods_QUAL.txt = structure(list(X..i.. = c(4.5528243479162, 32.6474339976978, 52.7116018957456, 170.932582874866, 227.0430440174, 191.462399206825, 226.94053541991, 274.854835798233, 336.457600434571, 409.132933511232)), .Names = "X..i..", row.names = c("1725", "1726", "1727", "1728", "1729", "1730", "1731", "1732", "1733", "1734"), class = "data.frame"), IN_DonaldsonWoods_QURU.txt = structure(list( X..i.. = c(4.33729067152776, 5.72878688080428, 13.0247658962315, 22.0205798005054, 25.9885943197615, 18.9273551074104, 43.5197887382031, 58.2775710248884, 72.9225976242458, 73.0466756114972)), .Names = "X..i..", row.names = c("1827", "1828", "1829", "1830", "1831", "1832", "1833", "1834", "1835", "1836"), class = "data.frame"), IN_DonaldsonWoods_QUVE.txt = structure(list( X..i.. = c(7.87253273859391, 18.9481296742303, 42.5055176960097, 62.9980951594496, 88.906442207264, 74.2523230533691, 106.911242713809, 152.445167763284, 192.399603839633, 221.263660216113)), .Names = "X..i..", row.names = c("1731", "1732", "1733", "1734", "1735", "1736", "1737", "1738", "1739", "1740"), class = "data.frame"), IN_LillyDickey_QUAL.txt = structure(list( X..i.. = c(8.29576810088555, 17.2934968058816, 31.2091720401804, 33.8966066349882, 47.6496887415004, 32.9921546763907, 82.2281435044324, 108.068226885475, 103.894002151431, 110.255812097949)), .Names = "X..i..", row.names = c("1863", "1864", "1865", "1866", "1867", "1868", "1869", "1870", "1871", "1872"), class = "data.frame"), IN_LillyDickey_QUMO.txt = structure(list( X..i.. = c(3.42413493048312, 8.0847630303073, 19.6833503197648, 13.791136218324, 21.4638165402601, 30.6707376168741, 30.8789937938806, 26.8661212585221, 24.0732956549621, 29.7872997715364)), .Names = "X..i..", row.names = c("1867", "1868", "1869", "1870", "1871", "1872", "1873", "1874", "1875", "1876"), class = "data.frame"), IN_Pioneers_QUAL.txt = structure(list( X..i.. = c(9.14340435634345, 23.5108626053757, 33.8507393822465, 46.1027716604662, 57.5247983011993, 50.5892015892391, 92.2448163706925, 225.832932372368, 278.367628044195, 193.931508820174)), .Names = "X..i..", row.names = c("1817", "1818", "1819", "1820", "1821", "1822", "1823", "1824", "1825", "1826"), class = "data.frame"), IN_Pioneers_QURU.txt = structure(list( X..i.. = c(122.443727611702, 658.649900930018, 830.471777578934, 843.357139228152, 1725.6495913006, 1244.38668477703, 973.00892131628, 1294.7441782001, 1717.18570086886, 1676.63841798444)), .Names = "X..i..", row.names = c("1861", "1862", "1863", "1864", "1865", "1866", "1867", "1868", "1869", "1870"), class = "data.frame"), OH_JohnsonWoods_QUAL.txt = structure(list( X..i.. = c(1.9113449704439, 3.39794661412248, 5.32688450342693, 6.41921626908008, 11.0307601252838, 13.0825342873437, 15.843680070585, 16.885746353779, 20.1011664347289, 19.853294774361)), .Names = "X..i..", row.names = c("1626", "1627", "1628", "1629", "1630", "1631", "1632", "1633", "1634", "1635"), class = "data.frame")), .Names = c("IN_DonaldsonWoods_QUAL.txt", "IN_DonaldsonWoods_QURU.txt", "IN_DonaldsonWoods_QUVE.txt", "IN_LillyDickey_QUAL.txt", "IN_LillyDickey_QUMO.txt", "IN_Pioneers_QUAL.txt", "IN_Pioneers_QURU.txt", "OH_JohnsonWoods_QUAL.txt"))

Upvotes: 2

Views: 1305

Answers (3)

Parfait
Parfait

Reputation: 107587

Reconsider the chain merge as @Djork shows but make sure you create an actual column named, Index equal to rownames(). Also, rename the X..1 column according to df# which also avoids the duplicate column warning during merges. Below dfs is equivalent to posted smalldflist:

dfs <- lapply(seq_along(dfs), function(i){
        dfs[[i]]$Index = rownames(dfs[[i]])        # CREATE INDEX
        colnames(dfs[[i]])[1] <- paste0("df", i)   # RENAME X..1 COLUMN

        return(dfs[[i]])
})

dfs[[1]]
#             df1 Index
# 1725   4.552824  1725
# 1726  32.647434  1726
# 1727  52.711602  1727
# 1728 170.932583  1728
# 1729 227.043044  1729
# 1730 191.462399  1730
# 1731 226.940535  1731
# 1732 274.854836  1732
# 1733 336.457600  1733
# 1734 409.132934  1734

finaldf <- Reduce(function(...) merge(..., by="Index", all=TRUE), dfs)

finaldf
#    Index    df1     df2     df3     df4    df5    df6   df7        df8
# 1   1626     NA      NA      NA      NA     NA     NA    NA   1.911345
# 2   1627     NA      NA      NA      NA     NA     NA    NA   3.397947
# 3   1628     NA      NA      NA      NA     NA     NA    NA   5.326885
# 4   1629     NA      NA      NA      NA     NA     NA    NA   6.419216
# 5   1630     NA      NA      NA      NA     NA     NA    NA  11.030760
# ...

Upvotes: 1

Djork
Djork

Reputation: 3369

You can use Reduce to merge multiple data frames. Set all = TRUE which adds NAs when no matches occurs. Note df is the list of data frames as you have set up, and by indicates the column used for merging. Therefore in your list of data frames, "Index" should be the name of the year column in each data frame.

Reduce(function(...) merge(..., by="Index", all=TRUE), df)

And thanks to @jazzuro providing sample data, here is the equivalent solution using Reduce in base R. In this sample set the column used for merging by="year":

df1 <- data.frame(year = c(1999, 1998, 1997),
              value = c(10, 13, 14))

df2 <- data.frame(year = c(1999, 1998),
              value = c(20, 11))

df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
              value = c(17, 8, 9, 19))
df <- list(df1=df1, df2=df2, df3=df3)
df_merge  <- Reduce(function(...) merge(..., by="year", all=TRUE), df)
colnames(df_merge) <- c("Index", names(df))

#   Index df1 df2 df3
# 1 1996  NA  NA  19
# 2 1997  14  NA   9
# 3 1998  13  11   8
# 4 1999  10  20  17

Upvotes: 2

jazzurro
jazzurro

Reputation: 23574

If you have the data you need only in global environment, you could try the following. First, you collect unique years in all data frames and create a master data frame, which includes unique years only. Then, you put all data frames in a list and merge each of them with master. Since you have the master data frame in temp, you remove it. Finally, you bind all data frames and change the long format to a wide format.

library(tidyverse)

# Create a data frame with all unique years

master <- data.frame(year = mget(ls()) %>%
                            sapply(`[`, 1) %>%
                            as_vector %>% 
                            unique)

# Merge each data frame with the master df
temp <- mget(ls()) %>%
        lapply(function(x){full_join(x, master, by = "year")})

# Remove the master df in the list
temp[["master"]] <- NULL

# Bind all dfs and make it wide.
bind_rows(temp, .id = "data") %>%
spread(key = data, value = value)

#  year df1 df2 df3
#1 1996  NA  NA  19
#2 1997  14  NA   9
#3 1998  13  11   8
#4 1999  10  20  17

DATA

df1 <- data.frame(year = c(1999, 1998, 1997),
                  value = c(10, 13, 14))

df2 <- data.frame(year = c(1999, 1998),
                  value = c(20, 11))

df3 <- data.frame(year = c(1999, 1998, 1997, 1996),
                  value = c(17, 8, 9, 19))

Upvotes: 1

Related Questions