Reputation: 3174
I'm joining data frames (tibbles) that have duplicated columns that I do not want to join. Example below is what I would usually do (joining by i
, but not a
or b
):
library(dplyr)
df1 <- tibble(i = letters[1:3], a = 1:3, b = 4:6)
df2 <- tibble(i = letters[1:3], a = 11:13, b = 14:16)
d <- full_join(df1, df2, by ="i")
d
#> # A tibble: 3 × 5
#> i a.x b.x a.y b.y
#> <chr> <int> <int> <int> <int>
#> 1 a 1 4 11 14
#> 2 b 2 5 12 15
#> 3 c 3 6 13 16
I want these duplicated variables to be returned as nested lists such as the output created below:
tibble(
i = letters[1:3],
a = list(c(1, 11), c(2, 12), c(3, 13)),
b = list(c(4, 14), c(5, 15), c(6, 16))
)
#> # A tibble: 3 × 3
#> i a b
#> <chr> <list> <list>
#> 1 a <dbl [2]> <dbl [2]>
#> 2 b <dbl [2]> <dbl [2]>
#> 3 c <dbl [2]> <dbl [2]>
Is there a simple way to do such a thing?
Aside, I've been playing around (unsuccessfully) with various stringr and tidyr methods. Here's an example that throws an error:
library(stringr)
library(tidyr)
# Find any variables with .x or .y
dup_var <- d %>% select(matches("\\.[xy]")) %>% names()
# Condense to the stems (original names) of these variables
dup_var_stems <- dup_var %>% str_replace("(\\.[x|y])+", "") %>% unique()
# For each stem, try to nest relevant data into a single variable
for (stem in dup_var_stems) {
d <- d %>% nest_(key_col = stem, nest_cols = names(d)[str_detect(names(d), paste0(stem, "[$|\\.]"))])
}
UPDATE
After answers from @Sotos and @conor, I'll mention that the solution needs to generalise to multiple joining and duplicated columns over many data frames. Below is an example where joining is done on five data frames by two columns (i
and j
). This creates five duplicated versions of columns a
and b
, with plenty of unique columns too c
:g
. One problem is that duplicating over so many data frames results in duplicated versions having no suffix, .x
, .x.x
, and so on. So simple regex match for .x|.y
will miss the no-suffix version of the column.
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Upvotes: 3
Views: 1430
Reputation: 51592
Here is one attempt,
library(dplyr)
library(tidyr)
melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value)) %>%
spread(a, new)
# A tibble: 3 × 3
# i a b
#* <chr> <list> <list>
#1 a <int [2]> <int [2]>
#2 b <int [2]> <int [2]>
#3 c <int [2]> <int [2]>
#With structure
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
#Or via reshape2 package
library(dplyr)
library(reshape2)
d1 <- melt(d, id.vars = 'i') %>%
group_by(a = sub('\\..*', '', variable), i) %>%
summarise(new = list(value))
d2 <- dcast(d1, i ~ a, value.var = 'new')
#d2
# i a b
#1 a 1, 11 4, 14
#2 b 2, 12 5, 15
#3 c 3, 13 6, 16
#with structure:
str(d2)
'data.frame': 3 obs. of 3 variables:
$ i: chr "a" "b" "c"
$ a:List of 3
..$ : int 1 11
..$ : int 2 12
..$ : int 3 13
$ b:List of 3
..$ : int 4 14
..$ : int 5 15
..$ : int 6 16
EDIT
To follow your thought,
library(dplyr)
library(reshape2)
library(purrr)
library(tidyr)
df <- melt(d, id.vars = c(names(d)[!grepl('a|b', names(d))]))
dots <- names(df)[!grepl('value', names(df))] %>% map(as.symbol)
df %>% mutate(variable = sub('\\..*', '', variable)) %>%
group_by_(.dots = dots) %>%
summarise(new = list(value)) %>%
spread(variable, new) %>%
ungroup()
# A tibble: 4 × 9
# i j c d e f g a b
#* <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Upvotes: 5
Reputation: 3174
After updating the question, I arrived at the following based on the melt()
solution provided by @Sotos (so please upvote that solution too if you think this works).
The following is a function that should take a data frame like the ones described, and nest duplicated columns. See comments throughout for explanation.
Create the problem data frame:
library(dplyr)
library(purrr)
id_cols <- tibble(i = c("x", "x", "y", "y"),
j = c(1, 2, 1, 2))
df1 <- id_cols %>% cbind(tibble(a = 1:4, b = 5:8, c = 21:24))
df2 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, d = 31:34))
df3 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, e = 31:34))
df4 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, f = 31:34))
df5 <- id_cols %>% cbind(tibble(a = 2:5, b = 6:9, g = 31:34))
datalist <- list(df1, df2, df3, df4, df5)
d <- reduce(datalist, full_join, by = c("i", "j"))
d
#> i j a.x b.x c a.y b.y d a.x.x b.x.x e a.y.y b.y.y f a b g
#> 1 x 1 1 5 21 2 6 31 2 6 31 2 6 31 2 6 31
#> 2 x 2 2 6 22 3 7 32 3 7 32 3 7 32 3 7 32
#> 3 y 1 3 7 23 4 8 33 4 8 33 4 8 33 4 8 33
#> 4 y 2 4 8 24 5 9 34 5 9 34 5 9 34 5 9 34
Create function nest_duplicates()
# Function to nest duplicated columns after joining multiple data frames
#
# Args:
# df Data frame of joined data frames with duplicated columns.
# suffixes Character string to match suffixes. E.g., the default "\\.[xy]"
# finds any columns ending with .x or .y
#
# Depends on: dplyr, tidyr, purrr, stringr
nest_duplicated <- function(df, suffixes = "\\.[xy]") {
# Search string to match any duplicated variables
search_string <- df %>%
dplyr::select(dplyr::matches(suffixes)) %>%
names() %>%
stringr::str_replace_all(suffixes, "") %>%
unique() %>%
stringr::str_c(collapse = "|") %>%
stringr::str_c("(", ., ")($|", suffixes, ")")
# Gather duplicated variables and convert names to stems
df <- df %>%
tidyr::gather(variable, value, dplyr::matches(search_string)) %>%
dplyr::mutate(variable = stringr::str_replace_all(variable, suffixes, ""))
# Group by all columns except value to convert duplicated rows into list, then
# spread by variable (var)
dots <- names(df)[!stringr::str_detect(names(df), "value")] %>% purrr::map(as.symbol)
df %>%
dplyr::group_by_(.dots = dots) %>%
dplyr::summarise(new = list(value)) %>%
tidyr::spread(variable, new) %>%
dplyr::ungroup()
}
Apply nest_duplicates()
:
nest_duplicated(d)
#> # A tibble: 4 × 9
#> i j c d e f g a b
#> * <chr> <dbl> <int> <int> <int> <int> <int> <list> <list>
#> 1 x 1 21 31 31 31 31 <int [5]> <int [5]>
#> 2 x 2 22 32 32 32 32 <int [5]> <int [5]>
#> 3 y 1 23 33 33 33 33 <int [5]> <int [5]>
#> 4 y 2 24 34 34 34 34 <int [5]> <int [5]>
Updates/improvements welcome!
Upvotes: 0
Reputation: 1287
Slightly more verbose than Sotos
answer, but this will also work.
library(dplyr)
library(tidyr)
library(stringr)
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>% group_by(i, col) %>%
summarise(val = list(val)) %>%
spread(col, val) %>%
ungroup()
i a b
<fctr> <list> <list>
1 a <int [2]> <int [2]>
2 b <int [2]> <int [2]>
3 c <int [2]> <int [2]>
If you want to use nest
to create lists
of dataframes
you can do this instead
d_tidy <- gather(d, col, val, a.x:b.y, -i)
d_tidy$col <- str_replace(d_tidy$col, ".x|.y", "")
d_tidy %>%
group_by(i, col) %>%
nest(col) %>%
spread(col, data)
i a b
<fctr> <list> <list>
1 a <tbl_df [2,0]> <tbl_df [2,0]>
2 b <tbl_df [2,0]> <tbl_df [2,0]>
3 c <tbl_df [2,0]> <tbl_df [2,0]>
Upvotes: 0