Reputation: 343
I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
Upvotes: 2
Views: 104
Reputation: 42544
There is an alternative solution using melt()
and dcast()
to reshape the data:
library(data.table)
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25 1: a1 b1 A, B A NA 2: a1 b2 A B A, B 3: a1 b3 B NA B 4: a2 b1 NA NA A
df_1 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
drop = 5L, na.strings = "<NA>"
)
df_2 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
drop = 5L, na.strings = "<NA>"
)
Upvotes: 2
Reputation: 47300
You can do this if you don't have any empty string :
library(dplyr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
`[<-`(.=="",value=NA)
with magrittr
you can avoid the not so pretty '[<-'
and replace it by inset
library(magrittr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
inset(.=="",value=NA)
Upvotes: 3
Reputation: 51582
To elaborate to the idea commented by @zx8754, and using dplyr
package,
library(dplyr)
df1 %>%
bind_rows(df2) %>%
mutate_at(vars(-contains('id')), funs(replace(., is.na(.), ''))) %>%
group_by(id_1, id_2) %>%
summarise_all(funs(trimws(paste(., collapse = ' ')))) %>%
mutate_all(funs(replace(., . == '', NA)))
which gives,
# A tibble: 4 x 5 # Groups: id_1 [2] id_1 id_2 col_1 col2 col_25 <chr> <chr> <chr> <chr> <chr> 1 a1 b1 A B A <NA> 2 a1 b2 A B A B 3 a1 b3 B <NA> B 4 a2 b1 <NA> <NA> A
NOTE:
NA
(not characters)as.character
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
> dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
Upvotes: 1