Reputation: 204
Given 2 dataframes:
df1
col1 col2 col3
43 21 "a"
32 31 "b"
NA 12 "c"
44 NA "d"
df2
cl4 cl5 cl6
43 1 "text"
12 0 "text2"
32 44 "text3"
How can I merge them with a left_join, if one value of the columns in c("col1", "col2")
matches a value in the columns c("cl4", "cl5")
?
Additional information: all variables can have missing values, except cl6 which is always completed.
Expected result:
col1 col2 col3 cl4 cl5 cl6
43 21 "a" 43 1 "text"
32 31 "b" 32 44 "text3"
NA 12 "c" 12 0 "text2"
44 NA "d" 32 44 "text3"
I have some code that works, but I think there must be a better solution if there are a lot of joins to be done (in my real dataframes I have 24 joins to do...). Here is my code:
list_vars = c('cl4', "cl5", "cl6")
list_vars_rename = c("col4", "col5", "col6")
#MERGE 1
df1_merged <- left_join(df1, df2, by=c("col1" = "cl4"), na_matches = "never") #ignore NAs
df1_merged$cl4 <- df1_merged$col1 #because cl4 disappears during the join
df1_merged[is.na(df1_merged$cl6), "cl4"] <- NA #cl4 equals NA if no match = if cl6 NA
setnames(df1_merged, old = list_vars, new = list_vars_rename, skip_absent = T) #rename cols
#MERGE 2
df1_merged <- left_join(df1_merged, df2, by=c("col1" = "cl5"), na_matches = "never")
df1_merged <- as.data.frame(df1_merged) #because was a tibble
df1_merged$cl5 <- df1_merged$col1 #because cl4 disappears during the join
df1_merged[is.na(df1_merged$cl6), "cl5"] <- NA #cl5 equals NA if no match = if cl6 NA
for (i in seq_along(list_vars_rename)){
df1_merged[,list_vars_rename[i]] <- ifelse(is.na(df1_merged[,list_vars_rename[i]]), df1_merged[,list_vars[i]], df1_merged[,list_vars_rename[i]])
} #fill col4, col5 & col6 with the values of cl4, cl5 & cl6 we got in the join
df1_merged = df1_merged[, !(names(df1_merged) %in% list_vars)] #drop cl4 ,cl5 & cl6
#MERGE 3
df1_merged <- left_join(df1_merged, ventes, by=c("col2" = "cl4"), na_matches = "never")
df1_merged <- as.data.frame(df1_merged)
df1_merged$cl4 <- df1_merged$col2
df1_merged[is.na(df1_merged$cl6), "cl4"] <- NA
for (i in seq_along(list_vars_rename)){
df1_merged[,list_vars_rename[i]] <- ifelse(is.na(df1_merged[,list_vars_rename[i]]), df1_merged[,list_vars[i]], df1_merged[,list_vars_rename[i]])
}
df1_merged= df1_merged[, !(names(df1_merged) %in% list_vars)]
###etc. until the last merge.
Upvotes: 0
Views: 87
Reputation: 612
I didn't quite get there, but maybe this code helps:
library(tidyverse)
df1 <- read_table("col1 col2 col3
43 21 a
32 31 b
NA 12 c
44 NA d")
df2 <- read_table("cl4 cl5 cl6
43 1 text
12 0 text2
32 44 text3")
cols_1 <- c("col1", "col2")
cols_2 <- c("cl4", "cl5")
df1 %>%
pivot_longer(cols = all_of(cols_1)) %>%
left_join(df2 %>% pivot_longer(cols = all_of(cols_2)), by = "value", suffix = c(".df1", ".df2")) %>%
filter(!is.na(name.df1) & !is.na(name.df2))
#> # A tibble: 4 x 5
#> col3 name.df1 value cl6 name.df2
#> <chr> <chr> <dbl> <chr> <chr>
#> 1 a col1 43 text cl4
#> 2 b col1 32 text3 cl4
#> 3 c col2 12 text2 cl4
#> 4 d col1 44 text3 cl5
Created on 2021-07-27 by the reprex package (v2.0.0)
The output contains the columns containing the important info (col3
and cl6
), and it tells you what columns matched (name.df1
and name.df2
), and what the matching value is (value
). But I couldn't figure out how to add back the other information to match your desired output. Also I didn't deal with NA
s.
Upvotes: 1