Reputation: 311
I've got a dataset like this:
df1 <- data.frame(
col1 = c(1, 2, 3),
col2 = c(4, 5, 6),
col3 = c(2, 8, 9),
col4 = c(5, 11, 12),
col5 = c(13, 14, 15),
col6 = c(16, 17, 18),
col7 = c(19, 20, 21),
col8 = c(22, 23, 24)
)
and a second that has a 'key' of matches that I'm looking for in df1 :
df2 <- data.frame(
colA = c(1, 2, 3),
colB = c(4, 5, 6),
value = c(100, 200, 300)
)
what I'm trying to do is find each pair-wise column match from left to right and create a new column containing the value from df2 everytime there is a match so that it looks like this:
df3 <- data.frame(
col1 = c(1, 2, 3),
col2 = c(4, 5, 6),
col3 = c(2, 8, 9),
col4 = c(5, 11, 12),
col5 = c(13, 14, 15),
col6 = c(16, 17, 18),
col7 = c(19, 20, 21),
col8 = c(22, 23, 24),
match1 = c(100, 200, 300),
match2 = c(200, NA, NA)
)
I've tried this kind of approach:
df_match <- inner_join(df1, df2, by = c("col1" = "colA", "col2" = "colB"))
df1$matched_value <- df_match$value[match(paste(df1$col1, df1$col2), paste(df_match$col1, df_match$col2))]
but it only returns one match across the rows. The other issue is I'm running this through many iterations that have varying numbers of columns in df1. I'm thinking I need something along the lines of across rows starts with 'col' but I'm pretty stuck.
Upvotes: 5
Views: 85
Reputation: 40171
Another tidyverse option could be (assuming overlapping columns such as 1-2, 2-3, 3-4):
df1 %>%
bind_cols(map_dfc(.x = head(seq_along(df1), -1),
~ df1 %>%
select("colA" = .x,
"colB" = .x + 1) %>%
left_join(df2) %>%
select(!!paste0("match", .x) := value)))
col1 col2 col3 col4 col5 col6 col7 col8 match1 match2 match3 match4 match5 match6 match7
1 1 4 2 5 13 16 19 22 100 NA 200 NA NA NA NA
2 2 5 8 11 14 17 20 23 200 NA NA NA NA NA NA
3 3 6 9 12 15 18 21 24 300 NA NA NA NA NA NA
Upvotes: 0
Reputation: 102529
Try combn
+ match
+ colSums
lut <- combn(df1, 2, \(...) do.call(paste, ...))
d <- matrix(with(df2, value[match(lut, paste(colA, colB))]), nrow(lut))
cbind(df1, as.data.frame(d[, colSums(!is.na(d)) > 0]))
and you will obtain
col1 col2 col3 col4 col5 col6 col7 col8 V1 V2
1 1 4 2 5 13 16 19 22 100 200
2 2 5 8 11 14 17 20 23 200 NA
3 3 6 9 12 15 18 21 24 300 NA
Upvotes: 1
Reputation: 19339
You can try the following using Reduce
:
df3 <- Reduce(left_join,
append(list(df1),
lapply(1:(ncol(df1) - 1), \(i)
setNames(df2, c(paste0("col", i+0:1), paste0("match", i))))))
Which returns:
col1 col2 col3 col4 col5 col6 col7 col8 match1 match2 match3 match4 match5 match6 match7
1 1 4 2 5 13 16 19 22 100 NA 200 NA NA NA NA
2 2 5 8 11 14 17 20 23 200 NA NA NA NA NA NA
3 3 6 9 12 15 18 21 24 300 NA NA NA NA NA NA
It would then be easy to omit the "match" columns that are all NA, if desired. Eg.
select(df3, where(~any(!is.na(.))))
col1 col2 col3 col4 col5 col6 col7 col8 match1 match3
1 1 4 2 5 13 16 19 22 100 200
2 2 5 8 11 14 17 20 23 200 NA
3 3 6 9 12 15 18 21 24 300 NA
Upvotes: 1
Reputation: 73562
We could match
the rle
in outer
.
> fun <- \(x, y) all(diff(match(y, with(rle(as.vector(x)), values[lengths == 1]))) == 1)
> res <- outer(asplit(df1, 1), asplit(df2[1:2], 1), Vectorize(fun)) |>
+ apply(1, which)
> cbind(df1, t(sapply(res, \(i) `length<-`(df2$value[i], max(lengths(res))))))
col1 col2 col3 col4 col5 col6 col7 col8 1 2
1 1 4 2 5 13 16 19 22 100 200
2 2 5 8 11 14 17 20 23 200 NA
3 3 6 9 12 15 18 21 24 300 NA
Upvotes: 0
Reputation: 79328
in Base R:
reshape(df1, matrix(1:ncol(df1), 2), dir="long")|>
merge(df2, by.x = c('col1', 'col2'), by.y = c('colA', 'colB'))|>
reshape(dir = 'wide')|>
(\(x)subset(x, select = startsWith(names(x), "value")))()|>
cbind(df1, match=_)
col1 col2 col3 col4 col5 col6 col7 col8 match.value.1 match.value.2
1 1 4 2 5 13 16 19 22 100 200
2 2 5 8 11 14 17 20 23 200 NA
4 3 6 9 12 15 18 21 24 300 NA
in Tidyverse:
df1%>%
rename_with(~str_replace_all(.x, "\\d", ~(as.numeric(.) - 1)%/%2) %>%
str_c(rep(1:2, length = length(.x)), sep="_value")) %>%
rowid_to_column() %>%
pivot_longer(-rowid, names_to = c("grp", ".value"), names_sep = "_") %>%
right_join(df2, join_by(value1==colA, value2==colB)) %>%
pivot_wider(id_cols = rowid, names_from = grp, values_from = value) %>%
select(-rowid) %>%
bind_cols(df1, .)
col1...1 col2 col3 col4 col5 col6 col7 col8 col0 col1...10
1 1 4 2 5 13 16 19 22 100 200
2 2 5 8 11 14 17 20 23 200 NA
3 3 6 9 12 15 18 21 24 300 NA
Upvotes: 0