Reputation: 1969
Goal: I have a table of records (for example people) with a dynamic number of variables (e.g., email, phone, birthday,..., others). I want to compare each row with every other row and sum the number of variables that match.
# Input
my_data <- tibble(person = c("A","B","C","E","F"),
email = c("[email protected]", "[email protected]", NA, NA, NA),
phone = c(NA, NA, NA, 801, 801),
birthday = c("Jan1", "Jan1", NA, NA, NA))
# A tibble: 5 x 4
person email phone birthday
<chr> <chr> <dbl> <chr>
1 A [email protected] NA Jan1
2 B [email protected] NA Jan1
3 C NA NA NA
4 E NA 801 NA
5 F NA 801 NA
Using tidyr::expand_grid
, I can get every combination of records.
# Munging
my_data_a <- my_data %>%
rename_with(~str_c(., "_a"), everything())
my_data_b <- my_data %>%
rename_with(~str_c(., "_b"), everything())
grid <- expand_grid(my_data_a, my_data_b)
# A tibble: 25 x 9
person_a email_a phone_a birthday_a person_b email_b phone_b birthday_b total
<chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <int>
1 A [email protected] NA Jan1 A [email protected] NA Jan1 2
2 A [email protected] NA Jan1 B [email protected] NA Jan1 2
3 A [email protected] NA Jan1 C NA NA NA 0
4 A [email protected] NA Jan1 E NA 801 NA 0
5 A [email protected] NA Jan1 F NA 801 NA 0
6 B [email protected] NA Jan1 A [email protected] NA Jan1 2
7 B [email protected] NA Jan1 B [email protected] NA Jan1 2
8 B [email protected] NA Jan1 C NA NA NA 0
9 B [email protected] NA Jan1 E NA 801 NA 0
10 B [email protected] NA Jan1 F NA 801 NA 0
# … with 15 more rows
Now I can compare each of the variables manually, but the issue is I will have more than email, phone, birthday.
grid %>%
mutate(email_match = email_a == email_b,
phone_match = phone_a == phone_b,
birthday_match = birthday_a == birthday_b) %>%
mutate(across(everything(), ~replace_na(., 0)),
total = email_match + phone_match + birthday_match) %>%
select(person_a, person_b, total)
# Output
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# … with 15 more rows
This can be done by brute force in a for loop, but the data set is large:
# Brute force
a_col_start <- 2
a_col_end <- ncol(grid)/2
b_col_start <- a_col_end + 2
b_col_end <- ncol(grid)
for (i in 1:nrow(grid)) {
grid[i,"total"] <- sum(grid[i,a_col_start:a_col_end] == grid[i,b_col_start:b_col_end], na.rm = TRUE)
}
grid %>%
select(person_a, person_b, total)
Upvotes: 1
Views: 69
Reputation: 21928
You can use pmap
function from package purrr
for your purpose. It will make it easy to compare two vectors (in the same row) element-wise:
library(dplyr)
library(purrr)
library(stringr)
grid %>%
mutate(total = pmap_dbl(grid, ~ sum(c(...)[str_detect(names(grid), "_a")][-1] ==
c(...)[str_detect(names(grid), "_b")][-1], na.rm = TRUE))) %>%
select(contains("person"), total)
# A tibble: 25 x 3
person_a person_b total
<chr> <chr> <dbl>
1 A A 2
2 A B 2
3 A C 0
4 A E 0
5 A F 0
6 B A 2
7 B B 2
8 B C 0
9 B E 0
10 B F 0
# ... with 15 more rows
Upvotes: 1
Reputation: 68
If you only want unique combinations you can use combn()
to get all pairwise combinations and use this as input to Map()
to get the sum of matches for each pair of rows.
people <- combn(my_data$person, 2)
match_finder <- function(x, y) {
personx <- my_data[my_data$person == x, ]
persony <- my_data[my_data$person == y, ]
match_sum <- sum(personx == persony, na.rm = TRUE)
list(person1 = as.character(x), person2 = as.character(y), match_sum = match_sum)
}
output <- Map(match_finder, people[1, ], people[2, ], USE.NAMES = FALSE)
as.data.frame(do.call(rbind, output))
person1 person2 match_sum
1 A B 2
2 A C 0
3 A E 0
4 A F 0
5 B C 0
6 B E 0
7 B F 0
8 C E 0
9 C F 0
10 E F 1
Upvotes: 0