More efficient program to create columns and sum pairwise comparisons of specific conditions in a large dataset

Question

I have a dataset like this:

dat1 <- read.table(text = "
  nodepair 3 4 5
1    A6_A1 2 5 1
2    A6_A2 2 5 1
3    A6_A3 2 5 1
4    AL_A1 1 0 0
5     D_A6 0 3 0
6     F_A1 1 0 1
7      H_D 0 0 2
8      H_H 0 0 2 
", header = TRUE, check.names = FALSE)

And I need to write a program to efficiently create new columns which sums each pairwise comparison when a zero appears between pairs-- specifically to create the 'b' condition is when a nodepair is unique to the first variable but not the second, and the 'c' condition is when a nodepair is unique to the second and not the first. Here is the desired output:

dat2 <- read.table(text = "
  nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
4    AL_A1 1 0 0    1   NA    1   NA    0    0
5     D_A6 0 3 0   NA    3    0    0    3   NA
6     F_A1 1 0 1    1   NA   NA   NA   NA    1
7      H_D 0 0 2    0    0   NA    2   NA    2
8      H_H 0 0 2    0    0   NA    2   NA    2  
", header = TRUE, check.names = FALSE)

This code will work but is certainly less than ideal for my much larger dataset:

dat1 <- dat1 %>%
  mutate('3-4b' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-4c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  mutate('3-5c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  
  mutate('4-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`4`,`5`)))))) %>%
  mutate('4-5c' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`4`,`5`))))))

M-- · Accepted Answer

library(dplyr)
library(purrr)

t(combn(names(dat1)[-1], 2)) %>% 
  rbind(., .[,c(2,1)]) %>% 
  cbind(rep(c("b", "c"), each = nrow(.)/2)) %>% 
  as.data.frame() %>% 
  mutate(V4 = ifelse(V3 == "b", paste0(V1, "-", V2, V3), 
                                paste0(V2, "-", V1, V3))) %>% 
  arrange(V4) %>% 
  group_split(row_number(), .keep = FALSE) %>% 
  map(., 
      ~ dat1 %>% transmute(!!.x[[4]] := 
                             rowSums(cbind(dat1[, .x[[1]]], 
                                           dat1[, .x[[2]]])) * 
                             ifelse(dat1[, .x[[2]]] == 0, 1, NA))) %>% 
  bind_cols(dat1, .)
  

#>   nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
#> 1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
#> 2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
#> 3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
#> 4    AL_A1 1 0 0    1   NA    1   NA    0    0
#> 5     D_A6 0 3 0   NA    3    0    0    3   NA
#> 6     F_A1 1 0 1    1   NA   NA   NA   NA    1
#> 7      H_D 0 0 2    0    0   NA    2   NA    2
#> 8      H_H 0 0 2    0    0   NA    2   NA    2

^{Created on 2024-02-15 with reprex v2.0.2}

More efficient program to create columns and sum pairwise comparisons of specific conditions in a large dataset

Answers (2)

Related Questions