Hard_Course
Hard_Course

Reputation: 311

More efficient program to create columns and sum pairwise comparisons of specific conditions in a large dataset

I have a dataset like this:

dat1 <- read.table(text = "
  nodepair 3 4 5
1    A6_A1 2 5 1
2    A6_A2 2 5 1
3    A6_A3 2 5 1
4    AL_A1 1 0 0
5     D_A6 0 3 0
6     F_A1 1 0 1
7      H_D 0 0 2
8      H_H 0 0 2 
", header = TRUE, check.names = FALSE)

And I need to write a program to efficiently create new columns which sums each pairwise comparison when a zero appears between pairs-- specifically to create the 'b' condition is when a nodepair is unique to the first variable but not the second, and the 'c' condition is when a nodepair is unique to the second and not the first. Here is the desired output:

dat2 <- read.table(text = "
  nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
4    AL_A1 1 0 0    1   NA    1   NA    0    0
5     D_A6 0 3 0   NA    3    0    0    3   NA
6     F_A1 1 0 1    1   NA   NA   NA   NA    1
7      H_D 0 0 2    0    0   NA    2   NA    2
8      H_H 0 0 2    0    0   NA    2   NA    2  
", header = TRUE, check.names = FALSE)

This code will work but is certainly less than ideal for my much larger dataset:

dat1 <- dat1 %>%
  mutate('3-4b' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-4c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`4`)))))) %>%
  mutate('3-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  mutate('3-5c' = case_when(`3` == 0 ~ as.integer(rowSums(across(c(`3`,`5`)))))) %>%
  
  mutate('4-5b' = case_when(`5` == 0 ~ as.integer(rowSums(across(c(`4`,`5`)))))) %>%
  mutate('4-5c' = case_when(`4` == 0 ~ as.integer(rowSums(across(c(`4`,`5`)))))) 

Upvotes: 2

Views: 94

Answers (2)

M--
M--

Reputation: 29153

library(dplyr)
library(purrr)

t(combn(names(dat1)[-1], 2)) %>% 
  rbind(., .[,c(2,1)]) %>% 
  cbind(rep(c("b", "c"), each = nrow(.)/2)) %>% 
  as.data.frame() %>% 
  mutate(V4 = ifelse(V3 == "b", paste0(V1, "-", V2, V3), 
                                paste0(V2, "-", V1, V3))) %>% 
  arrange(V4) %>% 
  group_split(row_number(), .keep = FALSE) %>% 
  map(., 
      ~ dat1 %>% transmute(!!.x[[4]] := 
                             rowSums(cbind(dat1[, .x[[1]]], 
                                           dat1[, .x[[2]]])) * 
                             ifelse(dat1[, .x[[2]]] == 0, 1, NA))) %>% 
  bind_cols(dat1, .)
  

#>   nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
#> 1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
#> 2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
#> 3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
#> 4    AL_A1 1 0 0    1   NA    1   NA    0    0
#> 5     D_A6 0 3 0   NA    3    0    0    3   NA
#> 6     F_A1 1 0 1    1   NA   NA   NA   NA    1
#> 7      H_D 0 0 2    0    0   NA    2   NA    2
#> 8      H_H 0 0 2    0    0   NA    2   NA    2

Created on 2024-02-15 with reprex v2.0.2

Upvotes: 2

Onyambu
Onyambu

Reputation: 79328

In base R you could write a small function to do the task and use combn to iterate over the 2 combinations:

fn <- function(x){
  i1 <- x[, 1] == 0
  i2 <- x[, 2] == 0
  x[] <- ifelse(i1 | i2, rowSums(x), NA)
  x[i1, 1] <- NA
  x[i2, 2] <- NA
  x[i1&i2,]  <- 0
  names(x) <- paste0(paste(names(x), collapse = "-"), c('b','c'))
  x
}


cbind(dat1,do.call(cbind, combn(dat1[-1], 2, fn, simplify = FALSE)))



 nodepair 3 4 5 3-4b 3-4c 3-5b 3-5c 4-5b 4-5c
1    A6_A1 2 5 1   NA   NA   NA   NA   NA   NA
2    A6_A2 2 5 1   NA   NA   NA   NA   NA   NA
3    A6_A3 2 5 1   NA   NA   NA   NA   NA   NA
4    AL_A1 1 0 0    1   NA    1   NA    0    0
5     D_A6 0 3 0   NA    3    0    0    3   NA
6     F_A1 1 0 1    1   NA   NA   NA   NA    1
7      H_D 0 0 2    0    0   NA    2   NA    2
8      H_H 0 0 2    0    0   NA    2   NA    2

Upvotes: 2

Related Questions