Sebastian Hesse
Sebastian Hesse

Reputation: 545

Coding challenge: Calculate sum of col values if rows in other col are TRUE (table summary)

We have a table where the first col contains values (protein counts) while the following cols are logical vectors (T or F, indicating if protein_id has the property). For each col we seek the sum of all values where col = T and the count of T.

With example data the task might be better to describe:

[please excuse that the example data require a package for the random id generator, if you know a base R solution please wrote comment and I will include it here].

library("stringi")

value <- c(sample(2:5, 20 , replace=T),
           sample(6:10, 20 , replace=T), 
           sample(1:7, 20 ,  replace=T), 
           sample(3:10, 20 , replace=T), 
           sample(10:20, 20 , replace=T) )

data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE,FALSE), 20, TRUE),
  membrane = sample(c(TRUE,FALSE), 20, TRUE),
  mitochondria = sample(c(TRUE,FALSE), 20, TRUE))

For each property col we seek the sum of all values and the count of all ids. Next, check if TRUE in multiple cols. If yes: new col with string of all colnames sep by _ and sum of all values. Lastly a col with all ids sep by ;

expected_result_1 <- data.frame(
  property = c('nucleus', 'membrane', 'mitochondria', 'nucleus_ membrane'),
  value_sum = c('x', 'y', 'z', 'w'),
  n_ids = c(4, 3, 1, 2),
  ids = c("MSATv;1NFZ4;Kftq5;JANXo", "htiFJ;kCHtA8;jXXh", "kCHtA", "MSATv_htiFJ"))

A dplyr solutions would be great!

Thank you!

Sebastian

Upvotes: 1

Views: 92

Answers (2)

Dunois
Dunois

Reputation: 1843

Is this what you're looking for? (Code below has been edited!)

data %<>% 
  pivot_longer(cols = -matches("id|value"), names_to = "property", values_to = "is_true")

data %<>% filter(is_true)

data %>% 
  group_by(property) %>%
  transmute(value_sum = sum(value), 
            ids = paste0(id, collapse = ","), 
            n_ids = length(str_split(ids, ","))) %>%
  ungroup() %>% 
  distinct(property, .keep_all = TRUE)

# # A tibble: 3 x 4
# property    value_sum ids                                                                                      n_ids
# <chr>           <int> <chr>                                                                                    <int>
#   1 membrane          400 ACiov,3XyaR,z68K4,GaUxZ,YZioV,mEZtJ,J5T67,T6Ine,a33ed,Flgnx,g33vm,ACiov,3XyaR,z68K4,GaU…    55
#   2 nucleus           312 tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8EgQ,Sl9nM,T6Ine,2zgbM,tDCzP,H8DS4,3XyaR,z68K4,YZioV,t8E…    45
#   3 mitochondr…       310 tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6Ine,ViWyl,2zgbM,g33vm,tDCzP,Ey1PM,3XyaR,FWPXg,t8EgQ,T6I…    45

UPDATE:

I had misunderstood OP's problem. I'm including a solution here that still uses dplyr (somewhat) and uses gtools to calculate the new column names.

library("stringi")
library("stringr")
library("dplyr")
library("magrittr")
library("tidyr")
library("gtools")

#Sample data.
value <- c(sample(2:5, 20 , replace=T),
           sample(6:10, 20 , replace=T), 
           sample(1:7, 20 ,  replace=T), 
           sample(3:10, 20 , replace=T), 
           sample(10:20, 20 , replace=T) )

data <- data.frame(
  id = stringi::stri_rand_strings(20, 5),
  value = value,
  nucleus = sample(c(TRUE,FALSE), 20, TRUE),
  membrane = sample(c(TRUE,FALSE), 20, TRUE),
  mitochondria = sample(c(TRUE,FALSE), 20, TRUE))


expected_result_1 <- data.frame(
  property = c('nucleus', 'membrane', 'mitochondria', 'nucleus_ membrane'),
  value_sum = c('x', 'y', 'z', 'w'),
  n_ids = c(4, 3, 1, 2),
  ids = c("MSATv;1NFZ4;Kftq5;JANXo", "htiFJ;kCHtA8;jXXh", "kCHtA", "MSATv_htiFJ"))




#SOLUTION.

#Getting the column names.
cols <- data %>% select(-matches("id|value")) %>% colnames()

#Don't even ask. Suffices to know that generously using lapply() 
#enough times can convert anything into anything.
myfunc <- function(x){
  unique(
    unlist(
      lapply(
        lapply(
          str_split(
            apply(
              gtools::permutations(length(cols), x, v = cols),
              1, paste, collapse = "_"),
            "_"), 
          sort), 
        paste, collapse = "_")
      )
    )
}

#Using the friendly function from above to get all possible 
#combinations of the relevant column names.
newcols <- unlist(lapply(2:length(cols), myfunc))

#Adding these columns to the data.frame and evaluating them conditionally.
#(So if all corresponding individual columns are TRUE, then the new column
#is also TRUE, and FALSE otherwise.)
for(i in 1:length(newcols)){
  #i <- 1
  
  currefs <- unlist(str_split(newcols[i], "_"))
  data[, newcols[i]] <- NA
  data[, newcols[i]] <- eval(parse(text = paste0("data$", currefs, collapse = " & ")))
  
  #for(j in 1:nrow(data)){
  #  data[j, newcols[i]] <- eval(parse(text = paste0("data$", currefs, "[j]", collapse = " & ")))
  #}
  #eval(parse(text = paste0("data$", currefs)))
  
}


#Pivoting longer to gather all these columns together.
data %<>% 
  pivot_longer(cols = -matches("id|value"), names_to = "property", values_to = "is_true")

#Retaining only TRUE values.
data %<>% filter(is_true)

#Calculating value_sum, ids, and n_ids.
data %>% 
  group_by(property) %>%
  transmute(value_sum = sum(value), 
            ids = paste0(unique(id), collapse = ","), 
            n_ids = length(unique(unlist(str_split(ids, ","))))) %>%
  ungroup() %>% 
  distinct(property, .keep_all = TRUE)


# # A tibble: 7 x 4
# property                    value_sum ids                                                                      n_ids
# <chr>                           <int> <chr>                                                                    <int>
# 1 nucleus                           472 LpA3Q,2s04A,m1QMR,Lh4HH,wILrx,xKLfq,hUvsn,22cPw,NMTgz,V42mZ,GnJBd,Fwjbr…    13
# 2 membrane                          521 LpA3Q,m1QMR,kYSIh,Lh4HH,CDPr4,wILrx,qAPpb,Zfavp,hUvsn,22cPw,NMTgz,N0RPZ…    14
# 3 mitochondria                      252 LpA3Q,kYSIh,Zfavp,N0RPZ,V42mZ,GnJBd,onM21                                    7
# 4 membrane_mitochondria             182 LpA3Q,kYSIh,Zfavp,N0RPZ,GnJBd                                                5
# 5 membrane_nucleus                  338 LpA3Q,m1QMR,Lh4HH,wILrx,hUvsn,22cPw,NMTgz,GnJBd,Fwjbr                        9
# 6 mitochondria_nucleus              142 LpA3Q,V42mZ,GnJBd,onM21                                                      4
# 7 membrane_mitochondria_nucl…        72 LpA3Q,GnJBd                                                                  2

Upvotes: 1

ThomasIsCoding
ThomasIsCoding

Reputation: 101753

I am not sure whether the code below can give the desired output, but here is a base R attempt.

  • First, we can define a user function f, which helps to summarize the information by properties in data
f <- function(cols) {
  idx <- rowSums(data[cols]) == length(cols)
  data.frame(
    property = paste0(cols, collapse = "_"),
    value_sum = sum(data$value[idx],na.rm = TRUE),
    n_ids = length(unique(data$id[idx])),
    ids = toString(unique(data$id[idx]))
  )
}
  • Then, we select the columns (see v is the vector of selected column names), and run the following code
v <- c("nucleus", "membrane", "mitochondria")
output <- do.call(
  rbind,
  unlist(
    lapply(
      seq_along(v),
      function(k) combn(v, k, FUN = f, simplify = FALSE)
    ),
    recursive = FALSE
  )
)

and we will get

> output
                       property value_sum n_ids
1                       nucleus       406    11
2                      membrane       367    10
3                  mitochondria       278     8
4              nucleus_membrane       193     5
5          nucleus_mitochondria       135     4
6         membrane_mitochondria       136     4
7 nucleus_membrane_mitochondria        37     1
                                                                          ids
1 zMknh, TUJhp, QVf8L, P5vps, w4NX6, 2IVbG, AT0RG, SxiO7, ErRUg, 1wIAO, YgefT
2        P5vps, w4NX6, nj3Tv, 2IVbG, xRMA3, eZzb4, ErRUg, l9qwa, SQWq3, YgefT
3                      P5vps, QMw74, eZzb4, AT0RG, SxiO7, l9qwa, 1wIAO, SQWq3
4                                           P5vps, w4NX6, 2IVbG, ErRUg, YgefT
5                                                  P5vps, AT0RG, SxiO7, 1wIAO
6                                                  P5vps, eZzb4, l9qwa, SQWq3
7                                                                       P5vps

Upvotes: 3

Related Questions