GaB
GaB

Reputation: 1132

How to populate Yes and No when applying conditions accros 3 columns only with dplyr?

I am trying to create a new column, say test, with several conditions based on 3 columns. I am tryiing to achieve this with tidyverse only. Here are my conditions:

This is the type of dataset I have:

structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L, 
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L, 
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L, 
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L, 
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L, 
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L, 
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L, 
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L, 
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L, 
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No", 
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L, 
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No", 
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L, 
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L, 
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes", 
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
    row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier", 
    "1.01 Hospital identifier", "1.01 Hospital identifier"), 
    expected = c("value in level set", "value in level set", 
    "value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'", 
    "'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
    )), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))

And this is how it looks, but only first 10 rows, if you look in detail, I have different groups of matches across the 3 columns

# A tibble: 60 x 4
       id previous_cabg previous_pci previous_ami
    <int> <fct>         <fct>        <fct>       
 1 112139 No            No           No          
 2  43919 No            No           No          
 3  92430 No            Yes          Yes         
 4  87137 NA            NA           No          
 5  95417 No            No           No          
 6  66955 NA            NA           NA          
 7  16293 NA            NA           No          
 8  61396 No            Yes          Yes         
 9  25379 No            Yes          No          
10  79229 No            No           No        

I am hoping to solve this only with tidyverse or a mix of tidyverse and r base.

This is what I have tried, yet I feel it is not so wise. I believe it is not wise, since this code will be part of automation process and if I will get other categories, than Yes and No, like Unknown as thisn appeared later in the next dataset extracts, then I wish the code will avoid all the other cases from the conditions I have given above.

dplyr::mutate(first_attack = 
                  dplyr::case_when(previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'No'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes'
                                   
                                   # deal with the unknown category
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~  'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Unknown'  | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~  'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes', 
                                   previous_cabg == 'Yes' |  previous_pci == 'No'  | previous_ami == 'No'  ~  'Yes',
                                   previous_cabg == 'No'  |  previous_pci == 'No'  | previous_ami == 'Yes' ~  'Yes',
                                   previous_cabg == 'No'  | previous_pci == 'Yes'  | previous_ami == 'No'  ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes', 
                                   previous_cabg == 'Yes' |  previous_pci == 'Unknown'   | previous_ami == 'Unknown'   ~  'Yes',
                                   previous_cabg == 'Unknown'   |  previous_pci == 'Unknown'   | previous_ami == 'Yes' ~  'Yes',
                                   previous_cabg == 'Unknown'   | previous_pci == 'Yes'  | previous_ami == 'Unknown'   ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'No'  | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Unknown'  | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes', 
                                   previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes', 
                                   previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes', 
                                   
                                   
                                   previous_cabg == 'No'  | previous_pci == 'No'  |  previous_ami == 'No' ~ 'No', 
                                   previous_cabg == 'Yes' | previous_pci == 'Yes' |  previous_ami == 'Yes' ~'Yes'
                                   
                  ))

Upvotes: 0

Views: 442

Answers (2)

Greg
Greg

Reputation: 3326

These operations are rowwise(), so they're not very efficient, but this solution in the tidyverse should cleanly achieve what you want.

Let us call your sample dataset by the name dataset. Then the following workflow

library(tidyverse)


# ...
# Code to generate your 'dataset'.
# ...


# Define custom logic across a single row.
get_first_attack <- function(values_across_row) {
  # "Yes" overrides all other values.
  if(isTRUE(any(values_across_row == "Yes"))){
    return("Yes")
  }
  # "No" overrides all missing values: 'NA' and "Unknown".
  else if(isTRUE(any(values_across_row == "No"))) {
    return("No")
  }
  # "Unknown" overrides all other missing values: 'NA'.
  else if(isTRUE(any(values_across_row == "Unknown"))) {
    return("Unknown")
  }
  # All values are missing: 'NA'.
  else {
    return(as.character(NA))
  }
}


dataset %>%
  # Examine row by row.
  dplyr::rowwise() %>%
  # Compare values across each row according to the logic in 'get_first_attack()'.
  dplyr::mutate(first_attack = get_first_attack(across(previous_cabg:previous_ami))) %>%
  # Exit row-wise approach, to restore efficiency.
  dplyr::ungroup() %>%
  # Factor 'first_attack' exactly like its neighboring column.
  dplyr::mutate(first_attack = factor(first_attack, levels = levels(previous_ami)))

should give you these results

# A tibble: 60 x 5
       id previous_cabg previous_pci previous_ami first_attack
    <int> <fct>         <fct>        <fct>        <fct>       
 1 112139 No            No           No           No          
 2  43919 No            No           No           No          
 3  92430 No            Yes          Yes          Yes         
 4  87137 NA            NA           No           No          
 5  95417 No            No           No           No          
 6  66955 NA            NA           NA           NA          
 7  16293 NA            NA           No           No          
 8  61396 No            Yes          Yes          Yes         
 9  25379 No            Yes          No           Yes         
10  79229 No            No           No           No          
# ... with 50 more rows

where the first_attack column is fittingly defined as a factor with three levels: "Yes", "No", and "Unknown".

Upvotes: 2

Yuan Yin
Yuan Yin

Reputation: 126

So in summary, your condition is:

  • For each row, if any column is 'Yes', output 'Yes'
  • For each row, if all column is NA, output NA
  • For each row, if all column is 'Unknown', output 'Unknown'
  • Otherwise output 'No'

If this is the case, you can do:

# Convert your data structure into a data.frame
dat <- as.data.frame(dat)

# Remove id col
id <- dat$id
dat <- subset(dat, select = -c(id))

# For each row, check if there is a 'Yes' under any column. If so, return 'Yes'; otherwise return 'No'
output <- apply(dat, 1, function(x) ifelse('Yes' %in% x, 'Yes', 'No'))

# For each row, check if NA under all column. If so, return TRUE; otherwise return FALSE.
isNA <- apply(dat, 1, function(x) ifelse(all(is.na(x)), TRUE, FALSE))

# Now merge output and isNA
output[isNA] <- NA

# For each row, check if 'Unknown' under all column. If so, return TRUE; otherwise return FALSE.
isUK <- apply(dat, 1, function(x) ifelse(all('Unknown' == x), TRUE, FALSE))

# Now merge output and isUK
output[isUK] <- 'Unknown'

# Append the output character vector to a new col of the data frame
dat$id <- id
dat$test <- output

Upvotes: 2

Related Questions