Reputation: 1132
I am trying to create a new column, say test, with several conditions based on 3 columns. I am tryiing to achieve this with tidyverse only. Here are my conditions:
Yes
in one column, regardless of other categories (No/Unknown/NA
) in the same id across columns: previous_cabg, previous_pci, previous_ami
then assign Yes
in test variableNo
in all columns for the same id then assign NO for the test variableNO
for one column and NA/Unknown
in the other columns for the same id then assign with No
in the test variableYes
in all column for the same id then assign Yes
in the test variableYes in one column and
NA/Unknownfor the same id in each column then assign
Yes`in test variableThis is the type of dataset I have:
structure(list(id = c(112139L, 43919L, 92430L, 87137L, 95417L,
66955L, 16293L, 61396L, 25379L, 79229L, 27107L, 63243L, 50627L,
17968L, 83015L, 96549L, 7332L, 4873L, 98131L, 93506L, 52894L,
59327L, 85003L, 96623L, 82999L, 65769L, 67063L, 21744L, 62961L,
2229L, 103673L, 9367L, 60215L, 74044L, 58422L, 57530L, 100399L,
46483L, 108690L, 62017L, 46467L, 79562L, 4800L, 119158L, 103222L,
32908L, 14491L, 30293L, 52558L, 122304L, 42281L, 1553L, 111771L,
23087L, 30147L, 37842L, 51552L, 20148L, 28L, 7477L), previous_cabg = structure(c(1L,
1L, 1L, NA, 1L, NA, NA, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, NA, 1L, 1L, NA, 1L, NA, 1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 3L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Unknown", "Yes"), class = "factor"), previous_pci = structure(c(1L,
1L, 2L, NA, 1L, NA, NA, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, NA, 2L, 1L, NA, 2L, NA, 1L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L,
2L, 2L, NA, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, NA, 1L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes", "Unknown"), class = "factor"), previous_ami = structure(c(2L,
2L, 1L, 2L, 2L, NA, 2L, 1L, 2L, 2L, NA, 1L, 2L, 2L, 2L, 2L, 2L,
1L, NA, 1L, 2L, NA, 1L, NA, 2L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 1L,
2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, NA, 2L, 2L, 2L, 1L, 2L), .Label = c("Yes",
"No", "Unknown"), class = "factor")), row.names = c(NA, -60L), problems = structure(list(
row = c(34136L, 121773L, 121779L), col = c("1.01 Hospital identifier",
"1.01 Hospital identifier", "1.01 Hospital identifier"),
expected = c("value in level set", "value in level set",
"value in level set"), actual = c("CMH", "CMH", "CMH"), file = c("'../../data/changed/minap_2020_2021_second.csv'",
"'../../data/changed/minap_2020_2021_second.csv'", "'../../data/changed/minap_2020_2021_second.csv'"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
)), class = c("tbl_df", "tbl", "data.frame"))
And this is how it looks, but only first 10 rows, if you look in detail, I have different groups of matches across the 3 columns
# A tibble: 60 x 4
id previous_cabg previous_pci previous_ami
<int> <fct> <fct> <fct>
1 112139 No No No
2 43919 No No No
3 92430 No Yes Yes
4 87137 NA NA No
5 95417 No No No
6 66955 NA NA NA
7 16293 NA NA No
8 61396 No Yes Yes
9 25379 No Yes No
10 79229 No No No
I am hoping to solve this only with tidyverse or a mix of tidyverse and r base.
This is what I have tried, yet I feel it is not so wise. I believe it is not wise, since this code will be part of automation process and if I will get other categories, than Yes and No
, like Unknown
as thisn appeared later in the next dataset extracts, then I wish the code will avoid all the other cases from the conditions I have given above.
dplyr::mutate(first_attack =
dplyr::case_when(previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes'
# deal with the unknown category
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'No' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'No' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'No' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Unknown' ~ 'Yes',
previous_cabg == 'Yes' | previous_pci == 'Unknown' | previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'Unknown' | previous_pci == 'Yes'| previous_ami == 'Yes' ~ 'Yes',
previous_cabg == 'No' | previous_pci == 'No' | previous_ami == 'No' ~ 'No',
previous_cabg == 'Yes' | previous_pci == 'Yes' | previous_ami == 'Yes' ~'Yes'
))
Upvotes: 0
Views: 442
Reputation: 3326
These operations are rowwise()
, so they're not very efficient, but this solution in the tidyverse
should cleanly achieve what you want.
Let us call your sample dataset by the name dataset
. Then the following workflow
library(tidyverse)
# ...
# Code to generate your 'dataset'.
# ...
# Define custom logic across a single row.
get_first_attack <- function(values_across_row) {
# "Yes" overrides all other values.
if(isTRUE(any(values_across_row == "Yes"))){
return("Yes")
}
# "No" overrides all missing values: 'NA' and "Unknown".
else if(isTRUE(any(values_across_row == "No"))) {
return("No")
}
# "Unknown" overrides all other missing values: 'NA'.
else if(isTRUE(any(values_across_row == "Unknown"))) {
return("Unknown")
}
# All values are missing: 'NA'.
else {
return(as.character(NA))
}
}
dataset %>%
# Examine row by row.
dplyr::rowwise() %>%
# Compare values across each row according to the logic in 'get_first_attack()'.
dplyr::mutate(first_attack = get_first_attack(across(previous_cabg:previous_ami))) %>%
# Exit row-wise approach, to restore efficiency.
dplyr::ungroup() %>%
# Factor 'first_attack' exactly like its neighboring column.
dplyr::mutate(first_attack = factor(first_attack, levels = levels(previous_ami)))
should give you these results
# A tibble: 60 x 5
id previous_cabg previous_pci previous_ami first_attack
<int> <fct> <fct> <fct> <fct>
1 112139 No No No No
2 43919 No No No No
3 92430 No Yes Yes Yes
4 87137 NA NA No No
5 95417 No No No No
6 66955 NA NA NA NA
7 16293 NA NA No No
8 61396 No Yes Yes Yes
9 25379 No Yes No Yes
10 79229 No No No No
# ... with 50 more rows
where the first_attack
column is fittingly defined as a factor
with three levels: "Yes"
, "No"
, and "Unknown"
.
Upvotes: 2
Reputation: 126
So in summary, your condition is:
If this is the case, you can do:
# Convert your data structure into a data.frame
dat <- as.data.frame(dat)
# Remove id col
id <- dat$id
dat <- subset(dat, select = -c(id))
# For each row, check if there is a 'Yes' under any column. If so, return 'Yes'; otherwise return 'No'
output <- apply(dat, 1, function(x) ifelse('Yes' %in% x, 'Yes', 'No'))
# For each row, check if NA under all column. If so, return TRUE; otherwise return FALSE.
isNA <- apply(dat, 1, function(x) ifelse(all(is.na(x)), TRUE, FALSE))
# Now merge output and isNA
output[isNA] <- NA
# For each row, check if 'Unknown' under all column. If so, return TRUE; otherwise return FALSE.
isUK <- apply(dat, 1, function(x) ifelse(all('Unknown' == x), TRUE, FALSE))
# Now merge output and isUK
output[isUK] <- 'Unknown'
# Append the output character vector to a new col of the data frame
dat$id <- id
dat$test <- output
Upvotes: 2