Reputation: 1036
I have a data frame like this:
df<-structure(list(MRN = c("53634", "65708", "72122", "40458", "03935",
"67473", "20281", "52479", "10261", "40945", "40630", "92295",
"43505", "80719", "39492", "44720", "70691", "21351", "03457",
"02182"), Outcome_Diagnosis_1 = c(NA, NA, NA, "Seroma of breast [N64.89]",
"Breast implant capsular contracture [T85.44XA]; Breast implant capsular contracture [T85.44XA]; Breast implant capsular contracture [T85.44XA]",
NA, NA, NA, "Acquired breast deformity [N64.89]", NA, NA, NA,
NA, "Acquired breast deformity [N64.89]", NA, NA, NA, NA, NA,
NA), Outcome_Diagnosis_2 = c(NA, NA, NA, "Extrusion of breast implant, initial encounter [T85.49XA]; Extrusion of breast implant, initial encounter [T85.49XA]; Extrusion of breast implant, initial encounter [T85.49XA]",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Capsular contracture of breast implant, subsequent encounter [T85.44XD]; Capsular contracture of breast implant, subsequent encounter [T85.44XD]; Capsular contracture of breast implant, subsequent encounter [T85.44XD]",
NA, NA, NA, NA, NA, NA), Outcome_Diagnosis_3 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Acquired breast deformity [N64.89]; Capsular contracture of breast implant, initial encounter [T85.44XA]; Capsular contracture of breast implant, initial encounter [T85.44XA]; Capsular contracture of breast implant, initial encounter [T85.44XA]",
NA, NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
And I have a few vectors like this:
Infection<-c("L76","L00", "L01","L02","L03","L04", "L05","L08")
Hematoma<-c("N64.89","M79.81")
Seroma<- c("L76.34")
Necrosis<- c("N64.1","T86.821")
CapsularContracture<- c("T85.44")
MechanicalComplications<- c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49")
What I'd like to do is create new columns in the data frame that are TRUE
/FALSE
for if that vector was found in each row. (And it would just be TRUE even if it shows up multiple times in that row, i.e. it doesn't need to "count" them).
So the output I want would be something like this:
The reason I am struggling and came to stack for help is I don't really know how to combine searching for particular strings (that might be within a longer sentence in that column) and looking over multiple columns.
Additional Info that might be important:
Upvotes: 2
Views: 442
Reputation: 21908
Here is another base R solution you could use albeit similar to some extent. As pointed out cleverly by dear @r2evans I also changed my pattern matching to fixed = TRUE
which I was not aware of in the first place:
cbind(df, as.data.frame(do.call(cbind, lst |>
lapply(function(a) {
sapply(a, function(b) {
apply(df[-1], 1, function(c) as.logical(Reduce(`+`, grepl(b, c, fixed = TRUE))))
}) |> rowSums() |> as.logical()
}))))
Infection Hematoma Seroma Necrosis CapsularContracture MechanicalComplications
1 FALSE FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE FALSE FALSE
4 FALSE TRUE FALSE FALSE FALSE TRUE
5 FALSE FALSE FALSE FALSE TRUE TRUE
6 FALSE FALSE FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE FALSE FALSE
8 FALSE FALSE FALSE FALSE FALSE FALSE
9 FALSE TRUE FALSE FALSE FALSE FALSE
10 FALSE FALSE FALSE FALSE FALSE FALSE
11 FALSE FALSE FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE FALSE FALSE
14 FALSE TRUE FALSE FALSE TRUE TRUE
15 FALSE FALSE FALSE FALSE FALSE FALSE
16 FALSE FALSE FALSE FALSE FALSE FALSE
17 FALSE FALSE FALSE FALSE FALSE FALSE
18 FALSE FALSE FALSE FALSE FALSE FALSE
19 FALSE FALSE FALSE FALSE FALSE FALSE
20 FALSE FALSE FALSE FALSE FALSE FALSE
In order to accommodate the result I only put the output of newly created columns here, but the code binds them to the original data set.
lst <- list(Infection = c("L76", "L00", "L01", "L02", "L03", "L04",
"L05", "L08"), Hematoma = c("N64.89", "M79.81"), Seroma = "L76.34",
Necrosis = c("N64.1", "T86.821"), CapsularContracture = "T85.44",
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42",
"T85.43", "T85.49"))
Upvotes: 1
Reputation: 39858
You could store your vectors in a list:
lst <- list(Infection = c("L76","L00", "L01","L02","L03","L04", "L05","L08"),
Hematoma = c("N64.89","M79.81"),
Seroma = c("L76.34"),
Necrosis = c("N64.1","T86.821"),
CapsularContracture = c("T85.44"),
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49"))
And then, using dplyr
and purrr
you could do:
imap(lst,
~ df %>%
mutate(!!.y := reduce(across(Outcome_Diagnosis_1:Outcome_Diagnosis_3, function(y) grepl(paste(sub("\\.", "", .x), collapse = "|"), sub("\\.", "", y))), `|`))) %>%
reduce(full_join)
MRN Outcome_Diagnos… Outcome_Diagnos… Outcome_Diagnos… Infection Hematoma Seroma Necrosis CapsularContrac…
<chr> <chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
1 53634 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
2 65708 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
3 72122 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
4 40458 Seroma of breas… Extrusion of br… <NA> FALSE TRUE FALSE FALSE FALSE
5 03935 Breast implant … <NA> <NA> FALSE FALSE FALSE FALSE TRUE
6 67473 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
7 20281 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
8 52479 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
9 10261 Acquired breast… <NA> <NA> FALSE TRUE FALSE FALSE FALSE
10 40945 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE
Upvotes: 3
Reputation: 160417
out <- lapply(manythings, function(thing) {
rowSums(
do.call(cbind, lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE)))
) > 0
})
tibble(cbind(df, out))
# # A tibble: 20 x 10
# MRN Outcome_Diagnosis_1 Outcome_Diagnosis_2 Outcome_Diagnosis_3 Infection Hematoma Seroma Necrosis CapsularContrac~ MechanicalCompl~
# <chr> <chr> <chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 53634 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 2 65708 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 3 72122 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 4 40458 Seroma of breast [N6~ Extrusion of breast ~ <NA> FALSE TRUE FALSE FALSE FALSE TRUE
# 5 03935 Breast implant capsu~ <NA> <NA> FALSE FALSE FALSE FALSE TRUE TRUE
# 6 67473 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 7 20281 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 8 52479 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 9 10261 Acquired breast defo~ <NA> <NA> FALSE TRUE FALSE FALSE FALSE FALSE
# 10 40945 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 11 40630 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 12 92295 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 13 43505 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 14 80719 Acquired breast defo~ Capsular contracture~ Acquired breast defo~ FALSE TRUE FALSE FALSE TRUE TRUE
# 15 39492 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 16 44720 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 17 70691 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 18 21351 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 19 03457 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
# 20 02182 <NA> <NA> <NA> FALSE FALSE FALSE FALSE FALSE FALSE
Same as @tmfmnk, I recommend putting all of your patterns into a named list:
manythings <- list(
Infection = c("L76","L00", "L01","L02","L03","L04", "L05","L08"),
Hematoma = c("N64.89","M79.81"),
Seroma = c("L76.34"),
Necrosis = c("N64.1","T86.821"),
CapsularContracture = c("T85.44"),
MechanicalComplications = c("T85", "T85.4", "T85.41", "T85.42", "T85.43", "T85.49"))
Also, we should note that while grepl
is good for this, it does not vectorize the pattern=
argument, so we need to do that externally. Further, since some of your patterns have regex-sensitive characters (i.e., .
which matches anything), we need to guard against regex-injection. For instance, if we aren't careful, then "N64.89"
as a pattern will incorrectly match "N64989"
. For this, I use fixed=TRUE
as a safeguard. Unfortunately, this also hampers our ability to shape the patterns such that we can check for all of them in one step. Instead, we'll vectorize it, searching a fixed-regex (single element of one of your vectors of patterns) and aggregate the results.
So let's do one of the pattern-vectors against one column of the frame:
Vectorize(grepl, vectorize.args = "pattern")(manythings[[2]], df[[2]], fixed = TRUE)
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# [4,] TRUE FALSE
# [5,] FALSE FALSE
# [6,] FALSE FALSE
# [7,] FALSE FALSE
# [8,] FALSE FALSE
# [9,] TRUE FALSE
# [10,] FALSE FALSE
# [11,] FALSE FALSE
# [12,] FALSE FALSE
# [13,] FALSE FALSE
# [14,] TRUE FALSE
# [15,] FALSE FALSE
# [16,] FALSE FALSE
# [17,] FALSE FALSE
# [18,] FALSE FALSE
# [19,] FALSE FALSE
# [20,] FALSE FALSE
Now we can reduce that so that we know of one of the patterns is found within each cell of this one column:
rowSums(
Vectorize(grepl, vectorize.args = "pattern")(manythings[[2]], df[[2]], fixed = TRUE)
) > 0
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
Now we can iterate that process over each of the vectors of patterns within manythings
:
lapply(manythings, function(thing) {
rowSums(
Vectorize(grepl, vectorize.args = "pattern")(thing, df[[2]], fixed = TRUE)
) > 0
})
# $Infection
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $Hematoma
# [1] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
# $Seroma
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $Necrosis
# [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $CapsularContracture
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# $MechanicalComplications
# [1] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
All of this has been to a single column, df[[2]]
. In order to apply this across multiple (selectable) columns, I'll employ some tricks with column-binding (in the code at the top). To break that down,
lapply(df[,2:4], ...)
subsets the data we want to search so just a few columns. Any way you want to select columns will fit here. This will return a list
of matrices, something like:
lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE))
# $Outcome_Diagnosis_1
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
# $Outcome_Diagnosis_2
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
# $Outcome_Diagnosis_3
# N64.89 M79.81
# [1,] FALSE FALSE
# [2,] FALSE FALSE
# [3,] FALSE FALSE
# ...
do.call(cbind, ...)
will take each of those embedded matrices and combine them into a single matrix:
do.call(cbind, lapply(df[,2:4], function(col) Vectorize(grepl, vectorize.args = "pattern")(thing, col, fixed = TRUE)))
# N64.89 M79.81 N64.89 M79.81 N64.89 M79.81
# [1,] FALSE FALSE FALSE FALSE FALSE FALSE
# [2,] FALSE FALSE FALSE FALSE FALSE FALSE
# [3,] FALSE FALSE FALSE FALSE FALSE FALSE
# [4,] TRUE FALSE FALSE FALSE FALSE FALSE
# ...
which allows us to use rowSums(.) > 0
to determine if any patterns (in each column) is met for each row.
Upvotes: 3