Nelly
Nelly

Reputation: 423

Search many variables with a vector of patterns in R/tidyverse

I would like to search many variables (key1 to key30) within my dataframe (data_df) for any of the patterns (stored in vector "my_patterns"). The results would be stored in 30 dummy variables/columns (key1_match to key30_match) for each observation, with 1 indicating that "keyX" variable has a match to one of the values in "my_patterns" vector, while 0 no match. for a particular observation. All I need to know is that there is a match, not which match.

How can I do this in R and preferably using tidyverse functions?

my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
data_df <- structure(list(id = c(1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1317, 11832, 
1943, 1316, 8317, 13405, 12881, 12881, 12881, 12881, 12881, 12881, 
12882, 12882, 12882, 12882, 12883, 12883, 12883), key1 = c("", 
"", "", "", "DR029", "", "AF063", "UJD05", "JCF12", "", "AF021", 
"DT022", "XS912", "UJC12", "UJD05", "JAH00", "UJD02", "DT016", 
"DT016", "", "DV071", "DR029", "2154", "", "AJ079", "XV018", 
"7462", "7460", "LEG10"), key2 = c(NA, NA, NA, NA, NA, NA, NA, 
NA, "JFF00", NA, "AF021", "DT022", "XS912", "UJC12", "UJD05", 
"JAH00", "UJD05", "DT017", "DT017", NA, "DV022", "JDB10", NA, 
NA, "AJ080", NA, NA, "7461", "LCA06"), key3 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, "UJD02", NA, "AF021", "DT022", "ZV033", "UJC12", 
"UJD05", "JAH00", "AF012", "DT019", "DT019", NA, "DV079", NA, 
NA, NA, "DR029", NA, NA, "7469", NA), key4 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", 
"JAH00", "AG053", NA, "DT024", NA, "DV027", NA, NA, NA, "DT016", 
NA, NA, "7280", NA), key5 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, "AF021", "DT022", "DV071", "UJC12", "UJD05", "JKB30", 
"JAH01", NA, NA, NA, "DV064", NA, NA, NA, "UJD02", NA, NA, NA, 
NA), key6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", 
"DT022", "DV071", "UJC12", "UJD05", "JKB30", "JCA55", NA, NA, 
NA, "DV040", NA, NA, NA, NA, NA, NA, NA, NA), key7 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DT022", "DV071", "UJD05", 
"JCA55", "JKB30", "UJD02", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA), key8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
"UJD02", "DV051", "DV071", "UJD05", "JCA55", "JKB30", NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key9 = c(NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", "DV053", "DV071", "UJD05", 
"JCA55", "JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA), key10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "UJD02", 
"DV055", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA), key11 = c(NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", "UJD05", "TPW99", 
"JFK10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    key12 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053", 
    "DV057", "DV071", "UJD05", "TPW99", "JFK10", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key13 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AF053", "DV057", "DV071", 
    "JCA55", "AJ050", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key14 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AF053", "DV057", "DV071", "JCA55", "AJ050", 
    "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AF053", 
    "DV057", "DV071", "JCA55", "AJ050", "JFB40", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key16 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071", 
    "JCA55", "AG040", "JFB40", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key17 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "AG040", 
    "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key18 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AG009", 
    "DV057", "DV071", "JCA55", "AG040", "JFF23", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key19 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AG009", "DV057", "DV071", 
    "JCA55", "XS009", "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key20 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AG009", "DV057", "DV071", "JCA55", "XS009", 
    "JFF23", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key21 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", 
    "DV057", "DT016", "JCA55", "XS009", "JWA00", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key22 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071", 
    "JCA55", "UJD05", "JWA00", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key23 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "AJ004", "DV057", "XS918", "JCA55", "UJD05", 
    "JWA00", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), key24 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", 
    "DV057", "DV071", "JCA55", "JCA55", "JWA00", NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key25 = c(NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "AJ004", "DV057", "DV071", 
    "JCA55", "TPW99", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key26 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "DV057", "DV071", "JCA55", "AJ050", "QBB99", 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key27 = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071", 
    "JCA55", "AG040", "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), key28 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, "DV057", "DV071", "JCA55", "XS009", "QBB99", 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), key29 = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "DV057", "DV071", 
    "JCA55", NA, "QBB99", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA), key30 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, "DV057", "DV071", "JCA55", NA, "QBB99", NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11814L, 106482L, 17355L, 
11807L, 74026L, 120903L, 116030L, 116031L, 116032L, 116033L, 
116034L, 116035L, 116036L, 116037L, 116038L, 116039L, 116040L, 
116041L, 116042L), class = "data.frame")

Upvotes: 1

Views: 254

Answers (3)

PaulS
PaulS

Reputation: 25383

Yet another solution:

library(tidyr)

data_df %>% 
  pivot_wider(id,values_from = -id, values_fn=function(x) sum(x %in% my_patterns))

#> # A tibble: 12 × 31
#>       id key1_ key2_ key3_ key4_ key5_ key6_ key7_ key8_ key9_ key10_ key11_
#>    <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>  <int>  <int>
#>  1     1     0     0     0     0     0     0     0     0     0      0      0
#>  2     2     0     0     0     0     0     0     0     0     0      0      0
#>  3     3     1     0     1     0     0     0     0     0     0      0      0
#>  4  1317     1     1     1     1     1     1     1     1     1      1      0
#>  5 11832     1     1     1     1     1     1     1     0     1      0      0
#>  6  1943     0     0     0     0     0     0     0     0     0      0      0
#>  7  1316     1     1     1     1     1     1     1     1     1      1      1
#>  8  8317     1     1     1     1     1     1     1     1     1      0      0
#>  9 13405     0     0     0     0     0     0     0     0     0      0      0
#> 10 12881     1     1     1     1     1     1     1     0     0      0      0
#> 11 12882     0     0     0     0     1     0     0     0     0      0      0
#> 12 12883     0     0     0     0     0     0     0     0     0      0      0
#> # … with 19 more variables: key12_ <int>, key13_ <int>, key14_ <int>,
#> #   key15_ <int>, key16_ <int>, key17_ <int>, key18_ <int>, key19_ <int>,
#> #   key20_ <int>, key21_ <int>, key22_ <int>, key23_ <int>, key24_ <int>,
#> #   key25_ <int>, key26_ <int>, key27_ <int>, key28_ <int>, key29_ <int>,
#> #   key30_ <int>

Upvotes: 0

GuedesBF
GuedesBF

Reputation: 9878

With dplyr, we can use mutate with all columns that starts_with 'key'. value %in% my_pattern outputs TRUE/FALSE if any pattern matches the value. We can coerce to numeric with +(...). Finally, summarise by id with max

library(dplyr)

data_df %>%
    mutate(across(starts_with('key'), ~ +(.x %in% my_patterns))) %>%
    group_by(id)%>%
    summarise(across(starts_with('key'), max))


# A tibble: 12 × 31
      id  key1  key2  key3  key4  key5  key6  key7  key8  key9 key10 key11 key12 key13 key14 key15 key16 key17 key18 key19 key20 key21 key22
   <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 2     2     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 3     3     1     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 4  1316     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1     1
 5  1317     1     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0
 6  1943     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 7  8317     1     1     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     1
 8 11832     1     1     1     1     1     1     1     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0
 9 12881     1     1     1     1     1     1     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
10 12882     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
11 12883     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
12 13405     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
# … with 8 more variables: key23 <int>, key24 <int>, key25 <int>, key26 <int>, key27 <int>, key28 <int>, key29 <int>, key30 <int>

Or we can simplify it a little further by including all transformations inside summarise(across():

library(dplyr)

data_df %>%
    group_by(id) %>%
    summarise(across(starts_with('key'), ~+(any(.x %in% my_patterns))))

Upvotes: 2

danlooo
danlooo

Reputation: 10637

library(tidyverse)

my_patterns <- c("AF021", "DT022", "DV053", "UJC12", "UJD02", "UJD05", "AF012", "AG053", "JAH01", "JCA55", "QBB99")
# any element -> OR regex
my_regex <- paste0(my_patterns, collapse = "|")

data_df %>%
  as_tibble() %>%
  pivot_longer(-id) %>%
  transmute(
    id,
    name,
    value = value %>% map_dbl(~ .x %>%
      str_detect(my_regex) %>%
      replace_na(0))
  ) %>%
  distinct(id, name, .keep_all = TRUE) %>%
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 12 x 31
#>       id  key1  key2  key3  key4  key5  key6  key7  key8  key9 key10 key11 key12
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1     1     0     0     0     0     0     0     0     0     0     0     0     0
#>  2     2     0     0     0     0     0     0     0     0     0     0     0     0
#>  3     3     0     0     0     0     0     0     0     0     0     0     0     0
#>  4  1317     1     1     1     1     1     1     1     1     1     1     0     0
#>  5 11832     1     1     1     1     1     1     1     0     1     0     0     0
#>  6  1943     0     0     0     0     0     0     0     0     0     0     0     0
#>  7  1316     1     1     1     1     1     1     1     1     1     1     1     1
#>  8  8317     1     1     1     1     1     1     1     1     1     0     0     0
#>  9 13405     0     0     0     0     0     0     0     0     0     0     0     0
#> 10 12881     1     1     1     1     1     1     1     0     0     0     0     0
#> 11 12882     0     0     0     0     0     0     0     0     0     0     0     0
#> 12 12883     0     0     0     0     0     0     0     0     0     0     0     0
#> # … with 18 more variables: key13 <dbl>, key14 <dbl>, key15 <dbl>, key16 <dbl>,
#> #   key17 <dbl>, key18 <dbl>, key19 <dbl>, key20 <dbl>, key21 <dbl>,
#> #   key22 <dbl>, key23 <dbl>, key24 <dbl>, key25 <dbl>, key26 <dbl>,
#> #   key27 <dbl>, key28 <dbl>, key29 <dbl>, key30 <dbl>

Created on 2021-12-01 by the reprex package (v2.0.1)

Upvotes: 1

Related Questions