k_grace
k_grace

Reputation: 31

Advanced filtering by string characters. Multiple conditions with substring

I am trying to filter a large dataframe by ICD-10 codes.

I have the following dataframe as shown below.

     prindx  secdx1  secdx2  secdx3  secdx4  secdx5  secdx6  secdx7  secdx8  secdx9 secdx10 secdx11 secdx12 secdx13 secdx14 secdx15 secdx16 secdx17 secdx18 secdx19 secdx20 secdx21 secdx22 secdx23
1   S02118A   R6520    J690    R403    A419    J151   J9621     E43 S32019A  J15211 S22030A    A047 T797XXA S22040A S32029A S32039A S32049A S32059A  Y92413 X820XXA                                
9   S020XXB S066X9A S065X9A S064X9A S0102XA S82841A S62307A S52502A S52602A   G9349    E872    D688    G910   J9601 T8119XA     D62    G960   G9782 V5988XA  Y92413    Y838                        
28  S14123A   J9600 S069X9A    J690   G8252   G9340 S0219XA S2232XA    E872    E873    N179    E871   R1312  M62838    R410 S62521A    B964   M4802 W130XXA   Y9339  Y92038                        
44  S065X9A   A4151   R6521    J690    J810  G40919     J90  L89152     E43    J155   J9600 T17590A    E871    E872    I272 S066X9A S0219XA S061X9A   Y9301    Y999 W108XXA                        
95  S0219XB   J9600    J690   G9340  J95821    E871  R45851     D62   F1520    G960   G9782   R4701   E8351  G40909 S061X9A   F0781    E876   K5900 X749XXA    Y929    Y846                        
209 S12100A   G9741 S062X0A    N183    I129    R443   G4730    R410  Z96641 W109XXA    Y838  Y92234                                                                                                
    secdx24 secdx25 secdx26 secdx27 secdx28 secdx29 ID POISON
1                                                    1   NULL
9                                                    2   NULL
28                                                   3   NULL
44                                                   4   NULL
95                                                   5   NULL
209                                                  6   NULL

I am trying to select those that contain the following ICD codes "S52". However this is complicated further by the fact that I need those codes that have the 6th character of "1", "2", "3" or "4" only. For example S52602A If both of these conditions are met then I need a the column "POISON" value to equal "TRUE". Thanks!

Dput below

structure(list(prindx = c("S02118A", "S020XXB", "S14123A", "S065X9A", 
"S0219XB", "S12100A", "S2243XA", "S32052A", "S0219XB", "S065X9A", 
"S060X1A", "S060X9A", "S069X1A", "S065X0A", "S0210XA", "S72331A", 
"S065X0A", "S0264XA", "S066X0A", "S066X0A"), secdx1 = c("R6520", 
"S066X9A", "J9600", "A4151", "J9600", "G9741", "J9600", "S066X0A", 
"I2699", "S066X9A", "R55", "F200", "S42292A", "I10", "J9601", 
"S060X9A", "G935", "J690", "S12000A", "R4701"), secdx2 = c("J690", 
"S065X9A", "S069X9A", "R6521", "J690", "S062X0A", "S270XXA", 
"S37032A", "T794XXA", "J9601", "R112", "I10", "S42425A", "R531", 
"S270XXA", "S42001A", "J9600", "G92", "E871", "I10"), secdx3 = c("R403", 
"S064X9A", "J690", "J690", "G9340", "N183", "S069X9A", "S52572A", 
"I82411", "S270XXA", "Y939", "S01112A", "V4959XA", "E669", "S066X5A", 
"K5900", "R569", "Z6843", "E8342", "E780"), secdx4 = c("A419", 
"S0102XA", "G8252", "J810", "J95821", "I129", "S25391A", "I252", 
"E440", "J150", "W1830XA", "S022XXA", "Y92488", "Z6835", "G931", 
"S8012XA", "D62", "E871", "B370", "R739"), secdx5 = c("J151", 
"S82841A", "G9340", "G40919", "E871", "R443", "S3210XA", "E039", 
"E872", "S22049A", "Y92011", "R40241", "", "W19XXXA", "R1312", 
"S8011XA", "E8342", "D62", "F10129", "S065X0A"), secdx6 = c("J9621", 
"S62307A", "S0219XA", "J90", "R45851", "G4730", "E872", "R0781", 
"D62", "S02413A", "", "S0101XA", "", "", "S27321A", "R402352", 
"E872", "J9601", "F1510", "R001"), secdx7 = c("E43", "S52502A", 
"S2232XA", "L89152", "D62", "R410", "I959", "R0902", "J9602", 
"S27322A", "", "H1130", "", "", "D62", "R402132", "I69354", "J9811", 
"D509", "R079"), secdx8 = c("S32019A", "S52602A", "E872", "E43", 
"F1520", "Z96641", "S27321A", "I4510", "J9601", "S22059A", "", 
"E119", "", "", "G8191", "R402242", "F10239", "Z430", "F17210", 
"E876"), secdx9 = c("J15211", "G9349", "E873", "J155", "G960", 
"W109XXA", "S42322A", "E8809", "E871", "C9591", "", "Z23", "", 
"", "S02401A", "V4362XA", "R1310", "S01411A", "S0219XA", "E8342"
), secdx10 = c("S22030A", "E872", "N179", "J9600", "G9782", "Y838", 
"S0451XA", "F17210", "I824Z2", "J9811", "", "F319", "", "", "S023XXA", 
"Y92410", "I10", "S1181XA", "R739", "W1830XA"), secdx11 = c("A047", 
"D688", "E871", "T17590A", "R4701", "Y92234", "D62", "R312", 
"J9811", "S0219XA", "", "S8251XA", "", "", "S065X5A", "", "F17210", 
"E669", "G8911", "Y9321"), secdx12 = c("T797XXA", "G910", "R1312", 
"E871", "E8351", "", "S52615A", "Z955", "G8101", "S01512A", "", 
"S8010XA", "", "", "S062X5A", "", "S062X0A", "Z713", "R001", 
"Y9289"), secdx13 = c("S22040A", "J9601", "M62838", "E872", "G40909", 
"", "S66022A", "K219", "I82611", "H02209", "", "Z720", "", "", 
"F0781", "", "E876", "L0889", "T50995A", ""), secdx14 = c("S32029A", 
"T8119XA", "R410", "I272", "S061X9A", "", "F19921", "R413", "J449", 
"J9809", "", "Y00XXXA", "", "", "D6959", "", "W010XXA", "B9562", 
"W108XXA", ""), secdx15 = c("S32039A", "D62", "S62521A", "S066X9A", 
"F0781", "", "S52272A", "V892XXA", "R1310", "S020XXA", "", "Y9389", 
"", "", "F1210", "", "Y92003", "T4275XA", "Y92009", ""), secdx16 = c("S32049A", 
"G960", "B964", "S0219XA", "E876", "", "S022XXA", "", "Z781", 
"H1131", "", "Y9289", "", "", "R471", "", "", "S069X0A", "Y908", 
""), secdx17 = c("S32059A", "G9782", "M4802", "S061X9A", "K5900", 
"", "S42021A", "", "S062X6A", "R338", "", "", "", "", "H532", 
"", "", "I10", "", ""), secdx18 = c("Y92413", "V5988XA", "W130XXA", 
"Y9301", "X749XXA", "", "V4362XA", "", "X72XXXA", "W1809XA", 
"", "", "", "", "V4988XA", "", "", "E8351", "", ""), secdx19 = c("X820XXA", 
"Y92413", "Y9339", "Y999", "Y929", "", "Y939", "", "Y92009", 
"Y990", "", "", "", "", "Y92411", "", "", "R739", "", ""), secdx20 = c("", 
"Y838", "Y92038", "W108XXA", "Y846", "", "Y92411", "", "", "", 
"", "", "", "", "Y9389", "", "", "D638", "", ""), secdx21 = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"G4733", "", ""), secdx22 = c("", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "G8911", "", ""), secdx23 = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"F1510", "", ""), secdx24 = c("", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "V535XXA", "", ""), secdx25 = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"Y92413", "", ""), secdx26 = c("", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "Y92230", "", ""), secdx27 = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), secdx28 = c("", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", ""), secdx29 = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), ID = 1:20, POISON = c("NULL", "NULL", "NULL", "NULL", 
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", 
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"
)), row.names = c(1L, 9L, 28L, 44L, 95L, 209L, 286L, 363L, 506L, 
608L, 628L, 699L, 760L, 778L, 905L, 909L, 1018L, 1027L, 1126L, 
1315L), class = "data.frame")

Upvotes: 1

Views: 55

Answers (1)

Brian
Brian

Reputation: 8275

If I'm understanding correctly, this looks in every column (everything()) and returns TRUE if any column (if_any()) matches the regular expression.

^S52..[1234] means that the value starts (^) with S52, then has two characters that can be anything (. and .), which is a total of 5 characters. Then it looks in the 6th character for any of 1, 2, 3, or 4.

library(dplyr)
library(stringr)

your_df %>%
  mutate(POISON = if_any(everything(), ~str_detect(., "^S52..[1234]")))

Results stored in the POISON column:

 [1] FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Additional details on regex, based on a comment: To modify a regex literal, like abc to include both "abc" and "def", you'd use (abc|def). So to include other prefixes, you could use ^(S52|S53|S54)..[01234]. This particular example is the same as ^S5[234]..[01234], and also ^S5[2-4]..[1-4]. Choose the flavor you find readable.

Upvotes: 1

Related Questions