Reputation: 31
I am trying to filter a large dataframe by ICD-10 codes.
I have the following dataframe as shown below.
prindx secdx1 secdx2 secdx3 secdx4 secdx5 secdx6 secdx7 secdx8 secdx9 secdx10 secdx11 secdx12 secdx13 secdx14 secdx15 secdx16 secdx17 secdx18 secdx19 secdx20 secdx21 secdx22 secdx23
1 S02118A R6520 J690 R403 A419 J151 J9621 E43 S32019A J15211 S22030A A047 T797XXA S22040A S32029A S32039A S32049A S32059A Y92413 X820XXA
9 S020XXB S066X9A S065X9A S064X9A S0102XA S82841A S62307A S52502A S52602A G9349 E872 D688 G910 J9601 T8119XA D62 G960 G9782 V5988XA Y92413 Y838
28 S14123A J9600 S069X9A J690 G8252 G9340 S0219XA S2232XA E872 E873 N179 E871 R1312 M62838 R410 S62521A B964 M4802 W130XXA Y9339 Y92038
44 S065X9A A4151 R6521 J690 J810 G40919 J90 L89152 E43 J155 J9600 T17590A E871 E872 I272 S066X9A S0219XA S061X9A Y9301 Y999 W108XXA
95 S0219XB J9600 J690 G9340 J95821 E871 R45851 D62 F1520 G960 G9782 R4701 E8351 G40909 S061X9A F0781 E876 K5900 X749XXA Y929 Y846
209 S12100A G9741 S062X0A N183 I129 R443 G4730 R410 Z96641 W109XXA Y838 Y92234
secdx24 secdx25 secdx26 secdx27 secdx28 secdx29 ID POISON
1 1 NULL
9 2 NULL
28 3 NULL
44 4 NULL
95 5 NULL
209 6 NULL
I am trying to select those that contain the following ICD codes "S52". However this is complicated further by the fact that I need those codes that have the 6th character of "1", "2", "3" or "4" only. For example S52602A If both of these conditions are met then I need a the column "POISON" value to equal "TRUE". Thanks!
Dput below
structure(list(prindx = c("S02118A", "S020XXB", "S14123A", "S065X9A",
"S0219XB", "S12100A", "S2243XA", "S32052A", "S0219XB", "S065X9A",
"S060X1A", "S060X9A", "S069X1A", "S065X0A", "S0210XA", "S72331A",
"S065X0A", "S0264XA", "S066X0A", "S066X0A"), secdx1 = c("R6520",
"S066X9A", "J9600", "A4151", "J9600", "G9741", "J9600", "S066X0A",
"I2699", "S066X9A", "R55", "F200", "S42292A", "I10", "J9601",
"S060X9A", "G935", "J690", "S12000A", "R4701"), secdx2 = c("J690",
"S065X9A", "S069X9A", "R6521", "J690", "S062X0A", "S270XXA",
"S37032A", "T794XXA", "J9601", "R112", "I10", "S42425A", "R531",
"S270XXA", "S42001A", "J9600", "G92", "E871", "I10"), secdx3 = c("R403",
"S064X9A", "J690", "J690", "G9340", "N183", "S069X9A", "S52572A",
"I82411", "S270XXA", "Y939", "S01112A", "V4959XA", "E669", "S066X5A",
"K5900", "R569", "Z6843", "E8342", "E780"), secdx4 = c("A419",
"S0102XA", "G8252", "J810", "J95821", "I129", "S25391A", "I252",
"E440", "J150", "W1830XA", "S022XXA", "Y92488", "Z6835", "G931",
"S8012XA", "D62", "E871", "B370", "R739"), secdx5 = c("J151",
"S82841A", "G9340", "G40919", "E871", "R443", "S3210XA", "E039",
"E872", "S22049A", "Y92011", "R40241", "", "W19XXXA", "R1312",
"S8011XA", "E8342", "D62", "F10129", "S065X0A"), secdx6 = c("J9621",
"S62307A", "S0219XA", "J90", "R45851", "G4730", "E872", "R0781",
"D62", "S02413A", "", "S0101XA", "", "", "S27321A", "R402352",
"E872", "J9601", "F1510", "R001"), secdx7 = c("E43", "S52502A",
"S2232XA", "L89152", "D62", "R410", "I959", "R0902", "J9602",
"S27322A", "", "H1130", "", "", "D62", "R402132", "I69354", "J9811",
"D509", "R079"), secdx8 = c("S32019A", "S52602A", "E872", "E43",
"F1520", "Z96641", "S27321A", "I4510", "J9601", "S22059A", "",
"E119", "", "", "G8191", "R402242", "F10239", "Z430", "F17210",
"E876"), secdx9 = c("J15211", "G9349", "E873", "J155", "G960",
"W109XXA", "S42322A", "E8809", "E871", "C9591", "", "Z23", "",
"", "S02401A", "V4362XA", "R1310", "S01411A", "S0219XA", "E8342"
), secdx10 = c("S22030A", "E872", "N179", "J9600", "G9782", "Y838",
"S0451XA", "F17210", "I824Z2", "J9811", "", "F319", "", "", "S023XXA",
"Y92410", "I10", "S1181XA", "R739", "W1830XA"), secdx11 = c("A047",
"D688", "E871", "T17590A", "R4701", "Y92234", "D62", "R312",
"J9811", "S0219XA", "", "S8251XA", "", "", "S065X5A", "", "F17210",
"E669", "G8911", "Y9321"), secdx12 = c("T797XXA", "G910", "R1312",
"E871", "E8351", "", "S52615A", "Z955", "G8101", "S01512A", "",
"S8010XA", "", "", "S062X5A", "", "S062X0A", "Z713", "R001",
"Y9289"), secdx13 = c("S22040A", "J9601", "M62838", "E872", "G40909",
"", "S66022A", "K219", "I82611", "H02209", "", "Z720", "", "",
"F0781", "", "E876", "L0889", "T50995A", ""), secdx14 = c("S32029A",
"T8119XA", "R410", "I272", "S061X9A", "", "F19921", "R413", "J449",
"J9809", "", "Y00XXXA", "", "", "D6959", "", "W010XXA", "B9562",
"W108XXA", ""), secdx15 = c("S32039A", "D62", "S62521A", "S066X9A",
"F0781", "", "S52272A", "V892XXA", "R1310", "S020XXA", "", "Y9389",
"", "", "F1210", "", "Y92003", "T4275XA", "Y92009", ""), secdx16 = c("S32049A",
"G960", "B964", "S0219XA", "E876", "", "S022XXA", "", "Z781",
"H1131", "", "Y9289", "", "", "R471", "", "", "S069X0A", "Y908",
""), secdx17 = c("S32059A", "G9782", "M4802", "S061X9A", "K5900",
"", "S42021A", "", "S062X6A", "R338", "", "", "", "", "H532",
"", "", "I10", "", ""), secdx18 = c("Y92413", "V5988XA", "W130XXA",
"Y9301", "X749XXA", "", "V4362XA", "", "X72XXXA", "W1809XA",
"", "", "", "", "V4988XA", "", "", "E8351", "", ""), secdx19 = c("X820XXA",
"Y92413", "Y9339", "Y999", "Y929", "", "Y939", "", "Y92009",
"Y990", "", "", "", "", "Y92411", "", "", "R739", "", ""), secdx20 = c("",
"Y838", "Y92038", "W108XXA", "Y846", "", "Y92411", "", "", "",
"", "", "", "", "Y9389", "", "", "D638", "", ""), secdx21 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"G4733", "", ""), secdx22 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "G8911", "", ""), secdx23 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"F1510", "", ""), secdx24 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "V535XXA", "", ""), secdx25 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"Y92413", "", ""), secdx26 = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "Y92230", "", ""), secdx27 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", ""), secdx28 = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", ""), secdx29 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", ""), ID = 1:20, POISON = c("NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL",
"NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"
)), row.names = c(1L, 9L, 28L, 44L, 95L, 209L, 286L, 363L, 506L,
608L, 628L, 699L, 760L, 778L, 905L, 909L, 1018L, 1027L, 1126L,
1315L), class = "data.frame")
Upvotes: 1
Views: 55
Reputation: 8275
If I'm understanding correctly, this looks in every column (everything()
) and returns TRUE if any column (if_any()
) matches the regular expression.
^S52..[1234]
means that the value starts (^
) with S52
, then has two characters that can be anything (.
and .
), which is a total of 5 characters. Then it looks in the 6th character for any of 1, 2, 3, or 4.
library(dplyr)
library(stringr)
your_df %>%
mutate(POISON = if_any(everything(), ~str_detect(., "^S52..[1234]")))
Results stored in the POISON
column:
[1] FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Additional details on regex, based on a comment:
To modify a regex literal, like abc
to include both "abc" and "def", you'd use (abc|def)
. So to include other prefixes, you could use ^(S52|S53|S54)..[01234]
. This particular example is the same as ^S5[234]..[01234]
, and also ^S5[2-4]..[1-4]
. Choose the flavor you find readable.
Upvotes: 1