Reputation: 865
I have the following data:
events <- structure(list(ID = structure(c(7L, 7L, 7L, 7L, 5L, 5L, 5L, 6L,
6L, 6L, 8L, 8L, 9L, 9L, 10L, 10L, 1L, 2L, 3L, 4L), .Label = c("ST10273049951.90",
"ST10273085397.90", "ST10273204081.90", "ST10273262134.90", "ST10273679485.90",
"ST10274019056.90", "ST10274109968.90", "ST10274426483.90", "ST10274430159.90",
"ST10274433406.90"), class = "factor"), TVTProcedureStartDate = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 9L, 9L, 9L, 6L, 6L, 8L, 8L, 7L, 7L, 4L,
5L, 10L, 3L), .Label = c("11/6/2015", "12/15/2016", "12/16/2014",
"5/20/2014", "6/17/2014", "6/30/2017", "7/18/2017", "7/6/2017",
"9/13/2016", "9/9/2014"), class = "factor"), DCDate = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 10L, 10L, 10L, 7L, 7L, 6L, 6L, 8L, 8L,
4L, 5L, 9L, 3L), .Label = c("1/1/2017", "11/15/2015", "12/22/2014",
"5/23/2014", "6/29/2014", "7/15/2017", "7/5/2017", "8/3/2017",
"9/13/2014", "9/22/2016"), class = "factor"), CE_EventOccurred = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "Yes", class = "factor"), CE_EventDate = structure(c(4L,
6L, 7L, 8L, 2L, 3L, 1L, 18L, 19L, 20L, 11L, 14L, 15L, 16L, 12L,
13L, 9L, 10L, 17L, 5L), .Label = c("11/10/2015", "11/6/2015",
"11/8/2015", "12/15/2016", "12/16/2014", "12/16/2016", "12/21/2016",
"12/23/2016", "5/20/2014", "6/23/2014", "6/30/2017", "7/18/2017",
"7/22/2017", "7/3/2017", "7/7/2017", "7/9/2017", "9/10/2014",
"9/13/2016", "9/14/2016", "9/16/2016"), class = "factor"), Annular.Dissection..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Aortic.Dissection..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L), .Label = c("0", "E008"), class = "factor"), Atrial.Fibrillation..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L), .Label = c("0", "E006"), class = "factor"), Bleeding.at.Access.Site..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E017"), class = "factor"), Cardiac.Arrest..In.Hospital. = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L), .Label = c("0", "E005"), class = "factor"), Conduction.Native.Pacer.Disturbance.Req.ICD..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L), .Label = c("0", "E040"), class = "factor"), Conduction.Native.Pacer.Disturbance.Req.Pacer..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E039"), class = "factor"), Endocarditis..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), GI.Bleed..In.Hospital. = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Hematoma.at.Access.Site..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Ischemic.Stroke..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E011"), class = "factor"), Major.Vascular.Complications..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E041"), class = "factor"), Minor.Vascular.Complication..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Mitral.Leaflet.Injury...detected.during.surgery..In.Hospital. = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E045"), class = "factor"), Mitral.Subvalvular.Injury..detected.during.surgery..In.Hospital. = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E047"), class = "factor"), New.Requirement.for.Dialysis..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Other.Bleed..In.Hospital. = structure(c(1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L), .Label = c("0", "E022"), class = "factor"), Perforation.with.or.w.o.Tamponade..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L), .Label = c("0", "E009"), class = "factor"), Retroperitoneal.Bleeding..In.Hospital. = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Single.Leaflet.Device.Attachment..In.Hospital. = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E049"), class = "factor"), Unplanned.Other.Cardiac.Surgery.or.Intervention..In.Hospital. = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("0", "E031"), class = "factor"), Unplanned.Vascular.Surgery.or.Intervention..In.Hospital. = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L), .Label = c("0", "E032"), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
There are one or more records for the same patient with different events or same events in different dates. I need to somehow group this data by each id and have data for each row only. Now, some columns doesn't really matter, but the ones that are events and has (in hospital)
are important. When you check the data, you'll see that these events are either 0
or has some kind of code that begins with letter E
. This data can be changed, and instead of the code, it should be replaced by 1. i.e. if there are 1 or more of the same event in multiple rows, the single consolidated rows should only show 1
.
CE_EventDate
s should also appear in the same column separated by comma (,).
I tried doing this with dplyr
method summarise
which worked fine, but couldn't figure out how to group by ID and keep all columns with any
in the column names. I'm not sure how to obtain the desired output. Something like the following.
ID TVTProcedureStartDate DCDate CE_EventOccurred CE_EventDate Annular Dissection (In Hospital) Aortic Dissection (In Hospital) Atrial Fibrillation (In Hospital) Bleeding at Access Site (In Hospital) Cardiac Arrest (In Hospital) Conduction/Native Pacer Disturbance Req ICD (In Hospital) Conduction/Native Pacer Disturbance Req Pacer (In Hospital) Endocarditis (In Hospital) GI Bleed (In Hospital) Hematoma at Access Site (In Hospital) Ischemic Stroke (In Hospital) Major Vascular Complications (In Hospital) Minor Vascular Complication (In Hospital) Mitral Leaflet Injury - detected during surgery (In Hospital) Mitral Subvalvular Injury -detected during surgery (In Hospital) New Requirement for Dialysis (In Hospital) Other Bleed (In Hospital) Perforation with or w/o Tamponade (In Hospital) Retroperitoneal Bleeding (In Hospital) Single Leaflet Device Attachment (In Hospital) Unplanned Other Cardiac Surgery or Intervention (In Hospital) Unplanned Vascular Surgery or Intervention (In Hospital)
ST10274109968.90 12/15/2016 1/1/2017 Yes 12/15/2016,12/16/2016,12/21/2016,12/23/2016 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0
ST10273262134.90 12/16/2014 12/22/2014 Yes 12/16/2014 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
I tried various ways to group
and summarise
but none produced the results I am looking for or generated error due to syntax error. Could use some help with this. I've been playing around with this for a few days now
Upvotes: 1
Views: 67
Reputation: 6441
If i read your instructions correctly, this might be what you want:
#convert everything to character
events %>%
mutate_all(as.character) -> intermediate
intermediate %>%
group_by(ID, TVTProcedureStartDate, DCDate, CE_EventOccurred) %>%
summarise_at(vars(ends_with("Hospital.")), list(. = ~case_when(any(. != "0") ~ "1",
TRUE ~ "0"))) %>%
inner_join(intermediate %>%
group_by(ID) %>%
summarise(CE_EventDate = list(unique(CE_EventDate))),
by = "ID") %>%
select(ID, TVTProcedureStartDate, DCDate, CE_EventOccurred, CE_EventDate, ends_with("._.")) -> result
> result
# A tibble: 10 x 27
ID TVTProcedureSta~ DCDate CE_EventOccurred CE_EventDate Annular.Dissect~ Aortic.Dissecti~
<chr> <chr> <chr> <chr> <list> <chr> <chr>
1 ST10~ 5/20/2014 5/23/~ Yes <chr [1]> 0 0
2 ST10~ 6/17/2014 6/29/~ Yes <chr [1]> 0 0
3 ST10~ 9/9/2014 9/13/~ Yes <chr [1]> 0 0
4 ST10~ 12/16/2014 12/22~ Yes <chr [1]> 0 1
5 ST10~ 11/6/2015 11/15~ Yes <chr [3]> 0 0
6 ST10~ 9/13/2016 9/22/~ Yes <chr [3]> 0 0
7 ST10~ 12/15/2016 1/1/2~ Yes <chr [4]> 0 0
8 ST10~ 6/30/2017 7/5/2~ Yes <chr [2]> 0 0
9 ST10~ 7/6/2017 7/15/~ Yes <chr [2]> 0 0
10 ST10~ 7/18/2017 8/3/2~ Yes <chr [2]> 0 0
# ... with 20 more variables: Atrial.Fibrillation..In.Hospital._. <chr>,
Upvotes: 1
Reputation: 887118
May be we need
library(tidyverse)
events %>%
mutate_at(6:ncol(.), as.character) %>%
group_by(ID) %>%
mutate_if(is.character, list(~ +(any(str_detect(., "^E"))))) %>%
distinct(ID, .keep_all = TRUE)
Upvotes: 1