Kuni
Kuni

Reputation: 865

How to group by ID while keeping all columns and dichotomizing their presence using dplyr?

I have the following data:

events <- structure(list(ID = structure(c(7L, 7L, 7L, 7L, 5L, 5L, 5L, 6L, 
6L, 6L, 8L, 8L, 9L, 9L, 10L, 10L, 1L, 2L, 3L, 4L), .Label = c("ST10273049951.90", 
"ST10273085397.90", "ST10273204081.90", "ST10273262134.90", "ST10273679485.90", 
"ST10274019056.90", "ST10274109968.90", "ST10274426483.90", "ST10274430159.90", 
"ST10274433406.90"), class = "factor"), TVTProcedureStartDate = structure(c(2L, 
2L, 2L, 2L, 1L, 1L, 1L, 9L, 9L, 9L, 6L, 6L, 8L, 8L, 7L, 7L, 4L, 
5L, 10L, 3L), .Label = c("11/6/2015", "12/15/2016", "12/16/2014", 
"5/20/2014", "6/17/2014", "6/30/2017", "7/18/2017", "7/6/2017", 
"9/13/2016", "9/9/2014"), class = "factor"), DCDate = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 10L, 10L, 10L, 7L, 7L, 6L, 6L, 8L, 8L, 
4L, 5L, 9L, 3L), .Label = c("1/1/2017", "11/15/2015", "12/22/2014", 
"5/23/2014", "6/29/2014", "7/15/2017", "7/5/2017", "8/3/2017", 
"9/13/2014", "9/22/2016"), class = "factor"), CE_EventOccurred = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "Yes", class = "factor"), CE_EventDate = structure(c(4L, 
6L, 7L, 8L, 2L, 3L, 1L, 18L, 19L, 20L, 11L, 14L, 15L, 16L, 12L, 
13L, 9L, 10L, 17L, 5L), .Label = c("11/10/2015", "11/6/2015", 
"11/8/2015", "12/15/2016", "12/16/2014", "12/16/2016", "12/21/2016", 
"12/23/2016", "5/20/2014", "6/23/2014", "6/30/2017", "7/18/2017", 
"7/22/2017", "7/3/2017", "7/7/2017", "7/9/2017", "9/10/2014", 
"9/13/2016", "9/14/2016", "9/16/2016"), class = "factor"), Annular.Dissection..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Aortic.Dissection..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L), .Label = c("0", "E008"), class = "factor"), Atrial.Fibrillation..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 1L, 1L), .Label = c("0", "E006"), class = "factor"), Bleeding.at.Access.Site..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E017"), class = "factor"), Cardiac.Arrest..In.Hospital. = structure(c(1L, 
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L), .Label = c("0", "E005"), class = "factor"), Conduction.Native.Pacer.Disturbance.Req.ICD..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L), .Label = c("0", "E040"), class = "factor"), Conduction.Native.Pacer.Disturbance.Req.Pacer..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E039"), class = "factor"), Endocarditis..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), GI.Bleed..In.Hospital. = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Hematoma.at.Access.Site..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Ischemic.Stroke..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E011"), class = "factor"), Major.Vascular.Complications..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E041"), class = "factor"), Minor.Vascular.Complication..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Mitral.Leaflet.Injury...detected.during.surgery..In.Hospital. = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E045"), class = "factor"), Mitral.Subvalvular.Injury..detected.during.surgery..In.Hospital. = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E047"), class = "factor"), New.Requirement.for.Dialysis..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Other.Bleed..In.Hospital. = structure(c(1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 
1L, 1L), .Label = c("0", "E022"), class = "factor"), Perforation.with.or.w.o.Tamponade..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L), .Label = c("0", "E009"), class = "factor"), Retroperitoneal.Bleeding..In.Hospital. = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Single.Leaflet.Device.Attachment..In.Hospital. = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E049"), class = "factor"), Unplanned.Other.Cardiac.Surgery.or.Intervention..In.Hospital. = structure(c(1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("0", "E031"), class = "factor"), Unplanned.Vascular.Surgery.or.Intervention..In.Hospital. = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L), .Label = c("0", "E032"), class = "factor")), class = "data.frame", row.names = c(NA, 
-20L))

There are one or more records for the same patient with different events or same events in different dates. I need to somehow group this data by each id and have data for each row only. Now, some columns doesn't really matter, but the ones that are events and has (in hospital) are important. When you check the data, you'll see that these events are either 0 or has some kind of code that begins with letter E. This data can be changed, and instead of the code, it should be replaced by 1. i.e. if there are 1 or more of the same event in multiple rows, the single consolidated rows should only show 1.

CE_EventDates should also appear in the same column separated by comma (,).

I tried doing this with dplyr method summarise which worked fine, but couldn't figure out how to group by ID and keep all columns with any in the column names. I'm not sure how to obtain the desired output. Something like the following.

ID  TVTProcedureStartDate   DCDate  CE_EventOccurred    CE_EventDate    Annular Dissection (In Hospital)    Aortic Dissection (In Hospital) Atrial Fibrillation (In Hospital)   Bleeding at Access Site (In Hospital)   Cardiac Arrest (In Hospital)    Conduction/Native Pacer Disturbance Req ICD (In Hospital)   Conduction/Native Pacer Disturbance Req Pacer (In Hospital) Endocarditis (In Hospital)  GI Bleed (In Hospital)  Hematoma at Access Site (In Hospital)   Ischemic Stroke (In Hospital)   Major Vascular Complications (In Hospital)  Minor Vascular Complication (In Hospital)   Mitral Leaflet Injury - detected during surgery (In Hospital)   Mitral Subvalvular Injury -detected during surgery (In Hospital)    New Requirement for Dialysis (In Hospital)  Other Bleed (In Hospital)   Perforation with or w/o Tamponade (In Hospital) Retroperitoneal Bleeding (In Hospital)  Single Leaflet Device Attachment (In Hospital)  Unplanned Other Cardiac Surgery or Intervention (In Hospital)   Unplanned Vascular Surgery or Intervention (In Hospital)
ST10274109968.90    12/15/2016  1/1/2017    Yes 12/15/2016,12/16/2016,12/21/2016,12/23/2016 0   0   0   0   1   0   0   0   0   0   0   0   0   1   1   0   1   0   0   1   1   0
ST10273262134.90    12/16/2014  12/22/2014  Yes 12/16/2014  0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1

I tried various ways to group and summarise but none produced the results I am looking for or generated error due to syntax error. Could use some help with this. I've been playing around with this for a few days now

Upvotes: 1

Views: 67

Answers (2)

Humpelstielzchen
Humpelstielzchen

Reputation: 6441

If i read your instructions correctly, this might be what you want:

#convert everything to character
events %>%
  mutate_all(as.character) -> intermediate


  intermediate %>%
  group_by(ID, TVTProcedureStartDate, DCDate, CE_EventOccurred) %>%
  summarise_at(vars(ends_with("Hospital.")), list(. = ~case_when(any(. != "0") ~ "1",
                                                               TRUE ~ "0"))) %>%
  inner_join(intermediate %>%
               group_by(ID) %>%
               summarise(CE_EventDate = list(unique(CE_EventDate))),
             by = "ID") %>%
  select(ID, TVTProcedureStartDate, DCDate, CE_EventOccurred, CE_EventDate, ends_with("._.")) -> result

> result
# A tibble: 10 x 27
   ID    TVTProcedureSta~ DCDate CE_EventOccurred CE_EventDate Annular.Dissect~ Aortic.Dissecti~
   <chr> <chr>            <chr>  <chr>            <list>       <chr>            <chr>           
 1 ST10~ 5/20/2014        5/23/~ Yes              <chr [1]>    0                0               
 2 ST10~ 6/17/2014        6/29/~ Yes              <chr [1]>    0                0               
 3 ST10~ 9/9/2014         9/13/~ Yes              <chr [1]>    0                0               
 4 ST10~ 12/16/2014       12/22~ Yes              <chr [1]>    0                1               
 5 ST10~ 11/6/2015        11/15~ Yes              <chr [3]>    0                0               
 6 ST10~ 9/13/2016        9/22/~ Yes              <chr [3]>    0                0               
 7 ST10~ 12/15/2016       1/1/2~ Yes              <chr [4]>    0                0               
 8 ST10~ 6/30/2017        7/5/2~ Yes              <chr [2]>    0                0               
 9 ST10~ 7/6/2017         7/15/~ Yes              <chr [2]>    0                0               
10 ST10~ 7/18/2017        8/3/2~ Yes              <chr [2]>    0                0               
# ... with 20 more variables: Atrial.Fibrillation..In.Hospital._. <chr>,

Upvotes: 1

akrun
akrun

Reputation: 887118

May be we need

library(tidyverse)
events %>%
     mutate_at(6:ncol(.), as.character) %>% 
     group_by(ID) %>% 
     mutate_if(is.character, list(~ +(any(str_detect(., "^E"))))) %>% 
     distinct(ID, .keep_all = TRUE)

Upvotes: 1

Related Questions