kam
kam

Reputation: 345

Creating column with value based on condition in other columns

I got this df

structure(list(subj = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 
6L, 6L, 7L, 7L, 7L, 7L), adm_number = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), label_2 = c("dead", 
"dead", "dead", "dead", "dead", "dead", "dead", "dead", "dead", 
"dead", "dead", "alive", "alive", "alive", "alive", "alive", 
"alive", "alive", "alive", "alive", "alive", "alive", "alive", 
"dead", "dead", "dead", "dead", "dead", "dead", "dead", "dead", 
"dead"), dateIN = structure(c(14213, 14270, 14377, 14384, 14407, 
14415, 14442, 14714, 14893, 14988, 15091, 13773, 14343, 15446, 
15454, 15480, 15571, 15601, 15672, 15692, 14104, 15417, 15523, 
13188, 14308, 14431, 14930, 14948, 13498, 13539, 14169, 14278
), class = "Date"), dateOUT = structure(c(14225, 14291, 14379, 
14389, 14411, 14420, 14447, 14719, 14896, 14991, 15092, 13783, 
14351, 15450, 15479, 15485, 15573, 15610, 15689, 15702, 14142, 
15418, 15541, 13204, 14319, 14445, 14936, 14962, 13507, 13551, 
14177, 14285), class = "Date"), dateCENS = structure(c(15092, 
15092, 15092, 15092, 15092, 15092, 15092, 15092, 15092, 15092, 
15092, 15705, 15705, 15705, 15705, 15705, 15705, 15705, 15705, 
15705, 15705, 15705, 15705, 13985, 14962, 14962, 14962, 14962, 
15623, 15623, 15623, 15623), class = "Date")), row.names = c(NA, 
-32L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7ffc190152e0>)

I want to create an extra column(label_3) like this

    structure(list(subj = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 6L, 6L, 
6L, 6L, 7L, 7L, 7L, 7L), adm_number = c(1L, 2L, 3L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), label_2 = c("dead", 
"dead", "dead", "dead", "dead", "dead", "dead", "dead", "dead", 
"dead", "dead", "alive", "alive", "alive", "alive", "alive", 
"alive", "alive", "alive", "alive", "alive", "alive", "alive", 
"dead", "dead", "dead", "dead", "dead", "dead", "dead", "dead", 
"dead"), dateIN = structure(c(14213, 14270, 14377, 14384, 14407, 
14415, 14442, 14714, 14893, 14988, 15091, 13773, 14343, 15446, 
15454, 15480, 15571, 15601, 15672, 15692, 14104, 15417, 15523, 
13188, 14308, 14431, 14930, 14948, 13498, 13539, 14169, 14278
), class = "Date"), dateOUT = structure(c(14225, 14291, 14379, 
14389, 14411, 14420, 14447, 14719, 14896, 14991, 15092, 13783, 
14351, 15450, 15479, 15485, 15573, 15610, 15689, 15702, 14142, 
15418, 15541, 13204, 14319, 14445, 14936, 14962, 13507, 13551, 
14177, 14285), class = "Date"), dateCENS = structure(c(15092, 
15092, 15092, 15092, 15092, 15092, 15092, 15092, 15092, 15092, 
15092, 15705, 15705, 15705, 15705, 15705, 15705, 15705, 15705, 
15705, 15705, 15705, 15705, 13985, 14962, 14962, 14962, 14962, 
15623, 15623, 15623, 15623), class = "Date"), label_3 = c("dead_in", 
"dead_in", "dead_in", "dead_in", "dead_in", "dead_in", "dead_in", 
"dead_in", "dead_in", "dead_in", "dead_in", "alive", "alive", 
"alive", "alive", "alive", "alive", "alive", "alive", "alive", 
"alive", "alive", "alive", "dead_out", "dead_in", "dead_in", 
"dead_in", "dead_in", "dead_out", "dead_out", "dead_out", "dead_out"
)), row.names = c(NA, -32L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x7ffc190152e0>)

Basically, I want the status as dead_in if label_2 == dead and dateCENS is within dateIN and dateOUT, and status as dead_out if dateCENS is outside dateIN and dateOUT and alive otherwise when label_2 == alive.

Upvotes: 1

Views: 53

Answers (1)

r2evans
r2evans

Reputation: 160397

I think your desired status is inconsistent:

     subj adm_number label_2     dateIN    dateOUT   dateCENS  label_3
    <int>      <int>  <char>     <Date>     <Date>     <Date>   <char>
 1:     1          1    dead 2008-11-30 2008-12-12 2011-04-28  dead_in <-- dateCENS > dateOUT
 2:     1          2    dead 2009-01-26 2009-02-16 2011-04-28  dead_in <-- "
 3:     1          3    dead 2009-05-13 2009-05-15 2011-04-28  dead_in <-- "
 4:     1          4    dead 2009-05-20 2009-05-25 2011-04-28  dead_in <-- "
 5:     1          5    dead 2009-06-12 2009-06-16 2011-04-28  dead_in <-- "
 6:     1          6    dead 2009-06-20 2009-06-25 2011-04-28  dead_in <-- "
 7:     1          7    dead 2009-07-17 2009-07-22 2011-04-28  dead_in <-- "
 8:     1          8    dead 2010-04-15 2010-04-20 2011-04-28  dead_in <-- "
 9:     1          9    dead 2010-10-11 2010-10-14 2011-04-28  dead_in <-- "
10:     1         10    dead 2011-01-14 2011-01-17 2011-04-28  dead_in <-- "
11:     1         11    dead 2011-04-27 2011-04-28 2011-04-28  dead_in
12:     2          1   alive 2007-09-17 2007-09-27 2012-12-31    alive 
13:     2          2   alive 2009-04-09 2009-04-17 2012-12-31    alive    
14:     2          3   alive 2012-04-16 2012-04-20 2012-12-31    alive    
15:     2          4   alive 2012-04-24 2012-05-19 2012-12-31    alive    
16:     2          5   alive 2012-05-20 2012-05-25 2012-12-31    alive    
17:     2          6   alive 2012-08-19 2012-08-21 2012-12-31    alive    
18:     3          1   alive 2012-09-18 2012-09-27 2012-12-31    alive    
19:     3          2   alive 2012-11-28 2012-12-15 2012-12-31    alive    
20:     3          3   alive 2012-12-18 2012-12-28 2012-12-31    alive    
21:     4          1   alive 2008-08-13 2008-09-20 2012-12-31    alive    
22:     4          2   alive 2012-03-18 2012-03-19 2012-12-31    alive    
23:     4          3   alive 2012-07-02 2012-07-20 2012-12-31    alive    
24:     5          1    dead 2006-02-09 2006-02-25 2008-04-16 dead_out 
25:     6          1    dead 2009-03-05 2009-03-16 2010-12-19  dead_in <-- dateCENS > dateOUT
26:     6          2    dead 2009-07-06 2009-07-20 2010-12-19  dead_in <-- "
27:     6          3    dead 2010-11-17 2010-11-23 2010-12-19  dead_in <-- "
28:     6          4    dead 2010-12-05 2010-12-19 2010-12-19  dead_in  
29:     7          1    dead 2006-12-16 2006-12-25 2012-10-10 dead_out 
30:     7          2    dead 2007-01-26 2007-02-07 2012-10-10 dead_out 
31:     7          3    dead 2008-10-17 2008-10-25 2012-10-10 dead_out 
32:     7          4    dead 2009-02-03 2009-02-10 2012-10-10 dead_out 
     subj adm_number label_2     dateIN    dateOUT   dateCENS  label_3  

That aside, I think the trick is to use data.table::fcase:

df[, status := fcase(
  label_2 == "dead" & between(dateCENS, dateIN, dateOUT), "dead_in",
  label_2 == "dead", "dead_out",
  rep(TRUE, .N), "alive")
][]
#      subj adm_number label_2     dateIN    dateOUT   dateCENS   status
#     <int>      <int>  <char>     <Date>     <Date>     <Date>   <char>
#  1:     1          1    dead 2008-11-30 2008-12-12 2011-04-28 dead_out
#  2:     1          2    dead 2009-01-26 2009-02-16 2011-04-28 dead_out
#  3:     1          3    dead 2009-05-13 2009-05-15 2011-04-28 dead_out
#  4:     1          4    dead 2009-05-20 2009-05-25 2011-04-28 dead_out
#  5:     1          5    dead 2009-06-12 2009-06-16 2011-04-28 dead_out
#  6:     1          6    dead 2009-06-20 2009-06-25 2011-04-28 dead_out
#  7:     1          7    dead 2009-07-17 2009-07-22 2011-04-28 dead_out
#  8:     1          8    dead 2010-04-15 2010-04-20 2011-04-28 dead_out
#  9:     1          9    dead 2010-10-11 2010-10-14 2011-04-28 dead_out
# 10:     1         10    dead 2011-01-14 2011-01-17 2011-04-28 dead_out
# 11:     1         11    dead 2011-04-27 2011-04-28 2011-04-28  dead_in
# 12:     2          1   alive 2007-09-17 2007-09-27 2012-12-31    alive
# 13:     2          2   alive 2009-04-09 2009-04-17 2012-12-31    alive
# 14:     2          3   alive 2012-04-16 2012-04-20 2012-12-31    alive
# 15:     2          4   alive 2012-04-24 2012-05-19 2012-12-31    alive
# 16:     2          5   alive 2012-05-20 2012-05-25 2012-12-31    alive
# 17:     2          6   alive 2012-08-19 2012-08-21 2012-12-31    alive
# 18:     3          1   alive 2012-09-18 2012-09-27 2012-12-31    alive
# 19:     3          2   alive 2012-11-28 2012-12-15 2012-12-31    alive
# 20:     3          3   alive 2012-12-18 2012-12-28 2012-12-31    alive
# 21:     4          1   alive 2008-08-13 2008-09-20 2012-12-31    alive
# 22:     4          2   alive 2012-03-18 2012-03-19 2012-12-31    alive
# 23:     4          3   alive 2012-07-02 2012-07-20 2012-12-31    alive
# 24:     5          1    dead 2006-02-09 2006-02-25 2008-04-16 dead_out
# 25:     6          1    dead 2009-03-05 2009-03-16 2010-12-19 dead_out
# 26:     6          2    dead 2009-07-06 2009-07-20 2010-12-19 dead_out
# 27:     6          3    dead 2010-11-17 2010-11-23 2010-12-19 dead_out
# 28:     6          4    dead 2010-12-05 2010-12-19 2010-12-19  dead_in
# 29:     7          1    dead 2006-12-16 2006-12-25 2012-10-10 dead_out
# 30:     7          2    dead 2007-01-26 2007-02-07 2012-10-10 dead_out
# 31:     7          3    dead 2008-10-17 2008-10-25 2012-10-10 dead_out
# 32:     7          4    dead 2009-02-03 2009-02-10 2012-10-10 dead_out
#      subj adm_number label_2     dateIN    dateOUT   dateCENS   status

The abbreviated second conditional (just label_2 == "dead") is possible because if dateCENS is between in/out, then the first conditional would be used; when we get to the second, it's clear that the previous condition was false. The same effect logically (at slightly-reduced calculation) is had with:

df[, status := fcase(
  label_2 == "dead" &  between(dateCENS, dateIN, dateOUT), "dead_in",
  label_2 == "dead" & !between(dateCENS, dateIN, dateOUT), "dead_out",
  rep(TRUE, .N), "alive")
]

Edit: I think you want some any or all logic.

df[, status := fcase(
#   label_2 == "dead" & any(between(dateCENS, dateIN, dateOUT)), "dead_in",
#   label_2 == "dead" & all(!between(dateCENS, dateIN, dateOUT)), "dead_out",
#   rep(TRUE, .N), "alive")
# , by = .(subj)]
+ + +      subj adm_number label_2     dateIN    dateOUT   dateCENS  label_3   status
#     <int>      <int>  <char>     <Date>     <Date>     <Date>   <char>   <char>
#  1:     1          1    dead 2008-11-30 2008-12-12 2011-04-28  dead_in  dead_in
#  2:     1          2    dead 2009-01-26 2009-02-16 2011-04-28  dead_in  dead_in
#  3:     1          3    dead 2009-05-13 2009-05-15 2011-04-28  dead_in  dead_in
#  4:     1          4    dead 2009-05-20 2009-05-25 2011-04-28  dead_in  dead_in
#  5:     1          5    dead 2009-06-12 2009-06-16 2011-04-28  dead_in  dead_in
#  6:     1          6    dead 2009-06-20 2009-06-25 2011-04-28  dead_in  dead_in
#  7:     1          7    dead 2009-07-17 2009-07-22 2011-04-28  dead_in  dead_in
#  8:     1          8    dead 2010-04-15 2010-04-20 2011-04-28  dead_in  dead_in
#  9:     1          9    dead 2010-10-11 2010-10-14 2011-04-28  dead_in  dead_in
# 10:     1         10    dead 2011-01-14 2011-01-17 2011-04-28  dead_in  dead_in
# 11:     1         11    dead 2011-04-27 2011-04-28 2011-04-28  dead_in  dead_in
# 12:     2          1   alive 2007-09-17 2007-09-27 2012-12-31    alive    alive
# 13:     2          2   alive 2009-04-09 2009-04-17 2012-12-31    alive    alive
# 14:     2          3   alive 2012-04-16 2012-04-20 2012-12-31    alive    alive
# 15:     2          4   alive 2012-04-24 2012-05-19 2012-12-31    alive    alive
# 16:     2          5   alive 2012-05-20 2012-05-25 2012-12-31    alive    alive
# 17:     2          6   alive 2012-08-19 2012-08-21 2012-12-31    alive    alive
# 18:     3          1   alive 2012-09-18 2012-09-27 2012-12-31    alive    alive
# 19:     3          2   alive 2012-11-28 2012-12-15 2012-12-31    alive    alive
# 20:     3          3   alive 2012-12-18 2012-12-28 2012-12-31    alive    alive
# 21:     4          1   alive 2008-08-13 2008-09-20 2012-12-31    alive    alive
# 22:     4          2   alive 2012-03-18 2012-03-19 2012-12-31    alive    alive
# 23:     4          3   alive 2012-07-02 2012-07-20 2012-12-31    alive    alive
# 24:     5          1    dead 2006-02-09 2006-02-25 2008-04-16 dead_out dead_out
# 25:     6          1    dead 2009-03-05 2009-03-16 2010-12-19  dead_in  dead_in
# 26:     6          2    dead 2009-07-06 2009-07-20 2010-12-19  dead_in  dead_in
# 27:     6          3    dead 2010-11-17 2010-11-23 2010-12-19  dead_in  dead_in
# 28:     6          4    dead 2010-12-05 2010-12-19 2010-12-19  dead_in  dead_in
# 29:     7          1    dead 2006-12-16 2006-12-25 2012-10-10 dead_out dead_out
# 30:     7          2    dead 2007-01-26 2007-02-07 2012-10-10 dead_out dead_out
# 31:     7          3    dead 2008-10-17 2008-10-25 2012-10-10 dead_out dead_out
# 32:     7          4    dead 2009-02-03 2009-02-10 2012-10-10 dead_out dead_out
#      subj adm_number label_2     dateIN    dateOUT   dateCENS  label_3   status

(label_3 is yours, status is mine.)

Upvotes: 1

Related Questions