Conditional increment of an integer based on a condition inside a R dplyr group_by

Question

I have got a dataset, where a campaign workflow sends emails on daily basis to contacts that meet certain criteria.

The workflow sends 3 different email communications A, B, and C separated by a number of days and based on recipients interaction events. A and B email communications have two versions A1,A2 and B1,B2.

Recipients can qualify to enter the campaign workflow again in the future. The data set has no meta data to indicate a start of a new campaign for each recipient_id. Therefore to analyse the data further, I need to develop two new columns for identifying new campaign within a recipient_id group and based on some logic described below:

Approach 1: if step > lead(step) marks an end to a campaign. OR if step < lag(step) marks the start of a new campaign, therefore increment campaign count by 1.

Approach 2: if step > lead(step) & lead(date)-date > 14 days marks end of a campaign. OR if step < lag(step) & date - lag(date) > 14 days marks the start of a new campaign, therefore increment campaign count by 1.

This is the input data set:

structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3", 
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1", 
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2", 
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L, 
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L, 
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L, 
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L, 
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L, 
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L, 
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L, 
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L, 
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3, 
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037, 
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032, 
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764, 
19044, 19047, 18969, 19044, 18781, 18962), class = "Date")), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L), groups = structure(list(
    campaign = c("campaign_x", "campaign_x", "campaign_x", "campaign_x", 
    "campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
    "campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
    "campaign_x", "campaign_x", "campaign_x"), recipient_id = c(54L, 
    197L, 8388L, 8426L, 10903L, 14469L, 17466L, 17807L, 21666L, 
    23935L, 24287L, 25412L, 31361L, 31365L, 40849L, 40860L, 41737L
    ), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10:11, 12:13, 
        14:15, 16L, 17L, 18L, 19L, 20:21, 22:24, 25:26, 27L, 
        28L, 29:30), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))

This is the expected output dataframe:

structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3", 
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1", 
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1", 
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2", 
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L, 
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L, 
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L, 
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L, 
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L, 
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L, 
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L, 
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L, 
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3, 
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037, 
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032, 
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764, 
19044, 19047, 18969, 19044, 18781, 18962), class = "Date"), campaign_num_v1 = c(1, 
2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 
1, 2, 1, 1, 1, 1, 1, 2), campaign_num_v2 = c(1, 1, 1, 1, 2, 1, 
1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 
1, 1, 2)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -30L), groups = structure(list(campaign = c("campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x", 
"campaign_x"), recipient_id = c(54L, 197L, 8388L, 8426L, 10903L, 
14469L, 17466L, 17807L, 21666L, 23935L, 24287L, 25412L, 31361L, 
31365L, 40849L, 40860L, 41737L), .rows = structure(list(1:3, 
    4:5, 6:7, 8:9, 10:11, 12:13, 14:15, 16L, 17L, 18L, 19L, 20:21, 
    22:24, 25:26, 27L, 28L, 29:30), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))

This is an image of how first 10 observations of output dataframe would look like with the new campaign_num_v1 and campaign_num_v2 columns based on Approach 1 and Approach 2 described above added to the original dataframe:

I have tried multiple variations from solutions presented in different questions, but was not able to get it to work properly. Here is one of the simplest versions of coding I attempted, but does not work:

  dat %>% group_by(campaign, com_elm, recipient_id) %>%
  arrange(recipient_id, date) %>%
  ungroup() %>%
  group_by(campaign, recipient_id ) %>%
  mutate(switch = ifelse(is.na(step > lead(step)), FALSE,step > lead(step))) %>%
  mutate(campaign_num_v1 = cumsum(step>lead(step)),
         campaign_num_v2 = cumsum(switch))

A solution using dplyr would be ideal, but I won't mind a different approach if there is no choice. Thanks.

langtang · Accepted Answer

You can use lag() with cumsum(), like this, and take care of the leading NA, using coalesce() thanks to Martin's helpful comment.

df %>% arrange(campaign, recipient_id,date) %>% 
  group_by(campaign,recipient_id) %>% 
  mutate(campaign_num1 = cumsum(coalesce(lag(step)>=step, TRUE)),
         campaign_num2 = cumsum(coalesce(lag(step)>=step & date-lag(date)>14, TRUE))
)

Output: (first ten rows)

# A tibble: 30 x 8
# Groups:   campaign, recipient_id [17]
   campaign   com_elm       com_elm_id recipient_id  step date       campaign_num1 campaign_num2
                                                       
 1 campaign_x campaign_x_C3     808001           54     3 2022-02-06             1             1
 2 campaign_x campaign_x_B1     811001           54     1 2022-02-09             2             1
 3 campaign_x campaign_x_B2     814001           54     2 2022-02-12             2             1
 4 campaign_x campaign_x_C3     509005          197     3 2021-05-31             1             1
 5 campaign_x campaign_x_C3     729060          197     3 2021-11-29             2             2
 6 campaign_x campaign_x_B1     817002         8388     1 2022-02-14             1             1
 7 campaign_x campaign_x_B2     820002         8388     2 2022-02-17             1             1
 8 campaign_x campaign_x_C3     792002         8426     3 2022-01-24             1             1
 9 campaign_x campaign_x_C3     793003         8426     3 2022-01-27             2             1
10 campaign_x campaign_x_B1     820003        10903     1 2022-02-17             1             1

Conditional increment of an integer based on a condition inside a R dplyr group_by

Answers (1)

Related Questions