Reputation: 341
I have got a dataset, where a campaign workflow sends emails on daily basis to contacts that meet certain criteria.
The workflow sends 3 different email communications A, B, and C separated by a number of days and based on recipients interaction events. A and B email communications have two versions A1,A2 and B1,B2.
Recipients can qualify to enter the campaign workflow again in the future. The data set has no meta data to indicate a start of a new campaign for each recipient_id
. Therefore to analyse the data further, I need to develop two new columns for identifying new campaign within a recipient_id
group and based on some logic described below:
Approach 1: if step > lead(step)
marks an end to a campaign. OR if step < lag(step)
marks the start of a new campaign, therefore increment campaign count by 1.
Approach 2: if step > lead(step) & lead(date)-date > 14
days marks end of a campaign. OR if step < lag(step) & date - lag(date) > 14
days marks the start of a new campaign, therefore increment campaign count by 1.
This is the input data set:
structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3",
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1",
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2",
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L,
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L,
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L,
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L,
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L,
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L,
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L,
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L,
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3,
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037,
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032,
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764,
19044, 19047, 18969, 19044, 18781, 18962), class = "Date")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L), groups = structure(list(
campaign = c("campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x"), recipient_id = c(54L,
197L, 8388L, 8426L, 10903L, 14469L, 17466L, 17807L, 21666L,
23935L, 24287L, 25412L, 31361L, 31365L, 40849L, 40860L, 41737L
), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10:11, 12:13,
14:15, 16L, 17L, 18L, 19L, 20:21, 22:24, 25:26, 27L,
28L, 29:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))
This is the expected output dataframe:
structure(list(campaign = c("campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x"), com_elm = c("campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B2", "campaign_x_C3", "campaign_x_B1", "campaign_x_C3",
"campaign_x_B1", "campaign_x_A1", "campaign_x_C3", "campaign_x_B1",
"campaign_x_B1", "campaign_x_C3", "campaign_x_B1", "campaign_x_A1",
"campaign_x_C3", "campaign_x_C3", "campaign_x_B1", "campaign_x_B2",
"campaign_x_C3", "campaign_x_B1", "campaign_x_C3", "campaign_x_C3"
), com_elm_id = c(808001L, 811001L, 814001L, 509005L, 729060L,
817002L, 820002L, 792002L, 793003L, 820003L, 824003L, 792002L,
811001L, 787001L, 811001L, 468023L, 792002L, 812001L, 812001L,
808001L, 811001L, 468023L, 468006L, 491014L, 825002L, 828002L,
741001L, 825002L, 512001L, 733001L), recipient_id = c(54L, 54L,
54L, 197L, 197L, 8388L, 8388L, 8426L, 8426L, 10903L, 10903L,
14469L, 14469L, 17466L, 17466L, 17807L, 21666L, 23935L, 24287L,
25412L, 25412L, 31361L, 31361L, 31361L, 31365L, 31365L, 40849L,
40860L, 41737L, 41737L), step = c(3, 1, 2, 3, 3, 1, 2, 3, 3,
1, 2, 3, 1, 3, 1, 1, 3, 1, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 3
), date = structure(c(19029, 19032, 19035, 18778, 18960, 19037,
19040, 19016, 19019, 19040, 19043, 19015, 19032, 19011, 19032,
18746, 19015, 19033, 19033, 19029, 19032, 18746, 18746, 18764,
19044, 19047, 18969, 19044, 18781, 18962), class = "Date"), campaign_num_v1 = c(1,
2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1,
1, 2, 1, 1, 1, 1, 1, 2), campaign_num_v2 = c(1, 1, 1, 1, 2, 1,
1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
1, 1, 2)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -30L), groups = structure(list(campaign = c("campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x", "campaign_x", "campaign_x", "campaign_x", "campaign_x",
"campaign_x"), recipient_id = c(54L, 197L, 8388L, 8426L, 10903L,
14469L, 17466L, 17807L, 21666L, 23935L, 24287L, 25412L, 31361L,
31365L, 40849L, 40860L, 41737L), .rows = structure(list(1:3,
4:5, 6:7, 8:9, 10:11, 12:13, 14:15, 16L, 17L, 18L, 19L, 20:21,
22:24, 25:26, 27L, 28L, 29:30), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -17L), .drop = TRUE))
This is an image of how first 10 observations of output dataframe would look like with the new campaign_num_v1
and campaign_num_v2
columns based on Approach 1 and Approach 2 described above added to the original dataframe:
I have tried multiple variations from solutions presented in different questions, but was not able to get it to work properly. Here is one of the simplest versions of coding I attempted, but does not work:
dat %>% group_by(campaign, com_elm, recipient_id) %>%
arrange(recipient_id, date) %>%
ungroup() %>%
group_by(campaign, recipient_id ) %>%
mutate(switch = ifelse(is.na(step > lead(step)), FALSE,step > lead(step))) %>%
mutate(campaign_num_v1 = cumsum(step>lead(step)),
campaign_num_v2 = cumsum(switch))
A solution using dplyr would be ideal, but I won't mind a different approach if there is no choice. Thanks.
Upvotes: 0
Views: 48
Reputation: 24722
You can use lag()
with cumsum()
, like this, and take care of the leading NA, using coalesce()
thanks to Martin's helpful comment.
df %>% arrange(campaign, recipient_id,date) %>%
group_by(campaign,recipient_id) %>%
mutate(campaign_num1 = cumsum(coalesce(lag(step)>=step, TRUE)),
campaign_num2 = cumsum(coalesce(lag(step)>=step & date-lag(date)>14, TRUE))
)
Output: (first ten rows)
# A tibble: 30 x 8
# Groups: campaign, recipient_id [17]
campaign com_elm com_elm_id recipient_id step date campaign_num1 campaign_num2
<chr> <chr> <int> <int> <dbl> <date> <int> <int>
1 campaign_x campaign_x_C3 808001 54 3 2022-02-06 1 1
2 campaign_x campaign_x_B1 811001 54 1 2022-02-09 2 1
3 campaign_x campaign_x_B2 814001 54 2 2022-02-12 2 1
4 campaign_x campaign_x_C3 509005 197 3 2021-05-31 1 1
5 campaign_x campaign_x_C3 729060 197 3 2021-11-29 2 2
6 campaign_x campaign_x_B1 817002 8388 1 2022-02-14 1 1
7 campaign_x campaign_x_B2 820002 8388 2 2022-02-17 1 1
8 campaign_x campaign_x_C3 792002 8426 3 2022-01-24 1 1
9 campaign_x campaign_x_C3 793003 8426 3 2022-01-27 2 1
10 campaign_x campaign_x_B1 820003 10903 1 2022-02-17 1 1
Upvotes: 2