Reputation: 41
I have a dataset containing purchases made by different households across different retailers. For eg
Example Dataset
using dput()
structure(list(Household = c(76, 76, 76, 76, 76, 76, 76,
76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 126,
126, 126, 126, 126, 126, 126, 126, 126), Trip ID = c(1032497498L,
1025776063L, 1029419047L, 1030418100L, 1029502602L, 1034153056L,
1027035051L, 1027533991L, 1033515804L, 1032998207L, 1032066227L,
1028192785L, 1033419039L, 1028730296L, 1027388499L, 1030652869L,
1025638394L, 1034032718L, 1034032718L, 1025678520L, 1029490031L,
1029898838L, 1028024134L, 1030324171L, 1031983761L, 1031983761L,
1033767148L, 1023953965L, 1030954113L, 1030954113L, 1027392968L
), purchase_date = structure(c(1L, 2L, 23L, 50L, 52L, 74L, 77L,
94L, 148L, 158L, 176L, 179L, 196L, 211L, 224L, 246L, 271L, 286L,
286L, 309L, 329L, 346L, 2L, 9L, 46L, 46L, 50L, 58L, 66L, 66L,
68L), .Label = c("2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04",
"2012-01-05", "2012-01-06", "2012-01-07", "2012-01-08", "2012-01-09",
"2012-01-10", "2012-01-11", "2012-01-12", "2012-01-13", "2012-01-14",
"2012-01-15", "2012-01-16", "2012-01-17", "2012-01-18", "2012-01-19",
"2012-01-20", "2012-01-21", "2012-01-22", "2012-01-23", "2012-01-24",
"2012-01-25", "2012-01-26", "2012-01-27", "2012-01-28", "2012-01-29",
"2012-01-30", "2012-01-31", "2012-02-01", "2012-02-02", "2012-02-03",
"2012-02-04", "2012-02-05", "2012-02-06", "2012-02-07", "2012-02-08",
"2012-02-09", "2012-02-10", "2012-02-11", "2012-02-12", "2012-02-13",
"2012-02-14", "2012-02-15", "2012-02-16", "2012-02-17", "2012-02-18",
"2012-02-19", "2012-02-20", "2012-02-21", "2012-02-22", "2012-02-23",
"2012-02-24", "2012-02-25", "2012-02-26", "2012-02-27", "2012-02-28",
"2012-02-29", "2012-03-01", "2012-03-02", "2012-03-03", "2012-03-04",
"2012-03-05", "2012-03-06", "2012-03-07", "2012-03-08", "2012-03-09",
"2012-03-10", "2012-03-11", "2012-03-12", "2012-03-13", "2012-03-14",
"2012-03-15", "2012-03-16", "2012-03-17", "2012-03-18", "2012-03-19",
"2012-03-20", "2012-03-21", "2012-03-22", "2012-03-23", "2012-03-24",
"2012-03-25", "2012-03-26", "2012-03-27", "2012-03-28", "2012-03-29",
"2012-03-30", "2012-03-31", "2012-04-01", "2012-04-02", "2012-04-03",
"2012-04-04", "2012-04-05", "2012-04-06", "2012-04-07", "2012-04-08",
"2012-04-09", "2012-04-10", "2012-04-11", "2012-04-12", "2012-04-13",
"2012-04-14", "2012-04-15", "2012-04-16", "2012-04-17", "2012-04-18",
"2012-04-19", "2012-04-20", "2012-04-21", "2012-04-22", "2012-04-23",
"2012-04-24", "2012-04-25", "2012-04-26", "2012-04-27", "2012-04-28",
"2012-04-29", "2012-04-30", "2012-05-01", "2012-05-02", "2012-05-03",
"2012-05-04", "2012-05-05", "2012-05-06", "2012-05-07", "2012-05-08",
"2012-05-09", "2012-05-10", "2012-05-11", "2012-05-12", "2012-05-13",
"2012-05-14", "2012-05-15", "2012-05-16", "2012-05-17", "2012-05-18",
"2012-05-19", "2012-05-20", "2012-05-21", "2012-05-22", "2012-05-23",
"2012-05-24", "2012-05-25", "2012-05-26", "2012-05-27", "2012-05-28",
"2012-05-29", "2012-05-30", "2012-05-31", "2012-06-01", "2012-06-02",
"2012-06-03", "2012-06-04", "2012-06-05", "2012-06-06", "2012-06-07",
"2012-06-08", "2012-06-09", "2012-06-10", "2012-06-11", "2012-06-12",
"2012-06-13", "2012-06-14", "2012-06-15", "2012-06-16", "2012-06-17",
"2012-06-18", "2012-06-19", "2012-06-20", "2012-06-21", "2012-06-22",
"2012-06-23", "2012-06-24", "2012-06-25", "2012-06-26", "2012-06-27",
"2012-06-28", "2012-06-29", "2012-06-30", "2012-07-01", "2012-07-02",
"2012-07-03", "2012-07-04", "2012-07-05", "2012-07-06", "2012-07-07",
"2012-07-08", "2012-07-09", "2012-07-10", "2012-07-11", "2012-07-12",
"2012-07-13", "2012-07-14", "2012-07-15", "2012-07-16", "2012-07-17",
"2012-07-18", "2012-07-19", "2012-07-20", "2012-07-21", "2012-07-22",
"2012-07-23", "2012-07-24", "2012-07-25", "2012-07-26", "2012-07-27",
"2012-07-28", "2012-07-29", "2012-07-30", "2012-07-31", "2012-08-01",
"2012-08-02", "2012-08-03", "2012-08-04", "2012-08-05", "2012-08-06",
"2012-08-07", "2012-08-08", "2012-08-09", "2012-08-10", "2012-08-11",
"2012-08-12", "2012-08-13", "2012-08-14", "2012-08-15", "2012-08-16",
"2012-08-17", "2012-08-18", "2012-08-19", "2012-08-20", "2012-08-21",
"2012-08-22", "2012-08-23", "2012-08-24", "2012-08-25", "2012-08-26",
"2012-08-27", "2012-08-28", "2012-08-29", "2012-08-30", "2012-08-31",
"2012-09-01", "2012-09-02", "2012-09-03", "2012-09-04", "2012-09-05",
"2012-09-06", "2012-09-07", "2012-09-08", "2012-09-09", "2012-09-10",
"2012-09-11", "2012-09-12", "2012-09-13", "2012-09-14", "2012-09-15",
"2012-09-16", "2012-09-17", "2012-09-18", "2012-09-19", "2012-09-20",
"2012-09-21", "2012-09-22", "2012-09-23", "2012-09-24", "2012-09-25",
"2012-09-26", "2012-09-27", "2012-09-28", "2012-09-29", "2012-09-30",
"2012-10-01", "2012-10-02", "2012-10-03", "2012-10-04", "2012-10-05",
"2012-10-06", "2012-10-07", "2012-10-08", "2012-10-09", "2012-10-10",
"2012-10-11", "2012-10-12", "2012-10-13", "2012-10-14", "2012-10-15",
"2012-10-16", "2012-10-17", "2012-10-18", "2012-10-19", "2012-10-20",
"2012-10-21", "2012-10-22", "2012-10-23", "2012-10-24", "2012-10-25",
"2012-10-26", "2012-10-27", "2012-10-28", "2012-10-29", "2012-10-30",
"2012-10-31", "2012-11-01", "2012-11-02", "2012-11-03", "2012-11-04",
"2012-11-05", "2012-11-06", "2012-11-07", "2012-11-08", "2012-11-09",
"2012-11-10", "2012-11-11", "2012-11-12", "2012-11-13", "2012-11-14",
"2012-11-15", "2012-11-16", "2012-11-17", "2012-11-18", "2012-11-19",
"2012-11-20", "2012-11-21", "2012-11-22", "2012-11-23", "2012-11-24",
"2012-11-25", "2012-11-26", "2012-11-27", "2012-11-28", "2012-11-29",
"2012-11-30", "2012-12-01", "2012-12-02", "2012-12-03", "2012-12-04",
"2012-12-05", "2012-12-06", "2012-12-07", "2012-12-08", "2012-12-09",
"2012-12-10", "2012-12-11", "2012-12-12", "2012-12-13", "2012-12-14",
"2012-12-15", "2012-12-16", "2012-12-17", "2012-12-18", "2012-12-19",
"2012-12-20", "2012-12-21", "2012-12-22", "2012-12-23", "2012-12-24",
"2012-12-25", "2012-12-26", "2012-12-27", "2012-12-28", "2012-12-29"
), class = "factor"), Retailer = c(11024, 11024, 11024,
11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024, 11024,
11024, 11024, 11024, 11024, 2353, 11024, 11024, 2353, 11024,
11024, 63882, 650, 89960, 89960, 650, 89960, 89960, 89960, 650
), Brand = structure(c(19L, 74L, 19L, 48L, 19L, 48L,
19L, 19L, 19L, 48L, 48L, 31L, 46L, 31L, 31L, 48L, 74L, 31L, 74L,
19L, 31L, 19L, 48L, 48L, 31L, 31L, 48L, 31L, 31L, 48L, 48L), .Label = c("ABUNDANCE",
"ALPEN", "AMERICAN BREAKFAST ", "ANNIE'S HOMEGROWN", "ARWHD MLS",
"BARBARA'S", "BEAR NAKED", "BEAR RIVER", "BOB'S RED MILL", "BOKOMO COUNTRY",
"BREAKFAST CHOICE", "BREAKFAST ZONE", "BROOKFARM MACADAMIA ",
"BRUGGEN", "BUCKEYE HEROES", "CADIA", "CASCADIAN FARM ", "CHOCOLATE SPOONERS",
"CTL BR", "DORSET", "ENJOY LIFE PERKY'S CRUNCHY FLX", "EREWHON",
"F-FACTOR", "FAMILIA", "FIELD DAY", "FINAX", "FLEURY FLAKES MARC ANDRE",
"FOOD FOR LIFE EZEKIEL 50", "FORRELLI ", "GEFEN KING", "GENERAL MILLS",
"GERONIMO PEYTON HILLIS REDZONE", "GLUCERNA", "GLUTINO", "GOLDEN FOODS",
"GRANVITA PUFFY'S", "GREENBRIER INT INC-NBL CRN FLK", "HEALTH VALLEY",
"HODGSON MILL", "HOME FAVORITE", "HOSTESS TOASTED OATS", "HSP",
"ISABEL'S WAY ", "JASPER", "JUSTIN VERLANDER'S FASTBLL FLK",
"KASHI", "KAY'S NATURALS BETTER BALANCE", "Kellogg", "KIND",
"KOZY SHACK READY GRAINS", "KRETSCHMER", "LADY LIBERTY", "LIEBER'S",
"LIVING INTENTIONS SPRFD CRL", "LOVE GROWN FOODS", "MAIZORO",
"MANISCHEWITZ", "MILL SELECT ", "MOTHER'S", "MULTIGRAIN SPNRS",
"NASH BROTHERS", "NATURE'S PATH", "NESTLE", "NEW ENGLAND NATURALS",
"NEWMAN'S OWN SWEET ENGH WHT PF", "NUTRISYSTEM NOURISH", "NUTRITIOUS LIVING",
"PAMPA ", "PASKESZ CHOCO RIOS", "PBLC LB BR-NBL HY NT TSD OT DM",
"PEACE", "POST", "PROTEIN PLUS CORN FLAKE", "QUAKER", "RALSTON",
"RLSTN", "SALLY'S ", "SCRUMPTIOUS SPELNDID CRLS CBB", "SEITENBACHER MUESLI",
"SIMPLY FIBER", "SKINNER'S RAISIN BRAN", "SORIANA WHOLE BRAN",
"STREIT'S FRUIT & NUT MUESLI", "SUNBELT", "SWAD", "T. ABRAHAM'S",
"TAANUG CORN FLAKES", "TASTY", "TEMMY'S", "THE", "THREE SISTERS",
"TIKISS SWEETENED PUFFD WHL WHT", "TRU ROOTS", "VOTTO'S", "WEETABIX",
"WHOLESOME GOODNESS", "WILD ROOTS ANCIENT ORIGINS", "WONDER CORN FLAKES",
"YOG ACTIVE"), class = "factor")), row.names = c(90609L, 222436L,
90606L, 688592L, 90607L, 688593L, 90605L, 90604L, 90608L, 668330L,
321377L, 567447L, 945385L, 567445L, 567443L, 892854L, 583186L,
567446L, 583185L, 168138L, 567444L, 60086L, 698120L, 698127L,
3933L, 809409L, 698129L, 15286L, 15284L, 698116L, 319455L), class = "data.frame")
I have to find whether a particular household has bought any brand from anywhere and then bought anything from a particular retailer on the next trip.
So, if, first I chose a retailer, 11024. Then for each trip for eg row 17, I would check whether the next trip is made to retailer 11024. If the next trip is made to the retailer 11024, I would mark row 17 as 1. Then for rows 18 & 19 - they are purchases on the same trip, so I would check row 20, which is the next trip. And, since that trip is not to 11024, I would mark the previous trip (both rows 18 & 19) as 0.
So, the resulting output, if I want to check for retailer "11024", would be
Example Output :
In all those cases when the next trip is made to the retailer 11024, the purchase instances of the previous trip is put to 1. And, whenever the next trip is made to some other retailer, it is put to 0. Trips are sorted by purchase date. Now, it is easy when I have only 1 purchase per trip. I do it by the following code using lead()
e1$next_ret = e1$retailer
e1$next_ret = ifelse(e1$next_ret == 6901,1,0)
temp = e1 %>%
group_by(Household) %>%
mutate(dummy = lead(next_ret, default = 0))
However, when there is more than 1 purchase in a single trip, it doesn't work. Please Help
Upvotes: 0
Views: 117
Reputation: 26218
Do you require this..
#df used
> df
household_code trip_code_uc purchase_date retailer_code Overall_Brand
90609 76 1032497498 2012-01-01 11024 CTL BR
222436 76 1025776063 2012-01-02 11024 QUAKER
90606 76 1029419047 2012-01-23 11024 CTL BR
688592 76 1030418100 2012-02-19 11024 Kellogg
90607 76 1029502602 2012-02-21 11024 CTL BR
688593 76 1034153056 2012-03-14 11024 Kellogg
90605 76 1027035051 2012-03-17 11024 CTL BR
90604 76 1027533991 2012-04-03 11024 CTL BR
90608 76 1033515804 2012-05-27 11024 CTL BR
668330 76 1032998207 2012-06-06 11024 Kellogg
321377 76 1032066227 2012-06-24 11024 Kellogg
567447 76 1028192785 2012-06-27 11024 GENERAL MILLS
945385 76 1033419039 2012-07-14 11024 KASHI
567445 76 1028730296 2012-07-29 11024 GENERAL MILLS
567443 76 1027388499 2012-08-11 11024 GENERAL MILLS
892854 76 1030652869 2012-09-02 11024 Kellogg
583186 76 1025638394 2012-09-27 2353 QUAKER
567446 76 1034032718 2012-10-12 11024 GENERAL MILLS
583185 76 1034032718 2012-10-12 11024 QUAKER
168138 76 1025678520 2012-11-04 2353 CTL BR
567444 76 1029490031 2012-11-24 11024 GENERAL MILLS
60086 76 1029898838 2012-12-11 11024 CTL BR
698120 126 1028024134 2012-01-02 63882 Kellogg
698127 126 1030324171 2012-01-09 650 Kellogg
3933 126 1031983761 2012-02-15 89960 GENERAL MILLS
809409 126 1031983761 2012-02-15 89960 GENERAL MILLS
698129 126 1033767148 2012-02-19 650 Kellogg
15286 126 1023953965 2012-02-27 89960 GENERAL MILLS
15284 126 1030954113 2012-03-06 89960 GENERAL MILLS
698116 126 1030954113 2012-03-06 89960 Kellogg
319455 126 1027392968 2012-03-08 650 Kellogg
Solution proposed assuming that you want- if a customer has purchased a item next time from same retailer then 1 else 0
library(dplyr)
library(tidyr)
df %>% mutate(rowid = row_number()) %>%
group_by(household_code, Overall_Brand) %>%
mutate(cohort_id = dense_rank(paste(purchase_date, trip_code_uc))) %>%
ungroup(retailer_code) %>% arrange(household_code, Overall_Brand, cohort_id) %>%
mutate(repeat_pur = ifelse(lead(retailer_code)==retailer_code, 1, 0),
repeat_pur = replace_na(repeat_pur, 0)) %>%
ungroup() %>% arrange(rowid) %>% select(-cohort_id, -rowid)
# A tibble: 31 x 6
household_code trip_code_uc purchase_date retailer_code Overall_Brand repeat_pur
<dbl> <int> <date> <dbl> <chr> <dbl>
1 76 1032497498 2012-01-01 11024 CTL BR 1
2 76 1025776063 2012-01-02 11024 QUAKER 0
3 76 1029419047 2012-01-23 11024 CTL BR 1
4 76 1030418100 2012-02-19 11024 Kellogg 1
5 76 1029502602 2012-02-21 11024 CTL BR 1
6 76 1034153056 2012-03-14 11024 Kellogg 1
7 76 1027035051 2012-03-17 11024 CTL BR 1
8 76 1027533991 2012-04-03 11024 CTL BR 1
9 76 1033515804 2012-05-27 11024 CTL BR 0
10 76 1032998207 2012-06-06 11024 Kellogg 1
11 76 1032066227 2012-06-24 11024 Kellogg 1
12 76 1028192785 2012-06-27 11024 GENERAL MILLS 1
13 76 1033419039 2012-07-14 11024 KASHI 0
14 76 1028730296 2012-07-29 11024 GENERAL MILLS 1
15 76 1027388499 2012-08-11 11024 GENERAL MILLS 1
16 76 1030652869 2012-09-02 11024 Kellogg 0
17 76 1025638394 2012-09-27 2353 QUAKER 0
18 76 1034032718 2012-10-12 11024 GENERAL MILLS 1
19 76 1034032718 2012-10-12 11024 QUAKER 0
20 76 1025678520 2012-11-04 2353 CTL BR 0
21 76 1029490031 2012-11-24 11024 GENERAL MILLS 0
22 76 1029898838 2012-12-11 11024 CTL BR 0
23 126 1028024134 2012-01-02 63882 Kellogg 0
24 126 1030324171 2012-01-09 650 Kellogg 1
25 126 1031983761 2012-02-15 89960 GENERAL MILLS 1
26 126 1031983761 2012-02-15 89960 GENERAL MILLS 1
27 126 1033767148 2012-02-19 650 Kellogg 0
28 126 1023953965 2012-02-27 89960 GENERAL MILLS 1
29 126 1030954113 2012-03-06 89960 GENERAL MILLS 0
30 126 1030954113 2012-03-06 89960 Kellogg 0
31 126 1027392968 2012-03-08 650 Kellogg 0
If I have not understood correctly, please tell the exact requirement.
Upvotes: 1