eflores89
eflores89

Reputation: 359

Group by adjoining dates in r

I have the following data frame

df<-structure(list(DATE = c("30/06/15", "23/06/15", "22/06/15", "21/06/15", 
"18/06/15", "12/06/15", "09/06/15", "08/06/15", "02/06/15", "08/04/15", 
"06/04/15", "05/04/15", "07/03/15", "06/03/15", "04/03/15", "03/03/15", 
"02/03/15", "26/01/15", "25/01/15", "20/01/15", "19/01/15", "18/01/15", 
"17/01/15", "16/01/15", "15/01/15", "14/01/15", "13/01/15", "12/01/15", 
"11/01/15", "10/01/15", "09/01/15", "08/01/15", "07/01/15", "06/01/15", 
"05/01/15", "04/01/15", "03/01/15", "02/01/15", "01/01/15", "31/12/14", 
"30/12/14", "29/12/14", "28/12/14", "27/12/14", "26/12/14", "25/12/14", 
"27/08/14", "26/08/14", "25/08/14"), TICKETS = c(17L, 15L, 22L, 
16L, 15L, 10L, 18L, 12L, 20L, 20L, 19L, 12L, 16L, 9L, 20L, 18L, 
15L, 19L, 13L, 18L, 21L, 27L, 17L, 17L, 18L, 18L, 21L, 18L, 22L, 
20L, 16L, 15L, 23L, 15L, 17L, 12L, 20L, 16L, 9L, 13L, 21L, 16L, 
16L, 14L, 10L, 7L, 15L, 12L, 14L), COLUMN = c(17L, 15L, 22L, 
16L, 15L, 10L, 18L, 12L, 20L, 20L, 19L, 12L, 16L, 9L, 20L, 18L, 
15L, 19L, 13L, 18L, 21L, 27L, 17L, 17L, 18L, 18L, 21L, 18L, 22L, 
20L, 16L, 15L, 23L, 15L, 17L, 12L, 20L, 16L, 9L, 13L, 21L, 16L, 
16L, 14L, 10L, 7L, 15L, 12L, 14L), FLAG = c(1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1), MA = c(0.318405243036592, 0.163298743855817, 0.706171490988531, 
0.240851993446204, 0.163298743855817, 0.153604587657018, 0.168145821955216, 
0.158451665756417, 0.153604587657018, 0.163298743855817, 0.182687056253413, 
0.172992900054615, 0.172992900054615, 0.158451665756417, 0.182687056253413, 
0.177839978154014, 0.187534134352813, 0.153604587657018, 0.172992900054615, 
0.168145821955216, 0.20207536865101, 0.226310759148007, 0.187534134352813, 
0.221463681048607, 0.240851993446204, 0.303864008738394, 0.318405243036592, 
0.33294647733479, 0.391111414527581, 0.39595849262698, 0.410499726925178, 
0.420193883123976, 0.463817586018569, 0.478358820316767, 0.468664664117968, 
0.439582195521573, 0.434735117422174, 0.381417258328782, 0.342640633533588, 
0.323252321135991, 0.337793555434189, 0.328099399235391, 0.303864008738394, 
0.284475696340797, 0.226310759148007, 0.163298743855817, 0.158451665756417, 
0.177839978154014, 0.168145821955216)), .Names = c("DATE", "TICKETS", 
"COLUMN", "FLAG", "MA"), row.names = c(412L, 316L, 302L, 288L, 
246L, 162L, 120L, 106L, 22L, 102L, 74L, 60L, 87L, 73L, 45L, 31L, 
17L, 351L, 337L, 267L, 253L, 239L, 225L, 211L, 197L, 183L, 169L, 
155L, 141L, 127L, 113L, 99L, 85L, 71L, 57L, 43L, 29L, 15L, 1L, 
426L, 418L, 405L, 392L, 378L, 364L, 350L, 374L, 360L, 346L), class = "data.frame")

Now, as can be seen, there are some dates where there is clearly a tendency to "group" together, for example december and january (date format is: dd/mm/yy).

Plainly, what I want is to find these groups of adjoining dates (i.e. 25- dec -2014 to 20 - jan - 2015 and june 21st to june 23rd 2015).

As long as there are two days adjoining each other I would count it as a "group". Once "groups" are named, I could filter the longest one, for example.

Idealy I would get something like:

structure(list(DATE = structure(c(47L, 36L, 35L, 34L, 31L, 24L, 
20L, 18L, 4L, 17L, 13L, 10L, 15L, 12L, 8L, 6L, 3L, 40L, 37L, 
33L, 32L, 30L, 29L, 28L, 27L, 26L, 25L, 23L, 22L, 21L, 19L, 16L, 
14L, 11L, 9L, 7L, 5L, 2L, 1L, 49L, 48L, 46L, 45L, 44L, 42L, 39L, 
43L, 41L, 38L), .Label = c("01/01/15", "02/01/15", "02/03/15", 
"02/06/15", "03/01/15", "03/03/15", "04/01/15", "04/03/15", "05/01/15", 
"05/04/15", "06/01/15", "06/03/15", "06/04/15", "07/01/15", "07/03/15", 
"08/01/15", "08/04/15", "08/06/15", "09/01/15", "09/06/15", "10/01/15", 
"11/01/15", "12/01/15", "12/06/15", "13/01/15", "14/01/15", "15/01/15", 
"16/01/15", "17/01/15", "18/01/15", "18/06/15", "19/01/15", "20/01/15", 
"21/06/15", "22/06/15", "23/06/15", "25/01/15", "25/08/14", "25/12/14", 
"26/01/15", "26/08/14", "26/12/14", "27/08/14", "27/12/14", "28/12/14", 
"29/12/14", "30/06/15", "30/12/14", "31/12/14"), class = "factor"), 
    TICKETS = c(17, 15, 22, 16, 15, 10, 18, 12, 20, 20, 19, 12, 
    16, 9, 20, 18, 15, 19, 13, 18, 21, 27, 17, 17, 18, 18, 21, 
    18, 22, 20, 16, 15, 23, 15, 17, 12, 20, 16, 9, 13, 21, 16, 
    16, 14, 10, 7, 15, 12, 14), COLUMN = c(17, 15, 22, 16, 15, 
    10, 18, 12, 20, 20, 19, 12, 16, 9, 20, 18, 15, 19, 13, 18, 
    21, 27, 17, 17, 18, 18, 21, 18, 22, 20, 16, 15, 23, 15, 17, 
    12, 20, 16, 9, 13, 21, 16, 16, 14, 10, 7, 15, 12, 14), FLAG = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1), MA = c(0.318405243036592, 
    0.163298743855817, 0.706171490988531, 0.240851993446204, 
    0.163298743855817, 0.153604587657018, 0.168145821955216, 
    0.158451665756417, 0.153604587657018, 0.163298743855817, 
    0.182687056253413, 0.172992900054615, 0.172992900054615, 
    0.158451665756417, 0.182687056253413, 0.177839978154014, 
    0.187534134352813, 0.153604587657018, 0.172992900054615, 
    0.168145821955216, 0.20207536865101, 0.226310759148007, 0.187534134352813, 
    0.221463681048607, 0.240851993446204, 0.303864008738394, 
    0.318405243036592, 0.33294647733479, 0.391111414527581, 0.39595849262698, 
    0.410499726925178, 0.420193883123976, 0.463817586018569, 
    0.478358820316767, 0.468664664117968, 0.439582195521573, 
    0.434735117422174, 0.381417258328782, 0.342640633533588, 
    0.323252321135991, 0.337793555434189, 0.328099399235391, 
    0.303864008738394, 0.284475696340797, 0.226310759148007, 
    0.163298743855817, 0.158451665756417, 0.177839978154014, 
    0.168145821955216), GROUP = c(0, 1, 1, 1, 0, 0, 2, 2, 0, 
    0, 3, 3, 4, 4, 5, 5, 5, 6, 6, 0, 7, 7, 7, 7, 7, 7, 7, 7, 
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 
    8, 8)), .Names = c("DATE", "TICKETS", "COLUMN", "FLAG", "MA", 
"GROUP"), row.names = c(NA, -49L), class = "data.frame")

Any ideas?

Upvotes: 1

Views: 78

Answers (1)

IRTFM
IRTFM

Reputation: 263342

You can use diff and cumsum to generate a group number for any chosen length of gap that would separate a group. (Since the diff will be negative as your proceed backward in time as in this case, you need to use a negative threshold. Here I used -2, which then only allows gaps less than two days to be in same group, but you could use any (lower) number.

df$dt <- as.Date(df$DATE, format="%d/%m/%y")
df$grp <- cumsum( c(0, diff(as.numeric(df$dt)) <   -2) )

> df$grp
 [1]  0  1  1  1  2  3  4  4  5  6  6  6  7  7  7  7  7  8  8  9
[21]  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
[41]  9  9  9  9  9  9 10 10 10

Upvotes: 5

Related Questions