Reputation: 179
I need to split a data frame into several new/different data frames if the previous date has a difference greater than 5 days from the current date.
For example: If the list of dates are as follows:
2016-11-21
2016-11-22
2016-11-22
2016-11-24
2016-11-30
2016-12-01
2016-12-02
2016-12-10
2016-12-12
2016-12-20
...
I would like a new dataframe for each date that has a difference of more than 5 days (While keeping all of the information in the other columns- not pictured here for simplicity):
df1:
2016-11-21
2016-11-22
2016-11-22
2016-11-24
df2:
2016-11-30
2016-12-01
2016-12-02
df3:
2016-12-10
2016-12-12
df4:
2016-12-20
...
So far I have been able to use:
#split dataframe
split(d, as_date(d$date) <= lag(as_date(d$date) + days(5))) -> test
#set list as datatable
dt_list <- map(test, as.data.table)
#bind lists into new dataframe
dt <- rbindlist(dt_list, fill = TRUE, idcol = T)
which gives me one dataframe with either true or false based on these conditions- (not what I want):
id date lc lon lat Timestamp difftime
<chr> <dttm> <fct> <dbl> <dbl> <date> <lgl>
1 158696 2016-12-11 05:26:38 A 135. -3.14 2016-12-11 FALSE
2 158696 2016-12-26 08:13:44 B 135. -3.28 2016-12-26 FALSE
3 158696 2017-01-09 06:35:37 A 136. -3.14 2017-01-09 FALSE
4 158696 2017-02-02 08:30:50 0 135. -3.22 2017-02-02 FALSE
5 158696 2017-02-18 18:28:08 B 135. -3.27 2017-02-18 FALSE
6 158696 2016-11-21 05:46:52 B 135. -3.22 2016-11-21 TRUE
7 158696 2016-11-21 05:46:52 B 135. -3.22 2016-11-21 TRUE
8 158696 2016-11-21 08:04:15 B 135. -3.22 2016-11-21 TRUE
9 158696 2016-11-21 08:21:10 B 135. -3.22 2016-11-21 TRUE
10 158696 2016-11-21 20:34:23 B 135. -3.22 2016-11-21 TRUE
I am happy to use a completely different code than what I have above, but would prefer to use Dplyr if possible, as I am most familiar with it.
Here is a dput of my dataframe:
structure(list(id = c("158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696", "158696", "158696",
"158696", "158696", "158696", "158696", "158696"), date = structure(c(1481672621,
1482740024, 1482888520, 1483943737, 1483954584, 1483957732, 1484351217,
1484372486, 1484635601, 1484639873, 1484649374, 1484654932, 1484910955,
1484914028, 1484992088, 1485077809, 1485083628, 1485116630, 1485155703,
1485241506, 1485245737, 1485302728, 1485394165, 1485407109, 1486024250,
1486026218, 1486153828, 1486184167, 1486243717, 1486244406, 1486244406,
1486244406, 1487442488, 1487462032, 1487488051, 1487495298, 1487507472,
1487553204, 1487573823, 1487756502, 1487756774, 1487758235, 1487763098,
1487795742, 1487812234, 1487929687, 1487932240, 1487979233, 1488246245,
1488315015, 1488793040), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
lc = structure(c(6L, 6L, 5L, 5L, 5L, 6L, 6L, 5L, 5L, 6L,
6L, 6L, 2L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 5L, 6L, 6L, 6L, 1L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 5L,
6L, 6L, 6L, 6L, 6L, 2L, 6L, 6L, 6L, 6L, 6L), .Label = c("0",
"1", "2", "3", "A", "B"), class = "factor"), lon = c(134.9559,
134.9635, 135.0038, 135.5555, 135.5994, 135.6039, 135.5398,
135.4953, 135.5485, 135.5502, 135.5447, 135.5461, 135.0267,
135.0148, 135.0258, 135.0221, 135.0013, 135.0037, 135.0306,
134.9676, 134.9523, 134.8742, 134.8938, 134.8815, 135.0038,
135.0047, 134.9429, 134.9381, 134.9218, 134.929, 134.9092,
134.9218, 135.045, 135.0479, 135.0794, 135.0884, 135.0796,
135.087, 135.092, 135.1609, 135.1625, 135.1626, 135.1649,
135.1601, 135.1714, 135.0644, 135.064, 135.0759, 135.0841,
135.0849, 134.9314), lat = c(-3.148, -3.275, -3.2625, -3.1398,
-3.1349, -3.1339, -2.5095, -2.511, -2.6931, -2.6923, -2.7032,
-2.7059, -3.2218, -3.2327, -3.2284, -3.2234, -3.2688, -3.2888,
-3.2257, -3.2096, -3.208, -3.2129, -3.179, -3.1945, -3.2158,
-3.2157, -3.1747, -3.1701, -3.1604, -3.1735, -3.1735, -3.1735,
-3.2697, -3.2623, -3.3155, -3.3182, -3.3046, -3.3131, -3.3125,
-3.2767, -3.2765, -3.2766, -3.2769, -3.2808, -3.283, -3.2069,
-3.2073, -3.1983, -3.312, -3.2977, -3.1022)), row.names = 47:97, class = "data.frame")
Which has a few dates that are different by more than 5 days. I am relatively new to R and would really appreciate your help!
Upvotes: 4
Views: 1162
Reputation: 66500
library(dplyr)
df_grp = df %>% mutate(grp = cumsum(c(1, diff(date) > 5*86400)))
split(df_grp, df_grp$grp)
Upvotes: 2
Reputation: 521289
We can use lead()
here to create a pseudo group for each island of dates. Then, split on that island value.
d$gap <- ifelse(as.numeric(difftime(d$date, lead(d$date), units = "days")) > 5, 1, 0)
d$grp <- cumsum(df$gap)
dt_list <- split(d, d$grp)
Upvotes: 0