Reputation: 1008
I have a data.table of huts that a group if hikers are visiting.
library(data.table)
dt <- data.table(time = as.POSIXct(as.Date(10:35, origin = "2020-01-01")),
hut= c(1, NA, NA, 8, 1, 1, NA, NA, NA, 1, NA, NA, NA,
4, NA, NA, 4, NA, 5, NA, NA, 4, NA, 4, NA, 1))
The pattern is that they will move from a hut (e.g. hut 1) out in the wild (hut = NA) and come back within 2-5 days. This is an event. Sometime they will go to a new hut (e.g. hut 4) - this is not an event. The problem is that some times they will accidentially be in a hut inside an event (as in row 4). So this is still an event. The output should look like this, but I have no idea how to get that. The real data is billions of rows, so it should also be efficient, hence the data.table:
dt[, event:= c(NA, 1,1,1, NA, NA, 2,2,2,
NA, NA,NA, NA, NA,
3,3, NA, 4,4,4,4,NA, 5, NA,NA, NA)]
dt
time hut event
1: 2020-01-11 01:00:00 1 NA
2: 2020-01-12 01:00:00 NA 1
3: 2020-01-13 01:00:00 NA 1
4: 2020-01-14 01:00:00 8 1
5: 2020-01-15 01:00:00 1 NA
6: 2020-01-16 01:00:00 1 NA
7: 2020-01-17 01:00:00 NA 2
8: 2020-01-18 01:00:00 NA 2
9: 2020-01-19 01:00:00 NA 2
10: 2020-01-20 01:00:00 1 NA
11: 2020-01-21 01:00:00 NA NA
12: 2020-01-22 01:00:00 NA NA
13: 2020-01-23 01:00:00 NA NA
14: 2020-01-24 01:00:00 4 NA
15: 2020-01-25 01:00:00 NA 3
16: 2020-01-26 01:00:00 NA 3
17: 2020-01-27 01:00:00 4 NA
18: 2020-01-28 01:00:00 NA 4
19: 2020-01-29 01:00:00 5 4
20: 2020-01-30 01:00:00 NA 4
21: 2020-01-31 01:00:00 NA 4
22: 2020-02-01 01:00:00 4 NA
23: 2020-02-02 01:00:00 NA 5
24: 2020-02-03 01:00:00 4 NA
25: 2020-02-04 01:00:00 NA NA
26: 2020-02-05 01:00:00 1 NA
Upvotes: 1
Views: 128
Reputation: 25225
Here is another option using non-equi join:
dt[, rn := .I]
visits <- dt[!is.na(hut)]
visits[, c("start", "end") := .(time + 2L, time + 5L)]
rows <- visits[visits, on=.(hut, time>=start, time<=end), mult="first", nomatch=0L,
.(hut, i.time, x.time, i.rn, x.rn)]
dt[rows, on=.(rn>i.rn, rn<x.rn), event := 1L]
dt[, ri := rleid(event)][!is.na(event), event := rleid(ri)]
dt[rn %in% unique(c(rows$i.rn, rows$x.rn)), event := NA_integer_]
dt[, c("ri", "rn") := NULL][]
output:
time hut event
1: 2020-01-11 1 NA
2: 2020-01-12 NA 1
3: 2020-01-13 NA 1
4: 2020-01-14 8 1
5: 2020-01-15 1 NA
6: 2020-01-16 1 NA
7: 2020-01-17 NA 2
8: 2020-01-18 NA 2
9: 2020-01-19 NA 2
10: 2020-01-20 1 NA
11: 2020-01-21 NA NA
12: 2020-01-22 NA NA
13: 2020-01-23 NA NA
14: 2020-01-24 4 NA
15: 2020-01-25 NA 3
16: 2020-01-26 NA 3
17: 2020-01-27 4 NA
18: 2020-01-28 NA 4
19: 2020-01-29 5 4
20: 2020-01-30 NA 4
21: 2020-01-31 NA 4
22: 2020-02-01 4 NA
23: 2020-02-02 NA 5
24: 2020-02-03 4 NA
25: 2020-02-04 NA NA
26: 2020-02-05 1 NA
time hut event
Alternatively, using a rolling join instead of the non-equi join above:
is <- 2L
intvl <- 5L - is
dt[, c("rn", "oned") := .(.I, time + is)]
rows <- dt[dt[!is.na(hut)], on=.(hut, time=oned), roll=-intvl, nomatch=0L,
.(hut, i.rn, x.rn)]
#the rest of the code from the non-equi join above is needed here as well
data:
library(data.table)
dt <- data.table(time = as.Date(10:35, origin = "2020-01-01"),
hut= c(1, NA, NA, 8, 1, 1, NA, NA, NA, 1, NA, NA, NA,
4, NA, NA, 4, NA, 5, NA, NA, 4, NA, 4, NA, 1))
Upvotes: 1
Reputation: 1076
Ok, it's not obvious, but let's try ..
library(data.table)
dt <- data.table(time = as.POSIXct(as.Date(10:35, origin = "2020-01-01")),
hut= c(1, NA, NA, 8, 1, 1, NA, NA, NA, 1, NA, NA, NA,
4, NA, NA, 4, NA, 5, NA, NA, 4, NA, 4, NA, 1))
library(dplyr)
dt[, last.hut1 := lag(hut, n = 1, order_by = time)]
dt[, last.hut2 := lag(hut, n = 2, order_by = time)]
dt[, last.hut3 := lag(hut, n = 3, order_by = time)]
dt[, last.hut4 := lag(hut, n = 4, order_by = time)]
dt[, last.hut5 := lag(hut, n = 5, order_by = time)]
dt[, next.hut1 := lead(hut, n = 1, order_by = time)]
dt[, next.hut2 := lead(hut, n = 2, order_by = time)]
dt[, next.hut3 := lead(hut, n = 3, order_by = time)]
dt[, next.hut4 := lead(hut, n = 4, order_by = time)]
dt[, next.hut5 := lead(hut, n = 5, order_by = time)]
dt[, end.event := case_when((hut == last.hut2 | hut == last.hut3 | hut == last.hut4 | hut == last.hut5)
& (last.hut1 != hut | is.na(last.hut1)) ~ 1,
TRUE ~ 0)]
dt[, start.event := case_when((hut == next.hut2 | hut == next.hut3 | hut == next.hut4 | hut == next.hut5)
& (next.hut1 != hut | is.na(next.hut1)) ~ 1,
TRUE ~ 0)]
dt[, start.event2 := cumsum(start.event)]
dt[, end.event2 := cumsum(end.event)]
dt[, event := case_when((start.event2 > end.event2) & (start.event == 0) & (end.event == 0) ~ start.event2,
TRUE ~ NA_real_)]
dt[ ,c("last.hut1", "last.hut2", "last.hut3", "last.hut4", "last.hut5",
"next.hut1", "next.hut2", "next.hut3", "next.hut4", "next.hut5",
"start.event", "start.event2", "end.event", "end.event2") := .(NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL)]
Upvotes: 0