Reputation:
I'm working on the next step of my data aggregation, following a previous question. There, Jon Spring pointed me to a solution for indicating the number of active events in a given time interval.
As the next step, I'd like to be able to aggregate this data and obtain the number of observations with the same ID that were active at any point during a fixed time interval.
Starting with a toy dataset of seven events with five IDs:
library(tidyverse); library(lubridate)
df1 <- tibble::tibble(
  id = c("a", "b", "c", "c", "c", "d", "e"),
  # ymd_hms() is vectorised, so each column can be parsed in a single call
  start = ymd_hms(c(
    "2018-12-10 13:01:00",
    "2018-12-10 13:07:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:48:00",
    "2018-12-10 14:52:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:45:00"
  )),
  end = ymd_hms(c(
    "2018-12-10 13:05:00",
    "2018-12-10 13:17:00",
    "2018-12-10 14:46:00",
    "2018-12-10 14:50:00",
    "2018-12-10 15:01:00",
    "2018-12-10 14:51:00",
    "2018-12-10 15:59:00"
  ))
)
I could brute-force loop over each line of the data frame and 'expand' each record into the specified intervals that cover the time period from start to end, here using 15 minutes:
# Expand each event into the 15-minute buckets it spans, then deduplicate.
# The original loop grew `result` with bind_rows() on every iteration
# (O(n^2): the accumulated frame is copied each time) and re-ran distinct()
# over the whole result at every step. Generating one small tibble per
# event and binding them exactly once scales linearly instead.
result <- map_dfr(seq_len(nrow(df1)), function(i) {
  event <- df1[i, ]
  tibble::tibble(
    timestamp = seq.POSIXt(floor_date(event$start, "15 mins"),
                           event$end,
                           by = "15 mins"),
    id = event$id
  )
}) %>%
  distinct()
Then it's a matter of simple aggregation to obtain final result:
# `result` holds one row per (timestamp, id) pair, so counting rows per
# timestamp yields the number of active ids in each 15-minute bucket.
result_agg <- result %>%
  count(timestamp, name = "users_mac")
That gives the desired result, but will probably not scale well to the dataset I need to use it with (~7 million records at the moment... and growing).
Is there any better solution to this problem?
Upvotes: 2
Views: 302
Reputation: 789
A tidy solution could be achieved using the tsibble package.
library(tidyverse)
#> Registered S3 methods overwritten by 'ggplot2':
#> method from
#> [.quosures rlang
#> c.quosures rlang
#> print.quosures rlang
#> Registered S3 method overwritten by 'rvest':
#> method from
#> read_xml.response xml2
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tsibble, warn.conflicts = FALSE)
df1 <- tibble(
  id = c("a", "b", "c", "c", "c", "d", "e"),
  # parse each column of timestamps with one vectorised ymd_hms() call
  start = ymd_hms(c(
    "2018-12-10 13:01:00",
    "2018-12-10 13:07:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:48:00",
    "2018-12-10 14:52:00",
    "2018-12-10 14:45:00",
    "2018-12-10 14:45:00"
  )),
  end = ymd_hms(c(
    "2018-12-10 13:05:00",
    "2018-12-10 13:17:00",
    "2018-12-10 14:46:00",
    "2018-12-10 14:50:00",
    "2018-12-10 15:01:00",
    "2018-12-10 14:51:00",
    "2018-12-10 15:59:00"
  ))
)
# Bucket both endpoints down to their 15-minute mark, reshape to one row
# per (id, endpoint), then let tsibble fill the gaps between each record's
# first and last bucket before counting ids per bucket.
df1 %>%
  mutate(
    start = floor_date(start, "15 mins"),
    end = floor_date(end, "15 mins")
  ) %>%
  # pivot_longer() supersedes the retired gather() (tidyr >= 1.0)
  pivot_longer(start:end, names_to = "label", values_to = "index") %>%
  distinct(id, index) %>%
  mutate(date = as_date(index)) %>%
  as_tsibble(key = c(id, date), index = index) %>%
  fill_gaps() %>%
  index_by(index) %>%
  summarise(users_mac = n())
#> # A tsibble: 7 x 2 [15m] <UTC>
#> index users_mac
#> <dttm> <int>
#> 1 2018-12-10 13:00:00 2
#> 2 2018-12-10 13:15:00 1
#> 3 2018-12-10 14:45:00 3
#> 4 2018-12-10 15:00:00 2
#> 5 2018-12-10 15:15:00 1
#> 6 2018-12-10 15:30:00 1
#> 7 2018-12-10 15:45:00 1
Created on 2019-05-17 by the reprex package (v0.2.1)
Upvotes: 3
Reputation: 389055
I am not sure about the efficiency, but one way to do this is to create a sequence of 15-minute intervals from the minimum time to the maximum time in the data and then find the users that fall within each interval.
library(tidyverse)
library(lubridate)
# Every 15-minute bucket from the first start to the last end
# (use `<-`, not `=`, for top-level assignment)
timestamp <- floor_date(seq(min(df1$start), max(df1$end), by = "15 mins"),
                        "15 mins")

# An event [start, end] overlaps bucket [., . + 15 min) iff it ends after
# the bucket opens and starts before the bucket closes. Because start <= end
# for every record, the original compound condition
#   (start > . | end > .) & (start < . + minutes(15) | end < . + minutes(15))
# reduces to exactly this simpler test.
tibble(timestamp) %>%
  mutate(users_mac = map_dbl(
    timestamp,
    ~ with(df1, n_distinct(id[end > . & start < . + minutes(15)]))
  )) %>%
  filter(users_mac != 0)
# timestamp users_mac
# <dttm> <dbl>
#1 2018-12-10 13:00:00 2
#2 2018-12-10 13:15:00 1
#3 2018-12-10 14:45:00 3
#4 2018-12-10 15:00:00 2
#5 2018-12-10 15:15:00 1
#6 2018-12-10 15:30:00 1
#7 2018-12-10 15:45:00 1
Upvotes: 2
Reputation: 2959
Using lubridate's as.interval()
and int_overlaps()
functions, followed by some tidyverse data wrangling to get the summary data:
library(dplyr)
library(tidyr)
library(lubridate)
# list of 15-minute time increments (buckets); lead() pairs each bucket
# start with the next one, and na.omit() drops the final unpaired start
timestamp <- tibble(start = floor_date(seq(min(df1$start), max(df1$end),
                                           by = "15 mins"), "15 mins"),
                    end = lead(start, 1),
                    interval = as.interval(start, end)) %>%
  na.omit() %>%
  pull(interval)
# add in interval on df1 start -- end times
df1 <- mutate(df1, interval = as.interval(start, end))
# test each record against each bucket; vapply() (unlike sapply()) pins the
# return type and shape, so a degenerate input cannot silently change the
# result - may still not scale if there are many buckets
tmp <- vapply(df1$interval,
              function(x, timestamp) int_overlaps(x, timestamp),
              logical(length(timestamp)),
              timestamp) %>%
  t()
colnames(tmp) <- int_start(timestamp) %>% as.character()
# count how many unique ids in each time bucket
# (pivot_longer() supersedes the retired gather())
bind_cols(df1, as_tibble(tmp)) %>%
  select(-start, -end, -interval) %>%
  pivot_longer(-id, names_to = "start", values_to = "logged") %>%
  filter(logged) %>%
  group_by(start) %>%
  summarise(n = n_distinct(id))
# A tibble: 7 x 2
start n
<chr> <int>
1 2018-12-10 13:00:00 2
2 2018-12-10 13:15:00 1
3 2018-12-10 14:30:00 3
4 2018-12-10 14:45:00 3
5 2018-12-10 15:00:00 2
6 2018-12-10 15:15:00 1
7 2018-12-10 15:30:00 1
Upvotes: 0