Reputation: 378
I am trying to correct some erroneous entries in a dataset in a conditional manner. I need to do this by group and the condition is based on the difference between 2 timestamps across 2 different datasets.
Here are some examples of the types of data that I am working with:-
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
Both datasets look like this:-
print(df1)
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC Group5 2020-01-02 17:00:00
11 CCC Group5 2020-01-02 18:00:00
12 CCC Group5 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD Group2 2020-01-05 05:00:00
16 DDD Group2 2020-01-05 06:00:00
17 DDD Group2 2020-01-05 07:00:00
18 DDD Group2 2020-01-05 08:00:00
print(df2)
UserID StartTime EndTime
1 AAA 2020-01-01 00:53:00 2020-01-01 01:01:00
2 AAA 2020-01-01 01:01:00 2020-01-01 01:57:00
3 AAA 2020-01-01 01:57:00 2020-01-01 02:44:00
4 BBB 2020-01-02 12:03:00 2020-01-02 13:03:00
5 BBB 2020-01-02 13:03:00 2020-01-02 13:56:00
6 BBB 2020-01-02 13:56:00 2020-01-02 14:26:00
7 BBB 2020-01-02 14:26:00 2020-01-02 15:23:00
8 CCC 2020-01-02 15:12:00 2020-01-02 15:29:00
9 CCC 2020-01-02 15:29:00 2020-01-02 15:58:00
10 DDD 2020-01-05 03:00:00 2020-01-05 03:12:00
11 DDD 2020-01-05 03:12:00 2020-01-05 03:57:00
The timestamp column in df1 is rounded up to each hour, whereas there is a starting timestamp and an ending timestamp in df2 (both are granular and rounded to the minute). There are some entries in df1 that are incorrect because they do not appear in df2 at a corresponding time.
For instance, UserID
CCC last EndTime
timestamp in df2 is 2020-01-02 15:58:00, but in df1, CCC appears at 2020-01-02 17:00:00, 2020-01-02 18:00:00 & 2020-01-02 19:00:00; similar instance with UserID
DDD.
What I want to do
If a UserID
has an entry in df1, with a df1$Time
timestamp that is >=60 minutes than their last df2$EndTime
timestamp in df2, I want the entry in the df1$Value
to be changes to "NoGroup".
Here is an illustrative example of the desired outcome:-
UserID Value Time
1 AAA Group1 2020-01-01 01:00:00
2 AAA Group1 2020-01-01 02:00:00
3 AAA Group2 2020-01-01 03:00:00
4 BBB Group3 2020-01-02 12:00:00
5 BBB Group3 2020-01-02 13:00:00
6 BBB Group1 2020-01-02 14:00:00
7 BBB Group2 2020-01-02 15:00:00
8 CCC Group4 2020-01-02 15:00:00
9 CCC Group5 2020-01-02 16:00:00
10 CCC NoGroup 2020-01-02 17:00:00
11 CCC NoGroup 2020-01-02 18:00:00
12 CCC NoGroup 2020-01-02 19:00:00
13 DDD Group1 2020-01-05 03:00:00
14 DDD Group2 2020-01-05 04:00:00
15 DDD NoGroup 2020-01-05 05:00:00
16 DDD NoGroup 2020-01-05 06:00:00
17 DDD NoGroup 2020-01-05 07:00:00
18 DDD NoGroup 2020-01-05 08:00:00
Any pointers are appreciated as always :)
Upvotes: 0
Views: 72
Reputation: 10627
df1<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "CCC", "CCC", "CCC", "DDD", "DDD",
"DDD", "DDD", "DDD", "DDD"), Value = c("Group1", "Group1", "Group2",
"Group3", "Group3", "Group1", "Group2", "Group4", "Group5", "Group5",
"Group5", "Group5", "Group1", "Group2", "Group2", "Group2", "Group2",
"Group2"), Time = structure(c(1577840400, 1577844000, 1577847600,
1577966400, 1577970000, 1577973600, 1577977200, 1577977200, 1577980800,
1577984400, 1577988000, 1577991600, 1578193200, 1578196800, 1578200400,
1578204000, 1578207600, 1578211200), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -18L), class = "data.frame")
df2<-structure(list(UserID = c("AAA", "AAA", "AAA", "BBB", "BBB",
"BBB", "BBB", "CCC", "CCC", "DDD", "DDD"), StartTime = structure(c(1577839980,
1577840460, 1577843820, 1577966580, 1577970180, 1577973360, 1577975160,
1577977920, 1577978940, 1578193200, 1578193920), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), EndTime = structure(c(1577840460,
1577843820, 1577846640, 1577970180, 1577973360, 1577975160, 1577978580,
1577978940, 1577980680, 1578193920, 1578196620), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -11L), class = "data.frame")
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
no_groups <-
df1 %>%
as_tibble() %>%
left_join(df2 %>% as_tibble()) %>%
group_by(UserID) %>%
mutate(
last_end = max(EndTime)
) %>%
mutate(
no_group = all((last_end - Time) <= minutes(60))
) %>%
distinct(UserID, no_group)
#> Joining, by = "UserID"
no_groups
#> # A tibble: 4 x 2
#> # Groups: UserID [4]
#> UserID no_group
#> <chr> <lgl>
#> 1 AAA FALSE
#> 2 BBB FALSE
#> 3 CCC TRUE
#> 4 DDD TRUE
df1 %>%
as_tibble() %>%
left_join(no_groups) %>%
mutate(Value = ifelse(no_group, "NoGroup", Value)) %>%
select(-no_group)
#> Joining, by = "UserID"
#> # A tibble: 18 x 3
#> UserID Value Time
#> <chr> <chr> <dttm>
#> 1 AAA Group1 2020-01-01 01:00:00
#> 2 AAA Group1 2020-01-01 02:00:00
#> 3 AAA Group2 2020-01-01 03:00:00
#> 4 BBB Group3 2020-01-02 12:00:00
#> 5 BBB Group3 2020-01-02 13:00:00
#> 6 BBB Group1 2020-01-02 14:00:00
#> 7 BBB Group2 2020-01-02 15:00:00
#> 8 CCC NoGroup 2020-01-02 15:00:00
#> 9 CCC NoGroup 2020-01-02 16:00:00
#> 10 CCC NoGroup 2020-01-02 17:00:00
#> 11 CCC NoGroup 2020-01-02 18:00:00
#> 12 CCC NoGroup 2020-01-02 19:00:00
#> 13 DDD NoGroup 2020-01-05 03:00:00
#> 14 DDD NoGroup 2020-01-05 04:00:00
#> 15 DDD NoGroup 2020-01-05 05:00:00
#> 16 DDD NoGroup 2020-01-05 06:00:00
#> 17 DDD NoGroup 2020-01-05 07:00:00
#> 18 DDD NoGroup 2020-01-05 08:00:00
Created on 2021-09-17 by the reprex package (v2.0.0)
Upvotes: 1
Reputation: 2783
using dplyr
:
df1 %>%
left_join(df2 %>% group_by(UserID) %>% filter(EndTime == max(EndTime)), by = "UserID") %>%
mutate(Value = if_else(Time-EndTime >= 60, "NoGroup", Value)) %>%
select(-c(4,5))
First you join
the last EndTimes
of each UserID
from df2
into df1
, next you check if any Time
is more than 60 minutes past this EndTime
and change the Value
accordingly. Lastly you remove the columns that were added during the join
Upvotes: 1