Reputation: 1023
Hopefully i can explain what i'm trying to do sufficiently. I have df1 with values of the start and end times of activities. However I want to use these times to see if the speed of the boat (df2) exceeds a certain threshold between two fishing activities to decide if they should be separate activities (i.e. the boat has steamed to a new location) or the same activity.
df1 <- data.frame(
vessel_pln=c(rep("AU89",5)),
start_time=c("2018-11-02 05:14:26 GMT","2018-11-02 07:48:16 GMT","2018-11-02 09:03:28 GMT","2018-11-02 10:17:25 GMT","2018-11-05 06:39:12 GMT"),
start_lat=c(55.69713617,55.69693433,55.69539050,55.69043650,55.69103567),
start_lon=c(-5.65051533,-5.65031783,-5.65317850,-5.65859250,-5.65830600),
end_time=c("2018-11-02 06:54:37 GMT","2018-11-02 08:55:24 GMT","2018-11-02 10:00:14 GMT","2018-11-02 11:55:47 GMT","2018-11-05 08:33:35 GMT"),
end_lat=c(55.69462700,55.69539367,55.69454683,55.69370050,55.69302200),
end_lon=c(-5.65454983,-5.65317550,-5.65567667,-5.65628133,-5.65317550),
activity=c(1,2,3,4,5),
new_activity=c(rep("NO",5)))
library(chron) tt <- times(1:200/288)
df2 <- data.frame(
vessel_pln=c(rep("AU89",200)),
GPSTime=c(chron(rep("2/11/18", length = length(tt)), tt)),
Speed=c(runif(200,0,3)))
df2 <- as.POSIXct(df2$GPSTime,format="(%d/%m/%y %H%M%S)",tz="GMT")
df2[108, "Speed"] <- 3.2
i'd like to know if the 'Speed' (df2) > 3 between the 'end_time' (df1) of row [i] and the 'start_time' (df1) of row [i+1] . If it does then change the 'new_activity' (df1) column to "YES".
with the above data i should get the following results:
df3 <- data.frame(
vessel_pln=c(rep("AU89",5)),
start_time=c("2018-11-02 05:14:26 GMT","2018-11-02 07:48:16 GMT","2018-11-02 09:03:28 GMT","2018-11-02 10:17:25 GMT","2018-11-02 16:39:12 GMT"),
start_lat=c(55.69713617,55.69693433,55.69539050,55.69043650,55.69103567),
start_lon=c(-5.65051533,-5.65031783,-5.65317850,-5.65859250,-5.65830600),
end_time=c("2018-11-02 06:54:37 GMT","2018-11-02 08:55:24 GMT","2018-11-02 10:00:14 GMT","2018-11-02 11:55:47 GMT","2018-11-02 18:33:35 GMT"),
end_lat=c(55.69462700,55.69539367,55.69454683,55.69370050,55.69302200),
end_lon=c(-5.65454983,-5.65317550,-5.65567667,-5.65628133,-5.65317550),
activity=c(1,2,3,4,5),
new_activity=c("NO","NO","YES","NO","NO")))
Upvotes: 0
Views: 89
Reputation: 14764
Here's also how you could approach this with data.table
(and a bit of magrittr
to improve readability); should be fast even for larger datasets:
library(data.table)
library(magrittr)
col_names <- names(df1)
df1 <- setDT(df1)[, lapply(.SD, as.character)] %>%
.[, `:=` (end_join = as.POSIXct(end_time),
start_join = shift(as.POSIXct(start_time), type = "lead")), by = vessel_pln] %>%
.[is.na(start_join), start_join := as.POSIXct(as.character(end_time))]
df2 <- setDT(df2)[, lapply(.SD, as.character)][, `:=` (GPSTime = as.POSIXct(GPSTime))]
final <- df2[df1, on = .(GPSTime <= start_join, GPSTime >= end_join, vessel_pln = vessel_pln)] %>%
.[, new_activity := as.character(ifelse(any(Speed > 3), "YES", "NO")), by = activity] %>%
.[!duplicated(activity), ..col_names] %>%
.[is.na(new_activity), new_activity := "NO"]
Note that I have modified your data example a bit, since otherwise it is impossible to find a match between dates (in one df
you have 11th Feb, in the other 2nd Nov):
library(chron)
df1 <- data.frame(
vessel_pln=c(rep("AU89",5)),
start_time=c("2018-11-02 05:14:26 GMT","2018-11-02 07:48:16 GMT","2018-11-02 09:03:28 GMT","2018-11-02 10:17:25 GMT","2018-11-05 06:39:12 GMT"),
start_lat=c(55.69713617,55.69693433,55.69539050,55.69043650,55.69103567),
start_lon=c(-5.65051533,-5.65031783,-5.65317850,-5.65859250,-5.65830600),
end_time=c("2018-11-02 06:54:37 GMT","2018-11-02 08:55:24 GMT","2018-11-02 10:00:14 GMT","2018-11-02 11:55:47 GMT","2018-11-05 08:33:35 GMT"),
end_lat=c(55.69462700,55.69539367,55.69454683,55.69370050,55.69302200),
end_lon=c(-5.65454983,-5.65317550,-5.65567667,-5.65628133,-5.65317550),
activity=c(1,2,3,4,5),
new_activity=c(rep("NO",5)))
tt <- times(1:200/288)
df2 <- data.frame(
vessel_pln=c(rep("AU89",200)),
GPSTime=c(chron(rep("11/2/18", length = length(tt)), tt)),
Speed=c(runif(200,0,3)))
df2$GPSTime <- as.POSIXct(df2$GPSTime,format="(%d/%m/%y %H%M%S)",tz="GMT")
df2[108, "Speed"] <- 3.2
Now the output is actually with all NO
, as there is only 1 case with Speed
> 3, and this doesn't fall between any end_time
and next start_time
:
vessel_pln start_time start_lat start_lon end_time end_lat end_lon activity new_activity
1: AU89 2018-11-02 05:14:26 GMT 55.69713617 -5.65051533 2018-11-02 06:54:37 GMT 55.694627 -5.65454983 1 NO
2: AU89 2018-11-02 07:48:16 GMT 55.69693433 -5.65031783 2018-11-02 08:55:24 GMT 55.69539367 -5.6531755 2 NO
3: AU89 2018-11-02 09:03:28 GMT 55.6953905 -5.6531785 2018-11-02 10:00:14 GMT 55.69454683 -5.65567667 3 NO
4: AU89 2018-11-02 10:17:25 GMT 55.6904365 -5.6585925 2018-11-02 11:55:47 GMT 55.6937005 -5.65628133 4 NO
5: AU89 2018-11-05 06:39:12 GMT 55.69103567 -5.658306 2018-11-05 08:33:35 GMT 55.693022 -5.6531755 5 NO
However, if you'd modify this a bit, and replace in 3rd row of df1
˛the end_time
with 09:44:00
, you'd get:
vessel_pln start_time start_lat start_lon end_time end_lat end_lon activity new_activity
1: AU89 2018-11-02 05:14:26 GMT 55.69713617 -5.65051533 2018-11-02 06:54:37 GMT 55.694627 -5.65454983 1 NO
2: AU89 2018-11-02 07:48:16 GMT 55.69693433 -5.65031783 2018-11-02 08:55:24 GMT 55.69539367 -5.6531755 2 NO
3: AU89 2018-11-02 09:03:28 GMT 55.6953905 -5.6531785 2018-11-02 09:44:00 GMT 55.69454683 -5.65567667 3 YES
4: AU89 2018-11-02 10:17:25 GMT 55.6904365 -5.6585925 2018-11-02 11:55:47 GMT 55.6937005 -5.65628133 4 NO
5: AU89 2018-11-05 06:39:12 GMT 55.69103567 -5.658306 2018-11-05 08:33:35 GMT 55.693022 -5.6531755 5 NO
Upvotes: 1
Reputation: 2185
First, in order to compar df1$start_time
and df2$GPSTime
, you need the same type for these two.
df1$start_time <- as.POSIXct(as.character(df1$start_time),format = "%Y-%m-%d %H:%M:%S", tz="GMT")
df1$end_time <- as.POSIXct(as.character(df1$end_time),format = "%Y-%m-%d %H:%M:%S", tz="GMT")
df2$GPSTime <- as.POSIXct(as.character(df2$GPSTime), format="(%d/%m/%y %H:%M:%S)", tz= 'GMT')
Then, you can merge df1
and df2
and compar the different time. Then filter in order to keep the good times.
temp <- df1 %>%
left_join(df2, by = 'vessel_pln') %>%
mutate(BETWEEN = (GPSTime >= start_time & GPSTime < end_time)) %>%
filter(BETWEEN == TRUE)
#filter(Speed > 3)
You can check if it worked, and finally filter to only keep Speed > 3 (I don't do it because I have no Speed > 3 in my example dataset).
temp %>%
filter(activity == 1) %>%
select(start_time, end_time, GPSTime, Speed) %>%
head()
# start_time end_time GPSTime Speed
# 1 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:15:00 0.8461418
# 2 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:20:00 0.8610450
# 3 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:25:00 2.8171262
# 4 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:30:00 1.8165029
# 5 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:35:00 2.0697528
# 6 2018-11-02 05:14:26 2018-11-02 06:54:37 2018-11-02 05:40:00 0.5855299
Upvotes: 0