Reputation: 2708
I have a data frame where some of the hours in Time GMT
are missing.
Normally, the hours should be shown in a sequence from 00:00 to 23:00, but sometimes an hour is missed.
Where an hour is missing in the sequence, I would like to insert a new row.
The new row will be a copy of the previous row, but with the following columns changed as follows:
Time GMT
: will contain the next hour of the previous row. i.e, if previous == 5:00, new == 6:00
Sample Measurement
: will contain the average between the previous value and the next value in Sample Measurement column.
MDL
: will contain the average between the previous value and the next value in column MDL
What have I tried
library(dplyr)
library(tidyr)
class(hrOzone$Time.GMT) # output type "character"
hrOzone %>%
group_by(Date.GMT) %>%
complete(Time.GMT = full_seq(01:24, 1), fill = list(count = 0))
head(hrOzone$Time.GMT, n = 100L)
DataSource url: https://drive.google.com/file/d/1o1voBktR3i8ROt1Hp59OW5t3_uukIA3j/view?usp=sharing
Date GMT Time GMT Sample Measurement MDL
01/03/2016 21:00:00 0.036 0.005
01/03/2016 22:00:00 0.035 0.007
01/03/2016 23:00:00 0.029 0.008
02/03/2016 00:00:00 0.03
02/03/2016 01:00:00 0.01
02/03/2016 02:00:00
02/03/2016 03:00:00
02/03/2016 04:00:00
02/03/2016 05:00:00
02/03/2016 07:00:00
02/03/2016 08:00:00
02/03/2016 09:00:00
02/03/2016 10:00:00
02/03/2016 11:00:00
02/03/2016 12:00:00
02/03/2016 13:00:00
02/03/2016 14:00:00
02/03/2016 16:00:00
02/03/2016 17:00:00
02/03/2016 18:00:00
02/03/2016 19:00:00
02/03/2016 20:00:00
02/03/2016 21:00:00
02/03/2016 22:00:00
02/03/2016 23:00:00
02/03/2016 00:00:00
03/03/2016 01:00:00
03/03/2016 02:00:00
03/03/2016 03:00:00
03/03/2016 04:00:00
03/03/2016 06:00:00
03/03/2016 08:00:00
Update as per Waldi request
> dput(head(hrOzone,20))
structure(list(State.Code = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), County.Code = c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), Site.Num = c(10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L
), Parameter.Code = c(44201L, 44201L, 44201L, 44201L, 44201L,
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L,
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L), POC = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), Latitude = c(30.497478, 30.497478, 30.497478, 30.497478,
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478,
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478,
30.497478, 30.497478, 30.497478, 30.497478), Longitude = c(-87.880258,
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258,
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258,
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258,
-87.880258), Datum = c("NAD83", "NAD83", "NAD83", "NAD83", "NAD83",
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83",
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83",
"NAD83"), Parameter.Name = c("Ozone", "Ozone", "Ozone", "Ozone",
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone",
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone",
"Ozone", "Ozone"), Date.Local = c("2016-03-01", "2016-03-01",
"2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01",
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02",
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02",
"2016-03-02", "2016-03-02", "2016-03-02"), Time.Local = c("15:00",
"16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00",
"23:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00",
"07:00", "08:00", "10:00", "11:00", "12:00"), Date.GMT = c("2016-03-01",
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02",
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02",
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02",
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02"), Time.GMT = c("21:00",
"22:00", "23:00", "00:00", "01:00", "02:00", "03:00", "04:00",
"05:00", "07:00", "08:00", "09:00", "10:00", "11:00", "12:00",
"13:00", "14:00", "16:00", "17:00", "18:00"), Sample.Measurement = c(0.041,
0.041, 0.042, 0.041, 0.038, 0.038, 0.036, 0.035, 0.029, 0.026,
0.03, 0.03, 0.028, 0.027, 0.025, 0.023, 0.025, 0.034, 0.036,
0.038), Units.of.Measure = c("Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million",
"Parts per million", "Parts per million", "Parts per million"
), MDL = c(0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005,
0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005,
0.005, 0.005, 0.005), Uncertainty = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Qualifier = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", ""), Method.Type = c("FEM", "FEM", "FEM", "FEM", "FEM",
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM",
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM"), Method.Code = c(47L,
47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L,
47L, 47L, 47L, 47L, 47L, 47L), Method.Name = c("INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET",
"INSTRUMENTAL - ULTRA VIOLET"), State.Name = c("Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"
), County.Name = c("Baldwin", "Baldwin", "Baldwin", "Baldwin",
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin",
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin",
"Baldwin", "Baldwin", "Baldwin", "Baldwin"), Date.of.Last.Change = c("2016-06-20",
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20",
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20",
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20",
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20")), row.names = c(NA,
20L), class = "data.frame")
>
Upvotes: 3
Views: 337
Reputation: 41240
Another variation on linear interpolation:
library(dplyr)
library(purrr)
library(lubridate)
data <- hrOzone %>% select(Date.GMT,Time.GMT,Sample.Measurement) %>%
mutate(date = ymd(Date.GMT) + hm(Time.GMT))
data.extended <- approx(x = data$date,
y = data$Sample.Measurement,
xout = seq(from = min(data$date),
to = max(data$date),
by = 3600))
map2_dfr(data.extended$x, data.extended$y,
~(list(DateTime.Gmt=.x,
Date.GMT=format(.x,"%Y-%m-%d"),
Time.GMT=format(.x,"%H:%M"),
Sample.Measurement=.y))))
# A tibble: 22 x 4
DateTime.Gmt Date.GMT Time.GMT Sample.Measurement
<dttm> <chr> <chr> <dbl>
1 2016-03-01 21:00:00 2016-03-01 21:00 0.041
2 2016-03-01 22:00:00 2016-03-01 22:00 0.041
3 2016-03-01 23:00:00 2016-03-01 23:00 0.042
4 2016-03-02 00:00:00 2016-03-02 00:00 0.041
5 2016-03-02 01:00:00 2016-03-02 01:00 0.038
6 2016-03-02 02:00:00 2016-03-02 02:00 0.038
7 2016-03-02 03:00:00 2016-03-02 03:00 0.036
8 2016-03-02 04:00:00 2016-03-02 04:00 0.035
9 2016-03-02 05:00:00 2016-03-02 05:00 0.029
10 2016-03-02 06:00:00 2016-03-02 06:00 0.0275
# ... with 12 more rows
Upvotes: 3
Reputation: 389155
Here's a tidyverse
approach using zoo
's na.approx
for linear interplotation.
library(dplyr)
library(lubridate)
library(tidyr)
library(zoo)
hrOzone %>%
select(Date.GMT,Time.GMT,Sample.Measurement, MDL) %>%
unite(datetime, Date.GMT, Time.GMT, sep = ' ') %>%
mutate(datetime = ymd_hm(datetime)) %>%
group_by(date = as.Date(datetime)) %>%
complete(datetime = seq(min(datetime), max(datetime), 'hour')) %>%
mutate(across(c(Sample.Measurement, MDL), na.approx)) %>%
ungroup
For the data shared this returns -
# date datetime Sample.Measurement MDL
#1 2016-03-01 2016-03-01 21:00:00 0.0410 0.005
#2 2016-03-01 2016-03-01 22:00:00 0.0410 0.005
#3 2016-03-01 2016-03-01 23:00:00 0.0420 0.005
#4 2016-03-02 2016-03-02 00:00:00 0.0410 0.005
#5 2016-03-02 2016-03-02 01:00:00 0.0380 0.005
#6 2016-03-02 2016-03-02 02:00:00 0.0380 0.005
#7 2016-03-02 2016-03-02 03:00:00 0.0360 0.005
#8 2016-03-02 2016-03-02 04:00:00 0.0350 0.005
#9 2016-03-02 2016-03-02 05:00:00 0.0290 0.005
#10 2016-03-02 2016-03-02 06:00:00 0.0275 0.005
#11 2016-03-02 2016-03-02 07:00:00 0.0260 0.005
#12 2016-03-02 2016-03-02 08:00:00 0.0300 0.005
#13 2016-03-02 2016-03-02 09:00:00 0.0300 0.005
#14 2016-03-02 2016-03-02 10:00:00 0.0280 0.005
#15 2016-03-02 2016-03-02 11:00:00 0.0270 0.005
#16 2016-03-02 2016-03-02 12:00:00 0.0250 0.005
#17 2016-03-02 2016-03-02 13:00:00 0.0230 0.005
#18 2016-03-02 2016-03-02 14:00:00 0.0250 0.005
#19 2016-03-02 2016-03-02 15:00:00 0.0295 0.005
#20 2016-03-02 2016-03-02 16:00:00 0.0340 0.005
#21 2016-03-02 2016-03-02 17:00:00 0.0360 0.005
#22 2016-03-02 2016-03-02 18:00:00 0.0380 0.005
Upvotes: 1
Reputation: 93908
Linear interpolation FTW, which should also account for the instance where you have consecutive missing values.
dtrng <- range(as.POSIXct(hrOzone$Date.GMT, tz="UTC") +
as.difftime(hrOzone$Time.GMT, format="%H:%M"))
dts <- seq(dtrng[1], dtrng[2], by="1 hour")
out <- merge(
lapply(c(Date.GMT="%Y-%m-%d", Time.GMT="%H:%M"), format, x=dts),
cbind(hrOzone, orig=1), all.x=TRUE
)
sel <- which(is.na(out$orig))
vars <- c("Sample.Measurement", "MDL")
out[sel, vars] <- lapply(out[vars], function(x) approx(x, xout=sel)$y)
Output:
# Date.GMT Time.GMT Sample.Measurement MDL orig
#1 2016-03-01 21:00 0.0410 0.005 1
#2 2016-03-01 22:00 0.0410 0.005 1
#3 2016-03-01 23:00 0.0420 0.005 1
#4 2016-03-02 00:00 0.0410 0.005 1
#5 2016-03-02 01:00 0.0380 0.005 1
#6 2016-03-02 02:00 0.0380 0.005 1
#7 2016-03-02 03:00 0.0360 0.005 1
#8 2016-03-02 04:00 0.0350 0.005 1
#9 2016-03-02 05:00 0.0290 0.005 1
#10 2016-03-02 06:00 0.0275 0.005 NA
#11 2016-03-02 07:00 0.0260 0.005 1
#12 2016-03-02 08:00 0.0300 0.005 1
#13 2016-03-02 09:00 0.0300 0.005 1
#14 2016-03-02 10:00 0.0280 0.005 1
#15 2016-03-02 11:00 0.0270 0.005 1
#16 2016-03-02 12:00 0.0250 0.005 1
#17 2016-03-02 13:00 0.0230 0.005 1
#18 2016-03-02 14:00 0.0250 0.005 1
#19 2016-03-02 15:00 0.0295 0.005 NA
#20 2016-03-02 16:00 0.0340 0.005 1
#21 2016-03-02 17:00 0.0360 0.005 1
#22 2016-03-02 18:00 0.0380 0.005 1
Upvotes: 2
Reputation: 16988
You could use tidyverse
:
library(dplyr)
library(tidyr)
library(stringr)
hrOzone %>%
expand(Date.Local, Time.Local=str_c(str_pad(0:23, 2, "left", "0"), ":00")) %>%
left_join(hrOzone, by=c("Date.Local", "Time.Local")) %>%
arrange(Date.Local, Time.Local) %>%
mutate(Time.GMT = ifelse(is.na(Time.GMT),
str_c(str_pad(as.character((as.integer(str_sub(lead(Time.GMT), 1, 2)) +
as.integer(str_sub(lag(Time.GMT), 1, 2)))/2), 2, "left", "0"), ":00"),
Time.GMT),
Sample.Measurement = ifelse(is.na(Sample.Measurement),
(lag(Sample.Measurement) + lead(Sample.Measurement))/2,
Sample.Measurement),
MDL = ifelse(is.na(MDL),
(lag(MDL) + lead(MDL))/2,
MDL)) %>%
fill(everything(), .direction="down")
Upvotes: 2