bibscy
bibscy

Reputation: 2708

Adding missing hours to dataframe in R

I have a data frame where some of the hours in Time GMT are missing.
Normally, the hours should be shown in a sequence from 00:00 to 23:00, but sometimes an hour is missed.

Where an hour is missing in the sequence, I would like to insert a new row.
The new row will be a copy of the previous row, but with the following columns changed as follows:

What have I tried

library(dplyr)
library(tidyr)

class(hrOzone$Time.GMT)   # output type "character"

hrOzone %>%
  group_by(Date.GMT) %>%
  complete(Time.GMT = full_seq(01:24, 1), fill = list(count = 0))

head(hrOzone$Time.GMT, n = 100L)

DataSource url: https://drive.google.com/file/d/1o1voBktR3i8ROt1Hp59OW5t3_uukIA3j/view?usp=sharing

Date GMT Time GMT Sample Measurement MDL

01/03/2016 21:00:00 0.036 0.005
01/03/2016 22:00:00 0.035 0.007
01/03/2016 23:00:00 0.029 0.008
02/03/2016 00:00:00 0.03
02/03/2016 01:00:00 0.01
02/03/2016 02:00:00
02/03/2016 03:00:00
02/03/2016 04:00:00
02/03/2016 05:00:00
02/03/2016 07:00:00
02/03/2016 08:00:00
02/03/2016 09:00:00
02/03/2016 10:00:00
02/03/2016 11:00:00
02/03/2016 12:00:00
02/03/2016 13:00:00
02/03/2016 14:00:00
02/03/2016 16:00:00
02/03/2016 17:00:00
02/03/2016 18:00:00
02/03/2016 19:00:00
02/03/2016 20:00:00
02/03/2016 21:00:00
02/03/2016 22:00:00
02/03/2016 23:00:00
02/03/2016 00:00:00
03/03/2016 01:00:00
03/03/2016 02:00:00
03/03/2016 03:00:00
03/03/2016 04:00:00
03/03/2016 06:00:00
03/03/2016 08:00:00

Update as per Waldi request

> dput(head(hrOzone,20))
structure(list(State.Code = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), County.Code = c(3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L), Site.Num = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L
), Parameter.Code = c(44201L, 44201L, 44201L, 44201L, 44201L, 
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L), POC = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), Latitude = c(30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478), Longitude = c(-87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258), Datum = c("NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83"), Parameter.Name = c("Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone"), Date.Local = c("2016-03-01", "2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02"), Time.Local = c("15:00", 
"16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00", 
"23:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00", 
"07:00", "08:00", "10:00", "11:00", "12:00"), Date.GMT = c("2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02"), Time.GMT = c("21:00", 
"22:00", "23:00", "00:00", "01:00", "02:00", "03:00", "04:00", 
"05:00", "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", 
"13:00", "14:00", "16:00", "17:00", "18:00"), Sample.Measurement = c(0.041, 
0.041, 0.042, 0.041, 0.038, 0.038, 0.036, 0.035, 0.029, 0.026, 
0.03, 0.03, 0.028, 0.027, 0.025, 0.023, 0.025, 0.034, 0.036, 
0.038), Units.of.Measure = c("Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million"
), MDL = c(0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 
0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 
0.005, 0.005, 0.005), Uncertainty = c(NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Qualifier = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), Method.Type = c("FEM", "FEM", "FEM", "FEM", "FEM", 
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", 
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM"), Method.Code = c(47L, 
47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 
47L, 47L, 47L, 47L, 47L, 47L), Method.Name = c("INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET"), State.Name = c("Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"
), County.Name = c("Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin"), Date.of.Last.Change = c("2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20")), row.names = c(NA, 
20L), class = "data.frame")
> 

Upvotes: 3

Views: 337

Answers (4)

Waldi
Waldi

Reputation: 41240

Another variation on linear interpolation:

library(dplyr)
library(purrr)
library(lubridate)

data <- hrOzone %>% select(Date.GMT,Time.GMT,Sample.Measurement) %>%
                    mutate(date  = ymd(Date.GMT) + hm(Time.GMT)) 
             
data.extended <- approx(x = data$date,
                        y = data$Sample.Measurement,
                        xout = seq(from = min(data$date),
                                   to   = max(data$date),
                                   by   = 3600))

map2_dfr(data.extended$x, data.extended$y,
         ~(list(DateTime.Gmt=.x,
                Date.GMT=format(.x,"%Y-%m-%d"),
                Time.GMT=format(.x,"%H:%M"),
                Sample.Measurement=.y))))

# A tibble: 22 x 4
   DateTime.Gmt        Date.GMT   Time.GMT Sample.Measurement
   <dttm>              <chr>      <chr>                 <dbl>
 1 2016-03-01 21:00:00 2016-03-01 21:00                0.041 
 2 2016-03-01 22:00:00 2016-03-01 22:00                0.041 
 3 2016-03-01 23:00:00 2016-03-01 23:00                0.042 
 4 2016-03-02 00:00:00 2016-03-02 00:00                0.041 
 5 2016-03-02 01:00:00 2016-03-02 01:00                0.038 
 6 2016-03-02 02:00:00 2016-03-02 02:00                0.038 
 7 2016-03-02 03:00:00 2016-03-02 03:00                0.036 
 8 2016-03-02 04:00:00 2016-03-02 04:00                0.035 
 9 2016-03-02 05:00:00 2016-03-02 05:00                0.029 
10 2016-03-02 06:00:00 2016-03-02 06:00                0.0275
# ... with 12 more rows

Upvotes: 3

Ronak Shah
Ronak Shah

Reputation: 389155

Here's a tidyverse approach using zoo's na.approx for linear interplotation.

library(dplyr)
library(lubridate)
library(tidyr)
library(zoo)

hrOzone %>%
  select(Date.GMT,Time.GMT,Sample.Measurement, MDL) %>%
  unite(datetime, Date.GMT, Time.GMT, sep = ' ') %>%
  mutate(datetime = ymd_hm(datetime)) %>%
  group_by(date = as.Date(datetime)) %>%
  complete(datetime = seq(min(datetime), max(datetime), 'hour')) %>%
  mutate(across(c(Sample.Measurement, MDL), na.approx)) %>%
  ungroup

For the data shared this returns -

#         date            datetime Sample.Measurement   MDL
#1  2016-03-01 2016-03-01 21:00:00             0.0410 0.005
#2  2016-03-01 2016-03-01 22:00:00             0.0410 0.005
#3  2016-03-01 2016-03-01 23:00:00             0.0420 0.005
#4  2016-03-02 2016-03-02 00:00:00             0.0410 0.005
#5  2016-03-02 2016-03-02 01:00:00             0.0380 0.005
#6  2016-03-02 2016-03-02 02:00:00             0.0380 0.005
#7  2016-03-02 2016-03-02 03:00:00             0.0360 0.005
#8  2016-03-02 2016-03-02 04:00:00             0.0350 0.005
#9  2016-03-02 2016-03-02 05:00:00             0.0290 0.005
#10 2016-03-02 2016-03-02 06:00:00             0.0275 0.005
#11 2016-03-02 2016-03-02 07:00:00             0.0260 0.005
#12 2016-03-02 2016-03-02 08:00:00             0.0300 0.005
#13 2016-03-02 2016-03-02 09:00:00             0.0300 0.005
#14 2016-03-02 2016-03-02 10:00:00             0.0280 0.005
#15 2016-03-02 2016-03-02 11:00:00             0.0270 0.005
#16 2016-03-02 2016-03-02 12:00:00             0.0250 0.005
#17 2016-03-02 2016-03-02 13:00:00             0.0230 0.005
#18 2016-03-02 2016-03-02 14:00:00             0.0250 0.005
#19 2016-03-02 2016-03-02 15:00:00             0.0295 0.005
#20 2016-03-02 2016-03-02 16:00:00             0.0340 0.005
#21 2016-03-02 2016-03-02 17:00:00             0.0360 0.005
#22 2016-03-02 2016-03-02 18:00:00             0.0380 0.005

Upvotes: 1

thelatemail
thelatemail

Reputation: 93908

Linear interpolation FTW, which should also account for the instance where you have consecutive missing values.

dtrng <- range(as.POSIXct(hrOzone$Date.GMT, tz="UTC") + 
         as.difftime(hrOzone$Time.GMT, format="%H:%M"))
dts <- seq(dtrng[1], dtrng[2], by="1 hour")

out <- merge(
  lapply(c(Date.GMT="%Y-%m-%d", Time.GMT="%H:%M"), format, x=dts),
  cbind(hrOzone, orig=1), all.x=TRUE
)

sel <- which(is.na(out$orig))
vars <- c("Sample.Measurement", "MDL")

out[sel, vars] <- lapply(out[vars], function(x) approx(x, xout=sel)$y)

Output:

#     Date.GMT Time.GMT Sample.Measurement   MDL orig
#1  2016-03-01    21:00             0.0410 0.005    1
#2  2016-03-01    22:00             0.0410 0.005    1
#3  2016-03-01    23:00             0.0420 0.005    1
#4  2016-03-02    00:00             0.0410 0.005    1
#5  2016-03-02    01:00             0.0380 0.005    1
#6  2016-03-02    02:00             0.0380 0.005    1
#7  2016-03-02    03:00             0.0360 0.005    1
#8  2016-03-02    04:00             0.0350 0.005    1
#9  2016-03-02    05:00             0.0290 0.005    1
#10 2016-03-02    06:00             0.0275 0.005   NA
#11 2016-03-02    07:00             0.0260 0.005    1
#12 2016-03-02    08:00             0.0300 0.005    1
#13 2016-03-02    09:00             0.0300 0.005    1
#14 2016-03-02    10:00             0.0280 0.005    1
#15 2016-03-02    11:00             0.0270 0.005    1
#16 2016-03-02    12:00             0.0250 0.005    1
#17 2016-03-02    13:00             0.0230 0.005    1
#18 2016-03-02    14:00             0.0250 0.005    1
#19 2016-03-02    15:00             0.0295 0.005   NA
#20 2016-03-02    16:00             0.0340 0.005    1
#21 2016-03-02    17:00             0.0360 0.005    1
#22 2016-03-02    18:00             0.0380 0.005    1

Upvotes: 2

Martin Gal
Martin Gal

Reputation: 16988

You could use tidyverse:

library(dplyr)
library(tidyr)
library(stringr)

hrOzone %>% 
  expand(Date.Local, Time.Local=str_c(str_pad(0:23, 2, "left", "0"), ":00")) %>%
  left_join(hrOzone, by=c("Date.Local", "Time.Local")) %>%
  arrange(Date.Local, Time.Local) %>%
  mutate(Time.GMT = ifelse(is.na(Time.GMT), 
                           str_c(str_pad(as.character((as.integer(str_sub(lead(Time.GMT), 1, 2)) +
                                          as.integer(str_sub(lag(Time.GMT), 1, 2)))/2), 2, "left", "0"), ":00"), 
                           Time.GMT),
         Sample.Measurement = ifelse(is.na(Sample.Measurement), 
                                     (lag(Sample.Measurement) + lead(Sample.Measurement))/2, 
                                     Sample.Measurement),
         MDL = ifelse(is.na(MDL), 
                      (lag(MDL) + lead(MDL))/2, 
                      MDL)) %>%
  fill(everything(), .direction="down")

Upvotes: 2

Related Questions