jakes
jakes

Reputation: 2085

A code to imput missing values with linear dependency

I've got a data.frame with weekly data as below:

structure(list(date = structure(1:9, .Label = c("2017-01-01", 
"2017-01-08", "2017-01-15", "2017-01-22", "2017-01-29", "2017-02-05", 
"2017-02-12", "2017-02-19", "2017-02-26"), class = "factor"), 
    value = c("6", "5", "5", "5", "5", "5", "6", "8", "10")), row.names = c(NA, 
-9L), class = "data.frame")

Now I want to calculate daily data with smoothing that changes the values from day to day linearly with an amount of change equal to one seventh of weekly change. So the expected output would be:

structure(list(date = structure(c(17167, 17168, 17169, 17170, 
17171, 17172, 17173, 17174, 17175, 17176, 17177, 17178, 17179, 
17180, 17181, 17182, 17183, 17184, 17185, 17186, 17187, 17188, 
17189, 17190, 17191, 17192, 17193, 17194, 17195, 17196, 17197, 
17198, 17199, 17200, 17201, 17202, 17203, 17204, 17205, 17206, 
17207, 17208, 17209, 17210, 17211, 17212, 17213, 17214, 17215, 
17216, 17217, 17218, 17219, 17220, 17221, 17222, 17223, 17224, 
17225), class = "Date"), value = c(6, 5.857, 5.714, 5.571, 5.429, 
5.286, 5.143, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5.143, 5.286, 5.429, 5.571, 
5.714, 5.857, 6, 6.286, 6.571, 6.857, 7.143, 7.429, 7.714, 8, 
8.286, 8.571, 8.857, 9.143, 9.429, 9.714, 10, 8.57, 7.14)), row.names = c(NA, 
-59L), class = c("tbl_df", "tbl", "data.frame"))

Please note that last two dates are not within time span from first data frame and they decrease by (10 - 0) / 7 as the difference is calculated from last observed value (10) to 0.

How can I obtain expected output?

Upvotes: 1

Views: 42

Answers (1)

zoowalk
zoowalk

Reputation: 2134

does this help?

library(tidyverse)
library(lubridate)
#> 
#> Attache Paket: 'lubridate'
#> The following object is masked from 'package:base':
#> 
#>     date
library(padr)

mydf<- data.frame(stringsAsFactors=FALSE,
          index = c(1, 2, 3, 4, 5, 6, 7, 8, 9),
          date = c("01.01.2017", "08.01.2017", "15.01.2017", "22.01.2017",
                 "29.01.2017", "05.02.2017", "12.02.2017", "19.02.2017",
                 "26.02.2017"),
          value = c(6, 5, 5, 5, 5, 5, 6, 8, 10)
)


mydf %>% 
  mutate(date=lubridate::dmy(date)) %>% 
  pad(.,interval="day", end_val=lubridate::dmy("28.02.2017")) %>% 
  mutate(value2=approx(date,value,date)$y)
#>    index       date value    value2
#> 1      1 2017-01-01     6  6.000000
#> 2     NA 2017-01-02    NA  5.857143
#> 3     NA 2017-01-03    NA  5.714286
#> 4     NA 2017-01-04    NA  5.571429
#> 5     NA 2017-01-05    NA  5.428571
#> 6     NA 2017-01-06    NA  5.285714
#> 7     NA 2017-01-07    NA  5.142857
#> 8      2 2017-01-08     5  5.000000
#> 9     NA 2017-01-09    NA  5.000000
#> 10    NA 2017-01-10    NA  5.000000
#> 11    NA 2017-01-11    NA  5.000000
#> 12    NA 2017-01-12    NA  5.000000
#> 13    NA 2017-01-13    NA  5.000000
#> 14    NA 2017-01-14    NA  5.000000
#> 15     3 2017-01-15     5  5.000000
#> 16    NA 2017-01-16    NA  5.000000
#> 17    NA 2017-01-17    NA  5.000000
#> 18    NA 2017-01-18    NA  5.000000
#> 19    NA 2017-01-19    NA  5.000000
#> 20    NA 2017-01-20    NA  5.000000
#> 21    NA 2017-01-21    NA  5.000000
#> 22     4 2017-01-22     5  5.000000
#> 23    NA 2017-01-23    NA  5.000000
#> 24    NA 2017-01-24    NA  5.000000
#> 25    NA 2017-01-25    NA  5.000000
#> 26    NA 2017-01-26    NA  5.000000
#> 27    NA 2017-01-27    NA  5.000000
#> 28    NA 2017-01-28    NA  5.000000
#> 29     5 2017-01-29     5  5.000000
#> 30    NA 2017-01-30    NA  5.000000
#> 31    NA 2017-01-31    NA  5.000000
#> 32    NA 2017-02-01    NA  5.000000
#> 33    NA 2017-02-02    NA  5.000000
#> 34    NA 2017-02-03    NA  5.000000
#> 35    NA 2017-02-04    NA  5.000000
#> 36     6 2017-02-05     5  5.000000
#> 37    NA 2017-02-06    NA  5.142857
#> 38    NA 2017-02-07    NA  5.285714
#> 39    NA 2017-02-08    NA  5.428571
#> 40    NA 2017-02-09    NA  5.571429
#> 41    NA 2017-02-10    NA  5.714286
#> 42    NA 2017-02-11    NA  5.857143
#> 43     7 2017-02-12     6  6.000000
#> 44    NA 2017-02-13    NA  6.285714
#> 45    NA 2017-02-14    NA  6.571429
#> 46    NA 2017-02-15    NA  6.857143
#> 47    NA 2017-02-16    NA  7.142857
#> 48    NA 2017-02-17    NA  7.428571
#> 49    NA 2017-02-18    NA  7.714286
#> 50     8 2017-02-19     8  8.000000
#> 51    NA 2017-02-20    NA  8.285714
#> 52    NA 2017-02-21    NA  8.571429
#> 53    NA 2017-02-22    NA  8.857143
#> 54    NA 2017-02-23    NA  9.142857
#> 55    NA 2017-02-24    NA  9.428571
#> 56    NA 2017-02-25    NA  9.714286
#> 57     9 2017-02-26    10 10.000000

edited version to include jakes comment:

mydf <- mydf %>% 
  mutate(date=lubridate::dmy(date))

end_row <- tibble(date=max(mydf$date)+7,value=0)

bind_rows(mydf, end_row) %>% 
  pad(.,interval="day") %>%  
  mutate(value2=approx(date,value,date)$y) %>% 
  filter(date<dmy("28.02.2017")+1)

Upvotes: 2

Related Questions