Reputation: 522

Add date points between separate dates in a dataframe and create blanks (NA) in the other columns where those new rows were created in R

This is what my data looks like:

> dput(head(h01_NDVI_specveg_data_spectra,6))
structure(list(ID = c("h01", "h01", "h01", "h01", "h01", "h01"
), collection_date = structure(c(15076, 15092, 15125, 15139, 
15159, 15170), class = "Date"), NDVI = c(0.581769436997319, 0.539445628997868, 
0.338541666666667, 0.302713987473904, 0.305882352941176, 0.269439421338155
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

I have separate dates with no regular interval between them, as you can see in the example (e.g. 2011-04-12, 2011-04-28, 2011-05-31, ...). What I want is to insert the missing dates between the dates that I have. As a consequence, I also want new rows to be created in the other columns, with NDVI set to NA in those new rows.

Check this example of the desired output:

ID	collection_date	NDVI
h01	2011-04-12	0.5817694
h01	2011-04-13	NA
h01	2011-04-14	NA
h01	2011-04-15	NA
h01	2011-04-16	NA
h01	2011-04-17	NA
h01	2011-04-18	NA
h01	2011-04-19	NA
h01	2011-04-20	NA
h01	2011-04-21	NA
h01	2011-04-22	NA
h01	2011-04-23	NA
h01	2011-04-24	NA
h01	2011-04-25	NA
h01	2011-04-26	NA
h01	2011-04-27	NA
h01	2011-04-28	0.5394456
h01	2011-04-29	NA
h01	2011-04-30	NA
...	..........	..

Any help will be much appreciated.

Upvotes: 1

Answers (3)

Ronak Shah

Reputation: 389235

You may use tidyr::complete -

library(dplyr)
library(tidyr)

# For every ID, expand collection_date to a full daily sequence running from
# that ID's earliest to its latest observation. Rows created for dates that
# were not in the data get NA in the remaining columns (here: NDVI).
df %>%
  group_by(ID) %>%
  complete(
    collection_date = seq(
      from = min(collection_date),
      to = max(collection_date),
      by = "days"
    )
  ) %>%
  ungroup()

#   ID    collection_date   NDVI
#   <chr> <date>           <dbl>
# 1 h01   2011-04-12       0.582
# 2 h01   2011-04-13      NA    
# 3 h01   2011-04-14      NA    
# 4 h01   2011-04-15      NA    
# 5 h01   2011-04-16      NA    
# 6 h01   2011-04-17      NA    
# 7 h01   2011-04-18      NA    
# 8 h01   2011-04-19      NA    
# 9 h01   2011-04-20      NA    
#10 h01   2011-04-21      NA    
#11 h01   2011-04-22      NA    
#12 h01   2011-04-23      NA    
#13 h01   2011-04-24      NA    
#14 h01   2011-04-25      NA    
#15 h01   2011-04-26      NA    
#16 h01   2011-04-27      NA    
#17 h01   2011-04-28       0.539
#18 h01   2011-04-29      NA    
#19 h01   2011-04-30      NA    
#20 h01   2011-05-01      NA  
#...
#...

The benefit of this approach is that it fills in the missing dates between the minimum and maximum date separately for each ID.

Upvotes: 1

Marek Fiołka

Reputation: 4949

library(tidyverse)
library(lubridate)

# Example data from the question: six NDVI observations for ID "h01".
df <- structure(
  list(
    ID = rep("h01", 6L),
    collection_date = structure(
      c(15076, 15092, 15125, 15139, 15159, 15170),
      class = "Date"
    ),
    NDVI = c(
      0.581769436997319, 0.539445628997868, 0.338541666666667,
      0.302713987473904, 0.305882352941176, 0.269439421338155
    )
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Build one row per calendar day over a fixed window (2011-04-10 to
# 2011-07-16), then attach the observed NDVI values. Days with no matching
# observation keep NA in NDVI.
df2 <- left_join(
  tibble(
    ID = "h01",
    collection_date = seq(ymd("2011-04-10"), ymd("2011-07-16"), by = 1)
  ),
  df,
  by = c("ID", "collection_date")
)

head(df2, 10)

output

# A tibble: 98 x 3
   ID    collection_date   NDVI
   <chr> <date>           <dbl>
 1 h01   2011-04-10      NA    
 2 h01   2011-04-11      NA    
 3 h01   2011-04-12       0.582
 4 h01   2011-04-13      NA    
 5 h01   2011-04-14      NA    
 6 h01   2011-04-15      NA    
 7 h01   2011-04-16      NA    
 8 h01   2011-04-17      NA    
 9 h01   2011-04-18      NA    
10 h01   2011-04-19      NA    
# ... with 88 more rows

Output of `df2 %>% tail(10)`:

# A tibble: 10 x 3
   ID    collection_date   NDVI
   <chr> <date>           <dbl>
 1 h01   2011-07-07      NA    
 2 h01   2011-07-08      NA    
 3 h01   2011-07-09      NA    
 4 h01   2011-07-10      NA    
 5 h01   2011-07-11      NA    
 6 h01   2011-07-12      NA    
 7 h01   2011-07-13      NA    
 8 h01   2011-07-14      NA    
 9 h01   2011-07-15       0.269
10 h01   2011-07-16      NA

Upvotes: 1

dario

Reputation: 6483

# Example data from the question as a plain data.frame: six NDVI
# observations for ID "h01", with collection_date stored as Date.
df1 <- structure(
  list(
    ID = rep("h01", 6L),
    collection_date = structure(
      c(15076, 15092, 15125, 15139, 15159, 15170),
      class = "Date"
    ),
    NDVI = c(
      0.581769436997319, 0.539445628997868, 0.338541666666667,
      0.302713987473904, 0.305882352941176, 0.269439421338155
    )
  ),
  row.names = c(NA, -6L),
  class = "data.frame"
)

We create a data.frame containing all dates and use dplyr::left_join to join it with the existing (incomplete) data. The NAs are created automatically.

library(dplyr)
library(tidyr)

# Build one row per calendar day between the first and last observation,
# then attach the existing (incomplete) data. Days without an observation
# get NA in every column that comes from df1 (including ID).
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  )
) %>%
  # Name the join key explicitly: a bare left_join(df1) joins on every
  # shared column name and prints a "Joining ..." message.
  left_join(df1, by = "collection_date") %>%
  arrange(collection_date) %>%
  # Put ID first, then the date, then whatever other columns df1 has.
  select(ID, collection_date, everything())

Returns:

     ID collection_date      NDVI
1   h01      2011-04-12 0.5817694
2  <NA>      2011-04-13        NA
3  <NA>      2011-04-14        NA
4  <NA>      2011-04-15        NA
5  <NA>      2011-04-16        NA
6  <NA>      2011-04-17        NA
7  <NA>      2011-04-18        NA
8  <NA>      2011-04-19        NA
9  <NA>      2011-04-20        NA
10 <NA>      2011-04-21        NA
11 <NA>      2011-04-22        NA
12 <NA>      2011-04-23        NA
13 <NA>      2011-04-24        NA
14 <NA>      2011-04-25        NA
15 <NA>      2011-04-26        NA
16 <NA>      2011-04-27        NA
17  h01      2011-04-28 0.5394456
18 <NA>      2011-04-29        NA
19 <NA>      2011-04-30        NA
20 <NA>      2011-05-01        NA
21 <NA>      2011-05-02        NA
22 <NA>      2011-05-03        NA
23 <NA>      2011-05-04        NA
24 <NA>      2011-05-05        NA
25 <NA>      2011-05-06        NA
26 <NA>      2011-05-07        NA
27 <NA>      2011-05-08        NA
28 <NA>      2011-05-09        NA
29 <NA>      2011-05-10        NA
30 <NA>      2011-05-11        NA
31 <NA>      2011-05-12        NA
32 <NA>      2011-05-13        NA
33 <NA>      2011-05-14        NA
34 <NA>      2011-05-15        NA
35 <NA>      2011-05-16        NA
36 <NA>      2011-05-17        NA
37 <NA>      2011-05-18        NA
38 <NA>      2011-05-19        NA
39 <NA>      2011-05-20        NA
40 <NA>      2011-05-21        NA
41 <NA>      2011-05-22        NA
42 <NA>      2011-05-23        NA
43 <NA>      2011-05-24        NA
44 <NA>      2011-05-25        NA
45 <NA>      2011-05-26        NA
46 <NA>      2011-05-27        NA
47 <NA>      2011-05-28        NA
48 <NA>      2011-05-29        NA
49 <NA>      2011-05-30        NA
50  h01      2011-05-31 0.3385417
51 <NA>      2011-06-01        NA
52 <NA>      2011-06-02        NA
53 <NA>      2011-06-03        NA
54 <NA>      2011-06-04        NA
55 <NA>      2011-06-05        NA
56 <NA>      2011-06-06        NA
57 <NA>      2011-06-07        NA
58 <NA>      2011-06-08        NA
59 <NA>      2011-06-09        NA
60 <NA>      2011-06-10        NA
61 <NA>      2011-06-11        NA
62 <NA>      2011-06-12        NA
63 <NA>      2011-06-13        NA
64  h01      2011-06-14 0.3027140
65 <NA>      2011-06-15        NA
66 <NA>      2011-06-16        NA
67 <NA>      2011-06-17        NA
68 <NA>      2011-06-18        NA
69 <NA>      2011-06-19        NA
70 <NA>      2011-06-20        NA
71 <NA>      2011-06-21        NA
72 <NA>      2011-06-22        NA
73 <NA>      2011-06-23        NA
74 <NA>      2011-06-24        NA
75 <NA>      2011-06-25        NA
76 <NA>      2011-06-26        NA
77 <NA>      2011-06-27        NA
78 <NA>      2011-06-28        NA
79 <NA>      2011-06-29        NA
80 <NA>      2011-06-30        NA
81 <NA>      2011-07-01        NA
82 <NA>      2011-07-02        NA
83 <NA>      2011-07-03        NA
84  h01      2011-07-04 0.3058824
85 <NA>      2011-07-05        NA
86 <NA>      2011-07-06        NA
87 <NA>      2011-07-07        NA
88 <NA>      2011-07-08        NA
89 <NA>      2011-07-09        NA
90 <NA>      2011-07-10        NA
91 <NA>      2011-07-11        NA
92 <NA>      2011-07-12        NA
93 <NA>      2011-07-13        NA
94 <NA>      2011-07-14        NA
95  h01      2011-07-15 0.2694394

Edit:

In order to have ID = "h01" everywhere we just add it to the constructed data.frame. I.e.:

library(dplyr)
library(tidyr)

# Build one row per calendar day between the first and last observation,
# carrying the constant ID "h01" on every row, then attach the existing
# (incomplete) data. Days without an observation get NA in NDVI only.
data.frame(
  collection_date = seq.Date(
    min(df1$collection_date),
    max(df1$collection_date),
    by = "days"
  ),
  ID = "h01"
) %>%
  # Name both join keys explicitly: a bare left_join(df1) joins on every
  # shared column name and prints a "Joining ..." message.
  left_join(df1, by = c("collection_date", "ID")) %>%
  arrange(collection_date) %>%
  # Put ID first, then the date, then whatever other columns df1 has.
  select(ID, collection_date, everything())

Upvotes: 1

Add date points between separate dates in a dataframe and create blanks (NA) in the other columns where those new rows were created in R

Answers (3)

Edit:

Related Questions