LEE
LEE

Reputation: 328

how to fill the NA values by using known formula from another dataframe in r

I have a dataframe named 'test1' like below, (here 'day' are "POSIXt" objects)

 day                     Rain      SWC_11    SWC_12    SWC_13    SWC_14   SWC_21   
01/01/2019  00:00:00     0.0        51    60      63         60        64 
02/01/2019  00:00:00     0.2        51.5      60.3      63.4     60.8      64.4
03/01/2019  00:00:00     0.0        51.3      60.3      63.3     60.6      64.1 
04/01/2019  00:00:00     0.4        NA        NA        NA       NA        NA   
05/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA
06/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA
07/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA
08/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA
09/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA
10/01/2019  00:00:00     0.0        NA        NA        NA       NA        NA

And another dataframe named 'test2', like below

    SWC_11_(Intercept)  SWC_11_slope  SWC_12_(Intercept)  SWC_12_slope  SWC_13_(Intercept)  SWC_13_slope  SWC_14_(Intercept)  SWC_14_slope  SWC_21(Intercept)  SWC_21_slope
    10471.95            -6.563423e-06    4063.32          -2.525118e-06     75040.76        -4.726106e-05        7742.763    -4.842427e-06     22965.85       -1.443707e-05

What I want to do now is fill in the missing (NA) values with the corresponding coefficients. I would have a model like this:

missing variables of SWC_11= SWC_11_(Intercept) + SWC_11_slope*day
missing variables of SWC_12= SWC_12_(Intercept)+ SWC_12_slope*day

Other columns are in the same manner. I think here sapply function should help,

 test1<- data.frame(sapply(test2, function(x) )))

But now I was kind of confused about how to write the function part then. Hope someone could help. Thanks.

Upvotes: 0

Views: 58

Answers (2)

Ronak Shah
Ronak Shah

Reputation: 388862

Conceptually this one is similar to @Duck's solution but maybe in fewer number of steps.

library(dplyr)
library(tidyr)
library(lubridate)

test2 %>%
  #Get the data in long format with SWC number
  pivot_longer(cols = everything(), names_to = c('name', '.value'), 
               names_pattern = '(SWC_\\d+).*(slope|Intercept)') %>%
  #Join the data with test1
  right_join(test1 %>% pivot_longer(cols = contains('SWC')), by = 'name') %>% 
  #Select first non-NA value between value and val
  mutate(value = coalesce(value, Intercept + slope * day(day))) %>%
  select(-Intercept, -slope) %>%
  #Get the data in wide format
  pivot_wider()

# A tibble: 10 x 7
#   day                  Rain  SWC_11 SWC_12  SWC_13 SWC_14  SWC_21
#   <dttm>              <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
# 1 2019-01-01 00:00:00   0      51     60      63     60      64  
# 2 2019-01-02 00:00:00   0.2    51.5   60.3    63.4   60.8    64.4
# 3 2019-01-03 00:00:00   0      51.3   60.3    63.3   60.6    64.1
# 4 2019-01-04 00:00:00   0.4 10472.  4063.  75041.  7743.  22966. 
# 5 2019-01-05 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
# 6 2019-01-06 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
# 7 2019-01-07 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
# 8 2019-01-08 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
# 9 2019-01-09 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
#10 2019-01-10 00:00:00   0   10472.  4063.  75041.  7743.  22966.

Upvotes: 0

Duck
Duck

Reputation: 39595

I would suggest a tidyverse approach where you reshape your data and then merge in order to compute the values for the missing variables. I am not clear about the day so what I did is extract the day from your date variable but you can change that if it is necessary. You have to do some cleaning steps for your variable names but all is in the code. Here the solution:

library(tidyverse)
#First format test2
test2 %>% pivot_longer(everything()) %>%
  #Mutate for cleaning
  mutate(name2=ifelse(grepl('Intercept',name),'Intercept','slope')) %>%
  mutate(name=gsub('Intercept|slope','',name),name=substr(name,1,6)) %>%
  #format to wide
  pivot_wider(names_from = name2,values_from=value) %>%
  #Left join with original test 1 in long format
  left_join(
    test1 %>% pivot_longer(-c(day,Rain)) %>%
      #Format date to extract days
      mutate(Day=as.numeric(format(as.Date(day,'%d/%m/%Y'),'%d')))) %>%
  #Compute new values
  mutate(value2=ifelse(is.na(value),Intercept+slope*Day,value)) %>%
  select(name,day,Rain,value2) %>%
  pivot_wider(names_from = name,values_from=value2)

Output:

# A tibble: 10 x 7
   day                  Rain  SWC_11 SWC_12  SWC_13 SWC_14  SWC_21
   <chr>               <dbl>   <dbl>  <dbl>   <dbl>  <dbl>   <dbl>
 1 01/01/2019 00:00:00   0      51     60      63     60      64  
 2 02/01/2019 00:00:00   0.2    51.5   60.3    63.4   60.8    64.4
 3 03/01/2019 00:00:00   0      51.3   60.3    63.3   60.6    64.1
 4 04/01/2019 00:00:00   0.4 10472.  4063.  75041.  7743.  22966. 
 5 05/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
 6 06/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
 7 07/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
 8 08/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
 9 09/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 
10 10/01/2019 00:00:00   0   10472.  4063.  75041.  7743.  22966. 

Some data used:

#Data 1
test1 <- structure(list(day = c("01/01/2019 00:00:00", "02/01/2019 00:00:00", 
"03/01/2019 00:00:00", "04/01/2019 00:00:00", "05/01/2019 00:00:00", 
"06/01/2019 00:00:00", "07/01/2019 00:00:00", "08/01/2019 00:00:00", 
"09/01/2019 00:00:00", "10/01/2019 00:00:00"), Rain = c(0, 0.2, 
0, 0.4, 0, 0, 0, 0, 0, 0), SWC_11 = c(51, 51.5, 51.3, NA, NA, 
NA, NA, NA, NA, NA), SWC_12 = c(60, 60.3, 60.3, NA, NA, NA, NA, 
NA, NA, NA), SWC_13 = c(63, 63.4, 63.3, NA, NA, NA, NA, NA, NA, 
NA), SWC_14 = c(60, 60.8, 60.6, NA, NA, NA, NA, NA, NA, NA), 
    SWC_21 = c(64, 64.4, 64.1, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 
-10L), class = "data.frame")

#Data2
test2 <- structure(list(SWC_11_.Intercept. = 10471.95, SWC_11_slope = -6.563423e-06, 
    SWC_12_.Intercept. = 4063.32, SWC_12_slope = -2.525118e-06, 
    SWC_13_.Intercept. = 75040.76, SWC_13_slope = -4.726106e-05, 
    SWC_14_.Intercept. = 7742.763, SWC_14_slope = -4.842427e-06, 
    SWC_21.Intercept. = 22965.85, SWC_21_slope = -1.443707e-05), class = "data.frame", row.names = c(NA, 
-1L))

Upvotes: 1

Related Questions