chipsin
chipsin

Reputation: 675

Automated interpolation of missing cells

I am trying to sort a variable containing NA values by another column, then perform interpolation on the variable containing the NA values using the imputeTS::na_interpolation function. I can do this on a column by column basis, but I am struggling to automate this process across multiple columns. An example dataset is below. I have Var_A with NA values, and column A_rank which I would like to sort by before performing the interpolation on Var_A.

df <- data.frame("ID" = 1:16)
df$Var_A <- c(NA_real_,NA_real_,NA_real_,1,2,1,3,NA_real_,4,1,5,14,NA_real_,NA_real_,NA_real_,NA_real_)
df$Var_B <- c(10,NA_real_,NA_real_,0,12,12,NA_real_,12,0,14,NA_real_,14,16,16,16,16)
df$Var_C <- c(10,12,14,NA_real_,10,12,14,16,10,12,14,16,10,NA_real_,14,16)
df$Var_D <- c(10,12,14,16,10,12,NA_real_,16,10,12,14,16,10,12,14,NA_real_)
df$A_rank <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)
df$B_rank <- c(7,9,10,8,11,12,14,13,15,4,5,3,6,2,16,1)
df$C_rank <- c(1,12,2,16,3,4,5,15,6,7,9,8,10,11,13,14)
df$D_rank <- c(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)
df

   ID Var_A Var_B Var_C Var_D A_rank B_rank C_rank D_rank
1   1    NA    10    10    10      1      7      1     16
2   2    NA    NA    12    12      2      9     12     15
3   3    NA    NA    14    14      3     10      2     14
4   4     1     0    NA    16      4      8     16     13
5   5     2    12    10    10      5     11      3     12
6   6     1    12    12    12      6     12      4     11
7   7     3    NA    14    NA      7     14      5     10
8   8    NA    12    16    16      8     13     15      9
9   9     4     0    10    10      9     15      6      8
10 10     1    14    12    12     10      4      7      7
11 11     5    NA    14    14     11      5      9      6
12 12    14    14    16    16     12      3      8      5
13 13    NA    16    10    10     13      6     10      4
14 14    NA    16    NA    12     14      2     11      3
15 15    NA    16    14    14     15     16     13      2
16 16    NA    16    16    NA     16      1     14      1

I can do this on a column by column basis using the code below, but I have two problems. The first is that this is a manual task and I would preferably like to work with an automated solution as I have numerous columns to work on. Below I have also included an attempt to automate this sorting and interpolating methods in lapply which worked, but failed when I extended to include an arrange statement at the end. The second problem I have is is that I only want to apply the interpolation between values (i.e. I do not want to extrapolate).

df <- df %>% arrange(A_rank) 
df <- df %>% mutate(Var_A_new = na_interpolation(Var_A, option = "linear"))
df <- df %>% arrange(B_rank) 
df <- df %>% mutate(Var_B_new = na_interpolation(Var_B, option = "linear"))
df <- df %>% arrange(C_rank) 
df <- df %>% mutate(Var_C_new = na_interpolation(Var_C, option = "linear"))
df <- df %>% arrange(D_rank) 
df <- df %>% mutate(Var_D_new = na_interpolation(Var_D, option = "linear"))

   ID Var_A Var_B Var_C Var_D A_rank B_rank C_rank D_rank Var_A_new Var_B_new Var_C_new Var_D_new
1   1    NA    10    10    10      1      7      1     16       1.0        10        10        10
2   2    NA    NA    12    12      2      9     12     15       1.0         4        12        12
3   3    NA    NA    14    14      3     10      2     14       1.0         8        14        14
4   4     1     0    NA    16      4      8     16     13       1.0         0        16        16
5   5     2    12    10    10      5     11      3     12       2.0        12        10        10
6   6     1    12    12    12      6     12      4     11       1.0        12        12        12
7   7     3    NA    14    NA      7     14      5     10       3.0         6        14        14
8   8    NA    12    16    16      8     13     15      9       3.5        12        16        16
9   9     4     0    10    10      9     15      6      8       4.0         0        10        10
10 10     1    14    12    12     10      4      7      7       1.0        14        12        12
11 11     5    NA    14    14     11      5      9      6       5.0        15        14        14
12 12    14    14    16    16     12      3      8      5      14.0        14        16        16
13 13    NA    16    10    10     13      6     10      4      14.0        16        10        10
14 14    NA    16    NA    12     14      2     11      3      14.0        16        11        12
15 15    NA    16    14    14     15     16     13      2      14.0        16        14        14
16 16    NA    16    16    NA     16      1     14      1      14.0        16        16        14


int.cols <- grep('Var', names(df), value = TRUE)

sort.cols <- grep('_rank', names(df), value = TRUE)

lapply(int.cols, function(x) df %>% na_interpolation(x, option = "linear") %>% arrange(sort.cols))

Below is the desired output with interpolation only occurring on cells between the the lower and upper ranked bounds. For example, in Var_A, only one NA value is replaced in the Var_A_new column (i.e. ID = 8) as as the other NA values are at the start and end of the ranked vector. This is also the for ID = 4 for Var_C and ID = 16 for Var_D, as these cells occur at the ends of the ranked vector and should not be extrapolated.

   ID Var_A Var_B Var_C Var_D A_rank B_rank C_rank D_rank Var_A_new Var_B_new Var_C_new Var_D_new
1   1    NA    10    10    10      1      7      1     16        NA        10        10        10
2   2    NA    NA    12    12      2      9     12     15        NA         4        12        12
3   3    NA    NA    14    14      3     10      2     14        NA         8        14        14
4   4     1     0    NA    16      4      8     16     13       1.0         0        NA        16
5   5     2    12    10    10      5     11      3     12       2.0        12        10        10
6   6     1    12    12    12      6     12      4     11       1.0        12        12        12
7   7     3    NA    14    NA      7     14      5     10       3.0         6        14        14
8   8    NA    12    16    16      8     13     15      9        NA        12        16        16
9   9     4     0    10    10      9     15      6      8       4.0         0        10        10
10 10     1    14    12    12     10      4      7      7       1.0        14        12        12
11 11     5    NA    14    14     11      5      9      6       5.0        15        14        14
12 12    14    14    16    16     12      3      8      5      14.0        14        16        16
13 13    NA    16    10    10     13      6     10      4        NA        16        10        10
14 14    NA    16    NA    12     14      2     11      3        NA        16        11        12
15 15    NA    16    14    14     15     16     13      2        NA        16        14        14
16 16    NA    16    16    NA     16      1     14      1        NA        16        16        NA

Upvotes: 0

Views: 57

Answers (1)

Valeri Voev
Valeri Voev

Reputation: 2242

Probably not the most elegant solution, plus assumes that column names conform to some patterns, but it works and can be generalized:

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
library(imputeTS)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo

arrange_and_impute <- function(df, arr_col, impute_col) {
  new_col_name <- paste0(impute_col, "_new")
  df <- df %>% arrange(!! sym(arr_col)) 
  df[[new_col_name]] <- imputeTS::na_interpolation(df[[impute_col]], option = "linear")
  df
}

df <- data.frame("ID" = 1:16)
df$Var_A <- c(NA_real_,NA_real_,NA_real_,1,2,1,3,NA_real_,4,1,5,14,NA_real_,NA_real_,NA_real_,NA_real_)
df$Var_B <- c(10,NA_real_,NA_real_,0,12,12,NA_real_,12,0,14,NA_real_,14,16,16,16,16)
df$Var_C <- c(10,12,14,NA_real_,10,12,14,16,10,12,14,16,10,NA_real_,14,16)
df$Var_D <- c(10,12,14,16,10,12,NA_real_,16,10,12,14,16,10,12,14,NA_real_)
df$A_rank <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)
df$B_rank <- c(7,9,10,8,11,12,14,13,15,4,5,3,6,2,16,1)
df$C_rank <- c(1,12,2,16,3,4,5,15,6,7,9,8,10,11,13,14)
df$D_rank <- c(16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)

var_colnames <- grep("Var", colnames(df), value = TRUE)
end_letters <- purrr::map_chr(var_colnames, ~substr(.x, nchar(.x), nchar(.x)))
rank_columns <- paste0(end_letters, "_rank")

dfs <- purrr::map2(.x = set_names(rank_columns, rank_columns), .y = var_colnames, ~arrange_and_impute(df, .x, .y) ) 

final_df <- dfs[[1]]
for (i in (2:length(dfs))) {
  
  final_df <- inner_join(final_df, dfs[[i]], by = c("ID", var_colnames, rank_columns) )
  
}
final_df
#>    ID Var_A Var_B Var_C Var_D A_rank B_rank C_rank D_rank Var_A_new Var_B_new
#> 1   1    NA    10    10    10      1      7      1     16       1.0        10
#> 2   2    NA    NA    12    12      2      9     12     15       1.0         4
#> 3   3    NA    NA    14    14      3     10      2     14       1.0         8
#> 4   4     1     0    NA    16      4      8     16     13       1.0         0
#> 5   5     2    12    10    10      5     11      3     12       2.0        12
#> 6   6     1    12    12    12      6     12      4     11       1.0        12
#> 7   7     3    NA    14    NA      7     14      5     10       3.0         6
#> 8   8    NA    12    16    16      8     13     15      9       3.5        12
#> 9   9     4     0    10    10      9     15      6      8       4.0         0
#> 10 10     1    14    12    12     10      4      7      7       1.0        14
#> 11 11     5    NA    14    14     11      5      9      6       5.0        15
#> 12 12    14    14    16    16     12      3      8      5      14.0        14
#> 13 13    NA    16    10    10     13      6     10      4      14.0        16
#> 14 14    NA    16    NA    12     14      2     11      3      14.0        16
#> 15 15    NA    16    14    14     15     16     13      2      14.0        16
#> 16 16    NA    16    16    NA     16      1     14      1      14.0        16
#>    Var_C_new Var_D_new
#> 1         10        10
#> 2         12        12
#> 3         14        14
#> 4         16        16
#> 5         10        10
#> 6         12        12
#> 7         14        14
#> 8         16        16
#> 9         10        10
#> 10        12        12
#> 11        14        14
#> 12        16        16
#> 13        10        10
#> 14        11        12
#> 15        14        14
#> 16        16        14

Created on 2020-09-14 by the reprex package (v0.3.0)

Upvotes: 2

Related Questions