Reputation: 13
The code below is what I am currently using. It will take a tibble of population data and join a separate tibble with three different variables. I then am only running the process of taking three different lags of "variable_a", testing the correlation of them to the target variable "population, and then saving any of the lags that have a greater than 0.5 correlation as values in a "variable_a_lag" list.
The problem I am trying to solve is because some datasets are tested against variables a, b, c..., others are tested against variables i, j, k.... and so on. **Instead of hardcoding a different script for different variables, I want to be able to have code that pulls the variable column names into a list and then a loop is ran over all of those variables, similiar to what I demonstrated with "variable_a". I want to be able to save all the lag correlation figures as values exactly as I have here.
If there is a better idea, please let me know. If this is the correct thinking, please let me know how I can call the column names from the list in a loop that will allow me to perform the function below. Preemptive thank you.
library(dplyr)
population_data <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
population = c(15066061, 15549704, 18687056, 18206234, 18948074, 19424572, 19786907, 19545826, 18139647, 19633439, 17136602, 15589939)) %>%
mutate_if(is.character, as.Date)
external_variables <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
variable_a = c(0.317307116, 0.352468819, 0.366432676, 0.368892142, 0.365603593, 0.367213977, 0.384769145, 0.423860447, 0.460068567, 0.462793275, 0.415102119, 0.353779592),
variable_b = c(0.172175256, 0.187336159, 0.2064973, 0.225618438, 0.226878513, 0.225218414, 0.236999116, 0.241679394, 0.249559864, 0.264060727, 0.278541589, 0.293562483),
variable_c = c(0.326591281, 0.338010556, 0.341884545, 0.344272213, 0.348808904, 0.355742496, 0.363469639, 0.372550879, 0.387764013, 0.414237696, 0.453669722, 0.480433344)) %>%
mutate_if(is.character, as.Date)
data_variable_merge <- population_data %>%
left_join(external_variables, by = c("date" = "date"))
variable_a_lags <- data_variable_merge %>%
select(population, variable_a) %>%
mutate(lag_1 = lag(variable_a, n =1)) %>%
mutate(lag_2 = lag(variable_a, n =2)) %>%
mutate(lag_3 = lag(variable_a, n =3))
variable_a_correlation_table <- data.frame(variable_a_lags = numeric(),
correlation = numeric(),
stringsAsFactors = FALSE)
for (col_name in names(variable_a_lags)) {
variable_a_correlation <- cor(variable_a_lags$population, variable_a_lags[[col_name]], use = "complete.obs")
variable_a_correlation_table <- rbind(variable_a_correlation_table,
data.frame(variable_a_lags = col_name,
correlation = variable_a_correlation,
stringsAsFactors = FALSE))
}
variable_a_lags <- variable_a_correlation_table %>%
slice(-c(1:2)) %>%
mutate(correlation = abs(correlation - 0 )) %>%
filter(correlation > "0.5" ) %>%
pull(variable_a_lags)
Upvotes: 0
Views: 97
Reputation: 17544
With dplyr
I'd opt for a long format so correlations could be calculated by groups.
As cor()
works with matrices
x <- 1:5; y <- 1:5
(y_ = vapply(1:3, \(n_) dplyr::lag(y, n_), numeric(length(y))))
#> [,1] [,2] [,3]
#> [1,] NA NA NA
#> [2,] 1 NA NA
#> [3,] 2 1 NA
#> [4,] 3 2 1
#> [5,] 4 3 2
cor(x, y_, use = "pairwise.complete.obs")
#> [,1] [,2] [,3]
#> [1,] 1 1 1
, we can construct one for its y
argument in a helper function to get all pairwise correlations for one variable in one go.
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(tibble)
# correlations between x and (lag(y, 1), ..., lag(y, n)) as a 2 column tibble
lagged_cor <- function(x, y, n = 3){
cols <- setNames(1:n, paste0("lag_",1:n))
cor(
x,
vapply(cols, \(n_) lag(y, n_), numeric(length(y))),
use = "pairwise.complete.obs"
)[1,] %>%
enframe(name = "lag", value = "cor")
}
# test
lagged_cor(population_data$population, external_variables$variable_a)
#> # A tibble: 3 × 2
#> lag cor
#> <chr> <dbl>
#> 1 lag_1 0.0585
#> 2 lag_2 -0.632
#> 3 lag_3 -0.689
# join, wide to long, apply lagged_cor() by groups through
# reframe() instead of summarise() as lagged_cor() returns more than 1 rows;
# finally unnest 2-column cor_ tibble-in-a-column
correlation_table <-
population_data %>%
left_join(external_variables, by = join_by(date == date)) %>%
pivot_longer(!c(date, population), names_to = "var", cols_vary = "fastest") |>
reframe(cor_ = lagged_cor(population, value, 3), .by = var) |>
unnest(cor_)
Resulting correlations:
correlation_table
#> # A tibble: 9 × 3
#> var lag cor
#> <chr> <chr> <dbl>
#> 1 variable_a lag_1 0.0585
#> 2 variable_a lag_2 -0.632
#> 3 variable_a lag_3 -0.689
#> 4 variable_b lag_1 -0.0164
#> 5 variable_b lag_2 -0.409
#> 6 variable_b lag_3 -0.338
#> 7 variable_c lag_1 -0.373
#> 8 variable_c lag_2 -0.719
#> 9 variable_c lag_3 -0.672
Filter and extract:
correlation_table %>%
filter(abs(cor) > .5) %>%
split(~ var) %>%
lapply(pull, var = "lag") %>%
str()
#> List of 2
#> $ variable_a: chr [1:2] "lag_2" "lag_3"
#> $ variable_c: chr [1:2] "lag_2" "lag_3"
Intermediate steps:
## post-join:
#> # A tibble: 12 × 5
#> date population variable_a variable_b variable_c
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 2024-01-01 15066061 0.317 0.172 0.327
#> 2 2024-02-01 15549704 0.352 0.187 0.338
#> 3 2024-03-01 18687056 0.366 0.206 0.342
#> 4 2024-04-01 18206234 0.369 0.226 0.344
#> 5 2024-05-01 18948074 0.366 0.227 0.349
#> 6 2024-06-01 19424572 0.367 0.225 0.356
#> 7 2024-07-01 19786907 0.385 0.237 0.363
#> 8 2024-08-01 19545826 0.424 0.242 0.373
#> 9 2024-09-01 18139647 0.460 0.250 0.388
#> 10 2024-10-01 19633439 0.463 0.264 0.414
#> 11 2024-11-01 17136602 0.415 0.279 0.454
#> 12 2024-12-01 15589939 0.354 0.294 0.480
## post-pivot:
#> # A tibble: 36 × 4
#> date population var value
#> <date> <dbl> <chr> <dbl>
#> 1 2024-01-01 15066061 variable_a 0.317
#> 2 2024-01-01 15066061 variable_b 0.172
#> 3 2024-01-01 15066061 variable_c 0.327
#> 4 2024-02-01 15549704 variable_a 0.352
#> 5 2024-02-01 15549704 variable_b 0.187
#> 6 2024-02-01 15549704 variable_c 0.338
#> 7 2024-03-01 18687056 variable_a 0.366
#> 8 2024-03-01 18687056 variable_b 0.206
#> 9 2024-03-01 18687056 variable_c 0.342
#> 10 2024-04-01 18206234 variable_a 0.369
#> # ℹ 26 more rows
## post-reframe:
#> # A tibble: 9 × 2
#> var cor_$lag $cor
#> <chr> <chr> <dbl>
#> 1 variable_a lag_1 0.0585
#> 2 variable_a lag_2 -0.632
#> 3 variable_a lag_3 -0.689
#> 4 variable_b lag_1 -0.0164
#> 5 variable_b lag_2 -0.409
#> 6 variable_b lag_3 -0.338
#> 7 variable_c lag_1 -0.373
#> 8 variable_c lag_2 -0.719
#> 9 variable_c lag_3 -0.672
Example data:
population_data <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
population = c(15066061, 15549704, 18687056, 18206234, 18948074, 19424572, 19786907, 19545826, 18139647, 19633439, 17136602, 15589939)) %>%
mutate_if(is.character, as.Date)
external_variables <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
variable_a = c(0.317307116, 0.352468819, 0.366432676, 0.368892142, 0.365603593, 0.367213977, 0.384769145, 0.423860447, 0.460068567, 0.462793275, 0.415102119, 0.353779592),
variable_b = c(0.172175256, 0.187336159, 0.2064973, 0.225618438, 0.226878513, 0.225218414, 0.236999116, 0.241679394, 0.249559864, 0.264060727, 0.278541589, 0.293562483),
variable_c = c(0.326591281, 0.338010556, 0.341884545, 0.344272213, 0.348808904, 0.355742496, 0.363469639, 0.372550879, 0.387764013, 0.414237696, 0.453669722, 0.480433344)) %>%
mutate_if(is.character, as.Date)
Created on 2025-01-24 with reprex v2.1.1
Upvotes: 1
Reputation: 172
If you wanted to generalize it, you could write a series of functions and apply those to the list of relevant variables. Using the data you provided you could for example do the following:
# Identify the variables you're going to be iterating over. Here
# i've gone variable_a, variable_b and variable_c.
variables <- paste0("variable_", letters[1:3])
# Make a function generalized so that the lag can be calculated
# for different variables of intrest (i.e. variable_a,
# variable_b, etc.)
lag.calc <- function(data, variable){
variable_lag = data %>%
select(population, any_of(variable)) %>%
mutate(lag_1 = lag(!!sym(variable), n = 1)) %>%
mutate(lag_2 = lag(!!sym(variable), n = 2)) %>%
mutate(lag_3 = lag(!!sym(variable), n = 3))
return(variable_lag)
}
# Apply the function using sapply. This iterates over the vector
# of variables to calculate lag per variable and produces a list
# of data frame's as output. Each element in the list contains
# results of the lag calculations for a specific variable.
lags <- sapply(variables,
lag.calc,
data = data_variable_merge,
USE.NAMES = TRUE,
simplify = FALSE)
# Make a function to calculate correlation. This function can
# essentially be used to perform a nested loop. The function
# first selects an element from the list (a data frame) of lag
# data just generated and then subsequently uses another function
# to calculate the correlation coefficients between variables in
# the selected list element.
corr.calc <- function(data, variable1){
dat = data[[variable1]]
df = sapply(names(dat),
function(variable2){
correlation = cor(dat[,"population"],
dat[,variable2],
use = "complete.obs")
return(correlation)
})
corr_table = data.frame(variable = names(df),
correlation = df,
row.names = NULL,
stringsAsFactors = FALSE)
return(corr_table)
}
# Now the function is applied to the list containing the lag data
# per variable.
all_correlations <- sapply(variables,
corr.calc,
data = lags,
USE.NAMES = TRUE,
simplify = FALSE)
# Now the final function is defined with which the selection of
# variables for which the correlation coefficient is greater than
# 0.5 can be generalized.
corr.select <- function(data, variable){
dat = data[[variable]]
df = dat %>%
slice(-c(1:2)) %>%
filter(abs(correlation) > 0.5 ) %>%
pull(variable)
return(df)
}
# Implement the function
selected_correlations <- sapply(variables,
corr.select,
data = all_correlations,
USE.NAMES = TRUE)
Upvotes: 1