Jacob Bruno
Jacob Bruno

Reputation: 13

How to use column names in a list of values in a loop?

The code below is what I am currently using. It will take a tibble of population data and join a separate tibble with three different variables. I then am only running the process of taking three different lags of "variable_a", testing the correlation of them to the target variable "population, and then saving any of the lags that have a greater than 0.5 correlation as values in a "variable_a_lag" list.

The problem I am trying to solve is because some datasets are tested against variables a, b, c..., others are tested against variables i, j, k.... and so on. **Instead of hardcoding a different script for different variables, I want to be able to have code that pulls the variable column names into a list and then a loop is ran over all of those variables, similiar to what I demonstrated with "variable_a". I want to be able to save all the lag correlation figures as values exactly as I have here.

If there is a better idea, please let me know. If this is the correct thinking, please let me know how I can call the column names from the list in a loop that will allow me to perform the function below. Preemptive thank you.

library(dplyr)

population_data <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
                          population = c(15066061, 15549704, 18687056, 18206234, 18948074, 19424572, 19786907, 19545826, 18139647, 19633439, 17136602, 15589939)) %>%
                          mutate_if(is.character, as.Date)

external_variables <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
                     variable_a = c(0.317307116, 0.352468819, 0.366432676, 0.368892142, 0.365603593, 0.367213977, 0.384769145, 0.423860447, 0.460068567, 0.462793275, 0.415102119, 0.353779592),
                     variable_b = c(0.172175256, 0.187336159, 0.2064973, 0.225618438, 0.226878513, 0.225218414, 0.236999116, 0.241679394, 0.249559864, 0.264060727, 0.278541589, 0.293562483),
                     variable_c = c(0.326591281, 0.338010556, 0.341884545, 0.344272213, 0.348808904, 0.355742496, 0.363469639, 0.372550879, 0.387764013, 0.414237696, 0.453669722, 0.480433344)) %>%
                     mutate_if(is.character, as.Date)

data_variable_merge <- population_data %>%
  left_join(external_variables, by = c("date" = "date"))

variable_a_lags <- data_variable_merge %>%
  select(population, variable_a) %>%
  mutate(lag_1 = lag(variable_a, n =1)) %>%
  mutate(lag_2 = lag(variable_a, n =2)) %>%
  mutate(lag_3 = lag(variable_a, n =3))

variable_a_correlation_table <- data.frame(variable_a_lags = numeric(),
                                              correlation = numeric(),
                                              stringsAsFactors = FALSE)

for (col_name in names(variable_a_lags)) {
variable_a_correlation <- cor(variable_a_lags$population, variable_a_lags[[col_name]], use = "complete.obs")

variable_a_correlation_table <- rbind(variable_a_correlation_table, 
data.frame(variable_a_lags = col_name,
                                                                                           
correlation = variable_a_correlation,
                                                                                           
stringsAsFactors = FALSE))
}

variable_a_lags <- variable_a_correlation_table %>%
 slice(-c(1:2)) %>%
 mutate(correlation = abs(correlation - 0 )) %>%
 filter(correlation > "0.5" ) %>%
 pull(variable_a_lags) 

Upvotes: 0

Views: 97

Answers (2)

margusl
margusl

Reputation: 17544

With dplyr I'd opt for a long format so correlations could be calculated by groups.
As cor() works with matrices

x <- 1:5; y <- 1:5
(y_ = vapply(1:3, \(n_) dplyr::lag(y, n_), numeric(length(y))))
#>      [,1] [,2] [,3]
#> [1,]   NA   NA   NA
#> [2,]    1   NA   NA
#> [3,]    2    1   NA
#> [4,]    3    2    1
#> [5,]    4    3    2
cor(x, y_, use = "pairwise.complete.obs")
#>      [,1] [,2] [,3]
#> [1,]    1    1    1

, we can construct one for its y argument in a helper function to get all pairwise correlations for one variable in one go.

library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(tibble)

# correlations between x and (lag(y, 1), ..., lag(y, n)) as a 2 column tibble
lagged_cor <- function(x, y, n = 3){
  cols <- setNames(1:n, paste0("lag_",1:n))
  cor(
    x,
    vapply(cols, \(n_) lag(y, n_), numeric(length(y))),
    use = "pairwise.complete.obs"
  )[1,] %>% 
  enframe(name = "lag", value = "cor")
}

# test
lagged_cor(population_data$population, external_variables$variable_a)
#> # A tibble: 3 × 2
#>   lag       cor
#>   <chr>   <dbl>
#> 1 lag_1  0.0585
#> 2 lag_2 -0.632 
#> 3 lag_3 -0.689

# join, wide to long, apply lagged_cor() by groups through 
# reframe() instead of summarise() as lagged_cor() returns more than 1 rows;
# finally unnest 2-column cor_ tibble-in-a-column
correlation_table <- 
  population_data %>%
  left_join(external_variables, by = join_by(date == date)) %>%
  pivot_longer(!c(date, population), names_to = "var", cols_vary = "fastest")  |> 
  reframe(cor_ = lagged_cor(population, value, 3), .by = var) |> 
  unnest(cor_)

Resulting correlations:

correlation_table
#> # A tibble: 9 × 3
#>   var        lag       cor
#>   <chr>      <chr>   <dbl>
#> 1 variable_a lag_1  0.0585
#> 2 variable_a lag_2 -0.632 
#> 3 variable_a lag_3 -0.689 
#> 4 variable_b lag_1 -0.0164
#> 5 variable_b lag_2 -0.409 
#> 6 variable_b lag_3 -0.338 
#> 7 variable_c lag_1 -0.373 
#> 8 variable_c lag_2 -0.719 
#> 9 variable_c lag_3 -0.672

Filter and extract:

correlation_table %>%
  filter(abs(cor) > .5) %>%
  split(~ var) %>%
  lapply(pull, var = "lag") %>%
  str()
#> List of 2
#>  $ variable_a: chr [1:2] "lag_2" "lag_3"
#>  $ variable_c: chr [1:2] "lag_2" "lag_3"

Intermediate steps:

## post-join:
#> # A tibble: 12 × 5
#>    date       population variable_a variable_b variable_c
#>    <date>          <dbl>      <dbl>      <dbl>      <dbl>
#>  1 2024-01-01   15066061      0.317      0.172      0.327
#>  2 2024-02-01   15549704      0.352      0.187      0.338
#>  3 2024-03-01   18687056      0.366      0.206      0.342
#>  4 2024-04-01   18206234      0.369      0.226      0.344
#>  5 2024-05-01   18948074      0.366      0.227      0.349
#>  6 2024-06-01   19424572      0.367      0.225      0.356
#>  7 2024-07-01   19786907      0.385      0.237      0.363
#>  8 2024-08-01   19545826      0.424      0.242      0.373
#>  9 2024-09-01   18139647      0.460      0.250      0.388
#> 10 2024-10-01   19633439      0.463      0.264      0.414
#> 11 2024-11-01   17136602      0.415      0.279      0.454
#> 12 2024-12-01   15589939      0.354      0.294      0.480

## post-pivot:
#> # A tibble: 36 × 4
#>    date       population var        value
#>    <date>          <dbl> <chr>      <dbl>
#>  1 2024-01-01   15066061 variable_a 0.317
#>  2 2024-01-01   15066061 variable_b 0.172
#>  3 2024-01-01   15066061 variable_c 0.327
#>  4 2024-02-01   15549704 variable_a 0.352
#>  5 2024-02-01   15549704 variable_b 0.187
#>  6 2024-02-01   15549704 variable_c 0.338
#>  7 2024-03-01   18687056 variable_a 0.366
#>  8 2024-03-01   18687056 variable_b 0.206
#>  9 2024-03-01   18687056 variable_c 0.342
#> 10 2024-04-01   18206234 variable_a 0.369
#> # ℹ 26 more rows

## post-reframe:
#> # A tibble: 9 × 2
#>   var        cor_$lag    $cor
#>   <chr>      <chr>      <dbl>
#> 1 variable_a lag_1     0.0585
#> 2 variable_a lag_2    -0.632 
#> 3 variable_a lag_3    -0.689 
#> 4 variable_b lag_1    -0.0164
#> 5 variable_b lag_2    -0.409 
#> 6 variable_b lag_3    -0.338 
#> 7 variable_c lag_1    -0.373 
#> 8 variable_c lag_2    -0.719 
#> 9 variable_c lag_3    -0.672

Example data:

population_data <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
                          population = c(15066061, 15549704, 18687056, 18206234, 18948074, 19424572, 19786907, 19545826, 18139647, 19633439, 17136602, 15589939)) %>%
                          mutate_if(is.character, as.Date)

external_variables <- tibble(date = c("2024-01-01", "2024-02-01", "2024-03-01", "2024-04-01", "2024-05-01", "2024-06-01", "2024-07-01", "2024-08-01", "2024-09-01", "2024-10-01", "2024-11-01", "2024-12-01"),
                     variable_a = c(0.317307116, 0.352468819, 0.366432676, 0.368892142, 0.365603593, 0.367213977, 0.384769145, 0.423860447, 0.460068567, 0.462793275, 0.415102119, 0.353779592),
                     variable_b = c(0.172175256, 0.187336159, 0.2064973, 0.225618438, 0.226878513, 0.225218414, 0.236999116, 0.241679394, 0.249559864, 0.264060727, 0.278541589, 0.293562483),
                     variable_c = c(0.326591281, 0.338010556, 0.341884545, 0.344272213, 0.348808904, 0.355742496, 0.363469639, 0.372550879, 0.387764013, 0.414237696, 0.453669722, 0.480433344)) %>%
                     mutate_if(is.character, as.Date)

Created on 2025-01-24 with reprex v2.1.1

Upvotes: 1

baobab
baobab

Reputation: 172

If you wanted to generalize it, you could write a series of functions and apply those to the list of relevant variables. Using the data you provided you could for example do the following:

# Identify the variables you're going to be iterating over. Here
# i've gone variable_a, variable_b and variable_c.
variables <- paste0("variable_", letters[1:3])

# Make a function generalized so that the lag can be calculated
# for different variables of intrest (i.e. variable_a,
# variable_b, etc.)
lag.calc <- function(data, variable){
  
  variable_lag = data %>%
    select(population, any_of(variable)) %>%
    mutate(lag_1 = lag(!!sym(variable), n = 1)) %>%
    mutate(lag_2 = lag(!!sym(variable), n = 2)) %>%
    mutate(lag_3 = lag(!!sym(variable), n = 3))
  
  return(variable_lag)
}

# Apply the function using sapply. This iterates over the vector
# of variables to calculate lag per variable and produces a list 
# of data frame's as output. Each element in the list contains 
# results of the lag calculations for a specific variable.
lags <- sapply(variables,
               lag.calc,
               data = data_variable_merge,
               USE.NAMES = TRUE,
               simplify = FALSE)

# Make a function to calculate correlation. This function can 
# essentially be used to perform a nested loop. The function 
# first selects an element from the list (a data frame) of lag 
# data just generated and then subsequently uses another function
# to calculate the correlation coefficients between variables in
# the selected list element.

corr.calc <- function(data, variable1){
  
  dat = data[[variable1]]
  
  df = sapply(names(dat),
              function(variable2){
                correlation = cor(dat[,"population"], 
                                  dat[,variable2], 
                                  use = "complete.obs")
                
                return(correlation)
              }) 

  corr_table = data.frame(variable = names(df),
                          correlation = df,
                          row.names = NULL,
                          stringsAsFactors = FALSE)
  
  return(corr_table)
}

# Now the function is applied to the list containing the lag data
# per variable.
all_correlations <- sapply(variables,
               corr.calc,
               data = lags,
               USE.NAMES = TRUE,
               simplify = FALSE)


# Now the final function is defined with which the selection of
# variables for which the correlation coefficient is greater than
# 0.5 can be generalized.
corr.select <- function(data, variable){
  
  dat = data[[variable]]
  
  df = dat %>%
    slice(-c(1:2)) %>%
    filter(abs(correlation) > 0.5 ) %>%
    pull(variable)
  
  return(df)
}

# Implement the function
selected_correlations <- sapply(variables,
                                corr.select,
                                data = all_correlations,
                                USE.NAMES = TRUE)

Upvotes: 1

Related Questions