How to reduce the run time of STAN code by vectorizing it and removing redundant loops?

Question

I have this below STAN code for marketing mix modelling application with thousands records and hundreds of media/control variables. Using** pystan version 2.19.1.1** I am new to stan. This current setup is taking 10-12 hours to run daily granular data with hundreds of media+control variables. STAN code is below

functions {
 // the Hill function
 real Hill(real t, real ec, real slope) {
  return 1 / (1 + (t / ec)^(-slope));
 }
 // the adstock transformation with a vector of weights
 real Adstock(row_vector t, row_vector weights) {
  return dot_product(t, weights) / sum(weights);
 }
}

data {
 // the total number of observations
 int N;
 
  // the total number of training observations
 int T;
 // the total number of holdout observations
 int H;
 
 int n_interactions;
 int interaction_left[n_interactions];
 int interaction_right[n_interactions];

 int tau_dist_type;

 int noise_var_dist_type;


 real tau_dist_mean;
 real tau_dist_sd;

 real noise_var_dist_mean;
 real noise_var_dist_sd;


 // training data indexes 
 int training_index[T];
 // holdout data indexes 
 int holdout_index[H];

 real Y_train[T];
 real Y_holdout[H];
 
 // the maximum duration of lag effect, in weeks
 int max_lag;
 // the number of media channels
 int num_media;
 row_vector[num_media] media_prior_dist_type;
 row_vector[num_media] media_prior_mean;
 row_vector[num_media] media_prior_sd;

 row_vector[num_media] retain_rate_dist_type;
 row_vector[num_media] retain_rate_dist_mean;
 row_vector[num_media] retain_rate_dist_sd;

 row_vector[num_media] delay_dist_type;
 row_vector[num_media] delay_dist_mean;
 row_vector[num_media] delay_dist_sd;

 row_vector[num_media] slope_dist_type;
 row_vector[num_media] slope_dist_mean;
 row_vector[num_media] slope_dist_sd;

 row_vector[num_media] ec_dist_type;
 row_vector[num_media] ec_dist_mean;
 row_vector[num_media] ec_dist_sd;



 
 // a vector of 0 to max_lag - 1
 //row_vector[max_lag] lag_vec;
 // 3D array of media variables
 row_vector[max_lag] X_media[N, num_media];
 // the number of other control variables
 int num_ctrl;
 row_vector[num_ctrl] ctrl_prior_dist_type;
 row_vector[num_ctrl] ctrl_prior_mean;
 row_vector[num_ctrl] ctrl_prior_sd;
 
 // a matrix of control variables
 row_vector[num_ctrl] X_ctrl[N];
 
 
 row_vector[num_media] slope;
}

parameters {
 // residual variance
 real noise_var;
 // the intercept
 real tau;
 // the coefficients for media variables
 vector[num_media] beta_medias;
 // coefficients for other control variables
 vector[num_ctrl] gamma_ctrl;
 // the retention rate and delay parameter for the adstock transformation of
 // each media
 vector[num_media] retain_rate;
 //vector[num_media] delay;
 // ec50 and slope for Hill function of each media
 vector[num_media] ec;
 vector[n_interactions] beta_interactions;
 // vector[num_media] slope;
}

transformed parameters {
 // a vector of the mean response
 real mu[T];
 // the cumulative media effect after adstock
 real cum_effect;
 // the cumulative media effect after adstock, and then Hill transformation
 row_vector[num_media] cum_effects_hill[T];
 row_vector[max_lag] lag_weights;
 row_vector[n_interactions] cum_effects_hill_interaction[T];
 
 
 for (nn in 1:T) {
  for (media in 1 : num_media) {
   for (lag in 1 : max_lag) {
    lag_weights[lag] <- pow(retain_rate[media], (lag - 1) ); 
   }
   cum_effect <- Adstock(X_media[training_index[nn], media], lag_weights);
   cum_effects_hill[nn, media] <- Hill(cum_effect, ec[media], slope[media]);
  }
  

  
  if(n_interactions > 0)
   for (inter in 1:n_interactions){
    cum_effects_hill_interaction[nn,inter] = cum_effects_hill[nn,interaction_left[inter]]*cum_effects_hill[nn,interaction_right[inter]]; 
   }
   
  if(n_interactions > 0) 
    mu[nn] <- tau +
              dot_product(cum_effects_hill[nn], beta_medias) +
              dot_product(X_ctrl[training_index[nn]], gamma_ctrl) + 
              dot_product(cum_effects_hill_interaction[nn],beta_interactions);
  else  
   mu[nn] <- tau +
            dot_product(cum_effects_hill[nn], beta_medias) +
            dot_product(X_ctrl[training_index[nn]], gamma_ctrl);
 }
}
model {

    tau ~ normal(tau_dist_mean,tau_dist_sd);

  for (media_index in 1 : num_media) {
     beta_medias[media_index] ~ normal(media_prior_mean[media_index],media_prior_sd[media_index]);
 
   

    retain_rate[media_index] ~ normal(retain_rate_dist_mean[media_index],retain_rate_dist_sd[media_index]);
   
 
    slope[media_index] ~ normal(slope_dist_mean[media_index],slope_dist_sd[media_index]);
    ec[media_index] ~ beta(ec_dist_mean[media_index],ec_dist_sd[media_index]);
  
}
  for (ctrl_index in 1 : num_ctrl) {
 
    gamma_ctrl[ctrl_index] ~ normal(ctrl_prior_mean[ctrl_index],ctrl_prior_sd[ctrl_index]);
  
    
  }
 
  noise_var ~ inv_gamma(noise_var_dist_mean,noise_var_dist_sd);
  Y_train ~ normal(mu, sqrt(noise_var));
}

import pystan
stan_file = 'MMMv2.stan'
stanmodel = pystan.stan(stan_file,data=stan_data,chains = chains,control=dict({'max_treedepth':max_treedepth,'adapt_delta':adapt_delta,'stepsize' : stepsize}), iter = iterations, verbose = False,n_jobs = n_jobs,seed = 9966)

Here if you observe in the above transformed parameter block of stan code, it is looping through each record in the data file with max lag 40 days and hence number of iterations are very huge.

My objective is to replace this for loops with matrix/array multiplications and change the *hill * and *adstock * functions to accept matrix/array. I tried to vectorized this by below code and with new version of pystan-3.10.0 but this code getting several error, can you help me to vectorize this stan code so I can reduce the run time. Or if there is another way to reduce run time.

functions {
  // The Hill function (vectorized for each media channel)
  vector Hill(vector t, vector ec, vector slope) {
    return 1 ./ (1 + pow((t ./ ec),(-slope)));
  }

  // The Adstock transformation (vectorized)
  vector Adstock(vector t, vector weights) {
    return (t.*weights)/ rowwise_sum(weights);
  }
}

data {
 // the total number of observations
 int N;
 
  // the total number of training observations
 int T;
 // the total number of holdout observations
 int H;
 
 int n_interactions;
 array[n_interactions] int interaction_left;
 array[n_interactions] int interaction_right;

 int tau_dist_type;

 int noise_var_dist_type;


 real tau_dist_mean;
 real tau_dist_sd;

 real noise_var_dist_mean;
 real noise_var_dist_sd;


 // training data indexes 
 array[T] int training_index;
 // holdout data indexes 
 array[H] int holdout_index;

 array[T] real Y_train;
 array[H] real Y_holdout;
 
 // the maximum duration of lag effect, in weeks
 int max_lag;
 // the number of media channels
 int num_media;
 row_vector[num_media] media_prior_dist_type;
 row_vector[num_media] media_prior_mean;
 row_vector[num_media] media_prior_sd;

 row_vector[num_media] retain_rate_dist_type;
 row_vector[num_media] retain_rate_dist_mean;
 row_vector[num_media] retain_rate_dist_sd;

 row_vector[num_media] delay_dist_type;
 row_vector[num_media] delay_dist_mean;
 row_vector[num_media] delay_dist_sd;

 row_vector[num_media] slope_dist_type;
 row_vector[num_media] slope_dist_mean;
 row_vector[num_media] slope_dist_sd;

 row_vector[num_media] ec_dist_type;
 row_vector[num_media] ec_dist_mean;
 row_vector[num_media] ec_dist_sd;



 
 // a vector of 0 to max_lag - 1
 //row_vector[max_lag] lag_vec;
 // 3D array of media variables
 array[N, num_media] row_vector[max_lag] X_media;
 // the number of other control variables
 int num_ctrl;
 row_vector[num_ctrl] ctrl_prior_dist_type;
 row_vector[num_ctrl] ctrl_prior_mean;
 row_vector[num_ctrl] ctrl_prior_sd;
 
 // a matrix of control variables
 array[N] row_vector[num_ctrl] X_ctrl;
 
 
 row_vector[num_media] slope;
}

parameters {
 // residual variance
 real noise_var;
 // the intercept
 real tau;
 // the coefficients for media variables
 vector[num_media] beta_medias;
 // coefficients for other control variables
 vector[num_ctrl] gamma_ctrl;
 // the retention rate and delay parameter for the adstock transformation of
 // each media
 vector[num_media] retain_rate;
 //vector[num_media] delay;
 // ec50 and slope for Hill function of each media
 vector[num_media] ec;
 vector[n_interactions] beta_interactions;
 // vector[num_media] slope;
}

transformed parameters {
 matrix[T, num_media] cum_effects;               // Cumulative effects after Adstock
 matrix[T, num_media] cum_effects_hill;          // After Hill transformation
 matrix[T, n_interactions] cum_effects_hill_interaction; // Interaction effects
 vector[T] mu; 
 array[num_media] row_vector[max_lag] lag_weights; // Mean response

 for (media in 1 : num_media) {
   for (lag in 1 : max_lag) {
    lag_weights[media,lag] = pow(retain_rate[media], (lag - 1) ); 
   }
 }

  // Apply Adstock transformation
  
  cum_effects = Adstock(X_media[training_index, ], lag_weights);

  // Apply Hill transformation using vectorized `Hill` function
  cum_effects_hill = Hill(cum_effects, ec, slope);

  // Compute interaction effects if applicable
  if (n_interactions > 0) {
    for (inter in 1:n_interactions) {
      cum_effects_hill_interaction[, inter] = 
        cum_effects_hill[, interaction_left[inter]] .* cum_effects_hill[, interaction_right[inter]];
    }
  }

  // Compute mu using dot products for medias, controls, and interactions
  if (n_interactions > 0) {
    mu = tau + 
         cum_effects_hill * beta_medias + 
         X_ctrl[training_index, ] * gamma_ctrl + 
         cum_effects_hill_interaction * beta_interactions;
  } else {
    mu = tau + 
         cum_effects_hill * beta_medias + 
         X_ctrl[training_index, ] * gamma_ctrl;
  }
}

model {
    tau ~ normal(tau_dist_mean,tau_dist_sd);
  for (media_index in 1 : num_media) {
    beta_medias[media_index] ~ normal(media_prior_mean[media_index],media_prior_sd[media_index]);

 retain_rate[media_index] ~    normal(retain_rate_dist_mean[media_index],retain_rate_dist_sd[media_index]);
    slope[media_index] ~ normal(slope_dist_mean[media_index],slope_dist_sd[media_index]);
    ec[media_index] ~ beta(ec_dist_mean[media_index],ec_dist_sd[media_index]);
  }
  for (ctrl_index in 1 : num_ctrl) {
    gamma_ctrl[ctrl_index] ~ normal(ctrl_prior_mean[ctrl_index],ctrl_prior_sd[ctrl_index]); 
  }
  
  noise_var ~ inv_gamma(noise_var_dist_mean,noise_var_dist_sd);
  Y_train ~ normal(mu, sqrt(noise_var));
}

import stan
import nest_asyncio
nest_asyncio.apply()

# Path to your Stan model file
stan_file = 'MMMv2.stan'

# Read the Stan model code
with open(stan_file, 'r') as file:
    stan_code = file.read()

# Compile the Stan model
stan_model = stan.build(stan_code, data=stan_data, random_seed=9966)

# Sample from the posterior
fit = stan_model.sample(
    num_chains=chains,
    num_samples=iterations,
    num_warmup=int(iterations / 2),
    adapt_delta=adapt_delta,
    max_treedepth=max_treedepth,
    step_size=stepsize
)

# Access results
print(fit)

Let me know if I missed any information. pardon me, this is my first question to stackoverflow. Thanks in advance.

How to reduce the run time of STAN code by vectorizing it and removing redundant loops?

Answers (0)

Related Questions