Reputation: 1
I have this below STAN code for marketing mix modelling application with thousands records and hundreds of media/control variables. Using** pystan version 2.19.1.1** I am new to stan. This current setup is taking 10-12 hours to run daily granular data with hundreds of media+control variables. STAN code is below
functions {
// the Hill function
real Hill(real t, real ec, real slope) {
return 1 / (1 + (t / ec)^(-slope));
}
// the adstock transformation with a vector of weights
real Adstock(row_vector t, row_vector weights) {
return dot_product(t, weights) / sum(weights);
}
}
data {
// the total number of observations
int<lower=1> N;
// the total number of training observations
int<lower=1> T;
// the total number of holdout observations
int<lower=0> H;
int<lower=0> n_interactions;
int interaction_left[n_interactions];
int interaction_right[n_interactions];
int<lower=0> tau_dist_type;
int<lower=0> noise_var_dist_type;
real tau_dist_mean;
real<lower=0> tau_dist_sd;
real noise_var_dist_mean;
real<lower=0> noise_var_dist_sd;
// training data indexes
int training_index[T];
// holdout data indexes
int holdout_index[H];
real Y_train[T];
real Y_holdout[H];
// the maximum duration of lag effect, in weeks
int<lower=1> max_lag;
// the number of media channels
int<lower=1> num_media;
row_vector[num_media] media_prior_dist_type;
row_vector[num_media] media_prior_mean;
row_vector[num_media] media_prior_sd;
row_vector[num_media] retain_rate_dist_type;
row_vector[num_media] retain_rate_dist_mean;
row_vector[num_media] retain_rate_dist_sd;
row_vector[num_media] delay_dist_type;
row_vector[num_media] delay_dist_mean;
row_vector[num_media] delay_dist_sd;
row_vector[num_media] slope_dist_type;
row_vector[num_media] slope_dist_mean;
row_vector[num_media] slope_dist_sd;
row_vector[num_media] ec_dist_type;
row_vector[num_media] ec_dist_mean;
row_vector[num_media] ec_dist_sd;
// a vector of 0 to max_lag - 1
//row_vector[max_lag] lag_vec;
// 3D array of media variables
row_vector[max_lag] X_media[N, num_media];
// the number of other control variables
int<lower=1> num_ctrl;
row_vector[num_ctrl] ctrl_prior_dist_type;
row_vector[num_ctrl] ctrl_prior_mean;
row_vector[num_ctrl] ctrl_prior_sd;
// a matrix of control variables
row_vector[num_ctrl] X_ctrl[N];
row_vector<lower=0>[num_media] slope;
}
parameters {
// residual variance
real<lower=0> noise_var;
// the intercept
real tau;
// the coefficients for media variables
vector<lower=0>[num_media] beta_medias;
// coefficients for other control variables
vector[num_ctrl] gamma_ctrl;
// the retention rate and delay parameter for the adstock transformation of
// each media
vector<lower=0,upper=1>[num_media] retain_rate;
//vector<lower=0,upper=max_lag-1>[num_media] delay;
// ec50 and slope for Hill function of each media
vector<lower=0,upper=1>[num_media] ec;
vector<lower=0>[n_interactions] beta_interactions;
// vector<lower=0>[num_media] slope;
}
transformed parameters {
// a vector of the mean response
real mu[T];
// the cumulative media effect after adstock
real cum_effect;
// the cumulative media effect after adstock, and then Hill transformation
row_vector[num_media] cum_effects_hill[T];
row_vector[max_lag] lag_weights;
row_vector[n_interactions] cum_effects_hill_interaction[T];
for (nn in 1:T) {
for (media in 1 : num_media) {
for (lag in 1 : max_lag) {
lag_weights[lag] <- pow(retain_rate[media], (lag - 1) );
}
cum_effect <- Adstock(X_media[training_index[nn], media], lag_weights);
cum_effects_hill[nn, media] <- Hill(cum_effect, ec[media], slope[media]);
}
if(n_interactions > 0)
for (inter in 1:n_interactions){
cum_effects_hill_interaction[nn,inter] = cum_effects_hill[nn,interaction_left[inter]]*cum_effects_hill[nn,interaction_right[inter]];
}
if(n_interactions > 0)
mu[nn] <- tau +
dot_product(cum_effects_hill[nn], beta_medias) +
dot_product(X_ctrl[training_index[nn]], gamma_ctrl) +
dot_product(cum_effects_hill_interaction[nn],beta_interactions);
else
mu[nn] <- tau +
dot_product(cum_effects_hill[nn], beta_medias) +
dot_product(X_ctrl[training_index[nn]], gamma_ctrl);
}
}
model {
tau ~ normal(tau_dist_mean,tau_dist_sd);
for (media_index in 1 : num_media) {
beta_medias[media_index] ~ normal(media_prior_mean[media_index],media_prior_sd[media_index]);
retain_rate[media_index] ~ normal(retain_rate_dist_mean[media_index],retain_rate_dist_sd[media_index]);
slope[media_index] ~ normal(slope_dist_mean[media_index],slope_dist_sd[media_index]);
ec[media_index] ~ beta(ec_dist_mean[media_index],ec_dist_sd[media_index]);
}
for (ctrl_index in 1 : num_ctrl) {
gamma_ctrl[ctrl_index] ~ normal(ctrl_prior_mean[ctrl_index],ctrl_prior_sd[ctrl_index]);
}
noise_var ~ inv_gamma(noise_var_dist_mean,noise_var_dist_sd);
Y_train ~ normal(mu, sqrt(noise_var));
}
import pystan
stan_file = 'MMMv2.stan'
stanmodel = pystan.stan(stan_file,data=stan_data,chains = chains,control=dict({'max_treedepth':max_treedepth,'adapt_delta':adapt_delta,'stepsize' : stepsize}), iter = iterations, verbose = False,n_jobs = n_jobs,seed = 9966)
Here if you observe in the above transformed parameter block of stan code, it is looping through each record in the data file with max lag 40 days and hence number of iterations are very huge.
My objective is to replace this for loops with matrix/array multiplications and change the *hill * and *adstock * functions to accept matrix/array. I tried to vectorized this by below code and with new version of pystan-3.10.0 but this code getting several error, can you help me to vectorize this stan code so I can reduce the run time. Or if there is another way to reduce run time.
functions {
// The Hill function (vectorized for each media channel)
vector Hill(vector t, vector ec, vector slope) {
return 1 ./ (1 + pow((t ./ ec),(-slope)));
}
// The Adstock transformation (vectorized)
vector Adstock(vector t, vector weights) {
return (t.*weights)/ rowwise_sum(weights);
}
}
data {
// the total number of observations
int<lower=1> N;
// the total number of training observations
int<lower=1> T;
// the total number of holdout observations
int<lower=0> H;
int<lower=0> n_interactions;
array[n_interactions] int interaction_left;
array[n_interactions] int interaction_right;
int<lower=0> tau_dist_type;
int<lower=0> noise_var_dist_type;
real tau_dist_mean;
real<lower=0> tau_dist_sd;
real noise_var_dist_mean;
real<lower=0> noise_var_dist_sd;
// training data indexes
array[T] int training_index;
// holdout data indexes
array[H] int holdout_index;
array[T] real Y_train;
array[H] real Y_holdout;
// the maximum duration of lag effect, in weeks
int<lower=1> max_lag;
// the number of media channels
int<lower=1> num_media;
row_vector[num_media] media_prior_dist_type;
row_vector[num_media] media_prior_mean;
row_vector[num_media] media_prior_sd;
row_vector[num_media] retain_rate_dist_type;
row_vector[num_media] retain_rate_dist_mean;
row_vector[num_media] retain_rate_dist_sd;
row_vector[num_media] delay_dist_type;
row_vector[num_media] delay_dist_mean;
row_vector[num_media] delay_dist_sd;
row_vector[num_media] slope_dist_type;
row_vector[num_media] slope_dist_mean;
row_vector[num_media] slope_dist_sd;
row_vector[num_media] ec_dist_type;
row_vector[num_media] ec_dist_mean;
row_vector[num_media] ec_dist_sd;
// a vector of 0 to max_lag - 1
//row_vector[max_lag] lag_vec;
// 3D array of media variables
array[N, num_media] row_vector[max_lag] X_media;
// the number of other control variables
int<lower=1> num_ctrl;
row_vector[num_ctrl] ctrl_prior_dist_type;
row_vector[num_ctrl] ctrl_prior_mean;
row_vector[num_ctrl] ctrl_prior_sd;
// a matrix of control variables
array[N] row_vector[num_ctrl] X_ctrl;
row_vector<lower=0>[num_media] slope;
}
parameters {
// residual variance
real<lower=0> noise_var;
// the intercept
real tau;
// the coefficients for media variables
vector<lower=0>[num_media] beta_medias;
// coefficients for other control variables
vector[num_ctrl] gamma_ctrl;
// the retention rate and delay parameter for the adstock transformation of
// each media
vector<lower=0,upper=1>[num_media] retain_rate;
//vector<lower=0,upper=max_lag-1>[num_media] delay;
// ec50 and slope for Hill function of each media
vector<lower=0,upper=1>[num_media] ec;
vector<lower=0>[n_interactions] beta_interactions;
// vector<lower=0>[num_media] slope;
}
transformed parameters {
matrix[T, num_media] cum_effects; // Cumulative effects after Adstock
matrix[T, num_media] cum_effects_hill; // After Hill transformation
matrix[T, n_interactions] cum_effects_hill_interaction; // Interaction effects
vector[T] mu;
array[num_media] row_vector[max_lag] lag_weights; // Mean response
for (media in 1 : num_media) {
for (lag in 1 : max_lag) {
lag_weights[media,lag] = pow(retain_rate[media], (lag - 1) );
}
}
// Apply Adstock transformation
cum_effects = Adstock(X_media[training_index, ], lag_weights);
// Apply Hill transformation using vectorized `Hill` function
cum_effects_hill = Hill(cum_effects, ec, slope);
// Compute interaction effects if applicable
if (n_interactions > 0) {
for (inter in 1:n_interactions) {
cum_effects_hill_interaction[, inter] =
cum_effects_hill[, interaction_left[inter]] .* cum_effects_hill[, interaction_right[inter]];
}
}
// Compute mu using dot products for medias, controls, and interactions
if (n_interactions > 0) {
mu = tau +
cum_effects_hill * beta_medias +
X_ctrl[training_index, ] * gamma_ctrl +
cum_effects_hill_interaction * beta_interactions;
} else {
mu = tau +
cum_effects_hill * beta_medias +
X_ctrl[training_index, ] * gamma_ctrl;
}
}
model {
tau ~ normal(tau_dist_mean,tau_dist_sd);
for (media_index in 1 : num_media) {
beta_medias[media_index] ~ normal(media_prior_mean[media_index],media_prior_sd[media_index]);
retain_rate[media_index] ~ normal(retain_rate_dist_mean[media_index],retain_rate_dist_sd[media_index]);
slope[media_index] ~ normal(slope_dist_mean[media_index],slope_dist_sd[media_index]);
ec[media_index] ~ beta(ec_dist_mean[media_index],ec_dist_sd[media_index]);
}
for (ctrl_index in 1 : num_ctrl) {
gamma_ctrl[ctrl_index] ~ normal(ctrl_prior_mean[ctrl_index],ctrl_prior_sd[ctrl_index]);
}
noise_var ~ inv_gamma(noise_var_dist_mean,noise_var_dist_sd);
Y_train ~ normal(mu, sqrt(noise_var));
}
import stan
import nest_asyncio
nest_asyncio.apply()
# Path to your Stan model file
stan_file = 'MMMv2.stan'
# Read the Stan model code
with open(stan_file, 'r') as file:
stan_code = file.read()
# Compile the Stan model
stan_model = stan.build(stan_code, data=stan_data, random_seed=9966)
# Sample from the posterior
fit = stan_model.sample(
num_chains=chains,
num_samples=iterations,
num_warmup=int(iterations / 2),
adapt_delta=adapt_delta,
max_treedepth=max_treedepth,
step_size=stepsize
)
# Access results
print(fit)
Let me know if I missed any information. pardon me, this is my first question to stackoverflow. Thanks in advance.
Upvotes: 0
Views: 32