Reputation: 21
I am running a very simple multilayer LSTM network. I have set up different transformations for the data, namely StandardScaler(), RobustScaler() and MinMaxScaler(). The code runs smoothly and I get no errors when I use these scalers. But if I use the raw data, after epoch >= 2 I get only NaNs. I've added some prints to see whether it was something related to the shape of the tensors, tried different learning rates and different optimizers, and changed the activation function in the LSTM layers and the kernel initializer... nothing works. The strange thing is that the same code that generates the transformed data is the code I pass the raw data through to get the tensors...
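For completeness, this is the kind of check I'm using to rule out bad values in the raw batches themselves (a minimal sketch; the random array is only a stand-in for one of my batches):

import numpy as np

# stand-in for one unscaled batch of shape (batch_size, window_size, 1)
raw_batch = np.random.rand(32, 30, 1).astype(np.float32)

print('any NaN:', np.isnan(raw_batch).any())
print('any Inf:', np.isinf(raw_batch).any())
print('min / max:', raw_batch.min(), raw_batch.max())
print('abs max:', np.abs(raw_batch).max())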
The code is the following:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
def scaling_types(batched_regressor: list, scale_target, scaling_option='identity', return_scaler=True):
    """
    This function computes several transformations on the train_regressors, independently
    for each batch. If batch_size=32, then 32 different scalings will be done.
    scale_target should be set to False, because it could introduce leakage if several
    steps_out are predicted. Even for just 1 step_out that could be a problem, though not as serious.
    Args:
        batched_regressor = [input_window, input_target]
        input_window: Input feature data (e.g., shape: [batch_size, window_size, n_features]).
            If input_window.shape=(batch_size, window_size), it is transformed into input_window.shape=(batch_size, window_size, 1)
        input_target: Target values (e.g., shape: [batch_size, steps_out]).
            If input_target.shape=(batch_size, window_size), it is transformed into input_target.shape=(batch_size, window_size, 1)
        scaling_option: Scaling method to apply. Choose from the following options:
            - 'identity': No scaling
            - 'standard': StandardScaler (zero mean, unit variance)
            - 'minmax': MinMaxScaler (scales between 0 and 1)
            - 'robust': RobustScaler (scales with median and IQR)
        scale_target: Whether to scale the target values (default: False)
        return_scaler: Whether to return the scaler so the inverse transform can be performed
    Returns:
        input_window_scaled, input_target_scaled: Scaled versions of input_window and input_target
            with shape (batch_size, n_steps_out, 1). If no input_target is provided, only the
            transformed input_window is returned.
        scaler: parameters for the inverse transformation, if needed
    """
    # Initialize scaler based on the selected option
    if scaling_option == 'standard':
        scaler = StandardScaler()
    elif scaling_option == 'minmax':
        scaler = MinMaxScaler()
    elif scaling_option == 'robust':
        scaler = RobustScaler()
    elif scaling_option == 'identity':
        scaler = None  # 'identity' - no scaling

    # Check how many arrays are in the input
    if isinstance(batched_regressor, list):
        input_windows = batched_regressor[0]
        input_targets = batched_regressor[1] if len(batched_regressor) > 1 else None
    else:
        input_windows = batched_regressor
        input_targets = None

    if input_windows is not None and input_targets is not None:
        if input_windows.ndim == 2 and input_targets.ndim == 2:
            print('Correct transformation')
            input_windows = np.expand_dims(input_windows, -1)
            input_targets = np.expand_dims(input_targets, -1)
        elif input_windows.ndim == 3 and input_targets.ndim == 3:
            input_windows = input_windows
            input_targets = input_targets
        elif input_windows.ndim == 2 and input_targets.ndim == 3:
            input_windows = np.expand_dims(input_windows, -1)
            input_targets = input_targets
        elif input_windows.ndim == 3 and input_targets.ndim == 2:
            input_windows = input_windows
            input_targets = np.expand_dims(input_targets, -1)
        else:
            print('Check input_window and input_target shapes:')
            raise ValueError(f"Unexpected input shapes: input_windows={input_windows.shape}, input_targets={input_targets.shape}")
        if scaler:
            print('entering the scaler branch, which implies input_targets is not None and we are not using identity')
            if scale_target:
                print('entering the combined-data branch')
                # Concatenate the current regressor and target arrays (potential leakage?)
                combined_data = np.concatenate([input_windows, input_targets], axis=1)
                if scaler != StandardScaler():
                    # Fit and transform the combined data
                    print('entering the non-standard scaler branch')
                    scaled_combined = np.array([scaler.fit_transform(regressor_train_window) for regressor_train_window in combined_data])
                    print('applying the non-standard scaling')
                    # Split the scaled data back into regressors and targets
                    input_windows = scaled_combined[:, :input_windows.shape[1], :]
                    input_targets = scaled_combined[:, input_windows.shape[1]:, :]
                if scaler == StandardScaler():
                    scaled_combined = np.array([scaler.fit_transform(regressor_train_window + 1e-8) for regressor_train_window in combined_data])
                    print('using standard scaling')
                    # Split the scaled data back into regressors and targets
                    input_windows = scaled_combined[:, :input_windows.shape[1], :]
                    input_targets = scaled_combined[:, input_windows.shape[1]:, :]
            else:
                if scaler == MinMaxScaler() or RobustScaler():
                    input_windows = np.array([scaler.fit_transform(regressor_train_window) for regressor_train_window in input_windows])
                elif scaler == StandardScaler():
                    input_windows = np.array([scaler.fit_transform(regressor_train_window + 1e-8) for regressor_train_window in input_windows])
        else:
            input_windows = np.array(input_windows)
            input_targets = np.array(input_targets)
    elif input_targets is None:
        if input_windows.ndim == 2:
            input_windows = np.expand_dims(input_windows, -1)
        elif input_windows.ndim == 3:
            input_windows = input_windows
        if scaler:
            if scaler == MinMaxScaler() or RobustScaler():
                # Fit and transform the data
                input_windows = np.array([scaler.fit_transform(regressor_train_window) for regressor_train_window in input_windows])
            if scaler == StandardScaler():
                input_windows = np.array([scaler.fit_transform(regressor_train_window + 1e-8) for regressor_train_window in input_windows])
        else:
            input_windows = input_windows
    if input_targets is None:
        return (input_windows, scaler) if return_scaler else input_windows
    else:
        if return_scaler:
            return input_windows, input_targets, scaler
        else:
            return input_windows, input_targets
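For reference, this is roughly how I call the function on a toy batch; the shapes are only illustrative, not my real data:

import numpy as np

dummy_windows = np.random.rand(32, 30)  # (batch_size, window_size)
dummy_targets = np.random.rand(32, 1)   # (batch_size, steps_out)

windows_scaled, targets_scaled = scaling_types(
    [dummy_windows, dummy_targets],
    scale_target=False,
    scaling_option='robust',
    return_scaler=False,
)
print(windows_scaled.shape, targets_scaled.shape)  # (32, 30, 1) (32, 1, 1)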
The following code is the custom LSTM model, which as it stands right now is just a very simple architecture:
from keras import Model
from keras.layers import LSTM, Dense
from keras.initializers import HeNormal, Zeros, RandomNormal
from keras.layers import LeakyReLU
class LSTM_Model(Model):
    def __init__(self, lstm_layer_1, lstm_layer_2, lstm_layer_3, lstm_layer_4, steps_out):
        super(LSTM_Model, self).__init__()
        white_noise_initializer = RandomNormal(mean=0.0, stddev=0.000005)
        self.lstm_layer_1 = LSTM(lstm_layer_1, activation=None, kernel_initializer=white_noise_initializer, recurrent_initializer=white_noise_initializer, return_sequences=True)
        self.lstm_layer_1_activation = LeakyReLU()
        self.lstm_layer_2 = LSTM(lstm_layer_2, activation=None, kernel_initializer=white_noise_initializer, recurrent_initializer=white_noise_initializer, return_sequences=True)
        self.lstm_layer_2_activation = LeakyReLU()
        self.lstm_layer_3 = LSTM(lstm_layer_3, activation=None, kernel_initializer=white_noise_initializer, recurrent_initializer=white_noise_initializer, return_sequences=True)
        self.lstm_layer_3_activation = LeakyReLU()
        self.lstm_layer_4 = LSTM(lstm_layer_4, activation=None, kernel_initializer=white_noise_initializer, recurrent_initializer=white_noise_initializer, return_sequences=True)
        self.lstm_layer_4_activation = LeakyReLU()
        self.dense_layer = Dense(1, kernel_initializer=white_noise_initializer)
        self.steps_out = steps_out

    def call(self, inputs, training=True):
        if training == True:
            h = self.lstm_layer_1(inputs)
            h = self.lstm_layer_1_activation(h)
            h = self.lstm_layer_2(h)
            h = self.lstm_layer_2_activation(h)
            h = self.lstm_layer_3(h)
            h = self.lstm_layer_3_activation(h)
            h = self.lstm_layer_4(h)
            h = self.lstm_layer_4_activation(h)
            h = self.dense_layer(h)
            h = h[:, -self.steps_out, :]  # keep only the desired output values of the output
            return h
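To confirm that the architecture itself produces the shape I expect, a quick forward pass on random data works (a minimal sketch; the sizes are arbitrary):

import numpy as np

model = LSTM_Model(256, 256, 256, 256, steps_out=1)
dummy_input = np.random.rand(32, 30, 1).astype(np.float32)  # (batch_size, window_size, n_features)
out = model(dummy_input, training=True)
print(out.shape)  # (32, 1): only the last timestep of the Dense output is kept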
My training routine:
from functions_utils import Sliding_window_setup
from functions_utils.Sliding_window_setup import create_sliding_window_overlapping
import tensorflow as tf
import numpy as np
from keras.losses import MeanAbsolutePercentageError, MeanSquaredError
from keras.models import save_model, load_model
import random
from HP_Tunning_LSTM_models.LSTM_model_class import LSTM_Model
import pandas as pd
from functions_utils.scaling_transforms import scaling_types
import os
# Set seeds for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)
def Recursive_LSTM_function_train(all_data: pd.DataFrame,
                                  sliding_window_setup: Sliding_window_setup,
                                  window_size: int,
                                  batch_size: int,
                                  epochs: int,
                                  optimizer,
                                  learning_rate: float,
                                  scaling_option: str,
                                  scale_target,
                                  trained_model=LSTM_Model(256, 256, 256, 256, steps_out=1)):
    train_regressors, train_target, test_regressors, test_target = sliding_window_setup(all_data, window_size, steps_out=1)
    # train_regressors.shape = (n_examples, window_size); LSTM input.shape = (batch_size, window_size, n_features). In this case, n_features=1
    train_regressors = np.expand_dims(train_regressors, -1)
    train_target = np.expand_dims(train_target, -1)
    buffer_size = train_regressors.shape[0] // 2  # select a relatively wide buffer size to ensure randomness in the training examples
    train_dataset = tf.data.Dataset.from_tensor_slices((train_regressors, train_target))
    train_dataset = train_dataset.shuffle(buffer_size, seed=seed).batch(batch_size, drop_remainder=True)
    optimizer.learning_rate = learning_rate
    train_loss = MeanSquaredError()
    for epoch in range(epochs):
        # Compute the metric for each batch. tf.keras.metrics.Mean() ACCUMULATES the losses of the
        # batches (instead of replacing the previous batch's loss with the current batch's loss),
        # so at the end of each epoch it holds the mean loss of that epoch.
        epoch_loss_avg = tf.keras.metrics.Mean()
        # Training loop
        for batch, (train_regressors, train_target) in enumerate(train_dataset):
            print(f'batch_number: {batch}')
            #print(f'train_regressors: {train_regressors}')
            #print(f'train_target: {train_target}')
            print(f'train_regressors.shape = {train_regressors.shape}')
            print(f'train_target.shape: {train_target.shape}')
            # Scaling each train_regressor window independently inside each batch. The target is not being scaled,
            # although it is possible. See the function that defines the scaling for why it is not.
            train_regressors_scaled, train_target_scaled = scaling_types([train_regressors, train_target], scaling_option=scaling_option, scale_target=scale_target, return_scaler=False)
            with tf.GradientTape() as tape:
                y_pred = trained_model(train_regressors_scaled, training=True)
                print(f'y_pred.shape: {y_pred.shape}')
                print(f'y_pred: {y_pred}')
                loss = train_loss(train_target, y_pred)
            gradients = tape.gradient(loss, trained_model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, trained_model.trainable_variables))
            epoch_loss_avg(loss)
        print(f'Epoch: {epoch} ---------- Loss: {epoch_loss_avg.result().numpy()}')
        print('\n')
    # Save the trained model at the end of training
    model_path = os.path.join('Optuna_LSTM_grid_search_results_1_steps_ahead/trained_models', f'model_{learning_rate}_{scaling_option}_for_{all_data.columns[0]}.h5')  # Unique name based on HPs
    save_model(trained_model, model_path)
    return trained_model, model_path
and finally, where all the code is called:
from ast import Dict, List
import tensorflow as tf
from keras.losses import MeanAbsolutePercentageError, MeanSquaredError
from keras.models import save_model, load_model
from functions_utils import Sliding_window_setup
from functions_utils.Sliding_window_setup import create_sliding_window_overlapping
from keras.optimizers.legacy import Adam, SGD
import pandas as pd
import numpy as np
import random, copy
from functions_utils.Pick_M import pick_M
from functions_utils.scaling_transforms import scaling_types
from HP_Tunning_LSTM_models.LSTM_model_class import LSTM_Model
from HP_Tunning_LSTM_models.LSTM_training_and_model_saving_loop import Recursive_LSTM_function_train
from HP_Tunning_LSTM_models.y_hat_dataframes import inference_and_prediction_interval_calculation
from HP_Tunning_LSTM_models.losses_df import errors_calculation_test_set
from HP_Tunning_LSTM_models.train_test_split import train_test_split
from typing import List
import optuna
from optuna.samplers import BruteForceSampler
from datetime import datetime
# Set seeds for reproducibility
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)
my_LSTM, model_path = Recursive_LSTM_function_train(
    all_data=pick_M('M1'),
    sliding_window_setup=create_sliding_window_overlapping,
    window_size=30,
    batch_size=32,
    epochs=100,
    optimizer=Adam(),
    learning_rate=0.001,
    scaling_option='identity',
    scale_target=False,
    trained_model=LSTM_Model(256, 256, 256, 256, steps_out=1),
)
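One thing I still plan to try, to narrow down which op produces the first NaN, is TensorFlow's built-in numeric checking, along these lines:

import tensorflow as tf

# raises an error at the first op that produces a NaN or Inf, instead of training silently
tf.debugging.enable_check_numerics()

# it can also be applied to a single tensor inside the training loop, e.g.:
# y_pred = tf.debugging.check_numerics(y_pred, 'y_pred contains NaN/Inf')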
If more code is needed, please ask.
Upvotes: 0
Views: 19