Giacomo Golino

Reputation: 1

ValueError: One or more gradients are None, meaning no gradients are flowing

I'm trying to train a model that I implemented after reading this paper: A lightweight model using frequency, trend and temporal attention for long sequence time-series prediction.

During training I get this error:

ValueError:
One or more gradients are None, meaning no gradients are flowing.

The main script I wrote is:

import numpy as np
import tensorflow as tf
from input_handler import trend_extraction_with_avgpool
from temporalAttentionExtractor import TemporalAttentionExtractor, temporal_attention_extraction
from frequencyAttentionExtractor import FrequencyAttentionExtractor
from wave_gen import wave_with_noise
from lags import prepare_lags
from keras._tf_keras.keras.losses import MeanSquaredError
from keras._tf_keras.keras.optimizers import Adam
from dataset_splitter import split_test

# Hyperparameters
epochs = 100  # Number of training epochs
learning_rate = 0.001  # Learning rate for the optimizer

if __name__ == '__main__':
    # Step 1: Generate a noisy sine wave and prepare lagged time series
    sin_wave = wave_with_noise(frequency=10, resolution=520, amplitude=1, fn=np.sin)

    # Prepare the lagged series (4 lags)
    lagged_series = prepare_lags(sin_wave, 4)

    # Step 2: Reshape the lagged series if it's a 2D array
    if len(lagged_series.shape) == 2:
        lagged_series = lagged_series.values.reshape(lagged_series.shape[0], lagged_series.shape[1], 1)

    X_train, y_train, X_test, y_test, _, _ = split_test(lagged_series, 0.7)

    # Step 3: Extract batch size, time steps, and features
    B = X_train.shape[0]  # Batch size (number of samples)
    T = X_train.shape[1] // 2  # Time steps (assuming half of the total lagged series length)
    D = X_train.shape[2]  # Number of features (dimensionality of input)

    print("B:", B)
    print("T:", T)
    print(f"D: {D}\n\n")

    # Step 4: Extract trend features using average pooling
    x_trend = trend_extraction_with_avgpool(X_train, T)

    # Step 5: Temporal attention model initialization and extraction
    x_input_temporal = temporal_attention_extraction(X_train, B, T, D)
    model_temporal = TemporalAttentionExtractor(B, T, D)
    temporal_output = model_temporal.call(x_input_temporal)

    # Step 6: Frequency attention model initialization and extraction
    model_frequency = FrequencyAttentionExtractor(B, T, D)
    frequency_output = model_frequency.call(X_train, T)

    print(f"\n---Dimension of the output---\nTrend: {x_trend.shape}\nTemporal: {temporal_output.shape}\nFrequency: {frequency_output.shape}")

    # Step 7: Define learnable weights for combining the different branches
    w_pool = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)
    w_mlp = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)
    w_fft = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)

    print(f"\n---Shape of the variables---\nPool: {w_pool.shape}\nMLP: {w_mlp.shape}\nFFT: {w_fft.shape}\n\n")

    # Step 8: Transpose the trend, temporal, and frequency outputs to match weight dimensions
    x_trend_transposed = tf.transpose(x_trend, perm=[0, 2, 1])
    temporal_output_transposed = tf.transpose(temporal_output, perm=[0, 2, 1])
    frequency_output_transposed = tf.transpose(frequency_output, perm=[0, 2, 1])

    # Step 9: Combine the outputs using the learnable weights and reduce dimensionality
    combined_output = (
            tf.matmul(w_pool, x_trend_transposed) +
            tf.matmul(w_mlp, temporal_output_transposed) +
            tf.matmul(w_fft, tf.cast(frequency_output_transposed, tf.float32))
    )

    # Reduce the combined output to a single value per sample
    combined_output = tf.reduce_sum(combined_output, axis=[1, 2], keepdims=True)  # with keepdims=True this gives shape [B, 1, 1]

    # Step 10: Define the loss function and optimizer
    loss_function = MeanSquaredError()
    optimizer = Adam(learning_rate=learning_rate)

    # Step 11: Training loop
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            tape.watch([w_pool, w_mlp, w_fft])

            # Calculate the loss
            loss = loss_function(y_train, combined_output)

        gradients = tape.gradient(loss, [w_pool, w_mlp, w_fft])
        print(f"\nGradients: {gradients}\n")

        if any(g is None for g in gradients):
            raise ValueError("\nOne or more gradients are None, meaning no gradients are flowing.\n")

        # Step 13: Apply the computed gradients to update the weights
        optimizer.apply_gradients(zip(gradients, [w_pool, w_mlp, w_fft]))

        # Step 14: Print the loss and weights after each epoch for monitoring
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.numpy()}")
        print(f"Weights - Pool: {w_pool.numpy()}, MLP: {w_mlp.numpy()}, FFT: {w_fft.numpy()}")

    print("Training completed.")

I don't know whether the problem is in the way I define and use the weights or in the training loop itself.
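
For reference, here is a minimal toy sketch I put together while debugging (x, y, w and the toy loss below are placeholders, not my actual model). It compares computing the output before the tape is opened, which is what my script does with combined_output, against recomputing it inside the with block:

import tensorflow as tf

# Toy data and a single trainable weight (placeholders for illustration only)
x = tf.constant([[1.0], [2.0], [3.0]])
y = tf.constant([[2.0], [4.0], [6.0]])
w = tf.Variable(tf.zeros((1, 1)), trainable=True, dtype=tf.float32)

# Pattern A: the output is computed before the tape is opened,
# so the matmul is never recorded and the gradient comes back as None
pred_outside = tf.matmul(x, w)
with tf.GradientTape() as tape:
    loss_a = tf.reduce_mean(tf.square(y - pred_outside))
print(tape.gradient(loss_a, w))  # None

# Pattern B: the output is recomputed inside the tape context,
# so the tape records the matmul and returns a gradient w.r.t. w
with tf.GradientTape() as tape:
    pred_inside = tf.matmul(x, w)
    loss_b = tf.reduce_mean(tf.square(y - pred_inside))
print(tape.gradient(loss_b, w))  # a (1, 1) tensor

In this toy case only the second pattern gives a non-None gradient, so I'm wondering if that is what happens in my code as well, since combined_output is built once before the loop and only the loss is computed inside the tape.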

The output is:

B: 361
T: 2
D: 1

(361, 4, 1, 256)
Casting complex values to real discards the imaginary part (361, 2, 1)

---Dimension of the output---
Trend: (361, 2, 1)
Temporal: (361, 2, 1)
Frequency: (361, 2, 1)

---Shape of the variables---
Pool: (361, 2, 1)
MLP: (361, 2, 1)
FFT: (361, 2, 1)

Gradients: [None, None, None]

Traceback (most recent call last):
  File "\main.py", line 90, in <module>
    raise ValueError("\nOne or more gradients are None, meaning no gradients are flowing.\n")
ValueError:
One or more gradients are None, meaning no gradients are flowing.

Upvotes: 0

Views: 25

Answers (0)
