I'm trying to train a model that I wrote after reading this paper: "A lightweight model using frequency, trend and temporal attention for long sequence time-series prediction".
During training I get this error:
ValueError:
One or more gradients are None, meaning no gradients are flowing.
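As far as I understand the GradientTape API, a stripped-down loop like the sketch below (dummy shapes and data standing in for my real branches, so this is only my assumption about the expected pattern, not my actual code) should yield non-None gradients:

import tensorflow as tf

# Dummy stand-ins for my real data; the shapes here are arbitrary
x = tf.random.normal((8, 2, 1))
y = tf.random.normal((8, 1))

w = tf.Variable(tf.zeros((8, 2, 1)), trainable=True, dtype=tf.float32)
loss_fn = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

for epoch in range(3):
    with tf.GradientTape() as tape:
        # Forward pass recorded by the tape
        pred = tf.reduce_sum(w * x, axis=[1, 2])          # shape (8,)
        loss = loss_fn(y, tf.expand_dims(pred, axis=-1))  # compare against y of shape (8, 1)
    grads = tape.gradient(loss, [w])
    print(epoch, loss.numpy(), [g is None for g in grads])
    optimizer.apply_gradients(zip(grads, [w]))

So I assume something in how my real script connects the loss to the weight variables differs from this pattern.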
The main script I wrote is:
import numpy as np
import tensorflow as tf
from input_handler import trend_extraction_with_avgpool
from temporalAttentionExtractor import TemporalAttentionExtractor, temporal_attention_extraction
from frequencyAttentionExtractor import FrequencyAttentionExtractor
from wave_gen import wave_with_noise
from lags import prepare_lags
from keras._tf_keras.keras.losses import MeanSquaredError
from keras._tf_keras.keras.optimizers import Adam
from dataset_splitter import split_test
# Hyperparameters
epochs = 100 # Number of training epochs
learning_rate = 0.001 # Learning rate for the optimizer
if __name__ == '__main__':
    # Step 1: Generate a noisy sine wave and prepare lagged time series
    sin_wave = wave_with_noise(frequency=10, resolution=520, amplitude=1, fn=np.sin)

    # Prepare lagged series with a lag of 3 time steps
    lagged_series = prepare_lags(sin_wave, 4)

    # Step 2: Reshape the lagged series if it's a 2D array
    if len(lagged_series.shape) == 2:
        lagged_series = lagged_series.values.reshape(lagged_series.shape[0], lagged_series.shape[1], 1)

    X_train, y_train, X_test, y_test, _, _ = split_test(lagged_series, 0.7)

    # Step 3: Extract batch size, time steps, and features
    B = X_train.shape[0]       # Batch size (number of samples)
    T = X_train.shape[1] // 2  # Time steps (half of the total lagged series length)
    D = X_train.shape[2]       # Number of features (dimensionality of input)
    print("B:", B)
    print("T:", T)
    print(f"D: {D}\n\n")

    # Step 4: Extract trend features using average pooling
    x_trend = trend_extraction_with_avgpool(X_train, T)

    # Step 5: Temporal attention model initialization and extraction
    x_input_temporal = temporal_attention_extraction(X_train, B, T, D)
    model_temporal = TemporalAttentionExtractor(B, T, D)
    temporal_output = model_temporal.call(x_input_temporal)

    # Step 6: Frequency attention model initialization and extraction
    model_frequency = FrequencyAttentionExtractor(B, T, D)
    frequency_output = model_frequency.call(X_train, T)
    print(f"\n---Dimension of the output---\nTrend: {x_trend.shape}\nTemporal: {temporal_output.shape}\nFrequency: {frequency_output.shape}")

    # Step 7: Define learnable weights for combining the different branches
    w_pool = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)
    w_mlp = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)
    w_fft = tf.Variable(tf.zeros((B, T, D)), trainable=True, dtype=tf.float32)
    print(f"\n---Shape of the variables---\nPool: {w_pool.shape}\nMLP: {w_mlp.shape}\nFFT: {w_fft.shape}\n\n")

    # Step 8: Transpose the trend, temporal, and frequency outputs to match weight dimensions
    x_trend_transposed = tf.transpose(x_trend, perm=[0, 2, 1])
    temporal_output_transposed = tf.transpose(temporal_output, perm=[0, 2, 1])
    frequency_output_transposed = tf.transpose(frequency_output, perm=[0, 2, 1])

    # Step 9: Combine the outputs using the learnable weights and reduce dimensionality
    combined_output = (
        tf.matmul(w_pool, x_trend_transposed) +
        tf.matmul(w_mlp, temporal_output_transposed) +
        tf.matmul(w_fft, tf.cast(frequency_output_transposed, tf.float32))
    )

    # Now reduce the combined output over the time and feature axes
    combined_output = tf.reduce_sum(combined_output, axis=[1, 2], keepdims=True)  # keepdims=True gives shape [B, 1, 1]

    # Step 10: Define the loss function and optimizer
    loss_function = MeanSquaredError()
    optimizer = Adam(learning_rate=learning_rate)

    # Step 11: Training loop
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            tape.watch([w_pool, w_mlp, w_fft])
            # Calculate the loss
            loss = loss_function(y_train, combined_output)

        gradients = tape.gradient(loss, [w_pool, w_mlp, w_fft])
        print(f"\nGradients: {gradients}\n")
        if any(g is None for g in gradients):
            raise ValueError("\nOne or more gradients are None, meaning no gradients are flowing.\n")

        # Step 13: Apply the computed gradients to update the weights
        optimizer.apply_gradients(zip(gradients, [w_pool, w_mlp, w_fft]))

        # Step 14: Print the loss and weights after each epoch for monitoring
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.numpy()}")
        print(f"Weights - Pool: {w_pool.numpy()}, MLP: {w_mlp.numpy()}, FFT: {w_fft.numpy()}")

    print("Training completed.")
I don't know whether the problem is in the definition and use of the weights, or somewhere in the training loop.
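To narrow it down, I was thinking of inspecting what the tape actually records, roughly like this (same variables as in my script above; just an inspection sketch, I haven't confirmed it tells me anything useful):

with tf.GradientTape() as tape:
    tape.watch([w_pool, w_mlp, w_fft])
    loss = loss_function(y_train, combined_output)

# Which variables did the tape see, and do any gradients come back?
print("Watched variables:", [v.name for v in tape.watched_variables()])
print("Gradients:", tape.gradient(loss, [w_pool, w_mlp, w_fft]))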
Output:

B: 361
T: 2
D: 1
(361, 4, 1, 256)
Casting complex values to real discards the imaginary part
(361, 2, 1)

---Dimension of the output---
Trend: (361, 2, 1)
Temporal: (361, 2, 1)
Frequency: (361, 2, 1)

---Shape of the variables---
Pool: (361, 2, 1)
MLP: (361, 2, 1)
FFT: (361, 2, 1)

Gradients: [None, None, None]

Traceback (most recent call last):
  File "\main.py", line 90, in <module>
    raise ValueError("\nOne or more gradients are None, meaning no gradients are flowing.\n")
ValueError:
One or more gradients are None, meaning no gradients are flowing.