I. A

Reputation: 2312

Why would using the same dataset for training and testing give different accuracies?

I've been looking at the loss on the training and the validation datasets, and I keep seeing the validation loss come out smaller than the training loss, even when they are the same dataset. I'm trying to get some insight into why this would be the case.

I am training a model in TensorFlow to predict some time series data. The model creation and preprocessing are as follows:

import os
import datetime

import numpy as np
import tensorflow as tf
from tensorflow import keras as k

window_size = 40
batch_size  = 32
forecast_period = 6
model_name = "LSTM"
tf.keras.backend.clear_session()

_seed = 42
tf.random.set_seed(_seed)

def _sub_to_batch(sub):
    return sub.batch(window_size, drop_remainder=True)

def _return_input_output(tensor):
    _input  = tensor[:, :-forecast_period, :]
    _output = tensor[:, forecast_period:, :]
    return _input, _output

def _reshape_tensor(tensor):
    tensor = tf.expand_dims(tensor, axis=-1)
    tensor = tf.transpose(tensor, [1, 0, 2])
    return tensor


# total elements after unbatch(): 3813
train_ts_dataset = tf.data.Dataset.from_tensor_slices(train_ts)\
                            .window(window_size, shift=1)\
                            .flat_map(_sub_to_batch)\
                            .map(_reshape_tensor)\
                            .map(_return_input_output)
#                             .unbatch().shuffle(buffer_size=500, seed=_seed).batch(batch_size)\
#                             .map(_return_input_output)

valid_ts_dataset = tf.data.Dataset.from_tensor_slices(valid_ts)\
                            .window(window_size, shift=1)\
                            .flat_map(_sub_to_batch)\
                            .map(_reshape_tensor)\
                            .unbatch().shuffle(buffer_size=500, seed=_seed).batch(batch_size)\
                            .map(_return_input_output)

def _forecast_mae(y_pred, y_true):
    _y_pred = y_pred[:, -forecast_period:, :]
    _y_true = y_true[:, -forecast_period:, :]
    mae = tf.losses.MAE(_y_true, _y_pred)
    return mae

def _accuracy(y_pred, y_true):
    # print(y_true) => Tensor("sequential/time_distributed/Reshape_1:0", shape=(None, 34, 1), dtype=float32)
    # y_true[-forecast_period:, :]  =>   Tensor("strided_slice_4:0", shape=(None, 34, 1), dtype=float32)
    # y_true[:, -forecast_period:, :] => Tensor("strided_slice_4:0", shape=(None, 6, 1), dtype=float32)

    _y_pred = y_pred[:, -forecast_period:, :]
    _y_pred = tf.reshape(_y_pred, shape=[-1, forecast_period])
    _y_true = y_true[:, -forecast_period:, :]
    _y_true = tf.reshape(_y_true, shape=[-1, forecast_period])

    # MAPE: Tensor("Mean_1:0", shape=(None, 1), dtype=float32)
    MAPE = tf.math.reduce_mean(tf.math.abs((_y_pred - _y_true) / _y_true), axis=1, keepdims=True)

    accuracy = 1 - MAPE
    accuracy = tf.where(accuracy < 0, tf.zeros_like(accuracy), accuracy)
    accuracy = tf.reduce_mean(accuracy)
    return accuracy

model = k.models.Sequential([
    k.layers.Bidirectional(k.layers.LSTM(units=100, return_sequences=True), input_shape=(None, 1)),
    k.layers.Bidirectional(k.layers.LSTM(units=100, return_sequences=True)),
    k.layers.TimeDistributed(k.layers.Dense(1))
])

model_name = []
model_name_symbols = {"bidirectional": "BILSTM_1", "bidirectional_1": "BILSTM_2", "time_distributed": "td"}
for l in model.layers:
    model_name.append(model_name_symbols.get(l.name, l.name))

model_name = "_".join(model_name)
print(model_name)

for i, (x, y) in enumerate(train_ts_dataset):
    print(i, x.numpy().shape, y.numpy().shape)

The printed model name and the shapes of the dataset elements are as follows:

BILSTM_1_BILSTM_2_td
0 (123, 34, 1) (123, 34, 1)
1 (123, 34, 1) (123, 34, 1)
2 (123, 34, 1) (123, 34, 1)
3 (123, 34, 1) (123, 34, 1)
4 (123, 34, 1) (123, 34, 1)
5 (123, 34, 1) (123, 34, 1)
6 (123, 34, 1) (123, 34, 1)
7 (123, 34, 1) (123, 34, 1)
8 (123, 34, 1) (123, 34, 1)

then:

_datetime = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S")
_log_dir = os.path.join(".", "logs", "fit7", model_name, _datetime)

tensorboard_cb = k.callbacks.TensorBoard(log_dir=_log_dir)

model.compile(loss="mae", optimizer=tf.optimizers.Adam(learning_rate=0.001), metrics=[_forecast_mae, _accuracy])

history = model.fit(train_ts_dataset, epochs=100, validation_data=train_ts_dataset, callbacks=[tensorboard_cb])

I've been looking at the loss on the training and the validation datasets, and I keep seeing the validation loss come out smaller than the training loss. I could be underfitting. However, as a simple test, I replaced the validation set with the training set so that I could monitor the loss and accuracy on the same data during training. I'm still getting a validation accuracy greater than the training accuracy. Below is the accuracy across the training and the validation datasets:

[Plot: training vs. validation accuracy over epochs]

To me it's very weird that I'm getting a validation accuracy greater than the training accuracy even though I'm using the same dataset for training and testing, and there is no Dropout, no BatchNormalization layer, etc.

Any hint as to the reason for this behavior would be much appreciated!
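As an extra sanity check (a minimal sketch reusing the model, history, and train_ts_dataset objects defined above), re-evaluating the trained model on the training dataset after fit() returns and comparing it with the last logged epoch metrics should show whether the gap comes down to when the metrics are computed:

final_train_loss = history.history["loss"][-1]      # running average over the last epoch's batches
final_val_loss   = history.history["val_loss"][-1]  # computed once, after the last epoch's updates

# evaluate() runs a single pass with the final (frozen) weights;
# the returned values follow the compile() order: loss, _forecast_mae, _accuracy
eval_loss, eval_forecast_mae, eval_accuracy = model.evaluate(train_ts_dataset)

print("logged train loss:", final_train_loss)
print("logged val loss:  ", final_val_loss)
print("evaluate() loss:  ", eval_loss)  # expected to be close to the logged val loss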

===================================================================

Here I made some modifications to the code to check whether the batch size has any effect. Additionally, to remove any doubts about tf.data.Dataset, I used numpy arrays as input. The new code is as follows:

custom_train_ts   = train_ts.transpose(1, 0)[..., np.newaxis]
custom_train_ts_x = custom_train_ts[:, :window_size, :] # size: 123, window_size, 1
custom_train_ts_y = custom_train_ts[:, -window_size:, :] # size: 123, window_size, 1

custom_valid_ts   = valid_ts.transpose(1, 0)[..., np.newaxis]
custom_valid_ts_x = custom_valid_ts[:, :window_size, :]
custom_valid_ts_y = custom_valid_ts[:, -window_size:, :]
custom_valid_ts   = (custom_valid_ts_x, custom_valid_ts_y)

Secondly, in order to make sure that the accuracy is calculated over the entire dataset rather than depending on the batch size, I fed the whole dataset in as a single batch. Additionally, I implemented a custom metric as follows:

def _accuracy(y_true, y_pred):
    # print(y_true) => Tensor("sequential/time_distributed/Reshape_1:0", shape=(None, 34, 1), dtype=float32)
    # y_true[-forecast_period:, :]  =>   Tensor("strided_slice_4:0", shape=(None, 34, 1), dtype=float32)
    # y_true[:, -forecast_period:, :] => Tensor("strided_slice_4:0", shape=(None, 6, 1), dtype=float32)

    _y_pred = y_pred[:, -forecast_period:, :]
    _y_pred = tf.reshape(_y_pred, shape=[-1, forecast_period])
    _y_true = y_true[:, -forecast_period:, :]
    _y_true = tf.reshape(_y_true, shape=[-1, forecast_period])

    # MAPE: Tensor("Mean_1:0", shape=(None, 1), dtype=float32)
    MAPE = tf.math.reduce_mean(tf.math.abs((_y_pred - _y_true) / _y_true), axis=1, keepdims=True)

    accuracy = 1 - MAPE
    accuracy = tf.where(accuracy < 0, tf.zeros_like(accuracy), accuracy)        
    accuracy = tf.reduce_mean(accuracy)
    return accuracy


class MyAccuracy(tf.keras.metrics.Metric):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.accuracy_function = _accuracy
        self.y_true_lst = []
        self.y_pred_lst = []

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.y_true_lst.append(y_true)
        self.y_pred_lst.append(y_pred)

    def result(self):
        y_true_concat = tf.concat(self.y_true_lst, axis=0)
        y_pred_concat = tf.concat(self.y_pred_lst, axis=0)
        accuracy = self.accuracy_function(y_true_concat, y_pred_concat)
        self.y_true_lst = []
        self.y_pred_lst = []
        return accuracy

    def get_config(self):
        base_config = super().get_config()
        return {**base_config}

Finally, the model is compiled and fit as follows:

model.compile(loss="mae", optimizer=tf.optimizers.Adam(hparams["learning_rate"]), 
              metrics=[tf.metrics.MAE, MyAccuracy()])

history = model.fit(custom_train_ts_x, custom_train_ts_y, epochs=120, batch_size=123, validation_data=custom_valid_ts, 
                    callbacks=[tensorboard_cb])

And when I looked at the training and the validation accuracy in TensorBoard, I got the following:

[Plot: training vs. validation accuracy in TensorBoard]

Clearly, this doesn't make any sense to me. Additionally, in this case I make sure that the accuracy is calculated only once, at the end of the epoch, when result() is called. However, when looking at the validation loss, I found that the training loss is lower than the validation loss:

[Plot: training vs. validation loss in TensorBoard]

Upvotes: 9

Views: 1365

Answers (1)

amin

Reputation: 289

They are different because the optimizer updates the parameters at the end of each batch: the val_loss is computed at the end of the epoch, after those updates, while the train_loss is computed during the epoch, as the batches are processed.

Even if you have just one sample in a batch and just one batch per epoch, they will still differ, because the network does a forward pass on your sample and computes the loss, which is reported as train_loss; after updating the parameters it computes the loss again, and this time it is reported as val_loss. In this case, the next epoch's train_loss will be equal to the current epoch's val_loss.

So if you want to check that what I just said is true, you can simply set the learning_rate of your optimizer to 0; since the parameters never change, you'll get the same loss and val_loss.
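Another way to see the same effect (a minimal sketch, not part of the test code below; it assumes a compiled model and numpy arrays x and y) is a callback that re-evaluates on the training data at the end of each epoch, once that epoch's updates are done. Its loss should match the reported val_loss when the same data is passed as validation_data:

import tensorflow as tf

class EvalOnTrainAtEpochEnd(tf.keras.callbacks.Callback):
    """Evaluate on the training data with the weights as they are at the end of the epoch."""
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def on_epoch_end(self, epoch, logs=None):
        results = self.model.evaluate(self.x, self.y, verbose=0)
        # evaluate() returns a list when metrics are compiled, otherwise a scalar
        end_of_epoch_loss = results[0] if isinstance(results, list) else results
        print(f"epoch {epoch}: running train loss = {logs['loss']:.4f}, "
              f"val loss = {logs['val_loss']:.4f}, "
              f"end-of-epoch loss on train data = {end_of_epoch_loss:.4f}")

# usage: model.fit(x, y, validation_data=(x, y), epochs=5,
#                  callbacks=[EvalOnTrainAtEpochEnd(x, y)])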

And here is the code I tested for the same problem on MNIST (you can temporarily view the code and the results in my colab here):

# ---------------------------------
# Importing stuff
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

from tensorflow.keras.utils import to_categorical

# ---------------------------------
(trainX, trainy), (testX, testy) = keras.datasets.mnist.load_data()

# one-hot
trainy = to_categorical(trainy, 10)
testy = to_categorical(testy, 10)

# image should be in shape of (28, 28, 1) not (28, 28)
trainX = np.expand_dims(trainX, -1)
testX = np.expand_dims(testX, -1)

# normalize
trainX = trainX/255.
testX = testX/255.

# ---------------------------------
# Build the model
model = Sequential()
model.add(Input(trainX.shape[1:]))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))

model.summary()

Compile and fit for several scenarios:

# training on 1 sample, but with learning_rate != 0
opt = keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer = opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

batchX = trainX[0].reshape(1, 28, 28, 1)
batchy = trainy[0].reshape(1, 10)

model.fit(batchX, batchy, validation_data = (batchX, batchy), batch_size = 1, 
          shuffle = False, validation_batch_size = 1, epochs = 5)

# You will see that the loss and val_loss are different, and the
# next step's loss is equal to the current step's val_loss

# training on 1 sample, with learning_rate == 0
opt = keras.optimizers.Adam(learning_rate = 0)
model.compile(optimizer = opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

batchX = trainX[0].reshape(1, 28, 28, 1)
batchy = trainy[0].reshape(1, 10)

model.fit(batchX, batchy, validation_data = (batchX, batchy), batch_size = 1, 
          shuffle = False, validation_batch_size = 1, epochs = 5)

# You will see that the loss and val_loss are equal because
# the parameters will not change

# training on the complete dataset but with learning_rate != 0
opt = keras.optimizers.Adam(learning_rate = 0.001)
model.compile(optimizer = opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

model.fit(trainX, trainy, validation_data = (trainX, trainy), batch_size = 32, 
          shuffle = False, validation_batch_size = 32, epochs = 5)

# this is similar to the case you asked about

# training on the complete dataset and learning_rate == 0
opt = keras.optimizers.Adam(learning_rate = 0)
model.compile(optimizer = opt, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

model.fit(trainX, trainy, validation_data = (trainX, trainy), batch_size = 32, 
          shuffle = False, validation_batch_size = 32, epochs = 5)

# set the learning_rate to zero and again you'll get loss == val_loss

Upvotes: 2
