Hígor Hahn

Reputation: 1

Different results for the same epoch using different total numbers of epochs

I am training a machine learning model for an STS task using the Sentence Transformers library. While testing it, I noticed that the model produced different results for the same epoch depending on the total number of epochs I passed to SentenceTransformerTrainingArguments. As an example, I trained the model for 2 and for 4 epochs, capturing the loss on the development/validation set and the Pearson correlation coefficient. Here are the results:

Id | EpochTotal | EpochPartial | EvalLoss           | Pearson
 1 |          2 |            1 | 2.7101428508758545 |  0.8982
 2 |          2 |            2 | 2.676791191101074  |  0.9186
 3 |          4 |            1 | 2.734797716140747  |  0.8934
 4 |          4 |            2 | 2.613370180130005  |  0.9252
 5 |          4 |            4 | 2.509316921234131  |  0.9404

In the table, the EpochTotal column is the number of epochs the model was trained for, and the EpochPartial column is the epoch at which the results were collected. In this example, the results for Ids 1 and 3 should be the same, since both were collected after a single epoch of training. The same goes for Ids 2 and 4. To make sure the differences were not caused by random model initialization, I ran each configuration twice with the same parameters and got identical results both times. Checking the library, I found that it uses seed = 42. Here is the code I am using:

import logging
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

model_name = "neuralmind/bert-large-portuguese-cased"
output_dir = "OutputModel"
dataset    = "assin2"

train_key = 'train'
valll_key = 'validation'
testt_key = 'test'

sent1_key = 'premise'
sent2_key = 'hypothesis'
label_key = 'relatedness_score'

logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

num_epochs = 4 # 2 or 4
train_batch_size = 8

model = SentenceTransformer(model_name)

train_loss = losses.CoSENTLoss(model)

# ----------------------------------------------------------------------------------
# Loading the dataset

dataset = load_dataset(dataset, trust_remote_code=True)

# ----------------------------------------------------------------------------------
# Normalizing the scores to [0, 1] (in theory)

scMin = 1.0
scMax = 5.0

def normalize(scores):
    # Min-max scaling from [scMin, scMax] to [0, 1]
    return [float((sc - scMin) / (scMax - scMin)) for sc in scores]

train_norm = normalize(dataset[train_key][label_key])
testt_norm = normalize(dataset[testt_key][label_key])
valll_norm = normalize(dataset[valll_key][label_key])

# ----------------------------------------------------------------------------------------------
# Removing all unnecessary columns

dataset = dataset.remove_columns(list(set(dataset[train_key].features.keys())-set([sent1_key, sent2_key])))

# ----------------------------------------------------------------------------------
# Adding the normalized columns

label_key = 'score'
dataset[train_key] = dataset[train_key].add_column(label_key, train_norm)
dataset[testt_key] = dataset[testt_key].add_column(label_key, testt_norm)
dataset[valll_key] = dataset[valll_key].add_column(label_key, valll_norm)

logging.info(dataset)

# ----------------------------------------------------------------------------------

# 4. Define an evaluator for use during training. This is useful to keep track of alongside the evaluation loss.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1      = dataset[valll_key][sent1_key],
    sentences2      = dataset[valll_key][sent2_key],
    scores          = dataset[valll_key][label_key],
    main_similarity = SimilarityFunction.COSINE,
    name            = "sts-dev",
)

# 5. Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir                  = output_dir,
    num_train_epochs            = num_epochs,
    per_device_train_batch_size = train_batch_size,
    per_device_eval_batch_size  = train_batch_size,
    learning_rate               = 1e-5,
    fp16                        = True,
    bf16                        = False,
    evaluation_strategy         = "epoch",
    logging_steps               = 100,
    run_name                    = "sts",
    save_strategy               = "no",
)

# 6. Create the trainer & start training
trainer = SentenceTransformerTrainer(
    model         = model,
    args          = args,
    train_dataset = dataset[train_key],
    eval_dataset  = dataset[valll_key],
    loss          = train_loss,
    evaluator     = dev_evaluator,
)

trainer.train()

Does it make sense for the model to produce different results at the same epoch when the total number of epochs differs? Is there a parameter that depends on the total number of epochs that could be causing this difference?

Upvotes: 0

Views: 109

Answers (1)

Helena Barcellos

Reputation: 1

The issue is likely related to the learning rate scheduler: lr_scheduler_type defaults to "linear", which decays the learning rate from its initial value down to zero over the total number of training steps. Because the total number of steps depends on num_train_epochs, the learning rate at the end of epoch 1 is about 50% of the initial value in a 2-epoch run but about 75% in a 4-epoch run, so the two runs apply different updates and produce different results at the same epoch.
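To see the effect in isolation, here is a minimal sketch (plain PyTorch plus the transformers scheduler helper, separate from your training pipeline; steps_per_epoch is a made-up number for illustration only) that computes the learning rate reached at the end of epoch 1 under a linear schedule for a 2-epoch run and a 4-epoch run:

import torch
from transformers import get_linear_schedule_with_warmup

def lr_at_step(step, total_steps, base_lr=1e-5):
    # Dummy parameter and optimizer, used only to drive the scheduler.
    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.AdamW([param], lr=base_lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    for _ in range(step):
        optimizer.step()
        scheduler.step()
    return scheduler.get_last_lr()[0]

steps_per_epoch = 100  # hypothetical value for illustration
print(lr_at_step(steps_per_epoch, 2 * steps_per_epoch))  # end of epoch 1 of 2 -> 5.0e-06
print(lr_at_step(steps_per_epoch, 4 * steps_per_epoch))  # end of epoch 1 of 4 -> 7.5e-06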

To make runs comparable across different total epoch counts, you need a scheduler whose value at a given step does not depend on the total number of steps. I'm not deeply familiar with the Sentence Transformers library, so I can't say exactly how to modify your code, but a sketch of one possible change follows.
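A minimal sketch, assuming SentenceTransformerTrainingArguments accepts the standard Hugging Face TrainingArguments options (it subclasses them): request a constant schedule so the learning rate no longer depends on the total number of steps.

args = SentenceTransformerTrainingArguments(
    output_dir                  = output_dir,
    num_train_epochs            = num_epochs,
    per_device_train_batch_size = train_batch_size,
    per_device_eval_batch_size  = train_batch_size,
    learning_rate               = 1e-5,
    fp16                        = True,
    bf16                        = False,
    evaluation_strategy         = "epoch",
    logging_steps               = 100,
    run_name                    = "sts",
    save_strategy               = "no",
    lr_scheduler_type           = "constant",  # removes the dependence on total steps
)

With a constant schedule, epoch 1 of a 2-epoch run and epoch 1 of a 4-epoch run should produce matching metrics, assuming the seed and data order are otherwise identical. You may still prefer linear decay for final model quality; this change is mainly a way to confirm the diagnosis.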

Upvotes: 0
