I have a dataset I want to use to fine-tune a Hugging Face LLM. The dataset is quite simple: it has two columns, one with DNA sequences (each a string 5,000 letters long) and one with a binary label. The dataset is only 240 rows long.
For some reason, the trainer.train() step makes no progress. I have access to two A100 GPUs, so I don't think it's a compute-resource issue.
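For reference, df has roughly this shape (a dummy stand-in below, not my real sequences):
import pandas as pd

# Dummy stand-in for my real data: 240 rows, 5000-letter sequences, binary labels
df = pd.DataFrame({
    "sequence": ["ACGT" * 1250] * 240,
    "label": [0, 1] * 120,
})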
Here I'm loading in the data and tokenizing the sequences.
from datasets import Dataset
from transformers import AutoTokenizer

dataset = Dataset.from_pandas(df[['sequence', 'label']])

tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", trust_remote_code=True)

def tokenize_function(examples):
    outputs = tokenizer.batch_encode_plus(examples["sequence"], return_tensors="pt",
                                          truncation="longest_first", padding='max_length',
                                          max_length=836)
    return outputs

# Creating tokenized dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True, batch_size=2048)
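A quick way I can sanity-check the tokenized output (sketch):
# Inspect one tokenized example
print(tokenized_dataset)
print(len(tokenized_dataset[0]["input_ids"]))   # expect 836 because of padding='max_length'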
Here I'm reducing the number of trainable parameters with LoRA:
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForSequenceClassification.from_pretrained(
    "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species", num_labels=2, trust_remote_code=True)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=1, lora_alpha=32,
    lora_dropout=0.1, target_modules=["query", "value"])

lora_classifier = get_peft_model(model, peft_config)  # transform the classifier into a PEFT model
lora_classifier.print_trainable_parameters()

lora_classifier = nn.DataParallel(lora_classifier, device_ids=[0, 1])
lora_classifier = lora_classifier.to("cuda:0")
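As a rough check that target_modules=["query", "value"] actually matches layers in this architecture, I can list module names by suffix (sketch):
# List attention modules whose names end in "query" or "value"
matched = [name for name, _ in model.named_modules()
           if name.endswith("query") or name.endswith("value")]
print(len(matched), matched[:2])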
Here I'm prepping for training:
from transformers import TrainingArguments

args_ = TrainingArguments(
    "finetuned_NT",
    remove_unused_columns=False,
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=5e-4,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=32,
    eval_steps=10,
    logging_steps=10,
    load_best_model_at_end=True,      # keep the best model according to the evaluation
    metric_for_best_model="ROC-AUC",  # the ROC-AUC on the evaluation set is used to select the best model
    label_names=["label"],
    dataloader_drop_last=True,
    max_steps=10
)
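For context, the step arithmetic these arguments imply with 2 GPUs (it matches the training log below):
# Back-of-the-envelope step math for these arguments on 2 GPUs (DataParallel)
num_train = 192                              # 240 rows * 0.8 train split
total_batch = 32 * 2 * 1                     # per_device_train_batch_size * n_gpus * gradient_accumulation_steps
steps_per_epoch = num_train // total_batch   # dataloader_drop_last=True -> 3 steps per epoch
print(steps_per_epoch)                       # 3, so max_steps=10 is about 4 epochs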
import numpy as np
import evaluate

# metrics loaded from the `evaluate` library
auc_score = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute AUC
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs,
                                     references=labels)['roc_auc'], 3)
    # predict the most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes,
                                    references=labels)['accuracy'], 3)
    return {"Accuracy": acc, "ROC-AUC": auc}
Splitting data:
from datasets import DatasetDict

train_test_split = tokenized_dataset.select_columns(['label', 'input_ids', 'attention_mask']).train_test_split(test_size=0.2, seed=42)

# Now split the test set into test (50%) and validation (50%)
test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# Combine the splits into a DatasetDict
final_split = DatasetDict({
    'train': train_test_split['train'],
    'test': test_val_split['train'],
    'validation': test_val_split['test']
})
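For 240 rows, these splits work out to the sizes below (sketch):
# Expected split sizes: 80% train, then the 20% held-out portion split 50/50
print({name: len(split) for name, split in final_split.items()})
# {'train': 192, 'test': 24, 'validation': 24}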
from transformers import Trainer

trainer = Trainer(
    model=lora_classifier,                  # the DataParallel-wrapped PEFT model from above
    args=args_,
    train_dataset=final_split['train'],
    eval_dataset=final_split['validation'],
    tokenizer=tokenizer,                    # tokenizer used for preprocessing
    compute_metrics=compute_metrics,        # metric function defined above
)
When I run trainer.train(), no progress bar appears and no logs come up (I tried waiting for hours). The output just sits like this with no change:
/sc/arion/work/test-env/envs/test/lib/python3.8/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
***** Running training *****
Num examples = 192
Num Epochs = 4
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 64
Gradient Accumulation steps = 1
Total optimization steps = 10
Number of trainable parameters = 1170434