I'm trying to fine-tune a LLaMA model using LoRA, but I'm getting the following error during training:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
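For context, PyTorch raises this error whenever backward() is called on a tensor that has no autograd history; a minimal standalone example (unrelated to my setup, just to illustrate the error class):
import torch

loss = torch.tensor(1.0)  # requires_grad=False, so there is no grad_fn
loss.backward()           # RuntimeError: element 0 of tensors does not require grad ...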
Here's my training setup:
import os
import time
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType
from safetensors.torch import save_file
def prepare_sample(example):
    """Prepare a single sample, returning None if invalid."""
    if not isinstance(example['original_text'], str) or not isinstance(example['answers'], str):
        return None
    if not example['original_text'].strip() or not example['answers'].strip():
        return None
    # Turkish prompt template: "Soru" = "Question", "Cevap" = "Answer"
    return f"Soru: {example['original_text'].strip()}\nCevap: {example['answers'].strip()}"
def tokenize_function(examples, tokenizer):
    """Tokenize with proper error handling and validation (FIXED VERSION)."""
    processed_texts = []
    for idx in range(len(examples['original_text'])):
        sample = prepare_sample({
            'original_text': examples['original_text'][idx],
            'answers': examples['answers'][idx]
        })
        if sample:
            processed_texts.append(sample)
    if not processed_texts:
        return {"input_ids": [], "attention_mask": []}
    tokenized = tokenizer(
        processed_texts,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors=None
    )
    # No manual label assignment here; labels are added by the data collator.
    return tokenized
def main():
    # Memory optimization
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print("Loading dataset...")
    dataset_name = "hcsolakoglu/turkish-wikipedia-qa-4-million"
    data = load_dataset(dataset_name)

    print("Setting up model and tokenizer...")
    base_model_name = "Llama-3.2-1B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load model with configuration
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )

    # Disable model caching for gradient checkpointing
    model.config.use_cache = False

    print("Configuring LoRA...")
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=4,
        lora_alpha=8,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        inference_mode=False
    )
    model = get_peft_model(model, peft_config)

    # Verify trainable parameters
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable%: {100 * trainable_params / all_param:.2f}"
    )
print("Processing dataset...")
tokenized_data = data.map(
lambda x: tokenize_function(x, tokenizer),
batched=True,
batch_size=100,
num_proc=4,
remove_columns=data["train"].column_names,
desc="Tokenizing dataset"
)
print("Filtering dataset...")
tokenized_data = tokenized_data.filter(
lambda x: len(x["input_ids"]) > 0,
desc="Removing empty examples"
)
if len(tokenized_data["train"]) == 0:
raise ValueError("No valid training examples found after processing!")
print(f"Final dataset size: {len(tokenized_data['train'])} examples")
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
pad_to_multiple_of=8
)
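    # With mlm=False, this collator copies input_ids into labels and masks pad
    # tokens to -100, which is why tokenize_function leaves labels unset.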
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=4e-4,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        max_steps=1500,
        lr_scheduler_type="cosine",
        warmup_ratio=0.01,
        weight_decay=0.01,
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        bf16=True,
        dataloader_num_workers=4,
        dataloader_pin_memory=False,
        optim="adamw_torch",
        logging_steps=10,
        report_to="none",
        gradient_checkpointing=True,
        remove_unused_columns=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        data_collator=data_collator,
    )
    try:
        print("Starting training...")
        start_time = time.time()
        trainer.train()
        end_time = time.time()
        print(f"Training completed in {(end_time - start_time) / 3600:.2f} hours")

        output_dir = "./RealityLLMs"
        os.makedirs(output_dir, exist_ok=True)
        save_file(model.state_dict(), os.path.join(output_dir, "adapter_model.safetensors"))
        print(f"Model saved at: {output_dir}")
    except Exception as e:
        print(f"Training failed with error: {str(e)}")
        raise


if __name__ == "__main__":
    main()
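To narrow things down, a single collated batch can be run through the model outside the Trainer to see whether the loss carries a grad_fn (a debugging sketch reusing the objects from the script above; the batch size of 2 is arbitrary):
batch = data_collator([tokenized_data["train"][i] for i in range(2)])
batch = {k: v.to(model.device) for k, v in batch.items()}
outputs = model(**batch)
print(outputs.loss, outputs.loss.requires_grad)  # does the loss have autograd history?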
Expected behavior: the code should properly fine-tune the model using LoRA adapters. I've verified requires_grad=True on the LoRA parameters.
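For reference, this is the kind of check used to confirm that (a minimal sketch over the model object from the script above):
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, tuple(param.shape))
# Expected output: only the lora_A / lora_B weights of q_proj and v_proj.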
What's causing this gradient computation error, and how can I fix it?