ErenalpCet

RuntimeError with PyTorch when Fine-tuning LLM: "element 0 of tensors does not require grad"

I'm trying to fine-tune a LLaMA model using LoRA, but I'm getting the following error during training:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
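
As I understand it, this is PyTorch's generic error when .backward() is called on a tensor that has no connection to any parameter with requires_grad=True. A minimal standalone reproduction of the error class (not my training code, just to show what PyTorch is complaining about):

import torch

# A tensor built directly with torch.tensor() has requires_grad=False
# and no grad_fn, so backward() raises the same RuntimeError.
loss = torch.tensor(1.0)
loss.backward()  # RuntimeError: element 0 of tensors does not require grad ...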

Code

Here's my training setup:

import os
import time
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType
from safetensors.torch import save_file

def prepare_sample(example):
    """Build a single training string; return None for invalid rows."""
    if not isinstance(example['original_text'], str) or not isinstance(example['answers'], str):
        return None
    if not example['original_text'].strip() or not example['answers'].strip():
        return None
    # "Soru" / "Cevap" are Turkish for "Question" / "Answer" (the dataset is Turkish QA)
    return f"Soru: {example['original_text'].strip()}\nCevap: {example['answers'].strip()}"

def tokenize_function(examples, tokenizer):
    """Tokenize with proper error handling and validation (FIXED VERSION)."""
    processed_texts = []
    for idx in range(len(examples['original_text'])):
        sample = prepare_sample({
            'original_text': examples['original_text'][idx],
            'answers': examples['answers'][idx]
        })
        if sample:
            processed_texts.append(sample)
    
    if not processed_texts:
        return {"input_ids": [], "attention_mask": []}

    tokenized = tokenizer(
        processed_texts,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors=None
    )
    
    # No manual label assignment; DataCollatorForLanguageModeling(mlm=False) adds the labels
    return tokenized

def main():
    # Memory optimization
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print("Loading dataset...")
    dataset_name = "hcsolakoglu/turkish-wikipedia-qa-4-million"
    data = load_dataset(dataset_name)
    
    print("Setting up model and tokenizer...")
    base_model_name = "Llama-3.2-1B-Instruct"  # local path; on the Hub this is "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    tokenizer.pad_token = tokenizer.eos_token  # LLaMA tokenizers have no pad token by default
    
    # Load model with configuration
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )
    
    # Disable model caching for gradient checkpointing
    model.config.use_cache = False

    print("Configuring LoRA...")
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=4,
        lora_alpha=8,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        inference_mode=False
    )
    
    model = get_peft_model(model, peft_config)
    
    # Verify trainable parameters
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable%: {100 * trainable_params / all_param:.2f}"
    )

    print("Processing dataset...")
    tokenized_data = data.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        batch_size=100,
        num_proc=4,
        remove_columns=data["train"].column_names,
        desc="Tokenizing dataset"
    )

    print("Filtering dataset...")
    tokenized_data = tokenized_data.filter(
        lambda x: len(x["input_ids"]) > 0,
        desc="Removing empty examples"
    )

    if len(tokenized_data["train"]) == 0:
        raise ValueError("No valid training examples found after processing!")

    print(f"Final dataset size: {len(tokenized_data['train'])} examples")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8
    )

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=4e-4,
        per_device_train_batch_size=16,
        gradient_accumulation_steps=4,
        max_steps=1500,
        lr_scheduler_type="cosine",
        warmup_ratio=0.01,
        weight_decay=0.01,
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        bf16=True,
        dataloader_num_workers=4,
        dataloader_pin_memory=False,
        optim="adamw_torch",
        logging_steps=10,
        report_to="none",
        gradient_checkpointing=True,
        remove_unused_columns=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        data_collator=data_collator,
    )

    try:
        print("Starting training...")
        start_time = time.time()
        trainer.train()
        end_time = time.time()
        print(f"Training completed in {(end_time - start_time) / 3600:.2f} hours")
        
        output_dir = "./RealityLLMs"
        os.makedirs(output_dir, exist_ok=True)
        save_file(model.state_dict(), os.path.join(output_dir, "adapter_model.safetensors"))
        print(f"Model saved at: {output_dir}")
        
    except Exception as e:
        print(f"Training failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Expected Behavior

Training should run to completion, updating only the LoRA adapter weights, and save the adapters at the end.

What I've Tried

Environment

Question

What's causing this gradient computation error, and how can I fix it?
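
In case it's useful, a single-batch sanity check like the sketch below (reusing model, tokenized_data, and data_collator from the script above) should show whether the loss tensor is already detached before the Trainer ever calls backward():

import torch

model.train()
# Collate two tokenized examples exactly as the Trainer would
batch = data_collator([tokenized_data["train"][i] for i in range(2)])
batch = {k: v.to(model.device) for k, v in batch.items()}
outputs = model(**batch)
# If this prints "False None", the graph is detached before training starts
print(outputs.loss.requires_grad, outputs.loss.grad_fn)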
