NinaNuska

Reputation: 23

GPU running out of memory when trying to load a large pretrained model

I am using Hugging Face to load some pretrained models and run some tests on my data.

My code looks like this:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Tried to mitigate the out-of-memory problem on large models with this, but it doesn't help
import time
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from datasets import load_dataset

def load_model_huggingface(model_name):
    """
    Load a Hugging Face model with specific quantization settings.

    Parameters:
        model_name (str): The name of the model as listed on Hugging Face, e.g. 'bigscience/bloom-7b1'.

    Returns:
        tokenizer (AutoTokenizer): Tokenizer for the specified model.
        model (AutoModelForCausalLM): Quantized model loaded from Hugging Face.
    """

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    #model = AutoModel.from_pretrained(model_name, token="YOUR_TOKEN_GOES_HERE")

    # Define quantization configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,  # Load model weights in 4-bit
        bnb_4bit_quant_type="nf4",  # Use nf4 quantization type
        bnb_4bit_use_double_quant=True,  # Enable nested quantization
        bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
        token="YOUR_TOKEN_GOES_HERE",
    )

    # Load quantized model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        token="YOUR_TOKEN_GOES_HERE",
        low_cpu_mem_usage=True
    )

    # Disable caching in model configuration
    model.config.use_cache = False

    return tokenizer, model

#model = 'mistralai/Mistral-7B-Instruct-v0.2'
#model = 'mistralai/Mixtral-8x7B-v0.1'
#model = 'bigscience/bloom-7b1'
model = 'bigscience/bloom'

login(token="YOUR_TOKEN_GOES_HERE")

# Use this function if you have already downloaded the model
print("Getting model")
tokenizer, model = load_model_huggingface(model)
dataset = load_dataset('csv', data_files={'test': 'animals.csv'})

generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.001,
    max_new_tokens=256,
    repetition_penalty=1.1
)

prompts = []
prompts.append("{english} in English is {slovene} in Slovenian, can you tell me in Slovenian, the meaning of {slovene}")
prompts.append("{english} in english is {slovene} in Slovenian, can u tell me, the meaning of {english} and also translate it to slovenian")
prompts.append("Kaj je {slovene} (angleži temu pravijo {english})")
prompts.append("Definition of word {english}, also called {slovene} in Slovenian")

with open("prompt1_bloom.txt", "w") as file:
    print("Dataset has: " + str(len(dataset["test"]["en"])) + " entries")
    for i in range(len(prompts)):
        for j in range(len(dataset["test"]["en"])):
            start = time.time()
            prompt_number = i
            print("Prompt: "+str(prompt_number)+" | "+str(j)+" Word")

            english = dataset["test"]["en"][j]
            slovene = dataset["test"]["sl"][j]

            #prompt = f"""
            #    dog in english is pes in Slovenian, can u tell me in Slovenian, the meaning of pes
            #    """

            prompt = prompts[i]
            prompt = prompt.replace("{english}", english).replace("{slovene}", slovene)

            #print(prompt)

            res = generator(prompt)
            end = time.time()
            file.write("================================\n")
            file.write("Question number: "+str(0)+"\n")
            file.write("Prompt: ")
            file.write(prompt+"\n")
            file.write("================================\n")
            file.write("Generated output\n")
            file.write(res[0]["generated_text"][len(prompt):]+"\n")
            file.write("================================\n")
            file.write("Time need to generate prompt "+str(end - start)+" \n")
            file.write("================================\n")
            file.write("--------------------------------\n")

If the model is small, like mistralai/Mistral-7B-Instruct-v0.2 or bigscience/bloom-7b1, everything works just fine.

But if the model is big, like bigscience/bloom or even meta-llama/Meta-Llama-3.1-405B, then I start to get CUDA out-of-memory errors.
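
For context, here is a rough back-of-the-envelope estimate of just the 4-bit weight memory (my assumptions: about 176B parameters for bigscience/bloom and 405B for the Llama model; activations and quantization overhead are ignored):

# Rough estimate of 4-bit weight size per model (assumed parameter counts),
# compared against the 32 GiB of a single V100.
GIB = 1024**3

def weights_gib(n_params, bits=4):
    # bits / 8 bytes per parameter, converted to GiB
    return n_params * bits / 8 / GIB

for name, n_params in [("bloom-7b1", 7.1e9), ("bloom", 176e9), ("Llama-3.1-405B", 405e9)]:
    print(f"{name}: ~{weights_gib(n_params):.0f} GiB of 4-bit weights vs. 32 GiB on one V100")

So even in 4-bit, the weights of the big models are far larger than a single GPU, while the small ones fit comfortably.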

The exact error looks like this:

INFO:    Setting 'NVIDIA_VISIBLE_DEVICES=all' to emulate legacy GPU binding.
Unused kwargs: ['token']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /d/hpc/home/mk8054/.cache/huggingface/token
Login successful
Getting model

Loading checkpoint shards:   0%|          | 0/72 [00:00<?, ?it/s]
Loading checkpoint shards:   1%|▏         | 1/72 [00:11<14:05, 11.91s/it]
Loading checkpoint shards:   3%|▎         | 2/72 [00:21<12:13, 10.48s/it]
Loading checkpoint shards:   4%|▍         | 3/72 [00:29<10:38,  9.25s/it]
Loading checkpoint shards:   6%|▌         | 4/72 [00:37<10:04,  8.89s/it]
Loading checkpoint shards:   7%|▋         | 5/72 [00:45<09:32,  8.54s/it]
Loading checkpoint shards:   8%|▊         | 6/72 [00:53<09:07,  8.29s/it]
Loading checkpoint shards:  10%|▉         | 7/72 [01:01<08:59,  8.31s/it]
Loading checkpoint shards:  11%|█         | 8/72 [01:09<08:44,  8.19s/it]
Loading checkpoint shards:  12%|█▎        | 9/72 [01:17<08:39,  8.24s/it]
Loading checkpoint shards:  14%|█▍        | 10/72 [01:26<08:30,  8.23s/it]
Loading checkpoint shards:  15%|█▌        | 11/72 [01:34<08:16,  8.15s/it]
Loading checkpoint shards:  17%|█▋        | 12/72 [01:41<08:02,  8.04s/it]
Loading checkpoint shards:  18%|█▊        | 13/72 [01:49<07:47,  7.93s/it]
Loading checkpoint shards:  19%|█▉        | 14/72 [01:57<07:42,  7.98s/it]
Loading checkpoint shards:  21%|██        | 15/72 [02:05<07:37,  8.03s/it]
Loading checkpoint shards:  22%|██▏       | 16/72 [02:13<07:27,  7.99s/it]
Loading checkpoint shards:  24%|██▎       | 17/72 [02:21<07:17,  7.96s/it]
Loading checkpoint shards:  25%|██▌       | 18/72 [02:29<07:07,  7.92s/it]
Loading checkpoint shards:  26%|██▋       | 19/72 [02:37<07:09,  8.11s/it]
Loading checkpoint shards:  28%|██▊       | 20/72 [02:45<06:55,  8.00s/it]
Loading checkpoint shards:  28%|██▊       | 20/72 [02:53<07:31,  8.68s/it]
Traceback (most recent call last):
  File "/d/hpc/home/mk8054/bats_bloom.py", line 63, in <module>
    tokenizer, model = load_model_huggingface(model)
  File "/d/hpc/home/mk8054/bats_bloom.py", line 37, in load_model_huggingface
    model = AutoModelForCausalLM.from_pretrained(
  File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 3916, in from_pretrained
    ) = cls._load_pretrained_model(
  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4390, in _load_pretrained_model
    new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 938, in _load_state_dict_into_meta_model
    hf_quantizer.create_quantized_param(model, param, param_name, param_device, state_dict, unexpected_keys)
  File "/opt/conda/lib/python3.10/site-packages/transformers/quantizers/quantizer_bnb_4bit.py", line 217, in create_quantized_param
    new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(target_device)
  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 332, in to
    return self._quantize(device)
  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/nn/modules.py", line 297, in _quantize
    w_4bit, quant_state = bnb.functional.quantize_4bit(
  File "/opt/conda/lib/python3.10/site-packages/bitsandbytes/functional.py", line 1173, in quantize_4bit
    out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 392.00 MiB. GPU 
srun: error: wn221: task 0: Exited with exit code 1

which is very weird to me, because I use SLURM and have access to an HPC cluster with 48x NVIDIA V100 GPUs (each with about 32 GB of memory).

My run script looks like this:

#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --partition=gpu
#SBATCH --gres=gpu:2 #--gpus=2
#SBATCH --cpus-per-task=4
#SBATCH --time=24:00:00
#SBATCH --output=logs/sling-my-model-%J.out
#SBATCH --job-name="My Model running script"


srun singularity exec --nv --nvccli --writable-tmpfs  python-container.sif python bats.py
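
For completeness, this is a minimal diagnostic I could add at the top of the Python script to check how many GPUs the process actually sees (a hypothetical snippet, not currently part of my code):

import os
import torch

# Hypothetical check: report which GPUs this process can actually use.
print("CUDA_VISIBLE_DEVICES =", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.cuda.device_count() =", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.0f} GiB")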

How can I load these large models? Is there any way I can modify my scripts, or do I have to find a better computer to run my models on?
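
For reference, here is a hypothetical variant of the loading call using the device_map="auto" option of transformers' from_pretrained; I have not tested it and I am not sure it is the right fix:

# Untested sketch: ask transformers/accelerate to shard the quantized model
# across all GPUs visible to the process instead of putting it on one device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    token="YOUR_TOKEN_GOES_HERE",
    low_cpu_mem_usage=True,
    device_map="auto",
)

Would something like that even help, given that my script currently sets CUDA_VISIBLE_DEVICES to '0' and the SLURM job only requests --gres=gpu:2?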

Upvotes: 1

Views: 371

Answers (0)
