Reputation: 994
I have fine-tuned a Llama-3 model (model_name="meta-llama/Meta-Llama-3-8B") in the standard way per this notebook: https://colab.research.google.com/drive/1Zmaceu65d7w4Tcd-cfnZRb6k_Tcv2b8g?usp=sharing
Using the merged model, I'm trying to deploy it on AWS SageMaker as per this example: https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/pytorch_deploy_large_GPT_model/GPT-J-6B-model-parallel-inference-DJL.ipynb
Below is the code.
The Docker image I'm using is the DeepSpeed DLC 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121 (https://github.com/aws/deep-learning-containers/blob/master/available_images.md).
I'm getting the error below: ".half() is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct dtype."
Any advice? Thanks.
Error stack trace
[WARN ] <stderr>:--- Logging error ---
[INFO ] <stdout>:[2024-05-09 01:32:52,535] [INFO] [logging.py:18:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[WARN ] <stderr>:Traceback (most recent call last):
[WARN ] <stderr>: File "/tmp/.djl.ai/python/0.26.0/djl_python_engine.py", line 121, in run_server
[WARN ] <stderr>: outputs = self.service.invoke_handler(function_name, inputs)
[WARN ] <stderr>: File "/tmp/.djl.ai/python/0.26.0/djl_python/service_loader.py", line 29, in invoke_handler
[WARN ] <stderr>: return getattr(self.module, function_name)(inputs)
[WARN ] <stderr>: File "/opt/ml/model/model_pkg/model.py", line 61, in handle
[WARN ] <stderr>: predictor = get_model(inputs.get_properties())
[WARN ] <stderr>: File "/opt/ml/model/model_pkg/model.py", line 45, in get_model
[WARN ] <stderr>: model = deepspeed.init_inference(
[WARN ] <stderr>: File "/usr/local/lib/python3.10/dist-packages/deepspeed/__init__.py", line 65, in init_inference
[WARN ] <stderr>: engine=InferenceEngine(model,config=ds_inference_config)
[WARN ] <stderr>: File "/usr/local/lib/python3.10/dist-packages/deepspeed/inference/engine.py", line 48, in __init__
[WARN ] <stderr>: if config.dtype:self._convert_to_dtype(config)
[WARN ] <stderr>: File "/usr/local/lib/python3.10/dist-packages/deepspeed/inference/engine.py", line 232, in _convert_to_dtype
[WARN ] <stderr>: elif config.dtype==torch.half:self.module.half()
[WARN ] <stderr>: File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 2465, in half
[WARN ] <stderr>: raise ValueError(
[WARN ] <stderr>:ValueError: `.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.
[WARN ] <stderr>:
Code
from djl_python import Input, Output
import os
import deepspeed
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

predictor = None


def init_model(model_name="cs_model"):
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer


def get_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    local_rank = int(os.getenv("LOCAL_RANK", "0"))
    model, tokenizer = init_model(model_name="cs_model")
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank
    )
    return generator


def handle(inputs: Input) -> None:
    global predictor
    if not predictor:
        predictor = get_model(inputs.get_properties())
    if inputs.is_empty():
        # Model server makes an empty call to warm up the model on startup
        return None
    data = inputs.get_as_string()
    result = predictor(data, do_sample=True, max_new_tokens=256)
    return Output().add(result)
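For reference, the same ValueError can be reproduced outside the DJL container by calling .half() directly on the 4-bit quantized model, which is what DeepSpeed ends up doing internally. This is a minimal sketch, assuming transformers and bitsandbytes are installed, a CUDA GPU is available, and "cs_model" points to the local merged model directory:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    "cs_model",  # local merged model directory, same as in the handler above
    quantization_config=bnb_config,
    device_map="auto",
)
# transformers blocks dtype casts on quantized models, so this raises:
# ValueError: `.half()` is not supported for quantized model. ...
model.half()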
Upvotes: 0
Views: 290
Reputation: 371
Based on the stack trace, I'd suggest setting the dtype in the deepspeed.init_inference invocation to None:
model = deepspeed.init_inference(
    model,
    mp_size=tensor_parallel,
    dtype=None,
    replace_method="auto",
    replace_with_kernel_inject=True,
)
If a value other than None is provided for dtype, the engine attempts to convert the model to the provided type, which causes the exception (see the source code for reference). None is listed as one of the supported values in the engine itself.
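For context: with the BitsAndBytesConfig in init_model, model.dtype typically reports torch.float16 (the bnb_4bit_compute_dtype), so dtype=model.dtype sends DeepSpeed down the torch.half branch visible in the trace (self.module.half()), and transformers rejects that cast for quantized models. A quick sanity check you could add before init_inference; this is a hedged sketch, and the is_loaded_in_4bit attribute may vary across transformers versions:

# Inside get_model(), after init_model() and before deepspeed.init_inference():
print(model.dtype)  # usually torch.float16 (i.e. torch.half) for this config
print(getattr(model, "is_loaded_in_4bit", None))  # True for bitsandbytes 4-bit models
# Since .half()/.to(dtype) are blocked for quantized models, passing dtype=None
# lets DeepSpeed skip _convert_to_dtype and leave the quantized weights untouched.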
Upvotes: 0