Reputation: 253
I am looking to convert a Flan-T5 model downloaded from Hugging Face into ONNX format and run inference with it.
My input data is the symptoms of a disease, and the expected output is the disease name.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import onnx
# Set the device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl").to(device)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
# Export the model to ONNX format
onnx_path = "flan-t5-xl.onnx"
dummy_input = tokenizer("What's the disease name in this text: Example text", return_tensors="pt", padding=True).to(device)
dummy_input_ids = dummy_input["input_ids"]
dummy_attention_mask = dummy_input["attention_mask"]
dummy_decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)
with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=11,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "decoder_input_ids": {0: "batch_size"},
            "output": {0: "batch_size", 1: "sequence_length"},
        },
    )
print(f"Model saved to {onnx_path}")
# Inference using the ONNX model on GPU
import onnxruntime
onnx_model = onnxruntime.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])
Creating the inference session fails with:
InvalidGraph: [ONNXRuntimeError] : 10 : INVALID_GRAPH : Load model from flan-t5-xl.onnx failed:This is an invalid model. Type Error: Type 'tensor(int64)' of input parameter (/decoder/block.0/layer.0/SelfAttention/Sub_output_0) of operator (Min) in node (/decoder/block.0/layer.0/SelfAttention/Min) is invalid.
The rest of my inference code is:
input_text = input("Enter Disease/Symptom Detail: ")
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)
onnx_inputs = {
    "input_ids": input_ids.cpu().numpy(),
    "attention_mask": attention_mask.cpu().numpy(),
    "decoder_input_ids": decoder_input_ids.cpu().numpy(),
}
onnx_output = onnx_model.run(None, onnx_inputs)[0]
decoded_output = tokenizer.decode(onnx_output[0], skip_special_tokens=True)
print('-' * 100)
print(f"Name of Disease based on Entered Text: {decoded_output}")
Upvotes: 1
Views: 552
Reputation: 122092
Use https://huggingface.co/datasets/bakks/flan-t5-onnx instead.
To convert the google/flan-t5 models yourself, see https://huggingface.co/datasets/bakks/flan-t5-onnx/blob/main/exportt5.py:
from pathlib import Path
import transformers as t
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM
# print out the version of the transformers library
print("transformers version:", t.__version__)
models = [
#"google/flan-t5-small",
#"google/flan-t5-base",
#"google/flan-t5-large",
"google/flan-t5-xl",
"google/flan-t5-xxl",
]
for model_id in models:
    model_name = model_id.split("/")[1]
    onnx_path = Path("onnx/" + model_name)
    # load vanilla transformers and convert to onnx
    model = ORTModelForSeq2SeqLM.from_pretrained(model_id, from_transformers=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # save onnx checkpoint and tokenizer
    model.save_pretrained(onnx_path)
    tokenizer.save_pretrained(onnx_path)
Then try again:
import onnxruntime
onnx_model = onnxruntime.InferenceSession(
    onnx_path, providers=["CUDAExecutionProvider"]
)
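Since the export goes through optimum, you can also load the exported folder back with ORTModelForSeq2SeqLM instead of building a raw InferenceSession yourself. A minimal sketch, assuming the onnx/flan-t5-xl directory produced by the script above and an example prompt:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# Directory produced by the export script above (assumed path)
onnx_dir = "onnx/flan-t5-xl"

# provider selects the ONNX Runtime execution provider, like in the InferenceSession call
model = ORTModelForSeq2SeqLM.from_pretrained(onnx_dir, provider="CUDAExecutionProvider")
tokenizer = AutoTokenizer.from_pretrained(onnx_dir)

inputs = tokenizer("What's the disease name in this text: Example text", return_tensors="pt")
# ORTModelForSeq2SeqLM exposes generate(), so decoding works like the PyTorch model
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))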
Upvotes: 1