Setup: a multi-model endpoint in AWS SageMaker with the NVIDIA Triton Inference Server, based on the documentation provided here -> https://github.com/aws/amazon-sagemaker-examples/blob/main/inference/nlp/realtime/triton/multi-model/t5_pytorch_python-backend/t5_pytorch_python-backend.ipynb. We construct a request payload using the httpclient module provided by tritonclient.http, i.e. httpclient.InferenceServerClient.generate_request_body(inputs, outputs=outputs, ...).
Every example I have seen passes a list for both the inputs and outputs parameters. Are there any examples of passing just one input instead of a list? Also, on the backend, the code that processes this request is the model.py file (code below), and it looks like it only accepts a list of inputs rather than a single input. Is there a way to override that?
import tritonclient.http as httpclient
import numpy as np

def get_text_payload_binary(model_name, text):
    inputs = []
    outputs = []
    # tokenize_text is a helper from the notebook; it returns numpy arrays
    input_ids, attention_mask = tokenize_text(model_name, text)
    inputs.append(httpclient.InferInput("input_ids", input_ids.shape, "INT32"))
    inputs.append(httpclient.InferInput("attention_mask", attention_mask.shape, "INT32"))
    inputs[0].set_data_from_numpy(input_ids.astype(np.int32), binary_data=True)
    inputs[1].set_data_from_numpy(attention_mask.astype(np.int32), binary_data=True)
    output_name = "output" if model_name == "t5-small" else "logits"
    # request the named output tensor back as binary data
    outputs.append(httpclient.InferRequestedOutput(output_name, binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length
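
For what it's worth, this is the closest I have gotten to a "single input" request (a minimal sketch; the idea of a model that takes only input_ids, and the fact that the lone InferInput still has to be wrapped in a one-element list, are my assumptions, since the inputs parameter of generate_request_body appears to require a list):

    import numpy as np
    import tritonclient.http as httpclient

    def get_single_input_payload(input_ids):
        # Hypothetical single-input model: only "input_ids", no attention mask
        single_input = httpclient.InferInput("input_ids", input_ids.shape, "INT32")
        single_input.set_data_from_numpy(input_ids.astype(np.int32), binary_data=True)
        # generate_request_body still expects a list, so the lone input is
        # wrapped in a one-element list; outputs is optional and omitted here
        request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
            [single_input]
        )
        return request_body, header_length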
model.py
import numpy as np
import sys
import os
import json
from pathlib import Path
import torch
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def initialize(self, args):
        ...

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
            A list of pb_utils.InferenceRequest

        Returns
        -------
        list
            A list of pb_utils.InferenceResponse. The length of this list must
            be the same as `requests`
        """
        responses = []
        for request in requests:
            # pull each input tensor out of the request by name and move it to the GPU
            input_ids = pb_utils.get_input_tensor_by_name(request, "input_ids")
            input_ids = input_ids.as_numpy()
            input_ids = torch.as_tensor(input_ids).long().cuda()
            attention_mask = pb_utils.get_input_tensor_by_name(request, "attention_mask")
            attention_mask = attention_mask.as_numpy()
            attention_mask = torch.as_tensor(attention_mask).long().cuda()
            inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
            translation = self.model.generate(**inputs, num_beams=1)
            np_translation = translation.cpu().int().detach().numpy()
            # one InferenceResponse per InferenceRequest, in the same order
            inference_response = pb_utils.InferenceResponse(
                output_tensors=[
                    pb_utils.Tensor(
                        "output",
                        np_translation.astype(self.output_dtype)
                    )
                ]
            )
            responses.append(inference_response)
        return responses
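
Reading the docstring above, the list that execute receives is a list of pb_utils.InferenceRequest objects (Triton can hand several batched requests to the model in one call), not a list of inputs as such; within each request the tensors are still fetched one at a time by name. So for a single-input model I would expect execute (inside the same TritonPythonModel class as above) to look roughly like this (a minimal sketch under that assumption; the single "input_ids" input and the generate call without an attention mask are hypothetical):

    def execute(self, requests):
        # `requests` is always a list: the Python backend may deliver several
        # batched requests in one execute() call, and the API expects one
        # InferenceResponse per InferenceRequest, in the same order
        responses = []
        for request in requests:
            # only one input tensor in this hypothetical model
            input_ids = pb_utils.get_input_tensor_by_name(request, "input_ids")
            input_ids = torch.as_tensor(input_ids.as_numpy()).long().cuda()
            # assumption: the model tolerates a missing attention mask
            # (e.g. no padding in the batch)
            translation = self.model.generate(input_ids=input_ids, num_beams=1)
            np_translation = translation.cpu().int().detach().numpy()
            responses.append(
                pb_utils.InferenceResponse(
                    output_tensors=[
                        pb_utils.Tensor("output", np_translation.astype(self.output_dtype))
                    ]
                )
            )
        return responses

In other words, it does not look like the list contract itself can be overridden; what changes is only how many tensors you pull out of each request.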