Reputation: 1
I'm trying to run a binary semantic segmentation model, which I want to speed up using TensorRT. I based my code on:
https://stackoverflow.com/questions/59280745/inference-with-tensorrt-engine-file-on-python/67492525#67492525:~:text=I%20have-,updated,-%40Oguz%20Vuruskaner%27s%20answer
and
https://tengteng.medium.com/example-inference-code-to-run-tensorrt-10-0-32ea93fdcc2e
INFO
Checking the engine file gives:
&&&& PASSED TensorRT.trtexec [TensorRT v100700] [b23] # /usr/src/tensorrt/bin/trtexec --loadEngine=unet_mobileone.trt --shapes=input:1x3x1024x1024 --verbose
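The expected I/O can also be confirmed from Python; this is just a small sketch using the same TensorRT 10 tensor API as the code below (engine path taken from the trtexec call above):

import tensorrt as trt

logger = trt.Logger(trt.Logger.ERROR)
with open("unet_mobileone.trt", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# print name, mode (input/output), shape and dtype of every I/O tensor
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name),
          engine.get_tensor_dtype(name))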
Current code
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit # Note: required! to initialize pycuda
import tensorrt as trt
class TensorRTInference:
    def __init__(self, engine_path):
        # initialize
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)

        # setup
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()

        # allocate buffers
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(
            self.engine
        )

    def load_engine(self, engine_path):
        # loads the model from the given filepath
        with open(engine_path, "rb") as f:
            engine = self.runtime.deserialize_cuda_engine(f.read())
        return engine
    class HostDeviceMem:
        def __init__(self, host_mem, device_mem, shape):
            # keeping track of addresses
            self.host = host_mem
            self.device = device_mem
            # keeping track of shape to un-flatten it later
            self.shape = shape

    def allocate_buffers(self, engine):
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()

        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            shape = engine.get_tensor_shape(tensor_name)
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

            # allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            # append the device buffer address to device bindings
            bindings.append(int(device_mem))

            # append to the appropriate input/output list
            if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                inputs.append(self.HostDeviceMem(host_mem, device_mem, shape))
            else:
                outputs.append(self.HostDeviceMem(host_mem, device_mem, shape))

        return inputs, outputs, bindings, stream
    def infer(self, input_data):
        # transfer input data to device
        np.copyto(self.inputs[0].host, input_data.ravel())
        cuda.memcpy_htod_async(self.inputs[0].device, self.inputs[0].host, self.stream)

        # set tensor addresses
        for i in range(self.engine.num_io_tensors):
            self.context.set_tensor_address(
                self.engine.get_tensor_name(i), self.bindings[i]
            )

        # run inference
        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # transfer predictions back
        for i in range(len(self.outputs)):
            cuda.memcpy_dtoh_async(
                self.outputs[i].host, self.outputs[i].device, self.stream
            )

        # synchronize the stream
        self.stream.synchronize()

        # un-flatten the outputs
        outputs = []
        for i in range(len(self.outputs)):
            output = self.outputs[i].host
            output = output.reshape(self.outputs[i].shape)
            outputs.append(output)

        return outputs
from PIL import Image

if __name__ == "__main__":
    engine_path = "unet_mobilenet_2.trt"
    trt_inference = TensorRTInference(engine_path)

    img = Image.open("data/test/images/lineImage0.jpeg")
    img_array = np.array(img)
    inputs = img_array.transpose(2, 0, 1)  # (3, 1024, 1024)

    # Run inference
    output_data = trt_inference.infer(inputs)
The problem
The line np.copyto(self.inputs[0].host, input_data.ravel()) raises:
ValueError: could not broadcast input array from shape (5953536,) into shape (3145728,)
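For reference, 3145728 = 1 x 3 x 1024 x 1024, i.e. the volume of the engine's input tensor, while 5953536 is the flattened size of my image array, so the image I load clearly doesn't match the 1024x1024, 3-channel input the engine was built with. Below is a minimal preprocessing sketch of what I think is missing (the resize, float32 cast and 1/255 scaling are my own guesses, not something taken from the original training pipeline):

from PIL import Image
import numpy as np

img = Image.open("data/test/images/lineImage0.jpeg").convert("RGB")
img = img.resize((1024, 1024), Image.BILINEAR)  # match the engine's 1024x1024 input

img_array = np.asarray(img, dtype=np.float32) / 255.0  # guessed scaling/normalization
inputs = img_array.transpose(2, 0, 1)  # HWC -> CHW, (3, 1024, 1024)
inputs = np.expand_dims(inputs, axis=0)  # add batch dimension -> (1, 3, 1024, 1024)

output_data = trt_inference.infer(inputs)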
In general there is no clear example to follow, so any help is welcome.
Upvotes: 0
Views: 41