Brecht De Cock

Reputation: 1

Broadcasting error when trying to speed up a segmentation model with TensorRT 10.7

I'm trying to run a binary semantic segmentation model, which I want to speed up using TensorRT. I'm basing my code on: https://stackoverflow.com/questions/59280745/inference-with-tensorrt-engine-file-on-python/67492525#67492525:~:text=I%20have-,updated,-%40Oguz%20Vuruskaner%27s%20answer

And

https://tengteng.medium.com/example-inference-code-to-run-tensorrt-10-0-32ea93fdcc2e

INFO

Checking the engine file gives:

&&&& PASSED TensorRT.trtexec [TensorRT v100700] [b23] # /usr/src/tensorrt/bin/trtexec --loadEngine=unet_mobileone.trt --shapes=input:1x3x1024x1024 --verbose
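For reference, this is roughly how I inspect the engine's I/O tensors from Python (a minimal sketch; the engine filename is just the one from my trtexec run above):

import tensorrt as trt

logger = trt.Logger(trt.Logger.ERROR)
with open("unet_mobileone.trt", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

# print name, mode (input/output) and shape of every I/O tensor
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name))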

Current code

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # Note: required to initialize PyCUDA
import tensorrt as trt
from PIL import Image

class TensorRTInference:
    def __init__(self, engine_path):
        # initialize
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)

        # setup
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()

        # allocate buffers
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(
            self.engine
        )

    def load_engine(self, engine_path):
        # loads the model from given filepath
        with open(engine_path, "rb") as f:
            engine = self.runtime.deserialize_cuda_engine(f.read())
        return engine

    class HostDeviceMem:
        def __init__(self, host_mem, device_mem, shape):
            # keeping track of addresses
            self.host = host_mem
            self.device = device_mem
            # keeping track of shape to un-flatten it later
            self.shape = shape

    def allocate_buffers(self, engine):
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()

        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            shape = engine.get_tensor_shape(tensor_name)
            size = trt.volume(shape)
            dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))

            # allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            # append the device buffer address to device bindings
            bindings.append(int(device_mem))

            # append to the appropriate input/output list
            if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                inputs.append(self.HostDeviceMem(host_mem, device_mem, shape))
            else:
                outputs.append(self.HostDeviceMem(host_mem, device_mem, shape))

        return inputs, outputs, bindings, stream

    def infer(self, input_data):
        # transfer input data to device
        np.copyto(self.inputs[0].host, input_data.ravel())
        cuda.memcpy_htod_async(self.inputs[0].device, self.inputs[0].host, self.stream)

        # set tensor address
        for i in range(self.engine.num_io_tensors):
            self.context.set_tensor_address(
                self.engine.get_tensor_name(i), self.bindings[i]
            )

        # run inference
        self.context.execute_async_v3(stream_handle=self.stream.handle)

        # transfer predictions back
        for i in range(len(self.outputs)):
            cuda.memcpy_dtoh_async(
                self.outputs[i].host, self.outputs[i].device, self.stream
            )

        # synchronize the stream
        self.stream.synchronize()

        # un-flatten the outputs
        outputs = []
        for i in range(len(self.outputs)):
            output = self.outputs[i].host
            output = output.reshape(self.outputs[i].shape)
            outputs.append(output)

        return outputs
    
   
if __name__ == "__main__":
    engine_path = "unet_mobilenet_2.trt"
    trt_inference = TensorRTInference(engine_path)
    img = Image.open("data/test/images/lineImage0.jpeg")
    img_array = np.array(img)
    inputs = img_array.transpose(2, 0, 1)  # (3, 1024, 1024)
    
    # Run inference
    output_data = trt_inference.infer(inputs)
   

The problem occurs at np.copyto(self.inputs[0].host, input_data.ravel()):

ValueError: could not broadcast input array from shape (5953536,) into shape (3145728,)
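For reference, 3 x 1024 x 1024 = 3,145,728, which matches the host buffer allocated for the 1x3x1024x1024 input, while the flattened image has 5,953,536 elements, so the JPEG is apparently not 1024x1024 (or not 3-channel) to begin with. This is the preprocessing sketch I'm considering, assuming the engine really expects a 1x3x1024x1024 float32 input (the /255 scaling is just a guess on my part, since normalization is model-specific):

from PIL import Image
import numpy as np

img = Image.open("data/test/images/lineImage0.jpeg").convert("RGB")  # drop any alpha channel
img = img.resize((1024, 1024))  # match the engine's spatial size
img_array = np.asarray(img, dtype=np.float32) / 255.0  # scaling is an assumption
inputs = img_array.transpose(2, 0, 1)  # HWC -> CHW, shape (3, 1024, 1024)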

In general there is no clear example to follow, so any help is welcome.

Upvotes: 0

Views: 41

Answers (0)
