Reputation: 41
I am new to Python, Flutter, and ML. I am trying to convert YOLOv8 to a TFLite model so that I can later build a Flutter application.
I managed to convert YOLOv8x to a TFLite model using the yolo export command.
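For reference, the equivalent export step via the Ultralytics Python API looks roughly like this (a sketch; the weights file name is a placeholder):
from ultralytics import YOLO

# Load the YOLOv8x weights and export them to TFLite
model = YOLO("yolov8x.pt")
model.export(format="tflite")  # typically writes yolov8x_saved_model/yolov8x_float32.tflite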
Before I move that model into Flutter, I am trying to test it in Python to make sure it functions as expected. The code I am using is below.
import numpy as np
import tensorflow as tf
# Load the TFLite model
model_path = "C:\\Users\\yolov8x_saved_model\\yolov8x_float32.tflite"
interpreter = tf.lite.Interpreter(model_path=model_path)
interpreter.allocate_tensors()
# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Load and preprocess the image
image_path = "C:\\Users\\Downloads\\2.jpeg"
image = tf.keras.preprocessing.image.load_img(image_path, target_size=(640, 640))
image_array = tf.keras.preprocessing.image.img_to_array(image)
preprocessed_image = np.expand_dims(image_array, axis=0)
# Set the input tensor to the preprocessed image
interpreter.set_tensor(input_details[0]['index'], preprocessed_image)
# Run the inference
interpreter.invoke()
# Get the output tensor and reshape it
output_tensor = interpreter.get_tensor(output_details[0]['index'])
output_shape = output_details[0]['shape']
outputs = np.reshape(output_tensor, output_shape)
print(outputs)
The output is [[[6.20934343e+00 1.20168591e+01 1.99987564e+01 ... 5.18638123e+02 5.35865967e+02 5.85887085e+02] ... [1.57089694e-03 6.52399845e-04 1.49149655e-05 ... 2.00569357e-05 1.41740784e-05 5.61324532e-06]]]
So I try to convert it:
from pathlib import Path
import re
import yaml
import cv2
def yaml_load(file='data.yaml', append_filename=False):
    with open(file, errors='ignore', encoding='utf-8') as f:
        s = f.read()  # string
        # Remove special characters
        if not s.isprintable():
            s = re.sub(r'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]+', '', s)
        # Add YAML filename to dict and return
        return {**yaml.safe_load(s), 'yaml_file': str(file)} if append_filename else yaml.safe_load(s)
CLASSES = yaml_load("C:\\Users\\Downloads\\coco128.yml")['names']
colors = np.random.uniform(0, 255, size=(len(CLASSES), 3))
original_image: np.ndarray = cv2.imread("C:\\Users\\Downloads\\2.jpeg")
[height, width, _] = original_image.shape
length = max((height, width))
image = np.zeros((length, length, 3), np.uint8)
image[0:height, 0:width] = original_image
scale = length / 640
def draw_bounding_box(img, class_id, confidence, x, y, x_plus_w, y_plus_h):
    label = f'{CLASSES[class_id]} ({confidence:.2f})'
    color = colors[class_id]
    cv2.rectangle(img, (x, y), (x_plus_w, y_plus_h), color, 2)
    cv2.putText(img, label, (x - 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
outputs = np.array([cv2.transpose(outputs[0])])
rows = outputs.shape[1]
boxes = []
scores = []
class_ids = []
for i in range(rows):
    classes_scores = outputs[0][i][4:]
    (minScore, maxScore, minClassLoc, (x, maxClassIndex)) = cv2.minMaxLoc(classes_scores)
    if maxScore >= 0.60:
        box = [outputs[0][i][0] - (0.5 * outputs[0][i][2]), outputs[0][i][1] - (0.5 * outputs[0][i][3]), outputs[0][i][2], outputs[0][i][3]]
        boxes.append(box)
        scores.append(maxScore)
        class_ids.append(maxClassIndex)
result_boxes = cv2.dnn.NMSBoxes(boxes, scores, 0.25, 0.45, 0.5)
detections = []
for i in range(len(result_boxes)):
    index = result_boxes[i]
    box = boxes[index]
    detection = {
        'class_id': class_ids[index],
        'class_name': CLASSES[class_ids[index]],
        'confidence': scores[index],
        'box': box,
        'scale': scale}
    if CLASSES[class_ids[index]] == 'person':
        detections.append(detection)
        draw_bounding_box(original_image, class_ids[index], scores[index], round(box[0] * scale), round(box[1] * scale),
                          round((box[0] + box[2]) * scale), round((box[1] + box[3]) * scale))
cv2.imshow('image', original_image)
cv2.waitKey(0)
cv2.destroyAllWindows()
The problem I am getting is that the model predicts almost everything as a person. There are 2 people in the image, but I get over 100 person predictions at roughly 70 to 100% confidence.
Any help would be appreciated.
Upvotes: 2
Views: 4282
Reputation: 39
Refer to this PR example:
https://github.com/ultralytics/ultralytics/tree/main/examples/YOLOv8-OpenCV-int8-tflite-Python
You can take some insight from it and extract the parts relevant to your use case.
Upvotes: 0
Reputation: 11
There is an easy way to check whether the exported "yolov8x.tflite" model works fine or not; here is the code:
from IPython.display import Image as imgshow
import matplotlib.pyplot as plt
from ultralytics import YOLO
from PIL import Image
import numpy as np
import cv2
import os
%matplotlib inline
model = YOLO("path_to_your_tflite_model", task='detect')
image = Image.open('path_to_image')
image = np.asarray(image)
results = model.predict(image)
If you print the results, it will give you something like "0: size_of_image number_of_class class_name", so you can find out how many objects your model detected and what those objects are.
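For example, to list the detected classes and confidences explicitly, something like this should work (a sketch that assumes the usual Ultralytics Results API):
for r in results:
    for box in r.boxes:
        class_id = int(box.cls[0])        # predicted class index
        confidence = float(box.conf[0])   # confidence score
        print(r.names[class_id], confidence)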
And for training the model, it is better to use a pre-trained model as a checkpoint to increase accuracy.
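Starting from a pretrained checkpoint could look roughly like this (a sketch assuming the Ultralytics API; the dataset YAML and epoch count are placeholders):
model = YOLO("yolov8x.pt")  # pretrained weights used as the starting checkpoint
model.train(data="your_dataset.yaml", epochs=100, imgsz=640)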
Upvotes: 1
Reputation: 71
There are some preprocessing and postprocessing steps that are used by the YOLOv8 CLI and thus should be implemented in your pipeline:
Resizing and Padding (Letterboxing) - the code of the LetterBox class can be found here.
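For reference, the effect of LetterBox can be approximated with a small helper like this (my own sketch, not the Ultralytics implementation):
def letterbox(img, new_size=512, color=(114, 114, 114)):
    # Resize while keeping the aspect ratio, then pad to a square new_size x new_size canvas
    h, w = img.shape[:2]
    r = min(new_size / h, new_size / w)
    new_h, new_w = round(h * r), round(w * r)
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    pad_top = (new_size - new_h) // 2
    pad_bottom = new_size - new_h - pad_top
    pad_left = (new_size - new_w) // 2
    pad_right = new_size - new_w - pad_left
    return cv2.copyMakeBorder(resized, pad_top, pad_bottom, pad_left, pad_right,
                              cv2.BORDER_CONSTANT, value=color)
The snippet below uses the original LetterBox class: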
image_path = "demo.jpg"
imgsize = 512
im = [LetterBox(imgsize, auto=False, stride=32)(image=cv2.imread(image_path))]
im = np.stack(im)
print(im.shape)
im = im[..., ::-1].transpose((0, 1, 2, 3)) # BGR to RGB, BHWC to BCHW, (n, 3, h, w)
print(im.shape)
im = np.ascontiguousarray(im) # contiguous
im = im.astype(np.float32)
im /= 255
# Allocate input and output tensors
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Prepare the input tensor
input_data = im
interpreter.set_tensor(input_details[0]['index'], input_data)
# Run inference
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])
You can see that the .transpose call does not change the shape of im. I used the onnx2tf tool for the conversion from ONNX to TFLite, and it produced a different output shape than the model originally had. If this does not work for you, use the original code.
Non-Maximum Suppression (NMS) - This is used to handle overlapping bounding boxes. It keeps the bounding box with the highest confidence score and suppresses all other bounding boxes that overlap it heavily (high IoU; a minimal IoU sketch follows the code below). The original code is located here. Here is my simplified version for testing purposes:
nc = 0
conf_thres = 0.25
bs = output_data.shape[0] # batch size
nc = nc or (output_data.shape[1] - 4) # number of classes
nm = output_data.shape[1] - nc - 4
mi = 4 + nc # mask start index
xc = np.amax(output_data[:, 4:mi], 1) > conf_thres # candidates
multi_label=False
multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img)
prediction = np.transpose(output_data, (0, -1, -2))
def xywh2xyxy(x):
    """
    Convert bounding box coordinates from (x, y, width, height) format to (x1, y1, x2, y2) format where (x1, y1) is the
    top-left corner and (x2, y2) is the bottom-right corner.

    Args:
        x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x, y, width, height) format.

    Returns:
        y (np.ndarray | torch.Tensor): The bounding box coordinates in (x1, y1, x2, y2) format.
    """
    y = np.copy(x)
    y[..., 0] = x[..., 0] - x[..., 2] / 2  # top left x
    y[..., 1] = x[..., 1] - x[..., 3] / 2  # top left y
    y[..., 2] = x[..., 0] + x[..., 2] / 2  # bottom right x
    y[..., 3] = x[..., 1] + x[..., 3] / 2  # bottom right y
    return y
prediction[..., :4] = xywh2xyxy(prediction[..., :4]) # xywh to xyxy
output = [np.zeros((0, 6 + nm))] * bs
max_nms=30000
agnostic=False
max_wh=7680
iou_thres = 0.45
max_det = 300
for xi, x in enumerate(prediction):  # image index, image inference
    x = x[xc[xi]]  # confidence
    if not x.shape[0]:
        continue
    # Detections matrix nx6 (xyxy, conf, cls)
    box = x[:, :4]
    cls = x[:, 4:4+nc]
    mask = x[:, 4+nc:4+nc+nm]
    conf = np.max(cls, axis=1, keepdims=True)
    j = np.argmax(cls, axis=1, keepdims=True)
    # Concatenate the arrays along axis 1
    x = np.concatenate((box, conf, j.astype(float), mask), axis=1)
    # Reshape conf to a 1-dimensional array
    conf_flat = conf.flatten()
    # Filter the resulting array based on the condition conf_flat > conf_thres
    filtered_x = x[conf_flat > conf_thres]
    n = filtered_x.shape[0]  # number of boxes
    if not n:  # no boxes
        continue
    if n > max_nms:  # excess boxes
        # Sort x based on the 5th column in descending order
        sorted_indices = np.argsort(x[:, 4])[::-1]
        # Select the top max_nms rows based on the sorted indices
        x = x[sorted_indices[:max_nms]]
    c = x[:, 5:6] * (0 if agnostic else max_wh)
    boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
    # Apply NMS using cv2.dnn.NMSBoxes function
    i = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.4, nms_threshold=iou_thres)
    i = i[:max_det]  # limit detections
    output[xi] = x[i]
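For reference, the IoU that drives the suppression can be computed by hand like this (a minimal sketch for two boxes in xyxy format, useful only for sanity-checking the thresholds):
def box_iou(a, b):
    # a, b: boxes in (x1, y1, x2, y2) format
    inter_w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    inter_h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = inter_w * inter_h
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter + 1e-7)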
Rescaling Bounding Boxes - This step is necessary because the output bounding box coordinates are relative to the letterboxed model input, not the original image. To get the coordinates for the original image, you'll need to rescale the bounding box coordinates.
def clip_boxes(boxes, shape):
    """
    It takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the
    shape.

    Args:
        boxes (torch.Tensor): the bounding boxes to clip
        shape (tuple): the shape of the image
    """
    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2

def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """
    Rescales bounding boxes (in the format of xyxy) from the shape of the image they were originally specified in
    (img1_shape) to the shape of a different image (img0_shape).

    Args:
        img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
        boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
        img0_shape (tuple): the shape of the target image, in the format of (height, width).
        ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
            calculated based on the size difference between the two images.

    Returns:
        boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
    """
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1), round(
            (img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes
results = []
img = cv2.imread(image_path)
for i, pred in enumerate(output):
    pred[:, :4] = scale_boxes((512, 512), pred[:, :4], img.shape)
    results.append(pred)
The original code can be found here.
And then draw the bounding boxes on the image:
for detection in results:
    print(detection)
    xmin, ymin, xmax, ymax, conf, class_id = detection[0]
    # Convert float coordinates to integers
    xmin = int(xmin)
    ymin = int(ymin)
    xmax = int(xmax)
    ymax = int(ymax)
    # Draw the rectangle on the image (boxes are already in xyxy format)
    cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
    # Add text label
    label = f"Class {int(class_id)}: {conf:.2f}"
    cv2.putText(img, label, (xmin, ymin - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
Here is my Google Colab. Hope it helps!
Update: To apply NMS on a mobile device, you should consider stitching it into the ONNX model (before conversion to TFLite) as described here, since as far as I know there is no support for NMS operations in the commonly used mobile libraries.
Upvotes: 2