PyCUDA fills np.array too slow

Question

With this code I want to draw filled triangles:

import cv2
import numpy as np
import os
import time
import math

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule


executions_per_frame = 10
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, int* canvas_shape, float *z_buffer, int x, int y, float z, unsigned char *color) {
    int index = y * canvas_shape[1] + x;
    if (z > z_buffer[index]) {
        z_buffer[index] = z;
        for (int i = 0; i < canvas_shape[2]; ++i) {
            canvas[index * canvas_shape[2] + i] = color[i];
        }
    }
}

// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, int *canvas_shape, float *z_buffer, float *ll, float *rl, float *lu, float *ru, unsigned char *color, int height, int min_x, int min_y) {
    int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;

    float k1 = (float)global_thread_y / height;
    int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
    int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
    float left_z = ll[2] + (lu[2] - ll[2]) * k1;
    float right_z = rl[2] + (ru[2] - rl[2]) * k1;
    int actual_x = min_x + global_thread_x;
    if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
        int actual_y = min_y + global_thread_y;
        float k2 = (float)(global_thread_x - (left_x - min_x)) / (right_x - left_x);
        float actual_z = left_z + (right_z - left_z) * k2;
        set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
    }
}
"""

if __name__ == '__main__':
    if (os.system("cl.exe")):
        os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64"
    if (os.system("cl.exe")):
        raise RuntimeError("cl.exe still not found")

    pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
    pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")

    time_start, frames_count, fps = time.time(), 0, 0
    while True:
        key = cv2.waitKeyEx(1)
        if key == 27:
            break

        canvas_width, canvas_height = 1000, 800
        canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
        z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
        fragment_width, fragment_height = 400, 300
        color = [0, 0, 200]

        block_side = 32
        block_dim = (block_side, block_side, 1)
        grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))

        param_canvas = cuda.InOut(canvas)  # unsigned char *canvas
        param_canvas_shape = cuda.In(np.array(canvas.shape, dtype=np.int32))  # int *canvas_shape
        param_z_buffer = cuda.InOut(z_buffer)  # float *z_buffer
        param_ll = cuda.In(np.array([100, 200, frames_count], dtype=np.float32))  # float *ll
        param_rl = cuda.In(np.array([500, 200, frames_count], dtype=np.float32))  # float *rl
        param_lu = cuda.In(np.array([400, 500, frames_count], dtype=np.float32))  # float *lu
        param_ru = cuda.In(np.array([400, 500, frames_count], dtype=np.float32))  # float *ru
        param_color = cuda.In(np.array(color, dtype=np.uint8))  # unsigned char *color
        param_height = np.int32(fragment_height)  # int height
        param_min_x = np.int32(100)  # int min_x
        param_min_y = np.int32(200)  # int min_y
        for i in range(executions_per_frame):
            pycuda_draw_triangle(param_canvas, param_canvas_shape,
                param_z_buffer, param_ll, param_rl, param_lu, param_ru,
                param_color, param_height, param_min_x, param_min_y,
                block=block_dim, grid=grid_dim)

        frames_count += 1
        fps = frames_count / (time.time() - time_start)
        cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
        cv2.imshow('Scene', canvas)
    cv2.destroyAllWindows()

With executions_per_frame = 1 (for 1 iteration C function will be called 1 time) I got ~100 fps, with executions_per_frame = 10 - ~30 fps. It doesn't look as productive as it could be. What did I miss?

Also, does this have benefit in that particular task?

block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
pycuda_draw_triangle(..., block=block_dim, grid=grid_dim)

Or it can be just

pycuda_draw_triangle(..., block=(1, 1, 1), grid=(fragment_width, fragment_height))

Python 3.6.9, CUDA 10.0, RTX 2060

UPD:

I managed to improve performance to 150 fps on executions_per_frame = 10 by replacing cuda.In() and cuda.InOut() with cuda.mem_alloc(), but CPU usage now is near 30%. Can we even better?

if __name__ == '__main__':
    if (os.system("cl.exe")):
        os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64"
    if (os.system("cl.exe")):
        raise RuntimeError("cl.exe still not found")

    pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
    pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")

    canvas_width, canvas_height = 1000, 800
    param_canvas = cuda.mem_alloc(canvas_width * canvas_height * 3)  # unsigned char *canvas
    param_canvas_shape = cuda.mem_alloc(12)  # int *canvas_shape
    param_z_buffer = cuda.mem_alloc(canvas_width * canvas_height * 4)  # float *z_buffer
    param_ll = cuda.mem_alloc(12)  # float *ll
    param_rl = cuda.mem_alloc(12)  # float *rl
    param_lu = cuda.mem_alloc(12)  # float *lu
    param_ru = cuda.mem_alloc(12)  # float *ru
    param_color = cuda.mem_alloc(3)  # unsigned char *color

    time_start, frames_count, fps = time.time(), 0, 0
    while True:
        key = cv2.waitKeyEx(1)
        if key == 27:
            break

        fragment_width, fragment_height = 400, 300
        color = [0, 0, 200]
        block_side = 32
        block_dim = (block_side, block_side, 1)
        grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))

        canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
        z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
        cuda.memcpy_htod(param_canvas, canvas)
        cuda.memcpy_htod(param_canvas_shape, np.array(canvas.shape, dtype=np.int32))
        cuda.memcpy_htod(param_z_buffer, z_buffer)
        cuda.memcpy_htod(param_ll, np.array([100, 200, frames_count], dtype=np.float32))
        cuda.memcpy_htod(param_rl, np.array([500, 200, frames_count], dtype=np.float32))
        cuda.memcpy_htod(param_lu, np.array([400, 500, frames_count], dtype=np.float32))
        cuda.memcpy_htod(param_ru, np.array([400, 500, frames_count], dtype=np.float32))
        cuda.memcpy_htod(param_color, np.array(color, dtype=np.uint8))
        param_height = np.int32(fragment_height)  # int height
        param_min_x = np.int32(100)  # int min_x
        param_min_y = np.int32(200)  # int min_y
        for i in range(executions_per_frame):
            pycuda_draw_triangle(param_canvas, param_canvas_shape,
                param_z_buffer, param_ll, param_rl, param_lu, param_ru,
                param_color, param_height, param_min_x, param_min_y,
                block=block_dim, grid=grid_dim)
        cuda.memcpy_dtoh(canvas, param_canvas)
        cuda.memcpy_dtoh(z_buffer, param_z_buffer)

        frames_count += 1
        fps = frames_count / (time.time() - time_start)
        cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
        cv2.imshow('Scene', canvas)
    cv2.destroyAllWindows()

Robert Crovella · Accepted Answer

The basic principle here is that you want to get everything that is unnecessary out of the performance loop. Your definition of performance is fps, so you want to get everything out of the while loop that doesn't have to be there.

The biggest limiter to performance is the loop overhead - some work that must be done that is "independent" of your setting for executions_per_frame.

Without resorting to the profiler, we can get some estimate of the overhead as well as the contribution of each iteration of executions_per_frame. We will measure the overall performance (fps) at two different values of executions_per_frame, and then solve 2 equations in 2 variables (overhead c and per-iteration-cost x):

1/fps (milliseconds per frame) = c + ix

My GPU is a bit slower than your RTX2060, so when I run your original code with two different values of executions_per_frame (i) of 1, and 10, I observed:

i=1:  80 fps = 12 ms/frame
i=10: 11 fps = 90 ms/frame

Therefore our 2 equations are:

c + (1)x  = 12
c + (10)x = 90

Solving, we have c = 3ms and x = 9ms. So there is some "fixed" overhead of ~3ms per frame, and some variable overhead of ~9ms per iteration of executions_per_frame. The thing we can definitely attack is the x number (that is way too large) but we probably will make little progress with the c number.

A big contributor to your original problem is that the pycuda .In, .Out and .InOut specify data movement to be done per kernel launch. This means every time you launch your kernel in the for-loop, you are moving data specified that way. This is almost certainly not all necessary for your algorithm.

So let's refactor the code to remove that characteristic and take another measurement. What follows is code that has been coverted to run on linux (because that is where I am doing my work -- it seems you may be on windows) and also does these things:

It's somewhat trivial, but I have converted your in-kernel division operation by height to a multiplication operation by 1/height. Since you are passing height as a kernel parameter, and only using it for that 1 operation, I refactored to pass 1/height and make it a multiplication. Not very imporant.
Refactor (remove) all your usage of .In and .InOut to do something similar using cuda.mem_alloc and cuda.memcpy_XXXX.
I've converted some of the data movement (of zeros) to use cuda.memset_XXXX instead. It's quicker than moving the data.
I've moved some operations around in the timing-critical loop.
Importantly, I'm not moving z-buffer back to the host. If you need that (its not needed for the code you have shown) you will have to add that back, and it will impact performance somewhat.

Here's my refactored code:

import cv2
import numpy as np
import os
import time
import math

import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule


executions_per_frame = 100
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, const int* canvas_shape, float *z_buffer, int x, int y, float z, const unsigned char *color) {
    int index = y * canvas_shape[1] + x;
    if (z > z_buffer[index]) {
        z_buffer[index] = z;
        for (int i = 0; i < canvas_shape[2]; ++i) {
            canvas[index * canvas_shape[2] + i] = color[i];
        }
    }
}

// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, const int *canvas_shape, float *z_buffer, const float *ll, const float *rl, const float *lu, const float *ru, const unsigned char *color, const float height, const int min_x, const int min_y) {
    int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
    int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;

    float k1 = (float)global_thread_y * height;
    int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
    int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
    float left_z = ll[2] + (lu[2] - ll[2]) * k1;
    float right_z = rl[2] + (ru[2] - rl[2]) * k1;
    int actual_x = min_x + global_thread_x;
    if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
        int actual_y = min_y + global_thread_y;
        float k2 = ((float)(global_thread_x - (left_x - min_x))) / (right_x - left_x);
        float actual_z = left_z + (right_z - left_z) * k2;
        set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
    }
}
"""

if __name__ == '__main__':

    pycuda_src_module = SourceModule(pycuda_code)
#    pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
    pycuda_draw_triangle = pycuda_src_module.get_function("draw_triangle")

    time_start, frames_count, fps = time.time(), 0, 0
    canvas_width, canvas_height = 1000, 800
    canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
    z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
    fragment_width, fragment_height = 400, 300
#             B   G  R
    color = [200, 0, 100]

    block_side = 32
    block_dim = (block_side, block_side, 1)
    grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
    param_canvas = cuda.mem_alloc(canvas.nbytes)  # unsigned char *canvas
    canvas_shape = np.array(canvas.shape, dtype=np.int32)
    param_canvas_shape = cuda.mem_alloc(canvas_shape.nbytes)  # int *canvas_shape
    cuda.memcpy_htod(param_canvas_shape, canvas_shape)
    param_z_buffer = cuda.mem_alloc(z_buffer.nbytes)  # float *z_buffer
    param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
    param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
    param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
    param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
    param_rl = cuda.mem_alloc(param_ll_h.nbytes)
    param_lu = cuda.mem_alloc(param_ll_h.nbytes)
    param_ru = cuda.mem_alloc(param_ll_h.nbytes)
    param_ll = cuda.mem_alloc(param_ll_h.nbytes)
    color_h = np.array(color, dtype=np.uint8)
    param_color = cuda.mem_alloc(color_h.nbytes)
    cuda.memcpy_htod(param_color, color_h)
    while True:
        key = cv2.waitKey(1)
        if key == 27:
            break
        cuda.memset_d8(param_canvas, 0, canvas.nbytes)
        cuda.memset_d8(param_z_buffer, 0, z_buffer.nbytes)
        cuda.memcpy_htod(param_ll, param_ll_h)
        cuda.memcpy_htod(param_rl, param_rl_h)
        cuda.memcpy_htod(param_lu, param_lu_h)
        cuda.memcpy_htod(param_ru, param_ru_h)
        param_height = np.float32(1.0/fragment_height)  # int height
        param_min_x = np.int32(100)  # int min_x
        param_min_y = np.int32(200)  # int min_y
        for i in range(executions_per_frame):
            pycuda_draw_triangle(param_canvas, param_canvas_shape,
                param_z_buffer, param_ll, param_rl, param_lu, param_ru,
                param_color, param_height, param_min_x, param_min_y,
                block=block_dim, grid=grid_dim)

        frames_count += 1
        param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
        param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
        param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
        param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
        fps = frames_count / (time.time() - time_start)
        cuda.memcpy_dtoh(canvas, param_canvas)
        cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
        cv2.imshow('Scene', canvas)
    cv2.destroyAllWindows()

This code runs quite a bit faster, so we can run timing measurements at 10 iterations and 100 iterations, rather than 1 and 10 as previously. At 100 iterations I get around 60fps and at 10 iterations I get around 80 fps. (At 1 iteration I still only get around 85 fps). Doing the same arithmetic:

c + (10)x  = 12ms
c + (100)x = 16ms

So x = 4/90 = 0.05ms and c = 11ms. (Precise equivalence between these 2 cases is not necessary. We are modelling something that may not be perfectly linear anyway, and this is a crude model). The point is that we have drastically reduced the per-executions_per_frame iteration cost, while making little improvement in the fixed overhead per frame.

So if your goal is to do many executions per frame, this will be an important method. If your goal really was just to do 1 execution per frame, this hasn't helped much.

With this change, for example, it might be the case that the cv2.imshow operation is several milliseconds, in which case that will eventually become a limiter to performance (I don't know that, just speculation). To make further progress, the recommendation at this point would be to carefully profile what is going on in the while loop, to see where the per-frame cost is.

PyCUDA fills np.array too slow

Answers (1)

Related Questions