Reputation: 755
With this code I want to draw filled triangles:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 10
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, int* canvas_shape, float *z_buffer, int x, int y, float z, unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, int *canvas_shape, float *z_buffer, float *ll, float *rl, float *lu, float *ru, unsigned char *color, int height, int min_x, int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y / height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = (float)(global_thread_x - (left_x - min_x)) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.InOut(canvas) # unsigned char *canvas
param_canvas_shape = cuda.In(np.array(canvas.shape, dtype=np.int32)) # int *canvas_shape
param_z_buffer = cuda.InOut(z_buffer) # float *z_buffer
param_ll = cuda.In(np.array([100, 200, frames_count], dtype=np.float32)) # float *ll
param_rl = cuda.In(np.array([500, 200, frames_count], dtype=np.float32)) # float *rl
param_lu = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *lu
param_ru = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *ru
param_color = cuda.In(np.array(color, dtype=np.uint8)) # unsigned char *color
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
With executions_per_frame = 1
(for 1 iteration C function will be called 1 time) I got ~100 fps, with executions_per_frame = 10
- ~30 fps. It doesn't look as productive as it could be. What did I miss?
Also, does this have benefit in that particular task?
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
pycuda_draw_triangle(..., block=block_dim, grid=grid_dim)
Or it can be just
pycuda_draw_triangle(..., block=(1, 1, 1), grid=(fragment_width, fragment_height))
Python 3.6.9, CUDA 10.0, RTX 2060
UPD:
I managed to improve performance to 150 fps on executions_per_frame = 10
by replacing cuda.In()
and cuda.InOut()
with cuda.mem_alloc()
, but CPU usage now is near 30%. Can we even better?
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio\2017\Community\VC\Tools\MSVC\14.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
canvas_width, canvas_height = 1000, 800
param_canvas = cuda.mem_alloc(canvas_width * canvas_height * 3) # unsigned char *canvas
param_canvas_shape = cuda.mem_alloc(12) # int *canvas_shape
param_z_buffer = cuda.mem_alloc(canvas_width * canvas_height * 4) # float *z_buffer
param_ll = cuda.mem_alloc(12) # float *ll
param_rl = cuda.mem_alloc(12) # float *rl
param_lu = cuda.mem_alloc(12) # float *lu
param_ru = cuda.mem_alloc(12) # float *ru
param_color = cuda.mem_alloc(3) # unsigned char *color
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
cuda.memcpy_htod(param_canvas, canvas)
cuda.memcpy_htod(param_canvas_shape, np.array(canvas.shape, dtype=np.int32))
cuda.memcpy_htod(param_z_buffer, z_buffer)
cuda.memcpy_htod(param_ll, np.array([100, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_rl, np.array([500, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_lu, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_ru, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_color, np.array(color, dtype=np.uint8))
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
cuda.memcpy_dtoh(canvas, param_canvas)
cuda.memcpy_dtoh(z_buffer, param_z_buffer)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
Upvotes: 0
Views: 521
Reputation: 151879
The basic principle here is that you want to get everything that is unnecessary out of the performance loop. Your definition of performance is fps, so you want to get everything out of the while
loop that doesn't have to be there.
The biggest limiter to performance is the loop overhead - some work that must be done that is "independent" of your setting for executions_per_frame
.
Without resorting to the profiler, we can get some estimate of the overhead as well as the contribution of each iteration of executions_per_frame
. We will measure the overall performance (fps) at two different values of executions_per_frame
, and then solve 2 equations in 2 variables (overhead c
and per-iteration-cost x
):
1/fps (milliseconds per frame) = c + ix
My GPU is a bit slower than your RTX2060, so when I run your original code with two different values of executions_per_frame
(i
) of 1, and 10, I observed:
i=1: 80 fps = 12 ms/frame
i=10: 11 fps = 90 ms/frame
Therefore our 2 equations are:
c + (1)x = 12
c + (10)x = 90
Solving, we have c
= 3ms and x
= 9ms. So there is some "fixed" overhead of ~3ms per frame, and some variable overhead of ~9ms per iteration of executions_per_frame
. The thing we can definitely attack is the x
number (that is way too large) but we probably will make little progress with the c
number.
A big contributor to your original problem is that the pycuda .In
, .Out
and .InOut
specify data movement to be done per kernel launch. This means every time you launch your kernel in the for-loop, you are moving data specified that way. This is almost certainly not all necessary for your algorithm.
So let's refactor the code to remove that characteristic and take another measurement. What follows is code that has been coverted to run on linux (because that is where I am doing my work -- it seems you may be on windows) and also does these things:
It's somewhat trivial, but I have converted your in-kernel division operation by height
to a multiplication operation by 1/height
. Since you are passing height
as a kernel parameter, and only using it for that 1 operation, I refactored to pass 1/height
and make it a multiplication. Not very imporant.
Refactor (remove) all your usage of .In
and .InOut
to do something similar using cuda.mem_alloc
and cuda.memcpy_XXXX
.
I've converted some of the data movement (of zeros) to use cuda.memset_XXXX
instead. It's quicker than moving the data.
I've moved some operations around in the timing-critical loop.
Importantly, I'm not moving z-buffer back to the host. If you need that (its not needed for the code you have shown) you will have to add that back, and it will impact performance somewhat.
Here's my refactored code:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 100
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, const int* canvas_shape, float *z_buffer, int x, int y, float z, const unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, const int *canvas_shape, float *z_buffer, const float *ll, const float *rl, const float *lu, const float *ru, const unsigned char *color, const float height, const int min_x, const int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y * height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = ((float)(global_thread_x - (left_x - min_x))) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
pycuda_src_module = SourceModule(pycuda_code)
# pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
pycuda_draw_triangle = pycuda_src_module.get_function("draw_triangle")
time_start, frames_count, fps = time.time(), 0, 0
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
# B G R
color = [200, 0, 100]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.mem_alloc(canvas.nbytes) # unsigned char *canvas
canvas_shape = np.array(canvas.shape, dtype=np.int32)
param_canvas_shape = cuda.mem_alloc(canvas_shape.nbytes) # int *canvas_shape
cuda.memcpy_htod(param_canvas_shape, canvas_shape)
param_z_buffer = cuda.mem_alloc(z_buffer.nbytes) # float *z_buffer
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
param_rl = cuda.mem_alloc(param_ll_h.nbytes)
param_lu = cuda.mem_alloc(param_ll_h.nbytes)
param_ru = cuda.mem_alloc(param_ll_h.nbytes)
param_ll = cuda.mem_alloc(param_ll_h.nbytes)
color_h = np.array(color, dtype=np.uint8)
param_color = cuda.mem_alloc(color_h.nbytes)
cuda.memcpy_htod(param_color, color_h)
while True:
key = cv2.waitKey(1)
if key == 27:
break
cuda.memset_d8(param_canvas, 0, canvas.nbytes)
cuda.memset_d8(param_z_buffer, 0, z_buffer.nbytes)
cuda.memcpy_htod(param_ll, param_ll_h)
cuda.memcpy_htod(param_rl, param_rl_h)
cuda.memcpy_htod(param_lu, param_lu_h)
cuda.memcpy_htod(param_ru, param_ru_h)
param_height = np.float32(1.0/fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
fps = frames_count / (time.time() - time_start)
cuda.memcpy_dtoh(canvas, param_canvas)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
This code runs quite a bit faster, so we can run timing measurements at 10 iterations and 100 iterations, rather than 1 and 10 as previously. At 100 iterations I get around 60fps and at 10 iterations I get around 80 fps. (At 1 iteration I still only get around 85 fps). Doing the same arithmetic:
c + (10)x = 12ms
c + (100)x = 16ms
So x
= 4/90 = 0.05ms and c
= 11ms. (Precise equivalence between these 2 cases is not necessary. We are modelling something that may not be perfectly linear anyway, and this is a crude model). The point is that we have drastically reduced the per-executions_per_frame
iteration cost, while making little improvement in the fixed overhead per frame.
So if your goal is to do many executions per frame, this will be an important method. If your goal really was just to do 1 execution per frame, this hasn't helped much.
With this change, for example, it might be the case that the cv2.imshow
operation is several milliseconds, in which case that will eventually become a limiter to performance (I don't know that, just speculation). To make further progress, the recommendation at this point would be to carefully profile what is going on in the while
loop, to see where the per-frame cost is.
Upvotes: 1