I am new to CUDA development and wanted to write a simple benchmark to test the feasibility of some image processing. I have 32 images, each 720x540, one byte per pixel, greyscale.
I run each benchmark for 10 seconds and count how many times it can process the set of images. There are three benchmarks: a GPU version, a CPU version, and an image-copy-only version (numbers below).
As a simple starting test, the image processing just counts the number of pixels above a certain greyscale value. I'm finding that accessing global memory on the GPU is very slow. My benchmark is structured so that it creates one block per image and one thread per row of each image. Each thread counts its row's pixels into a shared memory array, after which the first thread sums them up (see below).
The issue I am having is that this all runs very slowly: about 50fps, much slower than the CPU version at about 230fps. If I comment out the pixel-value comparison, leaving just a count of all pixels, I get 6x the performance. I tried using texture memory but didn't see a performance gain. I am running a Quadro K2000. Also: the image-copy-only benchmark copies at around 330fps, so the copy does not appear to be the issue.
Any help / pointers would be appreciated. Thank you.
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; // size is set by the third kernel-launch parameter "<<< , , >>>"
    // see here for indexing: https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];

    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd   = pixelStart + W;
    unsigned char *pixelItr   = pixelStart;

    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) // REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;

    __syncthreads();

    if (myStartRow == 0)
    {   // first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}
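// Launch configuration (from cuda_Benchmark below): one block per image
// (nImages blocks), one thread per row (H threads per block), and H ints
// of dynamic shared memory for row_counts.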
extern "C" void cuda_Benchmark(int nImages, int W, int H, U8 **AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns);

    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
Two changes to your kernel design can result in a significant speedup:
Perform the operations column-wise instead of row-wise. The general background for why this matters/helps is described here; a short sketch of the access-pattern difference follows after this list.
Replace your final operation with a canonical parallel reduction.
According to my testing, those 2 changes result in ~22x speedup in kernel performance:
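To illustrate the first point, here is a simplified sketch (not part of the test code below; the function names and the single-image img pointer are just for illustration) of which bytes neighboring threads read under each layout:

typedef unsigned char U8;

// Row-per-thread (the question's layout): thread t owns row t of a
// row-major W x H image. On each loop iteration, neighboring threads t and
// t+1 read bytes that are W (= 720) apart, so one warp's 32 loads scatter
// across many memory segments: uncoalesced.
__device__ void count_row(const U8 *img, int W, int t, int Threshold, int *row_count)
{
    for (int i = 0; i < W; i++)
        if (img[t*W + i] > Threshold) (*row_count)++;
}

// Column-per-thread (the suggested layout): thread t owns column t. On each
// loop iteration, neighboring threads read adjacent bytes, so one warp's 32
// loads fall into one or two memory segments: coalesced.
__device__ void count_col(const U8 *img, int W, int H, int t, int Threshold, int *col_count)
{
    for (int i = 0; i < H; i++)
        if (img[i*W + t] > Threshold) (*col_count)++;
}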
$ cat t49.cu
#include <iostream>
#include <cstdlib>        // for rand()
#include <helper_cuda.h>  // for checkCudaErrors()
typedef unsigned char U8;
__global__ void ThreadPerRowCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns)
{
    extern __shared__ int row_counts[]; // size is set by the third kernel-launch parameter "<<< , , >>>"
    // see here for indexing: https://blog.usejournal.com/cuda-thread-indexing-fb9910cba084
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    int myStartRow = (threadIdx.y * blockDim.x + threadIdx.x);
    unsigned char *imageStart = AllPixels[myImage];

    unsigned char *pixelStart = imageStart + myStartRow * W;
    unsigned char *pixelEnd   = pixelStart + W;
    unsigned char *pixelItr   = pixelStart;

    int row_count = 0;
    while (pixelItr < pixelEnd)
    {
        if (*pixelItr > Threshold) // REMOVING THIS LINE GIVES 6x PERFORMANCE
        {
            row_count++;
        }
        pixelItr++;
    }
    row_counts[myStartRow] = row_count;

    __syncthreads();

    if (myStartRow == 0)
    {   // first thread sums up for the whole image
        int image_count = 0;
        for (int i = 0; i < H; i++)
        {
            image_count += row_counts[i];
        }
        AllReturns[myImage] = image_count;
    }
}
__global__ void ThreadPerColCounter(int Threshold, int W, int H, U8 **AllPixels, int *AllReturns, int rsize)
{
    extern __shared__ int col_counts[]; // size is set by the third kernel-launch parameter "<<< , , >>>"
    int myImage = blockIdx.y * gridDim.x + blockIdx.x;
    unsigned char *imageStart = AllPixels[myImage];
    int myStartCol = (threadIdx.y * blockDim.x + threadIdx.x);

    // walk down one column: adjacent threads read adjacent bytes (coalesced)
    int col_count = 0;
    for (int i = 0; i < H; i++)
        if (imageStart[myStartCol + i*W] > Threshold) col_count++;
    col_counts[threadIdx.x] = col_count;
    __syncthreads();

    // canonical shared-memory parallel reduction; rsize (the smallest power
    // of 2 >= W/2) lets the first pass fold in the elements beyond the
    // power-of-2 boundary when W is not a power of 2
    for (int i = rsize; i > 0; i >>= 1)
    {
        if ((threadIdx.x + i < W) && (threadIdx.x < i))
            col_counts[threadIdx.x] += col_counts[threadIdx.x + i];
        __syncthreads();
    }
    if (!threadIdx.x) AllReturns[myImage] = col_counts[0];
}
void cuda_Benchmark(int nImages, int W, int H, U8 **AllPixels, int *AllReturns, int Threshold)
{
    ThreadPerRowCounter<<<nImages, H, sizeof(int)*H>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns);

    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
unsigned next_power_of_2(unsigned v)
{
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}
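// For this benchmark W = 720, so rsize = next_power_of_2(W/2)
// = next_power_of_2(360) = 512. Because rsize >= W/2, the first reduction
// pass in ThreadPerColCounter folds in every element at index >= rsize.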
void cuda_Benchmark1(int nImages, int W, int H, U8 **AllPixels, int *AllReturns, int Threshold)
{
    int rsize = next_power_of_2(W/2);
    ThreadPerColCounter<<<nImages, W, sizeof(int)*W>>>(
        Threshold,
        W, H,
        AllPixels,
        AllReturns, rsize);

    // wait for all blocks to finish
    checkCudaErrors(cudaDeviceSynchronize());
}
int main()
{
    const int my_W = 720;
    const int my_H = 540;
    const int n_img = 128;
    const int my_thresh = 10;

    U8 **img_p, **img_ph;
    U8 *img, *img_h;
    int *res, *res_h, *res_h1;

    // one device pointer per image, all images packed into one allocation
    img_ph = (U8 **)malloc(n_img*sizeof(U8 *));
    cudaMalloc(&img_p, n_img*sizeof(U8 *));
    cudaMalloc(&img, n_img*my_W*my_H*sizeof(U8));
    img_h = new U8[n_img*my_W*my_H];
    for (int i = 0; i < n_img*my_W*my_H; i++) img_h[i] = rand()%20;
    cudaMemcpy(img, img_h, n_img*my_W*my_H*sizeof(U8), cudaMemcpyHostToDevice);
    for (int i = 0; i < n_img; i++) img_ph[i] = img + my_W*my_H*i;
    cudaMemcpy(img_p, img_ph, n_img*sizeof(U8 *), cudaMemcpyHostToDevice);
    cudaMalloc(&res, n_img*sizeof(int));

    // run both versions and verify they produce identical counts
    cuda_Benchmark(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h = new int[n_img];
    cudaMemcpy(res_h, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    cuda_Benchmark1(n_img, my_W, my_H, img_p, res, my_thresh);
    res_h1 = new int[n_img];
    cudaMemcpy(res_h1, res, n_img*sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n_img; i++)
        if (res_h[i] != res_h1[i]) {
            std::cout << "mismatch at: " << i << " was: " << res_h1[i] << " should be: " << res_h[i] << std::endl;
            return 0;
        }
}
$ nvcc -o t49 t49.cu -I/usr/local/cuda/samples/common/inc
$ cuda-memcheck ./t49
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t49
==1756== NVPROF is profiling process 1756, command: ./t49
==1756== Profiling application: ./t49
==1756== Profiling result:
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
 GPU activities:   72.02%  54.325ms      1  54.325ms  54.325ms  54.325ms  ThreadPerRowCounter(int, int, int, unsigned char**, int*)
                   24.71%  18.639ms      2  9.3195ms  1.2800us  18.638ms  [CUDA memcpy HtoD]
                    3.26%  2.4586ms      1  2.4586ms  2.4586ms  2.4586ms  ThreadPerColCounter(int, int, int, unsigned char**, int*, int)
                    0.00%  3.1040us      2  1.5520us  1.5360us  1.5680us  [CUDA memcpy DtoH]
      API calls:   43.63%  59.427ms      3  19.809ms  18.514us  59.159ms  cudaMalloc
                   41.70%  56.789ms      2  28.394ms  2.4619ms  54.327ms  cudaDeviceSynchronize
                   14.02%  19.100ms      4  4.7749ms  17.749us  18.985ms  cudaMemcpy
                    0.52%  705.26us     96  7.3460us     203ns  327.21us  cuDeviceGetAttribute
                    0.05%  69.268us      1  69.268us  69.268us  69.268us  cuDeviceTotalMem
                    0.04%  50.688us      1  50.688us  50.688us  50.688us  cuDeviceGetName
                    0.04%  47.683us      2  23.841us  14.352us  33.331us  cudaLaunchKernel
                    0.00%  3.1770us      1  3.1770us  3.1770us  3.1770us  cuDeviceGetPCIBusId
                    0.00%  1.5610us      3     520ns     249ns     824ns  cuDeviceGetCount
                    0.00%  1.0550us      2     527ns     266ns     789ns  cuDeviceGet
$
(Quadro K2000, CUDA 9.2.148, Fedora Core 27)
(The next_power_of_2 code is lifted from this answer)
I don't claim correctness for this code or any other code I post; anyone who uses it does so at their own risk. I merely claim that I have attempted to address the questions in the original posting and to provide some explanation; I am not claiming the code is defect-free or suitable for any particular purpose.