Reputation: 321
I need to find average for thousands (20,000+) images represented by unsigned short arrays. Could you please check me, it looks for me that this code is not optimal:
my kernel:
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
register float divider = div;
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] / divider + B[idx];
}
//__syncthreads();
}
kernel wrapper:
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
unsigned short* deviceData1;
float* deviceData2;
cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
cudaMalloc((void**)&deviceData2, length * sizeof(float));
cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree( deviceData1 );
cudaFree( deviceData2 );
}`
and I do
float* avrg2f = (float*)malloc( width * height * sizeof(float));
memset( avrg2f, 0.0, sizeof(float) * width * height);
for (int k = 0; k < count; k++) {
imageObjectList.at( curObj )->getImage( k );
kernel_wrapper( avrg1, avrg2f, height * width, (float)count);
}
as result may averaged image will be in avrg2f;
Thank you.
Upvotes: 0
Views: 855
Reputation: 151899
If the images are all the same size, then your wrapper function need not do cudaMalloc
and cudaFree
operations on every call.
Pre-allocate that storage needed, and don't allocated and free it on every call to the wrapper.
In addition you may see something like a ~2x speedup (for the cudaMemcpy
operations) if you use pinned allocations (cudaHostAlloc
) on the host side for your image storage.
Finally, for the duration of your loop, there's no need to copy the results back to the host. Do this after you're done computing the average. This will save 2 out of the 3 cudaMemcpy
operations you are doing in the wrapper.
While we're at it, in my opinion using memset
to initialize a float
array is questionable. It works for a zero value, but essentially no other. Furthermore, I would expect passing 0.0
as the second parameter to memset
to at least throw a compiler warning.
The following code shows the above optimizations, and demonstrates about an 8x speedup over your code in my test case:
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
__global__ void VecAdd(unsigned short *A, float *B, unsigned int Size, float div){
register float divider = div;
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] / divider + B[idx];
}
//__syncthreads();
}
__global__ void VecAdd2(unsigned short *A, float *B, unsigned int Size, float mult){
register int idx = threadIdx.x + blockIdx.x * blockDim.x;
if ( idx < Size) {
B[ idx ] = (float) A[idx] * mult + B[idx];
}
}
void kernel_wrapper(unsigned short* pixels1, float* pixels2, unsigned int length, float div)
{
unsigned short* deviceData1;
float* deviceData2;
cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
cudaMalloc((void**)&deviceData2, length * sizeof(float));
cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd<<< blocks, threads >>>( deviceData1, deviceData2, length, div );
cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree( deviceData1 );
cudaFree( deviceData2 );
}
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2, unsigned int length, float my_mult)
{
cudaMemcpy(d_pixels1, h_pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
int threads = 1024; //my maximum
int blocks = (length / threads); // lenght=1280*960 -> blocks=1200
VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2, length, my_mult );
}
int main(){
const int count = 2000;
const int width = 1280;
const int height = 960;
timeval t1, t2;
unsigned long et;
unsigned short *h1_image;
h1_image = (unsigned short *)malloc(height*width*sizeof(unsigned short));
float* avrg2f = (float*)malloc( width * height * sizeof(float));
for (int i = 0; i<height*width; i++){
h1_image[i] = (i%256);
avrg2f[i] = 0.0f;
}
gettimeofday(&t1,NULL);
for (int k = 0; k < count; k++) {
kernel_wrapper( h1_image, avrg2f, height * width, (float)count);
}
gettimeofday(&t2,NULL);
et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
printf("time 1 = %ld us\n", et);
unsigned short *h2_image;
float* avrg3f = (float*)malloc( width * height * sizeof(float));
cudaHostAlloc((void **)&h2_image, height*width*sizeof(unsigned short), cudaHostAllocDefault);
for (int i = 0; i<height*width; i++){
h2_image[i] = (i%256);
avrg3f[i] = 0.0f;
}
gettimeofday(&t1,NULL);
unsigned short *d_image;
float *d_result;
cudaMalloc((void **)&d_image, height*width*sizeof(unsigned short));
cudaMalloc((void **)&d_result, height*width*sizeof(float));
cudaMemcpy(d_result, avrg3f, height*width*sizeof(float), cudaMemcpyHostToDevice);
for (int k = 0; k < count; k++) {
kernel_wrapper2( h2_image, d_image, d_result, height * width, (float)(1/(float)count));
}
cudaMemcpy(avrg3f, d_result, height*width*sizeof(float), cudaMemcpyDeviceToHost);
gettimeofday(&t2,NULL);
et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
printf("time 2 = %ld us\n", et);
for (int i = 0; i < (height*width); i++)
if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]); return 1;}
return 0;
}
Upvotes: 2