Grammer
Grammer

Reputation: 321

Sum array of vectors with cuda

I need to find average for thousands (20,000+) images represented by unsigned short arrays. Could you please check me, it looks for me that this code is not optimal:

my kernel:

__global__ void VecAdd(unsigned short *A, float *B,  unsigned int Size, float div){

  register float divider = div;
  register int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] /  divider + B[idx];
  }
  //__syncthreads();
}

kernel wrapper:

void kernel_wrapper(unsigned short* pixels1, float* pixels2,  unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;

    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));

    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);

    int  threads = 1024; //my maximum
    int blocks = (length / threads); // lenght=1280*960 -> blocks=1200

    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2,  length, div );

    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
    }`

and I do

float* avrg2f = (float*)malloc( width * height * sizeof(float));
memset( avrg2f, 0.0, sizeof(float) * width * height);

for (int k = 0; k < count; k++) {           
 imageObjectList.at( curObj )->getImage( k );
 kernel_wrapper( avrg1, avrg2f, height * width, (float)count);  
}

as result may averaged image will be in avrg2f;

Thank you.

Upvotes: 0

Views: 855

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151899

If the images are all the same size, then your wrapper function need not do cudaMalloc and cudaFree operations on every call.

Pre-allocate that storage needed, and don't allocated and free it on every call to the wrapper.

In addition you may see something like a ~2x speedup (for the cudaMemcpy operations) if you use pinned allocations (cudaHostAlloc) on the host side for your image storage.

Finally, for the duration of your loop, there's no need to copy the results back to the host. Do this after you're done computing the average. This will save 2 out of the 3 cudaMemcpy operations you are doing in the wrapper.

While we're at it, in my opinion using memset to initialize a float array is questionable. It works for a zero value, but essentially no other. Furthermore, I would expect passing 0.0 as the second parameter to memset to at least throw a compiler warning.

The following code shows the above optimizations, and demonstrates about an 8x speedup over your code in my test case:

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

__global__ void VecAdd(unsigned short *A, float *B,  unsigned int Size, float div){

  register float divider = div;
  register int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] /  divider + B[idx];
  }
  //__syncthreads();
}

__global__ void VecAdd2(unsigned short *A, float *B,  unsigned int Size, float mult){

  register int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if ( idx < Size) {
   B[ idx ] = (float) A[idx] * mult + B[idx];
  }
}

void kernel_wrapper(unsigned short* pixels1, float* pixels2,  unsigned int length, float div)
{
    unsigned short* deviceData1;
    float* deviceData2;

    cudaMalloc((void**)&deviceData1, length * sizeof(unsigned short));
    cudaMalloc((void**)&deviceData2, length * sizeof(float));

    cudaMemcpy(deviceData1, pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);
    cudaMemcpy(deviceData2, pixels2, length * sizeof(float), cudaMemcpyHostToDevice);

    int  threads = 1024; //my maximum
    int blocks = (length / threads); // lenght=1280*960 -> blocks=1200

    VecAdd<<< blocks, threads >>>( deviceData1, deviceData2,  length, div );

    cudaMemcpy(pixels2, deviceData2, length * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree( deviceData1 );
    cudaFree( deviceData2 );
    }
void kernel_wrapper2(unsigned short* h_pixels1, unsigned short* d_pixels1, float* d_pixels2,  unsigned int length, float my_mult)
{


    cudaMemcpy(d_pixels1, h_pixels1, length * sizeof(unsigned short), cudaMemcpyHostToDevice);

    int  threads = 1024; //my maximum
    int blocks = (length / threads); // lenght=1280*960 -> blocks=1200

    VecAdd2<<< blocks, threads >>>( d_pixels1, d_pixels2,  length, my_mult );

    }

int main(){

  const int count = 2000;
  const int width = 1280;
  const int height = 960;
  timeval t1, t2;
  unsigned long et;

  unsigned short *h1_image;
  h1_image = (unsigned short *)malloc(height*width*sizeof(unsigned short));

  float* avrg2f = (float*)malloc( width * height * sizeof(float));
  for (int i = 0; i<height*width; i++){
    h1_image[i] = (i%256);
    avrg2f[i] = 0.0f;
    }

  gettimeofday(&t1,NULL);
  for (int k = 0; k < count; k++) {
    kernel_wrapper( h1_image, avrg2f, height * width, (float)count);
  }
  gettimeofday(&t2,NULL);
  et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
  printf("time 1 = %ld us\n", et);
  unsigned short *h2_image;
  float* avrg3f = (float*)malloc( width * height * sizeof(float));
  cudaHostAlloc((void **)&h2_image, height*width*sizeof(unsigned short), cudaHostAllocDefault);
  for (int i = 0; i<height*width; i++){
    h2_image[i] = (i%256);
    avrg3f[i] = 0.0f;
    }
  gettimeofday(&t1,NULL);
  unsigned short *d_image;
  float *d_result;
  cudaMalloc((void **)&d_image, height*width*sizeof(unsigned short));
  cudaMalloc((void **)&d_result, height*width*sizeof(float));
  cudaMemcpy(d_result, avrg3f, height*width*sizeof(float), cudaMemcpyHostToDevice);
  for (int k = 0; k < count; k++) {
    kernel_wrapper2( h2_image, d_image,  d_result, height * width, (float)(1/(float)count));
  }
  cudaMemcpy(avrg3f, d_result, height*width*sizeof(float), cudaMemcpyDeviceToHost);
  gettimeofday(&t2,NULL);
  et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec);
  printf("time 2 = %ld us\n", et);
  for (int i = 0; i < (height*width); i++)
    if (fabs(avrg2f[i] - avrg3f[i]) > 0.0001) {printf("mismatch at %d, 1 = %f, 2 = %f\n", i, avrg2f[i], avrg3f[i]); return 1;}
  return 0;
}

Upvotes: 2

Related Questions