C CUDA convolution bug(s)

Question

I have 2 problems with my program. here is part of my program

The main program will call the convolution 2d function At this time the kernel only consists of the sequential code. because so I can test if all the data passing is correct.

Problem 1 is with passing filter to the dev_filter in the kenel I tried a lot of stuff but nothing worked

Problem 2 is how to parallelize this with all those loops in the sequential part.

I hope I made my problem clear

#define FILTER_WIDTH          3
#define FILTER_HEIGTH         3

float SOBEL_FILTER_X[FILTER_HEIGTH][FILTER_WIDTH] = { {-1,  0,  1}, {-2, 0, 2}, {-1, 0, 1} };
float SOBEL_FILTER_Y[FILTER_HEIGTH][FILTER_WIDTH] = { { 1,  2,  1}, { 0, 0, 0}, {-1,-2,-1} };


gray_image_t convolution2D(gray_image_t in, int imgW, int imgH, float filter[FILTER_HEIGTH][FILTER_WIDTH]) {
    int imgS = imgW * imgH;
    gray_image_t out, dev_in, dev_out;
    float dev_filter[FILTER_HEIGTH][FILTER_WIDTH];
    int filterS = FILTER_HEIGTH * FILTER_WIDTH;


    //allocate memory
    out = (gray_image_t) calloc(imgS, sizeof(float));
    if (out == NULL) return NULL;
    checkCudaCall(cudaMalloc(&dev_in, imgS * sizeof(float)));
    checkCudaCall(cudaMalloc(&dev_out, imgS * sizeof(float)));

    //memcopy
    checkCudaCall(cudaMemcpy(dev_in,in,imgS * sizeof(float), cudaMemcpyHostToDevice));


    timer convolution2D_kernel_timer("Convolution2D_kernel_timer");
    convolution2D_kernel_timer.start();
    convolution_2DKernel<<>>(dev_in,dev_out,imgW,imgH,dev_filter);
    convolution2D_kernel_timer.stop();

    std::cout << convolution2D_kernel_timer;
    checkCudaCall(cudaThreadSynchronize());

    checkCudaCall(cudaMemcpy(out,dev_out,imgS * sizeof(float), cudaMemcpyDeviceToHost));
    cudaFree(dev_in);
    cudaFree(dev_out);
    return out;
}

and here is the Kernel

__global__ void convolution_2DKernel(gray_image_t dev_in, gray_image_t dev_out, int imgW,int imgH,float dev_filter[FILTER_HEIGTH][FILTER_WIDTH]){
    // find center position of kernel (half of kernel size)
    int kCenterX = FILTER_WIDTH / 2;
    int kCenterY = FILTER_HEIGTH / 2;

    for(int y=0; y < imgH; y++) {
        for(int x=0; x < imgW; x++) {
            for(int m=0; m < FILTER_HEIGTH; ++m) {
                for(int n=0; n < FILTER_WIDTH; ++n) {

                    // index of input signal, used for checking boundary
                    int yy = y + (m - kCenterY);
                    int xx = x + (n - kCenterX);

                    // ignore input samples which are out of bound
                    if( yy >= 0 && yy < imgH && xx >= 0 && xx < imgW ) {
                        dev_out[y*imgW+x] += dev_in[yy*imgW+xx] * dev_filter[m][n];
                    }
                }
            }
        }
    }
}

Hi I tried It with cudaMallocPitch and cudaMemcpy2D but I still get the same error

C CUDA convolution bug(s)

Answers (1)

Related Questions