cudaMemCpy2D getting crashed

Question

I am trying to implement Sauvola Binarization in cuda.For this I have read the image in a 2d array in host and allocating memory for 2D array in device using pitch.After allocating the memory I am trying to copy the host 2D array to Device 2d Array using cudaMemcpy2D,it compiles fine but it crashes here on runtime.I am unable to understand where am I missing,Kindly suggest something.The code which I have written is as follows:

#include "BinMain.h"
#include "Binarization.h"
#include 
#include 
#include 
#include 
#include 

void printDevProp(cudaDeviceProp);
void CUDA_SAFE_CALL( cudaError_t);


int main()
{
    //Read an IplImage in imgOriginal as grayscale
    IplImage * imgOriginal = cvLoadImage("E:\1.tiff",CV_LOAD_IMAGE_GRAYSCALE);

    //Create a size variable of type CvSize for cvCreateImage Parameter
    CvSize size = cvSize(imgOriginal->width,imgOriginal->height);

    //create an image for storing the result image with same height and width as     imgOriginal
    IplImage * imgResult = cvCreateImage(size,imgOriginal->depth,imgOriginal-    >nChannels);

    //Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
    int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
    for (int i = 0; i < imgOriginal->height; i++) 
{
    arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}

//Create a 2D array for storing the returned device array
int ** arrReturn = (int **)malloc(imgOriginal->height * sizeof(int *));
for (int i = 0; i < imgOriginal->height; i++) 
{
    arrReturn[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}

//Create a CvScalar variable to copy pixel values in 2D array (arrOriginal)
CvScalar s;

//Copying the pixl values
for(int j = 0;jheight;j++)
{
    for(int k =0;kwidth;k++)
    {
        s = cvGet2D(imgOriginal,j,k);
        arrOriginal[j][k] = s.val[0];
    }
}

//Cuda Device Property
int devCount;
cudaGetDeviceCount(&devCount);
printf("CUDA Device Query...
");
printf("There are %d CUDA devices.
", devCount);

// Iterate through devices
for (int i = 0; i < devCount; ++i)
{
    // Get device properties
    printf("
CUDA Device #%d
", i);
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, i);
    printDevProp(devProp);
}

//Start the clock
clock_t start = clock();

//Allocating Device memory for 2D array using pitch
size_t host_orig_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height; //host original array pitch in bytes
size_t dev_pitch;   //device array pitch in bytes which will be used in cudaMallocPitch
size_t dev_pitchReturn; //device return array pitch in bytes
size_t host_ret_pitch = imgOriginal->width * sizeof(int)* imgOriginal->height;  //host return array pitch in bytes

int * devArrOriginal;   //device 2d array of original image
int * result;   //device 2d array for returned array
int dynmicRange = 128; //Dynamic Range for calculating the threshold from sauvola's formula

//Allocating memory by using cudaMallocPitch
CUDA_SAFE_CALL(cudaMallocPitch((void**)&devArrOriginal,&dev_pitch,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));

//Allocating memory for returned array
CUDA_SAFE_CALL(cudaMallocPitch((void**)&result,&dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int)));

//Copying 2D array from host memory to device mempry by using cudaMemCpy2D
CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));
    int windowSize = 19;    //Size of the window for calculating mean and variance
    //Launching the kernel by calling myKernelLauncher function.
    myKernelLauncher(devArrOriginal,result,windowSize,imgOriginal->width,imgOriginal-    >height,dev_pitch,dynmicRange);
    //Calling the sauvola binarization function by passing the parameters as 
    //1.arrOriginal 2D array 2.Original image height 3.Original image width
    //int ** result = AdaptiveBinarization(arrOriginal,imgOriginal->height,imgOriginal-    >width);//binarization(arrOriginal,imgOriginal->width,imgOriginal->height);
    //
CUDA_SAFE_CALL(cudaMemcpy2D(arrReturn,host_ret_pitch,result,dev_pitchReturn,imgOriginal->width * sizeof(int),imgOriginal->height * sizeof(int),cudaMemcpyDeviceToHost));
//create a CvScalar variable to set the data in imgResult
CvScalar ss;

//Copy the pixel values from returned array to imgResult
for(int i=0;iheight;i++)
{
    for(int j=0;jwidth;j++)
    {
        ss = cvScalar(arrReturn[i][j]*255);
        cvSet2D(imgResult,i,j,ss);
        //k++; //No need for k if returned array is 2D
    }
}

printf("Done 
");
//calculate and print the time elapsed
printf("Time elapsed: %f
", ((double)clock() - start) / CLOCKS_PER_SEC);

//Create a windoe and show the resule image
cvNamedWindow("Result",CV_WINDOW_AUTOSIZE);
cvShowImage("Result",imgResult);
cvWaitKey(0);
getch();

//Release the various resources
cvReleaseImage(&imgResult);
cvReleaseImage(&imgOriginal);
cvDestroyWindow("Result");
for(int i = 0; i < imgOriginal->height; i++)
    free(arrOriginal[i]);

free(arrOriginal);
free(result);
cudaFree(&devArrOriginal);
cudaFree(&result);

}

// Print device properties
void printDevProp(cudaDeviceProp devProp)
{
printf("Major revision number:         %d
",  devProp.major);
printf("Minor revision number:         %d
",  devProp.minor);
printf("Name:                          %s
",  devProp.name);
printf("Total global memory:           %u
",  devProp.totalGlobalMem);
printf("Total shared memory per block: %u
",  devProp.sharedMemPerBlock);
printf("Total registers per block:     %d
",  devProp.regsPerBlock);
printf("Warp size:                     %d
",  devProp.warpSize);
printf("Maximum memory pitch:          %u
",  devProp.memPitch);
printf("Maximum threads per block:     %d
",  devProp.maxThreadsPerBlock);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of block:  %d
", i, devProp.maxThreadsDim[i]);
for (int i = 0; i < 3; ++i)
printf("Maximum dimension %d of grid:   %d
", i, devProp.maxGridSize[i]);
printf("Clock rate:                    %d
",  devProp.clockRate);
printf("Total constant memory:         %u
",  devProp.totalConstMem);
printf("Texture alignment:             %u
",  devProp.textureAlignment);
printf("Concurrent copy and execution: %s
",  (devProp.deviceOverlap ? "Yes" : "No"));
printf("Number of multiprocessors:     %d
",  devProp.multiProcessorCount);
printf("Kernel execution timeout:      %s
",  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
return;
}

/* Utility Macro : CUDA SAFE CALL */ 
void CUDA_SAFE_CALL( cudaError_t call) 
{ 

cudaError_t ret = call; 
switch(ret) 
{ 
    case cudaSuccess: 
         break; 
    default : 
            {
                printf(" ERROR at line :%i.%d' ' %s
", 
                __LINE__,ret,cudaGetErrorString(ret)); 
                exit(-1); 
                break; 
            } 
} 
}

The flow of the code is as follows: 1. Create a 2D array in host from image and another array for returned array from kernel. 2. Allocate memory for a 2D array in device using CudaMallocPitch 3. Allocate memory for a 2d array which will be returned by kernel. 4. Copy the original 2d array from host to device array using cudaMemcpy2d. 5. Launch the Kernel. 6. Copy the returned device array to host array using cudaMemcpy2D.

The program is crashing while it reaches to 4th point.It is an unhandled exception stating "Unhandled exception at 0x773415de in SauvolaBinarization_CUDA_OpenCV.exe: 0xC0000005: Access violation reading location 0x01611778."

I think the problem must be while allocating the memory,but I am using the function first time and have no idea how it works,kindly suggest.

a_a_t · Accepted Answer

First of all, you're not calling "cudaMallocPitch" properly. The "height" parameter should represent the number of rows, so instead of :

imgOriginal->height * sizeof(int)

you should simply use:

imgOriginal->height

This is fine because the number of bytes per row is already contained in the "pitch" property. The main problem, however, lies with the way you allocate the memory for the host image. When you write:

//Create a 2D array for storing the pixels value of each of the pixel of imgOriginal grayscale image
    int ** arrOriginal = (int **)malloc(imgOriginal->height * sizeof(int *));
    for (int i = 0; i < imgOriginal->height; i++) 
{
    arrOriginal[i] = (int*)malloc(imgOriginal->width * sizeof(int));
}

you are effectively creating an array with pointers to arrays. The CUDA API call that you 're making:

CUDA_SAFE_CALL(cudaMemcpy2D((void*)devArrOriginal,dev_pitch,(void*)arrOriginal,host_orig_pitch,imgOriginal->width * sizeof(float),imgOriginal->height,cudaMemcpyHostToDevice));

expects that the input memory buffer is contiguous. So here's what will happen: the first row from the input image (totalling "imgOriginal->width * sizeof(float)" bytes) will be read starting with the address:

 (void*)arrOriginal

However, the amount of valid data you have starting at that address is only "imgOriginal->height * sizeof(int *)" bytes. The two byte counts are very likely to be different, which will lead to the crash because you will end up reading from an unknown location.

To solve this, consider allocating "arrOriginal" as one contiguous block, such as:

int * arrOriginal = (int *)malloc(imgOriginal->height * imgOriginal->width * sizeof(int));

Also, in this case, your pitch should be:

 "imgOriginal->width * sizeof(int)"

cudaMemCpy2D getting crashed

Answers (1)

Related Questions