Generwp
Generwp

Reputation: 514

CUDA separate kernel file error

I'm trying to move all of my CUDA code into a separate test.cu file and call it from my main.cpp file through a test.h header. But when I try to get the data from the device, I always get the error "Unhandled exception at 0x0F277552 (nvcuda.dll) in ExampleSeparate.exe: 0xC0000005: Access violation writing location 0x04A8D000."

Can you please tell me what the problem with the code is? And what am I doing wrong in separating the kernel code and the main part of the code into different files? What is the best way to do that?

I know how to do that in OpenCL, but can't manage it in CUDA.

main.cpp

// Body of main() in main.cpp: loads a 32-bpp bitmap, sizes the output buffer
// and hands both host buffers to my_cuda() (declared in test.h).
printf("My CUDA example.\n");

    int iWidth, iHeight, iBpp, cycles_max = 100;

    // Host-side storage: input is raw bytes, output is one byte per 4 input bytes.
    vector<unsigned char> pDataIn;
    vector<unsigned char> pDataOut;

    unsigned int SizeIn, SizeOut;
    unsigned char *devDatOut, *devDatIn, *PInData, *POutData, *DatIn, *DatOut;

    // LoadBmpFile is defined elsewhere; presumably it fills pDataIn and sets
    // iWidth/iHeight/iBpp -- TODO confirm against its definition.
    int error1 = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);

    // NOTE(review): a failed load is only reported here; execution continues.
    if (error1 != 0 || pDataIn.size() == 0 || iBpp != 32)
    {
        printf("error load input file!\n");
    }


    pDataOut.resize(pDataIn.size()/4);  
    // For CUDA
    // pDataIn holds unsigned char, so SizeIn is a count of BYTES here.
    SizeIn = pDataIn.size();
    SizeOut = pDataOut.size();
    PInData = pDataIn.data();
    POutData = pDataOut.data();

    // For CPU
    DatIn = pDataIn.data();
    DatOut = pDataOut.data();

  // The pointer is reinterpreted as uchar4*, but SizeIn still counts unsigned
  // char elements; my_cuda() in test.cu multiplies SizeIn by sizeof(uchar4).
  my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);

  return 0;

test.h

// Host-side entry point defined in test.cu.
//   PInData  - host input buffer, read as SizeIn elements of uchar4.
//   POutData - host output buffer of SizeOut unsigned char elements; it is
//              copied to the device and later overwritten with the result.
//   SizeIn   - number of uchar4 ELEMENTS (test.cu multiplies it by
//              sizeof(uchar4)), not a byte count.
//   SizeOut  - number of unsigned char elements in POutData.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut);

test.cu

// Wraps a CUDA runtime call so that a failure is reported with the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA call: when `code` is not cudaSuccess, prints the CUDA
// error string together with `file` and `line` to stderr and, unless `abort`
// is false, terminates the process using `code` as the exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   // Success is the common case: nothing to print, nothing to abort.
   if (code == cudaSuccess)
      return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

   if (abort)
      exit(code);
}

// Copies the input and output buffers to the device, launches addMatrix, and
// copies the device output buffer back into POutData.
//
//   PInData  - host input buffer holding SizeIn uchar4 elements.
//   POutData - host output buffer holding SizeOut unsigned char elements;
//              overwritten with the device result.
//   SizeIn   - number of uchar4 ELEMENTS in PInData, NOT bytes: every
//              allocation/copy below multiplies it by sizeof(uchar4). Passing
//              a byte count makes the host-to-device copy read past the end
//              of the caller's buffer.
//   SizeOut  - number of unsigned char elements in POutData.
//
// Any CUDA failure is reported through gpuErrchk, which terminates the process.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
    uchar4  *devDatIn;
    unsigned char *devDatOut;

    printf("Allocate memory on device\n");
    gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
    gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));

    printf("Copy data on device\n");
    gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));

    // Hard-coded launch geometry: 8100 * 1024 = 8,294,400 threads, which equals
    // 3840 * 2160. NOTE(review): presumably one thread per pixel of the test
    // image; addMatrix receives no element count, so other image sizes need a
    // different grid -- confirm against the kernel.
    dim3 blocks(8100, 1, 1);
    dim3 threads(1024, 1, 1);

    addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);
    // A kernel launch returns no status; an invalid launch configuration is
    // only visible through cudaGetLastError().
    gpuErrchk(cudaGetLastError());

    // Blocking copy: also waits for the kernel and reports execution errors.
    gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(devDatOut));
    gpuErrchk(cudaFree(devDatIn));


    // Wait for a key press before returning.
    _getch();
}

Upvotes: 1

Views: 199

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151799

At this line of code:

SizeIn = pDataIn.size();

your pDataIn is a vector of <unsigned char> of sufficient size to handle a 3840x2160 image with 4 bytes per pixel, presumably. So SizeIn will be 3840x2160x4, i.e. a size in bytes.

Then you assign your vector data to an unsigned char pointer:

PInData = pDataIn.data();

Then you cast that pointer to a uchar4 pointer (uchar4*), while still passing the old SizeIn, which is a count of bytes:

my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);

In your my_cuda function, you allocate size for device storage that is 4 times too large:

gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));

then you try to copy 4 times too much data from host to device:

gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));

That line will almost certainly seg fault on the host, because the copy reads 4 times more bytes from PInData than the vector actually holds.

The solution might be as simple as:

SizeIn = pDataIn.size()/4;

Here's a fully worked example based on the code you have shown, demonstrating the seg fault and the fix:

$ cat t1135.cu
#include <stdio.h>
#include <vector>

using namespace std;
// Wraps a CUDA runtime call so that a failure is reported with the call site.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA call: when `code` is not cudaSuccess, prints the CUDA
// error string together with `file` and `line` to stderr and, unless `abort`
// is false, terminates the process using `code` as the exit status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   // Success is the common case: nothing to print, nothing to abort.
   if (code == cudaSuccess)
      return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

   if (abort)
      exit(code);
}

// Copies the input and output buffers to the device and copies the device
// output buffer back into POutData. The kernel launch is commented out in this
// reduced example, so only the allocations and transfers are exercised.
//
//   PInData  - host input buffer holding SizeIn uchar4 elements.
//   POutData - host output buffer holding SizeOut unsigned char elements.
//   SizeIn   - number of uchar4 ELEMENTS in PInData, NOT bytes: every
//              allocation/copy below multiplies it by sizeof(uchar4).
//   SizeOut  - number of unsigned char elements in POutData.
//
// Any CUDA failure is reported through gpuErrchk, which terminates the process.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
    uchar4  *devDatIn;
    unsigned char *devDatOut;

    printf("Allocate memory on device\n");
    gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
    gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));

    printf("Copy data on device\n");
    // Reads SizeIn * sizeof(uchar4) bytes from the host buffer; the caller's
    // buffer must be at least that large.
    gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));

    // Launch geometry kept for the commented-out launch below.
    dim3 blocks(8100, 1, 1);
    dim3 threads(1024, 1, 1);

    //addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);

    gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));
    // Check the frees as well, so no CUDA return code is silently dropped.
    gpuErrchk(cudaFree(devDatOut));
    gpuErrchk(cudaFree(devDatIn));


}

// Reduced reproduction of the question's main(): allocates the host buffers
// directly instead of loading a bitmap, then calls my_cuda().
// Build with -DFIX to pass SizeIn as a uchar4 element count; without it SizeIn
// is the byte count of pDataIn, as in the question's code.
int main(){

printf("My CUDA example.\n");


    // 3840*2160*4 unsigned char elements of input.
    vector<unsigned char> pDataIn(3840*2160*4);
    vector<unsigned char> pDataOut;

    unsigned int SizeIn, SizeOut;
    unsigned char *PInData, *POutData;



    // Output holds one unsigned char per 4 input bytes.
    pDataOut.resize(pDataIn.size()/4);
    // Sizes and pointers handed to the CUDA wrapper
#ifdef FIX
    // Element count: number of 4-byte groups in pDataIn.
    SizeIn = pDataIn.size()/4;
#else
    // Byte count: my_cuda() multiplies this by sizeof(uchar4) again.
    SizeIn = pDataIn.size();
#endif
    SizeOut = pDataOut.size();
    PInData = pDataIn.data();
    POutData = pDataOut.data();

  // The byte buffer is reinterpreted as uchar4 elements for the wrapper.
  my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);

  return 0;

}
$ nvcc -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
Segmentation fault (core dumped)
$ nvcc -DFIX -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
$

Upvotes: 3

Related Questions