Reputation: 514
I'm trying to move all the CUDA code into a separate test.cu file and call it from my main.cpp file through a test.h header. But when I try to get the data from the device, I always get the error "Unhandled exception at 0x0F277552 (nvcuda.dll) in ExampleSeparate.exe: 0xC0000005: Access violation writing location 0x04A8D000."
Can you please tell me what the problem with the code is? What am I doing wrong when separating the kernel code and the main part of the code into different files? What is the best way to do that?
I know how to do that in OpenCL, but can't manage it in CUDA.
main.cpp
// Fragment of main(): calls LoadBmpFile on "3840x2160.bmp", sizes the host
// input/output buffers, and passes them to my_cuda().
printf("My CUDA example.\n");
int iWidth, iHeight, iBpp, cycles_max = 100;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;
unsigned int SizeIn, SizeOut;
unsigned char *devDatOut, *devDatIn, *PInData, *POutData, *DatIn, *DatOut;
int error1 = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);
// Treat a non-zero return, an empty buffer, or anything other than 32 bpp as a load failure.
if (error1 != 0 || pDataIn.size() == 0 || iBpp != 32)
{
printf("error load input file!\n");
// NOTE(review): only a message is printed; execution continues past this point.
}
// The output buffer holds one byte for every four input bytes.
pDataOut.resize(pDataIn.size()/4);
// For CUDA
// NOTE(review): SizeIn is a byte count here (pDataIn stores unsigned char), but
// my_cuda() multiplies it by sizeof(uchar4) -- confirm which unit is intended.
SizeIn = pDataIn.size();
SizeOut = pDataOut.size();
PInData = pDataIn.data();
POutData = pDataOut.data();
// For CPU
DatIn = pDataIn.data();
DatOut = pDataOut.data();
// The byte buffer is reinterpreted as uchar4 elements for the CUDA side.
my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);
return 0;
test.h
// Declaration of the host-callable entry point that is defined in test.cu.
// NOTE(review): uchar4 is a CUDA vector type -- presumably the including .cpp file
// needs a CUDA header in scope before this declaration; confirm.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut);
test.cu
// Wraps a CUDA runtime call and reports any failure together with the call site's
// file and line. The do { ... } while (0) form makes the macro expand to a single
// statement, so `if (x) gpuErrchk(call); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), the process exits with `code` on failure
// On failure the CUDA error string, file and line are written to stderr.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Host wrapper: copies the input/output buffers to the device, launches addMatrix,
// and copies the output buffer back to the host.
//   PInData  - host input buffer, read as SizeIn elements of uchar4
//   POutData - host output buffer of SizeOut bytes; copied to the device before the
//              launch and overwritten with the device result afterwards
//   SizeIn   - number of uchar4 ELEMENTS in PInData (it is multiplied by
//              sizeof(uchar4) below, so passing a byte count over-reads the host buffer)
//   SizeOut  - number of bytes in POutData
// Any CUDA failure is reported through gpuErrchk.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
    uchar4 *devDatIn;
    unsigned char *devDatOut;

    printf("Allocate memory on device\n");
    gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
    gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));

    printf("Copy data on device\n");
    gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));

    // Fixed launch configuration: 8100 blocks x 1024 threads = 8,294,400 threads.
    // NOTE(review): this does not depend on SizeIn/SizeOut; addMatrix is not shown
    // here, so confirm it is only used with buffers of at least that many elements.
    dim3 blocks(8100, 1, 1);
    dim3 threads(1024, 1, 1);
    addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);
    // A kernel launch returns no status itself; fetch launch-configuration errors explicitly.
    gpuErrchk(cudaGetLastError());

    // Blocking copy: errors raised while the kernel ran surface here as well.
    gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));

    gpuErrchk(cudaFree(devDatOut));
    gpuErrchk(cudaFree(devDatIn));

    // Wait for a key press before returning.
    _getch();
}
Upvotes: 1
Views: 199
Reputation: 151799
At this line of code:
SizeIn = pDataIn.size();
your pDataIn is a vector of <unsigned char> of sufficient size to handle a 3840x2160 image with 4 bytes per pixel, presumably. So SizeIn will be 3840x2160x4 (a size in bytes).
Then you assign your vector data to an unsigned char pointer:
PInData = pDataIn.data();
Then you cast that pointer to a uchar4 pointer, while passing the old SizeIn, which is still in bytes:
my_cuda((uchar4*)PInData, POutData, SizeIn, SizeOut);
In your my_cuda function, you allocate device storage that is 4 times too large:
gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
then you try to copy 4 times too much data from host to device:
gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
That line will seg fault on the host, almost certainly.
The solution might be as simple as:
SizeIn = pDataIn.size()/4;
Here's a fully worked example based on the code you have shown, demonstrating the seg fault and the fix:
$ cat t1135.cu
#include <stdio.h>
#include <vector>
using namespace std;
// Wraps a CUDA runtime call and reports any failure together with the call site's
// file and line. The do { ... } while (0) form makes the macro expand to a single
// statement, so `if (x) gpuErrchk(call); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default), the process exits with `code` on failure
// On failure the CUDA error string, file and line are written to stderr.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Host wrapper used by the repro: allocates device buffers, copies both host buffers
// to the device, and copies the output buffer back. The kernel launch is disabled.
//   PInData  - host input buffer, read as SizeIn elements of uchar4
//   POutData - host output buffer of SizeOut bytes
//   SizeIn   - number of uchar4 ELEMENTS in PInData (multiplied by sizeof(uchar4)
//              below; passing a byte count makes the host-to-device copy read
//              four times more host memory than the caller owns)
//   SizeOut  - number of bytes in POutData
// Any CUDA failure is reported through gpuErrchk.
void my_cuda(uchar4* PInData, unsigned char *POutData, unsigned int SizeIn, unsigned int SizeOut){
    uchar4 *devDatIn;
    unsigned char *devDatOut;

    printf("Allocate memory on device\n");
    gpuErrchk(cudaMalloc((void**)&devDatIn, SizeIn * sizeof(uchar4)));
    gpuErrchk(cudaMalloc((void**)&devDatOut, SizeOut * sizeof(unsigned char)));

    printf("Copy data on device\n");
    gpuErrchk(cudaMemcpy(devDatIn, PInData, SizeIn * sizeof(uchar4), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(devDatOut, POutData, SizeOut * sizeof(unsigned char), cudaMemcpyHostToDevice));

    // Kernel launch is disabled in this repro (the kernel is not part of this file):
    //   dim3 blocks(8100, 1, 1);
    //   dim3 threads(1024, 1, 1);
    //   addMatrix<<<blocks, threads>>>(devDatIn, devDatOut);

    gpuErrchk(cudaMemcpy(POutData, devDatOut, SizeOut * sizeof(unsigned char), cudaMemcpyDeviceToHost));

    // Release the device buffers and report a failing free instead of ignoring it.
    gpuErrchk(cudaFree(devDatOut));
    gpuErrchk(cudaFree(devDatIn));
}
// Repro driver. Built without -DFIX it passes the input size in bytes (the failing
// case); built with -DFIX it passes the size divided by 4.
int main(){
    printf("My CUDA example.\n");

    // Host buffers: 3840*2160*4 input bytes, and one output byte per four input bytes.
    vector<unsigned char> pDataIn(3840*2160*4);
    vector<unsigned char> pDataOut(pDataIn.size()/4);

#ifdef FIX
    const unsigned int SizeIn = pDataIn.size()/4;
#else
    const unsigned int SizeIn = pDataIn.size();
#endif
    const unsigned int SizeOut = pDataOut.size();

    // Reinterpret the input bytes as uchar4 elements and run the CUDA wrapper.
    my_cuda(reinterpret_cast<uchar4*>(pDataIn.data()), pDataOut.data(), SizeIn, SizeOut);
    return 0;
}
$ nvcc -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
Segmentation fault (core dumped)
$ nvcc -DFIX -o t1135 t1135.cu
$ ./t1135
My CUDA example.
Allocate memory on device
Copy data on device
$
Upvotes: 3