CUDA API error = 0001 on memory copy from host to device for a global device pointer defined in the kernel

Question

We have a scenario, where a global device pointer is declared in the kernel file. We should be mapping that pointer from the host and copy a host memory pointer to the device pointer.

Currently the setup for a sample program is, the kernel function defined in .cu file as below

#include 
#include 

__device__ unsigned int * p_GLOBAL_HOLDER_Picture;


__global__ void CKNL_GrayTransform()
{
    if (NULL == p_GLOBAL_HOLDER_Picture)
    {
        printf("omg bob does not know what he is talking about
");
    }
    else
    {
      printf("the data is: %u %u
", p_GLOBAL_HOLDER_Picture[0], p_GLOBAL_HOLDER_Picture[1]);
    }
}`

The invocation of the kernel is done from a test.cpp file as below

`#include 
#include 
#include 
#include 

#define FATBIN_FILE "kernel.fatbin"
#define FUNCTION_NAME "_Z18CKNL_GrayTransformv"
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
#define checkCudaResult(err)  __checkCudaResult (err, __FILE__, __LINE__)

// Load Fatbin file
void* getFile() {
  std::ifstream file(FATBIN_FILE, std::ios::binary | std::ios::ate);
  std::streamsize size = file.tellg();
  file.seekg(0, std::ios::beg);
  void* buffer = malloc(size);
  file.read((char*) buffer, size);
  return buffer;
}

inline void __checkCudaResult( CUresult err, const char *file, const int line )
{
    if( CUDA_SUCCESS != err) {
        printf(
                "CUDA API error = %04d from file <%s>, line %i.
",
                err, file, line );
        cudaDeviceReset();
        exit(-1);
    }
}

inline void __checkCudaErrors(cudaError_t err, const char *file, const int line )
{
    if( CUDA_SUCCESS != err) {
        printf(
                "CUDA API error = %04d from file <%s>, line %i.
",
                err, file, line );
        cudaDeviceReset();
        exit(-1);
    }
}

int main(){
  CUcontext ctx;
  CUdevice dev = 0;
  // Initialize handle
  checkCudaResult(cuInit(0));
  // Create context of handle
  checkCudaResult(cuCtxCreate(&ctx, 0, dev));
  int count = 8;
  const int SIZE = sizeof(unsigned int);

  unsigned int * ptr;
  ptr = (unsigned int*)malloc(count * SIZE);

  CUmodule modId = 0;
    CUfunction funcHandle = 0;
  CUstream cuStream;
  checkCudaResult(cuModuleLoadFatBinary(&modId, getFile()));
    checkCudaResult(cuModuleGetFunction(&funcHandle, modId, FUNCTION_NAME));
  checkCudaResult(cuStreamCreate(&cuStream, CU_STREAM_DEFAULT));

  CUdeviceptr global_a;
  size_t global_a_bytes;
  checkCudaResult(cuModuleGetGlobal(&global_a, NULL, modId, "p_GLOBAL_HOLDER_Picture"));
  checkCudaResult(cuMemcpyHtoD(global_a, (void*)(ptr), count * SIZE));

  // Assign threads and blocks dims
  int threadsPerBlock = 1;
  int blocksPerGrid = 1;

  checkCudaResult(cuLaunchKernel((CUfunction)funcHandle, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, cuStream, NULL, NULL));
  checkCudaResult(cuCtxSynchronize());

  free(ptr);
  checkCudaResult(cuCtxDestroy(ctx));
  
}`

The kernel file is compiled into a .fatbin file with commands of

nvcc -arch=sm_52 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_86,code=compute_86 kernel.cu -lcuda --fatbin -o kernel.fatbin

When we are trying to launch the kernel using the .fatbin file with help CUDA DRIVER API, we are currently facing the error CUDA API error = 0001 from file , line 66. in the following line

checkCudaResult(cuMemcpyHtoD(global_a, (void*)(ptr), count * SIZE));

We are able to map the pointer defined in the kernel file in line 65 successfully, but while copying the memory we are facing this error. Should we be allocating any memory to the pointer defined in kernel file. We tried allocating the memory with the use of cuMemAlloc from the test.cpp, after this as well we are facing the cuda error.

Any pointers on how can we get this type of code running without errors would be helpful. Could not attach the .fatbin file.

CUDA API error = 0001 on memory copy from host to device for a global device pointer defined in the kernel

Answers (1)

Related Questions