CUDA API error = 0001 on memory copy from host to device for a global device pointer defined in the kernel

We have a scenario, where a global device pointer is declared in the kernel file. We should be mapping that pointer from the host and copy a host memory pointer to the device pointer.

Currently the setup for a sample program is, the kernel function defined in .cu file as below

#include <stdio.h>
#include <cuda.h>

__device__ unsigned int * p_GLOBAL_HOLDER_Picture;


__global__ void CKNL_GrayTransform()
{
    if (NULL == p_GLOBAL_HOLDER_Picture)
    {
        printf("omg bob does not know what he is talking about\n");
    }
    else
    {
      printf("the data is: %u %u\n", p_GLOBAL_HOLDER_Picture[0], p_GLOBAL_HOLDER_Picture[1]);
    }
}`

The invocation of the kernel is done from a test.cpp file as below

`#include <stdio.h>
#include <fstream>
#include <cuda.h>
#include <cuda_runtime.h>

#define FATBIN_FILE "kernel.fatbin"
#define FUNCTION_NAME "_Z18CKNL_GrayTransformv"
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
#define checkCudaResult(err)  __checkCudaResult (err, __FILE__, __LINE__)

// Load Fatbin file
void* getFile() {
  std::ifstream file(FATBIN_FILE, std::ios::binary | std::ios::ate);
  std::streamsize size = file.tellg();
  file.seekg(0, std::ios::beg);
  void* buffer = malloc(size);
  file.read((char*) buffer, size);
  return buffer;
}

inline void __checkCudaResult( CUresult err, const char *file, const int line )
{
    if( CUDA_SUCCESS != err) {
        printf(
                "CUDA API error = %04d from file <%s>, line %i.\n",
                err, file, line );
        cudaDeviceReset();
        exit(-1);
    }
}

inline void __checkCudaErrors(cudaError_t err, const char *file, const int line )
{
    if( CUDA_SUCCESS != err) {
        printf(
                "CUDA API error = %04d from file <%s>, line %i.\n",
                err, file, line );
        cudaDeviceReset();
        exit(-1);
    }
}

int main(){
  CUcontext ctx;
  CUdevice dev = 0;
  // Initialize handle
  checkCudaResult(cuInit(0));
  // Create context of handle
  checkCudaResult(cuCtxCreate(&ctx, 0, dev));
  int count = 8;
  const int SIZE = sizeof(unsigned int);

  unsigned int * ptr;
  ptr = (unsigned int*)malloc(count * SIZE);

  CUmodule modId = 0;
    CUfunction funcHandle = 0;
  CUstream cuStream;
  checkCudaResult(cuModuleLoadFatBinary(&modId, getFile()));
    checkCudaResult(cuModuleGetFunction(&funcHandle, modId, FUNCTION_NAME));
  checkCudaResult(cuStreamCreate(&cuStream, CU_STREAM_DEFAULT));

  CUdeviceptr global_a;
  size_t global_a_bytes;
  checkCudaResult(cuModuleGetGlobal(&global_a, NULL, modId, "p_GLOBAL_HOLDER_Picture"));
  checkCudaResult(cuMemcpyHtoD(global_a, (void*)(ptr), count * SIZE));

  // Assign threads and blocks dims
  int threadsPerBlock = 1;
  int blocksPerGrid = 1;

  checkCudaResult(cuLaunchKernel((CUfunction)funcHandle, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, cuStream, NULL, NULL));
  checkCudaResult(cuCtxSynchronize());

  free(ptr);
  checkCudaResult(cuCtxDestroy(ctx));
  
}`

The kernel file is compiled into a .fatbin file with commands of

nvcc -arch=sm_52 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87 -gencode=arch=compute_86,code=compute_86 kernel.cu -lcuda --fatbin -o kernel.fatbin

When we are trying to launch the kernel using the .fatbin file with help CUDA DRIVER API, we are currently facing the error CUDA API error = 0001 from file <test.cpp>, line 66. in the following line

checkCudaResult(cuMemcpyHtoD(global_a, (void*)(ptr), count * SIZE));

We are able to map the pointer defined in the kernel file in line 65 successfully, but while copying the memory we are facing this error. Should we be allocating any memory to the pointer defined in kernel file. We tried allocating the memory with the use of cuMemAlloc from the test.cpp, after this as well we are facing the cuda error.

Any pointers on how can we get this type of code running without errors would be helpful. Could not attach the .fatbin file.

Upvotes: 0

Views: 86

Answers (1)

talonmies
talonmies

Reputation: 72342

Your global memory static pointer can only hold an address. The runtime error is telling you that the size is invalid, not the address.

Should we be allocating any memory to the pointer defined in kernel file.

Yes. Like this:

  CUdeviceptr global_a, global_buf;
  cuMemAlloc(&global_buf, count * SIZE);
  cuMemcpyHtoD(global_buf, (void*)(ptr), count * SIZE));

  size_t global_a_bytes;
  cuModuleGetGlobal(&global_a, &global_a_bytes, modId, "p_GLOBAL_HOLDER_Picture");
  cuMemcpyHtoD(global_a, (void*)&global_buf, global_a_bytes);

[disclaimer code written in browser]

You need to allocate a global memory buffer on the device, copy your data to the device and then copy the GPU address of the runtime buffer (i.e. its pointer value) to the statically defined pointer on the GPU.

Upvotes: 2

Related Questions