CUDA debug invalid kernel image error

Question

I wrote the following CUDA kernel and am trying to load it into a module:

#include 

extern "C"   // ensure function name to be exactly "vadd"
{
    __global__ void vadd(const float *a, const float *b, float *c)
    {
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        printf("Thread id %d
", i);
        c[i] = a[i] + b[i];
    }
}

I compile it to ptx code using the following command:

nvcc -ptx -arch=sm_20 vadd.cu

When trying to load this file into a module using cuModuleLoad I get a CUDA 200 error (invalid kernel image). How can I find out what is wrong with the kernel image? I have tried ptxas, but according to that, the generated ptx code is fine.

Edit: This is the code I am using to load the module:

#include "cuda.h"
#include 
#include 
#include 

void check(CUresult err) {
  if (err != CUDA_SUCCESS) {
    printf("Error %i
", err);
  }
  assert(err == CUDA_SUCCESS);
}

int main(int argc, char **argv) {
    void *cuda = dlopen("libcuda.so", RTLD_NOW | RTLD_DEEPBIND | RTLD_GLOBAL);
    assert(cuda != NULL);

    printf("cuInit
");
    CUresult (*Init)() = (CUresult (*)()) dlsym(cuda, "cuInit");
    check(Init());

    printf("cuDeviceGet
");
    CUresult (*DeviceGet)(CUdevice *, int) = (CUresult (*)(CUdevice *, int)) dlsym(cuda, "cuDeviceGet");
    CUdevice device;
    check(DeviceGet(&device, 0));

    printf("cuCtxCreate
");
    CUresult (*CtxCreate)(CUcontext * , unsigned int, CUdevice) = (CUresult (*)(CUcontext * , unsigned int, CUdevice)) dlsym(cuda, "cuCtxCreate");
    CUcontext context;
    check(CtxCreate(&context, 0, device));

    printf("cuModuleLoad
");
    CUresult (*ModuleLoad)(CUmodule *, const char*) = (CUresult (*)(CUmodule *, const char*)) dlsym(cuda, "cuModuleLoad");
    CUmodule mod;
    check(ModuleLoad(&mod, "vadd.ptx"));

    return 0;
}

maleadt · Accepted Answer

This is related to Why cuCtxCreate creates old context?: you are using cuCtxCreate directly, which gives you an old API context (v3.1) incompatible with your usage of printf. You can check the API version with cuCtxGetApiVersion. If you switch to cuCtxCreate_v2, which is normally used through some #define's in cuda.h, you'll get a more recent API context.

In order to spot this discrepancy, I've run your sample with LD_DEBUG=symbols, and compared it to using the CUDA API directly (since it properly runs your sample PTX). Comparing symbol resolutions, the big difference was the call to cuCtxCreate:

cuCtxCreate(...)
    symbol=cuCtxCreate_v2;  lookup in file=./test [0]
    symbol=cuCtxCreate_v2;  lookup in file=/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0]

... which in your original code, using dlsym(..., "cuCtxCreate") mapped directly to cuCtxCreate.

CUDA debug invalid kernel image error

Answers (1)

Related Questions