PieterV
PieterV

Reputation: 836

CUDA debug invalid kernel image error

I wrote the following CUDA kernel and am trying to load it into a module:

#include <stdio.h>

extern "C"   // ensure function name to be exactly "vadd"
{
    __global__ void vadd(const float *a, const float *b, float *c)
    {
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        printf("Thread id %d\n", i);
        c[i] = a[i] + b[i];
    }
}

I compile it to ptx code using the following command:

nvcc -ptx -arch=sm_20 vadd.cu

When trying to load this file into a module using cuModuleLoad I get a CUDA 200 error (invalid kernel image). How can I find out what is wrong with the kernel image? I have tried ptxas, but according to that, the generated ptx code is fine.

Edit: This is the code I am using to load the module:

#include "cuda.h"
#include <cassert>
#include <dlfcn.h>
#include <stdio.h>

void check(CUresult err) {
  if (err != CUDA_SUCCESS) {
    printf("Error %i\n", err);
  }
  assert(err == CUDA_SUCCESS);
}

int main(int argc, char **argv) {
    void *cuda = dlopen("libcuda.so", RTLD_NOW | RTLD_DEEPBIND | RTLD_GLOBAL);
    assert(cuda != NULL);

    printf("cuInit\n");
    CUresult (*Init)() = (CUresult (*)()) dlsym(cuda, "cuInit");
    check(Init());

    printf("cuDeviceGet\n");
    CUresult (*DeviceGet)(CUdevice *, int) = (CUresult (*)(CUdevice *, int)) dlsym(cuda, "cuDeviceGet");
    CUdevice device;
    check(DeviceGet(&device, 0));

    printf("cuCtxCreate\n");
    CUresult (*CtxCreate)(CUcontext * , unsigned int, CUdevice) = (CUresult (*)(CUcontext * , unsigned int, CUdevice)) dlsym(cuda, "cuCtxCreate");
    CUcontext context;
    check(CtxCreate(&context, 0, device));

    printf("cuModuleLoad\n");
    CUresult (*ModuleLoad)(CUmodule *, const char*) = (CUresult (*)(CUmodule *, const char*)) dlsym(cuda, "cuModuleLoad");
    CUmodule mod;
    check(ModuleLoad(&mod, "vadd.ptx"));

    return 0;
}

Upvotes: 1

Views: 2434

Answers (1)

maleadt
maleadt

Reputation: 109

This is related to Why cuCtxCreate creates old context?: you are using cuCtxCreate directly, which gives you an old API context (v3.1) incompatible with your usage of printf. You can check the API version with cuCtxGetApiVersion. If you switch to cuCtxCreate_v2, which is normally used through some #define's in cuda.h, you'll get a more recent API context.

In order to spot this discrepancy, I've run your sample with LD_DEBUG=symbols, and compared it to using the CUDA API directly (since it properly runs your sample PTX). Comparing symbol resolutions, the big difference was the call to cuCtxCreate:

cuCtxCreate(...)
    symbol=cuCtxCreate_v2;  lookup in file=./test [0]
    symbol=cuCtxCreate_v2;  lookup in file=/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0]

... which in your original code, using dlsym(..., "cuCtxCreate") mapped directly to cuCtxCreate.

Upvotes: 3

Related Questions