Reputation: 836
I wrote the following CUDA kernel and am trying to load it into a module:
#include <stdio.h>
extern "C" // ensure function name to be exactly "vadd"
{
__global__ void vadd(const float *a, const float *b, float *c)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
printf("Thread id %d\n", i);
c[i] = a[i] + b[i];
}
}
I compile it to ptx code using the following command:
nvcc -ptx -arch=sm_20 vadd.cu
When trying to load this file into a module using cuModuleLoad
I get a CUDA 200 error (invalid kernel image). How can I find out what is wrong with the kernel image? I have tried ptxas
, but according to that, the generated ptx code is fine.
Edit: This is the code I am using to load the module:
#include "cuda.h"
#include <cassert>
#include <dlfcn.h>
#include <stdio.h>
void check(CUresult err) {
if (err != CUDA_SUCCESS) {
printf("Error %i\n", err);
}
assert(err == CUDA_SUCCESS);
}
int main(int argc, char **argv) {
void *cuda = dlopen("libcuda.so", RTLD_NOW | RTLD_DEEPBIND | RTLD_GLOBAL);
assert(cuda != NULL);
printf("cuInit\n");
CUresult (*Init)() = (CUresult (*)()) dlsym(cuda, "cuInit");
check(Init());
printf("cuDeviceGet\n");
CUresult (*DeviceGet)(CUdevice *, int) = (CUresult (*)(CUdevice *, int)) dlsym(cuda, "cuDeviceGet");
CUdevice device;
check(DeviceGet(&device, 0));
printf("cuCtxCreate\n");
CUresult (*CtxCreate)(CUcontext * , unsigned int, CUdevice) = (CUresult (*)(CUcontext * , unsigned int, CUdevice)) dlsym(cuda, "cuCtxCreate");
CUcontext context;
check(CtxCreate(&context, 0, device));
printf("cuModuleLoad\n");
CUresult (*ModuleLoad)(CUmodule *, const char*) = (CUresult (*)(CUmodule *, const char*)) dlsym(cuda, "cuModuleLoad");
CUmodule mod;
check(ModuleLoad(&mod, "vadd.ptx"));
return 0;
}
Upvotes: 1
Views: 2434
Reputation: 109
This is related to Why cuCtxCreate creates old context?: you are using cuCtxCreate
directly, which gives you an old API context (v3.1) incompatible with your usage of printf
. You can check the API version with cuCtxGetApiVersion
. If you switch to cuCtxCreate_v2
, which is normally used through some #define
's in cuda.h
, you'll get a more recent API context.
In order to spot this discrepancy, I've run your sample with LD_DEBUG=symbols
, and compared it to using the CUDA API directly (since it properly runs your sample PTX). Comparing symbol resolutions, the big difference was the call to cuCtxCreate
:
cuCtxCreate(...)
symbol=cuCtxCreate_v2; lookup in file=./test [0]
symbol=cuCtxCreate_v2; lookup in file=/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0]
... which in your original code, using dlsym(..., "cuCtxCreate")
mapped directly to cuCtxCreate
.
Upvotes: 3