Reputation: 13
I want to write a C++ CUDA program where I pass a class to the kernel. The class just evaluates a function on the kernel via the call operator(). If I hardwire the function in the class, everything works as I'd like. However, I want some flexibility, so I'd like the class to be able to be instantiated with different functions — say, by passing in a function pointer. I can't get the function-pointer implementation to work. Below I define two classes: one that has the function defined (fixedFunction) and another that takes a function pointer (genericFunction)
//Functions.hh
#include <iostream>
#include <stdio.h>
//Functor with the function hardwired into operator().
//It has no data members, so a default-constructed (even zero-initialized)
//object is usable from device code without further setup.
class fixedFunction{
public:
__host__ fixedFunction() {}
//Evaluates the hardwired function f(x) = x*x; callable on host and device.
__host__ __device__ double operator()(double x) {
return x*x;
}
};
//Host-only function: f(x) = x*x. NOTE(review): without a __device__
//qualifier the compiler generates no device code for it, so its address
//is only meaningful in host code.
double f1(double x){
return x*x;
}
//Signature shared by every function the generic functor can wrap.
typedef double (*pf) (double var);
//Functor that stores a caller-supplied function pointer and calls
//through it in operator().
class genericFunction{
public:
//Stores a single function pointer taken from host code.
//NOTE(review): a host entry address is not callable from device code.
__host__ genericFunction(double (*infunc)(double)) : func(infunc){}
//Calls through the stored pointer on whichever side invokes it.
__host__ __device__ double operator()(double x) {
return func(x);
}
private:
pf func; //one entry point shared by both host and device paths
};
//Each thread evaluates the fixed functor at its global 1-D thread index
//and prints the result (printf from device code; debugging only).
__global__ void kernel1(fixedFunction* g1){
unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
printf("Func val is: %f\n", (*g1)(tid));
}
//Same as kernel1, but calls through the functor's stored function
//pointer. NOTE(review): that pointer holds a host address here, which
//is what makes this kernel fail.
__global__ void kernel2(genericFunction* g1){
unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
printf("Func val is: %f\n", (*g1)(tid));
}
Instantiating both classes and running them on the host works. Passing them to the relevant kernels, I see that kernel2 — where the class calls through a function pointer — fails.
#include "Functions.hh"
//NOTE(review): the closing brace of main is missing from this quote.
int main(){
fixedFunction h_g1;
//Managed allocation: the "empty" object works on the device because the
//class has no data members.
fixedFunction* d_g1;
cudaMallocManaged(&d_g1, sizeof(h_g1));
//Host call
std::cout << h_g1(2.0) << "\n";
//device call
kernel1<<<1,32>>>(d_g1);
cudaDeviceSynchronize();
//h_g2 captures the address of f1 as seen from host code.
genericFunction h_g2(f1);
genericFunction* d_g2;
cudaMallocManaged(&d_g2, sizeof(h_g2));
//NOTE(review): d_g2 is never initialized from h_g2, and even if it
//were, the stored pointer is a host entry address — invalid on device.
//Host call
std::cout << h_g2(3.0) << "\n";
//device call
kernel2<<<1,32>>>(d_g2);
cudaDeviceSynchronize();
I can see an issue in that the pointed-to function can be anything, and that is not accounted for on the device. So is there a way to pass a function pointer to a class and run it on the device?
Thanks
Upvotes: 1
Views: 2815
Reputation: 151879
This was approximately the "smallest" number of changes I could make to your code to get it to function approximately as you appeared to intend. Also note that there are many other questions about function pointers in CUDA, this answer links to several.
decorate f1
with __host__ __device__
. This is necessary to get the compiler to generate a device-callable routine for it. Otherwise, only host code is generated.
we need to capture the device entry address for the device callable version of f1
created in 1 above. There are a number of methods to do this. I will capture it "statically" with another __device__
variable (f1_d
) and then use cudaMemcpyFromSymbol
to pull it into host code.
Your genericFunction
class is modified to be able to hold both __host__
and separate __device__
entry points (function pointers) for the desired function. Also, the class is modified to select the proper one, based on whether we are compiling the host or device version of the class (__CUDA_ARCH__
macro), and the class constructor is modified to accept and assign both entry points.
Finally, we also need to initialize the d_g2
object on the device. In the case of the d_g1
object, there are no class data members for that object, so we can "get away with" creating an "empty" object pointed to by d_g1
and it works correctly because the entry points for that object's class member functions are already known in device code. However, in the case of d_g2
, we are accessing the functions indirectly via class data members which are pointers to the respective host and device versions (entry points) of the function. Therefore, after initializing the h_g2
object in host code, and establishing storage for the d_g2
object in device code, we must initialize d_g2
with the contents of h_g2
using cudaMemcpy
after the cudaMallocManaged
for d_g2
.
With those changes, your code works as written according to my test:
$ cat t353.cu
#include <cstdlib>
#include <iostream>
#include <stdio.h>
// Functor whose function is fixed at compile time: f(x) = x^2.
// It carries no state, so any instance — including one living in
// zero-initialized device memory — evaluates correctly on either side.
class fixedFunction{
public:
  __host__ fixedFunction() {}
  // Square the argument; usable from both host and device code.
  __host__ __device__ double operator()(double x) {
    double squared = x * x;
    return squared;
  }
};
// f(x) = x^2, compiled for both host and device so that a device-side
// entry point exists whose address can be taken.
__host__ __device__ double f1(double x){
  const double y = x * x;
  return y;
}
//Signature of the wrapped function: double -> double.
typedef double (*pf) (double var);
//Module-scope __device__ variable statically initialized with the
//device entry address of f1; host code reads it back with
//cudaMemcpyFromSymbol (taking &f1 in host code would yield the host
//address instead).
__device__ pf f1_d = f1;
// Functor wrapping a user-supplied function of one double.
// It stores two entry points for the function — one valid in host code,
// one valid in device code — and operator() dispatches to the right one
// at compile time: __CUDA_ARCH__ is defined only while the device-side
// pass of the compiler is running.
class genericFunction{
public:
  // Record both entry points for later dispatch.
  __host__ genericFunction(double (*h_infunc)(double), double (*d_infunc)(double)) : h_func(h_infunc),d_func(d_infunc){}
  __host__ __device__ double operator()(double x) {
#ifdef __CUDA_ARCH__
    // Device compilation pass: only the device address is callable here.
    pf entry = d_func;
#else
    // Host compilation pass: only the host address is callable here.
    pf entry = h_func;
#endif
    return entry(x);
  }
private:
  pf h_func; // host-side entry point
  pf d_func; // device-side entry point
};
// Each thread of a 1-D launch evaluates the fixed functor at its global
// thread index and prints the value (device printf; debugging only).
// g1 must point to device-accessible storage.
__global__ void kernel1(fixedFunction* g1){
  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
  double val = (*g1)(tid);
  printf("Func val is: %f\n", val);
}
// Same shape as kernel1, but the functor dispatches through its stored
// device-side function pointer. g1 must have been initialized from a
// host object that holds a valid device entry address.
__global__ void kernel2(genericFunction* g1){
  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
  double val = (*g1)(tid);
  printf("Func val is: %f\n", val);
}
// Demonstrates both functors on host and device.
// Improvements over the original: every CUDA runtime call is checked
// (kernel launches via cudaGetLastError), and the managed allocations
// are freed before exit. Failures from kernel2's indirect call would
// otherwise be silent until much later.
int main(){
  // Abort with a readable message if a CUDA runtime call failed.
  auto check = [](cudaError_t err, const char* what){
    if (err != cudaSuccess){
      std::cerr << "CUDA error (" << what << "): "
                << cudaGetErrorString(err) << "\n";
      std::exit(1);
    }
  };

  fixedFunction h_g1;
  fixedFunction* d_g1;
  check(cudaMallocManaged(&d_g1, sizeof(h_g1)), "cudaMallocManaged d_g1");
  //Host call
  std::cout << h_g1(2.0) << "\n";
  //device call — fixedFunction has no data members, so the
  //default-initialized managed object is already valid on the device.
  kernel1<<<1,32>>>(d_g1);
  check(cudaGetLastError(), "kernel1 launch");
  check(cudaDeviceSynchronize(), "kernel1 sync");

  // Pull the device-side entry address of f1 out of the __device__
  // variable f1_d so the host-constructed object can store it.
  pf d_f1;
  check(cudaMemcpyFromSymbol(&d_f1, f1_d, sizeof(void*)),
        "cudaMemcpyFromSymbol f1_d");
  genericFunction h_g2(f1, d_f1);
  genericFunction* d_g2;
  check(cudaMallocManaged(&d_g2, sizeof(h_g2)), "cudaMallocManaged d_g2");
  // genericFunction HAS data members (the two function pointers), so
  // the device copy must be initialized from the host object.
  check(cudaMemcpy(d_g2, &h_g2, sizeof(h_g2), cudaMemcpyDefault),
        "cudaMemcpy d_g2");
  //Host call
  std::cout << h_g2(3.0) << "\n";
  //device call
  kernel2<<<1,32>>>(d_g2);
  check(cudaGetLastError(), "kernel2 launch");
  check(cudaDeviceSynchronize(), "kernel2 sync");

  // Release the managed allocations.
  cudaFree(d_g1);
  cudaFree(d_g2);
  return 0;
}
$ nvcc -arch=sm_61 -o t353 t353.cu
$ cuda-memcheck ./t353
========= CUDA-MEMCHECK
4
Func val is: 0.000000
Func val is: 1.000000
Func val is: 4.000000
Func val is: 9.000000
Func val is: 16.000000
Func val is: 25.000000
Func val is: 36.000000
Func val is: 49.000000
Func val is: 64.000000
Func val is: 81.000000
Func val is: 100.000000
Func val is: 121.000000
Func val is: 144.000000
Func val is: 169.000000
Func val is: 196.000000
Func val is: 225.000000
Func val is: 256.000000
Func val is: 289.000000
Func val is: 324.000000
Func val is: 361.000000
Func val is: 400.000000
Func val is: 441.000000
Func val is: 484.000000
Func val is: 529.000000
Func val is: 576.000000
Func val is: 625.000000
Func val is: 676.000000
Func val is: 729.000000
Func val is: 784.000000
Func val is: 841.000000
Func val is: 900.000000
Func val is: 961.000000
9
Func val is: 0.000000
Func val is: 1.000000
Func val is: 4.000000
Func val is: 9.000000
Func val is: 16.000000
Func val is: 25.000000
Func val is: 36.000000
Func val is: 49.000000
Func val is: 64.000000
Func val is: 81.000000
Func val is: 100.000000
Func val is: 121.000000
Func val is: 144.000000
Func val is: 169.000000
Func val is: 196.000000
Func val is: 225.000000
Func val is: 256.000000
Func val is: 289.000000
Func val is: 324.000000
Func val is: 361.000000
Func val is: 400.000000
Func val is: 441.000000
Func val is: 484.000000
Func val is: 529.000000
Func val is: 576.000000
Func val is: 625.000000
Func val is: 676.000000
Func val is: 729.000000
Func val is: 784.000000
Func val is: 841.000000
Func val is: 900.000000
Func val is: 961.000000
========= ERROR SUMMARY: 0 errors
$
Upvotes: 5