photon
photon

Reputation: 13

CUDA: passing a class to the device with a class member that is a function pointer

I want to write a C++ CUDA program where I pass a class to the kernel. The class just evaluates a function in the kernel via the call operator(). If I hardwire the function in the class, everything works as I'd like. However, I want some flexibility with the class, so I'd like it to be able to be instantiated with different functions — say, by passing in a function pointer. I can't get the function-pointer implementation to work. Below I define two classes: one that has the function defined inside it (fixedFunction) and another that takes a function pointer (genericFunction).

//Functions.hh
#include <iostream>
#include <stdio.h>

// Functor whose function body is hard-wired into operator().
// Works in both host and device code because operator() itself is
// compiled for both sides (__host__ __device__) -- no indirection.
class fixedFunction{
public:
 __host__ fixedFunction() {}
 __host__ __device__ double operator()(double x) {
    return x*x;
 }
};

// NOTE(review): f1 is compiled as host code only -- it has no __device__
// qualifier, so no device-callable version exists. Taking its address
// therefore yields a host-only entry point that a kernel cannot call
// (this is the root cause of the failure described below).
double f1(double x){
  return x*x;
}

// pf: pointer to a function taking a double and returning a double.
typedef double (*pf) (double var);

// Functor that calls through a stored function pointer. The pointer is
// captured by host code, so `func` holds a HOST address; calling through
// it from device code is invalid.
class genericFunction{
public:
  __host__ genericFunction(double (*infunc)(double)) : func(infunc){}
  __host__ __device__ double operator()(double x) {
    return func(x);
  }
private:
  pf func;   // function pointer supplied at construction time (host address)
};

// Evaluates the fixed functor at the thread's global 1-D index and
// prints one value per thread.
__global__ void kernel1(fixedFunction* g1){
  unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
  printf("Func val is: %f\n", (*g1)(tid));
}

// Same shape as kernel1, but through the function-pointer functor.
// This is the kernel that fails: (*g1)(tid) ends up calling through a
// host function pointer from device code.
__global__ void kernel2(genericFunction* g1){
  unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
  printf("Func val is: %f\n", (*g1)(tid));
}

Instantiating both classes and running them on the host works. Passing them to the relevant kernels, I see that kernel2 — where the class calls through a function pointer — fails.

#include "Functions.hh"

int main(){

  // NOTE(review): no error checking on any CUDA call in this program.
  fixedFunction h_g1;
  fixedFunction* d_g1;
  cudaMallocManaged(&d_g1, sizeof(h_g1));

  //Host call
  std::cout << h_g1(2.0) << "\n";

  //device call -- works: operator() needs no per-object state
  kernel1<<<1,32>>>(d_g1);
  cudaDeviceSynchronize();

  genericFunction h_g2(f1);   // stores the HOST address of f1
  genericFunction* d_g2;
  cudaMallocManaged(&d_g2, sizeof(h_g2));
  // NOTE(review): d_g2 is never initialized from h_g2, so its `func`
  // member is indeterminate when kernel2 reads it. Even if it were
  // copied, `func` would still hold a host-only entry point because f1
  // is not compiled for the device.

  //Host call
  std::cout << h_g2(3.0) << "\n";

  //device call -- this is the call that fails
  kernel2<<<1,32>>>(d_g2);
  cudaDeviceSynchronize();

I can see one issue: the pointed-to function could be anything, and that is not accounted for on the device. So is there a way to pass a function pointer into a class and run it on the device?

Thanks

Upvotes: 1

Views: 2815

Answers (1)

Robert Crovella
Robert Crovella

Reputation: 151879

This was approximately the "smallest" number of changes I could make to your code to get it to function approximately as you appeared to intend. Also note that there are many other questions about function pointers in CUDA, this answer links to several.

  1. decorate f1 with __host__ __device__. This is necessary to get the compiler to generate a device-callable routine for it. Otherwise, only host code is generated.

  2. we need to capture the device entry address for the device callable version of f1 created in 1 above. There are a number of methods to do this. I will capture it "statically" with another __device__ variable (f1_d) and then use cudaMemcpyFromSymbol to pull it into host code.

  3. Your genericFunction class is modified to be able to hold both __host__ and separate __device__ entry points (function pointers) for the desired function. Also, the class is modified to select the proper one, based on whether we are compiling the host or device version of the class (__CUDA_ARCH__ macro), and the class constructor is modified to accept and assign both entry points.

  4. Finally, we also need to initialize the d_g2 object on the device. In the case of the d_g1 object, there are no class data members for that object, so we can "get away with" creating an "empty" object pointed to by d_g1 and it works correctly because the entry points for that object's class member functions are already known in device code. However, in the case of d_g2, we are accessing the functions indirectly via class data members which are pointers to the respective host and device versions (entry points) of the function. Therefore, after initializing the h_g2 object in host code, and establishing storage for the d_g2 object in device code, we must initialize d_g2 with the contents of h_g2 using cudaMemcpy after the cudaMallocManaged for d_g2.

With those changes, your code works as written according to my test:

$ cat t353.cu
#include <iostream>
#include <stdio.h>

// Unchanged from the question: the function body is hard-wired into
// operator(), which is compiled for both host and device, so no
// indirection is involved and nothing needs fixing here.
class fixedFunction{
public:
 __host__ fixedFunction() {}
 __host__ __device__ double operator()(double x) {
    return x*x;
 }
};

// Change 1: f1 is now __host__ __device__, so the compiler emits a
// device-callable version in addition to the host one.
__host__ __device__ double f1(double x){
  return x*x;
}

// pf: pointer to a function taking a double and returning a double.
typedef double (*pf) (double var);

// Change 2: a __device__ variable statically initialized with the
// DEVICE entry address of f1. Host code retrieves this address later
// via cudaMemcpyFromSymbol.
__device__ pf f1_d = f1;

// Change 3: the functor now holds BOTH a host and a device entry point
// for the target function, and operator() selects the right one based
// on which compilation pass (host or device) is compiling it.
class genericFunction{
public:
  __host__ genericFunction(double (*h_infunc)(double), double (*d_infunc)(double)) : h_func(h_infunc),d_func(d_infunc){}
  __host__ __device__ double operator()(double x) {
#ifdef __CUDA_ARCH__
    return d_func(x);   // device compilation pass: call the device entry point
#else
    return h_func(x);   // host compilation pass: call the host entry point
#endif
  }
private:
  pf h_func;   // host-side entry point
  pf d_func;   // device-side entry point
};

// Evaluates the fixed functor at the thread's global 1-D index and
// prints one value per thread (unchanged from the question).
__global__ void kernel1(fixedFunction* g1){
  unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
  printf("Func val is: %f\n", (*g1)(tid));
}

// Evaluates the generic functor at the thread's global 1-D index.
// Now works: on the device, operator() calls through d_func, which
// holds a genuine device entry point.
__global__ void kernel2(genericFunction* g1){
  unsigned int tid = blockIdx.x *blockDim.x + threadIdx.x;
  printf("Func val is: %f\n", (*g1)(tid));
}

int main(){

  // NOTE(review): CUDA API return codes are still unchecked throughout;
  // the answer keeps changes minimal, but production code should check.
  fixedFunction h_g1;
  fixedFunction* d_g1;
  cudaMallocManaged(&d_g1, sizeof(h_g1));

  //Host call
  std::cout << h_g1(2.0) << "\n";

  //device call
  kernel1<<<1,32>>>(d_g1);
  cudaDeviceSynchronize();
  // Pull the device entry address of f1 out of the __device__ variable
  // f1_d into host-accessible storage (change 2).
  pf d_f1;
  cudaMemcpyFromSymbol(&d_f1, f1_d, sizeof(void*));
  // Construct with both entry points: the host address of f1, and the
  // device address retrieved above (change 3).
  genericFunction h_g2(f1, d_f1);
  genericFunction* d_g2;
  cudaMallocManaged(&d_g2, sizeof(h_g2));
  // Change 4: initialize the device-visible object from the host object
  // so its function-pointer data members are valid when kernel2 reads them.
  cudaMemcpy(d_g2, &h_g2, sizeof(h_g2), cudaMemcpyDefault);
  //Host call
  std::cout << h_g2(3.0) << "\n";

  //device call
  kernel2<<<1,32>>>(d_g2);
  cudaDeviceSynchronize();
}
$ nvcc -arch=sm_61 -o t353 t353.cu
$ cuda-memcheck ./t353
========= CUDA-MEMCHECK
4
Func val is: 0.000000
Func val is: 1.000000
Func val is: 4.000000
Func val is: 9.000000
Func val is: 16.000000
Func val is: 25.000000
Func val is: 36.000000
Func val is: 49.000000
Func val is: 64.000000
Func val is: 81.000000
Func val is: 100.000000
Func val is: 121.000000
Func val is: 144.000000
Func val is: 169.000000
Func val is: 196.000000
Func val is: 225.000000
Func val is: 256.000000
Func val is: 289.000000
Func val is: 324.000000
Func val is: 361.000000
Func val is: 400.000000
Func val is: 441.000000
Func val is: 484.000000
Func val is: 529.000000
Func val is: 576.000000
Func val is: 625.000000
Func val is: 676.000000
Func val is: 729.000000
Func val is: 784.000000
Func val is: 841.000000
Func val is: 900.000000
Func val is: 961.000000
9
Func val is: 0.000000
Func val is: 1.000000
Func val is: 4.000000
Func val is: 9.000000
Func val is: 16.000000
Func val is: 25.000000
Func val is: 36.000000
Func val is: 49.000000
Func val is: 64.000000
Func val is: 81.000000
Func val is: 100.000000
Func val is: 121.000000
Func val is: 144.000000
Func val is: 169.000000
Func val is: 196.000000
Func val is: 225.000000
Func val is: 256.000000
Func val is: 289.000000
Func val is: 324.000000
Func val is: 361.000000
Func val is: 400.000000
Func val is: 441.000000
Func val is: 484.000000
Func val is: 529.000000
Func val is: 576.000000
Func val is: 625.000000
Func val is: 676.000000
Func val is: 729.000000
Func val is: 784.000000
Func val is: 841.000000
Func val is: 900.000000
Func val is: 961.000000
========= ERROR SUMMARY: 0 errors
$

Upvotes: 5

Related Questions