Reputation: 33
I want to pass by reference in CUDA inside a MEX function for MATLAB. I call add_func, which returns multiple values through its arguments, and I pass these by pointer, but there is a problem when the code executes. Please take a look at my code and give me some advice.
#include <stdio.h>
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"
typedef void(*op_func_t) (double, double, double*, double*);
typedef void(*my_func_t) (double, double, double, double, void (*func)(double, double, double *, double *));
__device__ void add_func(double x, double y, double *z, double *k)
{
*z= x + y;
*k= x + y;
}
__device__ void mul_func(double x, double y, double *z, double *k)
{
*z= x * y;
*k= x * y;
}
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5)
{
(*op2)(x[i], y[i], u[i], v[i], op1);
}
else
{
v[i]=10;
u[i]=8;
}
__syncthreads(); // wait for each thread to copy its element
}
//host code
void mexFunction(int nlhs,mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *C;
mxGPUArray *B;
mxGPUArray *D;
double const *d_A;
double const *d_C;
double *d_B;
double *d_D;
int N;
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
A = mxGPUCreateFromMxArray(prhs[0]);
C = mxGPUCreateFromMxArray(prhs[1]);
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
d_A = (double const *)(mxGPUGetDataReadOnly(A));
d_C = (double const *)(mxGPUGetDataReadOnly(C));
/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
D = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_B = (double *)(mxGPUGetData(B));
d_D = (double *)(mxGPUGetData(D));
/*
* Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
* and it would be possible for the number of elements to be too large for
* the grid. For this example we are not guarding against this possibility.
*/
N = (int)(mxGPUGetNumberOfElements(A));
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);
plhs[1] = mxGPUCreateMxArrayOnGPU(D);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the MEX function.
*/
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
mxGPUDestroyGPUArray(D);
return;
}
Here are my results:
[a,b]=return_values(gpuArray(ones(10)),gpuArray(rand(10)))
a =
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
b =
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
Some initialization code:
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
Upvotes: 1
Views: 3967
Reputation: 72372
There are a number of problems here. This:
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
is broken. The modified values of z and k will never be returned to the caller because of pass-by-value. You can fix this by using references (CUDA is C++ based, and references are supported in __device__ functions):
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
The second problem is a lack of bounds checking within the kernel, which will lead to out-of-bounds memory accesses whenever N is not an exact multiple of threadsPerBlock. In your 10x10 example, N = 100 but a full block of 256 threads is launched, so threads 100 to 255 write past the end of u and v.
When I fix both these things in your code like this:
#include <thrust/device_vector.h>
#include <iostream>
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5) {
(*op2)(x[i], y[i], u[i], v[i], op1);
} else if (i<N) {
v[i]=10;
u[i]=8;
}
}
//host code
int main()
{
const size_t n = 5;
const size_t N = n * n;
/* Declare all variables.*/
thrust::device_vector<double> A(N, 1.0);
thrust::device_vector<double> C(N, 1.0);
thrust::device_vector<double> B(N);
thrust::device_vector<double> D(N);
double *d_A = thrust::raw_pointer_cast(A.data());
double *d_C = thrust::raw_pointer_cast(C.data());
double *d_B = thrust::raw_pointer_cast(B.data());
double *d_D = thrust::raw_pointer_cast(D.data());
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
cudaDeviceSynchronize();
for(const auto bval: B)
std::cout << bval << std::endl;
for(const auto dval: D)
std::cout << dval << std::endl;
return 0;
}
the CUDA part of your code works correctly, as far as I can tell:
$ nvcc -o mexhead -std=c++11 -arch=sm_52 mexhead.cu
$ cuda-memcheck ./mexhead
========= CUDA-MEMCHECK
2
2
2
2
2
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
2
2
2
2
2
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
========= ERROR SUMMARY: 0 errors
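On the MEX side, nothing else has to change beyond rebuilding against the reference-based typedefs. What I would add, though, is error checking around the symbol copies and the kernel launch, so that a failure surfaces as a MATLAB error instead of garbage output. Here is a minimal sketch of that, reusing the names from your mexFunction (the error-ID strings are just placeholders I made up, and the extra d_myfunc copies from your original are dropped because the host-side values can be passed directly):
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy each device function pointer to the host, keeping the first error seen
cudaError_t err = cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
if (err == cudaSuccess)
    err = cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
if (err == cudaSuccess)
    err = cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
if (err != cudaSuccess)
    mexErrMsgIdAndTxt("return_values:memcpyFromSymbol", cudaGetErrorString(err));
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D, d_B, N,
                                           h_mul_func, h_add_func, h_my_func);
// Synchronize so launch or runtime errors are caught before the results
// are handed back to MATLAB
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    mexErrMsgIdAndTxt("return_values:kernel", cudaGetErrorString(err));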
Upvotes: 3