Reputation: 33
I want to pass by reference in CUDA inside a MEX function for MATLAB. I call add_func, which returns multiple values through its arguments, and I pass these by pointer, but there is a problem when the code executes. Please take a look at my code and give me some advice.
#include <stdio.h>
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"
typedef void(*op_func_t) (double, double, double*, double*);
typedef void(*my_func_t) (double, double, double, double, void (*func)(double, double, double *, double *));
__device__ void add_func(double x, double y, double *z, double *k)
{
*z= x + y;
*k= x + y;
}
__device__ void mul_func(double x, double y, double *z, double *k)
{
*z= x * y;
*k= x * y;
}
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5)
{
(*op2)(x[i], y[i], u[i], v[i], op1);
}
else
{
v[i]=10;
u[i]=8;
}
__syncthreads(); // wait for each thread to copy its element
}
//host code
void mexFunction(int nlhs,mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *C;
mxGPUArray *B;
mxGPUArray *D;
double const *d_A;
double const *d_C;
double *d_B;
double *d_D;
int N;
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
A = mxGPUCreateFromMxArray(prhs[0]);
C = mxGPUCreateFromMxArray(prhs[1]);
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
d_A = (double const *)(mxGPUGetDataReadOnly(A));
d_C = (double const *)(mxGPUGetDataReadOnly(C));
/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
D = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_B = (double *)(mxGPUGetData(B));
d_D = (double *)(mxGPUGetData(D));
/*
* Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
* and it would be possible for the number of elements to be too large for
* the grid. For this example we are not guarding against this possibility.
*/
N = (int)(mxGPUGetNumberOfElements(A));
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);
plhs[1] = mxGPUCreateMxArrayOnGPU(D);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the MEX function.
*/
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
mxGPUDestroyGPUArray(D);
return;
}
Here are my results:
[a,b]=return_values(gpuArray(ones(10)),gpuArray(rand(10)))
a =
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
0 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
10 10 10 10 10 10 10 10 10 10
b =
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
0 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
8 8 8 8 8 8 8 8 8 8
Some initialization code:
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
Upvotes: 1
Views: 3967
Reputation: 72372
There are a number of problems here. This:
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
is broken. The modified values of z and k will never be returned to the caller because of pass-by-value. You can fix this by using references (CUDA is C++ based, and references are supported in __device__ functions):
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
The second problem is a lack of bounds checking within the kernel, which will lead to out-of-bounds memory accesses whenever N is not an exact multiple of threadsPerBlock. In your 10x10 example, N = 100 but a full block of 256 threads is launched, so threads 100 to 255 write past the end of u and v.
When I fix both these things in your code like this:
#include <thrust/device_vector.h>
#include <iostream>
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5) {
(*op2)(x[i], y[i], u[i], v[i], op1);
} else if (i<N) {
v[i]=10;
u[i]=8;
}
}
//host code
int main()
{
const size_t n = 5;
const size_t N = n * n;
/* Declare all variables.*/
thrust::device_vector<double> A(N, 1.0);
thrust::device_vector<double> C(N, 1.0);
thrust::device_vector<double> B(N);
thrust::device_vector<double> D(N);
double *d_A = thrust::raw_pointer_cast(A.data());
double *d_C = thrust::raw_pointer_cast(C.data());
double *d_B = thrust::raw_pointer_cast(B.data());
double *d_D = thrust::raw_pointer_cast(D.data());
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
cudaDeviceSynchronize();
for(const auto bval: B)
std::cout << bval << std::endl;
for(const auto dval: D)
std::cout << dval << std::endl;
return 0;
}
the CUDA part of your code works correctly, as far as I can tell:
$ nvcc -o mexhead -std=c++11 -arch=sm_52 mexhead.cu
$ cuda-memcheck ./mexhead
========= CUDA-MEMCHECK
2
2
2
2
2
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
2
2
2
2
2
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
========= ERROR SUMMARY: 0 errors
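On the MEX side, nothing else has to change beyond rebuilding against the reference-based typedefs. What I would add, though, is error checking around the symbol copies and the kernel launch, so that a failure surfaces as a MATLAB error instead of garbage output. Here is a minimal sketch of that, reusing the names from your mexFunction (the error-ID strings are just placeholders I made up, and the extra d_myfunc copies from your original are dropped because the host-side values can be passed directly):
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy each device function pointer to the host, keeping the first error seen
cudaError_t err = cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
if (err == cudaSuccess)
    err = cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
if (err == cudaSuccess)
    err = cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
if (err != cudaSuccess)
    mexErrMsgIdAndTxt("return_values:memcpyFromSymbol", cudaGetErrorString(err));
kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D, d_B, N,
                                           h_mul_func, h_add_func, h_my_func);
// Synchronize so launch or runtime errors are caught before the results
// are handed back to MATLAB
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    mexErrMsgIdAndTxt("return_values:kernel", cudaGetErrorString(err));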
Upvotes: 3