Reputation: 31
I am creating a GPU-accelerated convolution routine for Python, backed by C code that uses CUDA to access the GPU. To do this, I have the following C code:
#include <cuda.h>
#include <cuda_runtime_api.h>
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GPU device functions for GPU modules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
__global__ void d_VectorConvolve(float *a, float *b, float *c, size_t n_a, size_t n_b, size_t half)
{
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    float val = 0.0;
    if (idx < n_a)
    {
        for (int j = 0; j < n_b; j++)
        {
            int check = idx - half + j; // this is needed to ensure we don't attempt
                                        // to index a value outside the size of a.
            if (check > 0 && check < n_a)
            {
                val = val + a[idx - half + j]*b[j];
            }
        }
        c[idx] = val;
    }
}
extern "C" {
void VectorConvolve(float *a, float *b, float *c, size_t n_a, size_t n_b, size_t half)
{
    float *d_a, *d_b, *d_c;
    cudaMalloc( &d_a, n_a*sizeof(float));
    cudaMalloc( &d_b, n_b*sizeof(float));
    cudaMalloc( &d_c, n_a*sizeof(float));
    cudaMemcpy( d_a, a, n_a*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, b, n_b*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy( d_c, c, n_a*sizeof(float), cudaMemcpyHostToDevice);
    d_VectorConvolve <<< ceil(n_a / 256.0), 256 >>> (d_a, d_b, d_c, n_a, n_b, half);
    cudaMemcpy( c, d_c, n_a*sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}
}
This file is saved as vector_functions.cu, which I compile with nvcc to produce a .so:
nvcc -Xcompiler -fPIC -shared -o vector_functions.so vector_functions.cu
This all works fine, and the code works when called from C. I then create an init.py file which uses the .so file:
import os
import ctypes
from ctypes import POINTER, c_float, c_size_t
import numpy as np

# directory containing vector_functions.so
current_dir = os.path.dirname(os.path.abspath(__file__))

def get_vector_functions():
    dll = ctypes.CDLL(current_dir + '/vector_functions.so', mode=ctypes.RTLD_GLOBAL)
    # convolve function
    vector_convolve = dll.VectorConvolve
    vector_convolve.argtypes = [POINTER(c_float), POINTER(c_float), POINTER(c_float), c_size_t, c_size_t, c_size_t]
    return vector_convolve

# create __vector_convolve function with get_vector_functions()
__vector_convolve = get_vector_functions()

def cuda_convolve(a, b):
    a = a.astype('float32')
    b = b.astype('float32')
    a_shape = a.shape[0]
    b_shape = b.shape[0]
    half = int(b_shape/2.)
    a_p = a.ctypes.data_as(POINTER(c_float))
    b_p = b.ctypes.data_as(POINTER(c_float))
    c_p = np.zeros(a_shape).ctypes.data_as(POINTER(c_float))
    __vector_convolve(a_p, b_p, c_p, a_shape, b_shape, half)
    c = make_nd_array(c_p, [a_shape], dtype=np.float32, order='C', own_data=True)
    return c
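(make_nd_array is a helper not shown here; it builds a NumPy array from a ctypes pointer. A minimal sketch of such a helper, assuming it wraps the pointer via np.ctypeslib.as_array and copies when own_data is set, would be something like:)

def make_nd_array(c_pointer, shape, dtype=np.float32, order='C', own_data=True):
    # Sketch only; the real helper may differ. View the ctypes pointer as a
    # NumPy array of the requested shape; the element type comes from the
    # pointer type (POINTER(c_float) -> float32).
    arr = np.ctypeslib.as_array(c_pointer, shape=tuple(shape))
    # Copy when the result should own its memory; otherwise the view is only
    # valid while the underlying buffer stays alive.
    return np.array(arr, dtype=dtype, order=order, copy=own_data)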
Now this works fine, and I can load my module to do large convolutions fast. The problem is that, occasionally, I get a segmentation fault, and I have no idea why. Once this happens, I can't use the module again until I restart my computer.
I assume I am not managing my memory correctly, but it's strange how it sometimes works fine and then suddenly fails. I also suspect that linking against a .so file may be a bad idea and may have something to do with it, but it was a quick way to link Python to C.
I am relatively experienced in Python and am getting into C. Most of the code here is adapted from online tutorials and other people's code. I welcome all suggestions as to why this may happen and what can be done to avoid the issue.
The problem seems to go away if I run a few warm-up commands in the Python interpreter:
cuda_convolve(np.ones(2**5), np.ones(100))
cuda_convolve(np.ones(2**10), np.ones(100))
cuda_convolve(np.ones(2**15), np.ones(100))
cuda_convolve(np.ones(2**18), np.ones(100))
After this, I can use it with large arrays with absolutely no issue. However, if I load the module without doing this "warm-up", I get a seg fault.
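A stopgap that follows from this observation (an assumption, not a confirmed fix) is to run one tiny warm-up convolution at import time in init.py, so that any first-use GPU initialization happens predictably at load rather than in the middle of a workload:

# Hypothetical warm-up at import time in init.py; assumes the crashes are
# tied to first use of the library rather than to a specific input.
def _warm_up():
    cuda_convolve(np.ones(2**5), np.ones(100))

_warm_up()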
Upvotes: 2
Views: 604
Reputation: 6468
This is not actually a solution to your segfault problem, but a way to find the actual culprit behind it.
There's no error checking in your code. How do you expect to find the error?
Use the following macro to wrap all your CUDA calls:
#include <cstdio>
#include <cstdlib>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
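One caveat when the library is loaded from Python through ctypes: exit(code) terminates the whole interpreter, not just the failing call. The fprintf output (the file and line of the failing CUDA call) is what you need to locate the problem, so you may prefer to call gpuAssert with abort=false once you know where to look.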
Now to modify your code:
extern "C" {
void VectorConvolve(float *a, float *b, float *c, size_t n_a, size_t n_b, size_t half)
{
    float *d_a, *d_b, *d_c;
    gpuErrchk(cudaMalloc( &d_a, n_a*sizeof(float)));
    gpuErrchk(cudaMalloc( &d_b, n_b*sizeof(float)));
    gpuErrchk(cudaMalloc( &d_c, n_a*sizeof(float)));
    gpuErrchk(cudaMemcpy( d_a, a, n_a*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy( d_b, b, n_b*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy( d_c, c, n_a*sizeof(float), cudaMemcpyHostToDevice));
    d_VectorConvolve <<< ceil(n_a / 256.0), 256 >>> (d_a, d_b, d_c, n_a, n_b, half);
    // check if the kernel launch reported an error
    gpuErrchk(cudaPeekAtLastError());
    // make sure kernel execution has finished
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy( c, d_c, n_a*sizeof(float), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));
}
}
Upvotes: 1