Reputation: 903
I wrote some CUDA code, and everything seems great until I try to get the results from the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib>
#include <ctime>
#include <iostream>
#define maskSize 3
__constant__ float masks[32*maskSize*maskSize];
// Computes a "full" 2D convolution (output edge length = mSize + maskSize - 1)
// of one mSize x mSize input matrix against one maskSize x maskSize mask that
// lives in __constant__ memory (masks[]).
//
// Expected launch configuration (see host code below):
//   blockDim = (mSize + 2*(maskSize-1), mSize + 2*(maskSize-1))
//   blockIdx.x selects the mask, blockIdx.y selects the input matrix
//   dynamic shared memory: (mSize + 2*(maskSize-1))^2 floats — the launch
//   parameter must be given IN BYTES, i.e. multiplied by sizeof(float).
//
// NOTE(review): the output index on the final store uses only blockIdx.y, so
// with gridDim.x > 1 every mask would write the same slice of res — confirm
// the intended result layout before launching with blockSize(maskNum, bSize).
__global__ void myConv(float *res, const float* mats, int mSize)
{
// Zero-padded copy of the input tile; size (mSize+2*(maskSize-1))^2 floats,
// supplied via the dynamic shared-memory launch parameter.
extern __shared__ float curr[];
int rSize=maskSize+mSize-1; // edge length of the "full" convolution result
// NOTE(review): these wrap out-of-range thread coordinates back into the
// padded tile via modulo; presumably intended to map border threads onto the
// padding cells — verify that every padding cell is actually written.
int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the borders for padding.
int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
else //zero padding
if (threadIdx.x < mSize)
curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
else
curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;
// Barrier is outside all branches, so every thread of the block reaches it
// before any thread reads the shared tile.
__syncthreads();
float tmp=0;
// Only the rSize x rSize upper-left sub-square of threads produces output.
if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
for (int i=0;i<maskSize;i++)
#pragma unroll
for (int j=0;j<maskSize;j++)
// Accumulate the dot product of the mask with the padded tile window;
// blockIdx.x picks which of the stacked masks in constant memory to use.
tmp+=curr[(threadIdx.x+i)*(mSize+2*(maskSize-1)) + threadIdx.y+j]*masks[blockIdx.x*maskSize*maskSize +maskSize*i +j];
res[blockIdx.y*rSize*rSize + threadIdx.x*rSize + threadIdx.y]=tmp;
}
}
// Abort with a readable message if a CUDA runtime call fails. Kernel launches
// themselves return no status: check cudaGetLastError() right after the launch
// (configuration errors) and cudaDeviceSynchronize() (in-kernel faults).
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__     \
                      << ": " << cudaGetErrorString(err_) << std::endl;      \
            std::exit(EXIT_FAILURE);                                         \
        }                                                                    \
    } while (0)

int main()
{
    int MatSize = 5;     // edge length of each square input matrix
    int bSize   = 2000;  // number of input matrices (batch size)
    int maskNum = 10;    // number of masks stored in constant memory
    int resSize = MatSize + maskSize - 1;  // edge length of a "full" convolution result

    // Host buffers: masks, results, inputs. malloc can fail; check it.
    float* ms     = (float *)malloc(maskSize * maskSize * maskNum * sizeof(float));
    float* resPtr = (float *)malloc(resSize * resSize * bSize * maskNum * sizeof(float));
    float* inp    = (float *)malloc(MatSize * MatSize * bSize * sizeof(float));
    if (!ms || !resPtr || !inp)
    {
        std::cerr << "host allocation failed" << std::endl;
        return EXIT_FAILURE;
    }

    // Fill masks and inputs with small pseudo-random values (column-fastest
    // traversal order kept from the original; layout is k-major).
    for (int i = 0; i < maskSize; i++)
        for (int j = 0; j < maskSize; j++)
            for (int k = 0; k < maskNum; k++)
                ms[k * maskSize * maskSize + j * maskSize + i] = (float)(rand() % 1000) / 100;
    for (int i = 0; i < MatSize; i++)
        for (int j = 0; j < MatSize; j++)
            for (int k = 0; k < bSize; k++)
                inp[k * MatSize * MatSize + j * MatSize + i] = (float)(rand() % 500) / 100;

    // Device buffers.
    float *cudams, *cudaresPtr, *cudainp;
    CUDA_CHECK(cudaMalloc((void **)&cudams, maskSize * maskSize * maskNum * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&cudaresPtr, resSize * resSize * bSize * maskNum * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&cudainp, MatSize * MatSize * bSize * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(cudams, ms, maskSize * maskSize * maskNum * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(cudainp, inp, MatSize * MatSize * bSize * sizeof(float), cudaMemcpyHostToDevice));
    // Stage the masks into __constant__ memory (device-to-device from cudams,
    // as in the original; copying straight from ms host-to-device would also work).
    CUDA_CHECK(cudaMemcpyToSymbol(masks, cudams, maskSize * maskSize * maskNum * sizeof(float), 0, cudaMemcpyDeviceToDevice));

    dim3 threadSize(MatSize + 2 * (maskSize - 1), MatSize + 2 * (maskSize - 1));
    dim3 blockSize(1, 1); // for testing purposes. should be dim3 blockSize(maskNum,bSize);

    // BUG FIX: the third launch parameter is dynamic shared memory in BYTES.
    // The original passed only the float count, so the kernel's extern
    // __shared__ buffer was too small and it overran shared memory.
    size_t shmBytes = (size_t)(MatSize + 2 * (maskSize - 1)) *
                      (MatSize + 2 * (maskSize - 1)) * sizeof(float);
    myConv<<<blockSize, threadSize, shmBytes>>>(cudaresPtr, cudainp, MatSize);
    CUDA_CHECK(cudaGetLastError());       // catches bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // catches in-kernel execution faults

    // Copy the results back. Note: with blockSize(1,1) only the first
    // resSize*resSize slice has been written; the rest is uninitialized.
    CUDA_CHECK(cudaMemcpy(resPtr, cudaresPtr, resSize * resSize * bSize * maskNum * sizeof(float), cudaMemcpyDeviceToHost));

    // Release device memory (the original leaked all three allocations).
    CUDA_CHECK(cudaFree(cudams));
    CUDA_CHECK(cudaFree(cudaresPtr));
    CUDA_CHECK(cudaFree(cudainp));
    free(inp);
    free(ms);
    free(resPtr);
    return 0;
}
I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.
Edit: memcheck result: no errors if I understand correctly:
O:\CudaTst>cuda-memcheck CUDA_TST ========= CUDA-MEMCHECK
Time spent: 0.144000 secondsError: Failed to read the strings for error record ========= ERROR SUMMARY: 0 errors
Re-ran with -l (leak) - 0 leaks.
Upvotes: 0
Views: 172
Reputation: 72349
It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.
The amount of dynamically allocated shared memory per block is specified in bytes, so I suspect you want something like:
size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1)));
myConv<<<blockSize, threadSize, shmsz>>>(cudaresPtr,cudainp,MatSize);
Beyond that, I leave the debugging to you.
Upvotes: 1