user1999728
user1999728

Reputation: 903

CUDA unkown error when copying from device to host

I wrote some CUDA code, and everything seems great until I try to get the results from the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdlib> 
#include <ctime> 
#include <iostream>

#define maskSize 3 

__constant__ float masks[32*maskSize*maskSize];

__global__ void myConv(float *res, const float* mats, int mSize)
{
    extern __shared__ float curr[];
    int rSize=maskSize+mSize-1;
    int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the boarders for padding.
    int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
    if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
        curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
    else //zero padding
        if (threadIdx.x < mSize)
            curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
        else 
            curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;

    __syncthreads();
    float tmp=0;

if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
    for (int i=0;i<maskSize;i++)
        #pragma unroll
        for (int j=0;j<maskSize;j++)

            tmp+=curr[(threadIdx.x+i)*(mSize+2*(maskSize-1)) + threadIdx.y+j]*masks[blockIdx.x*maskSize*maskSize +maskSize*i +j];
    res[blockIdx.y*rSize*rSize + threadIdx.x*rSize + threadIdx.y]=tmp;
}
}

int main()
{
    int MatSize=5;
    int bSize=2000;
    int maskNum=10;
    int resSize=MatSize+maskSize-1;
    float* ms;
    ms=(float *)malloc(maskSize*maskSize*maskNum*sizeof(float));
    float* resPtr=(float *)malloc((MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    for (int i=0; i<maskSize;i++)
        for (int j=0; j<maskSize; j++)
            for (int k=0; k<maskNum; k++)
                ms[k*maskSize*maskSize + j*maskSize + i]=(float)(rand() % 1000)/100;
    float* inp=(float *)malloc(MatSize*MatSize*bSize*sizeof(float));
    for (int i=0; i<MatSize; i++)
        for (int j=0; j<MatSize; j++)
            for (int k=0;k<bSize;k++)
                inp[k*MatSize*MatSize + j*MatSize + i]=(float)(rand() % 500)/100;
    float *cudams, *cudaresPtr,*cudainp;
    cudaMalloc((void **) &cudams,maskSize*maskSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float));
    cudaMalloc((void **) &cudainp,MatSize*MatSize*bSize*sizeof(float));

    cudaMemcpy((void *)cudams,(void *)ms,maskSize*maskSize*maskNum*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpy((void *)cudainp,(void *)inp,MatSize*MatSize*bSize*sizeof(float),cudaMemcpyHostToDevice);

    cudaMemcpyToSymbol(masks,(void *)cudams,maskSize*maskSize*maskNum*sizeof(float),0,cudaMemcpyDeviceToDevice);
    dim3 threadSize(MatSize+2*(maskSize-1),MatSize+2*(maskSize-1));
    dim3 blockSize(1, 1); //for testing purposes. should be dim3 blockSize(maskNum,bSize);
    myConv<<<blockSize, threadSize, (MatSize+2*(maskSize-1))*(MatSize+2*(maskSize-1))>>>(cudaresPtr,cudainp,MatSize);
    cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float),cudaMemcpyDeviceToHost);
    //The problem is here - They copying won't work!

    free(inp);
    free(ms);
    free(resPtr);
    return 0;
}

I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.

Edit: memcheck result: no errors if I understand correctly:

O:\CudaTst>cuda-memcheck CUDA_TST ========= CUDA-MEMCHECK

Time spent: 0.144000 secondsError: Failed to read the strings for error record ========= ERROR SUMMARY: 0 errors

Re-ran with -l (leak) - 0 leaks.

Upvotes: 0

Views: 172

Answers (1)

talonmies
talonmies

Reputation: 72349

It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.

The amount of shared memory per block is specific in bytes, so I suspect you want something like:

size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1));
myConv<<<blockSize, threadSize, shmz)>>>(cudaresPtr,cudainp,MatSize);

Beyond that, I leave the debugging to you.

Upvotes: 1

Related Questions