CUDA unknown error when copying from device to host

Question

I wrote some CUDA code, and everything seems great until I try to get the results from the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include  
#include  
#include 

#define maskSize 3 

__constant__ float masks[32*maskSize*maskSize];

__global__ void myConv(float *res, const float* mats, int mSize)
{
    extern __shared__ float curr[];
    int rSize=maskSize+mSize-1;
    int idxmod=(threadIdx.x+maskSize-1) % (mSize+2*maskSize-2); //these two map any value not within (mSize-1,mSize-1) to the boarders for padding.
    int idymod=(threadIdx.y+maskSize-1) % (mSize+2*maskSize-2);
    if (threadIdx.x < mSize && threadIdx.y < mSize) //put the value of mats in the middle of the curr matrix
        curr[(threadIdx.x+ maskSize-1)*(mSize+2*(maskSize-1)) + threadIdx.y + maskSize-1]=mats[mSize*(blockIdx.y*mSize + threadIdx.x) + threadIdx.y];
    else //zero padding
        if (threadIdx.x < mSize)
            curr[threadIdx.x*(mSize+2*(maskSize-1)) +idymod] =0;
        else 
            curr[idxmod*(mSize+2*(maskSize-1)) +threadIdx.y] =0;

    __syncthreads();
    float tmp=0;

if (threadIdx.x < mSize+maskSize-1 && threadIdx.y < mSize+maskSize-1)
{
#pragma unroll
    for (int i=0;i>>(cudaresPtr,cudainp,MatSize);
    cudaMemcpy((void *)resPtr,(const void *)cudaresPtr,(MatSize+maskSize-1)*(MatSize+maskSize-1)*bSize*maskNum*sizeof(float),cudaMemcpyDeviceToHost);
    //The problem is here - They copying won't work!

    free(inp);
    free(ms);
    free(resPtr);
    return 0;
}

I put printf in various places, used error checking as recommended here, printed error string... Can't find anything that would cause an error copying the contents of the pointer back to the host.

Edit: memcheck result: no errors if I understand correctly:

O:\CudaTst>cuda-memcheck CUDA_TST ========= CUDA-MEMCHECK

Time spent: 0.144000 secondsError: Failed to read the strings for error record ========= ERROR SUMMARY: 0 errors

Re-ran with -l (leak) - 0 leaks.

talonmies · Accepted Answer

It would appear that you are (at least) launching your kernel with insufficient dynamically allocated shared memory for it to run without a buffer overflow inside the kernel.

The amount of shared memory per block is specified in bytes, so I suspect you want something like:

size_t shmsz = sizeof(float)*size_t((MatSize+2*(maskSize-1))*
                                    (MatSize+2*(maskSize-1)));
myConv<<<grid, block, shmsz>>>(cudaresPtr,cudainp,MatSize); // grid, block: your existing launch dimensions

Beyond that, I leave the debugging to you.

CUDA unknown error when copying from device to host

Answers (1)

Related Questions