Using shared memory in Dynamic Parallelism CUDA

Question

Question 1: Do I have to specify the amount of dynamic shared memory to be allocated at the launch of parent kernel if shared memory is only used by child kernel.

Question 2: The following is my child kernel and parent kernel

Parent kernel

__global__ void ColumnFractionalShift(DataIn DataInput,float* __restrict__ DeviceInput, float ShiftAmount, float* __restrict__ LightFieldDevice)
{
    cudaError_t status;
    float ImageShift = threadIdx.x*ShiftAmount;
    float ImageIntegerShift = nearbyintf(ImageShift);
    float Delay = ImageShift - ImageIntegerShift;
    int InputImageOffset = +DataInput.X*DataInput.Y*DataInput.U*(threadIdx.y) + DataInput.X*DataInput.Y*(threadIdx.x);
    dim3 dimBlock(32, 24);
    dim3 dimGrid(16, 14);
    //if (threadIdx.x > 5)
    {
        ConvolutionColumn << > >(DataInput, DeviceInput + InputImageOffset, Delay, LightFieldDevice + InputImageOffset);
    }
    status = cudaGetLastError();
    if (status != cudaSuccess) {
        printf("failed %s
", cudaGetErrorString(status));
    }
    cudaDeviceSynchronize();

    if (threadIdx.x == 5)
    {
        printf("The values at beginig of %d  %d are %f
", threadIdx.x, threadIdx.y, *(LightFieldDevice + InputImageOffset));
    }
}

Child Kernel

__global__ void ConvolutionColumn(DataIn DataInput,float* __restrict__ DeviceInput, float Delay, float* __restrict__ DeviceResult)
{
    extern __shared__ float ConvolutionBlockLeft[];
    int BlockStart = blockDim.y*blockIdx.y*DataInput.V + blockIdx.x*blockDim.x;
    //int BlockEnd = BlockStart+(blockDim.x*blockDim.y)-1;
    int PixelId = blockDim.x*threadIdx.y + threadIdx.x; //32 by 24 kernal
    int LoadPixelId = DataInput.V*threadIdx.y + threadIdx.x;
    int LoadLeft,LoadRght,LoadCentre;
    float KernalSum;
    float DelayPower = Delay;
    //load upper values
    if (blockIdx.y == 0)
    {
        LoadLeft = DataInput.V*(blockDim.y - threadIdx.y-1) + threadIdx.x;
    }
    else
    {
        LoadLeft =  LoadPixelId - (DataInput.V*blockDim.y);
    }
    *(ConvolutionBlockLeft + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + BlockStart + LoadLeft);
    if (blockIdx.y*blockDim.y + threadIdx.y >= DataInput.U)
    {
        LoadCentre = ((DataInput.U - 1)*DataInput.V) + (blockDim.x*blockIdx.x) + threadIdx.x - ((blockIdx.y*blockDim.y + threadIdx.y) - DataInput.U)*DataInput.V;
    }
    else
    {
        LoadCentre = BlockStart+LoadPixelId;
    }
    *(ConvolutionBlockLeft + (blockDim.x*blockDim.y) + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + LoadCentre);
    if (blockIdx.y*blockDim.y + threadIdx.y + blockDim.y >= DataInput.U)
    {
        LoadRght = ((DataInput.U - 1)*DataInput.V) + (blockDim.x*blockIdx.x) + threadIdx.x - ((((blockIdx.y*blockDim.y) + threadIdx.y + blockDim.y) - DataInput.U)*DataInput.V);
    }
    else
    {
        LoadRght = BlockStart+LoadPixelId + (DataInput.V*blockDim.y);
    }
    //float tempfil, tempdata;
    //int t;
    *(ConvolutionBlockLeft + (2 * blockDim.x*blockDim.y) + (threadIdx.y*blockDim.x) + threadIdx.x) = *(DeviceInput + LoadRght);
    __syncthreads();
    float FilterSum = *(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId));
    for (int k = 1; k < DataInput.KernalNoOfFilters; k++)
    {
        KernalSum = 0;
        //printf("The value of filter size is %d
", (DeviceFilterSize[k]));
        for (int l = -((*(DeviceFilterSize + k) - 1) / 2); l < ((*(DeviceFilterSize + k) + 1) / 2); l++)
        {
            //tempfil = *(DeviceFilterKernal + k*DataInput.KernalFilterLength + ((*(DeviceFilterSize + k) - 1) / 2) + l);
            //t = (blockDim.x*blockDim.y) + PixelId + (l*blockDim.x);
            //tempdata = *(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId - (l*blockDim.x)));
            KernalSum += *(DeviceFilterKernal + k*DataInput.KernalFilterLength + ((*(DeviceFilterSize + k) - 1) / 2) + l)**(ConvolutionBlockLeft + ((blockDim.x*blockDim.y) + PixelId - (l*blockDim.x)));
        }
        KernalSum *= DelayPower;
        DelayPower *= Delay;
        FilterSum += KernalSum;
    }
    if (blockIdx.y*blockDim.y + threadIdx.y < DataInput.U)
    {
        *(DeviceResult + LoadPixelId + BlockStart) = FilterSum;
    }
}

Here child kernel alone works fine. However when its launched from another kernel, after parent kernel launch from host at cudaDeviceSynchronize() unspecified launch failure error is given(The error isn't printed from printf within the kernel).

The launch configuration of parent kernel is <<<1,(17 17)>>>. If only one thread from parent is allowed to launch the child grid then the code works fine. Is there a limit on how many grids that can be launched from one block?

Using shared memory in Dynamic Parallelism CUDA

Answers (1)

Related Questions