Malacu
Malacu

Reputation: 191

using shared memory in cuda gives memory write error

I have a kernel which works fine as

/*
 * For the particle handled by this thread, searches the 3x3x3 block of cells
 * centred on the particle's cell and records the triangle whose CalcDistance
 * magnitude is smallest.
 *
 * ComputeParticleNumber: number of particles to process; threads whose target
 *                        ID is at or beyond this value do nothing.
 *
 * Results are written to c_daSTLDistance[ID] and c_daSTLID[ID]. When no
 * triangle is found (or the particle is skipped) the stored values are the
 * initial ones: magnitude 1e8, zero direction, triangle ID -1.
 */
__global__ static void  CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    // presumably the global thread index - confirm against CudaGetTargetID
    const Integer ParticleIdx = CudaGetTargetID();
    if(ParticleIdx >= ComputeParticleNumber)
    {
        return;
    }

    // Initial "nothing found" result; replaced by any smaller computed distance.
    CDistance Best;
    Integer BestTriangle = -1;
    Best.Magnitude = 1e8;
    Best.Direction = make_Scalar3(0,0,0);

    // Particles flagged with an output ID below -1 keep the initial result.
    if(c_daOutputParticleID[ParticleIdx] >= -1)
    {
        Scalar3 Position = c_daParticlePosition[ParticleIdx];

        Integer CellX, CellY, CellZ;
        const Integer HomeCell = GetCellID(&CONSTANT_BOUNDINGBOX, &Position, CellX, CellY, CellZ);
        const Integer Reach = 1;   // one cell on each side of the home cell

        if(HomeCell >= 0 && HomeCell < c_CellNum)
        {
            for(Integer dz = -Reach; dz <= Reach; ++dz)
            {
                for(Integer dy = -Reach; dy <= Reach; ++dy)
                {
                    for(Integer dx = -Reach; dx <= Reach; ++dx)
                    {
                        const Integer Neighbour = GetCellID(&CONSTANT_BOUNDINGBOX, CellX + dx, CellY + dy, CellZ + dz);
                        if(Neighbour >= 0 && Neighbour < c_CellNum)
                        {
                            const unsigned int Count = c_daCell[Neighbour].m_TriangleNum;
                            for(unsigned int t = 0; t < Count; ++t)
                            {
                                const Integer Candidate = c_daCell[Neighbour].m_TriangleID[t];

                                // Skip invalid IDs and the triangle that is already the current best.
                                if(Candidate < 0 || Candidate >= c_TriangleNum || Candidate == BestTriangle)
                                {
                                    continue;
                                }

                                CDistance Trial;
                                Trial.Magnitude = CalcDistance(&c_daTriangles[Candidate], &Position, &Trial.Direction);
                                if(Trial.Magnitude < Best.Magnitude)
                                {
                                    Best = Trial;
                                    BestTriangle = Candidate;
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    c_daSTLDistance[ParticleIdx] = Best;
    c_daSTLID[ParticleIdx] = BestTriangle;
}

Here, c_daParticlePosition is of type float3 and resides in constant memory. I want to use shared memory, so I tried to create a float3 array in shared memory and copy the data from constant memory into it. However, the kernel then fails with an unknown error, and cuda-memcheck reports the error shown at the end of this question.

Here the kernel is launched with 2 blocks of 255 threads each.

Shared memory code:

/*
 * Shared-memory variant: each thread stages its particle position in dynamic
 * shared memory, then searches the 3x3x3 block of cells centred on the
 * particle's cell for the triangle with the smallest CalcDistance magnitude.
 *
 * ComputeParticleNumber: number of particles to process; threads whose target
 *                        ID is at or beyond this value write nothing.
 *
 * Launch requirement: the dynamic shared array `s` holds one float3 per thread
 * of a block, so the third launch parameter must be at least
 *     threadsPerBlock * sizeof(float3)
 * e.g. CalcSTLDistance_Kernel<<<grid, block, threadsPerBlock * sizeof(float3)>>>(n);
 * Omitting it (or passing too little) makes every write to `s` out of bounds.
 *
 * Results are written to c_daSTLDistance[ID] and c_daSTLID[ID]; when no
 * triangle is found they are magnitude 1e8, zero direction and ID -1.
 */
__global__ static void CalcSTLDistance_Kernel(Integer ComputeParticleNumber)
{
    // presumably the global thread index - confirm against CudaGetTargetID
    const Integer ID = CudaGetTargetID();
    extern __shared__ float3 s[];

    // Shared memory is allocated per block, so it must be indexed by the
    // thread's position inside its block, never by the global ID. The linear
    // form below is valid for 1D, 2D and 3D block shapes.
    const unsigned int LocalID = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

    // Threads past the end of the particle list must not touch the arrays,
    // but they still have to reach the barrier below.
    const bool IsActive = (ID < ComputeParticleNumber);
    if(IsActive)
    {
        s[LocalID] = c_daParticlePosition[ID];
    }
    // Each thread only reads back its own slot, so this barrier is not needed
    // for correctness today; it is kept so the staging stays safe if threads
    // later read each other's positions.
    __syncthreads();
    if(!IsActive)
    {
        return;
    }

    CDistance NearestDistance;
    Integer NearestID = -1;
    NearestDistance.Magnitude = 1e8;
    NearestDistance.Direction.x = 0;
    NearestDistance.Direction.y = 0;
    NearestDistance.Direction.z = 0;

    // NOTE(review): the skip for flagged particles is still disabled here, as
    // in the posted code - confirm whether it should be re-enabled.
    //if(c_daOutputParticleID[ID] < -1)
    //{
    //  c_daSTLDistance[ID] = NearestDistance;
    //  c_daSTLID[ID] = NearestID;
    //  return;
    //}

    Integer TriangleID;
    Integer CIDX, CIDY, CIDZ;
    Integer CID = GetCellID(&CONSTANT_BOUNDINGBOX,&s[LocalID],CIDX, CIDY, CIDZ);
    if(CID >=0 && CID < c_CellNum)
    {
        // Visit the home cell and its immediate neighbours in every direction.
        for(Integer k = -1; k <= 1; ++k)
        {
            for(Integer j = -1; j <= 1; ++j)
            {
                for(Integer i = -1; i <= 1; ++i)
                {
                    Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX,CIDX +i, CIDY + j,CIDZ + k);
                    if(MCID < 0 || MCID >= c_CellNum)
                    {
                        continue;
                    }
                    unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
                    for(unsigned int l = 0; l < TriangleNum; ++l)
                    {
                        TriangleID = c_daCell[MCID].m_TriangleID[l];
                        /*if(c_daTrianglesParameters[c_daTriangles[TriangleID].ModelIDNumber].isDrag)
                        {
                            continue;
                        }*/

                        if( TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID)// No need to calculate again for the same triangle
                        {
                            CDistance Distance ;
                            Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &s[LocalID], &Distance.Direction);
                            if(Distance.Magnitude < NearestDistance.Magnitude)
                            {
                                NearestDistance = Distance;
                                NearestID = TriangleID;
                            }
                        }
                    }
                }
            }
        }
    }
    c_daSTLDistance[ID] = NearestDistance;
    c_daSTLID[ID] = NearestID;
}

error

  Invalid __shared__ write of size 4
    =========     at 0x00000128 in CalcSTLDistance_Kernel(int)
    =========     by thread (159,0,0) in block (0,0,0)
    =========     Address 0x0000077c is out of bounds

Upvotes: 0

Views: 1564

Answers (1)

Michal Hosala
Michal Hosala

Reputation: 5697

You may find useful info on how to work with shared memory in this article. Focus especially on static shared memory and dynamic shared memory sections.

Based on the above article, you should find that you are simply writing out of bounds of your array s, exactly as the error message says. To fix the issue, you can:

  • either specify the size of shared memory array s at compile time, if you know it in advance, such as __shared__ float3 s[123456];
  • or use a dynamically sized array s, which is basically what you are doing at the moment, but ALSO specify the shared memory size as the third kernel launch parameter: CalcSTLDistance_Kernel<<<gridSize, blockSize, sharedMemorySizeInBytes>>>. If you will be using an array of 123456 float3s, then use int sharedMemorySizeInBytes = 123456 * sizeof(float3).

Upvotes: 4

Related Questions