Optimizing global memory load in CUDA

Question

My task: I have two matrices, A (18 x 4194304) and B (18 x 1024).

I have to take each 18-element vector from A, compute its distance to each 18-element vector from B, and find the minimum distance and the index of the corresponding vector in B.

My code :

// Warp-wide arg-min over (dist, idx) pairs.
//
// Every lane passes in its own candidate pair. On return, lane 0 holds the
// smallest dist found in the warp together with the idx that came with it;
// the other lanes are left with partial results and should be ignored.
//
// All 32 lanes of the warp must call this together, because the shuffles use
// the full-warp mask. When two candidates compare equal, the pair coming from
// the lower lane is kept (the comparison is strict).
__device__
void GetMin(float &dist, int &idx)
{
    const unsigned int kFullWarpMask = 0xFFFFFFFF;

    // Halve the shuffle distance each round: 16, 8, 4, 2, 1.
    #pragma unroll
    for (int offset = 16; offset >= 1; offset /= 2)
    {
        // Fetch the candidate held by the lane `offset` positions above us.
        const float otherDist = __shfl_down_sync(kFullWarpMask, dist, offset, 32);
        const int otherIdx = __shfl_down_sync(kFullWarpMask, idx, offset);

        // Adopt it only when it is strictly better than what we have.
        if (otherDist < dist)
        {
            dist = otherDist;
            idx = otherIdx;
        }
    }
}

// For every pixel, finds which of the 1024 reference vectors in B has the
// smallest squared Euclidean distance to that pixel's vector in A, and writes
// the index of that reference vector to output.
//
// Data layout, as indexed by this kernel:
//   A      - component j of pixel p is A[j * nNumPixels + p]
//   B      - component j of reference vector i is B[i * nNumImages + j]
//   output - output[p] receives the winning index, stored as a float
//
// Parameters:
//   nNumPixels - number of pixels (vectors taken from A)
//   nNumImages - number of components per vector
//
// Launch requirements:
//   - blockDim.x must be 32 so that each threadIdx.y row is exactly one warp;
//     GetMin shuffles across a full warp and leaves the result in lane 0.
//   - each block handles blockDim.y pixels, so gridDim.x must be at least
//     ceil(nNumPixels / blockDim.y). Warps past the last pixel exit early.
//
// If no distance compares below +infinity (e.g. every distance is NaN or
// infinite), the value written is -1.
__global__
void CalcMinDist_kernel(const float *A, const float *B, float *output, const int nNumPixels, int nNumImages)
{
    const int kWarpSize = 32;
    const int kNumRefVectors = 1024;  // multiple of kWarpSize: every lane runs the same trip count

    const int lane_id = threadIdx.x & (kWarpSize - 1);

    // One warp per pixel. (blockIdx.x * threadIdx.y, used previously, maps
    // every block's row 0 to pixel 0 and collides for the other rows.)
    // 64-bit arithmetic keeps the index math safe for very large inputs.
    const long long pixel = (long long)blockIdx.x * blockDim.y + threadIdx.y;

    // pixel is the same for all lanes of a warp, so the warp leaves as a unit
    // and the full-mask shuffles in the remaining warps stay valid.
    if (pixel >= nNumPixels)
    {
        return;
    }

    // +infinity, so any finite distance replaces the initial value.
    float fMin = __int_as_float(0x7f800000);
    int nMinIdx = -1;

    // Each pass hands 32 consecutive reference vectors to the 32 lanes.
    for (int i = lane_id; i < kNumRefVectors; i += kWarpSize)
    {
        // Squared distance between this pixel's vector and reference vector i.
        float dist = 0.0f;
        for (int j = 0; j < nNumImages; ++j)
        {
            const float diff = A[pixel + (long long)j * nNumPixels] - B[i * nNumImages + j];
            dist += diff * diff;
        }

        // Reduce the 32 candidates of this pass; lane 0 ends up with the best.
        int idx = i;
        GetMin(dist, idx);

        // Only lane 0 holds a meaningful reduction result, so only it keeps
        // the running minimum across passes.
        if (lane_id == 0 && fMin > dist)
        {
            fMin = dist;
            nMinIdx = idx;
        }
    }

    if (lane_id == 0)
    {
        output[pixel] = static_cast<float>(nMinIdx);
    }
}

According to the profiler, the kernel is memory bound and achieves ~90% occupancy. Is there any way to speed up this operation?

Let me know if I need to provide any other information.

Optimizing global memory load in CUDA

Answers (1)

Related Questions