omer sahban

Reputation: 97

Reducing Shared Memory Bank Conflicts

nvprof reported about 200 million shared_ld_bank_conflict events and some shared_st_bank_conflict events in my sgemm kernel. I tried the padding trick (__shared__ float smem[SIZE + OFFSET];), which reduced the store bank conflicts to 0, but the load bank conflicts are still there and I don't know how to reduce them further.
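
For context, the padding trick works because one extra word per row shifts successive rows into different banks, so a column-style access no longer lands in a single bank. A minimal standalone sketch of just that idea (not my actual kernel):

// Toy kernel, one warp: each thread reads one column element of a 32x32 tile.
// With the +1 padding, tile[r][0] sits in bank (r*33) % 32 = r, so the 32 reads
// hit 32 different banks; without the padding they would all hit bank 0.
__global__ void column_read_demo(float* out){
  __shared__ float tile[32][32 + 1];
  int r = threadIdx.x;                 // launch with <<<1, 32>>>
  for (int c = 0; c < 32; c++)
    tile[r][c] = (float)(r * 32 + c);
  __syncthreads();
  out[r] = tile[r][0];                 // conflict-free column read
}

Here is the full kernel: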

__global__ void sgemm(
  const float* __restrict__ A,
  const float* __restrict__ B,
  float* __restrict__ C,
  int M, int N, int K
){
  int tid = threadIdx.x;
  int gStartx = blockIdx.x * 128;
  int gStarty = blockIdx.y * 128;

  int dx = tid % 8;   // 0..7:  k offset within the 8-wide tile (global load / SMEM store phase)
  int dy = tid / 8;   // 0..31: row offset within each 32-row chunk (global load / SMEM store phase)
  int vx = tid % 16;  // 0..15: column of this thread's 8x8 sub-tile of C (compute phase)
  int vy = tid / 16;  // 0..15: row of this thread's 8x8 sub-tile of C (compute phase)

  __shared__ volatile float aSM[8][128+4]; // +4 padding removed the store bank conflicts
  __shared__ volatile float bSM[8][128+4];
  float aBuffer1[4];
  float bBuffer1[4];
  float aBuffer2[4];
  float bBuffer2[4];

  float cCache[8][8];
#pragma unroll
  for (int i=0; i<8; i++) 
#pragma unroll
    for (int j=0; j<8; j++)
      cCache[i][j] = 0.f;

//load first two tiles
#pragma unroll
  for (int i=0; i<4; i++){
    aBuffer1[i] = A[(gStarty + dy + i*32)*K + (dx)];
    bBuffer1[i] = B[(gStartx + dy + i*32)*K + (dx)];
  }
  int nIt = (K + 8 - 1) / 8;
#pragma unroll
  for (int itr=0; itr<nIt; itr++){
    int gStartk = itr * 8;
    int is_odd = itr & 1;
    if (is_odd == 0){
#pragma unroll
      for (int i=0; i<4; i++){
        if (itr != (nIt - 1)){
          // prefetch next tiles
          aBuffer2[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
          bBuffer2[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
        }
        //move current tiles to SMEM
        aSM[dx][dy+i*32] = aBuffer1[i];
        bSM[dx][dy+i*32] = bBuffer1[i];
      }
    } else {
#pragma unroll
      for (int i=0; i<4; i++){
        if (itr != (nIt - 1)){
          //prefetch next tiles to another buffer
          aBuffer1[i] = A[(gStarty + i*32 + dy)*K + (gStartk + 8 + dx)];
          bBuffer1[i] = B[(gStartx + i*32 + dy)*K + (gStartk + 8 + dx)];
        }
        aSM[dx][dy+i*32] = aBuffer2[i];
        bSM[dx][dy+i*32] = bBuffer2[i];
      }
    }
    __syncthreads();

    float aCache[8][4];

#pragma unroll
    for (int p=0; p<2; p++){
#pragma unroll
      for (int ki=0; ki<8; ki++){
#pragma unroll 
        for (int mi=0; mi<4; mi++){
          aCache[ki][mi] = aSM[ki][8*vy + 4*p +mi];
        }
      }

#pragma unroll
      for (int ki=0; ki<8; ki++){
#pragma unroll
        for (int ni=0; ni<8; ni++){
          float b = bSM[ki][8*vx + ni];
#pragma unroll
          for (int mi=0; mi<4; mi++){
            float a = aCache[ki][mi];
            cCache[mi + 4*p][ni] = fma(a, b, cCache[mi + 4*p][ni] );
          }
        }
      }
    } 
    __syncthreads();
  }

// write this thread's 8x8 accumulator tile back to C
#pragma unroll
  for (int i=0; i<8; i++){
    for (int j=0; j<8; j++){
      C[(gStarty + vy*8 + i)*N + (gStartx + vx*8 + j)] = cCache[i][j];
    }
  }
}

A (2048x2048) is row major, B (2048x2048) is column major, each block has 256 threads, each block computes a 128x128 tile of C, and each thread computes an 8x8 tile of C (accumulating 8 k-values per iteration). The GPU is a Tesla P100.
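
For completeness, the launch configuration implied by this setup would be roughly the following (a sketch; host data, initialization and error checking omitted):

int M = 2048, N = 2048, K = 2048;
float *A, *B, *C;
cudaMalloc((void**)&A, sizeof(float) * M * K); // A: row major, M x K
cudaMalloc((void**)&B, sizeof(float) * N * K); // B: column major, K x N (stored as N x K)
cudaMalloc((void**)&C, sizeof(float) * M * N);
dim3 block(256);             // 256 threads per block
dim3 grid(N / 128, M / 128); // one block per 128x128 tile of C
sgemm<<<grid, block>>>(A, B, C, M, N, K);
cudaDeviceSynchronize();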

Upvotes: 0

Views: 594

Answers (1)

omer sahban

Reputation: 97

OK, I found a solution: when storing to bSM, insert one padding word after every 32 words in the second dimension:

//bSM[dx][dy+i*32] = bBuffer1[i];
bSM[dx][dy+i*33] = bBuffer1[i]; // we're skipping columns 32, 65, 98 and 131

When reading bSM[i][j], read it as bSM[i][j/32 + j]:

//float b = bSM[ki][8*vx + ni];
float b = bSM[ki][(8*vx) / 32 + 8*vx + ni];
// (8*vx + ni)/32 is the same as (8*vx)/32, since ni is always less than 8
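
The way I understand it, this helps because without the skew the 16 distinct values of 8*vx only touch banks 0, 8, 16 and 24 (a 4-way conflict), while with the skew they spread over 16 different banks (the ki row offset is the same for every thread in the warp, so it only shifts all banks by a constant). A quick throwaway host-side check of the mapping (plain C++, not part of the kernel):

#include <cstdio>
int main(){
  for (int ni = 0; ni < 8; ni++){
    printf("ni=%d banks:", ni);
    for (int vx = 0; vx < 16; vx++){
      int idx = (8*vx) / 32 + 8*vx + ni; // skewed column index
      printf(" %2d", idx % 32);          // 16 distinct bank numbers per row
    }
    printf("\n");
  }
  return 0;
}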

Now it's giving me 55% of cuBLAS sgemm performance on a Tesla P4.

Upvotes: 1
