Rohit Jena

Reputation: 3

Performance difference due to indexing during matrix multiplication

I'm comparing a naive and a tiled matrix multiplication implementation in CUDA C++. I expected a large performance gap between the two because the tiled version reuses data from shared memory, but the tiled kernel was only about twice as fast (naive ~12 ms, tiled ~6 ms). Here are the code snippets:

#include <iostream>
#include <assert.h>
using namespace std;

#define N 1024
#define THREADS 16
#define SHMEM_SIZE (THREADS * THREADS)   // elements per shared-memory tile (used by the tiled kernel)
#define IDX(x, y, s) (((x) * (s)) + (y)) // row-major index; parenthesized so arguments like (blk + tx) expand correctly

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

void init_values(int *a, int *b, int sz) {
    for(int i=0; i<sz; i++) {
        a[i] = rand()%513 - 256;
        b[i] = rand()%513 - 256;
    }
}

__global__ 
void matmul(int *a, int *b, int *c, int n) {
    // perform parallel matmul
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int t = 0;
    for(int i=0; i<n; i++) {
        t += (a[IDX(x, i, n)] * b[IDX(i, y, n)]);
    }
    c[IDX(x, y, n)] = t;
}

void matmul_verify(int *a, int *b, int *c, int n) {
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            int t = 0;
            for(int k=0; k<n; k++)
                t += a[IDX(i, k, n)] * b[IDX(k, j, n)];
            // cout << i << " " << j << " " << c[IDX(i, j, n)] <<  " " << t <<  endl;
            assert(c[IDX(i, j, n)] == t);
        }
    }
}

int main()
{
    int *a, *b, *c;
    int *da, *db, *dc;
    size_t sz = N * N * sizeof(int);

    a = (int*)malloc(sz);
    b = (int*)malloc(sz);
    c = (int*)malloc(sz);
    init_values(a, b, N*N);

    gpuErrchk(cudaMalloc((void**)&da, sz));
    gpuErrchk(cudaMalloc((void**)&db, sz));
    gpuErrchk(cudaMalloc((void**)&dc, sz));

    gpuErrchk(cudaMemcpy(da, a, sz, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(db, b, sz, cudaMemcpyHostToDevice));
    // init grid size
    dim3 grids(N/THREADS, N/THREADS);
    dim3 blocks(THREADS, THREADS);
    // time it
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    matmul<<<grids, blocks>>>(da, db, dc, N);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cout << "Took " << milliseconds << " milliseconds.\n";

    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    gpuErrchk(cudaMemcpy(c, dc, sz, cudaMemcpyDeviceToHost));
    matmul_verify(a, b, c, N);

    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    free(a);
    free(b);
    free(c);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}

For the tiled implementation, I change the kernel to the following:

__global__ 
void matmul(int *a, int *b, int *c, int n) {
    // perform parallel matmul
    int ty = threadIdx.y, by = blockIdx.y;
    int tx = threadIdx.x, bx = blockIdx.x;
    int x = bx * blockDim.x + tx;
    int y = by * blockDim.y + ty;
    // block IDs tell us which block to solve for
    // (bx, by) --> (bx: bx + tx, by:by + ty)
    __shared__ int A[SHMEM_SIZE];
    __shared__ int B[SHMEM_SIZE];

    const int tile_size = THREADS;

    // to get value of tile [tx, ty] in block [bx, by], we need blocks A[bx, *] and blocks B[*, by]
    int res = 0;

    for(int blk=0; blk < n; blk+=tile_size)  {
        // block index
        A[IDX(tx, ty, tile_size)] = a[IDX(x, blk + ty, n)];
        B[IDX(tx, ty, tile_size)] = b[IDX(blk + tx, y, n)];
        __syncthreads();
        for(int k=0; k<tile_size; k++) {
            res += (A[IDX(tx, k, tile_size)] * B[IDX(k, ty, tile_size)]);
        }
        __syncthreads();
    }

    // for(int k=0; k<n; k++)
    //     res += a[IDX(x, k, n)] * b[IDX(k, y, n)];
    c[IDX(x, y, n)] = res;
}

Nothing else really changes. However, in the tiled implementation, if I simply change the initialization of the thread and block indices to

    int ty = threadIdx.x, by = blockIdx.x;
    int tx = threadIdx.y, bx = blockIdx.y;

then the runtime drops to about 1 ms (a 12x speedup over the naive kernel). How is this happening? I read in the book "CUDA By Example" that thread and block indices in two dimensions are just a programmer convenience and do not make any difference in performance, but this seems to be false. Any clarification would be appreciated.

Upvotes: 0

Views: 350

Answers (1)

paleonix

Reputation: 3031

CUDA thread blocks are partitioned into warps of 32 threads. Ideally, the neighboring lanes of a warp should always load neighboring elements from global memory. This is called coalescing and allows for maximum memory bandwidth: in hardware, all the coalesced loads from a warp are bundled into a minimal number of memory transactions.
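For illustration, here are two hypothetical copy kernels over an n x n row-major matrix (they are not from the question and assume a grid of (n / blockDim.x, n) one-dimensional blocks); the only difference is which coordinate the fast-varying threadIdx.x drives:

__global__ void coalesced_copy(const int *in, int *out, int n) {
    int row = blockIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // neighboring lanes touch neighboring addresses within one row
    out[row * n + col] = in[row * n + col];
}

__global__ void strided_copy(const int *in, int *out, int n) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y;
    // neighboring lanes touch addresses n elements apart -> one transaction per lane
    out[row * n + col] = in[row * n + col];
}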

Other factors that can degrade effective memory bandwidth are the size of each load (one can use the built-in vector types such as int2, int4, or float2 to get wider loads) and alignment.
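As a sketch (the kernel is hypothetical and assumes the buffers are 16-byte aligned, which cudaMalloc guarantees for the start of an allocation, and an element count divisible by 4):

__global__ void copy_int4(const int4 *in, int4 *out, int n4) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n4)
        out[i] = in[i]; // one 16-byte load and store instead of four 4-byte ones
}

// usage sketch: n4 = N * N / 4;
// copy_int4<<<(n4 + 255) / 256, 256>>>(reinterpret_cast<const int4*>(da),
//                                      reinterpret_cast<int4*>(db), n4);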

The mapping from the 3D threadIdx to warp lanes always takes the first dimension .x as the fastest-varying one, i.e. a block of dimensions (32, 2, 1) will have one warp with threadIdx.y == 0 and one warp with threadIdx.y == 1, where the lanes of each warp correspond to threadIdx.x.
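Spelled out, that mapping is just the linear in-block thread index split into warps (a sketch of the standard formula; the helper name is made up):

__device__ void warp_coords(int *warp_id, int *lane_id) {
    // linear thread index within the block; threadIdx.x varies fastest
    int linear = threadIdx.x
               + threadIdx.y * blockDim.x
               + threadIdx.z * blockDim.x * blockDim.y;
    *warp_id = linear / 32; // which warp of the block this thread belongs to
    *lane_id = linear % 32; // this thread's position within that warp
}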

Therefore to allow for coalescing, you have to access memory as

A[ty * s + tx] // coalesced access

instead of

A[tx * s + ty] // strided access

to achieve optimal performance.

What the book you mention probably means is that there shouldn't be a performance difference between launching blocks of size (32, 2, 1) and launching blocks of size (64, 1, 1) while manually computing ty = threadIdx.x / 32 and tx = threadIdx.x % 32. This division and modulo probably happen internally anyway when the block is not flat in the first place.
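A sketch of that equivalence (kernel names and buffers are made up); both versions map the same threads to the same warp lanes and the same elements:

__global__ void copy_2d_block(const int *in, int *out) {
    int tx = threadIdx.x; // fast-varying -> lane within the warp
    int ty = threadIdx.y; // slow-varying -> selects the warp
    out[ty * 32 + tx] = in[ty * 32 + tx];
}

__global__ void copy_flat_block(const int *in, int *out) {
    int tx = threadIdx.x % 32; // same lane mapping as above
    int ty = threadIdx.x / 32;
    out[ty * 32 + tx] = in[ty * 32 + tx];
}

// copy_2d_block<<<1, dim3(32, 2, 1)>>>(in, out);
// copy_flat_block<<<1, dim3(64, 1, 1)>>>(in, out);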

Upvotes: 1
