Reputation: 149
Given the fat matrix A=[[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]], I am trying to cyclically shift (permute) its rows, i.e. B=[[11,12,13,14,15],[1,2,3,4,5],[6,7,8,9,10]], and to shift its columns, i.e. C=[[5,1,2,3,4],[10,6,7,8,9],[15,11,12,13,14]].
I wrote similar kernels for the column shift and the row shift:
// A->C
__global__ void column_shift(int* mat, int row, int col) {
    int row_num = blockDim.x * blockIdx.x + threadIdx.x;
    if (row_num < row) {
        int a = mat[row_num * col];
        for (int i = 0; i < col - 1; ++i) {
            mat[row_num * col + i] = mat[row_num * col + i + 1];
        }
        mat[row_num * col + (col - 1)] = a;
    }
}
// A->B
__global__ void row_shift(int* mat, int row, int col) {
    int col_num = blockDim.x * blockIdx.x + threadIdx.x;
    if (col_num < col) {
        int a = mat[(row - 1) * col + col_num];
        for (int i = row - 1; i > 0; i--) {
            mat[i * col + col_num] = mat[(i - 1) * col + col_num];
        }
        mat[col_num] = a;
    }
}
However, compared to row_shift, column_shift performs noticeably worse. I guess this is due to a lack of memory coalescing. Is there an efficient way to improve the performance of column_shift?
Upvotes: 0
Views: 252
Reputation: 72349
The most obvious performance issue with column_shift is a lack of memory coalescing: each thread walks along its own row, so adjacent threads in a warp touch addresses that are col * sizeof(int) bytes apart, and every load and store is serviced by separate memory transactions. This can be fixed by having a warp of threads perform the shift of a row, rather than a single thread.
Consider the following example (note I have rewritten your kernels to use a simple helper class, which both simplifies the kernel code and reduces the risk of indexing calculation errors, as was the case in at least one of the kernels you originally posted):
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <vector>
struct stride2D
{
    int* p;
    int s0;
    __host__ __device__
    stride2D(int* _p, int _s0) : p(_p), s0(_s0) {}
    __host__ __device__
    int operator () (int x, int y) const { return p[x*s0 + y]; }
    __host__ __device__
    int& operator () (int x, int y) { return p[x*s0 + y]; }
};
__global__ void column_shift2(int* mat, int row, int col)
{
    int row_num = blockDim.x * blockIdx.x + threadIdx.x;
    stride2D m(mat, col);
    if (row_num < row) {
        int a = m(row_num, 0);
        for (int i = 0; i < col-1; i++) {
            m(row_num, i) = m(row_num, i+1);
        }
        m(row_num, col-1) = a;
    }
}
__global__ void column_shift3(int* mat, int row, int col)
{
    int row_num = blockDim.y * blockIdx.y + threadIdx.y;
    stride2D m(mat, col);
    if (row_num < row) {
        int a = m(row_num, 0);
        for (int i = threadIdx.x; i < col-1; i += warpSize) {
            m(row_num, i) = m(row_num, i+1);
        }
        if (threadIdx.x == 0) m(row_num, col-1) = a;
    }
}
__global__ void row_shift2(int* mat, int row, int col) {
    int col_num = blockDim.x * blockIdx.x + threadIdx.x;
    stride2D m(mat, col);
    if (col_num < col) {
        int a = m(row-1, col_num);
        for (int i = row - 1; i > 0; i--) {
            m(i, col_num) = m(i-1, col_num);
        }
        m(0, col_num) = a;
    }
}
int main()
{
    const int r = 300, c = 900, n = r * c;
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        int bsize = 256, nblocks = (c / bsize) + ((c % bsize > 0) ? 1 : 0);
        row_shift2<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        int bsize = 256, nblocks = (r / bsize) + ((r % bsize > 0) ? 1 : 0);
        column_shift2<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        const int bwidth = 32;
        dim3 bsize(bwidth, 1024 / bwidth);
        // grid extends in y because column_shift3 indexes rows with blockIdx.y
        dim3 nblocks(1, (r / bsize.y) + ((r % bsize.y > 0) ? 1 : 0));
        column_shift3<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    cudaDeviceReset();
    return 0;
}
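As an aside, the host code above copies each result back into odata but never checks it. A minimal host-side check for the column shift (a sketch only, assuming the shift-left convention the kernels use; it would go inside the corresponding scope in main) might look like this:

// After the column shift, odata[rr][cc] should hold idata[rr][(cc + 1) % c]
bool ok = true;
for (int rr = 0; rr < r && ok; ++rr)
    for (int cc = 0; cc < c && ok; ++cc)
        ok = (odata[rr * c + cc] == idata[rr * c + (cc + 1) % c]);

A similar loop comparing odata[rr * c + cc] against idata[((rr + r - 1) % r) * c + cc] would check the row shift.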
The only real change required is in the inner copying loop of the column_shift operation:
for (int i = threadIdx.x; i < col-1; i += warpSize) {
    m(row_num, i) = m(row_num, i+1);
}
Now we use a warp-strided loop (which must be run with blockDim.x = 32 for correctness). Profiling this code shows the following:
nvprof ./permute
==13687== NVPROF is profiling process 13687, command: ./permute
==13687== Profiling application: ./permute
==13687== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 51.24% 643.80us 1 643.80us 643.80us 643.80us column_shift2(int*, int, int)
21.36% 268.41us 3 89.471us 89.087us 89.887us [CUDA memcpy HtoD]
21.06% 264.57us 3 88.191us 87.647us 89.023us [CUDA memcpy DtoH]
5.54% 69.631us 1 69.631us 69.631us 69.631us row_shift2(int*, int, int)
0.81% 10.144us 1 10.144us 10.144us 10.144us column_shift3(int*, int, int)
API calls: 68.19% 114.44ms 3 38.148ms 78.552us 114.28ms cudaMalloc
30.00% 50.352ms 1 50.352ms 50.352ms 50.352ms cudaDeviceReset
0.65% 1.0974ms 6 182.89us 102.55us 246.46us cudaMemcpyAsync
0.44% 732.75us 3 244.25us 13.565us 646.95us cudaDeviceSynchronize
0.21% 348.53us 97 3.5930us 263ns 197.14us cuDeviceGetAttribute
0.17% 290.47us 1 290.47us 290.47us 290.47us cuDeviceTotalMem
0.16% 266.04us 6 44.339us 2.3170us 87.602us cudaStreamSynchronize
0.11% 184.85us 3 61.616us 53.903us 71.672us cudaFree
0.03% 54.650us 3 18.216us 13.862us 25.133us cudaLaunchKernel
0.03% 51.108us 1 51.108us 51.108us 51.108us cuDeviceGetName
0.00% 4.0760us 3 1.3580us 408ns 3.1910us cuDeviceGetCount
0.00% 3.4620us 1 3.4620us 3.4620us 3.4620us cuDeviceGetPCIBusId
0.00% 1.6850us 2 842ns 248ns 1.4370us cuDeviceGet
0.00% 585ns 1 585ns 585ns 585ns cuDeviceGetUuid
i.e. the warp-strided copy is about 60 times faster than your original implementation.
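One caveat worth hedging: the warp-strided loop relies on the lanes of a warp advancing together, and on devices with independent thread scheduling (Volta and later) a lane could in principle overwrite an element that another lane has not yet read. A more defensive sketch of the same loop (still assuming blockDim.x == warpSize and that all lanes of a warp share the same row_num, as in column_shift3) keeps every lane in the loop for the same number of rounds and separates the reads from the writes with __syncwarp():

// Every lane runs the same number of rounds so that all 32 lanes reach each __syncwarp()
int rounds = (col - 1 + warpSize - 1) / warpSize;
for (int k = 0; k < rounds; ++k) {
    int i = threadIdx.x + k * warpSize;
    int v = (i < col - 1) ? m(row_num, i + 1) : 0;  // read this round's source element
    __syncwarp();                                   // ensure every lane has read its source...
    if (i < col - 1) m(row_num, i) = v;             // ...before any lane overwrites a neighbour
}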
[Note all code extremely lightly tested, and no guarantees of correctness or optimality are made or implied]
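A further aside, not part of the code above: if writing the result to a second buffer is acceptable, the column shift needs no loop at all. With one thread per element both the load and the store are coalesced. The kernel name and the in/out parameters below are made up for illustration, and it follows the same shift-left convention as column_shift2/3:

// Out-of-place variant: the element at column cc takes the value from column (cc + 1) % col
__global__ void column_shift_oop(const int* in, int* out, int row, int col)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < row * col) {
        int rr = idx / col;
        int cc = idx % col;
        out[idx] = in[rr * col + (cc + 1) % col];   // coalesced load and store
    }
}

Launched with, say, <<<(row * col + 255) / 256, 256>>>, the output buffer would then have to be swapped or copied back if the shift must appear in place.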
Upvotes: 2