Reputation: 149
Given the fat matrix A=[[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]], I am trying to cyclically shift (permute) its rows, i.e. B=[[11,12,13,14,15],[1,2,3,4,5],[6,7,8,9,10]], and to shift its columns, i.e. C=[[5,1,2,3,4],[10,6,7,8,9],[15,11,12,13,14]].
I wrote similar kernels for the column shift and the row shift:
// A->C
__global__ void column_shift(int* mat, int row, int col) {
    int row_num = blockDim.x * blockIdx.x + threadIdx.x;
    if (row_num < row) {
        int a = mat[row_num * col];
        for (int i = 0; i < col - 1; ++i) {
            mat[row_num * col + i] = mat[row_num * col + i + 1];
        }
        mat[row_num * col + (col - 1)] = a;
    }
}
// A->B
__global__ void row_shift(int* mat, int row, int col) {
    int col_num = blockDim.x * blockIdx.x + threadIdx.x;
    if (col_num < col) {
        int a = mat[(row - 1) * col + col_num];
        for (int i = row - 1; i > 0; i--) {
            mat[i * col + col_num] = mat[(i - 1) * col + col_num];
        }
        mat[col_num] = a;
    }
}
However, compared to row_shift, column_shift performs noticeably worse. I guess this is due to a lack of memory coalescing. Is there an efficient way to improve the performance of column_shift?
Upvotes: 0
Views: 252
Reputation: 72349
The most obvious performance issue with column_shift is a lack of memory coalescing: each thread walks along its own row, so adjacent threads in a warp touch addresses that are col * sizeof(int) bytes apart, and every load and store is serviced by separate memory transactions. This can be fixed by having a warp of threads perform the shift of a row, rather than a single thread.
Consider the following example (note I have rewritten your kernels to use a simple helper class, which both simplifies the kernel code and reduces the risk of indexing calculation errors, as was the case in at least one of the kernels you originally posted):
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <vector>
struct stride2D
{
    int* p;
    int s0;
    __host__ __device__
    stride2D(int* _p, int _s0) : p(_p), s0(_s0) {}
    __host__ __device__
    int operator () (int x, int y) const { return p[x*s0 + y]; }
    __host__ __device__
    int& operator () (int x, int y) { return p[x*s0 + y]; }
};
__global__ void column_shift2(int* mat, int row, int col)
{
    int row_num = blockDim.x * blockIdx.x + threadIdx.x;
    stride2D m(mat, col);
    if (row_num < row) {
        int a = m(row_num, 0);
        for (int i = 0; i < col-1; i++) {
            m(row_num, i) = m(row_num, i+1);
        }
        m(row_num, col-1) = a;
    }
}
__global__ void column_shift3(int* mat, int row, int col)
{
    int row_num = blockDim.y * blockIdx.y + threadIdx.y;
    stride2D m(mat, col);
    if (row_num < row) {
        int a = m(row_num, 0);
        for (int i = threadIdx.x; i < col-1; i += warpSize) {
            m(row_num, i) = m(row_num, i+1);
        }
        if (threadIdx.x == 0) m(row_num, col-1) = a;
    }
}
__global__ void row_shift2(int* mat, int row, int col) {
    int col_num = blockDim.x * blockIdx.x + threadIdx.x;
    stride2D m(mat, col);
    if (col_num < col) {
        int a = m(row-1, col_num);
        for (int i = row - 1; i > 0; i--) {
            m(i, col_num) = m(i-1, col_num);
        }
        m(0, col_num) = a;
    }
}
int main()
{
    const int r = 300, c = 900, n = r * c;
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        int bsize = 256, nblocks = (c / bsize) + ((c % bsize > 0) ? 1 : 0);
        row_shift2<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        int bsize = 256, nblocks = (r / bsize) + ((r % bsize > 0) ? 1 : 0);
        column_shift2<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    {
        std::vector<int> idata(n);
        thrust::counting_iterator<int> first(1);
        thrust::copy(first, first+n, idata.begin());
        thrust::device_vector<int> ddata(idata);
        int* d = thrust::raw_pointer_cast(ddata.data());
        const int bwidth = 32;
        dim3 bsize(bwidth, 1024 / bwidth);
        // grid extends in y because column_shift3 indexes rows with blockIdx.y
        dim3 nblocks(1, (r / bsize.y) + ((r % bsize.y > 0) ? 1 : 0));
        column_shift3<<<nblocks, bsize>>>(d, r, c);
        cudaDeviceSynchronize();
        std::vector<int> odata(n);
        thrust::copy(ddata.begin(), ddata.end(), odata.begin());
    }
    cudaDeviceReset();
    return 0;
}
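As an aside, the host code above copies each result back into odata but never checks it. A minimal host-side check for the column shift (a sketch only, assuming the shift-left convention the kernels use; it would go inside the corresponding scope in main) might look like this:

// After the column shift, odata[rr][cc] should hold idata[rr][(cc + 1) % c]
bool ok = true;
for (int rr = 0; rr < r && ok; ++rr)
    for (int cc = 0; cc < c && ok; ++cc)
        ok = (odata[rr * c + cc] == idata[rr * c + (cc + 1) % c]);

A similar loop comparing odata[rr * c + cc] against idata[((rr + r - 1) % r) * c + cc] would check the row shift.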
The only real change required is in the inner copying loop of the column_shift operation:
for (int i = threadIdx.x; i < col-1; i += warpSize) {
    m(row_num, i) = m(row_num, i+1);
}
Now we use a warp-strided loop (which must be run with blockDim.x = 32 for correctness). Profiling this code shows the following:
nvprof ./permute
==13687== NVPROF is profiling process 13687, command: ./permute
==13687== Profiling application: ./permute
==13687== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 51.24% 643.80us 1 643.80us 643.80us 643.80us column_shift2(int*, int, int)
21.36% 268.41us 3 89.471us 89.087us 89.887us [CUDA memcpy HtoD]
21.06% 264.57us 3 88.191us 87.647us 89.023us [CUDA memcpy DtoH]
5.54% 69.631us 1 69.631us 69.631us 69.631us row_shift2(int*, int, int)
0.81% 10.144us 1 10.144us 10.144us 10.144us column_shift3(int*, int, int)
API calls: 68.19% 114.44ms 3 38.148ms 78.552us 114.28ms cudaMalloc
30.00% 50.352ms 1 50.352ms 50.352ms 50.352ms cudaDeviceReset
0.65% 1.0974ms 6 182.89us 102.55us 246.46us cudaMemcpyAsync
0.44% 732.75us 3 244.25us 13.565us 646.95us cudaDeviceSynchronize
0.21% 348.53us 97 3.5930us 263ns 197.14us cuDeviceGetAttribute
0.17% 290.47us 1 290.47us 290.47us 290.47us cuDeviceTotalMem
0.16% 266.04us 6 44.339us 2.3170us 87.602us cudaStreamSynchronize
0.11% 184.85us 3 61.616us 53.903us 71.672us cudaFree
0.03% 54.650us 3 18.216us 13.862us 25.133us cudaLaunchKernel
0.03% 51.108us 1 51.108us 51.108us 51.108us cuDeviceGetName
0.00% 4.0760us 3 1.3580us 408ns 3.1910us cuDeviceGetCount
0.00% 3.4620us 1 3.4620us 3.4620us 3.4620us cuDeviceGetPCIBusId
0.00% 1.6850us 2 842ns 248ns 1.4370us cuDeviceGet
0.00% 585ns 1 585ns 585ns 585ns cuDeviceGetUuid
i.e. the warp-strided copy is about 60 times faster than your original implementation.
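One caveat worth hedging: the warp-strided loop relies on the lanes of a warp advancing together, and on devices with independent thread scheduling (Volta and later) a lane could in principle overwrite an element that another lane has not yet read. A more defensive sketch of the same loop (still assuming blockDim.x == warpSize and that all lanes of a warp share the same row_num, as in column_shift3) keeps every lane in the loop for the same number of rounds and separates the reads from the writes with __syncwarp():

// Every lane runs the same number of rounds so that all 32 lanes reach each __syncwarp()
int rounds = (col - 1 + warpSize - 1) / warpSize;
for (int k = 0; k < rounds; ++k) {
    int i = threadIdx.x + k * warpSize;
    int v = (i < col - 1) ? m(row_num, i + 1) : 0;  // read this round's source element
    __syncwarp();                                   // ensure every lane has read its source...
    if (i < col - 1) m(row_num, i) = v;             // ...before any lane overwrites a neighbour
}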
[Note all code extremely lightly tested, and no guarantees of correctness or optimality are made or implied]
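A further aside, not part of the code above: if writing the result to a second buffer is acceptable, the column shift needs no loop at all. With one thread per element both the load and the store are coalesced. The kernel name and the in/out parameters below are made up for illustration, and it follows the same shift-left convention as column_shift2/3:

// Out-of-place variant: the element at column cc takes the value from column (cc + 1) % col
__global__ void column_shift_oop(const int* in, int* out, int row, int col)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < row * col) {
        int rr = idx / col;
        int cc = idx % col;
        out[idx] = in[rr * col + (cc + 1) % col];   // coalesced load and store
    }
}

Launched with, say, <<<(row * col + 255) / 256, 256>>>, the output buffer would then have to be swapped or copied back if the shift must appear in place.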
Upvotes: 2