Simon

Reputation: 553

Basic CUDA load and warp transpose

I want to implement a basic blocked load and warp transpose using CUDA 9.0's shuffle operations. I'm aware of the cub and trove implementations, but I'm restricted to compiling with nvrtc, and their standard header includes make these libraries difficult to use in that setting. I'm not looking for anything fancy, just some integer, float and double shuffles on data whose dimension is a power of 2.

Visualising an example with warp size 8, I want to go from:

             correlation
             0    1    2    3

lane 0       0    8   16   24
lane 1       1    9   17   25
lane 2       2   10   18   26
lane 3       3   11   19   27
lane 4       4   12   20   28
lane 5       5   13   21   29
lane 6       6   14   22   30 
lane 7       7   15   23   31 

to this structure:

             correlation
             0    1    2    3

lane 0       0    1    2    3
lane 1       8    9   10   11
lane 2       16  17   18   19
lane 3       24  25   26   27 
lane 4       4    5    6    7
lane 5       12  13   14   15
lane 6       20  21   22   23
lane 7       28  29   30   31 

I feel this should be really simple but I can't figure out what I've done incorrectly. I think that the basic transposition loop should look like:

int loads[ncorrs];
int values[ncorrs];
int lane_id = threadIdx.x & (warp_size - 1);
// 0 0 0 0 4 4 4 4 8 8 8 8 ....
int base_idx = lane_id & (warp_size - ncorrs);
// 0 1 2 3 0 1 2 3 0 1 2 3
int src_corr = lane_id & (ncorrs - 1);

for(int corr=0; corr < ncorrs; ++corr)
{
    int src_lane = base_idx + corr;
    values[corr] = __shfl_sync(mask, loads[src_corr],
                                 src_lane, warp_size);
}

So given the example data above, if we're in lane 5, I expect that the following indexing should occur:

base_idx == 4;
src_corr == 1;

corr == [0, 1, 2, 3]
src_lane == [4, 5, 6, 7]
values == [12, 13, 14, 15]

But instead the following is happening (33's are from later in the data):

             correlation
             0    1    2    3

lane 0       0    0    0    0
lane 1       4    4    4    4
lane 2       12  12   12   12
lane 3       16  16   16   16
lane 4       20  20   20   20
lane 5       24  24   24   24
lane 6       28  28   28   28 
lane 7       33  33   33   33 

What am I doing incorrectly? Full implementation for a warp size of 32:

#include <cstdlib>
#include <cstdio>

#include "cuda.h"

#define ncorr 4
#define warp_size 32

template <int ncorrs>
__global__ void kernel(
    int * input,
    int * output,
    int N)
{
    // This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
    #define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
    // This should provide 0 1 2 3 0 1 2 3 0 1 2 3
    #define corr_idx(lane_id) (lane_id & (ncorrs - 1))


    int n = blockIdx.x*blockDim.x + threadIdx.x;
    int lane_id = threadIdx.x & (warp_size - 1);

    if(n >= N)
        { return; }

    // Input correlation handled by this thread
    int src_corr = corr_idx(lane_id);
    int mask = __activemask();

    if(threadIdx.x == 0)
        { printf("mask %d\n", mask); }

    int loads[ncorrs];
    int values[ncorrs];

    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
        { loads[corr] = input[n + corr*N]; }

    __syncthreads();

    printf("[%d, %d] %d %d %d %d\n",
           lane_id, base_idx(lane_id),
           loads[0], loads[1],
           loads[2], loads[3]);

    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
    {
        int src_lane = base_idx(lane_id) + corr;
        values[corr] = __shfl_sync(mask, loads[src_corr],
                                     src_lane, warp_size);
    }

    printf("[%d, %d] %d %d %d %d\n",
           lane_id, base_idx(lane_id),
           values[0], values[1],
           values[2], values[3]);


    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
        { output[n + corr*N] = values[corr]; }
}

void print_data(int * data, int N)
{
    for(int n=0; n < N; ++n)
    {
        printf("% -3d: ", n);
        for(int c=0; c < ncorr; ++c)
        {
            printf("%d ", data[n*ncorr + c]);
        }
        printf("\n");
    }
}

int main(void)
{
    int * host_input;
    int * host_output;

    int * device_input;
    int * device_output;
    int N = 32;

    host_input = (int *) malloc(sizeof(int)*N*ncorr);
    host_output = (int *) malloc(sizeof(int)*N*ncorr);

    printf("malloc done\n");

    cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
    cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);

    printf("cudaMalloc done\n");

    for(int i=0; i < N*ncorr; ++i)
        { host_input[i] = i; }

    print_data(host_input, N);

    dim3 block(256, 1, 1);
    dim3 grid((block.x + N - 1) / N, 1, 1);

    cudaMemcpy(device_input, host_input,
               sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);

    printf("memcpy done\n");

    kernel<4> <<<grid, block>>> (device_input, device_output, N);

    cudaMemcpy(host_output, device_output,
               sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);

    print_data(host_output, N);

    cudaFree(device_input);
    cudaFree(device_output);

    free(host_input);
    free(host_output);
}

Edit 1: Clarified that the visual example has a warp size of 8 while the full code caters for a warp size of 32

Upvotes: 0

Views: 635

Answers (1)

Robert Crovella

Reputation: 152113

What am I doing incorrectly?

TL;DR: You are transmitting the same input value to multiple output locations. Here is one example, in this line of code:

    values[corr] = __shfl_sync(mask, loads[src_corr],
                                 src_lane, warp_size);

The quantity represented by loads[src_corr] is loop-invariant. Therefore you are transmitting that value to 4 warp lanes (over the 4 loop iterations), which means that value ends up occupying 4 output slots (which is exactly what your printout data shows). That can't be right for a transpose.

Taking a somewhat longer view: I'm not sure I can read your mind, but possibly you are confused about the warp shuffle operation. Perhaps you have assumed that the destination lane can choose which value from the source lane's loads[] array it receives. This is not the case. The destination lane only receives whatever value the source lane provides. Let's take a look at your loop:

// This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
#define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
// This should provide 0 1 2 3 0 1 2 3 0 1 2 3
#define corr_idx(lane_id) (lane_id & (ncorrs - 1))


int n = blockIdx.x*blockDim.x + threadIdx.x;
int lane_id = threadIdx.x & (warp_size - 1);

...

// Input correlation handled by this thread
int src_corr = corr_idx(lane_id);
int mask = __activemask();

...

int loads[ncorrs];
int values[ncorrs];

...

#pragma unroll (ncorrs)
for(int corr=0; corr < ncorrs; ++corr)
{
    int src_lane = base_idx(lane_id) + corr;
    values[corr] = __shfl_sync(mask, loads[src_corr], src_lane, warp_size);
}

On the first pass of the above loop, src_lane is going to be 0 for warp lanes 0, 1, 2, and 3. This is evident from the above excerpted code; print it out if you're not sure. That means warp lanes 0-3 are all going to request whatever value is provided by warp lane 0. The value provided by warp lane 0 is loads[src_corr], but src_corr here is whatever value it holds on warp lane 0. Therefore one and only one value will be distributed to warp lanes 0-3. This could not possibly be correct for a transpose; in a correct transpose, no input value shows up in 4 places in the output.
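
As a standalone illustration of that point, here is a minimal sketch of my own (a hypothetical demo kernel, not taken from your code). Every lane contributes exactly one register value per __shfl_sync call, and the srcLane argument only selects whose contribution you receive; the array index in the first argument is evaluated on the calling lane, so it cannot pick a different element on the source lane's side:

#include <cstdio>

__global__ void shfl_demo()
{
    int lane = threadIdx.x & 31;
    int my_vals[2] = {lane, lane + 100};   // each lane's private array

    // Every lane contributes its own my_vals[0] to the shuffle, and every
    // lane asks lane 3 for its contribution. All 32 lanes therefore receive
    // the same number: 3 (lane 3's my_vals[0]).
    int received = __shfl_sync(0xffffffffu, my_vals[0], 3);

    printf("lane %2d received %d\n", lane, received);   // every lane prints 3
}

int main()
{
    shfl_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}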

To fix this, we will need to modify the calculation of both src_lane and src_corr. We will also need to modify the storage location (index) per warp lane at each pass of the loop (I'm calling this new variable dest). We can think of src_lane as defining the lane whose value my thread will receive. We can think of src_corr as defining which of my values I will publish to some other thread on that loop iteration. dest is the location in my values[] array where I will store the currently received value. We can deduce the necessary pattern by carefully studying the relationship between the input values in loads[] and the desired output locations in values[], taking into account the appropriate warp lanes for source and destination. On the first pass of the loop, we desire this pattern:

warp lane: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 ... 
src_lane:  0  8 16 24  1  9 17 25  2 10 18 26  3 11 19 27  4 ... (where my data comes from)
src_corr:  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  2 ... (which value I am transmitting)
dest:      0  1  2  3  0  1  2  3  0  1  2  3  0  1  2  3  0 ... (where I store the received value)

On the second pass of the loop, we desire this pattern:

warp lane: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 ... 
src_lane:  8 16 24  0  9 17 25  1 10 18 26  2 11 19 27  3 12 ... (where my data comes from)
src_corr:  3  3  3  3  3  3  3  3  0  0  0  0  0  0  0  0  1 ... (which value I am transmitting)
dest:      1  2  3  0  1  2  3  0  1  2  3  0  1  2  3  0  1 ... (where I store the received value)

with corresponding changes for the 3rd and 4th pass of the loop. If we realize those patterns in code for your shuffle loop, it could look something like this:

$ cat t352.cu
#include <cstdlib>
#include <cstdio>

#include <assert.h>
#define ncorr 4
#define warp_size 32

template <int ncorrs>
__global__ void kernel(
    int * input,
    int * output,
    int N)
{
    // This should provide 0 0 0 0 4 4 4 4 8 8 8 8 ...
    #define base_idx(lane_id) (lane_id & (warp_size - ncorrs))
    // This should provide 0 1 2 3 0 1 2 3 0 1 2 3
    #define corr_idx(lane_id) (lane_id & (ncorrs - 1))


    int n = blockIdx.x*blockDim.x + threadIdx.x;
    int lane_id = threadIdx.x & (warp_size - 1);

    if(n >= N)
        { return; }

    // Input correlation handled by this thread
    int mask = __activemask();

    if(threadIdx.x == 0)
        { printf("mask %d\n", mask); }

    int loads[ncorrs];
    int values[ncorrs];

    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
        { loads[corr] = input[n + corr*N]; }

    __syncthreads();

    printf("[%d, %d] %d %d %d %d\n",
           lane_id, base_idx(lane_id),
           loads[0], loads[1],
           loads[2], loads[3]);
    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
    {
        int src_lane = ((lane_id+corr)%ncorrs)*(warp_size/ncorrs) + (lane_id/ncorrs);
        int src_corr = ((ncorrs-corr)+(lane_id/(warp_size/ncorrs)))%ncorrs;
        int dest = (lane_id+corr)%ncorrs;
        values[dest] = __shfl_sync(mask, loads[src_corr],
                                     src_lane, warp_size);
    }

    printf("[%d, %d] %d %d %d %d\n",
           lane_id, base_idx(lane_id),
           values[0], values[1],
           values[2], values[3]);


    #pragma unroll (ncorrs)
    for(int corr=0; corr < ncorrs; ++corr)
        { output[n + corr*N] = values[corr]; }
}

void print_data(int * data, int N)
{
    for(int n=0; n < N; ++n)
    {
        printf("% -3d: ", n);
        for(int c=0; c < ncorr; ++c)
        {
            printf("%d ", data[n*ncorr + c]);
        }
        printf("\n");
    }
}

int main(void)
{
    int * host_input;
    int * host_output;

    int * device_input;
    int * device_output;
    int N = 32;

    host_input = (int *) malloc(sizeof(int)*N*ncorr);
    host_output = (int *) malloc(sizeof(int)*N*ncorr);

    printf("malloc done\n");

    cudaMalloc((void **) &device_input, sizeof(int)*N*ncorr);
    cudaMalloc((void **) &device_output, sizeof(int)*N*ncorr);

    printf("cudaMalloc done\n");

    for(int i=0; i < N*ncorr; ++i)
        { host_input[i] = i; }

    print_data(host_input, N);

    dim3 block(256, 1, 1);
    dim3 grid((block.x + N - 1) / N, 1, 1);

    cudaMemcpy(device_input, host_input,
               sizeof(int)*N*ncorr, cudaMemcpyHostToDevice);

    printf("memcpy done\n");

    kernel<4> <<<grid, block>>> (device_input, device_output, N);

    cudaMemcpy(host_output, device_output,
               sizeof(int)*N*ncorr, cudaMemcpyDeviceToHost);

    print_data(host_output, N);
    cudaFree(device_input);
    cudaFree(device_output);

    free(host_input);
    free(host_output);
}
$ nvcc -o t352 t352.cu
$ cuda-memcheck ./t352
========= CUDA-MEMCHECK
malloc done
cudaMalloc done
 0 : 0 1 2 3
 1 : 4 5 6 7
 2 : 8 9 10 11
 3 : 12 13 14 15
 4 : 16 17 18 19
 5 : 20 21 22 23
 6 : 24 25 26 27
 7 : 28 29 30 31
 8 : 32 33 34 35
 9 : 36 37 38 39
 10: 40 41 42 43
 11: 44 45 46 47
 12: 48 49 50 51
 13: 52 53 54 55
 14: 56 57 58 59
 15: 60 61 62 63
 16: 64 65 66 67
 17: 68 69 70 71
 18: 72 73 74 75
 19: 76 77 78 79
 20: 80 81 82 83
 21: 84 85 86 87
 22: 88 89 90 91
 23: 92 93 94 95
 24: 96 97 98 99
 25: 100 101 102 103
 26: 104 105 106 107
 27: 108 109 110 111
 28: 112 113 114 115
 29: 116 117 118 119
 30: 120 121 122 123
 31: 124 125 126 127
memcpy done
mask -1
[0, 0] 0 32 64 96
[1, 0] 1 33 65 97
[2, 0] 2 34 66 98
[3, 0] 3 35 67 99
[4, 4] 4 36 68 100
[5, 4] 5 37 69 101
[6, 4] 6 38 70 102
[7, 4] 7 39 71 103
[8, 8] 8 40 72 104
[9, 8] 9 41 73 105
[10, 8] 10 42 74 106
[11, 8] 11 43 75 107
[12, 12] 12 44 76 108
[13, 12] 13 45 77 109
[14, 12] 14 46 78 110
[15, 12] 15 47 79 111
[16, 16] 16 48 80 112
[17, 16] 17 49 81 113
[18, 16] 18 50 82 114
[19, 16] 19 51 83 115
[20, 20] 20 52 84 116
[21, 20] 21 53 85 117
[22, 20] 22 54 86 118
[23, 20] 23 55 87 119
[24, 24] 24 56 88 120
[25, 24] 25 57 89 121
[26, 24] 26 58 90 122
[27, 24] 27 59 91 123
[28, 28] 28 60 92 124
[29, 28] 29 61 93 125
[30, 28] 30 62 94 126
[31, 28] 31 63 95 127
[0, 0] 0 8 16 24
[1, 0] 32 40 48 56
[2, 0] 64 72 80 88
[3, 0] 96 104 112 120
[4, 4] 1 9 17 25
[5, 4] 33 41 49 57
[6, 4] 65 73 81 89
[7, 4] 97 105 113 121
[8, 8] 2 10 18 26
[9, 8] 34 42 50 58
[10, 8] 66 74 82 90
[11, 8] 98 106 114 122
[12, 12] 3 11 19 27
[13, 12] 35 43 51 59
[14, 12] 67 75 83 91
[15, 12] 99 107 115 123
[16, 16] 4 12 20 28
[17, 16] 36 44 52 60
[18, 16] 68 76 84 92
[19, 16] 100 108 116 124
[20, 20] 5 13 21 29
[21, 20] 37 45 53 61
[22, 20] 69 77 85 93
[23, 20] 101 109 117 125
[24, 24] 6 14 22 30
[25, 24] 38 46 54 62
[26, 24] 70 78 86 94
[27, 24] 102 110 118 126
[28, 28] 7 15 23 31
[29, 28] 39 47 55 63
[30, 28] 71 79 87 95
[31, 28] 103 111 119 127
 0 : 0 32 64 96
 1 : 1 33 65 97
 2 : 2 34 66 98
 3 : 3 35 67 99
 4 : 4 36 68 100
 5 : 5 37 69 101
 6 : 6 38 70 102
 7 : 7 39 71 103
 8 : 8 40 72 104
 9 : 9 41 73 105
 10: 10 42 74 106
 11: 11 43 75 107
 12: 12 44 76 108
 13: 13 45 77 109
 14: 14 46 78 110
 15: 15 47 79 111
 16: 16 48 80 112
 17: 17 49 81 113
 18: 18 50 82 114
 19: 19 51 83 115
 20: 20 52 84 116
 21: 21 53 85 117
 22: 22 54 86 118
 23: 23 55 87 119
 24: 24 56 88 120
 25: 25 57 89 121
 26: 26 58 90 122
 27: 27 59 91 123
 28: 28 60 92 124
 29: 29 61 93 125
 30: 30 62 94 126
 31: 31 63 95 127
========= ERROR SUMMARY: 0 errors
$
  1. I believe the above code fairly clearly demonstrates a 32x4 -> 4x32 transpose. I think it is "closest" to the code you presented. It does not do the set of 4x8 transposes you depicted in your diagrams.

  2. I acknowledge that the calculations of src_corr, src_lane, and dest are not completely optimized. But they generate the correct indexing. I assume you can work out how to optimally generate those from the patterns you already have.

  3. I think it's entirely possible the above code has bugs for other dimensions. I've not tried it on anything except the 32x4 case. Nevertheless I think I have indicated what is fundamentally wrong with your code, and demonstrated a pathway to get to proper indexing.

  4. A square matrix transpose up to 32x32 can be done at the warp level using a simpler method; a minimal sketch of that idea is included after this list.
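
A minimal sketch of one such square warp-level transpose, offered purely as my own illustration of the idea (the helper name warp_transpose32 and the rotation-based indexing are assumptions, not necessarily the simpler method referred to above). Each lane holds one 32-element row in registers; on round i each lane publishes row[(lane - i) % 32], reads from lane (lane + i) % 32, and stores what it receives into out[(lane + i) % 32]:

#include <cstdio>

// Hypothetical helper: transpose a 32x32 tile held one row per lane.
// On entry lane l holds row[k] == element (l, k); on exit it holds
// row[k] == element (k, l).
__device__ void warp_transpose32(int (&row)[32])
{
    int lane = threadIdx.x & 31;
    int out[32];
    #pragma unroll
    for (int i = 0; i < 32; ++i)
    {
        int src = (lane + i) & 31;          // lane I read from this round
        int pub = (lane - i + 32) & 31;     // element of my row I publish
        // Lane src publishes its row[(src - i) & 31] == its row[lane],
        // i.e. element (src, lane), which is what belongs in my out[src].
        out[src] = __shfl_sync(0xffffffffu, row[pub], src);
    }
    #pragma unroll
    for (int i = 0; i < 32; ++i)
        { row[i] = out[i]; }
}

__global__ void transpose_demo()
{
    int lane = threadIdx.x & 31;
    int row[32];
    for (int k = 0; k < 32; ++k)
        { row[k] = lane * 32 + k; }         // element (lane, k)

    warp_transpose32(row);

    if (lane == 1)
        { printf("lane 1 now holds column 1: %d %d %d ...\n",
                 row[0], row[1], row[2]); } // expect 1 33 65
}

int main()
{
    transpose_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

Note that the dynamically indexed row[] and out[] arrays will most likely be placed in local memory rather than registers, so this is a correctness sketch rather than an optimized implementation.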

Upvotes: 6
