Saydon

Reputation: 27

CUDA more load transactions than store even though both are coalesced?

I am profiling NVIDIA's matrix transpose sample. From the looks of it, and from the profiler, there are no bank conflicts. However, one thing I noticed is that the number of global load transactions per request is higher than the number of global store transactions per request, even though both the loads and the stores appear to be coalesced. The data being read is int, so I should be getting perfect coalescing, and I would expect the load and store transactions per request to be the same. What am I missing? Note that my input matrix is 12800x12800.
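For reference, this is the back-of-the-envelope arithmetic my expectation is based on (it assumes the profiler counts global memory transactions in 32-byte units, which is an assumption on my part, not something taken from the sample):

#include <stdio.h>

// Rough expectation for one warp accessing 4-byte ints, assuming
// 32-byte memory transactions (assumption on my part).
int main(void) {
    const int warp_size = 32;          // threads per warp
    const int elem_bytes = 4;          // sizeof(int)
    const int transaction_bytes = 32;  // assumed transaction granularity

    // fully coalesced: the warp touches one contiguous 128-byte range
    printf("coalesced: %d transactions per request\n",
           (warp_size * elem_bytes) / transaction_bytes);          // prints 4

    // fully strided: every lane lands in its own 32-byte transaction
    printf("strided:   %d transactions per request\n", warp_size); // prints 32
    return 0;
}

So if both sides were perfectly coalesced, I would expect roughly 4 transactions per request for the loads as well as the stores.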

I compile the .cu code with the following:

nvcc matrix_transpose.cu -o no_bank_conflict

I am running Ubuntu 22.04.1; the CPU is an x86 Intel i7-7700HQ and the GPU is a GP107M [GeForce GTX 1050 Mobile].

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>

// typedef unsigned int int;
#define BLOCK_DIM 32
#define CUDA_CHECK_ERROR() \
do { \
    cudaError_t err = cudaGetLastError(); \
    if (err != cudaSuccess) { \
        printf("CUDA error: %s at line %d\n", cudaGetErrorString(err), __LINE__); \
        exit(-1); \
    } \
} while (0)


#ifdef OPTIMIZED
__global__ void vector_transpose(int *idata, int *odata, int height, int width) {
    __shared__ int block[BLOCK_DIM][BLOCK_DIM+1];  // int to match the data; +1 column of padding avoids shared memory bank conflicts
    
    // read the matrix tile into shared memory:
    // load one element per thread from device memory (idata) and store it
    // in transposed order in block[][]
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // synchronise to ensure all writes to block[][] have completed
    __syncthreads();

    // write the transposed matrix tile to global memory (odata) in linear order
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
#else
__global__ void vector_transpose(int *in, int *out, int row, int col) {
    // naive transpose: each thread reads one element and writes it directly
    // to its transposed position in global memory (no shared memory tiling)
    int c = blockDim.x * blockIdx.x + threadIdx.x;
    int r = blockDim.y * blockIdx.y + threadIdx.y;

    if (r < row && c < col)
    {
        out[c * row + r] = in[r * col + c];
    }
}
#endif

void printMatrix(int *matrix, int row, int col) {
    if (row * col > 30) return;
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            printf("%d, ", matrix[r * col + c]);
        }
        printf("\n");
    }
}

void UT_correct(int *h_org, int* h_transpose, int row, int col) {
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            if (h_org[r * col + c] != h_transpose[c * row + r]) {
                printf("Not proper transpose\n");
                exit(2);
            }
        }
    }
    printf("****************Correct***************\n");
}


int main(){
    int *d_in, *d_out;
    int row = 12800;
    int col = 12800;
    int iter_num = 10;
    int size = row * col * sizeof(int);
    clock_t start, end, alloc_time;
    alloc_time = 0;
    int *h_org = (int *) malloc(size);
    int *h_transpose = (int *) malloc(size);
    if (h_org == NULL || h_transpose == NULL) {
        printf("Could not allocate enough memeory\n");
        exit(1);
    }


    srand(time(NULL));
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            h_org[r * col + c] = rand() % 60;
        }
    }

#ifdef OS_CUDA
    start = clock();
    cudaMalloc(&d_in, size);
    CUDA_CHECK_ERROR();
    cudaMalloc(&d_out, size);
    CUDA_CHECK_ERROR();
    cudaMemcpy(d_in, h_org, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR();
    end = clock();
    alloc_time = end - start;
    dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 numBlocks(row / threadsPerBlock.x, col / threadsPerBlock.y);  // assumes row and col are multiples of BLOCK_DIM
    printf("block x dimension: %d\n", numBlocks.x);
    printf("block y dimension: %d\n", numBlocks.y);
#endif
    printMatrix(h_org, row, col);

    start = clock();
#ifdef OS_CUDA
    vector_transpose<<<numBlocks, threadsPerBlock>>>(d_in, d_out, row, col);
    CUDA_CHECK_ERROR();
    cudaDeviceSynchronize();
    CUDA_CHECK_ERROR();
    for (int i = 0; i < iter_num; i++) {
        vector_transpose<<<numBlocks, threadsPerBlock>>>(d_in, d_out, row, col);
        CUDA_CHECK_ERROR();
        cudaDeviceSynchronize();
        CUDA_CHECK_ERROR();
    }
#else
    // CPU implementation goes here
    for (int i = 0; i < iter_num; i++) {
        for (int r = 0; r < row; r++) {
            for (int c = 0; c < col; c++) {
                h_transpose[c * row + r] = h_org[r * col + c]; 
            }
        }
    }
#endif
    end = clock();
    printf("Execution time %ld  \n", (end - start));

#ifdef OS_CUDA
    start = clock();
    cudaMemcpy(h_transpose, d_out, size, cudaMemcpyDeviceToHost);
    end = clock();
    alloc_time += (end - start);
    printf("CUDA allocation time %ld    \n", alloc_time);
#endif
    printf("\n\n\n\nMatrix after transpose has been generated\n\n");
    printMatrix(h_transpose, col, row);

#ifdef OS_CUDA
    UT_correct(h_org, h_transpose, row, col);
#endif

    return 0;
}
sudo nvprof --metrics gld_transactions_per_request,gst_transactions_per_request,shared_load_transactions_per_request,shared_store_transactions_per_request ./no_bank_conflict                                                                              
==1212742== NVPROF is profiling process 1212742, command: ./no_bank_conflict
block x dimension: 400
block y dimension: 400
==1212742== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Execution time 3487827  s
CUDA allocation time 566151    




Matrix after transpose has been generated

****************Correct***************
==1212742== Profiling application: ./no_bank_conflict
==1212742== Profiling result:
==1212742== Metric result:
Invocations                               Metric Name                             Metric Description         Min         Max         Avg
Device "NVIDIA GeForce GTX 1050 (0)"
    Kernel: vector_transpose(int*, int*, int, int)
         11              gld_transactions_per_request           Global Load Transactions Per Request   16.000000   16.000000   16.000000
         11              gst_transactions_per_request          Global Store Transactions Per Request    4.000000    4.000000    4.000000
         11      shared_load_transactions_per_request    Shared Memory Load Transactions Per Request    1.000000    1.000000    1.000000
         11     shared_store_transactions_per_request   Shared Memory Store Transactions Per Request    1.000000    1.000000    1.000000

Upvotes: -2

Views: 130

Answers (1)

talonmies

Reputation: 72342

If, as you say, you compile the code as:

nvcc matrix_transpose.cu -o no_bank_conflict

then you are compiling and profiling this kernel:

__global__ void vector_transpose(int *in, int *out, int row, int col) {
    int c = blockDim.x * blockIdx.x + threadIdx.x;
    int r = blockDim.y * blockIdx.y + threadIdx.y;

    if (r < row && c < col)
    {
        out[c * row + r] = in[r * col + c];
    }
}

(Note that this is not the code you showed in the original version of this question.)

and the profiling results you show in your question are completely to be expected, given that kernel's uncoalesced memory access pattern.

To profile the kernel you are actually interested in, you would need to compile the code in your current question revision like this:

nvcc -DOPTIMIZED matrix_transpose.cu -o no_bank_conflict

Otherwise the preprocessor directives in the code will emit the unoptimized version of the kernel to the compiler.
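As a quick sanity check (a minimal sketch of my own, not part of the original sample), you could have the program print which kernel variant was actually compiled in, so a profiling run cannot silently target the wrong one. Since the kernel launch in the posted code is also guarded by #ifdef OS_CUDA (which presumably was defined in the build that produced your profile, given that kernels did run), the sketch reports both macros:

    // hypothetical addition near the top of main(): report which variant was built
#ifndef OS_CUDA
    printf("kernel variant: none (OS_CUDA not defined, CPU path only)\n");
#elif defined(OPTIMIZED)
    printf("kernel variant: shared memory tiled transpose (OPTIMIZED)\n");
#else
    printf("kernel variant: naive transpose\n");
#endif

If the output still says "naive transpose", then -DOPTIMIZED did not make it onto the nvcc command line.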

Upvotes: 1
