Reputation: 27
I am profiling NVIDIA's matrix transpose sample. From the looks of it, and from the profiler, there are no bank conflicts. However, one thing I noticed is that global load transactions per request are higher than global store transactions per request. From the looks of it, both loads and stores are coalesced. The data being read is int, so I should be getting perfect coalescing, and I would imagine the load and store transactions per request would be roughly the same, if not identical. What am I missing? Note that my input matrix is of size 12800x12800.
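For reference, my expectation (which may well be wrong) comes from this arithmetic: a warp of 32 threads each reading a 4-byte int from consecutive addresses touches 32 * 4 = 128 bytes, i.e. four 32-byte transactions, so a perfectly coalesced int access should show roughly 4 transactions per request for loads and stores alike.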
I compile the .cu code with the following:
nvcc matrix_transpose.cu -o no_bank_conflict
I am running Ubuntu 22.04.1; my CPU is an x86 Intel i7-7700HQ and my GPU is a GP107M [GeForce GTX 1050 Mobile].
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
// typedef unsigned int int;
#define BLOCK_DIM 32
#define CUDA_CHECK_ERROR() \
    do { \
        cudaError_t err = cudaGetLastError(); \
        if (err != cudaSuccess) { \
            printf("CUDA error: %s at line %d\n", cudaGetErrorString(err), __LINE__); \
            exit(-1); \
        } \
    } while (0)
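// Note (added comment): cudaGetLastError() catches launch/configuration
// errors; failures inside a kernel only surface after a synchronizing call
// such as cudaDeviceSynchronize(), which main() performs after each launch.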
#ifdef OPTIMIZED
__global__ void vector_transpose(int *idata, int *odata, int height, int width) {
    // the +1 column pads each row so that column-wise accesses to block[][]
    // hit different shared memory banks, avoiding 32-way bank conflicts
    __shared__ int block[BLOCK_DIM][BLOCK_DIM+1];
    // read the matrix tile into shared memory:
    // load one element per thread from device memory (idata) into block[][];
    // the transposition happens when the tile is read back out below
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height)) {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }
    // synchronise to ensure all writes to block[][] have completed
    __syncthreads();
    // write the transposed matrix tile to global memory (odata) in linear order
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width)) {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
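/*
 * Added note (mine, not from the original sample): in this version both the
 * global read of idata and the global write of odata use threadIdx.x as the
 * fastest-varying index, so each warp touches consecutive addresses in both
 * directions; the transposition itself is done through the shared-memory
 * tile, which is why both the load and the store can stay coalesced.
 */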
#else
__global__ void vector_transpose(int *in, int *out, int row, int col) {
    int c = blockDim.x * blockIdx.x + threadIdx.x;
    int r = blockDim.y * blockIdx.y + threadIdx.y;
    if (r < row && c < col) {
        out[c * row + r] = in[r * col + c];
    }
}
#endif
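/*
 * Added note (mine): in this unoptimized kernel a warp holds consecutive
 * values of c and a fixed r, so the read in[r * col + c] covers 32
 * consecutive ints (coalesced), while the write out[c * row + r] jumps by
 * row * sizeof(int) = 51200 bytes between adjacent threads (for row = 12800),
 * putting every store in its own memory transaction.
 */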
void printMatrix(int *matrix, int row, int col) {
    if (row * col > 30) return;
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            printf("%d, ", matrix[r * col + c]);
        }
        printf("\n");
    }
}

void UT_corret(int *h_org, int *h_transpose, int row, int col) {
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            if (h_org[r * col + c] != h_transpose[c * row + r]) {
                printf("Not proper transpose\n");
                exit(2);
            }
        }
    }
    printf("****************Correct***************\n");
}
int main() {
    int *d_in, *d_out;
    int row = 12800;
    int col = 12800;
    int iter_num = 10;
    int size = row * col * sizeof(int);
    clock_t start, end, alloc_time;
    alloc_time = 0;
    int *h_org = (int *) malloc(size);
    int *h_transpose = (int *) malloc(size);
    if (h_org == NULL || h_transpose == NULL) {
        printf("Could not allocate enough memory\n");
        exit(1);
    }
    srand(time(NULL));
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < col; c++) {
            h_org[r * col + c] = rand() % 60;
        }
    }
#ifdef OS_CUDA
    start = clock();
    cudaMalloc(&d_in, size);
    CUDA_CHECK_ERROR();
    cudaMalloc(&d_out, size);
    CUDA_CHECK_ERROR();
    cudaMemcpy(d_in, h_org, size, cudaMemcpyHostToDevice);
    CUDA_CHECK_ERROR();
    end = clock();
    alloc_time = end - start;
    dim3 threadsPerBlock(BLOCK_DIM, BLOCK_DIM);
    dim3 numBlocks(row / threadsPerBlock.x, col / threadsPerBlock.y);
    printf("block x dimension: %d\n", numBlocks.x);
    printf("block y dimension: %d\n", numBlocks.y);
#endif
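    // Note (added): this grid sizing only works because row and col are exact
    // multiples of BLOCK_DIM (12800 / 32 == 400); for arbitrary sizes the
    // usual idiom is ceiling division, e.g.
    //   dim3 numBlocks((col + BLOCK_DIM - 1) / BLOCK_DIM,
    //                  (row + BLOCK_DIM - 1) / BLOCK_DIM);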
    printMatrix(h_org, row, col);
    start = clock();
#ifdef OS_CUDA
    vector_transpose<<<numBlocks, threadsPerBlock>>>(d_in, d_out, row, col);
    CUDA_CHECK_ERROR();
    cudaDeviceSynchronize();
    CUDA_CHECK_ERROR();
    for (int i = 0; i < iter_num; i++) {
        vector_transpose<<<numBlocks, threadsPerBlock>>>(d_in, d_out, row, col);
        CUDA_CHECK_ERROR();
        cudaDeviceSynchronize();
        CUDA_CHECK_ERROR();
    }
#else
    // CPU implementation goes here
    for (int i = 0; i < iter_num; i++) {
        for (int r = 0; r < row; r++) {
            for (int c = 0; c < col; c++) {
                h_transpose[c * row + r] = h_org[r * col + c];
            }
        }
    }
#endif
    end = clock();
    printf("Execution time %ld \n", (end - start));
#ifdef OS_CUDA
    start = clock();
    cudaMemcpy(h_transpose, d_out, size, cudaMemcpyDeviceToHost);
    end = clock();
    alloc_time += (end - start);
    printf("CUDA allocation time %ld \n", alloc_time);
#endif
    printf("\n\n\n\nMatrix after transpose has been generated\n\n");
    printMatrix(h_transpose, col, row);
#ifdef OS_CUDA
    UT_corret(h_org, h_transpose, row, col);
#endif
    return 0;
}
sudo nvprof --metrics gld_transactions_per_request,gst_transactions_per_request,shared_load_transactions_per_request,shared_store_transactions_per_request ./no_bank_conflict
==1212742== NVPROF is profiling process 1212742, command: ./no_bank_conflict
block x dimension: 400
block y dimension: 400
==1212742== Some kernel(s) will be replayed on device 0 in order to collect all events/metrics.
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Replaying kernel "vector_transpose(int*, int*, int, int)" (done)
Execution time 3487827 s
CUDA allocation time 566151
Matrix after transpose has been generated
****************Correct***************
==1212742== Profiling application: ./no_bank_conflict
==1212742== Profiling result:
==1212742== Metric result:
Invocations Metric Name Metric Description Min Max Avg
Device "NVIDIA GeForce GTX 1050 (0)"
Kernel: vector_transpose(int*, int*, int, int)
11 gld_transactions_per_request Global Load Transactions Per Request 16.000000 16.000000 16.000000
11 gst_transactions_per_request Global Store Transactions Per Request 4.000000 4.000000 4.000000
11 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.000000 1.000000 1.000000
11 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.000000 1.000000 1.000000
Upvotes: -2
Views: 130
Reputation: 72342
If, as you say, you compile the code as:
nvcc matrix_transpose.cu -o no_bank_conflict
then you are compiling and profiling this kernel:
__global__ void vector_transpose(int *in, int *out, int row, int col) {
    int c = blockDim.x * blockIdx.x + threadIdx.x;
    int r = blockDim.y * blockIdx.y + threadIdx.y;
    if (r < row && c < col) {
        out[c * row + r] = in[r * col + c];
    }
}
(Note: this is not the code you showed in the original version of this question.)
and the profiling results you show in your question are completely to be expected, because this kernel's memory access pattern is uncoalesced.
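As a rough rule of thumb (my numbers, stated generally rather than for your specific run): with 32-byte transactions, a fully coalesced 4-byte-per-thread access needs 4 transactions per 32-thread request, a fully scattered one needs up to 32, and partially coalesced patterns land in between.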
To profile the kernel you are interested in, you need to compile the code from your current question revision like this:
nvcc -DOPTIMIZED matrix_transpose.cu -o no_bank_conflict
Otherwise the preprocessor directives in the code will feed the unoptimized version of the kernel to the compiler.
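A simple way to confirm which variant you actually built (my suggestion, not part of your original code) is to print the compiled branch at runtime, for example at the top of main():

#ifdef OPTIMIZED
    printf("running the OPTIMIZED shared-memory transpose kernel\n");
#else
    printf("running the unoptimized naive transpose kernel\n");
#endif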
Upvotes: 1