Reputation: 305
I am quite new to CUDA programming and wanted to try an implementation of matrix multiplication using parallel reduction. I came up with this code and would like clarification on:
- Why the code is returning incorrect results.
- Why it is taking much longer to run than the method using shared memory described in Chapter 3, page 25 of the CUDA C Programming Guide.
For reference, I am running it on an NVIDIA GeForce GTX 675M, which has compute capability 2.1.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <device_functions.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#define BLOCK_OPT 1024
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
__global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
{
    __shared__ float result[BLOCK_OPT];
    int e = threadIdx.x;
    int row = blockIdx.x;
    int col = blockIdx.y;
    result[e] = A.elements[row*A.stride + e] * B.elements[e*B.stride + col];
    __syncthreads();
    if (e < 512)
    {
        result[e] += result[e + 512];
    }
    __syncthreads();
    if (e < 256)
    {
        result[e] += result[e + 256];
    }
    __syncthreads();
    if (e < 128)
    {
        result[e] += result[e + 128];
    }
    __syncthreads();
    if (e < 64)
    {
        result[e] += result[e + 64];
    }
    __syncthreads();
    if (e < 32)
    {
        result[e] += result[e + 32];
        result[e] += result[e + 16];
        result[e] += result[e + 8];
        result[e] += result[e + 4];
        result[e] += result[e + 2];
        result[e] += result[e + 1];
    }
    if (e == 0) C.elements[row*C.stride + col] = result[0];
}
void MatMulCPU(Matrix A, Matrix B, Matrix C)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < B.width; j++)
        {
            for (int k = 0; k < B.height; k++)
            {
                C.elements[i*C.stride + j] += A.elements[i*A.stride + k] * B.elements[k*B.stride + j];
            }
        }
    }
}
float randomFloat()
{
    return (float)rand() / (float)RAND_MAX;
}
int main()
{
    clock_t begin, end;
    srand(time(NULL));
    //Common Setup
    float cpu_t = 0, gpu_t = 0;
    int x = 1024;
    int N = x * x;
    size_t size = N * sizeof(float);
    Matrix A;
    A.width = x;
    A.stride = x;
    A.height = x;
    A.elements = (float*)malloc(size);
    for (int i = 0; i < N; i++)
        A.elements[i] = randomFloat();
    Matrix B;
    B.width = x;
    B.stride = x;
    B.height = x;
    B.elements = (float*)malloc(size);
    for (int j = 0; j < N; j++)
        B.elements[j] = randomFloat();
    Matrix C;
    C.width = x;
    C.stride = x;
    C.height = x;
    C.elements = (float*)malloc(size);
    for (int k = 0; k < N; k++)
        C.elements[k] = 0;
    Matrix D;
    D.width = x;
    D.stride = x;
    D.height = x;
    D.elements = (float*)malloc(size);
    for (int l = 0; l < N; l++)
        D.elements[l] = 0;
    //Execute CPU code & time it
    begin = clock();
    MatMulCPU(A, B, D);
    end = clock();
    cpu_t = (float)(end - begin);
    // GPU setup
    Matrix d_A, d_B, d_C;
    d_A.width = x;
    d_A.stride = x;
    d_A.height = x;
    d_B.width = x;
    d_B.stride = x;
    d_B.height = x;
    d_C.width = x;
    d_C.stride = x;
    d_C.height = x;
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
    cudaMalloc(&d_C.elements, size);
    cudaMemcpy(d_C.elements, C.elements, size, cudaMemcpyHostToDevice);
    //Special Parameters
    int optBlockSize = BLOCK_OPT;
    dim3 optDimGrid(x, x);
    //Execute GPU Kernel and time it
    begin = clock();
    cudaThreadSynchronize();
    MatMulOPT<<<optDimGrid, optBlockSize>>>(d_A, d_B, d_C);
    cudaThreadSynchronize();
    end = clock();
    gpu_t = (float)(end - begin);
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
    //Check the GPU result against the CPU result
    bool passed = true;
    for (int k = 0; k < N; k++)
    {
        //printf("%f ", C.elements[k]);
        if (fabs(C.elements[k] - D.elements[k]) > 1e-6)
        {
            passed = false;
            printf("\nFAIL\n");
            printf("C[%d] = %f, D[%d] = %f\n", k, C.elements[k], k, D.elements[k]);
            break;
        }
    }
    printf("\n");
    if (passed)
        printf("PASSED\n");
    printf("CPU Elapsed Time: %f\n", cpu_t);
    printf("GPU Elapsed Time: %f\n", gpu_t);
    //Clear all GPU memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
    //Clear all CPU memory
    free(A.elements);
    free(B.elements);
    free(C.elements);
    free(D.elements);
}
Upvotes: 1
Views: 1276
Reputation: 1220
- Why the code is returning incorrect results.
In the last step of your reduction (e < 32) you are breaking with your method: there is no longer a __syncthreads() between the individual steps. This leads to race conditions on the same element of result, e.g.
result[e] += result[e + 16];
For e == 0 this means reading result[16], whereas in the same step/at the same time, for e == 16, the operation means writing result[16].
To avoid the race condition you have two options, as sketched below:
- Declare the shared memory as volatile, like in the document you linked (p. 78).
- Continue the if (e < ...) pattern as before, with a __syncthreads() after each step, or transform all the ifs into a loop:
for (int size = blockDim.x / 2; size > 0; size /= 2)
    if (e < size)
        ...
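A minimal sketch of the corrected kernel, combining both fixes (the loop form for the synchronized steps plus a volatile pointer for the final warp); it reuses the identifiers from your code, and the warp-synchronous part assumes the lockstep execution of your compute capability 2.1 hardware:
__global__ void MatMulOPT(Matrix A, Matrix B, Matrix C)
{
    __shared__ float result[BLOCK_OPT];
    int e = threadIdx.x;
    int row = blockIdx.x;
    int col = blockIdx.y;
    result[e] = A.elements[row*A.stride + e] * B.elements[e*B.stride + col];
    __syncthreads();

    // Synchronized tree reduction down to the last warp.
    for (int size = blockDim.x / 2; size > 32; size /= 2)
    {
        if (e < size)
            result[e] += result[e + size];
        __syncthreads();
    }

    // Final warp: threads 0..31 execute in lockstep on this hardware,
    // but the volatile pointer forces the compiler to re-read shared
    // memory instead of caching values in registers.
    if (e < 32)
    {
        volatile float* vsm = result;
        vsm[e] += vsm[e + 32];
        vsm[e] += vsm[e + 16];
        vsm[e] += vsm[e + 8];
        vsm[e] += vsm[e + 4];
        vsm[e] += vsm[e + 2];
        vsm[e] += vsm[e + 1];
    }

    if (e == 0)
        C.elements[row*C.stride + col] = result[0];
}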
- Why it is taking much longer to run than the method using shared memory described in Chapter 3, page 25 of the CUDA C Programming Guide.
Accessing shared memory is much faster than accessing global memory. You are only storing your intermediate result in shared memory, whereas the example you reference stores the parts of the matrices to be read. There this is combined with loop tiling: every thread loads just one element of the whole tile from global memory, but later reads TILE_WIDTH * 2 elements from shared memory.
The higher performance of the example comes directly from the reduced time spent waiting for data to be loaded from global memory. A condensed sketch of that tiled approach follows.
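This is only an illustration adapted to the Matrix struct from your code, not the exact Programming Guide listing; TILE_WIDTH is an assumed tile size, and the matrix dimensions are assumed to be multiples of it:
#define TILE_WIDTH 16

__global__ void MatMulTiled(Matrix A, Matrix B, Matrix C)
{
    __shared__ float As[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];
    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
    float sum = 0.0f;

    // March the tiles across the shared dimension: each thread loads one
    // element of each tile from global memory, then reads TILE_WIDTH * 2
    // values from fast shared memory per tile.
    for (int t = 0; t < A.width / TILE_WIDTH; ++t)
    {
        As[threadIdx.y][threadIdx.x] = A.elements[row * A.stride + t * TILE_WIDTH + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B.elements[(t * TILE_WIDTH + threadIdx.y) * B.stride + col];
        __syncthreads();
        for (int k = 0; k < TILE_WIDTH; ++k)
            sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    C.elements[row * C.stride + col] = sum;
}
Launched with dim3 block(TILE_WIDTH, TILE_WIDTH) and dim3 grid(x / TILE_WIDTH, x / TILE_WIDTH), each block computes one output tile while issuing far fewer global memory loads than one thread per dot-product term.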
Upvotes: 3