Reputation: 679
I am trying to explore CUBLAS library and hence wrote a code for matrix multiplication using its APIs. But I am getting weird output. I am pasting the code and output below. Please help me.
#include<cublas.h>
// Thread block size
#define BLOCK_SIZE 3
#define WA 3 // Matrix A width
#define HA 3 // Matrix A height
#define WB 3 // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
// Allocates a matrix with random float entries.
void randomInit(float* data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = i;
}
/////////////////////////////////////////////////////////
// Program main
/////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
// 1. allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
cublasStatus_t status;
// 2. initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// 3. print out A and B
printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++)
{
printf("%f ", h_A[i]);
if(((i + 1) % WA) == 0)
printf("\n");
}
printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++)
{
printf("%f ", h_B[i]);
if(((i + 1) % WB) == 0)
printf("\n");
}
// 8. allocate device memory
float* d_A;
float* d_B;
cudaMalloc((void**) &d_A, mem_size_A);
cudaMalloc((void**) &d_B, mem_size_B);
// 9. copy host memory to device
status = cublasSetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float), h_A, BLOCK_SIZE,d_A, BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
status = cublasSetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float), h_B, BLOCK_SIZE,d_B, BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
// 4. allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
// 10. allocate device memory for the result
float* d_C;
cudaMalloc((void**) &d_C, mem_size_C);
// 5. perform the calculation
cublasSgemm('N','N',BLOCK_SIZE,BLOCK_SIZE,BLOCK_SIZE,1.0f,d_A,BLOCK_SIZE,d_B,BLOCK_SIZE,1.0f,d_C,BLOCK_SIZE);
status = cublasGetError();
if (status) {
fprintf (stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
// 11. copy result from device to host
status = cublasGetMatrix(BLOCK_SIZE,BLOCK_SIZE,sizeof(float),d_C, BLOCK_SIZE,h_C,BLOCK_SIZE);
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
// 6. print out the results
printf("\n\nMatrix C (Results)\n");
for(int i = 0; i < size_C; i++)
{
printf("%f ", h_C[i]);
if(((i + 1) % WC) == 0)
printf("\n");
}
printf("\n");
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
---------Output-------------
Matrix A
0.000000 1.000000 2.000000
3.000000 4.000000 5.000000
6.000000 7.000000 8.000000
Matrix B
0.000000 1.000000 2.000000
3.000000 4.000000 5.000000
6.000000 7.000000 8.000000
Matrix C (Results)
-1998397155538108416.000000 -1998397155538108416.000000 -1998397155538108416.000000
-1998397155538108416.000000 -1998397155538108416.000000 -1998397155538108416.000000
-1998397155538108416.000000 -1998397155538108416.000000 -1998397155538108416.000000
Upvotes: 1
Views: 1054
Reputation: 72349
Your problem is that you are using uninitialised memory in the sgemm call. cublas_sgemm()
, like all BLAS gemm operations computes
C = alpha * op(A) * op(B) + beta * C
In your code, you are passing op(A)=A
, op(B)=B
, alpha=1.
, and beta=1.
. But you have never set the values of C
to anything, the memory in the GPU is uninitialised and can contain random values, giving the corrupted result your are seeing. Change your function call to this:
cublasSgemm('N','N',BLOCK_SIZE,BLOCK_SIZE,BLOCK_SIZE,1.0f,d_A,
BLOCK_SIZE,d_B,BLOCK_SIZE,0.f,d_C,BLOCK_SIZE);
which computes
C = 1.0 * A * B + 0. * C
and you should get a more reasonable output. Once you get it producing that output, please keep in find that CUBLAS assumes matrices are stored in column major order, so the correct printed output for the inputs you printed out should be
15 18 21
42 54 66
69 90 111
Upvotes: 3