zeeck
zeeck

Reputation: 53

Error in the result of matrix multiplication example of CUDA C programming guide

I'm doing the matrix multiplication example from the book CUDA C Programming Guide, page 35, for practice, I copied the code and completed the missing code. I understand the logic of the program and how it should work, but I get no the expected result.

Here is the complete code i made, I do not know if the error is mine or from the example?

The code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>    
#include <stdio.h>
#include <stdio.h>

using namespace std;
#define BLOCK_SIZE 16

typedef struct
{
    int width;
    int height;
    float *elements;
}Matrix;

__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

void MatMul(const Matrix A,const Matrix B, Matrix C) 
{
    size_t size;
    //Matrix A creation y storage in device memory 
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    size=A.height*A.width*sizeof(float);
    cudaMalloc(&d_A.elements,size);
    cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
    //Matrix B creation y storage in device memory 
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    size=B.height*B.width*sizeof(float);
    cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory         
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //        
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.        
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);  
    //edit the missing code.
    // for(int i=0;i<BLOCK_SIZE*BLOCK_SIZE;i++){cout<<C.elements[i]<<endl;}      
    // result in random numbers
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    float Cvalue=0;
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    for(int e=0;e<A.width;++e)
    {
        Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
    }
    C.elements[row*C.width+col]=Cvalue;
}

int main()
{
    cout<<"Matrices"<<endl;
    //Declarationd of the A,B,C matrix´s
    float a[15][15];        
    float b[15][15];
    float c[15][15];
    //Fill the matrix whit some numbers.
    int cont0=0;
    for(int c=0;c<15;c++)
    {
        for(int v=0;v<15;v++)
        {
            a[v][c]=cont0;
            b[v][c]=cont0;
            cont0++;
        }
    }
    //Flatten the matrix for the passing to the kernel
    int offset=0;
    float a_t[256];
    float b_t[256];
    for(int y=0;y<15;y++)
    {                        
        for(int x=0;x<15;x++)
        {
            a_t[x+offset]=a[x][y];
            b_t[x+offset]=a[x][y];
        }
        offset=offset+15;
    }
    float t_C[256];
    //Completing the matrix format for the kernel.
    Matrix m_A;
    m_A.height=15;
    m_A.width=15;
    m_A.elements=a_t;
    Matrix m_B;
    m_B.height=15;
    m_B.width=15;
    m_B.elements=b_t;
    Matrix m_C;
    m_C.height=15;
    m_C.width=15;
    m_C.elements=t_C;
    //Passing the formated matrix to the kernel.
    MatMul(m_A,m_B,m_C);                
    cout<<"Final"<<endl;        
return 0;
}

The program compiles and runs but the result matrix C.elements from: cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost); is a random number. I've tried to use it like a pointer to a array but i don't get anything from it and treating it like array does not work either.

I will be glad if anyone can help me to finish this.

Upvotes: 1

Views: 948

Answers (2)

lashgar
lashgar

Reputation: 5430

Your code has minor miss match between array indexing in kernel and initialization on CPU. Here is the corrected code with debugging suggested by @harrism:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <stdio.h>
    #include <stdio.h>

    using namespace std;
    #define BLOCK_SIZE 16

    typedef struct
    {
        int width;
        int height;
        float *elements;
    }Matrix;

    __global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

    void MatMul(const Matrix A,const Matrix B, Matrix C)
    {
        size_t size;
        //Matrix A creation y storage in device memory
        Matrix d_A;
        d_A.width=A.width;
        d_A.height=A.height;
        size=A.height*A.width*sizeof(float);
        cudaMalloc(&d_A.elements,size);
        cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
        //Matrix B creation y storage in device memory
        Matrix d_B;
        d_B.width=B.width;
        d_B.height=B.height;
        size=B.height*B.width*sizeof(float);
        cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    //cudaMalloc(&d_C,sizeof(Matrix));
    //cudaMemcpy(d_C,C,sizeof(Matrix),cudaMemcpyHostToDevice);
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.
    printf("error code: %s\n",cudaGetErrorString(cudaGetLastError()));
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
    //
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
        //printf("%d\n",threadIdx.x);
    float Cvalue=0;
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    for(int e=0;e<A.width;++e)
    {
        Cvalue+=A.elements[row*A.width+e]*B.elements[e*B.width+col];
    }
    C.elements[row*C.width+col]=Cvalue;
}

int print_matrix(Matrix A){
        printf("Matrix:\n");
        int i;
        for(i=0; i<A.width*A.height; i++){
                if(i%A.width==0) printf("\n");
                printf("%6.4f\t",A.elements[i]);
        }
        printf("\n");
}
int main()
{
    cout<<"Matrices"<<endl;
    //Declarationd of the A,B,C matrix.s
    float a[BLOCK_SIZE][BLOCK_SIZE];
    float b[BLOCK_SIZE][BLOCK_SIZE];
    float c[BLOCK_SIZE][BLOCK_SIZE];
    //Fill the matrix whit some numbers.
    int cont0=0;
    for(int c=0;c<BLOCK_SIZE;c++)
    {
        for(int v=0;v<BLOCK_SIZE;v++)
        {
            a[v][c]=cont0;
            b[v][c]=cont0;
            cont0++;
        }
    }
    //Flatten the matrix for the passing to the kernel
    int offset=0;
    float a_t[BLOCK_SIZE*BLOCK_SIZE];
    float b_t[BLOCK_SIZE*BLOCK_SIZE];
    for(int y=0;y<BLOCK_SIZE;y++)
    {
        for(int x=0;x<BLOCK_SIZE;x++)
        {
            a_t[x+offset]=a[x][y];
            b_t[x+offset]=a[x][y];
        }
        offset=offset+BLOCK_SIZE;
    }
    float t_C[BLOCK_SIZE*BLOCK_SIZE];
    //Completing the matrix format for the kernel.
    Matrix m_A;
    m_A.height=BLOCK_SIZE;
    m_A.width=BLOCK_SIZE;
    m_A.elements=a_t;
    Matrix m_B;
    m_B.height=BLOCK_SIZE;
    m_B.width=BLOCK_SIZE;
    m_B.elements=b_t;
    Matrix m_C;
    m_C.height=BLOCK_SIZE;
    m_C.width=BLOCK_SIZE;
    m_C.elements=t_C;
    //Passing the formated matrix to the kernel.
    print_matrix(m_A);
    print_matrix(m_B);
    MatMul(m_A,m_B,m_C);
    print_matrix(m_C);
    cout<<"Final"<<endl;
return 0;
}

Check the output. If you see the results are wrong, check the kernel error on your system which is reported in output.

Upvotes: 3

Tom
Tom

Reputation: 21108

Firstly, see here for how to get useful answers to your questions. In particular, you should always check the return value of your CUDA API calls and kernel launches. Also, running cuda-memcheck can often be very helpful to detect out-of-bounds accesses like this.

@harrism asked how you know the result is wrong since you don't appear to do anything with it.

But more importantly you have 15x15 matrices being computed with a 16x16 threadblock, but you're not taking care to disable the out-of-bounds threads. Since you're trying to create a simple example, just increase the matrix size to 16x16 - if you want to handle odd sizes then you'll need to implement the control logic (or use cuBLAS!).

Upvotes: 1

Related Questions