Error in the result of matrix multiplication example of CUDA C programming guide

Question

I'm doing the matrix multiplication example from the book CUDA C Programming Guide, page 35, for practice, I copied the code and completed the missing code. I understand the logic of the program and how it should work, but I get no the expected result.

Here is the complete code i made, I do not know if the error is mine or from the example?

The code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include     
#include 
#include 

using namespace std;
#define BLOCK_SIZE 16

typedef struct
{
    int width;
    int height;
    float *elements;
}Matrix;

__global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

void MatMul(const Matrix A,const Matrix B, Matrix C) 
{
    size_t size;
    //Matrix A creation y storage in device memory 
    Matrix d_A;
    d_A.width=A.width;
    d_A.height=A.height;
    size=A.height*A.width*sizeof(float);
    cudaMalloc(&d_A.elements,size);
    cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
    //Matrix B creation y storage in device memory 
    Matrix d_B;
    d_B.width=B.width;
    d_B.height=B.height;
    size=B.height*B.width*sizeof(float);
    cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory         
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //        
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.        
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);  
    //edit the missing code.
    // for(int i=0;i



The program compiles and runs but the result matrix C.elements from:     cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
  is a random number. I've tried to use it like a pointer to a array but i don't get anything from it and treating it like array does not work either.

I will be glad if anyone can help me to finish this.

lashgar · Accepted Answer

Your code has minor miss match between array indexing in kernel and initialization on CPU. Here is the corrected code with debugging suggested by @harrism:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include 
    #include 
    #include 

    using namespace std;
    #define BLOCK_SIZE 16

    typedef struct
    {
        int width;
        int height;
        float *elements;
    }Matrix;

    __global__ void MatMulKernel(const Matrix,const Matrix, Matrix C);

    void MatMul(const Matrix A,const Matrix B, Matrix C)
    {
        size_t size;
        //Matrix A creation y storage in device memory
        Matrix d_A;
        d_A.width=A.width;
        d_A.height=A.height;
        size=A.height*A.width*sizeof(float);
        cudaMalloc(&d_A.elements,size);
        cudaMemcpy(d_A.elements,A.elements,size,cudaMemcpyHostToDevice);
        //Matrix B creation y storage in device memory
        Matrix d_B;
        d_B.width=B.width;
        d_B.height=B.height;
        size=B.height*B.width*sizeof(float);
        cudaMalloc(&d_B.elements,size);
    cudaMemcpy(d_B.elements,B.elements,size,cudaMemcpyHostToDevice);
    //Matrix C creation y storage in device memory
    Matrix d_C;
    d_C.width=C.width;
    d_C.height=C.height;
    //cudaMalloc(&d_C,sizeof(Matrix));
    //cudaMemcpy(d_C,C,sizeof(Matrix),cudaMemcpyHostToDevice);
    size=C.height*C.width*sizeof(float);
    cudaMalloc(&d_C.elements,size);
    //
    dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
    dim3 dimGrid(B.width/dimBlock.x,A.height/dimBlock.y);
    MatMulKernel<<>>(d_A,d_B,d_C);
    //Copy the result in the matrix C from the device to the host.
    printf("error code: %s
",cudaGetErrorString(cudaGetLastError()));
    cudaMemcpy(C.elements,d_C.elements,size,cudaMemcpyDeviceToHost);
    //
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
        //printf("%d
",threadIdx.x);
    float Cvalue=0;
    int row=blockIdx.y*blockDim.y+threadIdx.y;
    int col=blockIdx.x*blockDim.x+threadIdx.x;
    for(int e=0;e



Check the output. If you see the results are wrong, check the kernel error on your system which is reported in output.

Error in the result of matrix multiplication example of CUDA C programming guide

Answers (2)

Related Questions