Reputation: 839

Multiply Rectangular Matrices in CUDA

In this homework i need to complete the code to multiply two rectangle matrices using CUDA C. After I completed the code, I submitted and the solution was correct for the data set when the matrices were square, while the result wasn't matching the expected value when the matrices were not square.

Here is the code after I added the missing parts:

#include    <wb.h>

#define wbCheck(stmt) do {                             \
    cudaError_t err = stmt;                            \
    if (err != cudaSuccess) {                          \
        wbLog(ERROR, "Failed to run stmt ", #stmt);    \
        return -1;                                     \
    }                                                  \
} while(0)

// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
               int numARows, int numAColumns,
               int numBRows, int numBColumns,
               int numCRows, int numCColumns) {
   //@@ Insert code to implement matrix multiplication here
   int Row = blockIdx.y * blockDim.y + threadIdx.y;
   int Col = blockIdx.x * blockDim.x + threadIdx.x;
   if (numAColumns != numBRows) return ;
   if ((Row < numARows) && (Col < numBColumns)){
       float Cvalue = 0;
       for (int k = 0 ; k < numAColumns ; ++k )
       Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];
       C[Row*numAColumns + Col] = Cvalue;
     }

    }



int main(int argc, char ** argv) {
   wbArg_t args;
   float * hostA; // The A matrix
   float * hostB; // The B matrix
   float * hostC; // The output C matrix
   float * deviceA;
   float * deviceB;
   float * deviceC;
   int numARows; // number of rows in the matrix A
   int numAColumns; // number of columns in the matrix A
   int numBRows; // number of rows in the matrix B
   int numBColumns; // number of columns in the matrix B
   int numCRows; // number of rows in the matrix C (you have to set this)
   int numCColumns; // number of columns in the matrix C (you have to set this)

   args = wbArg_read(argc, argv);

   wbTime_start(Generic, "Importing data and creating memory on host");
   hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
   hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
   //@@ Set numCRows and numCColumns  
   numCRows = 0;
   numCColumns = 0;
   numCRows = numARows;
   numCColumns = numBColumns;  
   //@@ Allocate the hostC matrix
   hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);  
   wbTime_stop(Generic, "Importing data and creating memory on host");

   wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
   wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

   wbTime_start(GPU, "Allocating GPU memory.");
   //@@ Allocate GPU memory here
   cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
   cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
   cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);  

   wbTime_stop(GPU, "Allocating GPU memory.");

   wbTime_start(GPU, "Copying input memory to the GPU.");
   //@@ Copy memory to the GPU here

   cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
   cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
   wbTime_stop(GPU, "Copying input memory to the GPU.");

   //@@ Initialize the grid and block dimensions here

   dim3 DimGrid(numARows / 8 , numBColumns / 8, 1);
   dim3 DimBlock(8 , 8, 1);

   wbTime_start(Compute, "Performing CUDA computation");

   //@@ Launch the GPU Kernel here
   matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);  

   cudaThreadSynchronize();
   wbTime_stop(Compute, "Performing CUDA computation");

   wbTime_start(Copy, "Copying output memory to the CPU");
   //@@ Copy the GPU memory back to the CPU here
   cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);  

   wbTime_stop(Copy, "Copying output memory to the CPU");

   wbTime_start(GPU, "Freeing GPU Memory");
   //@@ Free the GPU memory here

   cudaFree(deviceA);
   cudaFree(deviceB);
   cudaFree(deviceC);
   wbTime_stop(GPU, "Freeing GPU Memory");

   wbSolution(args, hostC, numCRows, numCColumns);

   free(hostA);
   free(hostB);
   free(hostC);

   return 0;
}

I hope you can help me to find which part is incorrect.

Upvotes: 6

Answers (5)

mzn.rft

Reputation: 839

we can use tiled matrix multiplication and i found it has a better execution time .

#include    <wb.h>

#define wbCheck(stmt) do {                                 \
        cudaError_t err = stmt;                            \
        if (err != cudaSuccess) {                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);    \
            return -1;                                     \
        }                                                  \
    } while(0)

// Compute C = A * B
__global__ void matrixMultiplyShared(float * A, float * B, float * C,
                             int numARows, int numAColumns,
                             int numBRows, int numBColumns,
                             int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    //@@ You have to use shared memory for this MP
    const int TILE_WIDTH = 32;
     __shared__ float sharedA[TILE_WIDTH][TILE_WIDTH];
     __shared__ float sharedB[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int Row = by*TILE_WIDTH + ty;
    int Col = bx*TILE_WIDTH + tx;
    float Cvalue = 0.0; 
    if (numAColumns != numBRows) return ;
    for (int i = 0; i < (int)(ceil((float)numAColumns/TILE_WIDTH)); i++)
    {

      if (i*TILE_WIDTH + tx < numAColumns && Row < numARows){
            sharedA[ty][tx] = A[Row*numAColumns + i*TILE_WIDTH + tx];
      }else{
            sharedA[ty][tx] = 0.0;
      }

      if (i*TILE_WIDTH + ty < numBRows && Col < numBColumns){
            sharedB[ty][tx] = B[(i*TILE_WIDTH + ty)*numBColumns + Col];
      }else{
            sharedB[ty][tx] = 0.0;
      }
       __syncthreads();
         if(Row < numARows && Col < numBColumns){

            for(int j = 0; j < TILE_WIDTH; j++)
            Cvalue += sharedA[ty][j] * sharedB[j][tx];
      }

        __syncthreads();
    }

    if (Row < numCRows && Col < numCColumns)
        C[Row*numCColumns + Col] = Cvalue;
}    




int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)
    int TILE_WIDTH = 32;

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    numCRows = 0;
    numCColumns = 0;
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);  
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    cudaMalloc((void**)&deviceA , sizeof(float)*numARows*numAColumns );
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);  

    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns,    cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);  

    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here
    int dimX = (int)(ceil((float)numCColumns / TILE_WIDTH));  
    int dimY = (int)(ceil((float)numCRows / TILE_WIDTH));
    dim3 DimGrid(dimX, dimY);
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH);



    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiplyShared<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);  

    cudaThreadSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);  

    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);  

    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

Upvotes: 1

mzn.rft

Reputation: 839

After the help of Ira, Ahmad, ram, and Oli Fly, I got the correct answer as follows:

#include    <wb.h>

#define wbCheck(stmt) do {                                 \
        cudaError_t err = stmt;                            \
        if (err != cudaSuccess) {                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);    \
            return -1;                                     \
        }                                                  \
    } while(0)

// Compute C = A * B
__global__ void matrixMultiply(float * A, float * B, float * C,
                   int numARows, int numAColumns,
                   int numBRows, int numBColumns,
                   int numCRows, int numCColumns) {
    //@@ Insert code to implement matrix multiplication here
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if (numAColumns != numBRows) return;
    if ((Row < numARows) && (Col < numBColumns)){
    float Cvalue = 0;
    for (int k = 0; k < numAColumns; ++k)
    Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];
    C[Row*numCColumns + Col] = Cvalue;
  }

}

int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns  
    numCRows = 0;
    numCColumns = 0;
    numCRows = numARows;
    numCColumns = numBColumns;  
    //@@ Allocate the hostC matrix
    hostC = (float*) malloc(sizeof(float)*numCRows*numCColumns);  
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    cudaMalloc((void**)&deviceA ,sizeof(float)*numARows*numAColumns );
    cudaMalloc((void**)&deviceB , sizeof(float)*numBRows*numBColumns);
    cudaMalloc((void**)&deviceC , sizeof(float)*numCRows*numCColumns);  

    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here

    cudaMemcpy(deviceA, hostA, sizeof(float)*numARows*numAColumns, cudaMemcpyHostToDevice);
    cudaMemcpy(deviceB, hostB, sizeof(float)*numBRows*numBColumns, cudaMemcpyHostToDevice);
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    //@@ Initialize the grid and block dimensions here

    dim3 DimGrid((numCColumns - 1) / 8 + 1, (numCRows - 1) / 8 + 1, 1);
    dim3 DimBlock(8 , 8, 1);

    wbTime_start(Compute, "Performing CUDA computation");

    //@@ Launch the GPU Kernel here
    matrixMultiply<<<DimGrid , DimBlock>>>(deviceA , deviceB , deviceC , numARows , numAColumns, numBRows ,numBColumns , numCRows , numCColumns);  

    cudaThreadSynchronize();
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    cudaMemcpy(hostC, deviceC, sizeof(float)*numCRows*numCColumns , cudaMemcpyDeviceToHost);  

    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here

    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}

Upvotes: 4

ram

Reputation: 21

replace

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

for

Cvalue += A[Row*numAColumns +k]* B[k*numBColumns+Col];

and

C[Row*numAColumns + Col] = Cvalue;

for

C[Row*numCColumns+Col] = Cvalue;

Upvotes: 2

Ira

Reputation: 31

Replace : for (int k = 0 ; k < numAColumns ; ++k ) Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col]; C[Row*numAColumns + Col] = Cvalue; }

with for (int k = 0 ; k < numAColumns ; ++k ) Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col]; C[Row*numCColumns + Col] = Cvalue; }

Upvotes: 3

lashgar

Reputation: 5470

Replace :

Cvalue += A[Row*numAColumns + k] * B[k * numBRows + Col];

with

Cvalue += A[Row*numAColumns + k] * B[k * numBColumns + Col];

Upvotes: 2

Multiply Rectangular Matrices in CUDA

Answers (5)

Related Questions