Calvin Liu

Reputation: 9

Trouble with matrix multiplication in CUDA

I ran into trouble with matrix multiplication in CUDA C. After checking for a long time, I found the problem was that I had used `dim3` incorrectly. After correcting it I got the right result, but when I increased the dimension of the matrix the answer was wrong again. I also can no longer use Nsight to start CUDA debugging, even though it worked fine before I increased the dimension.

Kernel code:

__global__ void multiKernal(float* Md, float* Nd, float* Pd, int width)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float Pvalue = 0;

    for (int k = 0; k < width; ++k) {
        Pvalue += Md[row * width + k] * Nd[col + width * k];
    }
    Pd[row * width + col] = Pvalue;
    return;
}

Host code:

void matrixmutiplacation(float* hostM, float* hostN, float* hostP, int width)
{
    int size = width*width*sizeof(float);
    float* Md; float* Nd; float* Pd;
    dim3 dimGrid(4, 4, 1);
    dim3 dimBlock(128, 128, 1);
    cudaError_t error;
    cudaEvent_t start;
    error = cudaEventCreate(&start);
    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    cudaMalloc((void**)&Md, size);
    cudaMalloc((void**)&Nd, size);
    cudaMalloc((void**)&Pd, size);
    cudaMemcpy(Md, hostM, size, cudaMemcpyHostToDevice);
    cudaMemcpy(Nd, hostN, size, cudaMemcpyHostToDevice);
    cudaMemcpy(Pd, hostP, size, cudaMemcpyHostToDevice);

    error = cudaEventRecord(start, NULL);
    multiKernal<<<dimGrid, dimBlock>>>(Md, Nd, Pd, width);
    error = cudaEventRecord(stop, NULL);
    error = cudaEventSynchronize(stop);
    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);
    float msecPerMatrixMul = msecTotal;
    printf("running time:%.3f msec", msecPerMatrixMul);

    cudaMemcpy(hostP, Pd, size, cudaMemcpyDeviceToHost);
    cudaFree(Md); cudaFree(Nd); cudaFree(Pd);
    return;
}

main:

int main()
{
    int M = 512 * 512;
    int N = 512 * 512;
    int P = 512 * 512;
    int width = 512;
    int c[512];
    float* hostM = (float*)malloc(sizeof(float)*M);
    float* hostN = (float*)malloc(sizeof(float)*N);
    float* hostP = (float*)malloc(sizeof(float)*P);

    for (int i = 0; i < P; ++i)
        hostP[i] = 0;

    for (int i = 0; i < width; i++)
        c[i] = i + 1;

    for (int i = 0; i < width; i++) {
        for (int j = 0; j < width; j++) {
            hostM[i*width + j] = c[j] + i;
            hostN[i*width + j] = c[j] + i;
        }
    }

    matrixmutiplacation(hostM, hostN, hostP, width);

    //for (int i = 0; i <width; i++){
    //for (int j = 0; j <width; j++){
    //  printf("%f\t", hostP[i*width + j]);
    //}
    //  printf("\n");
    //}

    free(hostM);
    free(hostN);
    free(hostP);

    return 0;

}

Upvotes: 0

Views: 161

Answers (1)

Maxim Milakov

Reputation: 221

Your thread block size is 128 x 128 x 1 = 16,384 threads, but the maximum thread block size is 1024, so the kernel never runs. Try running the app under cuda-memcheck; it will likely tell you the launch failed. Checking the result codes that CUDA Runtime API functions return is good practice as well.
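For example (a minimal sketch, not your exact code): with width = 512 you can keep the same total number of threads by using a 16 x 16 block and a 32 x 32 grid, and check the launch result explicitly. The CHECK macro below is just an illustrative helper, not part of the CUDA API.

    // Minimal sketch: a legal launch configuration for width = 512,
    // plus explicit error checking after the kernel launch.
    // CHECK is an illustrative helper macro, not a CUDA API call.
    #include <cstdio>
    #include <cuda_runtime.h>

    #define CHECK(call)                                                  \
        do {                                                             \
            cudaError_t err = (call);                                    \
            if (err != cudaSuccess) {                                    \
                printf("CUDA error %s at %s:%d\n",                       \
                       cudaGetErrorString(err), __FILE__, __LINE__);     \
            }                                                            \
        } while (0)

    // ... same multiKernal kernel as in the question ...

    void launchExample(float* Md, float* Nd, float* Pd, int width)
    {
        // 16 x 16 = 256 threads per block, well under the 1024 limit.
        dim3 dimBlock(16, 16, 1);
        // Enough blocks to cover width x width elements (512 / 16 = 32).
        dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                     (width + dimBlock.y - 1) / dimBlock.y, 1);

        multiKernal<<<dimGrid, dimBlock>>>(Md, Nd, Pd, width);
        CHECK(cudaGetLastError());      // catches invalid launch configurations
        CHECK(cudaDeviceSynchronize()); // catches errors raised during execution
    }

With a check like this after the launch, an oversized block shows up immediately as cudaErrorInvalidConfiguration instead of silently producing a wrong result.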

Upvotes: 4
