AlexLordThorsen
AlexLordThorsen

Reputation: 8498

CUDA memory errors (allocation and copy successful)

I'm currently learning CUDA for the purpose of high performance computing. I have a project which implements Jacobi Iterations. I have a memory error in my program somewhere and I'm having a hard time tracking it down.

My Jacobi kernel runs through one iteration correctly, and now I'm working on calculating the maximum difference between the old matrix and the new matrix. If I comment out the next line of code:

//diff[idx] = BJacobi[idx] - AJacobi[idx];

it works. Including this line of code, however, causes BJacobi's data to be overwritten with part of AJacobi's data (or at least I think it's AJacobi's data; it's nearly the same pattern). It seems like an allocation issue to me, but I'm not sure where it is.

/*
 * One Jacobi relaxation step on a size x size matrix held in a flat array
 * (element (row, col) lives at index row * size + col).
 *
 * Each thread handles one element, idx = blockIdx.x * blockDim.x + threadIdx.x,
 * so the launch must be one-dimensional and supply at least size * size threads.
 * Surplus threads fall through the bounds check and do nothing.
 *
 * diff     not written by this version (see the note at the end of the body)
 * AJacobi  input matrix, size * size floats, read only
 * BJacobi  output matrix, size * size floats
 * bitMask  size * size ints; 0 = recompute this cell from its neighbours,
 *          anything else = copy the cell through unchanged
 * size     number of rows (and columns) of the matrix
 */
__global__
void jacobi(float *diff, float *AJacobi, float *BJacobi, int *bitMask, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;   // float literal: avoids a silent promotion to double
    int count = 0;      // number of in-matrix neighbours accumulated in sum

    if (idx < size * size)
    {
        if (bitMask[idx] == 0)
        {
            // Left neighbour: exists for every cell that is not in column 0.
            // (The old extra test `idx - 1 > 0` wrongly dropped element 0
            // as the left neighbour of element 1.)
            if (idx % size != 0)
            {
                sum += AJacobi[idx - 1];
                count++;
            }
            // Right neighbour: exists unless this cell is in the last column.
            // This also covers the last element, since size * size is a
            // multiple of size.
            if ((idx + 1) % size != 0)
            {
                sum += AJacobi[idx + 1];
                count++;
            }
            // Top neighbour: exists for every cell below row 0.
            // (`idx - size > 0` wrongly dropped element 0 as the top
            // neighbour of element `size`.)
            if (idx >= size)
            {
                sum += AJacobi[idx - size];
                count++;
            }
            // Bottom neighbour: exists for every cell above the last row.
            if (idx + size < size * size)
            {
                sum += AJacobi[idx + size];
                count++;
            }
            // A 1x1 matrix has no neighbours at all; keep the old value
            // instead of computing 0 / 0.
            BJacobi[idx] = (count > 0) ? sum / count : AJacobi[idx];
        }
        else BJacobi[idx] = AJacobi[idx];
    }

    // NOTE: left disabled on purpose. At this position the statement sits
    // outside the `idx < size * size` check, so any thread beyond the end of
    // the matrix would read and write out of bounds. If re-enabled, it has to
    // move inside that check.
    //diff[idx] = BJacobi[idx] - AJacobi[idx];
}

In my main function

// Load the matrix from the file named on the command line; the code below
// treats it as square with matrix.rowSize rows and columns.
readSparceMatrix(argv[1], &matrix);
array_size = matrix.rowSize * matrix.rowSize;

// We want as many or more threads than data; the kernel's own bounds check
// makes the surplus threads do nothing.
dimGrid = array_size / THREADS + 1;
dimBlock = THREADS;

// ---------------------- START ALLOCATION OF DEVICE MEMORY
err = cudaMalloc( (void**)&diff, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
}
err = cudaMalloc( (void**)&AJacobi, array_size * sizeof(float) );
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
}
err = cudaMalloc( (void**)&BJacobi, array_size * sizeof(float) );
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
}
err = cudaMalloc( (void**)&MaxDiffTree, array_size * sizeof(float) );
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
}
err = cudaMalloc( (void**)&bitMask, array_size * sizeof(int) );
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    exit(1);
}


// ---------------------- START INITIALIZATION OF DEVICE MEMORY
// diff starts at 1.0f everywhere. cudaMemset cannot do this: it writes a
// single byte value into every byte, so "1.0" would become the byte 0x01 and
// each float would end up as a tiny denormal-range number, not 1.0f. Build
// the pattern on the host and copy it over instead.
{
    float *hostOnes = (float*)malloc(array_size * sizeof(float));
    if (hostOnes == NULL && array_size > 0) {
        fprintf (stderr, "malloc: out of memory\n");
        exit(1);
    }
    for (size_t i = 0; i < (size_t)array_size; i++) {
        hostOnes[i] = 1.0f;
    }
    err = cudaMemcpy(diff, hostOnes, array_size * sizeof(float), cudaMemcpyHostToDevice);
    free(hostOnes);   // the blocking copy is finished, the staging buffer can go
    if (err != cudaSuccess) {
        fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
        exit(1);
    }
}

// All-zero bytes are exactly 0.0f, so a byte-wise memset is fine here.
err = cudaMemset(BJacobi, 0, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemset: %s\n", cudaGetErrorString(err));
    exit(1);
}

err = cudaMemset(MaxDiffTree, 0, array_size * sizeof(float));
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemset: %s\n", cudaGetErrorString(err));
    exit(1);
}

err = cudaMemcpy(AJacobi, matrix.data, array_size * sizeof(float) ,cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
}

err = cudaMemcpy(bitMask, matrix.mask, array_size * sizeof(int) ,cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
    fprintf (stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    exit(1);
}

// ---------------------- START MAIN JACOBI LOOP
//while(MaxDiff >  delta){

jacobi<<<dimGrid, dimBlock>>>(diff, AJacobi, BJacobi, bitMask,  matrix.rowSize);
// A kernel launch returns nothing; a bad launch configuration only shows up
// through cudaGetLastError().
err = cudaGetLastError();
if (err != cudaSuccess) {
    fprintf (stderr, "jacobi launch: %s\n", cudaGetErrorString(err));
    exit(1);
}

Upvotes: 0

Views: 728

Answers (1)

AlexLordThorsen
AlexLordThorsen

Reputation: 8498

So this is actually a simple error which I've spent quite a while trying to figure out. The problem happened because I have more threads than data. Because of that, some threads have a thread index that is outside the bounds of my array. The first if statement in my code is meant to check for that, but my diff assignment was outside of that index check. Moving the diff statement inside the if check solved my problem.

// Only threads that map onto a real matrix element do any work; everything
// that touches src, dst or diff stays inside this check.
if(idx < size * size){
    if(bitMask[idx] == 0){
        // Left neighbour: present for every cell outside column 0.
        // (An additional `idx - 1 > 0` test would wrongly drop element 0 as
        // the left neighbour of element 1.)
        if(idx % size != 0) {
            sum += src[ idx - 1 ];
            count++;
        }
        // Right neighbour: present unless the cell is in the last column.
        if((idx + 1) % size != 0)
        {
            sum += src[ idx + 1 ];
            count++;
        }
        // Top neighbour: present for every cell below row 0 (`idx - size > 0`
        // would wrongly drop element 0 as the top neighbour of element `size`).
        if(idx >= size)
        {
            sum += src[ idx - size ];
            count++;
        }
        // Bottom neighbour: present for every cell above the last row.
        if(idx + size < size * size)
        {
            sum  += src[ idx + size ];
            count++;
        }
        // A 1x1 matrix has no neighbours; keep the old value rather than
        // dividing 0 by 0.
        dst[idx] = (count > 0) ? sum / count : src[idx];
    }
    else dst[idx] = src[idx];

    // Inside the bounds check, so surplus threads never index past the arrays.
    diff[idx] = dst[idx] - src[idx];
}

Upvotes: 1

Related Questions