Tendoskria
Tendoskria

Reputation: 3

CUDA - Parallel Reduction Sum of Even and Odd Number Separately

I am trying to implement a parallel reduction sum of even and odd number Separately in CUDA. I'm new in CUDA programming and I'm trying so hard but I can't find a solution.

I have for example the array : [5, 8, 0, -6, 2]. And the result need to be [4, 5] (Even : 8+0-6+2=4, Odd : 5=5). But the result of my following code is [8, 5].

I think that my problem is in the notion of "shared" but I do not understand why.

__global__ void sumEvenOdd(int *a, int *b, int N){
    int column = blockIdx.x * blockIdx.x + threadIdx.x;

    __shared__ int s_data[2];

    if (column < N){ 
        if (a[column] % 2 == 0){
            s_data[0] += a[column];
        }
        else{
            s_data[1] += a[column];
        }
        __syncthreads();
        b[0] = s_data[0];
        b[1] = s_data[1];
    }
}

void initArray(int *a, int N){
    for (unsigned int i = 0; i < N; i++){
        a[i] = rand() % 100;
    }
}

void verify_result(int *a, int *b, int N){
    int *verify_b;
    verify_b = (int*)malloc(2 * sizeof(int));
    verify_b[0] = 0;
    verify_b[1] = 0;
    for (unsigned int i = 0; i < N; i++){
        if (a[i] % 2 == 0){
            verify_b[0] += a[i]; 
        }
        else{
            verify_b[1] += a[i];
        }
    }
    for (unsigned int i = 0; i < 2; i++){
        assert(verify_b[i] == b[i]);
    }
}

void printResult(int *a, int *b, int N){
    printf("\n");
    for (unsigned int i = 0; i < N; i++){
        printf("%d, ", a[i]);
    }
    printf("\n");
    for (unsigned int i = 0; i < 2; i++){
        printf("%d, ", b[i]);
    }
}

int main(){
 
    //Array sizes;
    int N = 5;
        
    //Size (in bytes) of matrix
    size_t bytes = N * sizeof(int);

    //Host pointers
    int *a, *b;
    
    // Allocate host memory
    a = (int*)malloc(bytes);
    b = (int*)malloc(2 * sizeof(int));

    // Initialize array
    initArray(a, N);

    // Device pointers
    int *d_a, *d_b;

    // Allocated device memory
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, 2 * sizeof(int));

    // Copy data to the device
    cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);

    //Number of threads
    int THREADS = 128;

    //Number of blocks
    int BLOCKS = (N + THREADS - 1) / THREADS;

    // Launch kernel
    sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
    cudaDeviceSynchronize();

    // Copy back to the host
    cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);

    // Check result
    verify_result(a, b, N);

    printResult(a, b, N);

    return 0;
}

Upvotes: 0

Views: 162

Answers (1)

Ahmed AEK
Ahmed AEK

Reputation: 17616

you cannot just use

s_data[1] += a[column];

remember all units are going to execute this line at the same time, and store in the same position, so all threads are storing into s_data at the same time.

instead you should use atomic add

atomicAdd(&s_data[1], a[column]);

and you should also be initializing s_data to zeros.

Upvotes: 1

Related Questions