Reputation: 3
I am trying to implement a parallel reduction sum of even and odd number Separately in CUDA. I'm new in CUDA programming and I'm trying so hard but I can't find a solution.
I have for example the array : [5, 8, 0, -6, 2]. And the result need to be [4, 5] (Even : 8+0-6+2=4, Odd : 5=5). But the result of my following code is [8, 5].
I think that my problem is in the notion of "shared" but I do not understand why.
__global__ void sumEvenOdd(int *a, int *b, int N){
int column = blockIdx.x * blockIdx.x + threadIdx.x;
__shared__ int s_data[2];
if (column < N){
if (a[column] % 2 == 0){
s_data[0] += a[column];
}
else{
s_data[1] += a[column];
}
__syncthreads();
b[0] = s_data[0];
b[1] = s_data[1];
}
}
void initArray(int *a, int N){
for (unsigned int i = 0; i < N; i++){
a[i] = rand() % 100;
}
}
void verify_result(int *a, int *b, int N){
int *verify_b;
verify_b = (int*)malloc(2 * sizeof(int));
verify_b[0] = 0;
verify_b[1] = 0;
for (unsigned int i = 0; i < N; i++){
if (a[i] % 2 == 0){
verify_b[0] += a[i];
}
else{
verify_b[1] += a[i];
}
}
for (unsigned int i = 0; i < 2; i++){
assert(verify_b[i] == b[i]);
}
}
void printResult(int *a, int *b, int N){
printf("\n");
for (unsigned int i = 0; i < N; i++){
printf("%d, ", a[i]);
}
printf("\n");
for (unsigned int i = 0; i < 2; i++){
printf("%d, ", b[i]);
}
}
int main(){
//Array sizes;
int N = 5;
//Size (in bytes) of matrix
size_t bytes = N * sizeof(int);
//Host pointers
int *a, *b;
// Allocate host memory
a = (int*)malloc(bytes);
b = (int*)malloc(2 * sizeof(int));
// Initialize array
initArray(a, N);
// Device pointers
int *d_a, *d_b;
// Allocated device memory
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, 2 * sizeof(int));
// Copy data to the device
cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
//Number of threads
int THREADS = 128;
//Number of blocks
int BLOCKS = (N + THREADS - 1) / THREADS;
// Launch kernel
sumEvenOdd<<<BLOCKS, THREADS>>>(d_a, d_b, N);
cudaDeviceSynchronize();
// Copy back to the host
cudaMemcpy(b, d_b, 2 * sizeof(int), cudaMemcpyDeviceToHost);
// Check result
verify_result(a, b, N);
printResult(a, b, N);
return 0;
}
Upvotes: 0
Views: 162
Reputation: 17616
you cannot just use
s_data[1] += a[column];
remember all units are going to execute this line at the same time, and store in the same position, so all threads are storing into s_data at the same time.
instead you should use atomic add
atomicAdd(&s_data[1], a[column]);
and you should also be initializing s_data to zeros.
Upvotes: 1