Reputation: 15871
I need to create an OpenCL kernel function which uses parallel algorithm to sum n
integers from an array numbers
.
I should use an algorithm similar to the following:
parallel_summation(A):
# ASSUME n = |A| is a power of 2 for simplicity
# level 0:
in parallel do:
s[i] = A[i] for i = 0, 1, 2, ..., n-1
# level 1:
in parallel do:
s[i] = s[i] + s[i+1] for i = 0, 2, 4, ...
# level 2:
in parallel do:
s[i] = s[i] + s[i+2] for i = 0, 4, 8, ...
# level 3:
in parallel do:
s[i] = s[i] + s[i+4] for i = 0, 8, 16, ...
# ...
# level log_2( n ):
s[0] = s[0] + s[n/2]
return s[0]
So, I came up with the following kernel code:
kernel void summation(global uint* numbers,
global uint* sum,
const uint n,
const uint work_group_size,
local uint* work_group_buf,
const uint num_of_levels) {
// lets assume for now that the workgroup's size is 16,
// which is a power of 2.
int i = get_global_id(0);
if(i >= n)
return;
int local_i = get_local_id(0);
uint step = 1;
uint offset = 0;
for(uint k = 0; k < num_of_levels; ++k) {
if(k == 0) {
work_group_buf[local_i] = numbers[i];
} else {
if(local_i % step == 0) {
work_group_buf[local_i] += work_group_buf[local_i + offset];
}
}
if(offset == 0) {
offset = 1;
} else {
offset *= 2;
}
step *= 2;
barrier(CLK_LOCAL_MEM_FENCE);
}
atomic_add(sum, work_group_buf[0]);
}
But there's a bug because I'm not receiving the expected results. numbers
is a buffer that contains numbers from 1
to n
. num_of_levels
is log2(number of work items per work group), which in my current example is 4 (log2(16)).
What am I doing wrong?
Note: I'm not receiving any error, is just the result which is wrong. For example, I've an array of 1000000 elements from 0 to 999999, and the sum of those elements should be 1783293664, but I'm getting 1349447424.
Upvotes: 3
Views: 1106
Reputation: 349
You may do it simpler. It is very fast but work on OpenCL 1.2+ only.
inline void sum(__global int* a, int v)
{
int s = +1 * v;
int n = 0;
int o = 0;
do {
n = s + atom_xchg(a, o);
s = o + atom_xchg(a, n);
}
while (s != o);
}
__kernel void sum_kernel(__global int *set, __global int* out)
{
int i = (get_group_id(0) + get_group_id(1)*get_num_groups(0)) * get_local_size(0) + get_local_id(0);
sum(out, set[i]);
}
From: GitHub - Hello GPU Compute World!
Thanks!
Upvotes: 0
Reputation: 15871
I fixed a few bugs. There were a few mistakes and I was missing this part s[0] = s[0] + s[n/2]
, as you can see from this new version.
kernel void summation(global uint* numbers,
global uint* sum,
const uint n,
local uint* work_group_buf,
const uint num_of_levels) {
const int i = get_global_id(0);
const int local_i = get_local_id(0);
private uint step = 2;
private uint offset = 1;
if(i < n)
work_group_buf[local_i] = numbers[i];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint k = 1; k < num_of_levels; ++k) {
if((local_i % step) == 0) {
work_group_buf[local_i] += work_group_buf[local_i + offset];
}
offset *= 2;
step *= 2;
barrier(CLK_LOCAL_MEM_FENCE);
}
work_group_buf[0] += work_group_buf[get_local_size(0) / 2];
if(local_i == 0)
atomic_add(sum, work_group_buf[0]);
}
Note that now I'm adding to the final sum
just the first element of each work_group_buf
(i.e. work_group_buf[0]
) only if the local_i == 0
, because that position will contain the sum of all elements in the workgroup.
This actually seems to work for workgroups of size up to 32 (which are a power of 2). In other words, this kernel seems to work only for workgroups of size 2, 4, 8, 16 and 32 work items.
Upvotes: 2