Reputation: 792
I'm trying to modify the Intel's Bitonic Sorting algorithm which sorts an array of cl_int
s, to sort an array of cl_int2
s (based on the key – i.e. cl_int2.x
).
The Intel's example consists of a simple host code and one OpenCL kernel which is called multiple times during one sorting operation (multipass).
The kernel loads 4 array items at once as cl_int4
and operates on them.
I didn't modify the host code algorithm, only the device code. List of changes in the kernel function:
int4*
to int8*
(to load four key-value pairs).even
components of the theArray
's elements to compare values (<
)pseudomask
" (int4
) and based on that, create mask
as pseudomask.xxyyzzww
(to capture the values)Although the output of my modified kernel is perfectly sorted cl_int2
array by the first component (cl_int2.x
), the values (cl_int2.y
) are incorrect – the value of one item is repeated for the next 4 or 8 items and then new value is used and repeated...
I'm sure there's a trivial mistake, but I'm unable to find it.
Diff of the original Intel code and my modified version.
cl_int2
array is sorted flawlessly when each key (cl_int2.x
) is unique.Example input: http://pastebin.com/92qB1csT
Example output: http://pastebin.com/dsU97Npn
(Properly sorted array: http://pastebin.com/Nb56BuQK)
The modified kernel code (commented):
// Copyright (c) 2009-2011 Intel Corporation
// https://software.intel.com/en-us/articles/bitonic-sorting
// Modified to sort int2 key-value array
__kernel void BitonicSort(__global int8* theArray,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
int8 srcLeft, srcRight, mask;
int4 pseudomask;
int4 imask10 = (int4)(0, 0, -1, -1);
int4 imask11 = (int4)(0, -1, 0, -1);
if(stage > 0)
{
if(passOfStage > 0) // upper level pass, exchange between two fours,
{
size_t r = 1 << (passOfStage - 1);
size_t lmask = r - 1;
size_t left = ((i>>(passOfStage-1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
pseudomask = srcLeft.even < srcRight.even;
mask = pseudomask.xxyyzzww;
int8 imin = (srcLeft & mask) | (srcRight & ~mask);
int8 imax = (srcLeft & ~mask) | (srcRight & mask);
if( ((i>>(stage-1)) & 1) ^ dir )
{
theArray[left] = imin;
theArray[right] = imax;
}
else
{
theArray[right] = imin;
theArray[left] = imax;
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.s45670123;
pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
mask = pseudomask.xxyyzzww;
if(((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxyyzzww;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxyyzzww;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
}
}
}
else // first stage, sort inside one four
{
/*
* To convert this code to int2 sorter, do this:
* 1. instead of loading int4, load int8 (key,value, key,value, ...)
* 2. when there is a vector swizzling, replace component index with two consecutive indices:
* srcLeft.yxwz -> srcLeft.s23016745
* use this rewrite rule:
* x y z w
* 01 23 45 67
* 3. replace comparison operands with only their keys swizzled:
* mask = srcLeft < srcRight; -> pseudomask = srcLeft.even < srcRight.even; mask = pseudomask.xxyyzzww;
*/
// make bitonic sequence out of 4.
int4 imask0 = (int4)(0, -1, -1, 0); // -1 in comparison = true (all bits set - two's complement)
srcLeft = theArray[i];
srcRight = srcLeft.s23016745;
/*
* This XOR mask flips bits, so that in `mask` are the following
* results (remember that srcRight is srcLeft with swapped component pairs):
*
* [ left.x<left.y, left.x<left.y, left.w<left.z, left.w<left.z ]
* or: [ left.x<left.y, left.x<left.y, left.z>left.w, left.z>left.w ]
*/
pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
mask = pseudomask.xxyyzzww;
if( dir )
srcLeft = (srcLeft & mask) | (srcRight & ~mask); // make sure the numbers are sorted like this:
else
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
/*
* Now the pairs of numbers in `srcLeft` are sorted according to the specified `dir`ection.
* If dir == true, then
* The components `x` and `y` are swapped so that `x` < `y`. Moreover `z` and `w` are swapped so that `z` > `w`. This resembles up-hill: /\
* else
* The components `x` and `y` are swapped so that `x` > `y`. Moreover `z` and `w` are swapped so that `z` < `w`. This resembles down-hill: \/
*
* This swapping is achieved by creating `srcLeft`, which is in normal order, and `srcRight`, which has component pairs switched (xyzw -> yxwz).
* Then the `mask` is created. The mask bits are redundant because it applies to vector component pairs (so in order to implement key-value sorting,
* I have to increase the length of masks!).
*
* The non-ordered component pairs in `srcLeft` are masked out by `mask` while the inverted `mask` is applied to the (pair-wise switched) `srcRight`.
*
* This (the previous) first flipping just makes a 4-bitonic sequence.
*/
/*
* This second step just sorts the bitonic sequence
*/
srcRight = srcLeft.s45670123; // inverts the bitonic sequence
// [ left.a<left.c, left.b<left.d, left.a<left.c, left.b<left.d ]
pseudomask = (srcLeft.even < srcRight.even) ^ imask10; // imask10 = (noflip, noflip, flip, flip)
mask = pseudomask.xxyyzzww;
// even or odd (The output of this thread is sorted monotonic sequence. The monotonicity changes and thus preparing bitonic sequence for the next pass.).
if((i & 1) ^ dir)
{
// this sorts the bitonic sequence, hence splitting it
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxyyzzww;
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxyyzzww;
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
}
}
}
The host-side code:
void ExecuteSortKernel(cl_kernel kernel, cl_command_queue queue, cl_mem cl_input_buffer, cl_int arraySize, cl_uint sortAscending)
{
cl_int numStages = 0;
cl_int stage;
cl_int passOfStage;
for (cl_int temp = arraySize; temp > 2; temp >>= 1)
numStages++;
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &cl_input_buffer);
clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *) &sortAscending);
for (stage = 0; stage < numStages; stage++) {
clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &stage);
for (passOfStage = stage; passOfStage >= 0; passOfStage--) {
clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &passOfStage);
// set work-item dimensions
size_t gsz = arraySize / (2*4);
size_t global_work_size[1] = { passOfStage ? gsz : gsz << 1 }; //number of quad items in input array
// execute kernel
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
}
}
}
Upvotes: 2
Views: 696
Reputation: 792
I've finally resolved the problem!
The tricky part was in the way the original Intel code handled equal values of adjacent pairs inside the loaded 4-tuple — it didn't explicitly handle it!
The bugs were present in the very first stage
and in the last passOfStage
(i.e. passOfStage = 0
) of every other stage
s. These parts of code are interchanging individual 2-tuples inside one 4-tuple (represented by the cl_int8
array theArray
).
Let's consider this excerpt for example (it doesn't function properly for equal adjacent 2-tuples in the 4-tuple):
imask0 = (int4)(0, -1, -1, 0);
srcLeft = theArray[i]; // int8
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
mask = pseudomask.xxyyzzww;
result = (srcLeft & mask) | (srcRight & ~mask);
Imagine what would happen when we'd use this (unfixed) code and srcLeft.even = (int4)(7,7, 5,5)
. The operation srcLeft.even < srcRight.even
would result yield (int4)(0,0,0,0)
, then we'd mask this result by imask0
and we'd get … pseudomask = (int4)(0,-1,-1,0)
– i.e. the imask itself. This is, however, wrong.
The pseudomask
's value is required to form this pattern: (int4)(a,a, b,b)
(where a
and b
can be either 0
or -1
). This means that it is sufficient do do the following comparison to form the correct mask
: quasimask = srcLeft.s07 < srcRight.s07
. Then the correct mask would be created as mask = quasimask.xxxxyyyy
. The first 2 x
es mask the first key-value pair in the first 2-tuple of the 4-tuple (4-tuple = one element in theArray
). And since we want to bitmask corresponding 2-tuples (which are specified by imask0
as 0
–-1
pairs), we add another xx
. We bitmask analogously for the second 2-tuple in the 4-tuple, which leaves us with yyyy
.
Visual example for bitshifting with imask11
srcLeft: x y z w
< < < <
srcRight [relative to srcLeft]: y x w z
^ imask0: 0 -1 0 1
------------------------------------------
(srcLeft<srcRight)^imask0: x x z z
The fixed, fully functioning version (I've commented the fixed parts):
__kernel void BitonicSort(__global int8* theArray,
const uint stage,
const uint passOfStage,
const uint dir)
{
size_t i = get_global_id(0);
int8 srcLeft, srcRight, mask;
int4 pseudomask;
int4 imask10 = (int4)(0, 0, -1, -1);
int4 imask11 = (int4)(0, -1, 0, -1);
if(stage > 0)
{
if(passOfStage > 0) // upper level pass, exchange between two fours
{
size_t r = 1 << (passOfStage - 1);
size_t lmask = r - 1;
size_t left = ((i>>(passOfStage-1)) << passOfStage) + (i & lmask);
size_t right = left + r;
srcLeft = theArray[left];
srcRight = theArray[right];
pseudomask = srcLeft.even < srcRight.even;
mask = pseudomask.xxyyzzww; // here we interchange individual components, so no mask is applied and hence no 2 pairs must contain the same bit-pattern
int8 imin = (srcLeft & mask) | (srcRight & ~mask);
int8 imax = (srcLeft & ~mask) | (srcRight & mask);
if( ((i>>(stage-1)) & 1) ^ dir )
{
theArray[left] = imin;
theArray[right] = imax;
}
else
{
theArray[right] = imin;
theArray[left] = imax;
}
}
else // last pass, sort inside one four
{
srcLeft = theArray[i];
srcRight = srcLeft.s45670123;
pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
mask = pseudomask.xxyyxxyy;
if(((i >> stage) & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
}
}
}
else // first stage, sort inside one four
{
int4 imask0 = (int4)(0, -1, -1, 0);
srcLeft = theArray[i];
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
mask = pseudomask.xxxxwwww; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
if( dir )
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
else
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.s45670123;
pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
mask = pseudomask.xxyyxxyy; // the 0th and 2nd elements must contain the exact same value (as well as 1st and 3rd)
if((i & 1) ^ dir)
{
srcLeft = (srcLeft & mask) | (srcRight & ~mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
}
else
{
srcLeft = (srcLeft & ~mask) | (srcRight & mask);
srcRight = srcLeft.s23016745;
pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
}
}
}
Upvotes: 3