Reputation: 21
What I want to do is use two 256-bit registers to do operations on 10 uint32_ts. Right now, I store the first 8 in the left register and the remaining 2 in the lowest two lanes of the right register, with the rest of that register used as padding.
[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, x, x, x, x, x, x]
where x marks ignored values. I need an operation that shifts the 10 values to the left or to the right by at most 2 positions.
Result of left shift:
[1, 2, 3, 4, 5, 6, 7, 8], [9, Z, x, x, x, x, x, x]
Result of two right shifts:
[Z, Z, 0, 1, 2, 3, 4, 5], [6, 7, x, x, x, x, x, x]
The positions with 'Z' need to be zero. This is what I have so far:
// Lane-permutation indices for _mm256_permutevar8x32_epi32, one row per shift
// amount x in [-2, 2], looked up as ind[x + 2].
// With permutevar8x32, result lane i receives source lane ind[i], so:
//   rows 0 and 1 rotate the eight lanes towards lane 0 by 2 and 1,
//   row 2 is the identity,
//   rows 3 and 4 rotate the eight lanes away from lane 0 by 1 and 2.
__m256i ind[5] = {
    _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 0, 1),   // x = -2
    _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0),   // x = -1
    _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),   // x =  0 (identity)
    _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6),   // x = +1
    _mm256_setr_epi32(6, 7, 0, 1, 2, 3, 4, 5)    // x = +2
};
// Blend selectors for _mm256_blendv_epi8(resl, resr, m), one row per shift
// amount x in [-2, 2], looked up as mask[x + 2].
//   mask[..][0] builds the new left register, mask[..][1] the new right one.
// blendv_epi8 decides per byte on the HIGH bit of the selector byte: a lane of
// 0 keeps the rotated left register (resl), a lane of -1 (all bits set) takes
// the rotated right register (resr). A lane value of 1 has no high bit set in
// any byte and therefore behaves like 0, so every "take resr" lane must be -1.
__m256i mask[5][2] = {
    // x = -2: the two top lanes of the left register come from the right one.
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0,-1,-1), _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1)},
    // x = -1: the top lane of the left register comes from the right one.
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0,-1), _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1)},
    // x = 0: identity (left stays left, right stays right).
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1)},
    // x = +1: lane 0 of the right register comes from the rotated left one.
    {_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32( 0,-1,-1,-1,-1,-1,-1,-1)},
    // x = +2: lanes 0-1 of the right register come from the rotated left one.
    {_mm256_setr_epi32(-1,-1, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32( 0, 0,-1,-1,-1,-1,-1,-1)}
};
// AND-masks that clear the lanes vacated by a shift, one row per shift amount
// x in [-2, 2], looked up as zeroes[x + 2]. A lane of -1 keeps the value, a
// lane of 0 forces it to zero.
//   x < 0: the mask is applied to the right register (the vacated lanes are
//          the ones after the surviving values).
//   x > 0: the mask is applied to the left register (lanes 0 .. x-1 are
//          vacated).
// The table needs all five rows so that the rows line up with the x + 2
// index; the middle row is the identity for x == 0.
__m256i zeroes[5] = {
    _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0),   // x = -2: nothing survives on the right
    _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0),   // x = -1: one value survives on the right
    _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1),   // x =  0: identity
    _mm256_setr_epi32( 0,-1,-1,-1,-1,-1,-1,-1),   // x = +1: clear lane 0 on the left
    _mm256_setr_epi32( 0, 0,-1,-1,-1,-1,-1,-1)    // x = +2: clear lanes 0-1 on the left
};
// Shifts the 10-element sequence stored as [ymml lanes 0-7][ymmr lanes 0-1]
// by x positions, in place, filling vacated positions with zero.
//
//   ymml  in/out: elements 0..7.
//   ymmr  in/out: elements 8..9 in lanes 0-1; the remaining lanes are padding.
//   x     shift amount. Positive moves elements towards higher indices,
//         negative towards lower indices, 0 leaves both registers untouched.
//         Precondition: -2 <= x <= 2. The lookup tables have exactly five
//         rows indexed by x + 2, so any other value reads out of bounds.
void shift(__m256i& ymml, __m256i& ymmr, int x){
    if (x == 0) {
        return;   // nothing to move
    }

    const int row = x + 2;

    // Rotate each register independently by x lanes.
    const __m256i resl = _mm256_permutevar8x32_epi32(ymml, ind[row]);
    const __m256i resr = _mm256_permutevar8x32_epi32(ymmr, ind[row]);

    // Stitch the rotated halves together so lanes that crossed the register
    // boundary end up in the other register.
    ymml = _mm256_blendv_epi8(resl, resr, mask[row][0]);
    ymmr = _mm256_blendv_epi8(resl, resr, mask[row][1]);

    // Clear the lanes that wrapped around instead of receiving real data.
    if (x > 0) {
        ymml = _mm256_and_si256(ymml, zeroes[row]);
    } else {
        ymmr = _mm256_and_si256(ymmr, zeroes[row]);
    }
}
Is there a better way to do this?
Upvotes: 2
Views: 46