Thomas42
Thomas42

Reputation: 21

Shuffle in between two ymm registers and fill with zeroes?

What I want to do is use two 256-bit registers to do operations on 10 uint32_ts. Right now, I store the first 8 in the left register and the remaining 2 in the lowest two lanes of the right register, leaving its other six lanes unused.

[0, 1, 2, 3, 4, 5, 6, 7], [8, 9, x, x, x, x, x, x]

where x are ignored values. I need to create an operation where I shift the 10 values to the left or the right by a maximum of 2 spaces

Result of one left shift:

[1, 2, 3, 4, 5, 6, 7, 8], [9, Z, x, x, x, x, x, x]

Result of two right shifts:

[Z, Z, 0, 1, 2, 3, 4, 5], [6, 7, x, x, x, x, x, x]

The positions with 'Z' need to be zero. This is what I have so far:

// Lane-index vectors handed to _mm256_permutevar8x32_epi32 by shift(),
// selected with ind[x + 2]. In each row, result lane i is taken from source
// lane row[i], so every row is a rotation of the eight 32-bit lanes.
__m256i ind[5] = {
    _mm256_setr_epi32(2, 3, 4, 5, 6, 7, 0, 1),  // x = -2: rotate toward lane 0 by two
    _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0),  // x = -1: rotate toward lane 0 by one
    _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),  // x =  0: identity (shift() returns before using it)
    _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6),  // x = +1: rotate toward lane 7 by one
    _mm256_setr_epi32(6, 7, 0, 1, 2, 3, 4, 5)   // x = +2: rotate toward lane 7 by two
};

__m256i mask[5][2] = {
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 1, 1), _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1)},
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 1), _mm256_setr_epi32( 1,-1,-1,-1,-1,-1,-1,-1)},
    {_mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1)},
    {_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32( 0,-1,-1,-1,-1,-1,-1,-1)},
    {_mm256_setr_epi32(-1,-1, 0, 0, 0, 0, 0, 0), _mm256_setr_epi32( 0, 0,-1,-1,-1,-1,-1,-1)}
};

// AND masks applied by shift() after the blend, selected with zeroes[x + 2].
// A lane of -1 (all bits set) keeps the value; a lane of 0 clears it.
// shift() ANDs the row into the left register when x > 0 and into the right
// register when x < 0.
__m256i zeroes[5] = {
    _mm256_setr_epi32( 0, 0, 0, 0, 0, 0, 0, 0),  // x = -2: clear the whole right register
    _mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0),  // x = -1: keep only lane 0 of the right register
    _mm256_setr_epi32(-1,-1,-1,-1,-1,-1,-1,-1),  // x =  0: keep everything (shift() returns before using it)
    _mm256_setr_epi32( 0,-1,-1,-1,-1,-1,-1,-1),  // x = +1: clear lane 0 of the left register
    _mm256_setr_epi32( 0, 0,-1,-1,-1,-1,-1,-1)   // x = +2: clear lanes 0-1 of the left register
};

// Shifts the value sequence stored across ymml and ymmr by x lanes, in place.
//
//   ymml, ymmr  registers updated in place; both are rotated with ind[x + 2],
//               recombined with mask[x + 2], and then one of them is ANDed
//               with zeroes[x + 2].
//   x           shift amount; negative moves values toward lane 0, positive
//               toward lane 7. Must lie in [-2, 2]: the three lookup tables
//               have five rows and are indexed with x + 2 without a range
//               check. x == 0 leaves both registers untouched.
void shift(__m256i& ymml, __m256i& ymmr, int x){
    if (x != 0) {
        const int row = x + 2;

        // Rotate each register on its own; lanes that wrap around are the
        // ones that have to cross over into the other register.
        const __m256i rotated_l = _mm256_permutevar8x32_epi32(ymml, ind[row]);
        const __m256i rotated_r = _mm256_permutevar8x32_epi32(ymmr, ind[row]);

        // Stitch the two rotations together, one selector per output register.
        ymml = _mm256_blendv_epi8(rotated_l, rotated_r, mask[row][0]);
        ymmr = _mm256_blendv_epi8(rotated_l, rotated_r, mask[row][1]);

        // Clear the vacated lanes: in the left register for a positive
        // shift, in the right register for a negative one.
        __m256i& vacated = (x > 0) ? ymml : ymmr;
        vacated = _mm256_and_si256(vacated, zeroes[row]);
    }
}

Is there a better way to do this?

Upvotes: 2

Views: 46

Answers (0)

Related Questions