SSE1,2,3 round() not fully follow std::round() result

Question

I'm trying to make similar to std::round() function using SSE(1,2,3) instructions but have some issues with certain values and/or logical operators. Here's my code:

#include 
#include 
#include 

int round_int( float x ) {
    return (int) (x > 0.0f) ? (x + 0.5f) : (x - 0.5f);
}

__m128 roundf_sse(__m128 x){ 
    __m128  zero    = _mm_set1_ps(0.0f);
    __m128  a       = _mm_set1_ps(0.5f);
    __m128  b       = _mm_set1_ps(-0.5f);
    __m128  cond    = _mm_cmpgt_ps(x, zero);
    __m128  val     = _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b));
    return  _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, val))); 

}
 
__m128 roundf_mp(__m128 x){  
        __m128i i = _mm_cvtps_epi32(x);
        return _mm_cvtepi32_ps(i);
}
 
int main() {
    for (int i = -10; i <= 10; i++){
        for (int j = 0; j < 10; j++){
        float x = (float)i + ((float)j/10.0f);
        
        std::cout << "x = " << x << "   ------------------------ " << std::endl;
        std::cout << "std::round = " << std::round(x) << std::endl; 
        std::cout << "round_int  = " << round_int(x) << std::endl;
        
        float m128res[4] = { 0 };
        __m128 in = _mm_set1_ps(x);
        
        _mm_store_ps(m128res, roundf_sse(in));
        std::cout << "roundf_sse = " << m128res[0] << std::endl;

        _mm_store_ps(m128res, roundf_mp(in));
        std::cout << "roundf_mp  = " << m128res[0] << std::endl;
        }
    }
}

Some testing done with Compiler Explorer - https://godbolt.org/z/b5b5YqEKo

Problems are:

a) roundf_mp() function, there are wrong results now an then with input values like ±6.5, ±4.5, ±2.5, ±0.5

and

b) roundf_sse() function: it tries to follow the function round_int structure (round_int results equals with std::round() output) and is based partly on Branchless “select” (cond ? a : b) section found from this posting.

Any suggestions what is the reason for issue in a case and is there something not implemented correctly in case b ?

EDIT: By rounding float to int using _mm_cvttps_epi32 I get proper rounding:

__m128 roundf_sse(__m128 x){ 
    __m128  zero    = _mm_set1_ps(0.0f);
    __m128  a       = _mm_set1_ps(0.5f);
    __m128  b       = _mm_set1_ps(-0.5f);
    __m128  cond    = _mm_cmpgt_ps(x, zero);
    __m128  val     = _mm_or_ps(_mm_and_ps(a, cond),_mm_andnot_ps(cond, b));

    return  _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));

Would there be bottlenecks (control some certain values, limits, etc.)?

SSE1,2,3 round() not fully follow std::round() result

Answers (1)

Related Questions