Reputation: 307
I'm trying to make similar to std::round() function using SSE(1,2,3) instructions but have some issues with certain values and/or logical operators. Here's my code:
#include <iostream>
#include <cmath>
#include <emmintrin.h>
int round_int( float x ) {
return (int) (x > 0.0f) ? (x + 0.5f) : (x - 0.5f);
}
__m128 roundf_sse(__m128 x){
__m128 zero = _mm_set1_ps(0.0f);
__m128 a = _mm_set1_ps(0.5f);
__m128 b = _mm_set1_ps(-0.5f);
__m128 cond = _mm_cmpgt_ps(x, zero);
__m128 val = _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b));
return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(x, val)));
}
__m128 roundf_mp(__m128 x){
__m128i i = _mm_cvtps_epi32(x);
return _mm_cvtepi32_ps(i);
}
int main() {
for (int i = -10; i <= 10; i++){
for (int j = 0; j < 10; j++){
float x = (float)i + ((float)j/10.0f);
std::cout << "x = " << x << " ------------------------ " << std::endl;
std::cout << "std::round = " << std::round(x) << std::endl;
std::cout << "round_int = " << round_int(x) << std::endl;
float m128res[4] = { 0 };
__m128 in = _mm_set1_ps(x);
_mm_store_ps(m128res, roundf_sse(in));
std::cout << "roundf_sse = " << m128res[0] << std::endl;
_mm_store_ps(m128res, roundf_mp(in));
std::cout << "roundf_mp = " << m128res[0] << std::endl;
}
}
}
Some testing done with Compiler Explorer - https://godbolt.org/z/b5b5YqEKo
Problems are:
a) roundf_mp() function, there are wrong results now an then with input values like ±6.5, ±4.5, ±2.5, ±0.5
and
b) roundf_sse() function: it tries to follow the function round_int structure (round_int results equals with std::round() output) and is based partly on Branchless “select” (cond ? a : b) section found from this posting.
Any suggestions what is the reason for issue in a case and is there something not implemented correctly in case b ?
EDIT: By rounding float to int using _mm_cvttps_epi32 I get proper rounding:
__m128 roundf_sse(__m128 x){
__m128 zero = _mm_set1_ps(0.0f);
__m128 a = _mm_set1_ps(0.5f);
__m128 b = _mm_set1_ps(-0.5f);
__m128 cond = _mm_cmpgt_ps(x, zero);
__m128 val = _mm_or_ps(_mm_and_ps(a, cond),_mm_andnot_ps(cond, b));
return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));
Would there be bottlenecks (control some certain values, limits, etc.)?
Upvotes: 4
Views: 370
Reputation: 307
With help from commenters I got my problem solved and by changing the implementation technique also improved the performance of this function. There's one note: this function limits are ± 2^23. Limit can be extended by using _mm_cvtps_epi32() in float to integer conversion for values outside this range.
My original implementation with changed float to integer conversion intrinsic:
__m128 roundf_sse(__m128 x){ // for |x|<2^23
__m128 zero = _mm_set1_ps(0.0f);
__m128 a = _mm_set1_ps(0.5f);
__m128 b = _mm_set1_ps(-0.5f);
__m128 cond = _mm_cmpgt_ps(x, zero);
__m128 val = _mm_or_ps(_mm_and_ps(a, cond),_mm_andnot_ps(cond, b));
return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val))); }
gives equal rounding with std::round() but, it is ~60% slower than std::round() (rdtsc/val) (GCC -O3 -ffast-math).
Suggested (by chtz) implementation gives same rounding but almost equal performance (rdtsc/val) as std::round() has and it also needs less code compared to my method:
__m128 roundf_sse(__m128 x){ // for |x|<2^23
__m128 val = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-0.0f), x), _mm_set1_ps(0.5f));
return _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val)));
}
EDIT:
This implementation (actually three versions of it) works for whole range:
__m128 round_M(__m128 x){ // for |x| >= 2^23
__m128 M = _mm_set1_ps(12582912.0); // "magic number [1.5*(2^24-8)]"
x = _mm_add_ps(x, M);
x = _mm_sub_ps(x, M);
return x;
}
__m128 roundf_sse(__m128 x){ // speed: 2.5x slower than std::round() (rdtsc/val)
__m128 SIGNMASK = _mm_set1_ps(-0.0f);
__m128 lim = _mm_set1_ps(0x1.0p23f);
__m128 val = _mm_or_ps(_mm_and_ps(SIGNMASK, x), _mm_set1_ps(0.5f));
__m128 cond = _mm_cmpge_ps(_mm_andnot_ps(SIGNMASK, x), lim);
// val = _mm_or_ps(_mm_and_ps(_mm_cvtepi32_ps(_mm_cvtps_epi32(x)), cond), // for |x|=>2^23
// _mm_andnot_ps(cond, _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val))))); // for |x| <2^23
// val = _mm_or_ps(_mm_and_ps(round_M(x), cond), // for |x|=>2^23
// _mm_andnot_ps(cond, _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val))))); // for |x|<2^23
val = _mm_or_ps(_mm_and_ps(x, cond), // for |x|=>2^23
_mm_andnot_ps(cond, _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(x, val))))); // for |x|<2^23
return val;
}
but, much slower (rdtsc/val) compared to std::round().
Upvotes: 2