Reputation: 523
We can find various tricks to replace std::sqrt (Timing Square Root) and some for std::exp (Using Faster Exponential Approximation), but I can find nothing to replace std::log.
It is part of loops in my program, called many times, and while exp and sqrt have been optimized, Intel VTune now suggests optimizing std::log; after that, it seems only my design choices will be the limiting factor.
For now I use a 3rd-order Taylor approximation of ln(1+x) with x between -0.5 and +0.5 (90% of the cases, with a maximum error of 4%) and fall back to std::log otherwise. This gave me a 15% speed-up.
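A minimal sketch of that reduction-plus-fallback approach (my own illustration of what is described above; fast_ln_hybrid is a hypothetical name, and the coefficients are simply the Taylor terms of ln(1+x)):
#include <cmath>

// Hypothetical sketch: 3rd-order Taylor of ln(1+x) around x = 0,
// used only when the argument lies in [0.5, 1.5], i.e. x in [-0.5, 0.5].
inline float fast_ln_hybrid(float a)
{
    float x = a - 1.0f;
    if (x >= -0.5f && x <= 0.5f)
        return x - 0.5f * x * x + (1.0f / 3.0f) * x * x * x; // ln(1+x) ~ x - x^2/2 + x^3/3
    return std::log(a); // fall back to the library routine otherwise
}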
Upvotes: 20
Views: 28323
Reputation: 151
I propose the following version of the code for ln(x), written for Visual Studio (x86). Here b0,...,b3 are the adjusted coefficients of the polynomial obtained as a Chebyshev approximation of the function
f(t) = lb((5+t)/(5-t))/t, -1 <= t <= 1.
First, t/5 = (x-1)/(x+1) is calculated, where x is in [0.75; 1.5]. After the approximation f(t) has been obtained, the result is calculated by the formula
ln(x) = (t*f(t)+k)*ln(2),
where k is the number by which the exponent of x had to be reduced to bring it into the range [0.75; 1.5]. The maximum relative error is 2.53708704e-07, which is 3.49952803 ulps. The RMS relative error is 5.06926602e-08.
__declspec(naked) float __vectorcall ln(float x)
{
static const float ct[6] = // Constant table
{
-1.0f, // -1
0.576110899f, // b2*5^5
0.961808264f, // b1*5^3
2.88539004f, // b0*5
0.442831367f, // b3*5^7
0.693147181f // ln(2)
};
_asm
{
vmovd eax,xmm0 // Read the binary representation of x into eax
mov edx,offset ct // edx - the address of the constant table
cmp eax,0x7F800000 // Compare x with the Inf value
jnc ln_bad_x // Break calculations if x<=-0, x=+Inf or x=+NaN
ror eax,23 // Shift eax so that its exponent is in al
movzx ecx,al // Get the exponent with an offset of +127 in ecx
jecxz ln_denorm // Jump if x=0 or x is denormalized
ln_calc: // The entry point after normalization of the number
setnc al // al=1 if the mantissa is less than 1.5, otherwise al=0
adc ecx,-127 // ecx=k - the integer part of the binary logarithm
or eax,126 // Change the exponent of x so that it becomes 0.75<=x<1.5
ror eax,9 // In eax: the value of x, in cf: its sign bit
vmovd xmm0,eax // Write the reduced value of x into xmm0
vsubss xmm1,xmm0,[edx] // xmm1 = x+1
vaddss xmm0,xmm0,[edx] // xmm0 = x-1
vcvtsi2ss xmm3,xmm3,ecx // xmm3 = k, the integer addition to the binary logarithm
vdivss xmm0,xmm0,xmm1 // xmm0 = (x-1)/(x+1)=t/5
vmovss xmm1,[edx+16] // xmm1 = 5^7*b3 - initialize the sum
vmulss xmm2,xmm0,xmm0 // xmm2 = t^2/25 - prepare the argument of the polynomial
vfmadd213ss xmm1,xmm2,[edx+4] // Step 1 calculating the polynomial by the Horner scheme
vfmadd213ss xmm1,xmm2,[edx+8] // Step 2 calculating the polynomial by the Horner scheme
vfmadd213ss xmm1,xmm2,[edx+12] // Step 3 calculating the polynomial by the Horner scheme
vfmadd213ss xmm0,xmm1,xmm3 // xmm0 = t*f(t)+k - ready binary logarithm
vmulss xmm0,xmm0,[edx+20] // Convert binary logarithm to natural
ret // Return
ln_denorm: // Processing denormalized values of x including x=0
bsr ecx,eax // Search for the highest set bit; zf=1 if x=+0
ror eax,cl // Form a mantissa of a normalized number x*2^31
lea ecx,[ecx-31] // 31 is added to the exponent, so subtract it from ecx
jnz ln_calc // Go to calculate ln(x) if x>0
vmulss xmm0,xmm0,[edx] // xmm0 = -0
ln_bad_x: // The entry point for x<=-0, x=+Inf or x=+NaN
jnl ln_exit // Return x for x=+Inf or x=+NaN
vsqrtss xmm0,xmm0,xmm0 // The root of a negative number gives the result NaN
vrcpss xmm0,xmm0,xmm0 // In the case of x=-0 generate the result -Inf
ln_exit: // The result in xmm0 is ready
ret // Return
}
}
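For readers who cannot use MSVC inline assembly, roughly the same reduction and polynomial can be expressed in portable C++ (my own sketch built from the constants above; unlike the assembly, it does not handle zero, negative, denormal, Inf or NaN inputs):
#include <cmath>

// Hypothetical portable sketch of the same scheme: reduce x to m in [0.75, 1.5),
// evaluate the polynomial in u = t/5 = (m-1)/(m+1), then rescale by ln(2).
inline float ln_portable_sketch(float x)
{
    int e;
    float m = std::frexp(x, &e);            // x = m * 2^e, m in [0.5, 1)
    if (m < 0.75f) { m += m; --e; }         // bring m into [0.75, 1.5)
    float u  = (m - 1.0f) / (m + 1.0f);     // u = t/5
    float u2 = u * u;
    float p  = ((0.442831367f * u2 + 0.576110899f) * u2 + 0.961808264f) * u2 + 2.88539004f;
    return (u * p + (float)e) * 0.693147181f; // (t*f(t) + k) * ln(2)
}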
Upvotes: 2
Reputation: 971
Slight improvement in precision and performance:
#include <bit> // C++20
//fast_log abs(rel) : avgError = 2.85911e-06(3.32628e-08), MSE = 4.67298e-06(5.31012e-08), maxError = 1.52588e-05(1.7611e-07)
const float s_log_C0 = -19.645704f;
const float s_log_C1 = 0.767002f;
const float s_log_C2 = 0.3717479f;
const float s_log_C3 = 5.2653985f;
const float s_log_C4 = -(1.0f + s_log_C0) * (1.0f + s_log_C1) / ((1.0f + s_log_C2) * (1.0f + s_log_C3)); //ensures that log(1) == 0
const float s_log_2 = 0.6931472f;
// assumes x > 0 and that it's not a subnormal.
// Results for 0 or negative x won't be -Infinity or NaN
inline float fast_log(float x)
{
unsigned int ux = std::bit_cast<unsigned int>(x);
int e = static_cast<int>(ux - 0x3f800000) >> 23; //e = exponent part can be negative
ux |= 0x3f800000;
ux &= 0x3fffffff; // 1 <= x < 2 after replacing the exponent field
x = std::bit_cast<float>(ux);
float a = (x + s_log_C0) * (x + s_log_C1);
float b = (x + s_log_C2) * (x + s_log_C3);
float c = (float(e) + s_log_C4);
float d = a / b;
return (c + d) * s_log_2;
}
Although it contains a division, which is considered a slow operation, there are many operations here that the processor can perform in parallel.
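A quick sanity check against the standard library might look like this (my own illustrative snippet, assuming the fast_log definition above is in scope):
#include <cmath>
#include <cstdio>

int main()
{
    // Compare fast_log against std::log for a few sample inputs.
    for (float v : {0.1f, 1.0f, 2.718282f, 42.0f, 1.0e6f})
        std::printf("x = %g  fast_log = %g  std::log = %g\n", v, fast_log(v), std::log(v));
}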
Upvotes: 3
Reputation: 2572
I also needed a fast log approximation, and by far the best one seems to be based on Ankerl's algorithm.
Explanation: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
Code (copied from https://github.com/ekmett/approximate/blob/dc1ee7cef58a6b31661edde6ef4a532d6fc41b18/cbits/fast.c#L127):
double log_fast_ankerl(double a)
{
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Little endian is required!");
union { double d; int x[2]; } u = { a };
return (u.x[1] - 1072632447) * 6.610368362777016e-7;
}
Just one subtraction and one multiplication. It's surprisingly good and unbeatably fast.
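Since the union write-then-read is technically undefined behavior in C++, a C++20 variant using std::bit_cast can express the same trick (my own sketch, not from the linked source; same magic constants, still assuming IEEE-754 doubles):
#include <bit>
#include <cstdint>

double log_fast_ankerl_cpp20(double a)
{
    // The upper 32 bits of the double hold the sign, exponent and top of the mantissa.
    auto hi = static_cast<std::int32_t>(std::bit_cast<std::uint64_t>(a) >> 32);
    return (hi - 1072632447) * 6.610368362777016e-7;
}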
Upvotes: 1
Reputation: 26085
Prior to embarking on the design and deployment of a customized implementation of a transcendental function for performance, it is highly advisable to pursue optimizations at the algorithmic level, as well as through the toolchain. Unfortunately, we do not have any information about the code to be optimized here, nor do we have information on the toolchain.
At the algorithmic level, check whether all calls to transcendental functions are truly necessary. Maybe there is a mathematical transformation that requires fewer function calls, or converts transcendental functions into algebraic operations. Are any of the transcendental function calls possibly redundant, e.g. because the computation is unnecessarily switching in and out of logarithmic space? If the accuracy requirements are modest, can the whole computation be performed in single precision, using float instead of double throughout? On most hardware platforms, avoiding double computation can lead to a significant performance boost.
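As a simple illustration of such a transformation (my own example, not from the original answer): a sum of logarithms can be collapsed into a single call, provided the arguments are positive and the product cannot overflow.
#include <cmath>

// Two transcendental calls ...
double log_sum_naive(double a, double b) { return std::log(a) + std::log(b); }

// ... become one, since log(a) + log(b) == log(a*b) for a, b > 0.
double log_sum_fused(double a, double b) { return std::log(a * b); }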
Compilers tend to offer a variety of switches that affect the performance of numerically intensive code. In addition to increasing the general optimization level to -O3, there is often a way to turn off denormal support, i.e. turn on flush-to-zero, or FTZ, mode. This has performance benefits on various hardware platforms. In addition, there is often a "fast math" flag whose use results in slightly reduced accuracy and eliminates overhead for handling special cases such as NaNs and infinities, plus the handling of errno. Some compilers also support auto-vectorization of code and ship with a SIMD math library, for example the Intel compiler.
A custom implementation of a logarithm function typically involves separating a binary floating-point argument x into an exponent e and a mantissa m, such that x = m * 2^e, therefore log(x) = log(2) * e + log(m). m is chosen such that it is close to unity, because this provides for efficient approximations, for example log(m) = log(1+f) = log1p(f) by minimax polynomial approximation.
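For example (my own illustration of this reduction, not part of the original answer): 6.0 = 0.75 * 2^3, so log(6.0) = 3 * log(2) + log1p(-0.25).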
C++ provides the frexp() function to separate a floating-point operand into mantissa and exponent, but in practice one typically uses faster machine-specific methods that manipulate floating-point data at the bit level by re-interpreting them as same-size integers. The code below for the single-precision logarithm, logf(), demonstrates both variants. The functions __int_as_float() and __float_as_int() provide for the reinterpretation of an int32_t into an IEEE-754 binary32 floating-point number and vice versa. This code relies heavily on the fused multiply-add (FMA) operation, which is supported directly in hardware on most current processors, CPU or GPU. On platforms where fmaf() maps to software emulation, this code will be unacceptably slow.
#include <cmath>
#include <cstdint>
#include <cstring>
float __int_as_float (int32_t a) { float r; memcpy (&r, &a, sizeof r); return r;}
int32_t __float_as_int (float a) { int32_t r; memcpy (&r, &a, sizeof r); return r;}
/* compute natural logarithm, maximum error 0.85089 ulps */
float my_logf (float a)
{
float i, m, r, s, t;
int e;
#if PORTABLE
m = frexpf (a, &e);
if (m < 0.666666667f) {
m = m + m;
e = e - 1;
}
i = (float)e;
#else // PORTABLE
i = 0.0f;
if (a < 1.175494351e-38f){ // 0x1.0p-126
a = a * 8388608.0f; // 0x1.0p+23
i = -23.0f;
}
e = (__float_as_int (a) - __float_as_int (0.666666667f)) & 0xff800000;
m = __int_as_float (__float_as_int (a) - e);
i = fmaf ((float)e, 1.19209290e-7f, i); // 0x1.0p-23
#endif // PORTABLE
/* m in [2/3, 4/3] */
m = m - 1.0f;
s = m * m;
/* Compute log1p(m) for m in [-1/3, 1/3] */
r = -0.130310059f; // -0x1.0ae000p-3
t = 0.140869141f; // 0x1.208000p-3
r = fmaf (r, s, -0.121483512f); // -0x1.f198b2p-4
t = fmaf (t, s, 0.139814854f); // 0x1.1e5740p-3
r = fmaf (r, s, -0.166846126f); // -0x1.55b36cp-3
t = fmaf (t, s, 0.200120345f); // 0x1.99d8b2p-3
r = fmaf (r, s, -0.249996200f); // -0x1.fffe02p-3
r = fmaf (t, m, r);
r = fmaf (r, m, 0.333331972f); // 0x1.5554fap-2
r = fmaf (r, m, -0.500000000f); // -0x1.000000p-1
r = fmaf (r, s, m);
r = fmaf (i, 0.693147182f, r); // 0x1.62e430p-1 // log(2)
if (!((a > 0.0f) && (a < INFINITY))) {
r = a + a; // silence NaNs if necessary
if (a < 0.0f) r = INFINITY - INFINITY; // NaN
if (a == 0.0f) r = -INFINITY;
}
return r;
}
As noted in the code comment, the implementation above provides faithfully-rounded single-precision results, and it deals with exceptional cases consistent with the IEEE-754 floating-point standard. The performance can be increased further by eliminating special-case support, eliminating the support for denormal arguments, and reducing the accuracy. This leads to the following exemplary variant:
/* natural log on [0x1.f7a5ecp-127, 0x1.fffffep127]. Maximum relative error 9.4529e-5 */
float my_faster_logf (float a)
{
float m, r, s, t, i, f;
int32_t e;
e = (__float_as_int (a) - 0x3f2aaaab) & 0xff800000;
m = __int_as_float (__float_as_int (a) - e);
i = (float)e * 1.19209290e-7f; // 0x1.0p-23
/* m in [2/3, 4/3] */
f = m - 1.0f;
s = f * f;
/* Compute log1p(f) for f in [-1/3, 1/3] */
r = fmaf (0.230836749f, f, -0.279208571f); // 0x1.d8c0f0p-3, -0x1.1de8dap-2
t = fmaf (0.331826031f, f, -0.498910338f); // 0x1.53ca34p-2, -0x1.fee25ap-2
r = fmaf (r, s, t);
r = fmaf (r, s, f);
r = fmaf (i, 0.693147182f, r); // 0x1.62e430p-1 // log(2)
return r;
}
Upvotes: 22
Reputation: 1360
I vectorized @njuffa's answer. Natural log, works with AVX2:
#include <immintrin.h>

// Emulates fmaf() with a separate multiply and add (works without FMA support,
// at the cost of an extra rounding step).
inline __m256 mm256_fmaf(__m256 a, __m256 b, __m256 c){
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}
//https://stackoverflow.com/a/39822314/9007125
//https://stackoverflow.com/a/65537754/9007125
// vectorized version of the answer by njuffa
/* natural log on [0x1.f7a5ecp-127, 0x1.fffffep127]. Maximum relative error 9.4529e-5 */
inline __m256 fast_log_sse(__m256 a){
__m256i aInt = *(__m256i*)(&a);
__m256i e = _mm256_sub_epi32( aInt, _mm256_set1_epi32(0x3f2aaaab));
e = _mm256_and_si256( e, _mm256_set1_epi32(0xff800000) );
__m256i subtr = _mm256_sub_epi32(aInt, e);
__m256 m = *(__m256*)&subtr;
__m256 i = _mm256_mul_ps( _mm256_cvtepi32_ps(e), _mm256_set1_ps(1.19209290e-7f));// 0x1.0p-23
/* m in [2/3, 4/3] */
__m256 f = _mm256_sub_ps( m, _mm256_set1_ps(1.0f) );
__m256 s = _mm256_mul_ps(f, f);
/* Compute log1p(f) for f in [-1/3, 1/3] */
__m256 r = mm256_fmaf( _mm256_set1_ps(0.230836749f), f, _mm256_set1_ps(-0.279208571f) );// 0x1.d8c0f0p-3, -0x1.1de8dap-2
__m256 t = mm256_fmaf( _mm256_set1_ps(0.331826031f), f, _mm256_set1_ps(-0.498910338f) );// 0x1.53ca34p-2, -0x1.fee25ap-2
r = mm256_fmaf(r, s, t);
r = mm256_fmaf(r, s, f);
r = mm256_fmaf(i, _mm256_set1_ps(0.693147182f), r); // 0x1.62e430p-1 // log(2)
return r;
}
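A minimal usage sketch (my own illustration, assuming the definitions above are in scope; compile with AVX2 enabled, e.g. -mavx2 on GCC/Clang):
#include <cstdio>

int main()
{
    alignas(32) float in[8]  = {0.1f, 0.5f, 1.0f, 2.0f, 2.71828f, 10.0f, 100.0f, 1000.0f};
    alignas(32) float out[8];

    __m256 v = _mm256_load_ps(in);          // load 8 packed floats (32-byte aligned)
    _mm256_store_ps(out, fast_log_sse(v));  // approximate ln of all 8 lanes at once

    for (int i = 0; i < 8; ++i)
        std::printf("ln(%g) ~ %g\n", in[i], out[i]);
}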
Upvotes: 2
Reputation: 71
#include <math.h>
#include <iostream>
constexpr int LogPrecisionLevel = 14;
constexpr int LogTableSize = 1 << LogPrecisionLevel;
double log_table[LogTableSize];
void init_log_table() {
for (int i = 0; i < LogTableSize; i++) {
log_table[i] = log2(1 + (double)i / LogTableSize);
}
}
double fast_log2(double x) { // assumes x > 0 and x is a normal (not subnormal) value
    long long t = *(long long*)&x;                                        // raw IEEE-754 bits of x
    int exp = (t >> 52) - 0x3ff;                                          // unbiased exponent = floor(log2(x))
    int mantissa = (t >> (52 - LogPrecisionLevel)) & (LogTableSize - 1);  // top 14 mantissa bits as table index
    return exp + log_table[mantissa];                                     // log2(x) = exponent + log2(1.mantissa)
}
int main() {
init_log_table();
double d1 = log2(100); //6.6438561897747244
double d2 = fast_log2(100); //6.6438561897747244
double d3 = log2(0.01); //-6.6438561897747244
double d4 = fast_log2(0.01); //-6.6438919626096089
}
Upvotes: 2
Reputation: 6110
Take a look at this discussion; the accepted answer refers to an implementation of a function for computing logarithms based on the Zeckendorf decomposition.
In the comments of the implementation file there is a discussion of the complexity and some tricks to reach O(1).
Hope this helps!
Upvotes: 4
Reputation: 6406
It depends on how accurate you need to be. Often log is called to get an idea of the magnitude of the number, which you can do essentially for free by examining the exponent field of a floating-point number. That's also your first approximation. I'll put in a plug for my book "Basic Algorithms", which explains how to implement the standard library math functions from first principles.
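A minimal sketch of that idea (my own illustration, using C++20 std::bit_cast and assuming an IEEE-754 double holding a positive, normal value):
#include <bit>
#include <cstdint>

// The unbiased exponent of a normal positive double equals floor(log2(x)),
// so it serves as a free integer approximation of the magnitude.
inline int approx_log2_floor(double x)
{
    std::uint64_t bits = std::bit_cast<std::uint64_t>(x);
    return static_cast<int>((bits >> 52) & 0x7ff) - 1023;
}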
Upvotes: -1