ritter
ritter

Reputation: 7729

Natural logarithm implementation when only logarithm base 2 is available

I am trying to implement the natural logarithm with PTX. PTX natively only provides lg2.approx.f32 which implements the logarithm to base 2. Thus, applying simple maths one can get the natural logarithm by just multiplying the logarithm to base 2 with the logarithm to base 2 of Euler's number e:

log_e(a) = log_2(a) / lg_2(e)

To a first approximation 1/lg_2(e) is 0.693147. So, I would just multiply with this number.

I had nvcc compile the log function (from CUDA C) into PTX (please find the output below). I can see multiplying with the numerical constant at the end. But there is a lot more stuff going on. Is this important? Can someone explain why there is so much overhead?

    .entry _Z6kernelPfS_ (
        .param .u64 __cudaparm__Z6kernelPfS__out,
        .param .u64 __cudaparm__Z6kernelPfS__in)
    {
    .reg .u32 %r<13>;
    .reg .u64 %rd<4>;
    .reg .f32 %f<48>;
    .reg .pred %p<4>;
    .loc    14  3   0
$LDWbegin__Z6kernelPfS_:
    .loc    14  5   0
    ld.param.u64    %rd1, [__cudaparm__Z6kernelPfS__in];
    ld.global.f32   %f1, [%rd1+0];
    .loc    16  9365    0
    mov.f32     %f2, 0f00000000;        // 0
    set.gt.u32.f32  %r1, %f1, %f2;
    neg.s32     %r2, %r1;
    mov.f32     %f3, 0f7f800000;        // ((1.0F)/(0.0F))
    set.lt.u32.f32  %r3, %f1, %f3;
    neg.s32     %r4, %r3;
    and.b32     %r5, %r2, %r4;
    mov.u32     %r6, 0;
    setp.eq.s32     %p1, %r5, %r6;
    @%p1 bra    $Lt_0_2306;
    .loc    16  8512    0
    mov.b32     %r7, %f1;
    and.b32     %r8, %r7, -2139095041;
    or.b32  %r9, %r8, 1065353216;
    mov.b32     %f4, %r9;
    mov.f32     %f5, %f4;
    .loc    16  8513    0
    shr.u32     %r10, %r7, 23;
    sub.u32     %r11, %r10, 127;
    mov.f32     %f6, 0f3fb504f3;        // 1.41421
    setp.gt.f32     %p2, %f4, %f6;
    @!%p2 bra   $Lt_0_2562;
    .loc    16  8515    0
    mov.f32     %f7, 0f3f000000;        // 0.5
    mul.f32     %f5, %f4, %f7;
    .loc    16  8516    0
    add.s32     %r11, %r11, 1;
$Lt_0_2562:
    .loc    16  8429    0
    mov.f32     %f8, 0fbf800000;        // -1
    add.f32     %f9, %f5, %f8;
    mov.f32     %f10, 0f3f800000;       // 1
    add.f32     %f11, %f5, %f10;
    neg.f32     %f12, %f9;
    div.approx.f32  %f13, %f9, %f11;
    mul.rn.f32  %f14, %f12, %f13;
    add.rn.f32  %f15, %f9, %f14;
    mul.f32     %f16, %f15, %f15;
    mov.f32     %f17, 0f3b2063c3;       // 0.00244735
    mov.f32     %f18, %f17;
    mov.f32     %f19, %f16;
    mov.f32     %f20, 0f3c4c4be0;       // 0.0124693
    mov.f32     %f21, %f20;
    mad.f32 %f22, %f18, %f19, %f21;
    mov.f32     %f23, %f22;
    mov.f32     %f24, %f23;
    mov.f32     %f25, %f16;
    mov.f32     %f26, 0f3daaab50;       // 0.0833346
    mov.f32     %f27, %f26;
    mad.f32 %f28, %f24, %f25, %f27;
    mov.f32     %f29, %f28;
    mul.f32     %f30, %f16, %f29;
    mov.f32     %f31, %f30;
    mov.f32     %f32, %f15;
    mov.f32     %f33, %f14;
    mad.f32 %f34, %f31, %f32, %f33;
    mov.f32     %f35, %f34;
    cvt.rn.f32.s32  %f36, %r11;
    mov.f32     %f37, %f36;
    mov.f32     %f38, 0f3f317218;       // 0.693147
    mov.f32     %f39, %f38;
    add.f32     %f40, %f9, %f35;
    mov.f32     %f41, %f40;
    mad.f32 %f42, %f37, %f39, %f41;
    mov.f32     %f43, %f42;
    .loc    16  8523    0
    mov.f32     %f44, %f43;
    bra.uni     $Lt_0_2050;
$Lt_0_2306:
    .loc    16  8526    0
    lg2.approx.f32  %f45, %f1;
    mov.f32     %f46, 0f3f317218;       // 0.693147
    mul.f32     %f44, %f45, %f46;
$Lt_0_2050:
    .loc    14  5   0
    ld.param.u64    %rd2, [__cudaparm__Z6kernelPfS__out];
    st.global.f32   [%rd2+0], %f44;
    .loc    14  6   0
    exit;
$LDWend__Z6kernelPfS_:
    } // _Z6kernelPfS_

* EDIT *

Just to be complete. Here the CUDA C kernel that I compiled into the above PTX:

__global__ void kernel(float *out, float *in)
{
  *out = log( *in );
}

Upvotes: 4

Views: 857

Answers (1)

Aki Suihkonen
Aki Suihkonen

Reputation: 20037

The function appears to calculate log2 of a floating point number by setting the exponent part to 1 (effectively scaling to range 1<x<2) and then calculating a 3rd degree polynomial approximation. EDIT: appears to be rational Pade approximation, as Taylor series for log(1+x) converges badly. Thus calculating the reciprocal.

Probably no more than a handful of instruction could be shaved off. (the code is multiplying with 0.5 instead of subtracting from exponent and such trivial stuff. e.g. testing argument x<=0.)

Upvotes: 6

Related Questions