Reputation: 7729
I am trying to implement the natural logarithm with PTX. PTX natively only provides lg2.approx.f32
which implements the logarithm to base 2. Thus, applying simple maths one can get the natural logarithm by just multiplying the logarithm to base 2 with the logarithm to base 2 of Euler's number e:
log_e(a) = log_2(a) / lg_2(e)
To a first approximation 1/lg_2(e)
is 0.693147
. So, I would just multiply with this number.
I had nvcc
compile the log
function (from CUDA C) into PTX (please find the output below). I can see multiplying with the numerical constant at the end. But there is a lot more stuff going on. Is this important? Can someone explain why there is so much overhead?
.entry _Z6kernelPfS_ (
.param .u64 __cudaparm__Z6kernelPfS__out,
.param .u64 __cudaparm__Z6kernelPfS__in)
{
.reg .u32 %r<13>;
.reg .u64 %rd<4>;
.reg .f32 %f<48>;
.reg .pred %p<4>;
.loc 14 3 0
$LDWbegin__Z6kernelPfS_:
.loc 14 5 0
ld.param.u64 %rd1, [__cudaparm__Z6kernelPfS__in];
ld.global.f32 %f1, [%rd1+0];
.loc 16 9365 0
mov.f32 %f2, 0f00000000; // 0
set.gt.u32.f32 %r1, %f1, %f2;
neg.s32 %r2, %r1;
mov.f32 %f3, 0f7f800000; // ((1.0F)/(0.0F))
set.lt.u32.f32 %r3, %f1, %f3;
neg.s32 %r4, %r3;
and.b32 %r5, %r2, %r4;
mov.u32 %r6, 0;
setp.eq.s32 %p1, %r5, %r6;
@%p1 bra $Lt_0_2306;
.loc 16 8512 0
mov.b32 %r7, %f1;
and.b32 %r8, %r7, -2139095041;
or.b32 %r9, %r8, 1065353216;
mov.b32 %f4, %r9;
mov.f32 %f5, %f4;
.loc 16 8513 0
shr.u32 %r10, %r7, 23;
sub.u32 %r11, %r10, 127;
mov.f32 %f6, 0f3fb504f3; // 1.41421
setp.gt.f32 %p2, %f4, %f6;
@!%p2 bra $Lt_0_2562;
.loc 16 8515 0
mov.f32 %f7, 0f3f000000; // 0.5
mul.f32 %f5, %f4, %f7;
.loc 16 8516 0
add.s32 %r11, %r11, 1;
$Lt_0_2562:
.loc 16 8429 0
mov.f32 %f8, 0fbf800000; // -1
add.f32 %f9, %f5, %f8;
mov.f32 %f10, 0f3f800000; // 1
add.f32 %f11, %f5, %f10;
neg.f32 %f12, %f9;
div.approx.f32 %f13, %f9, %f11;
mul.rn.f32 %f14, %f12, %f13;
add.rn.f32 %f15, %f9, %f14;
mul.f32 %f16, %f15, %f15;
mov.f32 %f17, 0f3b2063c3; // 0.00244735
mov.f32 %f18, %f17;
mov.f32 %f19, %f16;
mov.f32 %f20, 0f3c4c4be0; // 0.0124693
mov.f32 %f21, %f20;
mad.f32 %f22, %f18, %f19, %f21;
mov.f32 %f23, %f22;
mov.f32 %f24, %f23;
mov.f32 %f25, %f16;
mov.f32 %f26, 0f3daaab50; // 0.0833346
mov.f32 %f27, %f26;
mad.f32 %f28, %f24, %f25, %f27;
mov.f32 %f29, %f28;
mul.f32 %f30, %f16, %f29;
mov.f32 %f31, %f30;
mov.f32 %f32, %f15;
mov.f32 %f33, %f14;
mad.f32 %f34, %f31, %f32, %f33;
mov.f32 %f35, %f34;
cvt.rn.f32.s32 %f36, %r11;
mov.f32 %f37, %f36;
mov.f32 %f38, 0f3f317218; // 0.693147
mov.f32 %f39, %f38;
add.f32 %f40, %f9, %f35;
mov.f32 %f41, %f40;
mad.f32 %f42, %f37, %f39, %f41;
mov.f32 %f43, %f42;
.loc 16 8523 0
mov.f32 %f44, %f43;
bra.uni $Lt_0_2050;
$Lt_0_2306:
.loc 16 8526 0
lg2.approx.f32 %f45, %f1;
mov.f32 %f46, 0f3f317218; // 0.693147
mul.f32 %f44, %f45, %f46;
$Lt_0_2050:
.loc 14 5 0
ld.param.u64 %rd2, [__cudaparm__Z6kernelPfS__out];
st.global.f32 [%rd2+0], %f44;
.loc 14 6 0
exit;
$LDWend__Z6kernelPfS_:
} // _Z6kernelPfS_
* EDIT *
Just to be complete. Here the CUDA C kernel that I compiled into the above PTX:
__global__ void kernel(float *out, float *in)
{
*out = log( *in );
}
Upvotes: 4
Views: 857
Reputation: 20037
The function appears to calculate log2 of a floating point number by setting the exponent part to 1 (effectively scaling to range 1<x<2
) and then calculating a 3rd degree polynomial approximation. EDIT: appears to be rational Pade approximation, as Taylor series for log(1+x) converges badly. Thus calculating the reciprocal.
Probably no more than a handful of instruction could be shaved off. (the code is multiplying with 0.5 instead of subtracting from exponent and such trivial stuff. e.g. testing argument x<=0.)
Upvotes: 6