Why is sinf + cosf far faster than sincosf, even though the generated assembly also uses sincosf?

Question

I am doing some testing of the math library on the Android platform, targeting armeabi-v7a. I want to know how much difference in time there is between using "sinf + cosf" and using "sincosf", so I used the code below:

    // testing sinf + cosf and sincosf time consumed
    const int count = 15;
    const float dx  = (M_PI * 2.0f) / (float)count;
    const int loopcnt = 50000;
    float sin, cos = 0.0f;
    tz_millisecond_t t1 = 0;
    // testing 50000 x 15 times "sinf + cosf"
    tz_millisecond_t t0 = tz_get_uptime();
    for(int i = 0; i < loopcnt; ++i) {
      for (float x = -M_PI; x <= M_PI; x += dx) {
        sin = sinf(x);
        cos = cosf(x);
      }
    }
    t1 = tz_get_uptime();
    LOGD("sinf + cosf - %llu ms, sin %f, cos %f", t1 - t0, sin, cos);
    // testing 50000 x 15 times "sincosf"
    t0 = tz_get_uptime();
    for(int i = 0; i < loopcnt; ++i) {
      for (float x = -M_PI; x <= M_PI; x += dx) {
        sincosf(x, &sin, &cos);
      }
    }
    t1 = tz_get_uptime();
    LOGD("sincosf - %llu ms, sin %f, cos %f", t1 - t0, sin, cos);

The result is:

sinf + cosf - 2 ms, sin 0.406737, cos -0.913545
sincosf - 17 ms, sin 0.406737, cos -0.913545

That is surprising. Why? I searched, and some people say that sincosf gives more accurate results and uses more instructions.

OK, so I used objdump to look at the difference in the generated assembly code, shown below:

 24a:   f7ff fffe   bl  0 
 24e:   ed9f 0a42   vldr    s0, [pc, #264]  ; 358 
 252:   4604        mov r4, r0
 254:   ed9f 8a41   vldr    s16, [pc, #260] ; 35c 
 258:   460d        mov r5, r1
 25a:   ed9f 9b3d   vldr    d9, [pc, #244]  ; 350 
 25e:   f24c 3050   movw    r0, #50000  ; 0xc350
 262:   eeb0 2a40   vmov.f32    s4, s0
 266:   eeb0 1a42   vmov.f32    s2, s4
 26a:   ee32 2a08   vadd.f32    s4, s4, s16
 26e:   eef7 0ac2   vcvt.f64.f32    d16, s4
 272:   eef4 0bc9   vcmpe.f64   d16, d9
 276:   eef1 fa10   vmrs    APSR_nzcv, fpscr
 27a:   d9f4        bls.n   266 
 27c:   3601        adds    r6, #1
 27e:   4286        cmp r6, r0
 280:   d1ef        bne.n   262 
 282:   ee11 0a10   vmov    r0, s2
 286:   a915        add r1, sp, #84 ; 0x54
 288:   aa14        add r2, sp, #80 ; 0x50
 28a:   f7ff fffe   bl  0 
 28e:   ed9d 0a14   vldr    s0, [sp, #80]   ; 0x50
 292:   ed9d 1a15   vldr    s2, [sp, #84]   ; 0x54
 296:   eeb7 aac0   vcvt.f64.f32    d10, s0
 29a:   eeb7 bac1   vcvt.f64.f32    d11, s2
 29e:   ed8d 0a16   vstr    s0, [sp, #88]   ; 0x58
 2a2:   ed8d 1a17   vstr    s2, [sp, #92]   ; 0x5c
 2a6:   f7ff fffe   bl  0 
 2aa:   4e36        ldr r6, [pc, #216]  ; (384 )
 2ac:   1b02        subs    r2, r0, r4
 2ae:   eb61 0305   sbc.w   r3, r1, r5
 2b2:   ed8d bb00   vstr    d11, [sp]
 2b6:   447e        add r6, pc
 2b8:   ed8d ab02   vstr    d10, [sp, #8]
 2bc:   4630        mov r0, r6
 2be:   f7ff fffe   bl  0 
 2c2:   2500        movs    r5, #0
 2c4:   f7ff fffe   bl  0 
 2c8:   ed9f aa23   vldr    s20, [pc, #140] ; 358 
 2cc:   ae17        add r6, sp, #92 ; 0x5c
 2ce:   ac16        add r4, sp, #88 ; 0x58
 2d0:   4680        mov r8, r0
 2d2:   468a        mov sl, r1
 2d4:   f24c 3b50   movw    fp, #50000  ; 0xc350
 2d8:   eeb0 ba4a   vmov.f32    s22, s20
 2dc:   ee1b 0a10   vmov    r0, s22
 2e0:   4631        mov r1, r6
 2e2:   4622        mov r2, r4
 2e4:   f7ff fffe   bl  0 
 2e8:   ee3b ba08   vadd.f32    s22, s22, s16
 2ec:   eef7 0acb   vcvt.f64.f32    d16, s22
 2f0:   eef4 0bc9   vcmpe.f64   d16, d9
 2f4:   eef1 fa10   vmrs    APSR_nzcv, fpscr
 2f8:   d9f0        bls.n   2dc 
 2fa:   3501        adds    r5, #1
 2fc:   455d        cmp r5, fp
 2fe:   d1eb        bne.n   2d8 
 300:   f7ff fffe   bl  0

Using the calls to tz_get_uptime as markers, we can see that both code segments use sincosf to do the job. I am totally confused about why the time consumed differs so much.

Why is sinf + cosf far faster than sincosf, even though the generated assembly also uses sincosf?

Answers (1)

Related Questions