Helmut Wollmersdorfer
Helmut Wollmersdorfer

Reputation: 451

Is it reasonable that a static array is 700 times slower in this particular case?

I must admit that I'm not very experienced in C. At my attempt to port my Perl module LCS::BV to C and tune the code I got in one session a surprising rate of 5 G iterations per second (compared to 12 M/s before).

Now during isolating the cause the difference is ~50 G/s to ~70 M/s on an Intel Core i7-4770HQ.

To be sure that clang does not unroll the loop for the benchmark, I can use _mm_popcnt_u64(~v) and then decompile:

$ otool -tv lcstest > lcstest.dump.txt
$ grep popcnt lcstest.dump.txt
0000000100000e26    popcntq %rax, %rax

I'm not sure if something is wrong with the code or the method of the benchmark. See the annotated code (also available on GitHub):

#include <stdio.h>
#include <limits.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <nmmintrin.h>

static const uint64_t width = 64;

int count_bits(uint64_t bits) {
  bits = bits - ((bits >> 1) & 0x5555555555555555ull);
  bits = (bits & 0x3333333333333333ull) + ((bits >> 2) & 0x3333333333333333ull);
  // (bytesof(bits) -1) * bitsofbyte = (8-1)*8 = 56 -------------------------------vv
  return ((bits + (bits >> 4) & 0x0f0f0f0f0f0f0f0full) * 0x0101010101010101ull) >> 56;
}

int llcs_asci (char * a, char * b, uint32_t alen, uint32_t blen) {
    // static uint64_t posbits[128] = { 0 }; //    73.4 (M/sec)
    // uint64_t posbits[128] = { 0 };        // 53050.4 (M/sec)
    uint64_t posbits[128];                   // 56338.0 (M/sec)

    uint64_t i;   
    for (i=0; i < 128; i++) { posbits[i] = 0; } // needed

    for (i=0; i < alen; i++) {
        posbits[(unsigned int)a[i]] |= 0x1ull << (i % width);
    }    

    uint64_t v = ~0ull;

    for (i=0; i < blen; i++) {
      uint64_t p = posbits[(unsigned int)b[i]];
      uint64_t u = v & p;
      v = (v + u) | (v - u);
    }

    return count_bits(~v);  // portable
    //return _mm_popcnt_u64(~v); 
}

int main (void) {
    clock_t tic;
    clock_t toc;
    double elapsed;
    double rate;

    uint64_t count;
    uint64_t megacount;
    uint32_t iters     = 1000000;
    uint32_t megaiters = 1;

    // m=10, n=11, llcs=7, d=4, sim=0.667
    char str1[] = "Choerephon";
    char str2[] = "Chrerrplzon";

    uint32_t len1 = strlen(str1);
    uint32_t len2 = strlen(str2);

    int length_lcs;

    /* ########## llcs_asci ########## */

    tic = clock();

    megaiters = 20;

    for (megacount = 0; megacount < megaiters; megacount++) {
      for (count = 0; count < iters; count++) {
          length_lcs = llcs_asci (str1, str2, len1, len2);
      }
    }

    toc = clock();
    elapsed = (double)(toc - tic) / (double)CLOCKS_PER_SEC;
    rate    = (double)megaiters / (double)elapsed;

    // need to use the result to avoid loop unrolling ---------------------vv
    printf("[llcs_asci] iters: %u M Elapsed: %f s Rate: %.1f (M/sec) llcs: %u\n", 
        megaiters, elapsed, rate, length_lcs);

    /* #################### */

    return 0;

}

Upvotes: 2

Views: 118

Answers (1)

Alex Lop.
Alex Lop.

Reputation: 6875

Try to look at the disassembly of your code in both cases and compare it.

No static - https://godbolt.org/z/T2m7CQ

With static array - https://godbolt.org/z/naZWEY

Take a look at the main function! The compiler understands that all the loop iterations do not have any side effects and actually only the last iteration affects the length_lcs, so the compiler removes this code:

for (megacount = 0; megacount < megaiters; megacount++) {
  for (count = 0; count < iters; count++) {
      length_lcs = llcs_asci (str1, str2, len1, len2);
  }
}

main:                                   # @main
        push    r14
        push    rbx
        push    rax
        xor     ebx, ebx
        call    clock
        mov     r14, rax
.LBB2_1:                                # =>This Loop Header: Depth=1
        mov     eax, 1000000
.LBB2_2:                                #   Parent Loop BB2_1 Depth=1
        add     rax, -25
        jne     .LBB2_2
        add     rbx, 1
        cmp     rbx, 20
        jne     .LBB2_1
        call    clock
        sub     rax, r14
        cvtsi2sd        xmm0, rax
        divsd   xmm0, qword ptr [rip + .LCPI2_0]
        movsd   xmm1, qword ptr [rip + .LCPI2_1] # xmm1 = mem[0],zero
        divsd   xmm1, xmm0
        mov     edi, offset .L.str
        mov     esi, 20
        mov     edx, 7
        mov     al, 2
        call    printf
        xor     eax, eax
        add     rsp, 8
        pop     rbx
        pop     r14
        ret

However when the static array is used, the compiler has to update the values of the static array anyway, since it has to "remember" the latest values. Although the whole array is cleared on each entry, the memory allocated for posbits (when defined as static array) may be examined/tested (by debuggers for example) even at the moment when the execution is outside the llcs_asci function.

main:                                   # @main
        push    r15
        push    r14
        push    rbx
        xor     r15d, r15d
        call    clock
        mov     r14, rax
.LBB2_1:                                # =>This Loop Header: Depth=1
        mov     ebx, 1000000
.LBB2_2:                                #   Parent Loop BB2_1 Depth=1
        mov     edi, offset llcs_asci.posbits
        mov     edx, 1024
        xor     esi, esi
        call    memset
        mov     qword ptr [rip + llcs_asci.posbits+536], 1
        mov     qword ptr [rip + llcs_asci.posbits+912], 16
        mov     qword ptr [rip + llcs_asci.posbits+808], 40
        mov     qword ptr [rip + llcs_asci.posbits+896], 64
        mov     qword ptr [rip + llcs_asci.posbits+832], 130
        movaps  xmm0, xmmword ptr [rip + .LCPI2_0] # xmm0 = [512,260]
        movaps  xmmword ptr [rip + llcs_asci.posbits+880], xmm0
        mov     edi, offset llcs_asci.posbits
        mov     edx, 1024
        xor     esi, esi
        call    memset
        movaps  xmm0, xmmword ptr [rip + .LCPI2_0] # xmm0 = [512,260]
        mov     qword ptr [rip + llcs_asci.posbits+536], 1
        mov     qword ptr [rip + llcs_asci.posbits+912], 16
        mov     qword ptr [rip + llcs_asci.posbits+808], 40
        mov     qword ptr [rip + llcs_asci.posbits+896], 64
        mov     qword ptr [rip + llcs_asci.posbits+832], 130
        movaps  xmmword ptr [rip + llcs_asci.posbits+880], xmm0
        add     rbx, -2
        jne     .LBB2_2
        add     r15, 1
        cmp     r15, 20
        jne     .LBB2_1
        call    clock
        sub     rax, r14
        xorps   xmm0, xmm0
        cvtsi2sd        xmm0, rax
        divsd   xmm0, qword ptr [rip + .LCPI2_1]
        movsd   xmm1, qword ptr [rip + .LCPI2_2] # xmm1 = mem[0],zero
        divsd   xmm1, xmm0
        mov     edi, offset .L.str
        mov     esi, 20
        mov     edx, 7
        mov     al, 2
        call    printf
        xor     eax, eax
        pop     rbx
        pop     r14
        pop     r15
        ret

Please note, these optimization are available because the function which performs the benchmark (main in this case) and the functions which under the test (llcs_asci), are in the same compilation unit, which allows the compiler to inline the code and perform more aggressive optimizations. Try moving llcs_asci function to a different file and create a dependency between the iterations (by, for example, bitwise OR between the results length_lcs |= llcs_asci (str1, str2, len1, len2);. Then rerun your benchmark and I believe you will get more or less the same results.

Upvotes: 7

Related Questions