Reputation: 451
I must admit that I'm not very experienced in C. At my attempt to port my Perl module LCS::BV to C and tune the code I got in one session a surprising rate of 5 G iterations per second (compared to 12 M/s before).
Now during isolating the cause the difference is ~50 G/s to ~70 M/s on an Intel Core i7-4770HQ.
To be sure that clang does not unroll the loop for the benchmark, I can use _mm_popcnt_u64(~v)
and then decompile:
$ otool -tv lcstest > lcstest.dump.txt
$ grep popcnt lcstest.dump.txt
0000000100000e26 popcntq %rax, %rax
I'm not sure if something is wrong with the code or the method of the benchmark. See the annotated code (also available on GitHub):
#include <stdio.h>
#include <limits.h>
#include <time.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <nmmintrin.h>
static const uint64_t width = 64;
int count_bits(uint64_t bits) {
bits = bits - ((bits >> 1) & 0x5555555555555555ull);
bits = (bits & 0x3333333333333333ull) + ((bits >> 2) & 0x3333333333333333ull);
// (bytesof(bits) -1) * bitsofbyte = (8-1)*8 = 56 -------------------------------vv
return ((bits + (bits >> 4) & 0x0f0f0f0f0f0f0f0full) * 0x0101010101010101ull) >> 56;
}
int llcs_asci (char * a, char * b, uint32_t alen, uint32_t blen) {
// static uint64_t posbits[128] = { 0 }; // 73.4 (M/sec)
// uint64_t posbits[128] = { 0 }; // 53050.4 (M/sec)
uint64_t posbits[128]; // 56338.0 (M/sec)
uint64_t i;
for (i=0; i < 128; i++) { posbits[i] = 0; } // needed
for (i=0; i < alen; i++) {
posbits[(unsigned int)a[i]] |= 0x1ull << (i % width);
}
uint64_t v = ~0ull;
for (i=0; i < blen; i++) {
uint64_t p = posbits[(unsigned int)b[i]];
uint64_t u = v & p;
v = (v + u) | (v - u);
}
return count_bits(~v); // portable
//return _mm_popcnt_u64(~v);
}
int main (void) {
clock_t tic;
clock_t toc;
double elapsed;
double rate;
uint64_t count;
uint64_t megacount;
uint32_t iters = 1000000;
uint32_t megaiters = 1;
// m=10, n=11, llcs=7, d=4, sim=0.667
char str1[] = "Choerephon";
char str2[] = "Chrerrplzon";
uint32_t len1 = strlen(str1);
uint32_t len2 = strlen(str2);
int length_lcs;
/* ########## llcs_asci ########## */
tic = clock();
megaiters = 20;
for (megacount = 0; megacount < megaiters; megacount++) {
for (count = 0; count < iters; count++) {
length_lcs = llcs_asci (str1, str2, len1, len2);
}
}
toc = clock();
elapsed = (double)(toc - tic) / (double)CLOCKS_PER_SEC;
rate = (double)megaiters / (double)elapsed;
// need to use the result to avoid loop unrolling ---------------------vv
printf("[llcs_asci] iters: %u M Elapsed: %f s Rate: %.1f (M/sec) llcs: %u\n",
megaiters, elapsed, rate, length_lcs);
/* #################### */
return 0;
}
Upvotes: 2
Views: 118
Reputation: 6875
Try to look at the disassembly of your code in both cases and compare it.
No static - https://godbolt.org/z/T2m7CQ
With static array - https://godbolt.org/z/naZWEY
Take a look at the main
function! The compiler understands that all the loop iterations do not have any side effects and actually only the last iteration affects the length_lcs
, so the compiler removes this code:
for (megacount = 0; megacount < megaiters; megacount++) {
for (count = 0; count < iters; count++) {
length_lcs = llcs_asci (str1, str2, len1, len2);
}
}
main: # @main
push r14
push rbx
push rax
xor ebx, ebx
call clock
mov r14, rax
.LBB2_1: # =>This Loop Header: Depth=1
mov eax, 1000000
.LBB2_2: # Parent Loop BB2_1 Depth=1
add rax, -25
jne .LBB2_2
add rbx, 1
cmp rbx, 20
jne .LBB2_1
call clock
sub rax, r14
cvtsi2sd xmm0, rax
divsd xmm0, qword ptr [rip + .LCPI2_0]
movsd xmm1, qword ptr [rip + .LCPI2_1] # xmm1 = mem[0],zero
divsd xmm1, xmm0
mov edi, offset .L.str
mov esi, 20
mov edx, 7
mov al, 2
call printf
xor eax, eax
add rsp, 8
pop rbx
pop r14
ret
However when the static
array is used, the compiler has to update the values of the static
array anyway, since it has to "remember" the latest values. Although the whole array is cleared on each entry, the memory allocated for posbits
(when defined as static array) may be examined/tested (by debuggers for example) even at the moment when the execution is outside the llcs_asci
function.
main: # @main
push r15
push r14
push rbx
xor r15d, r15d
call clock
mov r14, rax
.LBB2_1: # =>This Loop Header: Depth=1
mov ebx, 1000000
.LBB2_2: # Parent Loop BB2_1 Depth=1
mov edi, offset llcs_asci.posbits
mov edx, 1024
xor esi, esi
call memset
mov qword ptr [rip + llcs_asci.posbits+536], 1
mov qword ptr [rip + llcs_asci.posbits+912], 16
mov qword ptr [rip + llcs_asci.posbits+808], 40
mov qword ptr [rip + llcs_asci.posbits+896], 64
mov qword ptr [rip + llcs_asci.posbits+832], 130
movaps xmm0, xmmword ptr [rip + .LCPI2_0] # xmm0 = [512,260]
movaps xmmword ptr [rip + llcs_asci.posbits+880], xmm0
mov edi, offset llcs_asci.posbits
mov edx, 1024
xor esi, esi
call memset
movaps xmm0, xmmword ptr [rip + .LCPI2_0] # xmm0 = [512,260]
mov qword ptr [rip + llcs_asci.posbits+536], 1
mov qword ptr [rip + llcs_asci.posbits+912], 16
mov qword ptr [rip + llcs_asci.posbits+808], 40
mov qword ptr [rip + llcs_asci.posbits+896], 64
mov qword ptr [rip + llcs_asci.posbits+832], 130
movaps xmmword ptr [rip + llcs_asci.posbits+880], xmm0
add rbx, -2
jne .LBB2_2
add r15, 1
cmp r15, 20
jne .LBB2_1
call clock
sub rax, r14
xorps xmm0, xmm0
cvtsi2sd xmm0, rax
divsd xmm0, qword ptr [rip + .LCPI2_1]
movsd xmm1, qword ptr [rip + .LCPI2_2] # xmm1 = mem[0],zero
divsd xmm1, xmm0
mov edi, offset .L.str
mov esi, 20
mov edx, 7
mov al, 2
call printf
xor eax, eax
pop rbx
pop r14
pop r15
ret
Please note, these optimization are available because the function which performs the benchmark (main
in this case) and the functions which under the test (llcs_asci
), are in the same compilation unit, which allows the compiler to inline the code and perform more aggressive optimizations. Try moving llcs_asci
function to a different file and create a dependency between the iterations (by, for example, bitwise OR between the results length_lcs |= llcs_asci (str1, str2, len1, len2);
. Then rerun your benchmark and I believe you will get more or less the same results.
Upvotes: 7