Reputation: 263118
The compare
function is a function that takes two arguments a
and b
and returns an integer describing their order. If a
is smaller than b
, the result is some negative integer. If a
is bigger than b
, the result is some positive integer. Otherwise, a
and b
are equal, and the result is zero.
This function is often used to parameterize sorting and searching algorithms from standard libraries.
Implementing the compare
function for characters is quite easy; you simply subtract the arguments:
int compare_char(char a, char b)
{
return a - b;
}
This works because the difference between two characters is generally assumed to fit into an integer. (Note that this assumption does not hold for systems where sizeof(char) == sizeof(int)
.)
This trick cannot work to compare integers, because the difference between two integers generally does not fit into an integer. For example, INT_MAX - (-1) = INT_MIN
suggests that INT_MAX
is smaller than -1
(technically, the overflow leads to undefined behavior, but let's assume modulo arithmetic).
So how can we implement the compare function efficiently for integers? Here is my first attempt:
int compare_int(int a, int b)
{
int temp;
int result;
__asm__ __volatile__ (
"cmp %3, %2 \n\t"
"mov $0, %1 \n\t"
"mov $1, %0 \n\t"
"cmovg %0, %1 \n\t"
"mov $-1, %0 \n\t"
"cmovl %0, %1 \n\t"
: "=r"(temp), "=r"(result)
: "r"(a), "r"(b)
: "cc");
return result;
}
Can it be done in less than 6 instructions? Is there a less straightforward way that is more efficient?
Upvotes: 65
Views: 44088
Reputation: 70402
The following has always proven to be fairly efficient for me:
return (a < b) ? -1 : (a > b);
With gcc -O2 -S
, this compiles down to the following five instructions:
xorl %edx, %edx
cmpl %esi, %edi
movl $-1, %eax
setg %dl
cmovge %edx, %eax
As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code what was posted above. And, when I was studying the compiler output more closely, I noticed that the compiler was not generating the same instructions as was posted in either of our answers. So, I took his test program, hand modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions compare roughly identically.
./opt_cmp_branchless: 0m1.070s
./opt_cmp_branch: 0m1.037s
I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.
The following is the version with the cmovge
instruction ((a < b) ? -1 : (a > b)
):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
orl $-1, %edi
.L12:
xorl %ebp, %ebp
.p2align 4,,10
.p2align 3
.L18:
movl arr.2789(%rbp), %ecx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L15:
movl arr.2789(%rax), %edx
xorl %ebx, %ebx
cmpl %ecx, %edx
movl $-1, %edx
setg %bl
cmovge %ebx, %edx
addq $4, %rax
addl %edx, %esi
cmpq $4096, %rax
jne .L15
addq $4, %rbp
cmpq $4096, %rbp
jne .L18
addl $1, %r8d
cmpl $500, %r8d
jne .L12
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",@progbits
The version below uses the branchless method ((a > b) - (a < b)
):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
.L19:
movl %ebp, %ebx
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L24:
movl %ebp, %ecx
xorl %eax, %eax
jmp .L22
.p2align 4,,10
.p2align 3
.L20:
movl arr.2789(%rax), %ecx
.L22:
xorl %edx, %edx
cmpl %ebx, %ecx
setg %cl
setl %dl
movzbl %cl, %ecx
subl %ecx, %edx
addl %edx, %esi
addq $4, %rax
cmpq $4096, %rax
jne .L20
addq $4, %rdi
cmpq $4096, %rdi
je .L21
movl arr.2789(%rdi), %ebx
jmp .L24
.L21:
addl $1, %r8d
cmpl $500, %r8d
jne .L19
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",@progbits
Upvotes: 57
Reputation: 24647
This code has no branches and uses 5 instructions. It may outperform other branch-less alternatives on recent Intel processors, where cmov* instructions are quite expensive. Disadvantage is non-symmetrical return value (INT_MIN+1, 0, 1).
int compare_int (int a, int b)
{
int res;
__asm__ __volatile__ (
"xor %0, %0 \n\t"
"cmpl %2, %1 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "=q"(res)
: "r"(a)
, "r"(b)
: "cc"
);
return res;
}
This variant does not need initialization, so it uses only 4 instructions:
int compare_int (int a, int b)
{
__asm__ __volatile__ (
"subl %1, %0 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "+q"(a)
: "r"(b)
: "cc"
);
return a;
}
Upvotes: 3
Reputation: 212969
For what it's worth I put together an SSE2 implementation. vec_compare1
uses the same approach as compare2
but requires just three SSE2 arithmetic instructions:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE vec_compare1
#define USE_RAND 1
int arr[COUNT] __attribute__ ((aligned(16)));
typedef __m128i vSInt32;
vSInt32 vec_compare1 (vSInt32 va, vSInt32 vb)
{
vSInt32 vcmp1 = _mm_cmpgt_epi32(va, vb);
vSInt32 vcmp2 = _mm_cmpgt_epi32(vb, va);
return _mm_sub_epi32(vcmp2, vcmp1);
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
vSInt32 vsum = _mm_set1_epi32(0);
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j+=4) {
vSInt32 v1 = _mm_loadu_si128(&arr[i]);
vSInt32 v2 = _mm_load_si128(&arr[j]);
vSInt32 v = COMPARE(v1, v2);
vsum = _mm_add_epi32(vsum, v);
}
}
}
printf("vsum = %vd\n", vsum);
return 0;
}
Time for this is 0.137s.
Time for compare2 with the same CPU and compiler is 0.674s.
So the SSE2 implementation is around 4x faster, as might be expected (since it's 4-wide SIMD).
Upvotes: 10
Reputation: 8095
This one has no branches, and doesn't suffer from overflow or underflow:
return (a > b) - (a < b);
With gcc -O2 -S
, this compiles down to the following six instructions:
xorl %eax, %eax
cmpl %esi, %edi
setl %dl
setg %al
movzbl %dl, %edx
subl %edx, %eax
Here's some code to benchmark various compare implementations:
#include <stdio.h>
#include <stdlib.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE compare2
#define USE_RAND 1
int arr[COUNT];
int compare1 (int a, int b)
{
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
int compare2 (int a, int b)
{
return (a > b) - (a < b);
}
int compare3 (int a, int b)
{
return (a < b) ? -1 : (a > b);
}
int compare4 (int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
int sum = 0;
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j++) {
sum += COMPARE(arr[i], arr[j]);
}
}
}
printf("%d=0\n", sum);
return 0;
}
The results on my 64-bit system, compiled with gcc -std=c99 -O2
, for positive integers (USE_RAND=1
):
compare1: 0m1.118s
compare2: 0m0.756s
compare3: 0m1.101s
compare4: 0m0.561s
Out of C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions. The slowdown is likely because, despite having one less instruction, there is a conditional instruction (cmovge
).
Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this code only benchmarked the integer range RAND_MAX, so the 4-instuction test is biased, because it handles overflows separately, and these don't occur in the test; the speed may be due to successful branch prediction.
With a full range of integers (USE_RAND=0
), the 4-instruction solution is in fact very slow (others are the same):
compare4: 0m1.897s
Upvotes: 97
Reputation: 263118
Okay, I managed to get it down to four instructions :) The basic idea is as follows:
Half the time, the difference is small enough to fit into an integer. In that case, just return the difference. Otherwise, shift the number one to the right. The crucial question is what bit to shift into the MSB then.
Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
00000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
11111111 shifted
Shifting the carry bit in would yield 0 for the first case (although INT_MIN
is not equal to INT_MAX
) and some negative number for the second case (although INT_MAX
is not smaller than INT_MIN
).
But if we flip the carry bit before doing the shift, we get sensible numbers:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
100000001 carry flipped
10000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
011111111 carry flipped
01111111 shifted
I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.
int compare_int(int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you proove me wrong?
Upvotes: 16
Reputation: 28251
Maybe you can use the following idea (in pseudo-code; didn't write asm-code because i am not comfortable with syntax):
result = a - b
)jo
instruction and branch prediction should work very well here)return (a < b) ? -1 : (a > b)
)Edit: for additional simplicity: if there was overflow, flip the sign of the result, instead of step 3.
Upvotes: 0