vls
vls

Reputation: 243

What is the fastest way for bit operations to calculate a parity?

My solution (for every bit of the input block, there is such a line):

*parity ^= (((x[0] >> 30) & 0x00000001) * 0xc3e0d69f);

All types are uint32. This line takes the second bit of the input x, shifts it to the LSB and sets all other bits to zero. Then, the 32-bit parity is XORed with the corresponding parity set for this bit.

I found that this multiplication solution is the fastest way to do this conditional XOR. Is there a faster way?

Upvotes: 4

Views: 17340

Answers (6)

Jack G
Jack G

Reputation: 5322

The stanford bithacks code is very suboptimal. For completeness, here is the best code across all platforms and all environments:

#include <stdint.h>

#if !defined(__wasm__) && defined(__cplusplus) && defined(__cpp_lib_bitops)
#   include <bit>
#   define paritytree_parity32(x) (std::popcount(x) & 1)
#elif !defined(__wasm__) && defined(__has_builtin)
#   if __has_builtin(__builtin_parity)
#       if INT_MAX >= INT32_MAX
#           define paritytree_parity32(x) __builtin_parity(x)
#       else
#           define paritytree_parity32(x) __builtin_parityl(x)
#       endif
#   endif
#endif
#if !defined(__wasm__) && defined(_MSC_VER) && defined(_M_X64) && !defined(paritytree_parity32)
#   include <intrin.h>
#   define paritytree_parity32(x) __popcnt(x)
#elif !defined(paritytree_parity32)
    static inline uint32_t paritytree_parity32(uint32_t v) {
        // On x86: 2 lea, 2 and, 1 xor, 1 mul, 1 shr, and 0 mov(!!!)
        // On arm64: 2 and, 1 add, 1 eor, 1 mul, 1 lsr, and 1 mov(!!!)
        v = (v ^ (v << 1)) & 0xAAAAAAAA;
        return (v*5 & UINT32_C(0x88888888)) * UINT32_C(0x11111111) >> 31;
    }
#endif

Here is the test C code used to reveal the assembly:

uint8_t fastParity(uint32_t x) {
    return paritytree_parity32(x);
}

On x86_64, Clang 17 and GCC 13 both produce this brilliant piece. Notice how it uses x86's 16-bit half-registers to remove 2 instructions for moving and shifting. Also notice setnp, which is a little-known x86 gem that gets the value of the parity flag register affected by xor. Yes!, every 16-bit xor sets the parity flag based upon the parity of its destination result. See https://c9x.me/x86/html/file_module_x86_id_288.html.

; Clang 17 and GCC 13 on x86_64
fastParity:
        mov     eax, edi
        shr     eax, 16
        xor     eax, edi
        xor     al, ah
        setnp   al
        ret

On x86_64 with MSVC 2019, it produces this. Note that popcnt requires at least 2008 Intel Nehalem (1st generation) and Windows 10 requires at least 2017 Intel Coffeelake (8th generation), so running on unsupported hardware isn't a concern.

; MSVC 2019 on x86_64
fastParity PROC
        popcnt  eax, ecx
        ret     0
fastParity ENDP

GCC on arm64 produces this:

; gcc 13 on arm64
fastParity:
        fmov    s0, w0
        cnt     v0.8b, v0.8b
        addv    b0, v0.8b
        fmov    w0, s0
        and     w0, w0, 1
        ret

Clang on arm64 produces this:

; clang 17 on arm64
fastParity:                          // @fastParity
        eor     w8, w0, w0, lsr #16
        eor     w8, w8, w8, lsr #8
        eor     w8, w8, w8, lsr #4
        eor     w8, w8, w8, lsr #2
        eor     w8, w8, w8, lsr #1
        and     w0, w8, #0x1
        ret

MSVC on arm64 produces this:

; MSVC 2019 on arm64
|fastParity| PROC
        eor         w8,w0,w0,lsl #1
        and         w8,w8,#0xAAAAAAAA
        add         w8,w8,w8,lsl #2
        and         w9,w8,#0x88888888
        mov         w8,#0x11111111
        mul         w8,w9,w8
        lsr         w0,w8,#0x1F
        ret

        ENDP  ; |fastParity|

The generated WebAssembly is:

; WebAssembly
 (func $fastParity (; 1 ;) (param $0 i32) (result i32)
  (i32.shr_u
   (i32.mul
    (i32.and
     (i32.mul
      (i32.and
       (i32.xor
        (i32.shl
         (get_local $0)
         (i32.const 1)
        )
        (get_local $0)
       )
       (i32.const -1431655766)
      )
      (i32.const 5)
     )
     (i32.const -2004318072)
    )
    (i32.const 286331153)
   )
   (i32.const 31)
  )
 )

This WebAssembly has no memory or floating point access that requires extra checker code, so it should get compiled into this nice small x86_64:

; Webassembly ran on a x86_64
fastParity:
        lea     eax, [rdi+rdi]
        xor     eax, edi
        and     eax, -1431655766
        lea     eax, [rax+rax*4]
        and     eax, -2004318072
        imul    eax, eax, 286331153
        shr     eax, 31
        ret

And this on arm64:

; Webassembly ran on an arm64
fastParity:
        eor     w9, w0, w0, lsl #1
        mov     w8, #286331153
        and     w9, w9, #0xaaaaaaaa
        add     w9, w9, w9, lsl #2
        and     w9, w9, #0x88888888
        mul     w8, w9, w8
        lsr     w0, w8, #31
        ret

For comparison, here is the very suboptimal C code and generated assembly by the Stanford bithacks page. There's 3 extra assembly instructions and no testing for faster hardware-specific parity instructions.

uint32_t stanfordParityDoNotUse(uint32_t v) {
    v ^= v >> 1;
    v ^= v >> 2;
    v = (v & 0x11111111U) * 0x11111111U;
    return (v >> 28) & 1;
}
stanfordParityDoNotUse:
        mov     edx, edi
        shr     edx
        xor     edx, edi
        mov     eax, edx
        shr     eax, 2
        xor     eax, edx
        and     eax, 286331153
        imul    eax, eax, 286331153
        shr     eax, 28
        and     eax, 1
        ret

Upvotes: 0

kimstik
kimstik

Reputation: 1889

A parity calculation task is equivalent of counting of ones. Also it called as "count set bits", "population count" or simply popcount. Some of processors have an efficient instruction to calculate it (POPCNT,VCNT).

I will suggest to use the lowest bit of popcount.

It can be accessed by inline assembler or by using builtins:

__builtin_popcount()/ __popcnt()/ std::bitset::count()

for GCC, Visual Studio, and C++.

Personally I prefer to give this job to the compiler by using __builtin_parity().

Upvotes: 3

ruslik
ruslik

Reputation: 14880

If I understand the question correctly, you are doing

for (i = 0; i < 32; i++)
    *parity ^= (((x[0] >> i) & 1) * SOME_CONST[i]); 

If so, it's better to use lookup tables:

for (i = 0; i < 4; i++)
    *parity ^= PARITY_LUT[i][ (x[0] >> (i*8)) & 0xFF];

It would cost 256 kilobytes, but it will be much faster.

Upvotes: 0

The Archetypal Paul
The Archetypal Paul

Reputation: 41769

See Compute parity in parallel for some neat hacks for calculating parity of a word, byte, etc.

Upvotes: 5

Vovanium
Vovanium

Reputation: 3898

I do not completely understand what kind of parity you mean, but if this line of code is doing that you want, it may be improved.

General rule: for x in {0, 1} x * N == -x & N

this because -x for 0 is all bits reset and for 1 is -1 in which all bits set.

So original line of code may be rewritten as:

*parity ^= (-((x[0] >> 30) & 0x00000001) & 0xc3e0d69f);

What two operations computed in less time than multiplication on many microprocessors, but you should check this.

Also code may take advantage of signed shift right

*parity ^= (((int32_t)x[0] << 1 >> 31) & 0xc3e0d69f);

First shift rshifts 30th bit into 31st, which is sign bit, and then second extend sign bit on all others as shift right on most machines act as floor(x / 2N), thus fill shifted in bits with sign bit (abc...yz>>3 == aaaabc...yz).

But these tricks are stated as undefined behaviour in C standard and thus not portable. Use them carefully.

Upvotes: 4

nmichaels
nmichaels

Reputation: 50971

Some processors will do this for you. See x86's parity flag.

Upvotes: 2

Related Questions