Sven Hager

Reputation: 3204

Count leading zeros in __m256i word

I'm tinkering around with AVX2 instructions and I'm looking for a fast way to count the number of leading zeros in a __m256i word (which has 256 bits).

So far, I have figured out the following way:

// Computes the number of leading zero bits.
// Here, avx_word is of type __m256i.

if (!_mm256_testz_si256(avx_word, avx_word)) {
  uint64_t word = _mm256_extract_epi64(avx_word, 0);
  if (word > 0)
    return (__builtin_clzll(word));

  word = _mm256_extract_epi64(avx_word, 1);
  if (word > 0)
    return (__builtin_clzll(word) + 64);

  word = _mm256_extract_epi64(avx_word, 2);
  if (word > 0)
    return (__builtin_clzll(word) + 128);

  word = _mm256_extract_epi64(avx_word, 3);
  return (__builtin_clzll(word) + 192);
} else
  return 256; // word is entirely zero

However, I find it rather clumsy to figure out the exact non-zero word within the 256-bit register.

Does anybody know if there is a more elegant (or faster) way to do this?

Just as additional information: I actually want to compute the index of the first set bit for arbitrarily long vectors created by logical ANDs, and I am comparing the performance of standard 64-bit operations with SSE and AVX2 code. Here is my entire test code:

#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
#include <sys/time.h>
#include <stdalign.h>

#define ALL  0xFFFFFFFF
#define NONE 0x0


#define BV_SHIFTBITS ((size_t)    6)
#define BV_MOD_WORD  ((size_t)   63)
#define BV_ONE       ((uint64_t)  1)
#define BV_ZERO      ((uint64_t)  0)
#define BV_WORDSIZE  ((uint64_t) 64)


uint64_t*
Vector_new(
    size_t num_bits) {

  assert ((num_bits % 256) == 0);
  size_t num_words = num_bits >> BV_SHIFTBITS;
  size_t mod = num_bits & BV_MOD_WORD;
  if (mod > 0)
    assert (0);
  uint64_t* words;
  posix_memalign((void**) &(words), 32, sizeof(uint64_t) * num_words);
  for (size_t i = 0; i < num_words; ++i)
    words[i] = 0;
  return words;
}


void
Vector_set(
    uint64_t* vector,
    size_t pos) {

  const size_t word_index = pos >> BV_SHIFTBITS;
  const size_t offset     = pos & BV_MOD_WORD;
  vector[word_index] |= (BV_ONE << (BV_MOD_WORD - offset));
}


size_t
Vector_and_first_bit(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_words) {

  for (size_t i = 0; i < num_words; ++i) {
    uint64_t word = vectors[0][i];
    for (size_t j = 1; j < num_vectors; ++j)
      word &= vectors[j][i];
    if (word > 0)
      return (1 + i * BV_WORDSIZE + __builtin_clzll(word));
  }
  return 0;
}


size_t
Vector_and_first_bit_256(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 2;
    __m256i avx_word = _mm256_load_si256(
        (__m256i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm256_and_si256(
        avx_word,
        _mm256_load_si256((__m256i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm256_testz_si256(avx_word, avx_word)) {
      uint64_t word = _mm256_extract_epi64(avx_word, 0);
      const size_t shift = i << 8;
      if (word > 0)
        return (1 + shift + __builtin_clzll(word));

      word = _mm256_extract_epi64(avx_word, 1);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 64);

      word = _mm256_extract_epi64(avx_word, 2);
      if (word > 0)
        return (1 + shift + __builtin_clzll(word) + 128);

      word = _mm256_extract_epi64(avx_word, 3);
      return (1 + shift + __builtin_clzll(word) + 192);
    }
  }
  return 0;
}


size_t
Vector_and_first_bit_128(
    uint64_t** vectors,
    const size_t num_vectors,
    const size_t num_avx_words) {

  for (size_t i = 0; i < num_avx_words; ++i) {
    const size_t addr_offset = i << 1;
    __m128i avx_word = _mm_load_si128(
        (__m128i const*) (vectors[0] + addr_offset));

    // AND the AVX words
    for (size_t j = 1; j < num_vectors; ++j) {
      avx_word = _mm_and_si128(
        avx_word,
        _mm_load_si128((__m128i const*) (vectors[j] + addr_offset))
      );
    }

    // test whether resulting AVX word is not zero
    if (!_mm_test_all_zeros(avx_word, avx_word)) {
      uint64_t word = _mm_extract_epi64(avx_word, 0);
      if (word > 0)
        return (1 + (i << 7) + __builtin_clzll(word));

      word = _mm_extract_epi64(avx_word, 1);
      return (1 + (i << 7) + __builtin_clzll(word) + 64);
    }
  }
  return 0;
}


uint64_t*
make_random_vector(
    const size_t num_bits,
    const size_t probability) {

  uint64_t* vector = Vector_new(num_bits);
  for (size_t i = 0; i < num_bits; ++i) {
    const int x = rand() % 10;
    if (x >= (int) probability)
      Vector_set(vector, i);
  }
  return vector;
}


size_t
millis(
    const struct timeval* end,
    const struct timeval* start) {

  struct timeval e = *end;
  struct timeval s = *start;
  return (1000 * (e.tv_sec - s.tv_sec) + (e.tv_usec - s.tv_usec) / 1000);
}


int
main(
    int argc,
    char** argv) {

  if (argc != 6) {
    printf("usage: %s num_vectors size num_iterations num_dimensions probability\n", argv[0]);
    return 1;
  }

  srand(time(NULL));

  const size_t num_vectors = atoi(argv[1]);
  const size_t size = atoi(argv[2]);
  const size_t num_iterations = atoi(argv[3]);
  const size_t num_dimensions = atoi(argv[4]);
  const size_t probability = atoi(argv[5]);
  const size_t num_words = size / 64;
  const size_t num_sse_words = num_words / 2;
  const size_t num_avx_words = num_words / 4;

  assert(num_vectors > 0);
  assert(size > 0);
  assert(num_iterations > 0);
  assert(num_dimensions > 0);

  struct timeval t1;
  gettimeofday(&t1, NULL);

  uint64_t*** vectors = (uint64_t***) malloc(sizeof(uint64_t**) * num_vectors);
  for (size_t j = 0; j < num_vectors; ++j) {
    vectors[j] = (uint64_t**) malloc(sizeof(uint64_t*) * num_dimensions);
    for (size_t i = 0; i < num_dimensions; ++i)
      vectors[j][i] = make_random_vector(size, probability);
  }

  struct timeval t2;
  gettimeofday(&t2, NULL);
  printf("Creation: %zu ms\n", millis(&t2, &t1));



  size_t* results_64    = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_128   = (size_t*) malloc(sizeof(size_t) * num_vectors);
  size_t* results_256   = (size_t*) malloc(sizeof(size_t) * num_vectors);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_64[i] = Vector_and_first_bit(vectors[i], num_dimensions,
          num_words);
  gettimeofday(&t2, NULL);
  const size_t millis_64 = millis(&t2, &t1);
  printf("64            : %zu ms\n", millis_64);


  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_128[i] = Vector_and_first_bit_128(vectors[i],
          num_dimensions, num_sse_words);
  gettimeofday(&t2, NULL);
  const size_t millis_128 = millis(&t2, &t1);
  const double factor_128 = (double) millis_64 / (double) millis_128;
  printf("128           : %zu ms (factor: %.2f)\n", millis_128, factor_128);

  gettimeofday(&t1, NULL);
  for (size_t j = 0; j < num_iterations; ++j)
    for (size_t i = 0; i < num_vectors; ++i)
      results_256[i] = Vector_and_first_bit_256(vectors[i],
          num_dimensions, num_avx_words);
  gettimeofday(&t2, NULL);
  const size_t millis_256 = millis(&t2, &t1);
  const double factor_256 = (double) millis_64 / (double) millis_256;
  printf("256           : %zu ms (factor: %.2f)\n", millis_256, factor_256);


  for (size_t i = 0; i < num_vectors; ++i) {
    if (results_64[i] != results_256[i])
      printf("ERROR: %zu (64) != %zu (256) with i = %zu\n", results_64[i],
          results_256[i], i);
    if (results_64[i] != results_128[i])
      printf("ERROR: %zu (64) != %zu (128) with i = %zu\n", results_64[i],
          results_128[i], i);
  }


  free(results_64);
  free(results_128);
  free(results_256);

  for (size_t j = 0; j < num_vectors; ++j) {
    for (size_t i = 0; i < num_dimensions; ++i)
      free(vectors[j][i]);
    free(vectors[j]);
  }
  free(vectors);
  return 0;
}

To compile:

gcc -o main main.c -O3 -Wall -Wextra -pedantic-errors -Werror -march=native -std=c99 -fno-tree-vectorize

To execute:

./main 1000 8192 50000 5 9

The parameters mean: 1000 test cases, vectors of length 8192 bits, 50,000 test repetitions (the last two parameters are minor tweaks).

Sample output for the above call on my machine:

Creation: 363 ms
64            : 15000 ms
128           : 10070 ms (factor: 1.49)
256           : 6784 ms (factor: 2.21)

Upvotes: 7

Views: 3556

Answers (4)

Peter Cordes

Reputation: 365267

If your input values are uniformly distributed, almost all of the time the highest set bit will be in the top 64 bits of the vector (the top 64 bits are all zero only 1 time in 2^64). A branch on this condition will predict very well. @Nejc's answer is good for that case.

But many problems where lzcnt is part of the solution have more uniformly distributed output, where it's common for the highest set bit to be somewhere other than the highest 64 bits.


Wim's idea of lzcnt on a compare bitmap to find the right element is a very good approach.

However, runtime-variable indexing of the vector with a store/reload is probably better than a shuffle. Store-forwarding latency is low (maybe 5 to 7 cycles on Skylake), and some of that latency is in parallel with the index generation (compare / movemask / lzcnt). The movd/vpermd/movd lane-crossing shuffle strategy takes 7 cycles on Skylake to get the right element into an integer register, only starting after the index is known. Load-use latency for an L1d cache hit is only 5 cycles from address to data, probably similar when data comes from the store buffer (4c only in pointer-chasing scenarios).

See http://agner.org/optimize/, and especially https://uops.info/, which has latency measurements for movd. If you look at the details, one of the measurements is for a round-trip chain of movd to/from xmm0. On Haswell it's 2 cycles per round trip, so 5 total with a vpermd in between. On Skylake, a movd round trip has 4-cycle latency, presumably 3 cycles one way and 1 the other, so 7 cycles with vpermd.

I think this version should have better or equal latency on Haswell/Skylake, and also better throughput. Much better on Ryzen. (vpermd is slower on Zen than on Intel before Zen 4. And a movd round trip on Zen 4 is about 7 cycles, with one of the movd instructions being 2 uops; it's a 6-cycle round trip on Zen 2 and 3 with single-uop movd.)

Some scalar math is needed on an lzcnt result to get a load index, which eats into the latency and throughput advantage some, depending on how clever the compiler is. lzcnt on a vmovmskps result works directly as a shuffle index for vpermd.
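Not from this answer: here is a rough sketch of what the shuffle path might look like (movd the index in, vpermd, movd the element out), just to make the comparison concrete. The function name and the exact index math are mine; it assumes AVX2 + LZCNT like the rest of the snippets here.

#include <immintrin.h>
#include <stdint.h>

// Hypothetical shuffle-based variant: lzcnt of the compare bitmap picks a dword,
// vpermd moves it to the bottom lane so a movd can extract it.
static inline int lzcnt_si256_vpermd(__m256i vec)
{
    __m256i  vzero   = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
    uint32_t nzmask  = ~_mm256_movemask_epi8(vzero);    // 1 for bytes of non-zero dwords
    if (nzmask == 0)
        return 256;                                     // whole vector is zero
    unsigned lzbytes = _lzcnt_u32(nzmask);              // zero bytes above the highest non-zero dword
    unsigned idx     = 7 - (lzbytes >> 2);              // dword index of that element
    __m256i  vidx    = _mm256_castsi128_si256(_mm_cvtsi32_si128((int)idx));        // movd
    __m256i  perm    = _mm256_permutevar8x32_epi32(vec, vidx);                     // vpermd
    uint32_t elem    = (uint32_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(perm));  // movd
    return 8 * lzbytes + _lzcnt_u32(elem);
}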

Aligning the stack by 32 avoids cache-line splits on a 32-byte store, but it takes extra instructions. So this is best if it can inline into a function that uses it multiple times, or already needs that much alignment for some other __m256i. GCC will align the stack whether you ask for it or not, but MSVC and Clang won't. Store-forwarding from a dword that's aligned relative to the store works even if the store itself is a cache-line split, on most modern CPUs I think.

static inline
int lzcnt_si256(__m256i vec)
{
    // or just lzcnt_si256_memsrc(&vec) optimizes away store/reload
    __m256i   vnonzero = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
    uint32_t  nzmask = ~ _mm256_movemask_epi8(vnonzero);   //  1 for bytes that are part of non-zero dwords
    // nzmask |= 0xf;  // branchless clamp to last elem
    if (nzmask == 0)     // all 32 bits came from vpmovmskb, so NOT didn't introduce any constant 1s
        return 256;      // don't access outside the array
    alignas(32) uint32_t elems[8];
    _mm256_storeu_si256((__m256i*)elems, vec);

    unsigned  lzbytes = _lzcnt_u32(nzmask);   // bytes above the dword containing the nonzero bit.
    unsigned char *end_elem = 28 + (unsigned char*)elems;
    uint32_t *nz_elem = (uint32_t*)(end_elem - lzbytes);  // idx = 31-(lzcnt+3) = 28-lzcnt
    return    8*lzbytes + _lzcnt_u32(*nz_elem);  // this is an aligned load, memcpy not needed
}

// unaligned and strict-aliasing safe
static inline
int lzcnt_si256_memsrc(const void *m256)
{
    __m256i  vec = _mm256_loadu_si256((const __m256i*)m256);
    __m256i  vnonzero = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
    uint32_t nzmask = ~ _mm256_movemask_epi8(vnonzero);
    // nzmask |= 0xf;  // branchless clamp to last elem
    if (nzmask == 0)    // all 32 bits came from vpmovmskb, so NOT didn't introduce any constant 1s
        return 256;     // don't access outside the array

    unsigned  lzbytes = _lzcnt_u32(nzmask);
    unsigned char *end_elem = 28 + (unsigned char*)m256;  // can be done as part of the addressing mode after sub
    uint32_t *nz_elem = (uint32_t*)(end_elem - lzbytes);  // idx = MSB_idx-3 = 31-(lzcnt+3) = 28-lzcnt
    uint32_t nz_dword;
    memcpy(&nz_dword, nz_elem, sizeof(nz_dword));  // For strict-aliasing safety,  and/or if m256 wasn't aligned by 4.  __attribute__((aligned(1),may_alias)) on the pointer would work in GNU C
    return    8*lzbytes + _lzcnt_u32(nz_dword);
}

GCC and clang don't optimize away the storeu copy if the vector was just loaded, so I made a separate version, unfortunately. (If your data isn't 4-byte aligned and you're getting cache-line splits on the dword load as well as the vector load, consider using the byte version that loads with movzx.)

On Godbolt with GCC13 -O3 -march=x86-64-v3, we get asm like this to count ymm0 into esi (inlined into main's loop.)

# GCC13.2 -O3 -march=x86-64-v3     (or -march=haswell)
# lzcnt_si256_memsrc inside a loop in main, after printf
   vmovdqa ymm0, YMMWORD PTR [rsp]
   vpxor   xmm1, xmm1, xmm1
   mov     esi, 256
   vpcmpeqd        ymm0, ymm0, ymm1  # could have used a memory source operand
   vpmovmskb       eax, ymm0
   xor     eax, -1                   # ~mask and set FLAGS, unlike the NOT instruction
   je      .L33
   xor     ecx, ecx            # dep-breaking for lzcnt
   mov     edx, 28
   lzcnt   ecx, eax
   sub     rdx, rcx
   lzcnt   edx, DWORD PTR [rsp+32+rdx]  # Un-laminates into 2 uops with an indexed addr mode.  mov rdx, rsp ; sub would have allowed [rdx+32+28] here
   lea     esi, [rdx+rcx*8]          # lzbytes*8 + lzcnt(nz chunk)
.L33:   # jump target for mask==0

Clang likes to do vptest ymm0, ymm0 / jne for the early-out, costing more uops than just waiting to test the movemask result. Perhaps an [[unlikely]] annotation or non-portable equivalent could help.
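For example, with the GNU extension (just a guess at what might steer clang, not something tested in this answer), applied to the early-out in the functions above:

if (__builtin_expect(nzmask == 0, 0))   // hint that the all-zero early-out is cold
    return 256;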

GCC's non-inline version is better in some ways (sub against the pointer arg and +28 as part of the addr mode for lzcnt edx, [rdi+28] which can stay micro-fused on Intel since it only uses one reg.) But GCC wastes a mov reg copy and two dep-breaking xor-zeroing instructions even though both lzcnt instructions can overwrite their inputs (or a reg holding a pointer for the mem-src version). Sometimes it's possible to rearrange your C in ways that help, but it depends on the surrounding code where this inlines.


bsr instead of 31 - lzcnt on the mask could reduce critical path latency on Intel: no SUB or NEG, just adding something as part of the addressing mode for the scalar load. GCC8 and earlier would emit it for 31-__builtin_clz(), but current GCC just uses 31-lzcnt or 31^lzcnt even with -march=haswell where both have identical performance characteristics (including the output dependency.)

If you're tuning specifically for Intel, BSR might still be a good idea. But for portable software, BSR is significantly slower on AMD than LZCNT, which is relevant everywhere except x86-64 macOS. But good luck getting compilers other than MSVC to emit it.

#ifndef _MSC_VER
#include <stdalign.h>  //MSVC is missing this?
#else
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)  // https://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx suggests this
#endif

// undefined result for mask=0, like BSR
static inline uint32_t bsr_nonzero(uint32_t mask)
{
// on Intel, bsr has a minor advantage for the first step
// for AMD, BSR is slow so you should use 31-LZCNT.

 // Intel's docs say there should be a _bit_scan_reverse(x), maybe try that with ICC
 #ifdef _MSC_VER
    unsigned long tmp;
    _BitScanReverse(&tmp, mask);
    return tmp;
 #else
    return 31 - __builtin_clz(mask);  // GCC 9 and later use lzcnt even with -march=haswell implying tune for Intel, leading to worse code.  Same for clang
 #endif
}

In C++20 there's 31 - std::countl_zero(mask) - all CPUs with AVX2 also have BMI1/BMI2, so compilers will be able to use lzcnt for it. There's no portable C equivalent. (On CPUs without BMI1, countl_zero would be slightly slower because it guarantees 32 for mask=0, unlike the bsr instruction or intrinsic, or GNU __builtin_clz. So it would branch or CMOV on the input being zero.)

BSR version using byte elements

Not recommended in general, but I didn't want to throw this away. It compiles with a BSR on MSVC but is worse with GCC and Clang. Change the #if to 1 for lzcnt, for a byte-source version for unaligned data to avoid cache-line and page splits on the scalar load (but not the vec).

int lzcnt_si256_byte_memsrc(const void *m256)
{
    __m256i  vec = _mm256_loadu_si256((const __m256i*)m256);
    __m256i   nonzero_elem = _mm256_cmpeq_epi8(vec, _mm256_setzero_si256());
    uint32_t  mask = ~_mm256_movemask_epi8(nonzero_elem);   //  1 where there are non-zero bytes

    // alternative: mask |= 1 to lzcnt the low byte in that case, giving 32-24 = 8
    if (mask == 0)
        return 256;
    unsigned char *elems = (unsigned char*)m256;

#if 0  // GCC13 compiles both the same, clang's __builtin_clz version is worse (xor twice)
// Unless you're specifically tuning for Intel, use this version
    uint8_t *end_elem = elems + 31;
    unsigned   lz_msk   = _lzcnt_u32(mask);
    size_t   idx_from_end = -(size_t)lz_msk;    // idx = 31 - lz_msk;  // relative to start
    unsigned   highest_nonzero_byte = end_elem[idx_from_end];
#else
    unsigned   idx = bsr_nonzero(mask);   // use bsr to get the 31-x for free, because it's known to be non-zero
    unsigned   lz_msk = 31 - idx;    // off the critical path, if compilers actually use BSR
// MSVC notices that (31-idx)*8 - 24 + x  is (28-idx)*8 + x,  allowing a 2-component LEA
// GCC and clang miss that, in older versions that actually use BSR
    unsigned   highest_nonzero_byte = elems[idx];
#endif
    return     lz_msk * 8 + _lzcnt_u32(highest_nonzero_byte) - 24;
    // lzcnt(byte)-24, because we don't want to count the leading 24 bits of padding.
}

On AMD CPUs, bsr is significantly slower than lzcnt. On Intel CPUs, they're the same performance, except for minor variations in output-dependency details. (lzcnt has a false dependency before Skylake. bsr has a true dependency on the output for all CPUs.)

bsr with input=0 leaves the destination register unmodified, but intrinsics don't provide a way to take advantage of that to get CMOV-like behaviour for free. (Intel only documents it as undefined output, but AMD documents the actual behaviour of Intel / AMD CPUs as producing the old value in the destination register).

bsr sets ZF if the input was zero, rather than based on the output like most instructions. (This and the output dependency may be why it's slow on AMD.) Branching on the BSR flags is not particularly better than branching on ZF as set by xor eax,-1 to invert the mask, which is what gcc does. Anyway, Intel does document a _BitScanReverse(&idx, mask) intrinsic that returns a bool, but gcc doesn't support it (not even with x86intrin.h). The GNU C builtin doesn't return a boolean to let you use the flag result, and doesn't normally use the FLAGS result even when you check the input C variable for non-zero.


Wim's version needs lz_msk-24 because the high 24 bits are always zero with an 8-bit mask. But a 32-bit mask fills a 32-bit reg.

This version with 8 bit elements and a 32-bit mask is the reverse: we need to lzcnt the selected byte, not including the 24 leading zero bits in the register. So our -24 moves to a different spot, not part of the critical path for indexing the array.

GCC chooses to do it as part of a single 3-component LEA (reg + reg*scale - const), which is great for throughput, but puts it on the critical path after the final lzcnt. (3-component LEA has extra latency vs. reg + reg*scale on Intel before Ice Lake. See Agner Fog's instruction tables, and https://uops.info/ for testing of different LEA addr modes on Ice Lake and Alder Lake; any scaled index is now a slow-LEA, but still 1c latency until Alder, and more ports.)

Clang with some -march options makes a mess, favouring latency over throughput by using more instructions instead of a complex LEA. But sometimes it breaks things up more than necessary.

A multiply by 8 can be done as part of an lea, but a multiply by 32 would need a shift (or be folded into two separate LEAs).


Intel's optimization manual says (Table 2-24) even Sandybridge can forward from a 256-bit store to single-byte loads without a problem, so I think it's fine on AVX2 CPUs, the same as forwarding to 32-bit loads of 4-byte-aligned chunks of the store.


AVX-512 versions, maybe not faster, just an experiment

Store/reload might still be a better strategy, especially with data already in memory to avoid the store part. We can make the low element special, always true or always false, with vptestnmb against a vector of all-ones except the low dword, getting the nz |= 0xf for free.

AVX-512 has vplzcntq to do per-element 64-bit bit-scan. Doing that before a store could shorten critical-path latency by overlapping the lzcnt with the element-index calculation. But it takes a vector ALU instead of the port-1 ALU on Intel CPUs which can't run 512-bit vector uops (or any vector uops when there are any 512-bit uops in flight.)

The fun way is to shuffle and blend using the mask to pick one of four elements. I was hoping I could use vpcompressq to left-pack, but that grabs the lowest element, not the highest. A horizontal-sum type of shuffle + blend pattern can grab the lower element only if the high element's mask bit was zero. Shuffles themselves can use merge-masking, including into the same reg for single-uop reduction steps. But the mask instructions to prepare the mask aren't fast.

// See Godbolt for the first 2 attempts at this

// this isn't great either
int lzcnt_si256_avx512_v2(__m256i vec)
{
    __mmask8 nz = _mm256_test_epi64_mask(vec, vec); // 1 where input is non-zero, useful for merging higher into lower elements
    __m256i vclz = _mm256_lzcnt_epi64(vec);
    vclz = _mm256_add_epi64(vclz, _mm256_set_epi64x(0, 64, 128, 192));  // add the number of bits to the left of each element
    int nz_shr = nz>>1;
    vclz = _mm256_mask_unpackhi_epi64(vclz, nz_shr, vclz, vclz); //  even elements = higher elem if it was non-zero (nz_mask=1 merge masking), else keep the lower element
    nz_shr |= nz;  // kand: 1 cycle latency on port 0
    nz_shr >>= 2;  // kshiftr: 4 cycle latency on port 5 (Intel).  Zen 4 is 1c latency on 2 ports
    __m128i low = _mm256_mask_extracti32x4_epi32(_mm256_castsi256_si128(vclz), nz_shr, vclz, 1);
    return _mm_cvtsi128_si32(low);
}

Trailing zero count via vpcompressd to left-pack based on non-zero mask

// Just for fun, use vpcompressd to grab the first non-zero element
// Worth comparing for throughput and latency on Intel and Zen 4, vs. store/reload
static inline
int tzcnt_si256_avx512(__m256i vec)
{
    __mmask8 mask = _mm256_test_epi32_mask(vec, vec);
    __m256i pack_nz = _mm256_maskz_compress_epi32(mask, vec);
    uint32_t first_nz = _mm256_extract_epi32(pack_nz, 0);  // optimizes to vmovd
    uint32_t mask32 = mask | (-1U<<7); // produce 256 = 7*32 + 32 for an all-zero vector where tzcnt(first_nz) = 32.
      // -1U<<7 fits in signed imm8.  With 1<<7, compilers were going nuts trying to optimize
      // alternative: if (mask==0) return 256;
    return _tzcnt_u32(first_nz) + 32*_tzcnt_u32(mask32);
}

On Godbolt I included a version that uses VBMI2 (Ice Lake) vpcompressb so the final x + 32*y can be *8 instead, allowing LEA. But that makes it harder to handle an all-zero mask; see the code comments on Godbolt.
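For reference (this is not the Godbolt version, just a guess at its shape, and it sidesteps the all-zero-mask complication with an early return; the name is hypothetical and it assumes AVX-512VBMI2 + BW + VL):

// Hypothetical vpcompressb variant: lowest non-zero byte goes to lane 0,
// so the scale factor is 8 and can fold into an LEA.
static inline int tzcnt_si256_vbmi2(__m256i vec)
{
    __mmask32 mask   = _mm256_test_epi8_mask(vec, vec);          // 1 per non-zero byte
    if (mask == 0)
        return 256;                                              // punt on the all-zero case
    __m256i   packed = _mm256_maskz_compress_epi8(mask, vec);    // lowest non-zero byte -> lane 0
    uint32_t  first  = (uint32_t)_mm256_extract_epi8(packed, 0) & 0xff;
    return _tzcnt_u32(first) + 8 * _tzcnt_u32(mask);             // byte index * 8 + tzcnt within the byte
}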

Another tzcnt strategy could involve vplzcntq( v & -v ), and subtracting that from a vector of set_epi64x(192+63, 128+63, 64+63, 63) to get tzcnt=63-lzcnt(blsi(vec)). Then select the right non-zero element? That's more vector ops, but runs the lzcnt dep chain in parallel with the compress shuffle. vpcompress* is 2 uops, and the first only needs the mask as input, not the vector being shuffled. (Presumably it processes a mask into a shuffle-control for vperm*). That would optimize for latency but not throughput. More vector uops is even more of a downside if using 512-bit vectors.

Handling the all-zero input case might need a branch with the compress strategy, unless we wanted to do a merge-masked sub into a vector of set1(256). 63-lzcnt only works for lzcnt in [0..63]; it would need -1 instead of 64 to get the result we want for the input=0 case. That wouldn't hurt instruction-level parallelism badly: the vplzcntq could still run in parallel with the compare-into-mask, and vpsubq would have to wait for both to be ready. And that's also separate from the compress reading the mask from the compare.
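Also not from this answer: a rough sketch of that 63-lzcnt(v & -v) idea, under the same AVX-512VL/CD assumptions as the snippets above. It handles the all-zero case with a branch and compresses the already-subtracted vector, which gives up some of the overlap described above; the name is made up.

// Hypothetical per-element tzcnt via 63 - lzcnt(blsi), then pick the lowest non-zero lane.
static inline int tzcnt_si256_avx512_blsi(__m256i vec)
{
    __mmask8 nz   = _mm256_test_epi64_mask(vec, vec);                          // 1 per non-zero qword
    __m256i  blsi = _mm256_and_si256(vec,
                        _mm256_sub_epi64(_mm256_setzero_si256(), vec));        // v & -v isolates the lowest set bit
    __m256i  tz   = _mm256_sub_epi64(_mm256_set_epi64x(192+63, 128+63, 64+63, 63),
                                     _mm256_lzcnt_epi64(blsi));                // per element: 64*i + tzcnt
    if (nz == 0)
        return 256;                                                            // all-zero input handled with a branch
    __m256i  low  = _mm256_maskz_compress_epi64(nz, tz);                       // lowest non-zero element into lane 0
    return (int)_mm_cvtsi128_si64(_mm256_castsi256_si128(low));
}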

Upvotes: 11

wim

Reputation: 3998

(Update: new answer since 2019-01-31)

Three alternatives are:

  • Peter Cordes' excellent answer. Fast. This solution is not branchless, which should not be a problem, unless the input is frequently zero with an irregular pattern of occurrences.

  • My previous answer which is in the edit history of this answer now. Less efficient than Peter Cordes' answer, but branchless.

  • This answer. Very fast if the data from the 2 tiny lookup tables is in L1 cache. The L1 cache footprint is 128 bytes. Branchless. It may suffer from cache misses when it isn't called often.

In this answer, the input epi64 vector is compared with zero, which produces a mask. This mask is converted to a 4-bit index i_mask (by _mm256_movemask_pd). With index i_mask two values are read from the two lookup tables: 1. the index of the first nonzero 64-bit element, and 2. the number of leading zero bits contributed by the preceding (from the left) zero elements. Finally, the _lzcnt_u64 of the first nonzero 64-bit element is computed and added to the lookup table value. Function mm256_lzcnt_si256 implements this method:

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
#include <stdalign.h>
/* gcc -Wall -m64 -O3 -march=haswell clz_avx256_upd.c */


int mm256_lzcnt_si256(__m256i input)
{   
    /* Version with lookup tables and scratch array included in the function                                                                  */

    /* Two tiny lookup tables (64 bytes each, less space is possible with uint8_t or uint16_t arrays instead of uint32_t):                       */
    /* i_mask  (input==0)                 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111                        */
    /* ~i_mask (input!=0)                 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000                        */
    static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};

    alignas(32)  uint64_t tmp[4]     = {   0,   0,   0,   0};                /* tmp is a scratch array of 32 bytes, preferably 32 byte aligned   */ 

                          _mm256_storeu_si256((__m256i*)&tmp[0], input);     /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
             return lz;
}    


int mm256_lzcnt_si256_v2(__m256i input, uint64_t* restrict tmp, const uint32_t* indx, const uint32_t* lz_msk)
{   
    /* Version that compiles to nice assembly, although, after inlining there won't be any difference between the different versions.            */
                          _mm256_storeu_si256((__m256i*)&tmp[0], input);     /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
             return lz;
}    


__m256i bit_mask_avx2_lsb(unsigned int n)               
{           
    __m256i ones       = _mm256_set1_epi32(-1);
    __m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
    __m256i shift      = _mm256_set1_epi32(n);   
            shift      = _mm256_subs_epu16(cnst32_256,shift);  
                  return _mm256_srlv_epi32(ones,shift);
}


int print_avx2_hex(__m256i ymm)
{
    long unsigned int x[4];
        _mm256_storeu_si256((__m256i*)x,ymm);
        printf("%016lX %016lX %016lX %016lX  ", x[3],x[2],x[1],x[0]);
    return 0;
}


int main()
{
    unsigned int i;
    __m256i x;

    printf("mm256_lzcnt_si256\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));

   /* Set arrays for mm256_lzcnt_si256_v2:                          */
    alignas(32) static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    alignas(32) static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};
    alignas(32)              uint64_t tmp[4]     = {   0,   0,   0,   0};
    printf("\nmm256_lzcnt_si256_v2\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));

    return 0;
}

The output suggests that the code is correct:

$ ./a.out
mm256_lzcnt_si256
x=0000000000000000 0000000000000000 0000000000000000 0000000000000000  lzcnt(x)=256 
x=0000000000000000 0000000000000000 0000000000000000 0000000000000001  lzcnt(x)=255 
...
x=0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=129 
x=0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=128 
x=0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=127 
...
x=7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=1 
x=FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=0 

x=0000000000000000 0000000000000000 000000000000000F 0000000100000000  lzcnt(x)=188 
x=0000000000000000 0000000000000008 0000000000000000 0000000000000100  lzcnt(x)=124 
x=0000000000000100 0000000000000008 00000000000000C0 0000000000000000  lzcnt(x)=55 
x=FFFFFFFF00000100 0000000000000008 0000000000000000 0000002000000000  lzcnt(x)=0 

Function mm256_lzcnt_si256_v2 is an alternative version of the same function, but now the pointers to the lookup tables and the scratch array are passed with the function call. This leads to clean assembly code (no stack operations), and gives an impression of which instructions are needed after inlining mm256_lzcnt_si256 in a loop.

With gcc 8.2 and options -m64 -O3 -march=skylake:

mm256_lzcnt_si256_v2:
        vpxor   xmm1, xmm1, xmm1
        vmovdqu YMMWORD PTR [rdi], ymm0
        vpcmpeqq        ymm0, ymm0, ymm1
        vmovmskpd       ecx, ymm0
        mov     eax, DWORD PTR [rsi+rcx*4]
        lzcnt   rax, QWORD PTR [rdi+rax*8]
        add     eax, DWORD PTR [rdx+rcx*4]
        vzeroupper
        ret

In a loop context, and with inlining, vpxor is likely hoisted outside the loop.

Upvotes: 4

Nejc

Reputation: 937

Since you are also asking for a more elegant (i.e. simpler) way to do this: on my computer, your code runs as fast as the one below. In both cases it took 45 milliseconds to compute the result for 10 million 256-bit words.

Since I was filling AVX registers with (four) randomly generated uniformly distributed 64-bit integers (and not uniformly distributed 256-bit integers), the order of iteration through the array had no impact on the result of my benchmark test. Also, even though this is almost needless to say, the compiler was smart enough to unroll the loop.

uint32_t countLeadZeros(__m256i const& reg)
{
  alignas(32) uint64_t v[4];
  _mm256_store_si256((__m256i*)&v[0], reg);

  for (int i = 3; i >= 0; --i)
    if (v[i]) return _lzcnt_u64(v[i]) + (3 - i)*64;

  return 256;
}

EDIT: as can be seen in the discussion below my answer and in my edit history, I initially took an approach similar to that of @PeterCordes (but he provided a better optimized solution). I changed my approach once I started doing benchmarks because I completely overlooked the fact that practically all of my inputs had the most significant bit located within the top 64 bits of the AVX word.

After I realized the mistake I had made, I decided to try to do the benchmarks more properly. I will present two results below. I searched through the edit history of my post and from there I copy-pasted the function I submitted (but later edited out) before I changed my approach and went for the branched version. That function is presented below. I compared the performance of my "branched" function, my "branchless" function and the branchless function that was independently developed by @PeterCordes. His version is superior to mine in terms of performance - see his excellently written post that contains lots of useful details.

int countLeadZeros(__m256i const& reg){

  __m256i zero = _mm256_setzero_si256();
  __m256i cmp = _mm256_cmpeq_epi64(reg, zero);

  int mask = _mm256_movemask_epi8(cmp);

  if (mask == 0xffffffff) return 256;

  int first_nonzero_idx = 3 - (_lzcnt_u32(~mask) >> 3);

  alignas(32) uint64_t stored[4]; // edit: added alignas(32)
  _mm256_store_si256((__m256i*)stored, reg);

  int lead_zero_count = _lzcnt_u64(stored[first_nonzero_idx]);

  return (3 - first_nonzero_idx) * 64 + lead_zero_count;
}

Benchmark number 1

I will present the test code in pseudocode to make this short. I actually used an AVX implementation of a random number generator that generates random numbers blazingly fast. First, let's do the test on inputs that make branch prediction really hard:

tick()
for(int i = 0; i < N; ++i)
{
   // "xoroshiro128+"-based random generator was actually used
   __m256i in = _mm256_set_epi64x(rand()%2, rand()%2, rand()%2, rand()%2);

   res = countLeadZeros(in);  
}
tock();

For 10 million repetitions, the function from the top of my post takes 200ms. The implementation that I initially developed requires only 65ms to do the same job. But the function provided by @PeterCordes takes the cake by consuming only 60ms.

Benchmark number 2

Now let's turn to the test that I originally used. Again, pseudocode:

tick()
for(int i = 0; i < N; ++i)
{
   // "rand()" represents random 64-bit int; xoroshiro128+ waw actually used here
   __m256i in = _mm256_set_epi64x(rand(), rand(), rand(), rand());

   res = countLeadZeros(in);  
}
tock();

In this case, the version with branches is faster; 45ms are required to compute 10 million results. The function by @PeterCordes takes 50ms to complete and my "branchless" implementation requires 55ms to do the same job.

I don't think that I dare to draw any general conclusions out of this. It seems to me that the branchless approach is better, as it offers a more stable computation time, but whether you need that stability or not probably depends on the use case.

EDIT: the random generator.

This is an extended reply to a comment by @PeterCordes. As I stated above, the benchmark test code is just pseudocode. If anyone is interested in how I actually generated the numbers, here is a quick description.

I used the xoroshiro128+ algorithm, which was released into the public domain and which is available at this website. It is quite simple to rewrite the algorithm with AVX instructions so that four numbers are generated in parallel. I wrote a class that accepts the so-called initial seed (128 bits) as a parameter. I obtain the seeds (states) for each of the four parallel generators by first copying the initial seed four times; after that I apply the jump function to the i-th parallel generator i times, i = {0, 1, 2, 3}. Each jump advances the internal state J=2^64 steps forward. This means I can generate 4*J numbers (more than enough for all everyday purposes), four at a time, before any parallel generator starts to repeat a sequence of numbers already produced by another generator in the current session. I control the range of the produced numbers with the _mm256_srli_epi64 instruction; I use a shift of 63 for the first test and no shift for the second one.
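Not the author's actual class, but a minimal C sketch of that idea, assuming the original public-domain xoroshiro128+ constants (55, 14, 36) and one generator state per 64-bit lane; seeding and the jump function are left out.

#include <immintrin.h>
#include <stdint.h>

typedef struct { __m256i s0, s1; } xoro4_state;   // lane i holds the state of generator i

static inline __m256i rotl64_avx2(__m256i x, int k) {
    return _mm256_or_si256(_mm256_slli_epi64(x, k), _mm256_srli_epi64(x, 64 - k));
}

// Advances all four generators and returns four 64-bit outputs, one per lane.
static inline __m256i xoro4_next(xoro4_state *st) {
    __m256i s0 = st->s0, s1 = st->s1;
    __m256i result = _mm256_add_epi64(s0, s1);
    s1 = _mm256_xor_si256(s1, s0);
    st->s0 = _mm256_xor_si256(_mm256_xor_si256(rotl64_avx2(s0, 55), s1),
                              _mm256_slli_epi64(s1, 14));
    st->s1 = rotl64_avx2(s1, 36);
    return result;
}

Shifting the result right by 63 with _mm256_srli_epi64 then restricts each lane to {0, 1}, which matches the first benchmark.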

Upvotes: 2

gpnuma

Reputation: 128

I've got a version which is not really "elegant", but faster here (Apple LLVM version 9.0.0 (clang-900.0.39.2)):

#define NOT_ZERO(x) (!!(x))

#ifdef UNIFORM_DISTRIBUTION
#define LIKELY(x)           __builtin_expect(NOT_ZERO(x), 1)
#define UNLIKELY(x)         __builtin_expect(NOT_ZERO(x), 0)
#else
#define LIKELY(x)           (x)
#define UNLIKELY(x)         (x)
#endif


inline unsigned int clz_u128(uint64_t a, uint64_t b, int not_a, int not_b) {
    if(UNLIKELY(not_a)) {
        if(UNLIKELY(not_b)) {
            return 128;
        } else {
            return (__builtin_clzll(b)) + 64;
        }
    } else {
        return (__builtin_clzll(a));
    }
}

unsigned int clz_u256(__m256i packed) {
    const uint64_t a_0 = (uint64_t)_mm256_extract_epi64(packed, 0);
    const uint64_t a_1 = (uint64_t)_mm256_extract_epi64(packed, 1);
    const uint64_t b_0 = (uint64_t)_mm256_extract_epi64(packed, 2);
    const uint64_t b_1 = (uint64_t)_mm256_extract_epi64(packed, 3);

    const int not_a_0 = !a_0;
    const int not_a_1 = !a_1;

    if(UNLIKELY(not_a_0 & not_a_1)) {
        return clz_u128(b_0, b_1, !b_0, !b_1) + 128;
    } else {
        return clz_u128(a_0, a_1, not_a_0, not_a_1);
    }
}

It splits a bigger problem into smaller ones and uses the fact that it is far more likely for the higher bits to be non-zero than the lower bits if the vector distribution is uniform.

Just add a #define UNIFORM_DISTRIBUTION if uniform distribution is expected for extra performance.
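For example, before pasting or including the code above:

#define UNIFORM_DISTRIBUTION   // switches LIKELY/UNLIKELY from plain pass-through to __builtin_expect hints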

Upvotes: 0
