I've been using a free-list structure for a while, implemented as a sort of stack:
// gets the next free index from the top
unsigned acquire()
{
    std::lock_guard lock{m_spin};
    if (m_nextFreeHandle < m_size)
    {
        return std::exchange(m_nextFreeHandle, m_nextHandles[m_nextFreeHandle]);
    }
    return InvalidHandle;
}

void release(unsigned handle) // returns the index to the list
{
    std::lock_guard lock{m_spin};
    m_nextHandles[handle] = std::exchange(m_nextFreeHandle, handle);
}
Thread safety is guaranteed by a simple spinlock:
void SpinLock::lock() noexcept
{
    while (flag_.test_and_set(std::memory_order_acquire))
    {
        while (flag_.test(std::memory_order_relaxed))
            std::this_thread::yield();
    }
}

void SpinLock::unlock() noexcept
{
    flag_.clear(std::memory_order_release);
}
I decided to try reworking it with a lock-free approach:
static constexpr unsigned extractMark(unsigned value) noexcept
{
    return value >> 22;
}

static constexpr unsigned addMark(unsigned value, unsigned tag) noexcept
{
    return value | (tag << 22);
}

static constexpr unsigned clearMark(unsigned value) noexcept
{
    return value & (0xFFFFFFFFu >> 10); // keep the low 22 bits
}
unsigned acquire()
{
    auto nextIndex = m_nextFreeHandle.load(std::memory_order_acquire);
    if (clearMark(nextIndex) < m_size)
    {
        unsigned newNextIndex;
        do
        {
            newNextIndex = m_nextHandles[clearMark(nextIndex)].load(std::memory_order_relaxed);
        } while (!m_nextFreeHandle.compare_exchange_weak(nextIndex, newNextIndex, std::memory_order_release, std::memory_order_acquire));
        auto mark = extractMark(nextIndex);
        nextIndex = addMark(clearMark(nextIndex), mark + 1); // to avoid ABA problem
        return nextIndex;
    }
    return InvalidIndex;
}
void release(unsigned handle)
{
    if (handle == InvalidIndex)
        return;
    auto currentHead = m_nextFreeHandle.load(std::memory_order_relaxed);
    do
    {
        m_nextHandles[clearMark(handle)].store(currentHead, std::memory_order_relaxed);
    } while (!m_nextFreeHandle.compare_exchange_weak(currentHead, handle, std::memory_order_release, std::memory_order_relaxed));
}
Then I wrote a simple test (single- and multithreaded):
tbb::parallel_for(0, 10'000'000, [&](int) {
    auto h0 = list.acquire();
    list.release(h0);
});
and in both cases (single- and multithreaded) the lock-free version is much slower than the spinlock one!
I'd expect the lock-free version to be at least as fast as the spinlock one, but it's about 1.5x slower both for one thread and for 12 threads (I'm using an 11th-gen Intel Core i5 with 6 cores and HT). This is my first experience writing lock-free structures and I'm confused. What am I doing wrong? Am I expecting too much from the lock-free approach, or doing something in a non-optimal way?
Answer:
Lock-free algorithms are NOT guaranteed to be faster than lock-based algorithms. In highly-contended situations, they can easily be slower.
Since your code consists almost entirely of atomic reads, writes, and CASes, there's going to be a ton of work spent on ensuring cache coherence between the cores, transferring data and adjusting cache-line states. No core will get to "own" m_nextFreeHandle for more than a few cycles. This is the worst case for lock-free algorithms.
In contrast, your lock-based approach has the effect of slowing down those contentions. After the first thread grabs the lock, the other threads will fail to acquire the lock a couple of times and then yield. Yielding involves a context switch, which will take many cycles to complete. Meanwhile, the first thread is actually getting work done, with an uncontended spinlock and an uncontended m_nextFreeHandle. Eventually it'll get sniped by some other core, and then it likewise will repeatedly yield while some other core gets work done.
In short, this is not a workload where having multiple cores is going to help, because all it involves is communicating between cores. Lock-free programming just makes that worse.