Reputation: 2594
I have trouble understanding the memory model. Here is the example (I tried to simplify it, but it's still a bit long) that I will base my question on:
#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <optional>
#include <random>
#include <thread>

constexpr size_t hardware_destructive_interference_size = 128;

constexpr int maxWait = 1000;
constexpr int experimentRuns = 100;
constexpr int experimentLength = 1'000;

static void waitRandom() {
    if (maxWait == 0) return;
    thread_local std::mt19937 gen{std::random_device{}()};
    std::uniform_int_distribution dist(0, maxWait);
    auto waitIterations = dist(gen);
    for (int i{0}; i < waitIterations; ++i) {
        [[maybe_unused]] volatile int doNotOptimize = 0;
    }
}

template <typename INPUT>
class Worker {
    using input_t = INPUT;
    using func_t = void (*)(input_t);

public:
    Worker(func_t fun) {
        mThread = std::thread([this, fun]() mutable {
            while (!stop()) {
                mainLoop(fun);
            }
        });
    }

    ~Worker() {
        {
            std::lock_guard lk{mMutex};
            mStop.store(true, std::memory_order_relaxed);
        }
        mCV.notify_one();
        mThread.join();
    }

    bool ready() { return !mWorkerBusy.load(std::memory_order_acquire); }

    template <typename T>
    void submitJob(T&& input) {
        assert(ready());
        {
            std::lock_guard lk{mMutex};
            mInput.slot = std::forward<T>(input);
        }
        mWorkerBusy.store(true, std::memory_order_relaxed);
        mCV.notify_one();
    }

private:
    void mainLoop(func_t& fun) {
        waitForJob();
        if (stop()) {
            return;
        }
        fun(input());
        completeJob();
    }

    void waitForJob() {
        assert(!inputValid());
        mWorkerBusy.store(false, std::memory_order_release);
        {
            std::unique_lock lk{mMutex};
            mCV.wait(lk, [&] { return inputValid() || stop(); });
        }
    }

    void completeJob() { mInput.slot.reset(); }

    bool stop() const { return mStop.load(std::memory_order_relaxed); }

    bool inputValid() const { return mInput.slot.has_value(); }

    input_t& input() {
        assert(inputValid());
        return *mInput.slot;
    }

private:
    std::thread mThread;
    std::mutex mMutex;
    std::condition_variable mCV;
    struct {
        alignas(
            hardware_destructive_interference_size) std::optional<input_t> slot;
    } mInput;
    std::atomic_bool mWorkerBusy = true;
    std::atomic_bool mStop = false;
};

static void asyncProcess(int) { waitRandom(); }

static auto now() { return std::chrono::high_resolution_clock::now(); }

static void runExperiment() {
    Worker w{asyncProcess};
    const auto start = now();
    int input{0};
    for (int j{0}; j < experimentLength; ++j) {
        do {
            ++input;
        } while (!w.ready());
        w.submitJob(input);
    }
    const auto duration = now() - start;
    const auto durationUs =
        std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
    const auto usPerSample = static_cast<double>(durationUs) / input;
    std::cout << usPerSample << "us/Sample" << std::endl;
}

int main() {
    using namespace std::literals::chrono_literals;
    for (int i{0}; i < experimentRuns; ++i) {
        runExperiment();
        std::cerr << i + 1 << " experiment(s) done\n\n";
    }
}
Basically, the main thread produces data and a worker thread consumes it, but they may not run at the same pace (producing is faster than consuming).
One of my misunderstandings lies here:
void waitForJob() {
    assert(!inputValid());
    mWorkerBusy.store(false, std::memory_order_release);
    {
        std::unique_lock lk{mMutex};
        mCV.wait(lk, [&] { return inputValid() || stop(); });
    }
}
What would prevent the store from being executed after the wait, leaving the program in an undesired state (possibly with mWorkerBusy remaining true forever, preventing the worker from ever being woken up)?
I've got the same issue with:
template <typename T>
void submitJob(T&& input) {
    assert(ready());
    {
        std::lock_guard lk{mMutex};
        mInput.slot = std::forward<T>(input);
    }
    mWorkerBusy.store(true, std::memory_order_relaxed);
    mCV.notify_one();
}
What would prevent notify_one from being called first (for instance)?
[EDIT] follow-up posted here
Upvotes: 3
Views: 207
Reputation: 39658
mWorkerBusy.store(false, std::memory_order_release);
{
    std::unique_lock lk{mMutex};
    mCV.wait(lk, [&] { return inputValid() || stop(); });
}
What would prevent the store from being executed after the wait, leaving the program in an undesired state (possibly mWorkerBusy remaining true forever, preventing the worker from being woken up)?
The memory order doesn't guarantee you this. std::memory_order_release prevents reads and writes on the current thread from being reordered after the store, but memory operations could still be reordered before it, leading to the problem you've described.
What actually protects you here is the std::mutex. When you lock a mutex:
Synchronization: Prior unlock() operations on the same object synchronize with this operation.
- [requirements.mutex.general]/8
9 An evaluation A inter-thread happens before an evaluation B if
- A synchronizes with B, or
- [...]
10 An evaluation A happens before an evaluation B (or, equivalently, B happens after A) if:
- A is sequenced before B, or
- A inter-thread happens before B.
Leading back to your example:
- the store is sequenced before unlock() on the producer thread
- unlock() inter-thread happens before lock() on a consumer thread
- therefore, the store happens before lock() on a consumer thread
In simpler terms, when you lock a mutex, you see what other threads have done before unlocking it.
HOWEVER, there are some cases in your program with NO synchronization, and that can have unintended effects:
bool ready() { return !mWorkerBusy.load(std::memory_order_acquire); }
// ...
do {
    ++input;
} while (!w.ready());
w.submitJob(input);
This code is a mess. There is no form of synchronization, so !w.ready() might remain true in this loop indefinitely, with the producer never seeing any change.
What might save you is:
Implementations should make atomic stores visible to atomic loads within a reasonable amount of time.
In case you're wondering why there is no synchronization: you're only loading a value. std::memory_order_acquire means that if we see one change to mWorkerBusy, we must also see the preceding changes. However, it doesn't guarantee that we read the most recent value. In general, no synchronization between atomics takes place as long as you're only loading.
{
    std::lock_guard lk{mMutex};
    mInput.slot = std::forward<T>(input);
}
mWorkerBusy.store(true, std::memory_order_relaxed);
mCV.notify_one();
What would prevent notify_one from being called first (for instance)?
Absolutely nothing. You're using std::memory_order_relaxed, so the operation can be reordered on the current thread past the .notify_one(). This is a mistake! Never use std::memory_order_relaxed if the order of loads/stores relative to other operations matters!
However, the condition variable performs some form of synchronization through the mutex. Whichever thread gets notified will acquire mMutex, and will see mInput.slot = std::forward<T>(input);, because it happened before mMutex.unlock() (implicitly called through the guard).
Even with the "wrong" memory order, we can hijack the mutex to synchronize anyway:
{
    std::lock_guard lk{mMutex};
    mInput.slot = std::forward<T>(input);
    // the atomic store is now sequenced before mMutex.unlock()
    mWorkerBusy.store(true, std::memory_order_relaxed);
}
// mMutex.unlock() inter-thread happens before mMutex.lock() elsewhere, which
// makes the change to mWorkerBusy visible
As you can see, your use of atomics makes very little sense in both of these cases. You expect atomics to provide synchronization, but it's actually the condition variables and mutexes that do this.
A better way to synchronize producers and consumers is to use multiple condition variables. The consumers and producers can then notify one another, and there are no memory issues if you use std::mutex properly.
Another threading primitive that is specifically designed for synchronizing producers and consumers is std::binary_semaphore/std::counting_semaphore, which would be a much better tool in this case.
Upvotes: 1