I have a loop which I'm trying to efficiently parallelize with OpenMP. It accumulates the mass-weighted squared L2 norms of a stream of vectors, which requires a reduction. Here's the loop:
struct vec3
{
    float data[3] = {};
};

float accumulate_eta_sq_t_mass(const vec3* etas, const float* masses, const std::size_t& n)
{
    auto a = 0.0;
    #pragma omp parallel for simd safelen(16) reduction(+:a)
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto x = static_cast<double>(eta.data[0]);
        const auto y = static_cast<double>(eta.data[1]);
        const auto z = static_cast<double>(eta.data[2]);
        const auto m = static_cast<double>(masses[ii]);
        a += (x * x + y * y + z * z) * m;
    }
    return static_cast<float>(a);
}
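For what it's worth, my understanding is that reduction(+:a) is conceptually equivalent to the following manual pattern (a sketch of the idea, not the actual runtime implementation): each thread accumulates into a private copy, and the per-thread partials are combined once at the end, so the combine step itself should be negligible for large n.

// Sketch: what reduction(+:a) does conceptually (not the actual runtime code).
float accumulate_manual(const vec3* etas, const float* masses, std::size_t n)
{
    auto a = 0.0;
    #pragma omp parallel
    {
        auto a_local = 0.0; // thread-private partial sum
        #pragma omp for simd safelen(16) nowait
        for (auto ii = std::size_t{}; ii < n; ++ii)
        {
            const auto& eta = etas[ii];
            const auto x = static_cast<double>(eta.data[0]);
            const auto y = static_cast<double>(eta.data[1]);
            const auto z = static_cast<double>(eta.data[2]);
            const auto m = static_cast<double>(masses[ii]);
            a_local += (x * x + y * y + z * z) * m;
        }
        #pragma omp atomic
        a += a_local; // one atomic combine per thread
    }
    return static_cast<float>(a);
}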
I wrote a micro-benchmarking program (code provided in the appendix) which shows that the simple loop does not scale very well, as seen in the following diagram.
Fitting the measured speed-up to Amdahl's law suggests a serial fraction of about 15%. In your experience (or perhaps in theory), is it normal for OpenMP loops with a reduction clause to scale this unimpressively? I can provide more information if needed. Thanks in advance.
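For reference, that number comes from fitting the measured speed-up S(p) on p cores to Amdahl's law with serial fraction s:

\[
S(p) = \frac{1}{s + \frac{1 - s}{p}}
\]

With s = 0.15, six cores can give at most S(6) = 1/(0.15 + 0.85/6) ≈ 3.4.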
Note: I'm using double precision in the loop to reduce large rounding errors in the accumulation. Using single precision did not lead to any improvement in scaling performance.
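As an aside: if keeping the accumulation in single precision ever becomes attractive, Kahan compensated summation would be an alternative to widening to double. Below is a minimal serial sketch (accumulate_kahan is my own illustrative name); note that it must not be compiled with -ffast-math, since reassociation legally cancels the compensation term, and a parallel version would need per-thread compensated partials before the final combine.

// Kahan compensated summation: single-precision accumulation with reduced
// rounding error. Do NOT compile with -ffast-math, or the compensation is
// optimized away.
float accumulate_kahan(const vec3* etas, const float* masses, std::size_t n)
{
    auto sum = 0.0f;
    auto c = 0.0f; // running compensation for lost low-order bits
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto term = (eta.data[0] * eta.data[0] + eta.data[1] * eta.data[1] +
                           eta.data[2] * eta.data[2]) * masses[ii];
        const auto y = term - c; // corrected term
        const auto t = sum + y;  // low-order bits of y are lost here
        c = (t - sum) - y;       // recover what was lost
        sum = t;
    }
    return sum;
}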
Additional information
@paleonix This was run on the Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz with 6 cores (no SMT), cache sizes of 192 KiB (total L1d), 1.5 MiB (total L2) and 9 MiB (shared L3), and AVX2 as the highest supported vector extension.
Appendix
Here's the minimal reproducer, which compiles with g++ -O3 -march=native -ffast-math -std=c++17 -fopenmp ./min_rep.cpp -o min_rep. It is to be called with a sufficiently large problem size; in my case that was 10000000.
/// \file This file contains a micro-benchmark program for the mass-weighted squared-norm reduction loop.
#include <string>    // std::stoul
#include <cstdlib>   // std::abort, std::free
#include <cstddef>   // std::size_t
#include <iostream>  // std::cerr
#include <cstdint>   // std::uint64_t, std::int64_t
#include <algorithm> // std::min, std::max
#include <limits>    // std::numeric_limits
#include <vector>    // std::vector
#include <omp.h>
#include <stdlib.h>  // posix_memalign
#include <unistd.h>  // sysconf
struct vec3
{
    float data[3] = {};
};

float accumulate_eta_sq_t_mass(const vec3* etas, const float* masses, const std::size_t& n)
{
    auto a = 0.0;
    #pragma omp parallel for simd safelen(16) reduction(+:a)
    for (auto ii = std::size_t{}; ii < n; ++ii)
    {
        const auto& eta = etas[ii];
        const auto x = static_cast<double>(eta.data[0]);
        const auto y = static_cast<double>(eta.data[1]);
        const auto z = static_cast<double>(eta.data[2]);
        const auto m = static_cast<double>(masses[ii]);
        a += (x * x + y * y + z * z) * m;
    }
    return static_cast<float>(a);
}
int main(int argc, char** argv)
{
    // extract the problem size
    if (argc < 2)
    {
        std::cerr << "Please provide the problem size as command line argument." << std::endl;
        return 1;
    }
    const auto n = static_cast<std::size_t>(std::stoul(argv[1]));
    if (n < 1)
    {
        std::cerr << "Zero valued problem size provided. The program will now be aborted." << std::endl;
        return 1;
    }
    // account for both allocations (12 B per vec3 + 4 B per float)
    if (n * (sizeof(vec3) + sizeof(float)) / (1024 * 1024 * 1024) > 40) // let's assume there's only 64 GiB of RAM
    {
        std::cerr << "Problem size is too large. The program will now be aborted." << std::endl;
        return 1;
    }
    // report
    std::cout << "Starting runs with problem size n=" << n << ".\nThread count: " << omp_get_max_threads() << "."
              << std::endl;
    // details
    const auto page_size = sysconf(_SC_PAGESIZE);
    const auto page_size_vec3 = page_size / sizeof(vec3);
    const auto page_size_float = page_size / sizeof(float);
    // experiment loop
    const auto experiment_count = 50;
    const auto warm_up_count = 10;
    const auto run_count = experiment_count + warm_up_count;
    auto durations = std::vector(experiment_count, std::numeric_limits<std::uint64_t>::min());
    vec3* etas = nullptr;
    float* masses = nullptr;
    for (auto run_index = std::size_t{}; run_index < run_count; ++run_index)
    {
        // allocate
        const auto alloc_status0 = posix_memalign(reinterpret_cast<void**>(&etas), page_size, n * sizeof(vec3));
        const auto alloc_status1 = posix_memalign(reinterpret_cast<void**>(&masses), page_size, n * sizeof(float));
        if (alloc_status0 != 0 || alloc_status1 != 0 || etas == nullptr || masses == nullptr)
        {
            std::cerr << "Fatal error, failed to allocate memory." << std::endl;
            std::abort();
        }
        // "first touch": fault in one element per page so the pages are placed by the threads that will use them
        #pragma omp parallel for schedule(static)
        for (auto ii = std::size_t{}; ii < n; ii += page_size_vec3)
        {
            etas[ii].data[0] = 0.f;
        }
        #pragma omp parallel for schedule(static)
        for (auto ii = std::size_t{}; ii < n; ii += page_size_float)
        {
            masses[ii] = 0.f;
        }
        // run experiment
        const auto t1 = omp_get_wtime();
        accumulate_eta_sq_t_mass(etas, masses, n);
        const auto t2 = omp_get_wtime();
        const auto duration_in_us = static_cast<std::int64_t>((t2 - t1) * 1E+6);
        if (duration_in_us <= 0)
        {
            std::cerr << "Fatal error, no time elapsed in the test function." << std::endl;
            std::abort();
        }
        if (run_index + 1 > warm_up_count)
        {
            durations[run_index - warm_up_count] = static_cast<std::uint64_t>(duration_in_us);
        }
        // deallocate
        std::free(etas);
        std::free(masses);
        etas = nullptr;
        masses = nullptr;
    }
    // statistics
    auto min = std::numeric_limits<std::uint64_t>::max();
    auto max = std::uint64_t{};
    auto mean = std::uint64_t{};
    for (const auto& duration : durations)
    {
        min = std::min(min, duration);
        max = std::max(max, duration);
        mean += duration;
    }
    mean /= experiment_count;
    // report duration
    std::cout << "Mean duration: " << mean << " us\n"
              << "Min. duration: " << min << " us\n"
              << "Max. duration: " << max << " us.\n";
    // compute effective B/W (4 floats' worth of traffic per element: 3 from the vec3 + 1 mass)
    const auto traffic = 4 * n * sizeof(float);
    constexpr auto inv_gibi = 1.0 / static_cast<double>(1024 * 1024 * 1024);
    const auto traffic_in_gib = static_cast<double>(traffic) * inv_gibi;
    std::cout << "Traffic per run: " << traffic << " B (" << traffic_in_gib << " GiB)\n"
              << "Mean effective B/W: " << traffic_in_gib / (static_cast<double>(mean) * 1E-6) << " GiB/s\n"
              << "Min. effective B/W: " << traffic_in_gib / (static_cast<double>(max) * 1E-6) << " GiB/s\n"
              << "Max. effective B/W: " << traffic_in_gib / (static_cast<double>(min) * 1E-6) << " GiB/s\n"
              << std::endl;
    return 0;
}
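To produce the scaling data I simply sweep the thread count via the standard OMP_NUM_THREADS environment variable, e.g.:

for t in 1 2 3 4 5 6; do OMP_NUM_THREADS=$t ./min_rep 10000000; done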
Upvotes: 3
Views: 138
I would like to offer my own answer. Firstly, I noticed that the speed-up hits a roof. The function of interest is a streaming function, i.e. there is no data reuse, so the caches cannot help. This means the DRAM bandwidth ends up being the bottleneck. Using this as the roof, we have the following image.
Another effect we see is the non-linear strong scaling. I think (but am not sure) that what we see here is caused by the powerful vector units and pre-fetchers: even a low core count quickly saturates the bandwidth, so any further increase in core count only adds contention for the same resource.
What causes me to doubt this theory is the fact that the peak performance is not quite as close to the DRAM bandwidth as I would have liked to see. But this could be because of the OpenMP reduction...
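To put a number on that roof (assuming the stock dual-channel DDR4-2666 configuration this CPU officially supports; the actually installed DIMMs may differ), the theoretical peak is

\[
B_{\text{peak}} = 2\,\text{channels} \times 8\,\text{B} \times 2666 \times 10^{6}\,\text{transfers/s} \approx 42.7\,\text{GB/s} \approx 39.7\,\text{GiB/s}.
\]

At 16 B of traffic per element, n = 10^7 elements move 160 MB per run, so a purely bandwidth-bound run could not finish faster than roughly 160 MB / 42.7 GB/s ≈ 3.7 ms.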
Upvotes: 5