Naive matrix addition vs expression template matrix addition performs better, allocation taken into account or not

Question

This question is a follow up on questions Expression templates cppcon 2019 and matrix class with expression templates à la cppcon 2019

I finally coded a toy matrix class dense_matrix with expression templates as I initially wanted :

// I forward declare the class which all matrix classes will derive from
// as I need such a class matrix-related traits
class base_matrix;

//is_matrix_v
template 
struct is_matrix
{
    static constexpr bool value = std::derived_from;
};
template 
struct is_matrix>
{
    // std::vector will be considered as a column or row matrix according to its position in a multiplication
    static constexpr bool value = true;
};
template 
constexpr bool is_matrix_v = is_matrix>::value;

struct expression {}; // you always need a base for traits right ?

// finally defining what is being a matrix or an expression
template 
concept is_matrix_or_expression =
is_matrix_v>
||
std::is_base_of_v>;

// defining what a legal binary matrix operation is
template 
constexpr bool is_matrix_binary_op_ok =
is_matrix_or_expression
||
is_matrix_or_expression;


//subscripts() for matrix(i,j)
template 
auto subscripts(operand const& v, size_t const i, size_t const j) {
    if constexpr (is_matrix_or_expression) {
        return v(i, j);
    }
    else {
        return v;
    }
}

template 
auto rows(operand const& m) {
    if constexpr (is_matrix_or_expression) {
        return m.rows();
    }
    else {
        return 1;
    }
};

template 
auto cols(operand const& m) {
    if constexpr (is_matrix_or_expression) {
        return m.cols();
    }
    else {
        return 1;
    }
};

The matrix template expression class, in the fashion of Bowie Owens' cppcon 2019 talk is using variadic operands to handle any-arity operator all at once :

template class matrix_expression : public expression { std::tuple args_; callable f_; public: matrix_expression(callable f, operands const&... args) : args_(args...), f_(f) {} auto& operator()(const size_t i, const size_t j) { auto const call_at_index_couple = [this, i, j](operands const&... a) { return f_(subscripts(a, i, j)...); }; return std::apply(call_at_index_couple, args_); } auto operator()(const size_t i, const size_t j) const { auto const call_at_index_couple = [this, i, j](operands const&... a) { return f_(subscripts(a, i, j)...); }; return std::apply(call_at_index_couple, args_); } size_t rows() const { auto const call_rows = [](operands const&... a) { return std::max(::rows(a)...); }; return std::apply(call_rows, args_); } size_t cols() const { auto const call_cols = [](operands const&... a) { return std::max(::cols(a)...); }; return std::apply(call_cols, args_); } }; //defining addition template >> auto operator+(LHS const& lhs, RHS const& rhs) { return matrix_expression { //[](auto a, auto b) //{ // return a + b; //} std::plus<>{}, lhs, rhs }; } // and substraction template >> auto operator-(LHS const& lhs, RHS const& rhs) { return matrix_expression { //[](auto a, auto b) //{ // return a - b; //} std::minus<>{}, lhs, rhs }; } //finally defining the forward declared base_matrix class base_matrix : public expression { public: virtual ~base_matrix() = default; // So we can call matrix(i,j) ... virtual double& operator()(const size_t i, const size_t j) = 0; // ... and set matrix(i,j) virtual double operator()(const size_t i, const size_t j) const = 0; virtual size_t rows() const = 0; virtual size_t cols() const = 0; }; //defining a simple and concrecte dense matrix class class dense_matrix : public base_matrix { size_t rows_; size_t columns_; std::vector underlying_vector_; public: dense_matrix() : rows_(0), columns_(0) {} dense_matrix(const size_t rows, const size_t cols) : rows_(rows), columns_(cols), underlying_vector_(rows* cols) {} dense_matrix(const size_t rows, const size_t cols, double* ptr) : rows_(rows), columns_(cols), underlying_vector_(ptr, ptr + (rows * cols)) {} dense_matrix(std::vector& vector, bool isColumn = true) { if (isColumn) { rows_ = vector.size(); columns_ = 1; } else { rows_ = 1; columns_ = vector.size(); } underlying_vector_ = std::move(vector); } template dense_matrix(const src_type& src) : rows_(src.rows()), columns_(src.cols()), underlying_vector_(src.rows()* src.cols()) { for (size_t i = 0; i < src.rows(); ++i) { for (size_t j = 0; j < src.cols(); ++j) { underlying_vector_[i * columns_ + j] = src(i, j); } } } template dense_matrix& operator=(src_type const& src) { for (size_t i = 0; i < rows(); ++i) { for (size_t j = 0; j < cols(); ++j) { underlying_vector_[i * columns_ + j] = src(i, j); } } return *this; // this line was missing in the slides and in the talk } // Access size_t rows() const override { return rows_; } size_t cols() const override { return columns_; } double& operator()(const size_t i, const size_t j) override { return underlying_vector_[i * columns_ + j]; } double operator()(const size_t i, const size_t j) const override { return underlying_vector_[i * columns_ + j]; } };

I am ok for the short time being with this class, so that I wanted to test its performance for algebraic operations. Hence I coded a naive matrix class as follows :

class dense_matrix_naive { size_t rows_; size_t columns_; std::vector underlying_vector_; public: const std::vector& underlying_vector() const { return underlying_vector_; } dense_matrix_naive() : rows_(0), columns_(0) {} dense_matrix_naive(const size_t rows, const size_t cols) : rows_(rows), columns_(cols), underlying_vector_(rows* cols) {} dense_matrix_naive(const size_t rows, const size_t cols, double* ptr) : rows_(rows), columns_(cols), underlying_vector_(ptr, ptr + (rows * cols)) {} dense_matrix_naive(std::vector& vector, bool isColumn = true) { if (isColumn) { rows_ = vector.size(); columns_ = 1; } else { rows_ = 1; columns_ = vector.size(); } underlying_vector_ = std::move(vector); } template dense_matrix_naive(const src_type& src) : rows_(src.rows()), columns_(src.cols()), underlying_vector_(src.rows()* src.cols()) { for (size_t i = 0; i < src.rows(); ++i) { for (size_t j = 0; j < src.cols(); ++j) { underlying_vector_[i * columns_ + j] = src(i, j); } } } template dense_matrix_naive& operator=(src_type const& src) { for (size_t i = 0; i < rows(); ++i) { for (size_t j = 0; j < cols(); ++j) { underlying_vector_[i * columns_ + j] = src(i, j); } } return *this; // this line was missing in the slides and in the talk } // Access size_t rows() const { return rows_; } size_t cols() const { return columns_; } double& operator()(const size_t i, const size_t j) { return underlying_vector_[i * columns_ + j]; } double operator()(const size_t i, const size_t j) const { return underlying_vector_[i * columns_ + j]; } dense_matrix_naive& operator+=(const dense_matrix_naive& rhs) { for (size_t i = 0; i < underlying_vector_.size(); ++i) { underlying_vector_[i] += rhs.underlying_vector()[i]; } return *this; } dense_matrix_naive& operator-=(const dense_matrix_naive& rhs) { for (size_t i = 0; i < underlying_vector_.size(); ++i) { underlying_vector_[i] -= rhs.underlying_vector()[i]; } return *this; } }; inline dense_matrix_naive operator+(const dense_matrix_naive& m1, const dense_matrix_naive& m2) { auto res(m1); res += m2; return res; } inline dense_matrix_naive operator-(const dense_matrix_naive& m1, const dense_matrix_naive& m2) { auto res(m1); res -= m2; return res; }

Finally, the comparison test :

I perform a sum of 3 dense matrices, with and without expression templates

matrix coefficients are random doubles

I perform the sum a thousand times each and for various sizes of square matrices.

I do that in themain() function beneath

The function's source code is compiled using current last version 17.8.5 of microsoft visual studio professional 2022, with the C++ compiler "Visual C++ 2022 00476-80000-00000-AA747" and with the following command :

/permissive- /ifcOutput "x64\Release\" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /sdl /Fd"x64\Release\vc143.pdb" /Zc:inline /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /Gd /Oi /MD /std:c++latest /FC /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\ConsoleApplication1.pch" /diagnostics:column

The main() source code is :

int main() { constexpr size_t nb_ops = 1000; constexpr size_t square_matrix_sizes[] = { 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 5000}; std::cout << "SIZE" << ' ' << ' ' << "NAIVE" << ' ' << ' ' << "EXPR" << " "; for (const auto square_matrix_size : square_matrix_sizes) { // Setting data //constexpr size_t square_matrix_size = 800; typedef std::mt19937 MyRNG; uint32_t seed_val = 1729; MyRNG rng; rng.seed(seed_val); std::normal_distribution normal_dist(0.0, 1.0); dense_matrix_naive m1_naive(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m1_naive(i, j) = normal_dist(rng); } } dense_matrix_naive m2_naive(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m2_naive(i, j) = normal_dist(rng); } } dense_matrix_naive m3_naive(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m3_naive(i, j) = normal_dist(rng); } } dense_matrix m1(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m1(i, j) = normal_dist(rng); } } dense_matrix m2(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m2(i, j) = normal_dist(rng); } } dense_matrix m3(square_matrix_size, square_matrix_size); for (size_t i = 0; i < square_matrix_size; ++i) { for (size_t j = 0; j < square_matrix_size; ++j) { m3(i, j) = normal_dist(rng); } } auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < nb_ops; ++i) { dense_matrix_naive m4_naive = m1_naive + m2_naive + m3_naive; } auto end = std::chrono::steady_clock::now(); std::chrono::duration elapsed_seconds = end - start; std::cout << square_matrix_size << ' ' << ' ' << std::fixed << elapsed_seconds << ' '; start = std::chrono::steady_clock::now(); for (size_t i = 0; i < nb_ops; ++i) { dense_matrix m4 = m1 + m2 + m3; } end = std::chrono::steady_clock::now(); elapsed_seconds = end - start; std::cout << std::fixed << elapsed_seconds << " "; }

This main() prints something in this fashion :

SIZE NAIVE EXPR 10 0.001863s 0.000990s 20 0.001125s 0.003765s 30 0.002650s 0.010777s 40 0.002999s 0.022661s 50 0.005942s 0.029157s 60 0.008142s 0.030399s 70 0.008975s 0.051088s 80 0.085463s 0.065621s 90 0.071054s 0.085647s 100 0.042847s 0.109610s 200 0.116581s 0.385381s 300 0.927886s 0.719902s 400 1.030033s 1.617557s 500 1.595740s 2.577250s 600 2.553515s 3.609214s 700 3.650004s 4.948367s 800 4.860585s 6.840540s 900 6.040074s 8.451447s 1000 7.586206s 10.528636s 5000 196.896556s 259.612446s

As soon as the size of matrices is sufficiently significant so that there are enough operations to be performed, the naïve implementation is quasi-systematically executed faster that the one using expression templates, which is not what I expected.

Naive matrix addition vs expression template matrix addition performs better, allocation taken into account or not

Answers (1)

Related Questions