Performance with matrix class in C++

Question

I was performance profiling our library and noticed that most time is spent in matrix manipulations. I wanted to see whether I could improve performance by changing the order of the matrix loops or by changing the matrix class definition from row major to column major. Questions:

Below I test 2 cases. Test case 1 is always the fastest, no matter whether my matrix is row or columns major. Why is that?
Turning on vectorization improves Test case 1 with a factor 2, why is that?

Performance profiling is done with Very Sleepy. I used Visual Studio 2019 – platformtoolset v142, and compiled in 32-bit.

Our library defines a matrix template where the underlying is a dynamic array where the ordering is column major (full code follows below):

Type& operator()(int row, int col)
{
   return pArr[row + col * m_rows];
}

Type operator()(int row, int col) const
{
   return pArr[row + col * m_rows];
}

We also have a matrix class specific for doubles:

class DMatrix : public TMatrix
{
public:
    // Constructors:
    DMatrix() : TMatrix() { }
    DMatrix(int rows, int cols) : TMatrix(rows, cols, true) {}

};

I ran 2 test cases that perform nested loop operations on randomly filled matrices. The difference between Test case 1 and 2 is the order of the inner loops.

       int nrep = 10000; // Large number of calculations
       int nstate = 400;
       int nstep = 400;
       int nsec = 3; // 100 times smaller than nstate and nstep
       DMatrix value(nstate, nsec);
       DMatrix Rc(nstate, 3 * nstep);
       DMatrix rhs(nstate, nsec);

    // Test case 1
    for (int k = 0; k < nrep; k++) {
        for (int n = 0; n < nstep; n++) {
            int diag = 3 * n + 1;
            for (int i = 1; i < nstate; i++) {
                for (int j = 0; j < nsec; j++) { 
                    value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
                }
            }
        }
    }

    // Test case 2
    for (int k = 0; k < nrep; k++) {
        for (int n = 0; n < nstep; n++) {
            int diag = 3 * n + 1;
            for (int j = 0; j < nsec; j++) {
                for (int i = 1; i < nstate; i++) {
                    value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
                }
            }
        }
    }

Since the matrix is column major, I expected that I would get the best performance when the inner loop follows a column, due to nearby elements being CPU cached, but instead it is doing the opposite. Note that nstep and nstate are typically 100 times larger than nsec.

When I turn on vectorization: “Advanced Vector Extensions 2” in Code Generation/Enable Enhanced Instruction Set, the performance difference gets even larger:

When I turn off the vectorization and make the matrix row major:

    Type& operator()(int row, int col)
    {
        return pArr[col + row*m_cols];
    }

    Type operator()(int row, int col) const
    {
        return pArr[col + row*m_cols];
    }

I don’t get any difference in performance compared to when the matrix was column major:

With vector optimizations:

The full code. matrix.h:

#ifndef __MATRIX_H
#define __MATRIX_H

#include 
#include 

template
class TMatrix
{
public:
    TMatrix();                                                                                         // Default constructor
    TMatrix(int rows, int cols, bool init = false);                            // Constructor with dimensions  + flag to default initialize or not
    TMatrix(const TMatrix& mat);                                  // Copy constructor
    TMatrix& operator=(const TMatrix& mat); // Assignment operator
    ~TMatrix();                             // Destructor   

    // Move constructor/assignment
    TMatrix(TMatrix&& mat) noexcept;
    TMatrix& operator=(TMatrix&& mat) noexcept;

    // Get matrix dimensions
    int no_rows() const { return m_rows; }
    int no_columns() const { return m_cols; }


    Type& operator()(int row, int col)
    {
        assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
        return pArr[row + col * m_rows];   // elements in a column lay next to each other
        //return pArr[col + row*m_cols];  // elements in a row lay next to each other
    }

    Type operator()(int row, int col) const
    {
        assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
        return pArr[row + col * m_rows];
        // return pArr[col + row*m_cols];
    }

protected:
    void clear();

    Type* pArr;
    int m_rows, m_cols;
};

//**************************************************************
// Implementation of TMatrix
//**************************************************************

// Default constructor
template
TMatrix::TMatrix()
{
    m_rows = 0;
    m_cols = 0;
    pArr = 0;
}

// Constructor with matrix dimensions (rows, cols)
template
TMatrix::TMatrix(int rows, int cols, bool init)
{
    pArr = 0;
    m_rows = rows;
    m_cols = cols;

    if (m_rows > 0 && m_cols > 0)
        if (init)
            pArr = new Type[m_rows * m_cols]();
        else
            pArr = new Type[m_rows * m_cols];                   // TODO: check for p = NULL (memory allocation error, which will triger a GPF)
    else
    {
        m_rows = 0;
        m_cols = 0;
    }
}

// Copy constructor
template
TMatrix::TMatrix(const TMatrix& mat)
{
    pArr = 0;
    m_rows = mat.m_rows;
    m_cols = mat.m_cols;

    if (m_rows > 0 && m_cols > 0)
    {
        int dim = m_rows * m_cols;
        pArr = new Type[dim];

        for (int i = 0; i < dim; i++)
            pArr[i] = mat.pArr[i];
    }
    else
    {
        m_rows = m_cols = 0;
    }
}

// Move constructors
template
TMatrix::TMatrix(TMatrix&& mat) noexcept
{
    m_rows = mat.m_rows;
    m_cols = mat.m_cols;

    if (m_rows > 0 && m_cols > 0)
    {
        pArr = mat.pArr;
    }
    else
    {
        m_rows = m_cols = 0;
        pArr = 0;
    }

    mat.pArr = 0;
}

// Clear the matrix
template
void TMatrix::clear()
{

    delete[] pArr;
    pArr = 0;
    m_rows = m_cols = 0;
}

// Destructor
template
TMatrix::~TMatrix()
{
    clear();
}

// Move assignment
template
TMatrix& TMatrix::operator=(TMatrix&& mat) noexcept
{

    if (this != &mat) // Check for self assignment
    {
        clear();
        m_rows = mat.m_rows;
        m_cols = mat.m_cols;

        if (m_rows > 0 && m_cols > 0)
        {
            pArr = mat.pArr;
        }
        else
        {
            m_rows = m_cols = 0;
        }
        mat.pArr = nullptr;
    }
    return *this;
}

// Assignment operator with check for self-assignment
template
TMatrix& TMatrix::operator=(const TMatrix& mat)
{
    if (this != &mat)        // Guard against self assignment
    {
        clear();
        m_rows = mat.m_rows;
        m_cols = mat.m_cols;

        if (m_rows > 0 && m_cols > 0)
        {
            int dim = m_rows * m_cols;
            pArr = new Type[dim];

            for (int i = 0; i < dim; i++)
                pArr[i] = mat.pArr[i];
        }
        else
        {
            m_rows = m_cols = 0;
        }
    }
    return *this;
}

#endif

dmatrix.h:

#ifndef __DMATRIX_H
#define __DMATRIX_H

#include "matrix.h"
class DMatrix : public TMatrix
{
public:
    // Constructors:
    DMatrix() : TMatrix() { }
    DMatrix(int rows, int cols) : TMatrix(rows, cols, true) {}
};
#endif

Main:

#include 
#include "dmatrix.h"

int main()
{
    int nrep = 10000; // Large number of calculations

    int nstate = 400;
    int nstep = 400;
    int nsec = 3; // 100 times smaller than nstate and nstep
    DMatrix value(nstate, nsec);
    DMatrix Rc(nstate, 3 * nstep);
    DMatrix rhs(nstate, nsec);

    // Give some random input
    for (int i = 0; i < Rc.no_rows(); i++) {
        for (int j = 0; j < Rc.no_columns(); j++) {
            Rc(i, j) = double(std::rand()) / RAND_MAX;
        }
    }

    for (int i = 0; i < value.no_rows(); i++) {
        for (int j = 0; j < value.no_columns(); j++) {
            value(i, j) = 1 + double(std::rand()) / RAND_MAX;
        }
    }

    for (int i = 0; i < rhs.no_rows(); i++) {
        for (int j = 0; j < rhs.no_columns(); j++) {
            rhs(i, j) = 1 + double(std::rand()) / RAND_MAX;
        }
    }

    // Test case 1
    for (int k = 0; k < nrep; k++) {
        for (int n = 0; n < nstep; n++) {
            int diag = 3 * n + 1;
            for (int i = 1; i < nstate; i++) {
                for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
                    value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
                }
            }
        }
    }

    // Test case 2
    for (int k = 0; k < nrep; k++) {
        for (int n = 0; n < nstep; n++) {
            int diag = 3 * n + 1;
            for (int j = 0; j < nsec; j++) {
                for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
                    value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
                }
            }
        }
    }
    
    return 0;
}

Thanks in advance for your help. Best regards, Nele

Performance with matrix class in C++

Answers (1)

Related Questions