C++ Divide and conquer matrix multiplication algorithm too slow

Question

I would like to ask you for help. I've implemented divide and conquer algorithm in C++. It works fine, but it is really slow. 512x512 matrix is calculated like 90 seconds which is unacceptable compared with 0.4s with naive algorithm. Could you tell me, if, and what I'm doing wrong?

The main divide and conquer function is:

std::vector> Matrix::divideAndConquer(std::vector>& matrix1, 
std::vector>& matrix2)
{
if (matrix1.size() == 1) {
    return naive(matrix1, matrix2);
}

int newSize = matrix1.size() / 2;
std::vector inner(newSize);
std::vector> a11(newSize, inner);
std::vector> a12(newSize, inner);
std::vector> a21(newSize, inner);
std::vector> a22(newSize, inner);

std::vector> b11(newSize, inner);
std::vector> b12(newSize, inner);
std::vector> b21(newSize, inner);
std::vector> b22(newSize, inner);

Matrix::divideMatrix(matrix1, a11, a12, a21, a22);
Matrix::divideMatrix(matrix2, b11, b12, b21, b22);

std::vector> m1 = divideAndConquer(a11, b11);
std::vector> m2 = divideAndConquer(a12, b21);
std::vector> m3 = divideAndConquer(a11, b12);
std::vector> m4 = divideAndConquer(a12, b22);

std::vector> m5 = divideAndConquer(a21, b11);
std::vector> m6 = divideAndConquer(a22, b21);
std::vector> m7 = divideAndConquer(a21, b12);
std::vector> m8 = divideAndConquer(a22, b22);

std::vector> c1 = add(m1, m2);
std::vector> c2 = add(m3, m4);
std::vector> c3 = add(m5, m6);
std::vector> c4 = add(m7, m8);

return combine(c1, c2, c3, c4);
}

And then divide matrix function is:

void Matrix::divideMatrix(std::vector>& matrix, std::vector>& m1, 
std::vector>& m2, std::vector>& m3, std::vector>& 
m4)
{
    for (int i = 0; i < matrix.size() / 2; i++) {
        for (int j = 0; j < matrix.size() / 2; j++) {
            m1[i][j] = matrix[i][j];
            m2[i][j] = matrix[i][j + matrix.size() / 2];
            m3[i][j] = matrix[i + (matrix.size() / 2)][j];
            m4[i][j] = matrix[i + (matrix.size() / 2)][j + (matrix.size() / 2)];
        }
    }
}

Combine method is:

std::vector> Matrix::combine(std::vector>& m1, std::vector>& m2, std::vector>& m3, std::vector>& m4)
{

    std::vector inner(m1[0].size() + m2[0].size());
    std::vector> result(m1.size() + m2.size(), inner);

    for (int i = 0; i < m1.size(); i++) {
        for (int j = 0; j < m2.size(); j++) {
            result[i][j] = m1[i][j];
            result[i][j + m1[0].size()] = m2[i][j];
            result[i + m1.size()][j] = m3[i][j];
            result[i + m1.size()][j + m3[0].size()] = m4[i][j];
        }
    }

    return result;
}

And last add method is:

std::vector> Matrix::add(std::vector>& matrix1, std::vector>& matrix2)
{
    std::vector> result(matrix1);

    for (int i = 0; i < matrix1.size(); i++) {
        for (int j = 0; j < matrix1.size(); j++) {
            result[i][j] = matrix1[i][j] + matrix2[i][j];
        }
    }

    return result;
}

The naive function just returns multiplication of matrix1[0][0] * matrix2[0][0] in a new vector

Yakk - Adam Nevraumont · Accepted Answer

struct matrix_view {
  int* m_data = 0;
  std::size_t m_stride = 1; // distance between lines
  std::size_t m_width = 0;
  std::size_t m_height = 0;

  int& operator()(std::size_t i, std::size_t j) const {
    return *get(i,j);
  }
  int* get(std::size_t i, std::size_t j) const {
    return m_data+(i*m_stride + j);
  }

  matrix_view slice(std::size_t i, std::size_t j, std::size_t width, std::size_t height ) {
    return {get(i,j), stride, width, height};
  }
};

struct matrix:matrix_view {
  std::vector m_storage;
  matrix( std::size_t width, std::size_t height ):
    matrix_view{ nullptr, height, width, height },
    m_storage( width*height )
  {
    m_data = m_storage.data();
  }
  // these are unsafe unless we write them manually, because
  // matrix_view gets out of sync.
  matrix( matrix const& ) = delete;
  matrix& operator=( matrix const& ) = delete;
};

With these you can avoid a bunch of memory copying.

It still might not beat naive, because fancy multiplication often requires big sizes to beat out naive.

Here is an untested version:

namespace Matrix {
    struct matrix_view {
      int* m_data = 0;
      std::size_t m_stride = 1; // distance between lines
      std::size_t m_size = 0;

      int& operator()(std::size_t i, std::size_t j) const {
        return *get(i,j);
      }
      int* get(std::size_t i, std::size_t j) const {
        return m_data+(i*m_stride + j);
      }

      matrix_view slice(std::size_t i, std::size_t j, std::size_t size ) {
        return {get(i,j), m_stride, size};
      }

      std::size_t count() const { return m_size * m_size; }
    };

    struct matrix:matrix_view {
      std::vector m_storage;

      matrix( std::vector storage, std::size_t size ):
        matrix_view{ storage.data(), size, size },
        m_storage(std::move(storage) )
      {
      }
      explicit matrix( std::size_t size ):
        matrix( std::vector(size*size), size )
      {}
      // these are unsafe unless we write them manually, because
      // matrix_view gets out of sync.
      matrix( matrix const& o ):
        matrix(o.m_storage, o.m_size )
      {}
      matrix( matrix && o ):
        matrix(std::move(o.m_storage), o.m_size )
      {
        o.m_size = 0;
      }

      matrix& operator=( matrix const& ) = delete;
    };

    struct quad_t {
        matrix_view sub_11;
        matrix_view sub_12;
        matrix_view sub_21;
        matrix_view sub_22;
    };

    quad_t quad( matrix_view matrix )
    {
        quad_t r {
            matrix.slice(0,0, matrix.m_size/2 ),  matrix.slice(0,matrix.m_size/2, matrix.m_size/2 ),
            matrix.slice(matrix.m_size/2,0, matrix.m_size/2 ), matrix.slice(matrix.m_size/2, matrix.m_size/2, matrix.m_size/2 )
        };
        return r;
    }
    matrix add(matrix_view matrix1, matrix_view matrix2);
    matrix combine(matrix_view m1, matrix_view m2, matrix_view m3, matrix_view m4);
    matrix naive(matrix_view m1, matrix_view m2);

    matrix divideAndConquer(matrix_view matrix1, matrix_view matrix2)
    {
      if (matrix1.count() == 1) {
        return naive(matrix1, matrix2);
      }


      auto a = quad(matrix1);
      auto b = quad(matrix2);

      auto m1 = divideAndConquer(a.sub_11, b.sub_11);
      auto m2 = divideAndConquer(a.sub_12, b.sub_21);
      auto m3 = divideAndConquer(a.sub_11, b.sub_12);
      auto m4 = divideAndConquer(a.sub_12, b.sub_22);

      auto m5 = divideAndConquer(a.sub_21, b.sub_11);
      auto m6 = divideAndConquer(a.sub_22, b.sub_21);
      auto m7 = divideAndConquer(a.sub_21, b.sub_12);
      auto m8 = divideAndConquer(a.sub_22, b.sub_22);

      auto c1 = add(m1, m2);
      auto c2 = add(m3, m4);
      auto c3 = add(m5, m6);
      auto c4 = add(m7, m8);

      return combine(c1, c2, c3, c4);
    }


    matrix combine(matrix_view m1, matrix_view m2, matrix_view m3, matrix_view m4)
    {
      matrix result = matrix( m1.m_size*2 );

      for (std::size_t i = 0; i < m1.m_size; i++) {
        for (std::size_t j = 0; j < m1.m_size; j++) {
          result(i,j) = m1(i,j);
          result(i, j + m1.m_size) = m2(i,j);
          result(i + m1.m_size,j) = m3(i,j);
          result(i + m1.m_size,j + m3.m_size) = m4(i,j);
        }
      }

      return result;
    }

    matrix add(matrix_view matrix1, matrix_view matrix2)
    {
        matrix result(matrix1.m_size);

        for (std::size_t i = 0; i < matrix1.m_size; i++) {
            for (std::size_t j = 0; j < matrix1.m_size; j++) {
                result(i,j) = matrix1(i,j) + matrix2(i,j);
            }
        }

        return result;
    }
    matrix naive(matrix_view m1, matrix_view m2)
    {
      matrix r = matrix(m1.m_size);
      for (std::size_t i = 0; i < m1.m_size; ++i)
        for (std::size_t j = 0; j < m1.m_size; ++j)
          for (std::size_t k = 0; k < m1.m_size; ++k)
            r(i,j) += m1(i,k) * m2(k,j);
      return r;
    }
    void print( std::ostream& os, matrix_view m1 )
    {
      for (std::size_t i = 0; i < m1.m_size; ++i)
      {
        for (std::size_t j = 0; j < m1.m_size; ++j)
          std::cout << m1(i,j) << " ";
        std::cout << "
";
      }
    }
}

int main()
{
    Matrix::matrix m1(16), m2(16);
    for (int i = 0; i < m1.m_size; ++i)
        m1(i,i) = 1;
    for (int i = 0; i < m1.m_size; ++i)
        m2(m2.m_size-i-1,i) = 1;

    auto m3 = Matrix::divideAndConquer(m1, m2);

    print(std::cout, m3);
}

reusing the "result" matrices would give this another huge boost in speed. Also, switch to naive at a size bigger than 1.

Live example.

C++ Divide and conquer matrix multiplication algorithm too slow

Answers (1)

Related Questions