mneuner

Reputation: 447

C++ - Fastor vs. Eigen vs. C-tran: Speed differences in tensor contraction

I am currently looking for a C++ library (preferably header-only) for higher-order tensor contractions with Einstein-summation-like notation.

Fastor (https://github.com/romeric/Fastor) seems perfectly suited, and since Eigen (which I use a lot) has a tensor module, I made a small comparison that also benchmarks a simple loop implementation:

#include <Fastor.h>
#include "unsupported/Eigen/CXX11/Tensor"
#include <ctime>
#include <iostream>

int main() {
clock_t tstart, tend; 
 {
    using namespace Fastor;
    Tensor<double,20,50,10> A; 
    Tensor<double,50,10,20,4> B;
    Tensor<double, 4> C;
    Tensor<double, 10, 10, 4> D;

    A.ones();
    B.ones();
    C.zeros();
    D.zeros();
    enum {I,J,K,L,M,N};
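    // The enum constants act as compile-time index labels for einsum's Index<...> lists.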

    tstart = clock();

    C += einsum<Index<I,J,K>,Index<J,K,I,M>>(A,B);

    tend = clock(); 
    std::cout << "Fastor, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();

    D += einsum<Index<I,J,K>,Index<J,M,I,N>>(A,B);

    tend = clock(); 
    std::cout << "Fastor, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;
 }

 {
     using namespace Eigen;

    TensorFixedSize<double, Sizes<20, 50, 10>> A;
    TensorFixedSize<double, Sizes<50, 10, 20, 4>> B;
    TensorFixedSize<double, Sizes<4>> C;
    TensorFixedSize<double, Sizes<10, 10, 4>> D;

    A.setConstant(1);
    B.setConstant(1);
    C.setConstant(0);
    D.setConstant(0);

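     // IndexPair<int>(a, b) contracts dimension a of the left tensor
     // with dimension b of the right tensor.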
     array<IndexPair<int>,3> IJK_JKIM = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
         IndexPair<int>(2, 1),
     };

     array<IndexPair<int>,2> IJK_JMIN = {
         IndexPair<int>(0, 2),
         IndexPair<int>(1, 0),
     };

    tstart = clock();
    C += A.contract(B, IJK_JKIM);
    tend = clock(); 

    std::cout << "Eigen, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
    D += A.contract(B, IJK_JMIN);
    tend = clock(); 

    std::cout << "Eigen, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;


 }

 {

     double A[20][50][10] = {};
     double B[50][10][20][4] = {};
     double C[4] = {};
     double D[10][10][4] = {};

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;


    tstart = clock();

    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int l = 0; l < 4; l++)
                    C[l] += A[i][j][k] * B[j][k][i][l];

    tend = clock(); 
    std::cout << "CTran, C_M = A_IJK * B_JKIM:\t"<< tend - tstart << std::endl;

    tstart = clock();
    for ( int i = 0; i < 20; i++)
        for ( int j = 0; j < 50; j++)
            for ( int k = 0; k < 10; k++)
                for ( int m = 0; m < 10; m++)
                    for ( int n = 0; n < 4; n++)
                        D[k][m][n] += A[i][j][k] * B[j][m][i][n];

    tend = clock(); 

    std::cout << "CTran, D_KMN = A_IJ * B_JMIN:\t"<< tend - tstart << std::endl;

 }

return 0;
}

Output for me is:

Fastor, C_M = A_IJK * B_JKIM:   206
Fastor, D_KMN = A_IJK * B_JMIN: 2230
Eigen, C_M = A_IJK * B_JKIM:    1286
Eigen, D_KMN = A_IJK * B_JMIN:  898
CTran, C_M = A_IJK * B_JKIM:    2
CTran, D_KMN = A_IJK * B_JMIN:  2

Compiled with g++ 9.1.0 (Arch Linux) using

g++ test.cpp -O3 -std=c++17  -I Fastor -isystem eigen-eigen-323c052e1731 -o test

So it seems that Fastor is considerably faster than Eigen for example 1, but slower for example 2. However, both libraries are incredibly slow compared to the naive loop implementation! Is there an explanation for these contradictory results? Thank you in advance!

Upvotes: 2

Views: 3183

Answers (2)

romeric

Reputation: 2385

You can use Fastor's unused and timeit functions to do proper benchmarking without having to worry about whether the compiler will eliminate your code as dead code.

I would write your benchmark as follows:

constexpr int FIFTY  = 50;
constexpr int TWENTY = 20;
constexpr int TEN    = 10;
constexpr int FOUR   = 4;

using real_ = double;

real_ A[TWENTY][FIFTY][TEN] = {};
real_ B[FIFTY][TEN][TWENTY][FOUR] = {};
real_ C[FOUR] = {};
real_ D[TEN][TEN][FOUR] = {};


Fastor::Tensor<real_,TWENTY,FIFTY,TEN> fA;
Fastor::Tensor<real_,FIFTY,TEN,TWENTY,FOUR> fB;

Eigen::TensorFixedSize<real_, Eigen::Sizes<TWENTY, FIFTY, TEN>> eA;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FIFTY, TEN, TWENTY, FOUR>> eB;
Eigen::TensorFixedSize<real_, Eigen::Sizes<FOUR>> eC;
Eigen::TensorFixedSize<real_, Eigen::Sizes<TEN, TEN, FOUR>> eD;


void CTran_C() {

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    C[l] += A[i][j][k] * B[j][k][i][l];

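    // unused() keeps A, B and C observably live, so the optimizer cannot
    // remove the loops above as dead code.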
    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(C);
}

void CTran_D() {

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                A[i][j][k] = 1.0;

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int l = 0; l < FOUR; l++)
                    B[j][k][i][l] = 1.0;

    for ( int i = 0; i < TWENTY; i++)
        for ( int j = 0; j < FIFTY; j++)
            for ( int k = 0; k < TEN; k++)
                for ( int m = 0; m < TEN; m++)
                    for ( int n = 0; n < FOUR; n++)
                        D[k][m][n] += A[i][j][k] * B[j][m][i][n];

    Fastor::unused(A);
    Fastor::unused(B);
    Fastor::unused(D);
}


void Fastor_C() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fC = einsum<Index<I,J,K>,Index<J,K,I,M>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fC);
}

void Fastor_D() {

    using namespace Fastor;
    enum {I,J,K,L,M,N};

    fA.ones();
    fB.ones();

    auto fD = einsum<Index<I,J,K>,Index<J,M,I,N>>(fA,fB);

    unused(fA);
    unused(fB);
    unused(fD);
}


void Eigen_C() {

    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

    array<IndexPair<int>,3> IJK_JKIM = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
        IndexPair<int>(2, 1),
    };

    eC = eA.contract(eB, IJK_JKIM);

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eC);
}


void Eigen_D() {
    using namespace Eigen;

    eA.setConstant(1);
    eB.setConstant(1);

    array<IndexPair<int>,2> IJK_JMIN = {
        IndexPair<int>(0, 2),
        IndexPair<int>(1, 0),
    };

    eD = eA.contract(eB, IJK_JMIN);

    Fastor::unused(eA);
    Fastor::unused(eB);
    Fastor::unused(eD);
}


int main() {
    using namespace Fastor;

    print("Time for computing tensor C (CTran, Fastor, Eigen):");
    timeit(CTran_C);
    timeit(Fastor_C);
    timeit(Eigen_C);
    print("Time for computing tensor D (CTran, Fastor, Eigen):");
    timeit(CTran_D);
    timeit(Fastor_D);
    timeit(Eigen_D);

    return 0;
}

Compiling with GCC 9 (g++-9 -O3 -DNDEBUG -mfma), with Eigen 3.3.7 and Fastor's most recent version from GitHub, this is what I get:

Time for computing tensor C (CTran, Fastor, Eigen):
32305 runs, average elapsed time is 30.9556 µs. No of CPU cycles 113604
42325 runs, average elapsed time is 23.6269 µs. No of CPU cycles 86643
2585 runs, average elapsed time is 387.003 µs. No of CPU cycles 1429229
Time for computing tensor D (CTran, Fastor, Eigen):
8086 runs, average elapsed time is 123.674 µs. No of CPU cycles 455798
21246 runs, average elapsed time is 47.0713 µs. No of CPU cycles 173044
5890 runs, average elapsed time is 169.793 µs. No of CPU cycles 626651

Note that the compiler auto-vectorises your CTran code, as it is a straightforward function with for loops, so the generated code is close to optimal. As you can see, Fastor performs better than the others in both cases.
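
If you want to verify this yourself, GCC can report which loops it vectorised; for example, using GCC's opt-info flags (the source file name here is a placeholder):

g++-9 -O3 -DNDEBUG -mfma -fopt-info-vec-optimized benchmark.cpp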

However, if you are really benchmarking stack-allocated data, then you should decrease the size of your tensors, because those dimensions are very unrealistic for the stack. If I redefine the sizes of your tensors as

constexpr int FIFTY  = 5;
constexpr int TWENTY = 2;
constexpr int TEN    = 8;
constexpr int FOUR   = 4;

then here is what I get:

Time for computing tensor C (CTran, Fastor, Eigen):
5493029 runs, average elapsed time is 182.049 ns. No of CPU cycles 486
5865156 runs, average elapsed time is 170.498 ns. No of CPU cycles 442
301422 runs, average elapsed time is 3.31761 µs. No of CPU cycles 12032
Time for computing tensor D (CTran, Fastor, Eigen):
207912 runs, average elapsed time is 4.80974 µs. No of CPU cycles 17574
2733602 runs, average elapsed time is 365.818 ns. No of CPU cycles 1164
514351 runs, average elapsed time is 1.94421 µs. No of CPU cycles 6977

For computing tensor D, the compiler optimises the CTran code very poorly at this size. Notice the difference between microseconds (µs) and nanoseconds (ns).

Upvotes: 7

Ilya Popov

Reputation: 4010

For you "simple loop" benchmark, GCC can see through all the computations and remove all the of it completely. See interactive on godbolt: https://www.godbolt.org/z/-gPPVH.

The assembly for your "simple loop" fragment is the following:

foo():
        push    rbp
        push    rbx
        sub     rsp, 8
        call    clock
        mov     rbp, rax
        call    clock
        mov     edx, 29
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rbx, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        sub     rbx, rbp
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rsi, rbx
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L7
        cmp     BYTE PTR [rbx+56], 0
        je      .L5
        movsx   esi, BYTE PTR [rbx+67]
.L6:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        call    clock
        mov     rbp, rax
        call    clock
        mov     edx, 30
        mov     esi, OFFSET FLAT:.LC1
        mov     edi, OFFSET FLAT:_ZSt4cout
        mov     rbx, rax
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        mov     rsi, rbx
        mov     edi, OFFSET FLAT:_ZSt4cout
        sub     rsi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<long>(long)
        mov     rbp, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rbx, QWORD PTR [rbp+240+rax]
        test    rbx, rbx
        je      .L7
        cmp     BYTE PTR [rbx+56], 0
        je      .L8
        movsx   esi, BYTE PTR [rbx+67]
.L9:
        mov     rdi, rbp
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        add     rsp, 8
        pop     rbx
        mov     rdi, rax
        pop     rbp
        jmp     std::basic_ostream<char, std::char_traits<char> >::flush()
.L5:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
        je      .L6
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L6
.L8:
        mov     rdi, rbx
        call    std::ctype<char>::_M_widen_init() const
        mov     rax, QWORD PTR [rbx]
        mov     esi, 10
        mov     rax, QWORD PTR [rax+48]
        cmp     rax, OFFSET FLAT:std::ctype<char>::do_widen(char) const
        je      .L9
        mov     rdi, rbx
        call    rax
        movsx   esi, al
        jmp     .L9
.L7:
        call    std::__throw_bad_cast()

As you can see, the compiler has completely removed all the computation.

The biggest contributing factor is that the result of the computation is not used anywhere, which allows the compiler to remove it as "dead code".
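
Without a framework, a common trick is to pass the result through an empty inline-asm statement, so that the optimizer must assume the value is observed. A minimal sketch (the helper name escape is our own; the technique works on GCC and Clang):

static void escape(void* p) {
    // The "memory" clobber tells the optimizer that memory reachable
    // through p may be read or written, so the computation that
    // produced it cannot be discarded as dead code.
    asm volatile("" : : "g"(p) : "memory");
}

// after the contraction loops:
// escape(C);   // C must now actually be computed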

Curiously, neither Clang nor MSVC (see the Godbolt link above) managed to remove the computation completely: Clang removed the first contraction, but not the second.


This is a common problem when writing microbenchmarks. Mature benchmarking frameworks such as Google Benchmark have means to mitigate this by marking certain variables as not optimizable away, forcing the compiler to actually compute their values. I strongly recommend using one of these frameworks.
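
For instance, the loop version of the first contraction could be written with Google Benchmark roughly like this (a minimal sketch, assuming the library is available; benchmark::DoNotOptimize is its escape hatch for exactly this problem):

#include <benchmark/benchmark.h>

static void BM_ContractionC(benchmark::State& state) {
    static double A[20][50][10];
    static double B[50][10][20][4];
    for (int i = 0; i < 20; i++)
        for (int j = 0; j < 50; j++)
            for (int k = 0; k < 10; k++) {
                A[i][j][k] = 1.0;
                for (int l = 0; l < 4; l++)
                    B[j][k][i][l] = 1.0;
            }
    for (auto _ : state) {
        double C[4] = {};
        for (int i = 0; i < 20; i++)
            for (int j = 0; j < 50; j++)
                for (int k = 0; k < 10; k++)
                    for (int l = 0; l < 4; l++)
                        C[l] += A[i][j][k] * B[j][k][i][l];
        benchmark::DoNotOptimize(C);  // the result must now be materialized
    }
}
BENCHMARK(BM_ContractionC);
BENCHMARK_MAIN();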

Upvotes: 1
