Reputation: 17021

Mono SIMD worsening performance?

Benchmark Code:

using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using Mono.Simd;
using MathNet.Numerics.LinearAlgebra.Single;

namespace XXX {
// Extension methods for System.TimeSpan.
public static class TimeSpanExtensions {
    // Expresses a time span in nanoseconds by scaling its total milliseconds
    // by one million (nanoseconds per millisecond).
    //
    // timeSpan: the interval to convert.
    // Returns:  the interval as a (possibly fractional) number of nanoseconds.
    public static double TotalNanoseconds(this TimeSpan timeSpan) {
        const double nanosecondsPerMillisecond = 1000000.0;

        double milliseconds = timeSpan.TotalMilliseconds;
        return milliseconds * nanosecondsPerMillisecond;
    }
}

// Benchmark whose measured operation is a single addition of two Vector4f values.
public sealed class SimdBenchmark : Benchmark {
    // Both operands are initialised to the components (1, 2, 3, 4).
    private Vector4f _left = new Vector4f(1.0f, 2.0f, 3.0f, 4.0f);
    private Vector4f _right = new Vector4f(1.0f, 2.0f, 3.0f, 4.0f);

    // Receives the sum; nothing in this class reads it back.
    private Vector4f _sum;

    // Adds the two operands once and stores the result in a field.
    public override void Do() {
        _sum = _left + _right;
    }
}

// Benchmark whose measured operation is a single addition of two four-element
// DenseVector instances.
public sealed class MathNetBenchmark : Benchmark {
    // Both operands are built from the array { 1, 2, 3, 4 }.
    private DenseVector _left = new DenseVector(new float[]{1.0f,2.0f,3.0f,4.0f});
    private DenseVector _right = new DenseVector(new float[]{1.0f,2.0f,3.0f,4.0f});

    // Receives the sum; nothing in this class reads it back.
    private DenseVector _sum;

    // Adds the two operands once and stores the result in a field.
    public override void Do() {
        _sum = _left + _right;
    }
}

// Benchmark whose measured operation is a single addition of two values of the
// hand-written Vector4 struct.
public sealed class DefaultBenchmark : Benchmark {
    // Both operands are initialised to the components (1, 2, 3, 4).
    private Vector4 _left = new Vector4(1.0f, 2.0f, 3.0f, 4.0f);
    private Vector4 _right = new Vector4(1.0f, 2.0f, 3.0f, 4.0f);

    // Receives the sum; nothing in this class reads it back.
    private Vector4 _sum;

    // Adds the two operands once and stores the result in a field.
    public override void Do() {
        _sum = _left + _right;
    }
}

// Benchmark whose measured operation is a single scalar float addition.
public sealed class SimpleBenchmark : Benchmark {
    // Scalar operands: 1 and 2.
    private float _left = 1.0f;
    private float _right = 2.0f;

    // Receives the sum; nothing in this class reads it back.
    private float _sum;

    // Adds the two operands once and stores the result in a field.
    public override void Do() {
        _sum = _left + _right;
    }
}

// Benchmark that measures an arbitrary caller-supplied delegate.
public sealed class DelegateBenchmark : Benchmark {
    private readonly Action _action;

    // Wraps the given delegate so that it can be timed like any other benchmark.
    //
    // action: the delegate invoked by every call to Do(); must not be null.
    // Throws: ArgumentNullException when action is null.
    public DelegateBenchmark(Action action) {
        // Fail at construction time instead of with a NullReferenceException
        // in the middle of a measurement.
        if (action == null) {
            throw new ArgumentNullException("action");
        }

        _action = action;
    }

    // Invokes the wrapped delegate once.
    public override void Do() {
        _action();
    }
}

// Base class for micro-benchmarks. Enumerating an instance yields an endless
// sequence of timings, one per call to Do().
public abstract class Benchmark : IEnumerable<TimeSpan> {
    // The operation to be timed; implemented by each concrete benchmark.
    public abstract void Do();

    // Performs one untimed warm-up call and a garbage collection, then yields
    // the elapsed time of every subsequent Do() call. The sequence never ends,
    // so callers have to bound it themselves (Main does so with Take).
    public IEnumerator<TimeSpan> GetEnumerator() {
        // Warm-up: the first call is not measured.
        Do();

        // Collect garbage and wait until finalizers finish before measuring.
        GC.Collect();
        GC.WaitForPendingFinalizers();

        var timer = new Stopwatch();

        for (;;) {
            // Restart() zeroes the elapsed time and starts timing in one step.
            timer.Restart();
            Do();
            timer.Stop();

            yield return timer.Elapsed;
        }
    }

    // Non-generic enumeration forwards to the generic enumerator.
    IEnumerator IEnumerable.GetEnumerator() {
        return GetEnumerator();
    }
}

// Plain four-component single-precision vector supporting component-wise addition.
public struct Vector4 {
    // Components, in declaration order x, y, z, w.
    private float x;
    private float y;
    private float z;
    private float w;

    // Initialises all four components from the given values.
    public Vector4(float x, float y, float z, float w) {
        this.x = x;
        this.y = y;
        this.z = z;
        this.w = w;
    }

    // Returns a new vector whose components are the pairwise sums of the
    // components of v1 and v2.
    public static Vector4 operator +(Vector4 v1, Vector4 v2) {
        return new Vector4(
            v1.x + v2.x,
            v1.y + v2.y,
            v1.z + v2.z,
            v1.w + v2.w);
    }
}

// Program entry point: measures every benchmark and prints the average time
// per sample in nanoseconds.
class MainClass {
    // Number of timings drawn from each benchmark.
    const int SampleCount = 1000;

    public static void Main(string[] args) {
        // Each benchmark is constructed and fully measured before the next one
        // starts; nothing is printed until all four averages are known.
        double simdAverage = AverageNanoseconds(new SimdBenchmark());
        double simpleAverage = AverageNanoseconds(new SimpleBenchmark());
        double defaultAverage = AverageNanoseconds(new DefaultBenchmark());
        double mathNetAverage = AverageNanoseconds(new MathNetBenchmark());

        double[] averages = { simdAverage, simpleAverage, defaultAverage, mathNetAverage };
        foreach (double average in averages) {
            Console.WriteLine(average + " ns");
        }
    }

    // Takes SampleCount timings from the benchmark and averages them in nanoseconds.
    static double AverageNanoseconds(Benchmark benchmark) {
        return benchmark.Take(SampleCount).Average(sample => sample.TotalNanoseconds());
    }
}
}

Environment Setup:

Windows 7 / Mono 2.10.8 / MonoDevelop 2.8.5

MonoDevelop Setup:

Tools > Options > .NET Runtimes > Mono 2.10.8 (Default)
Project > Options > Build > General > Target framework > Mono / .NET 4.0
Project > Options > Build > Compiler > General Options > Enable optimizations
Project > Options > Build > Compiler > General Options > Platform target > x86
Project > Options > Run > General > Parameters > -O=simd

Results:

94.4 ns
29.7 ns
49.9 ns
231595.2 ns

Upvotes: 3

Answers (3)

Alexander Shukaev

Reputation: 17021

Well, I've managed to modify my benchmark code to make it more robust and completely unbiased. In other words:

First, as we discussed with Nicholas, measuring a single operation might give distorted results. Moreover, since the frequency of Stopwatch is 10 million, ticks occur every 100 ns. Considering this fact, the previous results look rather bizarre. Therefore, in order to mitigate this issue, I decided to test 1000 operations at a time rather than 1.

Second, I'm not completely sure, but I guess that the previous benchmark implementation was subject to intensive caching, since on every iteration the sums were computed between the same vectors (their components never changed). The only straightforward solution I see is to simply rebuild the vectors with random components before every test.

The respective benchmark implementation is:

// Extension methods for System.TimeSpan.
public static class TimeSpanExtensions {
    // Converts a time span to nanoseconds: its total milliseconds multiplied
    // by one million.
    //
    // timeSpan: the interval to convert.
    // Returns:  the interval as a (possibly fractional) number of nanoseconds.
    public static double TotalNanoseconds(this TimeSpan timeSpan) {
        const double nanosecondsPerMillisecond = 1000000.0;

        double milliseconds = timeSpan.TotalMilliseconds;
        return milliseconds * nanosecondsPerMillisecond;
    }
}

// Single-precision convenience methods for System.Random.
public static class RandomExtensions {
    // Largest single-precision value strictly below 1.0f (1 - 2^-24).
    private const float LargestFloatBelowOne = 0.99999994f;

    // Returns a random single-precision number in the half-open range [0, 1).
    //
    // random: the generator to draw from; must not be null.
    // Throws: ArgumentNullException when random is null.
    public static float NextFloat(this Random random) {
        if (random == null) {
            throw new ArgumentNullException("random");
        }

        // NextDouble() is always below 1.0, but doubles very close to 1.0 round
        // up to exactly 1.0f when narrowed to float. Clamp so that the upper
        // bound stays exclusive.
        float value = (float)random.NextDouble();
        return value < 1.0f ? value : LargestFloatBelowOne;
    }

    // Returns a random single-precision number obtained by scaling
    // NextFloat() by (max - min) and offsetting it by min.
    //
    // random: the generator to draw from; must not be null.
    // min:    value returned when the underlying sample is 0.
    // max:    upper end of the range; floating-point rounding in the scaling
    //         may occasionally produce a result equal to max.
    // Throws: ArgumentNullException when random is null.
    public static float NextFloat(this Random random, float min, float max) {
        return random.NextFloat() * (max - min) + min;
    }
}

// Benchmark whose measured operation is 1000 element-wise Vector4f additions.
public sealed class SimdBenchmark : Benchmark {
    // Number of vector pairs added per measured run.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a fresh
    // clock-seeded Random inside every Begin() call can yield the same seed -
    // and therefore the same "random" operands - when calls follow each other
    // quickly, which defeats the purpose of re-randomising the inputs.
    private readonly Random _random = new Random();

    private readonly Vector4f[] a = new Vector4f[Count];
    private readonly Vector4f[] b = new Vector4f[Count];
    private readonly Vector4f[] c = new Vector4f[Count];

    // Refills both operand arrays with vectors of fresh random components.
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new Vector4f(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
            b[i] = new Vector4f(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
        }
    }

    // Measured work: adds every operand pair and stores the sums in c.
    public override void Do() {
        for (int i = 0; i < Count; ++i) {
            c[i] = a[i] + b[i];
        }
    }

    // Nothing to clean up after a run.
    public override void End() {
    }
}

// Benchmark whose measured operation is 1000 additions of four-element
// DenseVector pairs.
public sealed class MathNetBenchmark : Benchmark {
    // Number of vector pairs added per measured run.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a fresh
    // clock-seeded Random inside every Begin() call can yield the same seed -
    // and therefore the same "random" operands - when calls follow each other
    // quickly, which defeats the purpose of re-randomising the inputs.
    private readonly Random _random = new Random();

    private readonly DenseVector[] a = new DenseVector[Count];
    private readonly DenseVector[] b = new DenseVector[Count];
    private readonly DenseVector[] c = new DenseVector[Count];

    // Refills both operand arrays with vectors of fresh random components.
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new DenseVector(new float[]{_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat()});
            b[i] = new DenseVector(new float[]{_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat()});
        }
    }

    // Measured work: adds every operand pair and stores the sums in c.
    public override void Do() {
        for (int i = 0; i < Count; ++i) {
            c[i] = a[i] + b[i];
        }
    }

    // Nothing to clean up after a run.
    public override void End() {
    }
}

// Benchmark whose measured operation is 1000 additions of pairs of the
// hand-written Vector4 struct.
public sealed class DefaultBenchmark : Benchmark {
    // Number of vector pairs added per measured run.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a fresh
    // clock-seeded Random inside every Begin() call can yield the same seed -
    // and therefore the same "random" operands - when calls follow each other
    // quickly, which defeats the purpose of re-randomising the inputs.
    private readonly Random _random = new Random();

    private readonly Vector4[] a = new Vector4[Count];
    private readonly Vector4[] b = new Vector4[Count];
    private readonly Vector4[] c = new Vector4[Count];

    // Refills both operand arrays with vectors of fresh random components.
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = new Vector4(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
            b[i] = new Vector4(_random.NextFloat(), _random.NextFloat(), _random.NextFloat(), _random.NextFloat());
        }
    }

    // Measured work: adds every operand pair and stores the sums in c.
    public override void Do() {
        for (int i = 0; i < Count; ++i) {
            c[i] = a[i] + b[i];
        }
    }

    // Nothing to clean up after a run.
    public override void End() {
    }
}

// Benchmark whose measured operation is 1000 scalar float additions.
public sealed class SimpleBenchmark : Benchmark {
    // Number of scalar pairs added per measured run.
    private const int Count = 1000;

    // One generator for the lifetime of the benchmark. Creating a fresh
    // clock-seeded Random inside every Begin() call can yield the same seed -
    // and therefore the same "random" operands - when calls follow each other
    // quickly, which defeats the purpose of re-randomising the inputs.
    private readonly Random _random = new Random();

    private readonly float[] a = new float[Count];
    private readonly float[] b = new float[Count];
    private readonly float[] c = new float[Count];

    // Refills both operand arrays with fresh random values.
    public override void Begin() {
        for (int i = 0; i < Count; ++i) {
            a[i] = _random.NextFloat();
            b[i] = _random.NextFloat();
        }
    }

    // Measured work: adds every operand pair and stores the sums in c.
    public override void Do() {
        for (int i = 0; i < Count; ++i) {
            c[i] = a[i] + b[i];
        }
    }

    // Nothing to clean up after a run.
    public override void End() {
    }
}

// Benchmark that measures an arbitrary caller-supplied delegate.
public sealed class DelegateBenchmark : Benchmark {
    private readonly Action _action;

    // Wraps the given delegate so that it can be timed like any other benchmark.
    //
    // action: the delegate invoked by every call to Do(); must not be null.
    // Throws: ArgumentNullException when action is null.
    public DelegateBenchmark(Action action) {
        // Fail at construction time instead of with a NullReferenceException
        // in the middle of a measurement.
        if (action == null) {
            throw new ArgumentNullException("action");
        }

        _action = action;
    }

    // No per-run setup is needed.
    public override void Begin() {
    }

    // Invokes the wrapped delegate once.
    public override void Do() {
        _action();
    }

    // No per-run cleanup is needed.
    public override void End() {
    }
}

// Base class for micro-benchmarks with per-run setup and teardown. Enumerating
// an instance yields an endless sequence of timings, one per call to Do().
public abstract class Benchmark : IEnumerable<TimeSpan> {
    // Prepares the inputs for one run; called outside the timed region.
    public abstract void Begin();

    // The operation to be timed.
    public abstract void Do();

    // Cleans up after one run; called outside the timed region.
    public abstract void End();

    // Performs one untimed Begin/Do/End warm-up cycle, then repeats forever:
    // Begin, garbage collection, timed Do, End, yield the elapsed time.
    // The sequence never ends, so callers have to bound it themselves
    // (Main does so with Take).
    public IEnumerator<TimeSpan> GetEnumerator() {
        // Warm-up cycle: not measured.
        Begin();
        Do();
        End();

        var timer = new Stopwatch();

        for (;;) {
            Begin();

            // Collect garbage and wait until finalizers finish, so that the
            // allocations made by Begin() are dealt with before timing starts.
            GC.Collect();
            GC.WaitForPendingFinalizers();

            // Restart() zeroes the elapsed time and starts timing in one step.
            timer.Restart();

            Do();

            timer.Stop();

            End();

            yield return timer.Elapsed;
        }
    }

    // Non-generic enumeration forwards to the generic enumerator.
    IEnumerator IEnumerable.GetEnumerator() {
        return GetEnumerator();
    }
}

// Plain four-component single-precision vector supporting component-wise addition.
public struct Vector4 {
    // Components, in declaration order x, y, z, w.
    private float x;
    private float y;
    private float z;
    private float w;

    // Initialises all four components from the given values.
    public Vector4(float x, float y, float z, float w) {
        this.x = x;
        this.y = y;
        this.z = z;
        this.w = w;
    }

    // Returns a new vector whose components are the pairwise sums of the
    // components of v1 and v2.
    public static Vector4 operator +(Vector4 v1, Vector4 v2) {
        return new Vector4(
            v1.x + v2.x,
            v1.y + v2.y,
            v1.z + v2.z,
            v1.w + v2.w);
    }
}

// Program entry point: measures every benchmark and prints the average time
// per sample in nanoseconds.
class MainClass {
    // Number of timings drawn from each benchmark.
    const int SampleCount = 1000;

    public static void Main(string[] args) {
        // Each benchmark is constructed and fully measured before the next one
        // starts; nothing is printed until all four averages are known.
        double simdAverage = AverageNanoseconds(new SimdBenchmark());
        double simpleAverage = AverageNanoseconds(new SimpleBenchmark());
        double defaultAverage = AverageNanoseconds(new DefaultBenchmark());
        double mathNetAverage = AverageNanoseconds(new MathNetBenchmark());

        double[] averages = { simdAverage, simpleAverage, defaultAverage, mathNetAverage };
        foreach (double average in averages) {
            Console.WriteLine(average + " ns");
        }
    }

    // Takes SampleCount timings from the benchmark and averages them in nanoseconds.
    static double AverageNanoseconds(Benchmark benchmark) {
        return benchmark.Take(SampleCount).Average(sample => sample.TotalNanoseconds());
    }
}

Results:

3203.9 ns
2677.4 ns
20138.4 ns
597581060.7 ns

I think it confirms that SIMD is in effect, because SimdBenchmark is getting close to SimpleBenchmark (as intended by SIMD technology) and is much better than DefaultBenchmark (again, as implied by SIMD technology).

Moreover, the results seem consistent with konrad.kruczynski's, because the ratio between DefaultBenchmark (20138.4) and SimdBenchmark (3203.9) is about 6.3, and the ratio between usualVector (29598) and simdVector (5802) is about 5.1, which is in the same range.

Anyway 2 questions still remain:

Why does playing with "-O=simd" / "-O=-simd" have no effect? Is it deprecated? Is SIMD automatically engaged?
How could Stopwatch with 100 ns ticks give previous results (94.4, 29.7, 49.9), which are obviously lower than 100 ns?

Upvotes: 1

konrad.kruczynski

Reputation: 47571

These are my results:

1608.8 ns
1554.9 ns
1582.5 ns

(without MathNET, although it is not important here). The OS is Ubuntu 10.10 (32-bit), with Mono 2.10.7. At this point you may consider filing a bug report against the Windows version of Mono. But:

I think this is not the right way to benchmark SIMD operations, because of the overhead of the benchmark's own mechanisms.

For example, look at this primitive test based on your Vector4 class.

        // Number of additions performed inside each timed loop.
        const int count = 100000;
        var simdVector = new Vector4f(1, 2, 3, 4);
        var simdResult = simdVector;
        // First measurement: the operand and the accumulator are created
        // before the stopwatch is started.
        var sw = Stopwatch.StartNew();
        for(var i = 0; i < count; i++)
        {
            simdResult += simdVector;
        }
        sw.Stop();
        // The accumulated vector is printed together with the elapsed time.
        Console.WriteLine("SIMD  result: {0} {1}", sw.Elapsed, simdResult);
        // Second measurement: here the stopwatch is started before the operand
        // and the accumulator are created, so their construction falls inside
        // the timed region (unlike in the first measurement).
        sw = Stopwatch.StartNew();
        var usualVector = new Vector4(1, 2, 3, 4);
        var usualResult = usualVector;
        for(var i = 0; i < count; i++)
        {
            usualResult += usualVector;
        }
        sw.Stop();
        Console.WriteLine("Usual result: {0} {1}", sw.Elapsed, usualResult);

On my machine results are:

SIMD  result: 00:00:00.0005802 <100001, 200002, 300003, 400004>
Usual result: 00:00:00.0029598 <100001, 200002, 300003, 400004>

So this is definitely different from your tests. You may conclude that SIMD operations are faster by that factor - but benchmarking is not that easy. There are many possible reasons for the upper loop being faster in this configuration. These reasons can be discussed on another occasion.

Nevertheless, it is certain that SIMD operations are faster than a couple of adds in a row. What you should check is whether they are really emitted.

On Linux, one can check the generated assembly (meaning the target processor's assembly, not the Mono assembly ;)) using mono -v -v. However, I am not sure whether it works on a usual Windows system, as it probably uses disas from GCC (you may have more luck using cygwin). By reading this assembly you can check whether SIMD operations are really emitted.

For example, by examining the assembly generated for the program pasted above, one can find that it uses the addps instruction in its SIMD loop, which is what we are looking for here.

Oh, and for completeness, here is the output with SIMD disabled:

$ mono --optimize=-simd SimdTest.exe 
SIMD result: 00:00:00.0027111 <100001, 200002, 300003, 400004>
Usual result: 00:00:00.0026127 <100001, 200002, 300003, 400004>

which is not as important as the generated assembly, which contains no SIMD operations.

Hope this was helpful.

Upvotes: 6

Nick Butler

Reputation: 24383

I would suspect your benchmark infrastructure first.

A couple of points might be:

You are using a `Stopwatch` to time single operations - it doesn't have the resolution
Your timings include a virtual function call
Your sample size ( 1000 ) is too small

Upvotes: 7

Mono SIMD worsening performance?

Answers (3)

Related Questions