
Reputation: 71

C# is there a dependency between code length and performance?

The size of the speed difference is less important to me than the reason for it. One method is short; the other is, in my opinion, much better optimized. Yet in my test the shorter, supposedly worse method is faster. Does anyone know why?

The optimized method is somewhat similar to the Memmove method of the Buffer class in the .NET Framework 4.8 library.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.Versioning;
using System.Runtime.ConstrainedExecution;
using System.Diagnostics;

namespace test
{
    /// <summary>
    /// Micro-benchmark comparing two hand-written pointer-based char copy routines:
    /// a short loop-based <see cref="Copy"/> and an unrolled <see cref="OptimizedCopy"/>.
    /// </summary>
    public static class TestClass
    {
        // Largest copy length (in chars) exercised by the benchmark loops.
        const int MAX_CHARS = 101;

        // Number of outer benchmark iterations; each one copies every length 1..MAX_CHARS.
        static int _cyclesLength = 10000000;

        // Shared source text for the benchmark; longer than MAX_CHARS so every tested length is in bounds.
        static char[] _source;

        static TestClass()
        {
            _source = ("A computer is a machine that can be programmed to carry out sequences of arithmetic or logical operations automatically. Modern computers can perform generic "
                + "sets of operations known as programs. These programs enable computers to perform a wide range of tasks. A computer system is a complete computer that includes the "
                + "hardware, operating system (main software), and peripheral equipment needed and used for full operation. This term may also refer to a group of computers that are "
                + "linked and function together, such as a computer network or computer cluster. A broad range of industrial and consumer products use computers as control systems.Simple "
                + "special-purpose devices like microwave ovens and remote controls are included, as are factory devices like industrial robots and computer - aided design, as well as general "
                + "- purpose devices like personal computers and mobile devices like smartphones.Computers power the Internet, which links hundreds of millions of other computers and users.").ToArray();
        }

        /// <summary>
        /// Copies <paramref name="length"/> chars from <paramref name="source"/> to
        /// <paramref name="target"/> using a simple loop: 8 chars per iteration, then
        /// 2 chars per iteration, then a single trailing char.
        /// </summary>
        /// <param name="source">Pointer to the first char to read.</param>
        /// <param name="target">Pointer to the first char to write.</param>
        /// <param name="length">Number of chars to copy; values less than or equal to 0 copy nothing.</param>
        [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
        public static unsafe void Copy(char* source, char* target, int length)
        {
            // Each int store moves two chars. The direction is source -> target, matching
            // the parameter names and OptimizedCopy (the earlier version wrote into source).
            while (length >= 8)
            {
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                *(int*)(target + 4) = *(int*)(source + 4);
                *(int*)(target + 6) = *(int*)(source + 6);
                target += 8;
                source += 8;
                length -= 8;
            }

            while (length >= 2)
            {
                *(int*)target = *(int*)source;
                target += 2;
                source += 2;
                length -= 2;
            }

            // At most one char is left at this point.
            if (length > 0)
            {
                *target = *source;
            }
        }

        /// <summary>
        /// Copies <paramref name="length"/> chars from <paramref name="source"/> to
        /// <paramref name="target"/> using unrolled code: short lengths are handled by a
        /// switch, longer ones copy the unaligned tail first and then whole 8-char groups.
        /// </summary>
        /// <param name="source">Pointer to the first char to read.</param>
        /// <param name="target">Pointer to the first char to write.</param>
        /// <param name="length">Number of chars to copy; values less than or equal to 0 copy nothing.</param>
        [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
        public static unsafe void OptimizedCopy(char* source, char* target, int length)
        {
            if (length < 8)
            {
                switch (length)
                {
                    case 0:
                        break;
                    case 1:
                        *target = *source;
                        break;
                    case 2:
                        *(int*)target = *(int*)source;
                        break;
                    case 3:
                        *(int*)target = *(int*)source;
                        *(target + 2) = *(source + 2);
                        break;
                    case 4:
                        *(int*)target = *(int*)source;
                        *(int*)(target + 2) = *(int*)(source + 2);
                        break;
                    case 5:
                        *(int*)target = *(int*)source;
                        *(int*)(target + 2) = *(int*)(source + 2);
                        *(target + 4) = *(source + 4);
                        break;
                    case 6:
                        *(int*)target = *(int*)source;
                        *(int*)(target + 2) = *(int*)(source + 2);
                        *(int*)(target + 4) = *(int*)(source + 4);
                        break;
                    case 7:
                        *(int*)target = *(int*)source;
                        *(int*)(target + 2) = *(int*)(source + 2);
                        *(int*)(target + 4) = *(int*)(source + 4);
                        *(target + 6) = *(source + 6);
                        break;
                }

                // Always leave here: the code below assumes length >= 8, and a negative
                // length must not reach the tail switch (it would write before the buffer).
                return;
            }

            // Copy the trailing (length % 8) chars first, addressed from the end of the range.
            // For odd remainders the first int store overlaps one char that the main loop
            // also writes; both writes store the same value.
            switch (length & 7)
            {
                case 0:
                    break;
                case 1:
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 2:
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 3:
                    *(int*)(target + length - 4) = *(int*)(source + length - 4);
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 4:
                    *(int*)(target + length - 4) = *(int*)(source + length - 4);
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 5:
                    *(int*)(target + length - 6) = *(int*)(source + length - 6);
                    *(int*)(target + length - 4) = *(int*)(source + length - 4);
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 6:
                    *(int*)(target + length - 6) = *(int*)(source + length - 6);
                    *(int*)(target + length - 4) = *(int*)(source + length - 4);
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
                case 7:
                    *(int*)(target + length - 8) = *(int*)(source + length - 8);
                    *(int*)(target + length - 6) = *(int*)(source + length - 6);
                    *(int*)(target + length - 4) = *(int*)(source + length - 4);
                    *(int*)(target + length - 2) = *(int*)(source + length - 2);
                    break;
            }

            // Copy whole 8-char groups from the front, up to six groups (48 chars) per pass.
            // The tail was already written above, so each check only asks whether another
            // complete group remains.
            while (true)
            {
                *(int*)target = *(int*)source;
                *(int*)(target + 2) = *(int*)(source + 2);
                *(int*)(target + 4) = *(int*)(source + 4);
                *(int*)(target + 6) = *(int*)(source + 6);

                if (length < 16) return;
                *(int*)(target + 8) = *(int*)(source + 8);
                *(int*)(target + 10) = *(int*)(source + 10);
                *(int*)(target + 12) = *(int*)(source + 12);
                *(int*)(target + 14) = *(int*)(source + 14);

                if (length < 24) return;
                *(int*)(target + 16) = *(int*)(source + 16);
                *(int*)(target + 18) = *(int*)(source + 18);
                *(int*)(target + 20) = *(int*)(source + 20);
                *(int*)(target + 22) = *(int*)(source + 22);

                if (length < 32) return;
                *(int*)(target + 24) = *(int*)(source + 24);
                *(int*)(target + 26) = *(int*)(source + 26);
                *(int*)(target + 28) = *(int*)(source + 28);
                *(int*)(target + 30) = *(int*)(source + 30);

                if (length < 40) return;
                *(int*)(target + 32) = *(int*)(source + 32);
                *(int*)(target + 34) = *(int*)(source + 34);
                *(int*)(target + 36) = *(int*)(source + 36);
                *(int*)(target + 38) = *(int*)(source + 38);

                if (length < 48) return;
                *(int*)(target + 40) = *(int*)(source + 40);
                *(int*)(target + 42) = *(int*)(source + 42);
                *(int*)(target + 44) = *(int*)(source + 44);
                *(int*)(target + 46) = *(int*)(source + 46);

                // Fewer than 8 chars beyond these 48 means only the already-copied tail is left.
                if (length < 56) return;
                source += 48;
                target += 48;
                length -= 48;
            }
        }

        /// <summary>
        /// Runs <see cref="Copy"/> for every length 1..MAX_CHARS, repeated _cyclesLength times.
        /// </summary>
        /// <returns>Always 1; the value carries no measurement.</returns>
        private static unsafe long TestCopy()
        {
            long cyclesLength = _cyclesLength;
            char[] sourceArr = _source;
            char[] targetArr = new char[MAX_CHARS];

            // Pin both arrays once so the pointers stay valid for the whole run.
            fixed (char* source = sourceArr, target = targetArr)
            {
                for (long i = 0; i < cyclesLength; i++)
                {
                    for (int j = 1; j <= MAX_CHARS; j++)
                    {
                        Copy(source, target, j);
                    }
                }
            }

            return 1;
        }

        /// <summary>
        /// Runs <see cref="OptimizedCopy"/> for every length 1..MAX_CHARS, repeated _cyclesLength times.
        /// </summary>
        /// <returns>Always 1; the value carries no measurement.</returns>
        private static unsafe long TestOptimizedCopy()
        {
            long cyclesLength = _cyclesLength;
            char[] sourceArr = _source;
            char[] targetArr = new char[MAX_CHARS];

            // Pin both arrays once so the pointers stay valid for the whole run.
            fixed (char* source = sourceArr, target = targetArr)
            {
                for (long i = 0; i < cyclesLength; i++)
                {
                    for (int j = 1; j <= MAX_CHARS; j++)
                    {
                        OptimizedCopy(source, target, j);
                    }
                }
            }

            return 1;
        }

        /// <summary>
        /// Times <see cref="TestCopy"/> and <see cref="TestOptimizedCopy"/> with a
        /// <see cref="Stopwatch"/> and prints the elapsed seconds of each to the console.
        /// </summary>
        /// <param name="pocetCyklu">
        /// Currently unused. NOTE(review): the name is Czech for "number of cycles", so it
        /// presumably was meant to override the iteration count - confirm before wiring it up.
        /// </param>
        public static unsafe void TestMethod(long pocetCyklu = 0)
        {
            Stopwatch stopwatch = new Stopwatch();

            // The stopwatch has to run around each test; otherwise Elapsed stays at zero.
            stopwatch.Restart();
            TestCopy();
            stopwatch.Stop();
            Console.WriteLine( stopwatch.Elapsed.TotalSeconds.ToString() +  " TestCopy");

            stopwatch.Restart();
            TestOptimizedCopy();
            stopwatch.Stop();
            Console.WriteLine(stopwatch.Elapsed.TotalSeconds.ToString() + " TestOptimizedCopy");
        }
    }
}



It seems that the worse but shorter algorithm is faster for some reason, and I don't know why. Could it be because the CLR needs some time to translate the CIL into machine code?

Edit: Below is a heavily simplified version of the Memmove method from the Buffer class (Buffer.cs) in .NET Framework 4.8. I removed the code for HAS_CUSTOM_BLOCKS, BIT64, overlapping buffers, and so on, and I changed the data type from byte* to char*. The newly added code is now very close to the original, but the result is even worse. I wrote the first, simple method above myself some time ago; now that I have come across professional code from the library, I wanted to replace it. I compared the two first, though, and since then I have been wondering why the new one isn't faster, especially as my solution updates the source, destination and length values much more often. The library code also uses a few tricks, such as copying from the end. A lot of people try to avoid the goto keyword... :)

/// <summary>
/// Copies <paramref name="len"/> chars from <paramref name="src"/> to <paramref name="dest"/>.
/// Simplified char-based variant of the .NET Framework Buffer.Memmove routine: no
/// overlap handling and no native fallback for large copies.
/// </summary>
/// <param name="src">Pointer to the first char to read.</param>
/// <param name="dest">Pointer to the first char to write.</param>
/// <param name="len">Number of chars to copy; values less than or equal to 0 copy nothing.</param>
[ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
internal unsafe static void Memmove(char* src, char* dest, int len)
{
    //const int CopyThreshold = 1024; //PLATFORM_WINDOWS (2048 bytes)

    // len is signed here; a negative value would otherwise pass the bit tests below
    // and write outside the buffers.
    if (len <= 0) return;

    char* srcEnd = src + len;
    char* destEnd = dest + len;

    if (len <= 8) goto MCPY02;
    if (len > 32) goto MCPY05;

MCPY00:
    // 9..32 chars: copy leading groups of 8 chars, then let MCPY01 copy the last 8.
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(dest + 4) = *(int*)(src + 4);
    *(int*)(dest + 6) = *(int*)(src + 6);             // chars [0,8)

    if (len <= 16) goto MCPY01;
    *(int*)(dest + 8) = *(int*)(src + 8);
    *(int*)(dest + 10) = *(int*)(src + 10);
    *(int*)(dest + 12) = *(int*)(src + 12);
    *(int*)(dest + 14) = *(int*)(src + 14);             // chars [0,16)

    if (len <= 24) goto MCPY01;
    *(int*)(dest + 16) = *(int*)(src + 16);
    *(int*)(dest + 18) = *(int*)(src + 18);
    *(int*)(dest + 20) = *(int*)(src + 20);
    *(int*)(dest + 22) = *(int*)(src + 22);             // chars [0,24)

MCPY01:
    // Unconditionally copy the last 8 chars, addressed from the end; this may overlap
    // chars already written above with the same values.
    *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
    *(int*)(destEnd - 6) = *(int*)(srcEnd - 6);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;

MCPY02:
    // 4..8 chars: copy the first 4 and the last 4 chars.
    if ((len & 12) == 0) goto MCPY03;
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;

MCPY03:
    // 2..3 chars: copy the first 2 and the last 2 chars.
    if ((len & 2) == 0) goto MCPY04;
    *(int*)dest = *(int*)src;
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;

MCPY04:
    // 0..1 chars.
    if (len == 0) return;
    *dest = *src;
    return;

MCPY05:
    //if (len > CopyThreshold) goto PInvoke; //I don't use so big range

    // More than 32 chars: copy 32 chars per iteration, n full blocks in total.
    int n = len >> 5;

MCPY06:
    *(int*)dest = *(int*)src;
    *(int*)(dest + 2) = *(int*)(src + 2);
    *(int*)(dest + 4) = *(int*)(src + 4);
    *(int*)(dest + 6) = *(int*)(src + 6);
    *(int*)(dest + 8) = *(int*)(src + 8);
    *(int*)(dest + 10) = *(int*)(src + 10);
    *(int*)(dest + 12) = *(int*)(src + 12);
    *(int*)(dest + 14) = *(int*)(src + 14);
    *(int*)(dest + 16) = *(int*)(src + 16);
    *(int*)(dest + 18) = *(int*)(src + 18);             // was src + 19: read offset must match the write offset
    *(int*)(dest + 20) = *(int*)(src + 20);
    *(int*)(dest + 22) = *(int*)(src + 22);
    *(int*)(dest + 24) = *(int*)(src + 24);
    *(int*)(dest + 26) = *(int*)(src + 26);
    *(int*)(dest + 28) = *(int*)(src + 28);
    *(int*)(dest + 30) = *(int*)(src + 30);

    dest += 32;
    src += 32;
    n--;
    if (n != 0) goto MCPY06;

    // Remainder after the 32-char blocks: reuse the 9..32 path, or just copy the last 8 chars.
    len %= 32;
    if (len > 8) goto MCPY00;
    *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
    *(int*)(destEnd - 6) = *(int*)(srcEnd - 6);
    *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
    *(int*)(destEnd - 2) = *(int*)(srcEnd - 2);
    return;
}

To copy a large number of bytes, the class has a very fast (at least compared to managed code) extern C method, __Memmove(byte* dest, byte* src, nuint len), which copies from managed memory to managed memory. I don't know exactly how Benchmark works, but its results are very unstable: one time all methods are almost the same (about a 20% difference), and another time there is a big difference (around 100%). In this case the fastest is Array.Copy, a method that many people describe as very slow and that is up to 10x slower according to Stopwatch.

Upvotes: 5

Views: 176

Answers (1)


Reputation: 915

You have a LOT of compare operations in your OptimizedCopy method. Have a look at Marshal.Copy overloads, they could be much faster in this case.

Upvotes: 2

Related Questions