merovingian
merovingian

Reputation: 517

Tiling optimization gcc vs clang

I was exploring L1-cache optimization using the loop-tiling method.

This is the optimization technique I'm using:

// Original loop
// Original loop: one straight linear pass over the array,
// accumulating every element into the global sum.
void original_loop() {
  for (int idx = 0; idx != N; ++idx) {
    sum += array[idx];
  }
}

// Loop tiling
// Loop tiling: process the array in fixed 16-element tiles
// (assumes N is a multiple of 16, as in the benchmark below).
void tiled_loop() {
  for (int base = 0; base < N; base += 16) {
    for (int off = 0; off < 16; ++off) {
      sum += array[base + off];
    }
  }
}

Then interestingly enough, I found that the tiled loop in clang was actually slower than the original loop (or about the same speed).

This is my benchmarking code:

#include <chrono>
#include <iostream>

const int N = 10000;       // number of ints summed by both loops
const int blockSize = 16;  // tile width; N must be a multiple of this
int array[N];              // input data, initialized in main()
int sum;                   // accumulator shared by both benchmark loops

// Original loop: a single linear sweep over the whole array.
void original_loop() {
  for (int idx = 0; idx != N; ++idx) {
    sum += array[idx];
  }
}

// Loop tiling: same traversal, expressed as an outer walk over
// blockSize-wide tiles with an inner pass inside each tile.
void tiled_loop() {
  for (int base = 0; base != N; base += blockSize) {
    for (int off = 0; off != blockSize; ++off) {
      sum += array[base + off];
    }
  }
}

int main() {
  // Fill the array with 0..N-1 so both benchmarks sum identical data.
  for (int i = 0; i < N; i++) {
    array[i] = i;
  }

  // Benchmark original loop: 100000 repetitions, wall-clock timed.
  const auto origStart = std::chrono::high_resolution_clock::now();
  for (int rep = 0; rep < 100000; rep++) {
    sum = 0;
    original_loop();
  }
  const auto origEnd = std::chrono::high_resolution_clock::now();
  const std::chrono::duration<double> origElapsed = origEnd - origStart;
  std::cout << "Original loop: " << origElapsed.count() << " seconds" << std::endl;

  // Benchmark tiled loop under identical conditions.
  const auto tiledStart = std::chrono::high_resolution_clock::now();
  for (int rep = 0; rep < 100000; rep++) {
    sum = 0;
    tiled_loop();
  }
  const auto tiledEnd = std::chrono::high_resolution_clock::now();
  const std::chrono::duration<double> tiledElapsed = tiledEnd - tiledStart;
  std::cout << "Tiled loop: " << tiledElapsed.count() << " seconds" << std::endl;

  return 0;
}

Some results using gcc 12.2 and -O3 and clang 15.0.0 and -O3:

For N = 1000000 and blockSize = 16
**GCC**
Original loop: 11.1892 seconds
Tiled loop: 9.67448 seconds
**Clang**  
Original loop: 8.52184 seconds
Tiled loop: 8.67858 seconds

Smaller numbers:

For N = 10000 and blockSize = 16
**GCC**
Original loop: 0.094786 seconds
Tiled loop: 0.0436597 seconds
**Clang**
Original loop: 0.0416874 seconds
Tiled loop: 0.0610718 seconds

I've tried this a lot of times. Clang is either doing the same or worse than the original loop. Occasionally it does better than the original one but even then the difference is not as significant as gcc. Any idea?

Here is the assembly (I couldn't figure out how to link the assembly output here directly — maybe someone can replace these listings with a Godbolt link):

GCC:

# GCC -O3 output (Intel syntax) for both loops.
original_loop():
        mov     ecx, DWORD PTR sum[rip]      # ecx = current value of sum
        mov     eax, OFFSET FLAT:array       # rax = cursor into array
        mov     edx, OFFSET FLAT:array+40000 # rdx = one-past-the-end
        pxor    xmm0, xmm0                   # zero a 4-lane vector accumulator
.L2:
        paddd   xmm0, XMMWORD PTR [rax]      # SSE2: add 4 ints per iteration
        add     rax, 16
        cmp     rdx, rax
        jne     .L2
        movdqa  xmm1, xmm0                   # reduce the 4 lanes to one scalar,
        psrldq  xmm1, 8                      # done once, after the loop
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4
        paddd   xmm0, xmm1
        movd    eax, xmm0
        add     eax, ecx                     # fold in the previously loaded sum
        mov     DWORD PTR sum[rip], eax
        ret
tiled_loop():
        pxor    xmm1, xmm1                   # four independent vector accumulators
        mov     eax, OFFSET FLAT:array
        mov     edx, OFFSET FLAT:array+40000
        movd    xmm3, DWORD PTR sum[rip]     # one accumulator seeded with sum
        movdqa  xmm2, xmm1
        movdqa  xmm0, xmm1
.L6:
        paddd   xmm3, XMMWORD PTR [rax]      # 4x unrolled: 16 ints (one full
        paddd   xmm0, XMMWORD PTR [rax+16]   # tile) consumed per iteration
        add     rax, 64
        paddd   xmm2, XMMWORD PTR [rax-32]
        paddd   xmm1, XMMWORD PTR [rax-16]
        cmp     rdx, rax
        jne     .L6
        paddd   xmm0, xmm3                   # combine the 4 accumulators...
        paddd   xmm0, xmm2
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0                   # ...then a single lane reduction,
        psrldq  xmm1, 8                      # after the loop as above
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4
        paddd   xmm0, xmm1
        movd    DWORD PTR sum[rip], xmm0
        ret
sum:
        .zero   4
array:
        .zero   40000

Clang:

# Clang -O3 output (Intel syntax).  Key difference from GCC visible below:
# in tiled_loop the full 4-lane horizontal reduction (pshufd/paddd/movd)
# runs INSIDE the loop for every tile, not once after the loop.
original_loop():                     # @original_loop()
        pxor    xmm0, xmm0
        mov     eax, 12                         # start index 12; -48 byte offset below makes the first load array[0]
        movd    xmm1, dword ptr [rip + sum]     # xmm1 = mem[0],zero,zero,zero
        lea     rcx, [rip + array]
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        paddd   xmm1, xmmword ptr [rcx + 4*rax - 48]    # two vector accumulators,
        paddd   xmm0, xmmword ptr [rcx + 4*rax - 32]    # 16 ints per iteration
        paddd   xmm1, xmmword ptr [rcx + 4*rax - 16]
        paddd   xmm0, xmmword ptr [rcx + 4*rax]
        add     rax, 16
        cmp     rax, 10012
        jne     .LBB0_1
        paddd   xmm0, xmm1                      # combine, then reduce lanes once
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1
        movd    dword ptr [rip + sum], xmm0
        ret
tiled_loop():                        # @tiled_loop()
        mov     edx, dword ptr [rip + sum]
        xor     eax, eax
        lea     rcx, [rip + array]
.LBB1_1:                                # =>This Inner Loop Header: Depth=1
        movdqa  xmm0, xmmword ptr [rcx + 4*rax]         # first 16-element tile
        movdqa  xmm1, xmmword ptr [rcx + 4*rax + 16]
        paddd   xmm1, xmmword ptr [rcx + 4*rax + 48]
        paddd   xmm0, xmmword ptr [rcx + 4*rax + 32]
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0                      # NOTE: horizontal reduction done
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1                      # inside the loop, per tile
        movd    esi, xmm0                       # scalar partial sum for this tile
        add     esi, edx
        cmp     rax, 9983
        ja      .LBB1_3
        movdqa  xmm0, xmmword ptr [rcx + 4*rax + 64]    # second tile, duplicated body
        movdqa  xmm1, xmmword ptr [rcx + 4*rax + 80]
        paddd   xmm1, xmmword ptr [rcx + 4*rax + 112]
        paddd   xmm0, xmmword ptr [rcx + 4*rax + 96]
        paddd   xmm0, xmm1
        pshufd  xmm1, xmm0, 238                 # xmm1 = xmm0[2,3,2,3]
        paddd   xmm1, xmm0                      # another per-tile reduction
        pshufd  xmm0, xmm1, 85                  # xmm0 = xmm1[1,1,1,1]
        paddd   xmm0, xmm1
        movd    edx, xmm0
        add     edx, esi
        add     rax, 32
        jmp     .LBB1_1
.LBB1_3:
        mov     dword ptr [rip + sum], esi
        ret
array:
        .zero   40000

sum:
        .long   0 

Edit

With the -march=native flag (Clang):

Original loop: 0.0292406 seconds
Tiled loop: 0.173324 seconds

Here Clang's tiled loop performs several times worse than its original loop.

Upvotes: 3

Views: 376

Answers (1)

John Zwinck
John Zwinck

Reputation: 249532

GCC and Clang often optimize things differently. You're not using -march which you probably need if you want optimal results. With -O3 -march=skylake, Clang unrolls the loop automatically, and GCC will generate similar code if you add #pragma GCC unroll 16 in original_loop(): https://godbolt.org/z/WWr1nToaG

With tiling, GCC's code is OK but Clang's is extremely bloated: https://godbolt.org/z/GYqYTc7cc - I'm not surprised the Clang code runs slower.

If you're going to use tricks like this to try to make a compiler generate better code, you have to accept that it is compiler specific. Tweaks that help on one setup may hurt on another, and it won't be easy to predict. Usually if you write simple code like your original_loop(), at least a given compiler will tend to improve (or stay the same) when newer versions come out.

Upvotes: 3

Related Questions