Reputation: 517
I was looking into L1 cache optimization using the loop tiling method.
This is the optimization technique I'm using:
// Original loop
void original_loop() {
    for (int i = 0; i < N; i++) {
        sum += array[i];
    }
}

// Loop tiling
void tiled_loop() {
    for (int i = 0; i < N; i += 16) {
        for (int j = 0; j < 16; j++) {
            sum += array[i + j];
        }
    }
}
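For completeness: the tiled version above assumes N is a multiple of the tile width. A more general sketch with a remainder loop (just an illustration, not what I benchmarked; tiled_loop_general is a made-up name and blockSize is the constant from my benchmark code below) would look like this:

// General tiled form: process full blocks first, then a remainder
// loop for any leftover elements when N is not a multiple of blockSize.
void tiled_loop_general() {
    int i = 0;
    for (; i + blockSize <= N; i += blockSize) {
        for (int j = 0; j < blockSize; j++) {
            sum += array[i + j];
        }
    }
    for (; i < N; i++) {
        sum += array[i];  // leftover tail
    }
}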
Interestingly enough, I found that with clang the tiled loop was actually slower than the original loop (or about the same speed).
This is my benchmarking code:
#include <chrono>
#include <iostream>

const int N = 10000;
const int blockSize = 16;
int array[N];
int sum;

// Original loop
void original_loop() {
    for (int i = 0; i < N; i++) {
        sum += array[i];
    }
}

// Loop tiling
void tiled_loop() {
    for (int i = 0; i < N; i += blockSize) {
        for (int j = 0; j < blockSize; j++) {
            sum += array[i + j];
        }
    }
}

int main() {
    // Initialize array
    for (int i = 0; i < N; i++) {
        array[i] = i;
    }

    // Benchmark original loop
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 100000; i++) {
        sum = 0;
        original_loop();
    }
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Original loop: " << elapsed.count() << " seconds" << std::endl;

    // Benchmark tiled loop
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < 100000; i++) {
        sum = 0;
        tiled_loop();
    }
    end = std::chrono::high_resolution_clock::now();
    elapsed = end - start;
    std::cout << "Tiled loop: " << elapsed.count() << " seconds" << std::endl;

    return 0;
}
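One side note on the harness (not part of my original code): since sum is only written inside the timing loops and never read until afterwards, an aggressive optimizer could in principle fold the 100000 repeated calls together. A minimal sketch of a barrier that keeps each iteration's result live, assuming GCC/Clang-style inline asm (keep_alive is a hypothetical helper, not something I used for the numbers below), would be:

// Empty asm statement that tells the compiler `value` may be read and
// modified here, so the repeated calls around it cannot be collapsed.
static inline void keep_alive(int &value) {
    asm volatile("" : "+r"(value) : : "memory");
}

// Usage inside the timing loop:
//   for (int i = 0; i < 100000; i++) {
//       sum = 0;
//       original_loop();
//       keep_alive(sum);
//   }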
Some results using gcc 12.2 with -O3 and clang 15.0.0 with -O3:
For N = 1000000 and blockSize = 16
**GCC**
Original loop: 11.1892 seconds
Tiled loop: 9.67448 seconds
**Clang**
Original loop: 8.52184 seconds
Tiled loop: 8.67858 seconds
Smaller numbers:
For N = 10000 and blockSize = 16
**GCC**
Original loop: 0.094786 seconds
Tiled loop: 0.0436597 seconds
**Clang**
Original loop: 0.0416874 seconds
Tiled loop: 0.0610718 seconds
I've tried this many times. Clang is either the same as or worse than the original loop. Occasionally it does better than the original one, but even then the difference is not as significant as with GCC. Any ideas?
Here is the assembly (I couldn't find the link to the assembly to post here; maybe someone can replace this code with the Godbolt link):
GCC:
original_loop():
mov ecx, DWORD PTR sum[rip]
mov eax, OFFSET FLAT:array
mov edx, OFFSET FLAT:array+40000
pxor xmm0, xmm0
.L2:
paddd xmm0, XMMWORD PTR [rax]
add rax, 16
cmp rdx, rax
jne .L2
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
add eax, ecx
mov DWORD PTR sum[rip], eax
ret
tiled_loop():
pxor xmm1, xmm1
mov eax, OFFSET FLAT:array
mov edx, OFFSET FLAT:array+40000
movd xmm3, DWORD PTR sum[rip]
movdqa xmm2, xmm1
movdqa xmm0, xmm1
.L6:
paddd xmm3, XMMWORD PTR [rax]
paddd xmm0, XMMWORD PTR [rax+16]
add rax, 64
paddd xmm2, XMMWORD PTR [rax-32]
paddd xmm1, XMMWORD PTR [rax-16]
cmp rdx, rax
jne .L6
paddd xmm0, xmm3
paddd xmm0, xmm2
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd DWORD PTR sum[rip], xmm0
ret
sum:
.zero 4
array:
.zero 40000
Clang:
original_loop(): # @original_loop()
pxor xmm0, xmm0
mov eax, 12
movd xmm1, dword ptr [rip + sum] # xmm1 = mem[0],zero,zero,zero
lea rcx, [rip + array]
.LBB0_1: # =>This Inner Loop Header: Depth=1
paddd xmm1, xmmword ptr [rcx + 4*rax - 48]
paddd xmm0, xmmword ptr [rcx + 4*rax - 32]
paddd xmm1, xmmword ptr [rcx + 4*rax - 16]
paddd xmm0, xmmword ptr [rcx + 4*rax]
add rax, 16
cmp rax, 10012
jne .LBB0_1
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd dword ptr [rip + sum], xmm0
ret
tiled_loop(): # @tiled_loop()
mov edx, dword ptr [rip + sum]
xor eax, eax
lea rcx, [rip + array]
.LBB1_1: # =>This Inner Loop Header: Depth=1
movdqa xmm0, xmmword ptr [rcx + 4*rax]
movdqa xmm1, xmmword ptr [rcx + 4*rax + 16]
paddd xmm1, xmmword ptr [rcx + 4*rax + 48]
paddd xmm0, xmmword ptr [rcx + 4*rax + 32]
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd esi, xmm0
add esi, edx
cmp rax, 9983
ja .LBB1_3
movdqa xmm0, xmmword ptr [rcx + 4*rax + 64]
movdqa xmm1, xmmword ptr [rcx + 4*rax + 80]
paddd xmm1, xmmword ptr [rcx + 4*rax + 112]
paddd xmm0, xmmword ptr [rcx + 4*rax + 96]
paddd xmm0, xmm1
pshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
paddd xmm1, xmm0
pshufd xmm0, xmm1, 85 # xmm0 = xmm1[1,1,1,1]
paddd xmm0, xmm1
movd edx, xmm0
add edx, esi
add rax, 32
jmp .LBB1_1
.LBB1_3:
mov dword ptr [rip + sum], esi
ret
array:
.zero 40000
sum:
.long 0
Edit: with the -march=native flag:
Original loop: 0.0292406 seconds
Tiled loop: 0.173324 seconds
Clang's tiled loop is now roughly 6x slower than the original loop.
Upvotes: 3
Views: 376
Reputation: 249532
GCC and Clang often optimize things differently. You're not using -march, which you probably need if you want optimal results. With -O3 -march=skylake, Clang unrolls the loop automatically, and GCC will generate similar code if you add #pragma GCC unroll 16 in original_loop(): https://godbolt.org/z/WWr1nToaG
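A rough sketch of that change (only the pragma is added to your function; the compile flags shown are just an example):

// Compile with e.g.:  g++ -O3 -march=skylake   (or -march=native)
void original_loop() {
    #pragma GCC unroll 16
    for (int i = 0; i < N; i++) {
        sum += array[i];
    }
}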
With tiling, GCC's code is OK but Clang's is extremely bloated: https://godbolt.org/z/GYqYTc7cc - I'm not surprised the Clang code runs slower.
If you're going to use tricks like this to try to make a compiler generate better code, you have to accept that the result is compiler specific. Tweaks that help on one setup may hurt on another, and it won't be easy to predict. Usually, if you write simple code like your original_loop(), a given compiler will at least tend to improve (or stay the same) when newer versions come out.
Upvotes: 3