Reputation: 33659
Arstechnia recently had an article Why are some programming languages faster than others. It compares Fortran and C and mentions summing arrays. In Fortran it's assumed that arrays don't overlap so that allows further optimization. In C/C++ pointers to the same type may overlap so this optimization can't be used in general. However, in C/C++ one can use the restrict
or __restrict
keyword to tell the compiler not to assume the pointers overlap. So I started looking into this in regards to auto-vectorization.
The following code vectorizes in GCC and MSVC
void dot_int(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
c[i] = a[i] + b[i];
}
}
I tested this with and without overlapping arrays and it gets the correct result. However, the way I would vectorize this loop manually with SSE does not handle overlapping arrays.
int i=0;
for(; i<n-3; i+=4) {
__m128i a4 = _mm_loadu_si128((__m128i*)&a[i]);
__m128i b4 = _mm_loadu_si128((__m128i*)&b[i]);
__m128i c4 = _mm_add_epi32(a4,b4);
_mm_storeu_si128((__m128i*)c, c4);
}
for(; i<n; i++) {
c[i] = a[i] + b[i];
}
Next I tried using __restrict
. I assumed that since the compiler could assume the arrays don't overlap it would not handle overlapping arrays but both GCC and MSVC still get the correct result for overlapping arrays even with __restrict
.
void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
for(int i=0; i<n; i++) {
c[i] = a[i] + b[i];
}
}
Why does the auto-vectorized code with and without __restrict
get the correct result for overlapping arrays?.
Here is the full code I used to test this:
#include <stdio.h>
#include <immintrin.h>
void dot_int(int *a, int *b, int *c, int n) {
for(int i=0; i<n; i++) {
c[i] = a[i] + b[i];
}
for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n");
}
void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
for(int i=0; i<n; i++) {
c[i] = a[i] + b[i];
}
for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n");
}
void dot_int_SSE(int *a, int *b, int *c, int n) {
int i=0;
for(; i<n-3; i+=4) {
__m128i a4 = _mm_loadu_si128((__m128i*)&a[i]);
__m128i b4 = _mm_loadu_si128((__m128i*)&b[i]);
__m128i c4 = _mm_add_epi32(a4,b4);
_mm_storeu_si128((__m128i*)c, c4);
}
for(; i<n; i++) {
c[i] = a[i] + b[i];
}
for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n");
}
int main() {
const int n = 100;
int a[] = {1,1,1,1,1,1,1,1};
int b1[] = {1,1,1,1,1,1,1,1,1};
int b2[] = {1,1,1,1,1,1,1,1,1};
int b3[] = {1,1,1,1,1,1,1,1,1};
int c[8];
int *c1 = &b1[1];
int *c2 = &b2[1];
int *c3 = &b3[1];
dot_int(a,b1,c, 8);
dot_int_SSE(a,b1,c,8);
dot_int(a,b1,c1, 8);
dot_int_restrict(a,b2,c2,8);
dot_int_SSE(a,b3,c3,8);
}
The output (from MSVC)
2 2 2 2 2 2 2 2 //no overlap default
2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code
2 3 4 5 6 7 8 9 //overlap default
2 3 4 5 6 7 8 9 //overlap with restrict
3 2 2 2 1 1 1 1 //manual SSE vector code
Edit:
Here is another inserting version which produces much simpler code
void dot_int(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
a = (int*)__builtin_assume_aligned (a, 16);
b = (int*)__builtin_assume_aligned (b, 16);
c = (int*)__builtin_assume_aligned (c, 16);
for(int i=0; i<n; i++) {
c[i] = a[i] + b[i];
}
}
Upvotes: 9
Views: 2067
Reputation: 33659
I think I understand what is going on now. It turns out that MSVC and GCC give different results with __restrict
. MSVC gets the correct answer with the overlap and GCC does not. I think it's fair to conclude then that MSVC is ignoring the __restrict
keyword and GCC is using it to optimize further.
The output from GCC
2 2 2 2 2 2 2 2 //no overlap default
2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code
2 3 4 5 6 7 8 9 //overlap without __restrict
2 2 2 2 3 2 2 2 //overlap with __restrict
3 2 2 2 1 1 1 1 //manual SSE vector code
We can produce a purely vectorized function which gives almost as many assembly lines as C doing (all the code is generated with -O3
in GCC 4.9.0) this:
void dot_int(int * __ restrict a, int * __restrict b, int * __restrict c) {
a = (int*)__builtin_assume_aligned (a, 16);
b = (int*)__builtin_assume_aligned (b, 16);
c = (int*)__builtin_assume_aligned (c, 16);
for(int i=0; i<1024; i++) {
c[i] = a[i] + b[i];
}
}
Produces
dot_int(int*, int*, int*):
xorl %eax, %eax
.L2:
movdqa (%rdi,%rax), %xmm0
paddd (%rsi,%rax), %xmm0
movaps %xmm0, (%rdx,%rax)
addq $16, %rax
cmpq $4096, %rax
jne .L2
rep ret
However, if we remove the __restrict on a
which allows a to overlap with c I have determined by looking at the assembly that
void dot_int(int * a, int * __restrict b, int * __restrict c) {
a = (int*)__builtin_assume_aligned (a, 16);
b = (int*)__builtin_assume_aligned (b, 16);
c = (int*)__builtin_assume_aligned (c, 16);
for(int i=0; i<1024; i++) {
c[i] = a[i] + b[i];
}
}
Is identical to
__attribute__((optimize("no-tree-vectorize")))
inline void dot_SSE(int * __restrict a, int * __restrict b, int * __restrict c) {
for(int i=0; i<1024; i+=4) {
__m128i a4 = _mm_load_si128((__m128i*)&a[i]);
__m128i b4 = _mm_load_si128((__m128i*)&a[i]);
__m128i c4 = _mm_add_epi32(a4,b4);
_mm_store_si128((__m128i*)&c[i],c4);
}
}
__attribute__((optimize("no-tree-vectorize")))
void dot_int(int * __restrict a, int * __restrict b, int * __restrict c) {
a = (int*)__builtin_assume_aligned (a, 16);
b = (int*)__builtin_assume_aligned (b, 16);
c = (int*)__builtin_assume_aligned (c, 16);
int pass = 1;
if((c+4)<a || (a+4)<c) pass = 0;
if(pass) {
for(int i=0; i<1024; i++) {
c[i] = a[i] + b[i];
}
}
else {
dot_SSE(a,b,c);
}
}
In other words if the a and c pointers are within 16 bytes of each other (|a-c|<4) then the code branches to a non-vectorized form. This confirms my guess that the auto-vectorized code includes both a vectorized and non-vectorized version to handle overlap.
On the overlapping arrays this gets the correct result: 2 3 4 5 6 7 8 9
Upvotes: 2
Reputation: 22724
I don't get what the problem is. Testing on Linux/64 bit, GCC 4.6, -O3, -mtune=native, -msse4.1 (i.e. a very old compiler/system), this code
void dot_int(int *a, int *b, int *c, int n) {
for(int i=0; i<n; ++i) {
c[i] = a[i] + b[i];
}
}
compiles to this inner loop:
.L4:
movdqu (%rdi,%rax), %xmm1
addl $1, %r8d
movdqu (%rsi,%rax), %xmm0
paddd %xmm1, %xmm0
movdqu %xmm0, (%rdx,%rax)
addq $16, %rax
cmpl %r8d, %r10d
ja .L4
cmpl %r9d, %ecx
je .L1
While this code
void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
for(int i=0; i<n; ++i) {
c[i] = a[i] + b[i];
}
}
compiles to this:
.L15:
movdqu (%rbx,%rax), %xmm0
addl $1, %r8d
paddd 0(%rbp,%rax), %xmm0
movdqu %xmm0, (%r11,%rax)
addq $16, %rax
cmpl %r10d, %r8d
jb .L15
addl %r12d, %r9d
cmpl %r12d, %r13d
je .L10
As you can clearly see there's one less load. I guess it correclty estimated that there's no need to explicitely load memory before performing the sum, as the result won't overwrite anythng.
There's also room for way more optimizations -- GCC doesn't know that the parameters are f.i. 128 bit aligned, hence it must generate a huge preamble to check that there are no alignment issues (YMMV), and a postable to deal with extra unaligned parts (or less wide than 128 bits). This actually happens with both versions above. This is the complete code generated for dot_int
:
dot_int:
.LFB626:
.cfi_startproc
testl %ecx, %ecx
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
jle .L1
leaq 16(%rdx), %r11
movl %ecx, %r10d
shrl $2, %r10d
leal 0(,%r10,4), %r9d
testl %r9d, %r9d
je .L6
leaq 16(%rdi), %rax
cmpl $6, %ecx
seta %r8b
cmpq %rax, %rdx
seta %al
cmpq %r11, %rdi
seta %bl
orl %ebx, %eax
andl %eax, %r8d
leaq 16(%rsi), %rax
cmpq %rax, %rdx
seta %al
cmpq %r11, %rsi
seta %r11b
orl %r11d, %eax
testb %al, %r8b
je .L6
xorl %eax, %eax
xorl %r8d, %r8d
.p2align 4,,10
.p2align 3
.L4:
movdqu (%rdi,%rax), %xmm1
addl $1, %r8d
movdqu (%rsi,%rax), %xmm0
paddd %xmm1, %xmm0
movdqu %xmm0, (%rdx,%rax)
addq $16, %rax
cmpl %r8d, %r10d
ja .L4
cmpl %r9d, %ecx
je .L1
.L3:
movslq %r9d, %r8
xorl %eax, %eax
salq $2, %r8
addq %r8, %rdx
addq %r8, %rdi
addq %r8, %rsi
.p2align 4,,10
.p2align 3
.L5:
movl (%rdi,%rax,4), %r8d
addl (%rsi,%rax,4), %r8d
movl %r8d, (%rdx,%rax,4)
addq $1, %rax
leal (%r9,%rax), %r8d
cmpl %r8d, %ecx
jg .L5
.L1:
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 8
ret
.L6:
.cfi_restore_state
xorl %r9d, %r9d
jmp .L3
.cfi_endproc
Now in your case the ints effectively not aligned (as they're on the stack), but if you can make them aligned and tell GCC so, then you can improve code generation:
typedef int intvec __attribute__((vector_size(16)));
void dot_int_restrict_alig(intvec * restrict a,
intvec * restrict b,
intvec * restrict c,
unsigned int n) {
for(unsigned int i=0; i<n; ++i) {
c[i] = a[i] + b[i];
}
}
This generates this code, with no preamble:
dot_int_restrict_alig:
.LFB628:
.cfi_startproc
testl %ecx, %ecx
je .L23
subl $1, %ecx
xorl %eax, %eax
addq $1, %rcx
salq $4, %rcx
.p2align 4,,10
.p2align 3
.L25:
movdqa (%rdi,%rax), %xmm0
paddd (%rsi,%rax), %xmm0
movdqa %xmm0, (%rdx,%rax)
addq $16, %rax
cmpq %rcx, %rax
jne .L25
.L23:
rep
ret
.cfi_endproc
Note the usage of the aligned 128 bit load instructions (movdqa
, a as aligned, vs movdqu
, unaligned).
Upvotes: 5
Reputation: 52530
If you use "restrict" with overlapping arrays, you will get undefined behaviour. That's what you get in the "overlap restrict" case. Undefined behaviour means anything can happen. And it did. Just by coincidence, the behaviour was the same as without "restrict". Absolutely correct. It falls straight under the definition of "anything can happen". Nothing to complain about.
Upvotes: 2