Reputation: 6287
I tried the following vector subtraction code in a console project generated by VS2012 Update 1. I left the default options untouched apart from disabling global optimizations and enabling assembler listings.
Compiled with x64 release configuration on Windows 7 x64 SP1.
#include <stdio.h>
#include <tchar.h>
#include <emmintrin.h>
typedef unsigned short ushort;
typedef unsigned int uint;
// Prints the eight 16-bit lanes of `i` as unsigned values on one line,
// formatted as "[v0 v1 v2 v3 v4 v5 v6 v7]" followed by a newline.
void print(__m128i i)
{
// m128i_u16 is the 16-bit-lane view of the vector (compiler-provided member;
// presumably MSVC-specific -- confirm before porting to another toolchain).
auto& arr = i.m128i_u16;
// Lanes are passed through the variadic call and printed with %d, lowest lane first.
printf("[%d %d %d %d %d %d %d %d]\n", arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]);
}
// Entry point of the repro: walks two ushort arrays in steps of 8 elements,
// computes the per-lane saturating unsigned difference input - vals, and
// prints each 8-lane result followed by a "===" separator line.
// argc/argv are unused. Always returns 0.
int _tmain(int argc, _TCHAR* argv[])
{
const int lineSize = 912;
// NOTE: neither array is written anywhere before the loop below reads it,
// so the values loaded from them are indeterminate.
ushort input[lineSize];
ushort vals[lineSize];
// The disabled call below would pass the addresses of both arrays to printf.
// printf("%X %X\n", input, vals); // note this one
// 912 is a multiple of 8, so every 8-element load stays inside the arrays.
for (uint i=0; i<lineSize; i+=8)
{
// Unaligned 128-bit loads of 8 consecutive ushorts starting at index i.
__m128i vecinput = _mm_loadu_si128((__m128i*) &input[i]);
__m128i vecvals = _mm_loadu_si128((__m128i*) &vals[i]);
// Unsigned saturating subtract per 16-bit lane (results clamp at 0 instead of wrapping).
__m128i output = _mm_subs_epu16(vecinput, vecvals);
print(output);
printf("===\n");
}
return 0;
}
Generated assembly in release mode:
; 20 : const int lineSize = 912;
; 21 : ushort input[lineSize];
; 22 : ushort vals[lineSize];
; without printf
; 23 : // printf("%X %X\n", input, vals);
; with printf
; 23 : printf("%X %X\n", input, vals);
lea r8, QWORD PTR vals$[rsp]
lea rdx, QWORD PTR input$[rsp]
lea rcx, OFFSET FLAT:??_C@_06NBKGFLKK@?$CFX?5?$CFX?6?$AA@
call QWORD PTR __imp_printf
; 24 :
; 25 : for (uint i=0; i<lineSize; i+=8)
xor esi, esi
lea ebp, QWORD PTR [rsi+114]
npad 2
$LL3@wmain:
; 26 : {
; 27 : __m128i vecinput = _mm_loadu_si128((__m128i*) &input[i]);
movdqu xmm1, XMMWORD PTR input$[rsp+rsi]
; 28 : __m128i vecvals = _mm_loadu_si128((__m128i*) &vals[i]);
; without printf
movdqu xmm0, xmm1
; with printf
movdqu xmm0, XMMWORD PTR vals$[rsp+rsi]
; 29 :
; 30 : __m128i output = _mm_subs_epu16(vecinput, vecvals);
; without printf
psubusw xmm1, xmm1
; with printf
psubusw xmm1, xmm0
; 15 : printf("[%d %d %d %d %d %d %d %d]\n", arr[0], arr[1], arr[2], arr[3], arr[4], arr[5], arr[6], arr[7]);
pextrw ax, xmm1, 7
movzx edi, ax
pextrw ax, xmm1, 6
movzx ebx, ax
pextrw ax, xmm1, 5
mov DWORD PTR [rsp+64], edi
movzx r11d, ax
pextrw ax, xmm1, 4
mov DWORD PTR [rsp+56], ebx
movzx r10d, ax
pextrw ax, xmm1, 3
mov DWORD PTR [rsp+48], r11d
movzx ecx, ax
pextrw ax, xmm1, 2
mov DWORD PTR [rsp+40], r10d
movzx r9d, ax
pextrw ax, xmm1, 1
mov DWORD PTR [rsp+32], ecx
movzx r8d, ax
lea rcx, OFFSET FLAT:??_C@_0BL@ONEMJFJK@?$FL?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?7?$CFd?$FN?6?$AA@
movd eax, xmm1
movzx edx, ax
call QWORD PTR __imp_printf
; 31 : print(output);
; 32 : printf("===\n");
lea rcx, OFFSET FLAT:??_C@_04LEHBMKOA@?$DN?$DN?$DN?6?$AA@
call QWORD PTR __imp_printf
lea rsi, QWORD PTR [rsi+16]
dec rbp
jne $LL3@wmain
; 33 : }
; 34 :
; 35 : return 0;
xor eax, eax
; 95 : }
mov rcx, QWORD PTR __$ArrayPad$[rsp]
xor rcx, rsp
call __security_check_cookie
lea r11, QWORD PTR [rsp+1920]
mov rbx, QWORD PTR [r11+16]
mov rbp, QWORD PTR [r11+24]
mov rsi, QWORD PTR [r11+32]
mov rsp, r11
pop rdi
ret 0
wmain ENDP
So vals is incorrectly treated as if it were the same as input, and the result will always be 0. It's also interesting that xmm0 is never used again because of that false optimization, yet the instruction that sets it is still not removed. If you uncomment that printf, the generated code is correct.
So the question is, is there anything wrong with my code? To me it totally looks like a bug in the optimizer.
Upvotes: 1
Views: 256
Reputation: 340316
You never initialize the arrays `ushort input[lineSize]` and `ushort vals[lineSize]`, so reading them is undefined behavior, and the optimizer is allowed to treat their contents as identical, which is what it happens to do here.
When you have the `printf("%X %X\n", input, vals)` call in there, you're passing the addresses of the arrays to an external function, so the optimizer has reason to believe that the memory they point to may be updated by that external function and can no longer assume anything about its contents.
Upvotes: 2