Reputation: 1103
These two code snippets do the same thing: they add two float arrays together element-wise and store the result back into both of them.
Inline Assembler:
// Adds the two 4-float vectors with SSE and writes the sum to BOTH arrays
// (same net effect as vecAdd_Std below).
// MSVC 32-bit inline assembly (_asm); not portable to x64 or other compilers.
// NOTE(review): per the MSVC inline-assembler docs, esi/edi need not be
// manually preserved inside an _asm block — confirm for the toolchain in use.
void vecAdd_SSE(float* v1, float* v2) {
_asm {
mov esi, v1          ; esi = v1
mov edi, v2          ; edi = v2
movups xmm0, [esi]   ; xmm0 = v1[0..3] (unaligned packed load)
movups xmm1, [edi]   ; xmm1 = v2[0..3]
addps xmm0, xmm1     ; xmm0 = v1 + v2 (four packed single-precision adds)
movups [esi], xmm0   ; v1[0..3] = sum
movups [edi], xmm0   ; v2[0..3] = sum
}
}
Plain C++ Code:
// Plain scalar version: element-wise sum of the two 4-float vectors, with the
// result written back into both arrays (v1 and v2 end up identical).
// The two separate passes preserve the original statement order exactly:
// first all four adds into v1, then all four copies into v2.
void vecAdd_Std(float* v1, float* v2) {
    for (int i = 0; i < 4; ++i) {
        v1[i] += v2[i];
    }
    for (int i = 0; i < 4; ++i) {
        v2[i] = v1[i];
    }
}
Disassembly for the C++ code (taken in Debug mode, because I cannot view the disassembly in Release mode for some reason):
; Debug-mode MSVC x86 disassembly of vecAdd_Std. With optimization off, the
; compiler recomputes every index expression (i*4), reloads v1/v2 from the
; stack for each statement, and does four scalar adds (addss) instead of one
; packed add — so this listing is NOT representative of Release-mode code.
void vecAdd_Std(float* v1, float* v2) {
push ebp                                ; standard frame setup
mov ebp,esp
sub esp,0C0h                            ; reserve 0xC0 = 192 bytes of locals
push ebx                                ; save callee-saved registers
push esi
push edi
lea edi,[ebp-0C0h]
mov ecx,30h                             ; 0x30 dwords = 192 bytes
mov eax,0CCCCCCCCh                      ; MSVC debug fill pattern
rep stos dword ptr es:[edi]             ; fill the new frame with 0xCC
v1[0] = v1[0]+ v2[0];
mov eax,4
imul ecx,eax,0                          ; ecx = 4*0 (unoptimized index math)
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v1]                  ; reload v1 from the stack
mov esi,dword ptr [v2]                  ; reload v2 from the stack
movss xmm0,dword ptr [edx+ecx]          ; scalar load of v1[0]
addss xmm0,dword ptr [esi+eax]          ; scalar add of v2[0]
mov eax,4
imul ecx,eax,0                          ; index recomputed yet again
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0          ; store the sum back to v1[0]
v1[1] = v1[1]+ v2[1];
mov eax,4
shl eax,0                               ; index 1*4 computed as 4<<0
v1[1] = v1[1]+ v2[1];
mov ecx,4
shl ecx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,0
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[2] = v1[2]+ v2[2];
mov eax,4
shl eax,1                               ; index 2*4 computed as 4<<1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,1
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[3] = v1[3]+ v2[3];
mov eax,4
imul ecx,eax,3                          ; index 3*4 via imul
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,3
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v2[0] = v1[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]             ; the copies use plain 32-bit integer moves
mov dword ptr [edx+eax],ecx
v2[1] = v1[1];
mov eax,4
shl eax,0
mov ecx,4
shl ecx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[2] = v1[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[3] = v1[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
}
Now I made a time measurement on those two functions and noticed that the inline assembler code takes approximately 10 times longer (in Release mode). Does anybody know why?
Upvotes: 12
Views: 1580
Reputation: 19782
You aren't really calling a function that executes one SSE instruction, are you? There's non-trivial overhead involved in setting up the xmm registers, and you're copying the values from memory to the registers and back, which will take far longer than the actual calculation.
I wouldn't be at all surprised to find that the compiler inlines the C++ version of the function, but doesn't (can't, really) do the same for functions that contain inline assembly.
Upvotes: 5
Reputation: 92271
On my machine (VS2015 64-bit mode), the compiler inlines vecAdd_Std
and produces
; Release-mode excerpt: the optimizer constant-folded the whole call. The
; named literal decodes (high to low) to 8.0f, 6.0f, 4.0f, 2.0f — i.e. the
; precomputed result {2,4,6,8} of adding the two test vectors — so no
; addition happens at run time at all.
00007FF625921C8F vmovups xmm1,xmmword ptr [__xmm@4100000040c000004080000040000000 (07FF625929D60h)]
00007FF625921C97 vmovups xmm4,xmm1
00007FF625921C9B vcvtss2sd xmm1,xmm1,xmm4    ; widen x[0] to double for the cout << call
Test code
// Test driver: adds two identical 4-float vectors and prints one element so
// the result is observable (otherwise the call could be removed entirely).
int main() {
    float lhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float rhs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    vecAdd_Std(lhs, rhs);
    std::cout << lhs[0];
}
Upvotes: 19