Reputation: 7014
I have an application where part of the inner loop was basically:
double sum = 0;
for (int i = 0; i != N; ++i, ++data, ++x) sum += *data * x;
If x is an unsigned int, then the code takes 3 times as long as with int!
This was part of a larger code-base, but I got it down to the essentials:
#include <iostream>
#include <cstdlib>
#include <vector>
#include <time.h>
typedef unsigned char uint8;
template<typename T>
double moments(const uint8* data, int N, T wrap) {
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
sum += *data * pos;
++pos;
if (pos == wrap) pos = 0;
}
return sum;
}
template<typename T>
const char* name() { return "unknown"; }
template<>
const char* name<int>() { return "int"; }
template<>
const char* name<unsigned int>() { return "unsigned int"; }
const int Nr_Samples = 10 * 1000;
template<typename T>
void measure(const std::vector<uint8>& data) {
const uint8* dataptr = &data[0];
double moments_results[Nr_Samples];
time_t start, end;
time(&start);
for (int i = 0; i != Nr_Samples; ++i) {
moments_results[i] = moments<T>(dataptr, data.size(), 128);
}
time(&end);
double avg = 0.0;
for (int i = 0; i != Nr_Samples; ++i) avg += moments_results[i];
avg /= Nr_Samples;
std::cout << "With " << name<T>() << ": " << avg << " in " << (end - start) << "secs" << std::endl;
}
int main() {
std::vector<uint8> data(128*1024);
for (int i = 0; i != data.size(); ++i) data[i] = std::rand();
measure<int>(data);
measure<unsigned int>(data);
measure<int>(data);
return 0;
}
Compiling with no optimization:
luispedro@oakeshott:/home/luispedro/tmp/so §g++ test.cpp
luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
With int: 1.06353e+09 in 9secs
With unsigned int: 1.06353e+09 in 14secs
With int: 1.06353e+09 in 9secs
With optimization:
luispedro@oakeshott:/home/luispedro/tmp/so §g++ -O3 test.cpp
luispedro@oakeshott:/home/luispedro/tmp/so §./a.out
With int: 1.06353e+09 in 3secs
With unsigned int: 1.06353e+09 in 12secs
With int: 1.06353e+09 in 4secs
I don't understand why such a large difference in speed. I tried figuring it out from the generated assembly, but I got nowhere. Anyone have any thoughts?
Is this something to do with the hardware or is it a limitation of gcc's optimisation machinery? I'm betting the second.
My machine is an Intel 32 bit running Ubuntu 9.10.
Edit: Since Stephen asked, here is the de-compiled source (from a -O3 compilation). I believe I got the main loops:
int version:
40: 0f b6 14 0b movzbl (%ebx,%ecx,1),%edx
sum += *data * pos;
44: 0f b6 d2 movzbl %dl,%edx
47: 0f af d0 imul %eax,%edx
++pos;
4a: 83 c0 01 add $0x1,%eax
sum += *data * pos;
4d: 89 95 54 c7 fe ff mov %edx,-0x138ac(%ebp)
++pos;
if (pos == wrap) pos = 0;
53: 31 d2 xor %edx,%edx
55: 3d 80 00 00 00 cmp $0x80,%eax
5a: 0f 94 c2 sete %dl
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
5d: 83 c1 01 add $0x1,%ecx
sum += *data * pos;
60: db 85 54 c7 fe ff fildl -0x138ac(%ebp)
++pos;
if (pos == wrap) pos = 0;
66: 83 ea 01 sub $0x1,%edx
69: 21 d0 and %edx,%eax
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6b: 39 f1 cmp %esi,%ecx
sum += *data * pos;
6d: de c1 faddp %st,%st(1)
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6f: 75 cf jne 40
unsigned version:
50: 0f b6 34 13 movzbl (%ebx,%edx,1),%esi
sum += *data * pos;
54: 81 e6 ff 00 00 00 and $0xff,%esi
5a: 31 ff xor %edi,%edi
5c: 0f af f0 imul %eax,%esi
++pos;
5f: 83 c0 01 add $0x1,%eax
if (pos == wrap) pos = 0;
62: 3d 80 00 00 00 cmp $0x80,%eax
67: 0f 94 c1 sete %cl
T pos = 0;
double sum = 0.;
for (int i = 0; i != N; ++i, ++data) {
6a: 83 c2 01 add $0x1,%edx
sum += *data * pos;
6d: 89 bd 54 c7 fe ff mov %edi,-0x138ac(%ebp)
73: 89 b5 50 c7 fe ff mov %esi,-0x138b0(%ebp)
++pos;
if (pos == wrap) pos = 0;
79: 89 ce mov %ecx,%esi
7b: 81 e6 ff 00 00 00 and $0xff,%esi
sum += *data * pos;
81: df ad 50 c7 fe ff fildll -0x138b0(%ebp)
++pos;
if (pos == wrap) pos = 0;
87: 83 ee 01 sub $0x1,%esi
8a: 21 f0 and %esi,%eax
for (int i = 0; i != N; ++i, ++data) {
8c: 3b 95 34 c7 fe ff cmp -0x138cc(%ebp),%edx
sum += *data * pos;
92: de c1 faddp %st,%st(1)
for (int i = 0; i != N; ++i, ++data) {
94: 75 ba jne 50
This is the -O3 version, which is why the source lines jump up and down. Thank you.
Upvotes: 21
Views: 5672
Reputation: 11
output with visual studio 2010 with intel Q6600... (note: I increased the loop count from 128*1024 to 512*1024)
release mode...
With int: 4.23944e+009 in 9secs
With unsigned int: 4.23944e+009 in 18secs
With int: 4.23944e+009 in 9secs
debug mode...
With int: 4.23944e+009 in 34secs
With unsigned int: 4.23944e+009 in 58secs
With int: 4.23944e+009 in 34secs
The ASM in release mode... (unsigned)
for (int i = 0; i != Nr_Samples; ++i) {
011714A1 fldz
011714A3 mov edx,dword ptr [esi+4]
011714A6 add esp,4
011714A9 xor edi,edi
011714AB sub edx,dword ptr [esi]
moments_results[i] = moments<T>(dataptr, data.size(), 128);
011714AD mov ecx,dword ptr [ebp-1388Ch]
011714B3 fld st(0)
011714B5 xor eax,eax
011714B7 test edx,edx
011714B9 je measure<unsigned int>+79h (11714E9h)
011714BB mov esi,edx
011714BD movzx ebx,byte ptr [ecx]
011714C0 imul ebx,eax
011714C3 mov dword ptr [ebp-138A4h],ebx
011714C9 fild dword ptr [ebp-138A4h] //only in unsigned
011714CF test ebx,ebx //only in unsigned
011714D1 jns measure<unsigned int>+69h (11714D9h) //only in unsigned
011714D3 fadd qword ptr [__real@41f0000000000000 (11731C8h)] //only in unsigned
011714D9 inc eax
011714DA faddp st(1),st
011714DC cmp eax,80h
011714E1 jne measure<unsigned int>+75h (11714E5h)
011714E3 xor eax,eax
011714E5 inc ecx
011714E6 dec esi
011714E7 jne measure<unsigned int>+4Dh (11714BDh)
011714E9 fstp qword ptr [ebp+edi*8-13888h]
011714F0 inc edi
011714F1 cmp edi,2710h
011714F7 jne measure<unsigned int>+3Dh (11714ADh)
}
The ASM in release mode... (signed)
for (int i = 0; i != Nr_Samples; ++i) {
012A1351 fldz
012A1353 mov edx,dword ptr [esi+4]
012A1356 add esp,4
012A1359 xor edi,edi
012A135B sub edx,dword ptr [esi]
moments_results[i] = moments<T>(dataptr, data.size(), 128);
012A135D mov ecx,dword ptr [ebp-13890h]
012A1363 fld st(0)
012A1365 xor eax,eax
012A1367 test edx,edx
012A1369 je measure<int>+6Fh (12A138Fh)
012A136B mov esi,edx
012A136D movzx ebx,byte ptr [ecx]
012A1370 imul ebx,eax
012A1373 mov dword ptr [ebp-1388Ch],ebx
012A1379 inc eax
012A137A fild dword ptr [ebp-1388Ch] //only in signed
012A1380 faddp st(1),st
012A1382 cmp eax,80h
012A1387 jne measure<int>+6Bh (12A138Bh)
012A1389 xor eax,eax
012A138B inc ecx
012A138C dec esi
012A138D jne measure<int>+4Dh (12A136Dh)
012A138F fstp qword ptr [ebp+edi*8-13888h]
012A1396 inc edi
012A1397 cmp edi,2710h
012A139D jne measure<int>+3Dh (12A135Dh)
}
interesting... with release mode and SSE enabled..... (fld and flds instructions removed but 4 instructions added)
With int: 4.23944e+009 in 8secs
With unsigned int: 4.23944e+009 in 10secs
With int: 4.23944e+009 in 8secs
for (int i = 0; i != Nr_Samples; ++i) {
00F614C1 mov edx,dword ptr [esi+4]
00F614C4 xorps xmm0,xmm0 //added in sse version
00F614C7 add esp,4
00F614CA xor edi,edi
00F614CC sub edx,dword ptr [esi]
moments_results[i] = moments<T>(dataptr, data.size(), 128);
00F614CE mov ecx,dword ptr [ebp-13894h]
00F614D4 xor eax,eax
00F614D6 movsd mmword ptr [ebp-13890h],xmm0 //added in sse version
00F614DE test edx,edx
00F614E0 je measure<unsigned int>+8Ch (0F6151Ch)
00F614E2 fld qword ptr [ebp-13890h] //added in sse version
00F614E8 mov esi,edx
00F614EA movzx ebx,byte ptr [ecx]
00F614ED imul ebx,eax
00F614F0 mov dword ptr [ebp-1388Ch],ebx
00F614F6 fild dword ptr [ebp-1388Ch]
00F614FC test ebx,ebx
00F614FE jns measure<unsigned int>+76h (0F61506h)
00F61500 fadd qword ptr [__real@41f0000000000000 (0F631C8h)]
00F61506 inc eax
00F61507 faddp st(1),st
00F61509 cmp eax,80h
00F6150E jne measure<unsigned int>+82h (0F61512h)
00F61510 xor eax,eax
00F61512 inc ecx
00F61513 dec esi
00F61514 jne measure<unsigned int>+5Ah (0F614EAh)
00F61516 fstp qword ptr [ebp-13890h]
00F6151C movsd xmm1,mmword ptr [ebp-13890h] //added in sse version
00F61524 movsd mmword ptr [ebp+edi*8-13888h],xmm1 //added in sse version
00F6152D inc edi
00F6152E cmp edi,2710h
00F61534 jne measure<unsigned int>+3Eh (0F614CEh)
}
Upvotes: 1
Reputation: 9161
I ran this on gcc 4.7.0 on a 64 bit machine running Linux. I replaced the time calls with calls to clock_gettime.
CPU: Intel X5680 @3.33 GHZ
GCC flags: -Wall -pedantic -O3 -std=c++11
Results:
With int time per operation in ns: 11996, total time sec: 1.57237
Avg values: 1.06353e+09
With unsigned int time per operation in ns: 11539, total time sec: 1.5125
Avg values: 1.06353e+09
With int time per operation in ns: 11994, total time sec: 1.57217
Avg values: 1.06353e+09
Evidently on my machine/compiler unsigned is faster.
Upvotes: 0
Reputation: 106117
Here's why: many common architectures (including x86) have a hardware instruction to convert signed int to doubles, but do not have a hardware conversion from unsigned to double, so the compiler needs to synthesize the conversion in software. Furthermore, the only unsigned multiply on Intel is a full width multiply, whereas signed multiplies can use the signed multiply low instruction.
GCC's software conversion from unsigned int to double may very well be suboptimal (it almost certainly is, given the magnitude of the slowdown that you observed), but it is expected behavior for the code to be faster when using signed integers.
Assuming a smart compiler, the difference should be much smaller on a 64-bit system, because a 64-bit signed integer -> double conversion can be used to efficiently do a 32-bit unsigned conversion.
Edit: to illustrate, this:
sum += *data * x;
if the integer variables are signed, should compile into something along these lines:
mov (data), %eax
imul %ecx, %eax
cvtsi2sd %eax, %xmm1
addsd %xmm1, %xmm0
on the other hand, if the integer variables are unsigned, cvtsi2sd
can't be used to do the conversion, so a software workaround is required. I would expect to see something like this:
mov (data), %eax
mul %ecx // might be slower than imul
cvtsi2sd %eax, %xmm1 // convert as though signed integer
test %eax, %eax // check if high bit was set
jge 1f // if it was, we need to adjust the converted
addsd (2^32), %xmm1 // value by adding 2^32
1: addsd %xmm1, %xmm0
That would be "acceptable" codegen for the unsigned -> double conversion; it could easily be worse.
All of this is assuming floating-point code generation to SSE (I believe this is the default on the Ubuntu tools, but I could be wrong).
Upvotes: 35
Reputation:
Here's some code produced by VC++ 6.0 - no optimisation:
4: int x = 12345;
0040E6D8 mov dword ptr [ebp-4],3039h
5: double d1 = x;
0040E6DF fild dword ptr [ebp-4]
0040E6E2 fstp qword ptr [ebp-0Ch]
6: unsigned int y = 12345;
0040E6E5 mov dword ptr [ebp-10h],3039h
7: double d2 = y;
0040E6EC mov eax,dword ptr [ebp-10h]
0040E6EF mov dword ptr [ebp-20h],eax
0040E6F2 mov dword ptr [ebp-1Ch],0
0040E6F9 fild qword ptr [ebp-20h]
0040E6FC fstp qword ptr [ebp-18h]
As you can see, converting the unsigned does quite a bit more work.
Upvotes: 3