Reputation: 101
I read about array indexing and pointer arithmetic today, and my understanding is that the two are the same thing (i.e. array indexing is syntactic sugar for pointer arithmetic).
Besides that, some other people said that array indexing is faster than pointer arithmetic. (??)
I came up with these two pieces of code to test the performance. I tested them in C#, but I think the result would be the same in C or C++.
for (int i = 0; i < n; i++) // n is size of an array
{
    sum += arr1[i] * arr2[i];
}
for (int i = 0; i < n; i++) // n is size of an array
{
    sum += *(arrP1 + i) * *(arrP2 + i);
}
I ran each code piece 1000 times, with arrays holding more than a million integers. The result was unexpected to me: pointer arithmetic is slightly faster (about 10%) than array indexing.
Does the result seem correct? Is there an explanation that makes sense of it?
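For reference, a C++ equivalent of the timing loop would look roughly like this (a sketch, not the original C# benchmark; the array contents and the use of std::chrono are arbitrary choices, while the 1000 repetitions and the loop body match the description above):
#include <chrono>
#include <cstdio>
#include <vector>

int main()
{
    const int n = 1000000;                   // "more than a million integers"
    std::vector<int> arr1(n, 1), arr2(n, 2); // arbitrary fill values
    const int* arrP1 = arr1.data();
    const int* arrP2 = arr2.data();

    long long sum = 0;
    auto t0 = std::chrono::steady_clock::now();
    for (int rep = 0; rep < 1000; rep++)     // 1000 runs, as described above
        for (int i = 0; i < n; i++)
            sum += *(arrP1 + i) * *(arrP2 + i); // swap in arr1[i] * arr2[i] for the indexed variant
    auto t1 = std::chrono::steady_clock::now();

    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
    std::printf("sum=%lld, %lld ms\n", sum, (long long)ms);
}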
I want to add the generated assembly code.
sum += arr1[i] * arr2[i]; // C# Code
mov eax,dword ptr [ebp-14h]
cmp dword ptr [eax+4],0
ja 00E63195
call 73EDBE3A
mov eax,dword ptr [eax+8]
mov edx,dword ptr [ebp-0Ch]
cmp edx,dword ptr [eax+4]
jb 00E631A5
call 73EDBE3A
mov eax,dword ptr [eax+edx*4+8]
mov edx,dword ptr [ebp-18h]
cmp dword ptr [edx+4],0
ja 00E631B7
call 73EDBE3A
mov edx,dword ptr [edx+8]
mov ecx,dword ptr [ebp-0Ch]
cmp ecx,dword ptr [edx+4]
jb 00E631C7
call 73EDBE3A
imul eax,dword ptr [edx+ecx*4+8]
add dword ptr [ebp-8],eax
sum += *(arrP1 + i) * *(arrP2 + i); // C# Code
mov eax,dword ptr [ebp-10h]
mov dword ptr [ebp-28h],eax
mov eax,dword ptr [ebp-28h]
mov edx,dword ptr [ebp-1Ch]
mov eax,dword ptr [eax+edx*4]
mov edx,dword ptr [ebp-14h]
mov dword ptr [ebp-2Ch],edx
mov edx,dword ptr [ebp-2Ch]
mov ecx,dword ptr [ebp-1Ch]
imul eax,dword ptr [edx+ecx*4]
add eax,dword ptr [ebp-18h]
mov dword ptr [ebp-18h],eax
Upvotes: 2
Views: 4676
Reputation: 14815
Your two examples are pretty similar.
arr[i]
This expression multiplies i by sizeof(decltype(*arr)) and adds the result to arr. That means a multiplication and an addition.
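As a small illustration of that equivalence (a sketch; the array and its values are arbitrary):
#include <cassert>

int main()
{
    int arr[4] = {10, 20, 30, 40};
    // arr[2] is defined as *(arr + 2); the "+ 2" is scaled by sizeof(int)
    assert(&arr[2] == arr + 2);   // same address
    assert(arr[2] == *(arr + 2)); // same value
    assert(2[arr] == arr[2]);     // even this holds, since a[b] is *(a + b)
}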
If you would like to improve performance with pointer arithmetic, something like the following could be better:
for (auto p=&(arrP1[0]); p<&(arrP1[n]); p++) // n is size of an array
{
    sum += *p; // removed arrP2 for simplification, but can be achieved too.
}
This new way removes the need for the multiplication.
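As the comment above notes, the second array can be handled in the same way; a sketch along the same lines (arrP1, arrP2, n and sum as in the question):
auto p2 = arrP2;
for (auto p1 = &(arrP1[0]); p1 < &(arrP1[n]); ++p1, ++p2)
{
    sum += *p1 * *p2; // both pointers advance together, no index scaling
}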
EDITED to include alternative solutions:
As @SimpleVar pointed out, for readability, several nearly equivalent solutions could be chosen, depending on the programmer's preference.
auto p = arrP1;
for (int i = 0; i < n; i++, p++) sum += *p;
or even
for (auto p=arrP1; p<arrP1+n; p++) sum += *p;
Using C++11, there is also another alternative, which allows the compiler to decide how to implement it (and which we can assume gives decent performance):
for (auto &p: arrP1) sum += p;
(for this last one, arrP1 needs to be a standard container like std::array or a declared array: it cannot be a pointer, as the loop would have no knowledge of the array size)
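For example (a small sketch; the size 8 and the values are arbitrary):
#include <array>

int main()
{
    std::array<int, 8> arrP1{1, 2, 3, 4, 5, 6, 7, 8}; // the size is part of the type
    int sum = 0;
    for (const auto& p : arrP1)   // the compiler knows the bounds, no index at all
        sum += p;                 // sum == 36 afterwards

    // int* raw = arrP1.data();
    // for (auto v : raw) ...     // would not compile: a pointer carries no size
    return 0;
}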
Upvotes: 1
Reputation: 5321
The hand-optimized, pointer-based code is:
int* arrP1 = arr1;
int* limitP1 = &arr1[n];
int* arrP2 = arr2;
for (; arrP1<limitP1; ++arrP1, ++arrP2)
{
    sum += *arrP1 * *arrP2;
}
Be aware that compiler-optimized code may be better than hand-optimized code: the hand optimization can disguise the intent of the code and prevent the compiler from optimizing, so it can actually make things worse.
So with a naive enough compiler optimizer, we can expect the hand-optimized pointer version to be better than the indexed version. But you need much more expertise (plus examination of the generated code and profiling) for doing something like this to ever be useful in real code.
Also, if reading either input array causes cache misses, the whole optimization (whether you do it or the compiler does it) ends up being completely useless, because any near-sane version has the same performance as any other near-sane version once they suffer the same cache misses.
Upvotes: 2
Reputation: 129314
Certainly, in C++ the result is identical [and would be in C too]:
#include <stdio.h>
extern int arr1[], arr2[];
const int n = 1000000;
int main()
{
    int sum = 0;
    for (int i = 0; i < n; i++) // n is size of an array
    {
        sum += arr1[i] * arr2[i];
    }
    printf("sum=%d\n", sum);
    int* arrP1 = arr1;
    int* arrP2 = arr2;
    for (int i = 0; i < n; i++) // n is size of an array
    {
        sum += *(arrP1 + i) * *(arrP2 + i);
    }
    printf("sum=%d\n", sum);
}
Compiling with clang++ -O2 -S arrptr.cpp
gives this for the first loop:
pxor %xmm0, %xmm0
movq $-4000000, %rax # imm = 0xFFFFFFFFFFC2F700
pxor %xmm1, %xmm1
.align 16, 0x90
.LBB0_1: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu arr1+4000000(%rax), %xmm3
movdqu arr1+4000016(%rax), %xmm4
movdqu arr2+4000000(%rax), %xmm2
movdqu arr2+4000016(%rax), %xmm5
pshufd $245, %xmm2, %xmm6 # xmm6 = xmm2[1,1,3,3]
pmuludq %xmm3, %xmm2
pshufd $232, %xmm2, %xmm2 # xmm2 = xmm2[0,2,2,3]
pshufd $245, %xmm3, %xmm3 # xmm3 = xmm3[1,1,3,3]
pmuludq %xmm6, %xmm3
pshufd $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3]
punpckldq %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
pshufd $245, %xmm5, %xmm6 # xmm6 = xmm5[1,1,3,3]
pmuludq %xmm4, %xmm5
pshufd $232, %xmm5, %xmm3 # xmm3 = xmm5[0,2,2,3]
pshufd $245, %xmm4, %xmm4 # xmm4 = xmm4[1,1,3,3]
pmuludq %xmm6, %xmm4
pshufd $232, %xmm4, %xmm4 # xmm4 = xmm4[0,2,2,3]
punpckldq %xmm4, %xmm3 # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
paddd %xmm0, %xmm2
paddd %xmm1, %xmm3
movdqu arr1+4000032(%rax), %xmm1
movdqu arr1+4000048(%rax), %xmm4
movdqu arr2+4000032(%rax), %xmm0
movdqu arr2+4000048(%rax), %xmm5
pshufd $245, %xmm0, %xmm6 # xmm6 = xmm0[1,1,3,3]
pmuludq %xmm1, %xmm0
pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
pshufd $245, %xmm1, %xmm1 # xmm1 = xmm1[1,1,3,3]
pmuludq %xmm6, %xmm1
pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
punpckldq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
pshufd $245, %xmm5, %xmm6 # xmm6 = xmm5[1,1,3,3]
pmuludq %xmm4, %xmm5
pshufd $232, %xmm5, %xmm1 # xmm1 = xmm5[0,2,2,3]
pshufd $245, %xmm4, %xmm4 # xmm4 = xmm4[1,1,3,3]
pmuludq %xmm6, %xmm4
pshufd $232, %xmm4, %xmm4 # xmm4 = xmm4[0,2,2,3]
punpckldq %xmm4, %xmm1 # xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
addq $64, %rax
jne .LBB0_1
The second loop does:
pxor %xmm1, %xmm1
pxor %xmm0, %xmm0
movaps (%rsp), %xmm2 # 16-byte Reload
movss %xmm2, %xmm0 # xmm0 = xmm2[0],xmm0[1,2,3]
movq $-4000000, %rax # imm = 0xFFFFFFFFFFC2F700
.align 16, 0x90
.LBB0_3: # %vector.body45
# =>This Inner Loop Header: Depth=1
movdqu arr1+4000000(%rax), %xmm3
movdqu arr1+4000016(%rax), %xmm4
movdqu arr2+4000000(%rax), %xmm2
movdqu arr2+4000016(%rax), %xmm5
pshufd $245, %xmm2, %xmm6 # xmm6 = xmm2[1,1,3,3]
pmuludq %xmm3, %xmm2
pshufd $232, %xmm2, %xmm2 # xmm2 = xmm2[0,2,2,3]
pshufd $245, %xmm3, %xmm3 # xmm3 = xmm3[1,1,3,3]
pmuludq %xmm6, %xmm3
pshufd $232, %xmm3, %xmm3 # xmm3 = xmm3[0,2,2,3]
punpckldq %xmm3, %xmm2 # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
pshufd $245, %xmm5, %xmm6 # xmm6 = xmm5[1,1,3,3]
pmuludq %xmm4, %xmm5
pshufd $232, %xmm5, %xmm3 # xmm3 = xmm5[0,2,2,3]
pshufd $245, %xmm4, %xmm4 # xmm4 = xmm4[1,1,3,3]
pmuludq %xmm6, %xmm4
pshufd $232, %xmm4, %xmm4 # xmm4 = xmm4[0,2,2,3]
punpckldq %xmm4, %xmm3 # xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
paddd %xmm0, %xmm2
paddd %xmm1, %xmm3
movdqu arr1+4000032(%rax), %xmm1
movdqu arr1+4000048(%rax), %xmm4
movdqu arr2+4000032(%rax), %xmm0
movdqu arr2+4000048(%rax), %xmm5
pshufd $245, %xmm0, %xmm6 # xmm6 = xmm0[1,1,3,3]
pmuludq %xmm1, %xmm0
pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
pshufd $245, %xmm1, %xmm1 # xmm1 = xmm1[1,1,3,3]
pmuludq %xmm6, %xmm1
pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
punpckldq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
pshufd $245, %xmm5, %xmm6 # xmm6 = xmm5[1,1,3,3]
pmuludq %xmm4, %xmm5
pshufd $232, %xmm5, %xmm1 # xmm1 = xmm5[0,2,2,3]
pshufd $245, %xmm4, %xmm4 # xmm4 = xmm4[1,1,3,3]
pmuludq %xmm6, %xmm4
pshufd $232, %xmm4, %xmm4 # xmm4 = xmm4[0,2,2,3]
punpckldq %xmm4, %xmm1 # xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
addq $64, %rax
jne .LBB0_3
Which, aside from the loop labels and a couple of instructions before the actual loop, is identical.
g++ -O2 -S
gives a similarly identical pair of loops, but without SSE instructions or unrolling, so the loops are simpler:
.L2:
movl arr1(%rax), %edx
addq $4, %rax
imull arr2-4(%rax), %edx
addl %edx, %ebx
cmpq $4000000, %rax
jne .L2
movl %ebx, %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L3:
movl arr1(%rax), %edx
addq $4, %rax
imull arr2-4(%rax), %edx
addl %edx, %ebx
cmpq $4000000, %rax
jne .L3
movl %ebx, %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
Upvotes: 1