Reputation: 16603
Let's say we have an array of ints like this:
const int size = 100000;
int array[size];
//set some items to 0 and other items to 1
I'd like to replace all items that have value of 1 with another value, for example 123456. This can be trivially implemented with:
for(int i = 0; i < size ; i++){
if(array[i] != 0)
array[i] = 123456;
}
Out of curiosity, is there a faster way to do this, by some kind of x86 trickery, or is this the best code for the processor?
Upvotes: 33
Views: 73657
Reputation: 8677
For your specific case where you initially have 0 and 1, the following might be faster. You'll have to bench mark it. You probably can't do much better with plain C though; you may need to dive into assembly if you want to take advantage of "x86 trickery" that may exist.
for(int i = 0; i < size ; i++){
array[i] *= 123456;
}
Benchmark code:
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
size_t diff(struct timespec *start, struct timespec *end)
{
return (end->tv_sec - start->tv_sec)*1000000000 + end->tv_nsec - start->tv_nsec;
}
int main(void)
{
const size_t size = 1000000;
int array[size];
for(size_t i=0; i<size; ++i) {
array[i] = rand() & 1;
}
struct timespec start, stop;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
for(size_t i=0; i<size; ++i) {
array[i] *= 123456;
//if(array[i]) array[i] = 123456;
}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
printf("size: %zu\t nsec: %09zu\n", size, diff(&start, &stop));
}
Computer: quad core AMD Phenom @2.5GHz, Linux, GCC 4.7, compiled with
$ gcc arr.c -std=gnu99 -lrt -O3 -march=native
if
version: ~5-10ms*=
version: ~1.3msUpvotes: 47
Reputation: 6329
You might want to benchmark this as well:
for(int i = 0; i < size ; i++){
array[i] = (~(array[i]-1) & 123456);
}
I run it through same benchmark as SchighSchagh, with little or no difference on my set up. It may differ on yours, however.
EDIT: Stop the presses!
I just remembered that x86 can "unbranch" ternary operators if arguments between ":" are constants. Consider following code:
for(size_t i=0; i<size; ++i) {
array[i] = array[i] ? 123456 : 0;
}
Looks almost like your original code, doesn't it? Well, disassembly shows that it has been compiled without any branches:
for(size_t i=0; i<size; ++i) {
00E3104C xor eax,eax
00E3104E mov edi,edi
array[i] = array[i] ? 123456 : 0;
00E31050 mov edx,dword ptr [esi+eax*4]
00E31053 neg edx
00E31055 sbb edx,edx
00E31057 and edx,1E240h
00E3105D mov dword ptr [esi+eax*4],edx
00E31060 inc eax
00E31061 cmp eax,5F5E100h
00E31066 jb wmain+50h (0E31050h)
}
Performance-wise it seems on par or little better than my original and SchighSchagh solution. It is more readable and more flexible, though. For instance, it can work with array[i] having values different than 0 and 1.
Bottom line, benchmark AND take a peek in the disassembly.
Upvotes: 13
Reputation: 71060
Here's some Win32 code to profile various versions of the algorithm (compiled using VS2010 Express using default release build):-
#include <windows.h>
#include <stdlib.h>
#include <stdio.h>
const size_t
size = 0x1D4C00;
_declspec(align(16)) int
g_array [size];
_declspec(align(16)) int
_vec4_123456 [] = { 123456, 123456, 123456, 123456 };
void Test (void (*fn) (size_t, int *), char *test)
{
printf ("Executing test: %s\t", test);
for(size_t i=0; i<size; ++i) {
g_array[i] = rand() & 1;
}
LARGE_INTEGER
start,
end;
QueryPerformanceCounter (&start);
fn (size, g_array);
QueryPerformanceCounter (&end);
printf("size: %u\t count: %09u\n", size, (int) (end.QuadPart - start.QuadPart));
}
void Test1 (size_t size, int *array)
{
for(size_t i=0; i<size; ++i) {
array[i] *= 123456;
}
}
void Test2 (size_t size, int *array)
{
for(size_t i=0; i<size; ++i) {
if(array[i]) array[i] = 123456;
}
}
void Test3 (size_t array_size, int *array)
{
__asm
{
mov edi,array
mov ecx, array_size
lea esi, [edi + ecx * 4]
neg ecx
pxor xmm0, xmm0
movdqa xmm1, [_vec4_123456] ; value of { 123456, 123456, 123456, 123456 }
_replaceloop:
movdqa xmm2, [esi + ecx * 4] ; assumes the array is 16 aligned, make that true
add ecx, 4
pcmpeqd xmm2, xmm0
pandn xmm2, xmm1
movdqa [esi + ecx * 4 - 16], xmm2
jnz _replaceloop
}
}
void Test4 (size_t array_size, int *array)
{
array_size = array_size * 8 / 12;
__asm
{
mov edi,array
mov ecx,array_size
lea esi,[edi+ecx*4]
lea edi,[edi+ecx*4]
neg ecx
mov edx,[_vec4_123456]
pxor xmm0,xmm0
movdqa xmm1,[_vec4_123456]
replaceloop:
movdqa xmm2,[esi+ecx*4]
mov eax,[edi]
mov ebx,[edi+4]
movdqa xmm3,[esi+ecx*4+16]
add edi,16
add ecx,9
imul eax,edx
pcmpeqd xmm2,xmm0
imul ebx,edx
pcmpeqd xmm3,xmm0
mov [edi-16],eax
mov [edi-12],ebx
pandn xmm2,xmm1
mov eax,[edi-8]
mov ebx,[edi-4]
pandn xmm3,xmm1
imul eax,edx
movdqa [esi+ecx*4-36],xmm2
imul ebx,edx
movdqa [esi+ecx*4-20],xmm3
mov [edi-8],eax
mov [edi-4],ebx
loop replaceloop
}
}
int main()
{
Test (Test1, "Test1 - mul");
Test (Test2, "Test2 - branch");
Test (Test3, "Test3 - simd");
Test (Test4, "Test4 - simdv2");
}
It's got for tests: C using an if()...
, C using a multiply, harold's simd version and my simd version.
Running it many times (remember, when profiling you should average the results over several runs) there's little difference between all the versions except the branching one which is significantly slower.
This is not very surprising as the algortihm is doing very little work for each memory item. What this means is that the real limiting factor is the bandwidth between the CPU and the memory, the CPU is constantly waiting for the memory to catch up, even with the cpu helping with prefetching the data (ia32's detect and prefetch data linearly).
Upvotes: 3
Reputation: 4200
one more way to speed up the array assignment you can use the c inline assembly. Like below,
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
const int size = 100000;
void main(void) {
int array[size];
int value = 1000;
__asm__ __volatile__("cld\n\t"
"rep\n\t"
"stosl\n\t"
:
:"c"(size*4), "a"(value), "D"(array)
:
);
printf("Array[0] : %d \n", array[0]);
}
This should be speed when we compared to plain c program to assign the array values. And also the stosl instruction take 4 clock cycle.
Upvotes: 1
Reputation: 2085
This might prove faster.
for(int i = 0; i < size ; i++){
array[i] = ((123456 << array[i]) - 123456);
}
EDIT: Changed bitwise operation to left shift.
Upvotes: 2
Reputation: 64904
The array is small enough that it fits in cache, so it should be worthwhile to use SIMD: (not tested)
mov ecx, size
lea esi, [array + ecx * 4]
neg ecx
pxor xmm0, xmm0
movdqa xmm1, [_vec4_123456] ; value of { 123456, 123456, 123456, 123456 }
_replaceloop:
movdqa xmm2, [esi + ecx * 4] ; assumes the array is 16 aligned, make that true
add ecx, 4
pcmpeqd xmm2, xmm0
pandn xmm2, xmm1
movdqa [esi + ecx * 4 - 16], xmm2
jnz _replaceloop
Unrolling by 2 might help.
If you have SSE4.1, you can use SchighSchagh's multiplication trick with pmulld
.
Upvotes: 7
Reputation:
You could use another array or some other data structure to keep track of the indices of the elements you set to one and then only visit those elements. This will work best if there are only few elements that are set to one
Upvotes: 2
Reputation: 409146
For a small array such as your it's no use trying to find another algorithm, and if the values are not in a specific pattern a simple loop is the only way to do it anyway.
However, if you have a very large array (we're talking several million entries), then you can split the work into threads. Each separate thread handles a smaller portion of the whole data set.
Upvotes: 15