Reputation: 8548
I have the following critical spot in my code: I need to look up values from a 64-byte array around 1'000'000 times.
Minimal code:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define n_lookup 64
int main(){
    const int n_indices = 1000000;
    TYPE lookup[n_lookup];
    TYPE indices[n_indices];
    TYPE result[n_indices];

    // preparations
    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(0, n_lookup);
    for (int i=0; i < n_indices; i++) indices[i] = distribution(generator);
    for (int i=0; i < n_lookup; i++) lookup[i] = distribution(generator);

    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
    // main loop:
    for (int i=0; i < n_indices; i++) {
        result[i] = lookup[indices[i]];
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();

    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element" << std::endl;
    // printing random numbers to avoid code elimination
    std::cout << result[12] << result[45];
    return 0;
}
After compiling with g++ lookup.cpp -std=gnu++11 -O3 -funroll-loops
I get a bit less than 1 ns per element on a modern CPU.
I need this operation to run 2-3 times faster (without threads). How can I do this?
P.S. I was also investigating the AVX-512 instruction set (512 bits is exactly the size of the lookup table!), but it lacks 8-bit gather operations!
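To illustrate the problem: the gathers that AVX-512F does provide operate on 32-bit (or 64-bit) lanes, so using them would mean widening both the indices and the table to int32_t first. A rough, untested sketch of what I mean (the function and array names are just placeholders; needs -mavx512f):
#include <immintrin.h>
#include <stdint.h>
// Untested sketch: AVX-512F has no byte gather, so the 8-bit table and the
// indices are assumed to have been widened to int32_t beforehand.
void gather32_sketch(const int32_t* lookup32,   // lookup[] widened to int32_t
                     const int32_t* indices32,  // indices[] widened to int32_t
                     int32_t* result32, int n)  // n assumed to be a multiple of 16
{
    for (int i = 0; i < n; i += 16) {
        __m512i idx = _mm512_loadu_si512(indices32 + i);        // 16 indices per iteration
        __m512i val = _mm512_i32gather_epi32(idx, lookup32, 4); // 16 table lookups at once
        _mm512_storeu_si512(result32 + i, val);
    }
}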
Upvotes: 0
Views: 231
Reputation: 6103
First of all, there is a bug: distribution(0, 64) produces numbers from 0 to 64 inclusive, and 64 does not fit into the array.
You can speed up the lookup 2x by looking up two values at a time:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define TYPE2 uint16_t
#define n_lookup 64
void tst() {
    const int n_indices = 1000000; // has to be a multiple of 2
    TYPE lookup[n_lookup];
    TYPE indices[n_indices];
    TYPE result[n_indices];
    TYPE2 lookup2[n_lookup * 256]; // table indexed by a pair of byte indices packed into one uint16_t

    // preparations
    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(0, n_lookup-1);
    for (int i = 0; i < n_indices; i++) indices[i] = distribution(generator);
    for (int i = 0; i < n_lookup; i++) lookup[i] = distribution(generator);

    // build a table that maps every pair of indices to the corresponding pair of values
    for (int i = 0; i < n_lookup; ++i) {
        for (int j = 0; j < n_lookup; ++j) {
            lookup2[(i << 8) | j] = (lookup[i] << 8) | lookup[j];
        }
    }

    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
    TYPE2* indices2 = (TYPE2*)indices;
    TYPE2* result2 = (TYPE2*)result;
    // main loop:
    for (int i = 0; i < n_indices / 2; ++i) {
        *result2++ = lookup2[*indices2++];
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();

    // verify the results against the plain lookup
    for (int i = 0; i < n_indices; i++) {
        if (result[i] != lookup[indices[i]]) {
            std::cout << "!!!!!!!!!!!!!ERROR!!!!!!!!!!!!!";
        }
    }

    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element" << std::endl;
    // printing random numbers to avoid code elimination
    std::cout << result[12] << result[45];
}
int main() {
    tst();
    std::cin.get();
    return 0;
}
Upvotes: 2
Reputation: 326
The indices and result arrays are in different places in memory but are accessed at the same time. This leads to cache misses. I suggest merging result and indices into one array. Here is the code:
#include <iostream>
#include <stdint.h>
#include <random>
#include <chrono>
#include <ctime>
#define TYPE uint8_t
#define n_lookup 64
int main(){
    const int n_indices = 2000000;
    TYPE lookup[n_lookup];
    // Merge indices and result:
    // if i holds an index, then i+1 holds the corresponding result
    TYPE ind_res[n_indices];

    // preparations
    std::default_random_engine generator;
    std::uniform_int_distribution<int> distribution(0, n_lookup-1);
    for (int i=0; i < n_indices; i += 2) ind_res[i] = distribution(generator);
    for (int i=0; i < n_lookup; i++) lookup[i] = distribution(generator);

    std::chrono::time_point<std::chrono::system_clock> start = std::chrono::system_clock::now();
    // main loop:
    for (int i=0; i < n_indices; i += 2) {
        ind_res[i+1] = lookup[ind_res[i]]; // denser access here, fewer cache misses
    }
    std::chrono::time_point<std::chrono::system_clock> end = std::chrono::system_clock::now();

    std::chrono::duration<double> elapsed_seconds = end - start;
    std::cout << "computation took " << elapsed_seconds.count() * 1e9 / n_indices << " ns per element" << std::endl;
    // printing random numbers to avoid code elimination
    std::cout << ind_res[24] << ind_res[90];
    return 0;
}
My tests show that this code runs much faster.
Upvotes: 3
Reputation: 8156
Your code is already really fast. However (on my system) the execution is about 4.858 % faster when you change
const int n_indices = 1000000;
to
const int n_indices = 1048576; // 2^20
This is not much, but it's something.
Upvotes: 0
Reputation: 69922
With -march=native, this is what your loop compiles to:
movq %rax, %rbx
xorl %eax, %eax
.L145:
movzbl 128(%rsp,%rax), %edx
movzbl 64(%rsp,%rdx), %edx
movb %dl, 1000128(%rsp,%rax)
addq $1, %rax
cmpq $1000000, %rax
jne .L145
I'm struggling to see how that gets any quicker without parallelisation.
By changing TYPE to int32_t, it gets vectorised:
vpcmpeqd %ymm2, %ymm2, %ymm2
movq %rax, %rbx
xorl %eax, %eax
.L145:
vmovdqa -8000048(%rbp,%rax), %ymm1
vmovdqa %ymm2, %ymm3
vpgatherdd %ymm3, -8000304(%rbp,%ymm1,4), %ymm0
vmovdqa %ymm0, -4000048(%rbp,%rax)
addq $32, %rax
cmpq $4000000, %rax
jne .L145
vzeroupper
Might that help?
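For what it's worth, the gather can also be written by hand with AVX2 intrinsics rather than relying on auto-vectorisation. A rough sketch, assuming the table, indices and results have already been widened to int32_t (placeholder names, not benchmarked):
#include <immintrin.h>
#include <stdint.h>
// Hand-written version of the vpgatherdd loop (requires AVX2, e.g. -march=native).
// Assumes n is a multiple of 8.
void gather_loop(const int32_t* lookup, const int32_t* indices, int32_t* result, int n) {
    for (int i = 0; i < n; i += 8) {
        __m256i idx = _mm256_loadu_si256((const __m256i*)(indices + i)); // load 8 indices
        __m256i val = _mm256_i32gather_epi32(lookup, idx, 4);            // 8 lookups in one instruction
        _mm256_storeu_si256((__m256i*)(result + i), val);
    }
}
Keep in mind that widening everything to int32_t quadruples the memory traffic, so whether this actually beats the scalar byte loop needs measuring.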
Upvotes: 2