Reputation: 507
I am experimenting the row- vs column-major traversal. The general idea is that row-major traversal should be faster due to caching/vectorization/etc. Here is my test code:
// gcc -o main.out main.c -O3
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main() {
size_t dim[] = {10, 50, 100, 200, 500, 1000, 5000, 10000, 20000};
struct timespec ts;
uint32_t* arr_ptr;
double delta, t0;
printf(
"Dim\tArraySize(KB)\tRow-Major Time\tRM Sample\tCol-Major Time\tCM Sample\n"
);
for (int i = 0; i < sizeof(dim) / sizeof(dim[0]); ++i) {
size_t d = dim[i];
printf("%5lu,\t%11lu,\t", d, d * d * sizeof(uint32_t) / 1024);
arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
memset(arr_ptr, 0, d * d * sizeof(uint32_t));
timespec_get(&ts, TIME_UTC);
t0 = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + j * d + k) += (j + k);
}
}
timespec_get(&ts, TIME_UTC);
delta = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
printf("%0.9lf,\t%8u,\t", delta, *(arr_ptr + ts.tv_sec % d * d + ts.tv_nsec % d));
free(arr_ptr);
arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
memset(arr_ptr, 0, d * d * sizeof(uint32_t));
timespec_get(&ts, TIME_UTC);
t0 = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + k * d + j) += (j + k);
}
}
timespec_get(&ts, TIME_UTC);
delta = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
printf("%0.9lf,\t%8u\n", delta, *(arr_ptr + ts.tv_sec % d * d + ts.tv_nsec % d));
free(arr_ptr);
}
return 0;
}
The result is as follows:
Dim ArraySize(KB) Row-Major Time RM Sample Col-Major Time CM Sample
10, 0, 0.000000238, 10, 0.000000238, 18
20, 1, 0.000000238, 19, 0.000000477, 40
50, 9, 0.000002384, 31, 0.000001431, 122
100, 39, 0.000013113, 96, 0.000005245, 272
200, 156, 0.000077486, 263, 0.000042200, 240
500, 976, 0.000452757, 558, 0.000420809, 362
1000, 3906, 0.001549959, 1095, 0.001713991, 1030
2000, 15625, 0.005422354, 3281, 0.006698370, 3571
5000, 97656, 0.036512375, 5439, 0.053869963, 4949
10000, 390625, 0.148355007, 7462, 1.049520969, 6046
20000, 1562500, 0.768568516, 22097, 7.388022661, 32356
The general theory holds only after the dimension reaches 500--before that, column-major traversal keeps outperforming row-major traversal. This seems not random--I ran the test multiple times with different gcc parameters, such as -O3 -ffast-math
and -O3 -fno-tree-vectorize
, the performance gap appears to be stable. But why?
Upvotes: 0
Views: 250
Reputation: 507
After investigating a bit and per suggestion from @Fe2O3, the code is revised to the following and it works:
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
int main() {
size_t dim[] = {10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000};
struct timespec ts;
uint32_t* arr_ptr;
double delta, t0;
printf(
"Dim\tArraySize(KB)\tRow-Major Time\tRM Sample\tCol-Major Time\tCM Sample\n"
);
for (int i = 0; i < sizeof(dim) / sizeof(dim[0]); ++i) {
size_t d = dim[i];
printf("%5lu,\t%11lu,\t", d, d * d * sizeof(uint32_t) / 1024);
arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + j * d + k) = (j * k);
}
}
timespec_get(&ts, TIME_UTC);
t0 = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + j * d + k) += (j + k);
}
}
timespec_get(&ts, TIME_UTC);
delta = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
printf("%0.9lf,\t%8u,\t", delta, *(arr_ptr + ts.tv_sec % d * d + ts.tv_nsec % d));
free(arr_ptr);
arr_ptr = (uint32_t*)malloc(d * d * sizeof(uint32_t));
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + k * d + j) = (j * k);
}
}
timespec_get(&ts, TIME_UTC);
t0 = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0;
for (int j = 0; j < d; ++j) {
for (int k = 0; k < d; ++k) {
*(arr_ptr + k * d + j) += (j + k);
}
}
timespec_get(&ts, TIME_UTC);
delta = ts.tv_sec + ts.tv_nsec / 1000.0 / 1000.0 / 1000.0 - t0;
printf("%0.9lf,\t%8u\n", delta, *(arr_ptr + ts.tv_sec % d * d + ts.tv_nsec % d));
free(arr_ptr);
}
return 0;
}
Results:
Dim ArraySize(KB) Row-Major Time RM Sample Col-Major Time CM Sample
10, 0, 0.000000238, 39, 0.000000238, 3
20, 1, 0.000000238, 83, 0.000000477, 27
50, 9, 0.000000715, 147, 0.000001431, 15
100, 39, 0.000002623, 5129, 0.000005245, 485
200, 156, 0.000009060, 15553, 0.000018835, 24023
500, 976, 0.000072718, 9557, 0.000153065, 7235
1000, 3906, 0.000302553, 91963, 0.001093388, 282539
2000, 15625, 0.001767159, 1004401, 0.006136179, 133513
5000, 97656, 0.010307550, 2686865, 0.045175791, 2730377
10000, 390625, 0.040507793, 44626185, 0.827179193, 53503515
20000, 1562500, 0.206705093, 26209730, 5.742965460, 173268143
The trick is we add one more "preparatory loop" before the timed loop. One possible reason behind it is that malloc()
doesn't really provide all the necessary memory upon request. The memory blocks are actually provided immediately before first access--as a result, the first loop could be slower as malloc()
is still busy letting us have the memory blocks it promised.
After using the allocated memory for the first time, all things are set, so the 2nd loop behaves exactly as we expect.
More detailed version of answer can be fond here
Upvotes: 1