Reputation: 381
I want to measure the time difference between accessing a table entry and accessing another entry after a `clflush`. Below is my attempt, but I see almost no penalty between the two cases. The table has 256 entries of 8 bits each. I suspect my `clflush` is not working properly. I am compiling with gcc using the `-O3` flag.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
// Number of elements in an array object (only meaningful for true arrays,
// not pointers). Not referenced by the code visible in this listing.
#define ARRAYSIZE(arr) (sizeof(arr)/sizeof(arr[0]))
// Iteration count of the lookup loop in func() and the length of the temp
// buffers declared in test() and testflush().
#define REPEAT 10000
// Byte lookup table: func() reads it at index rand()%256 and flushCache()
// is called on it to evict it from the cache.
unsigned char table[256]={103,198,105,115,81,255,74,236,41,205,186,171,242,251,227,70,124,194,84,248,27,232,231,141,118,90,46,99,51,159,201,154,102,50,13,183,49,88,163,90,37,93,5,23,88,233,94,212,171,178,205,198,155,180,84,17,14,130,116,65,33,61,220,135,112,233,62,161,65,225,252,103,62,1,126,151,234,220,107,150,143,56,92,42,236,176,59,251,50,175,60,84,236,24,219,92,2,26,254,67,251,250,170,58,251,41,209,230,5,60,124,148,117,216,190,97,137,249,92,187,168,153,15,149,177,235,241,179,5,239,247,0,233,161,58,229,202,11,203,208,72,71,100,189,31,35,30,168,28,123,100,197,20,115,90,197,94,75,121,99,59,112,100,36,17,158,9,220,170,212,172,242,27,16,175,59,51,205,227,80,72,71,21,92,187,111,34,25,186,155,125,245,11,225,26,28,127,35,248,41,248,164,27,19,181,202,78,232,152,50,56,224,121,77,61,52,188,95,78,119,250,203,108,5,172,134,33,43,170,26,85,162,190,112,181,115,59,4,92,211,54,148,179,175,226,240,228,158,79,50,21,73,253,130,78,169};
// Evict the cache line that contains the byte at `p` (x86 CLFLUSH).
//
// p: any valid, mapped address inside the line to flush.
//
// `static inline` is used instead of plain `inline`: a plain C99 `inline`
// definition emits no out-of-line copy, so the program fails to link whenever
// the compiler decides not to inline the call (e.g. at -O0).
// The target is passed as a "+m" memory operand so the compiler knows the
// instruction touches *p and does not move accesses to it across the flush.
// `__asm__` (rather than `asm`) also compiles under strict -std=c99/c11.
static inline void clflush(volatile void *p)
{
    __asm__ __volatile__ ("clflush %0" : "+m" (*(volatile char *)p));
}
// Read the CPU time-stamp counter, preceded by a serializing CPUID.
//
// Returns the 64-bit counter assembled from EDX:EAX.
//
// Changes relative to a bare "cpuid; rdtsc":
//  - EAX is loaded with 0 before CPUID, so the instruction always executes
//    the same leaf instead of whatever happened to be in the register.
//  - The "memory" clobber stops the compiler from moving loads/stores of the
//    code under test across the timestamp.
//  - `static inline` guarantees a definition exists even when the call is
//    not inlined (plain C99 `inline` would leave an undefined reference).
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__ ("cpuid\n\t"
                          "rdtsc"
                          : "=a" (lo), "=d" (hi)
                          : "a" (0)
                          : "ebx", "ecx", "memory");
    return (uint64_t)lo | ((uint64_t)hi << 32);
}
// Fill a[0..REPEAT-1] with bytes read from `table` at pseudo-random indices.
//
// a: destination buffer with room for at least REPEAT ints.
// Returns 0 (the original fell off the end of a non-void function, which
// leaves the return value undefined for any caller that reads it).
//
// `static inline` ensures the function has a definition even when the
// compiler does not inline the call.
static inline int func(int *a)
{
    for (int idx = 0; idx < REPEAT; idx++) {
        a[idx] = (int)table[rand() % 256];
    }
    return 0;
}
// Issue clflush on `start` and on every 64-byte step after it, up to and
// including start+256 (five flushes in total, in ascending address order).
// NOTE(review): the 64-byte stride presumably matches the cache-line size of
// the target CPU; confirm for the machine under test.
void flushCache(unsigned char *start)
{
    enum { STRIDE = 64, LAST_OFFSET = 256 };

    for (int offset = 0; offset <= LAST_OFFSET; offset += STRIDE) {
        clflush(start + offset);
    }
}
// Time one call to func() with rdtsc() and print the elapsed tick count.
//
// Fixes over the previous version:
//  - the post-timing loop started with `i-0` (a no-op expression) instead of
//    `i=0`; it only worked because i happened to be initialised to 0;
//  - the loop's result was never observed, so the compiler was free to drop
//    it together with the table loads feeding it; the values are now summed
//    into a volatile sink;
//  - the uint64_t tick delta was printed with %ld (signed long); it is now
//    cast to unsigned long long and printed with %llu;
//  - the unused local `c` is gone, and `static inline` guarantees a
//    definition when the call is not inlined.
static inline void test(void)
{
    uint64_t start, end;
    int temp[REPEAT];
    unsigned int checksum = 0;
    volatile unsigned int sink;

    start = rdtsc();
    func(temp);
    end = rdtsc();

    // Consume every value func() produced so its table reads are not dead.
    for (int idx = 0; idx < REPEAT; idx++) {
        checksum += (unsigned int)temp[idx];
    }
    sink = checksum;
    (void)sink;

    printf("%llu ticks\n", (unsigned long long)(end - start));
}
// Time one call to func(), then evict `table` with flushCache() and print
// the elapsed tick count.
//
// Fixes over the previous version:
//  - flushCache() used to sit between the two rdtsc() reads, so the cost of
//    the flush instructions themselves was counted as access time; it now
//    runs after the second timestamp;
//  - the old comment said "flush after every read", but the table is flushed
//    once, after all REPEAT reads have completed;
//  - the post-timing loop started with `i-0` instead of `i=0`, and its result
//    was never observed; the values are now summed into a volatile sink so
//    the table reads cannot be discarded as dead code;
//  - the uint64_t tick delta was printed with %ld; it is now cast to
//    unsigned long long and printed with %llu;
//  - the unused local `c` is gone, and `static inline` guarantees a
//    definition when the call is not inlined.
static inline void testflush(void)
{
    uint64_t start, end;
    int temp[REPEAT];
    unsigned int checksum = 0;
    volatile unsigned int sink;

    start = rdtsc();
    func(temp);
    end = rdtsc();

    // Evict the table once, outside the timed window.
    flushCache(table);

    // Consume every value func() produced so its table reads are not dead.
    for (int idx = 0; idx < REPEAT; idx++) {
        checksum += (unsigned int)temp[idx];
    }
    sink = checksum;
    (void)sink;

    printf("%llu ticks\n", (unsigned long long)(end - start));
}
// Driver: run test(), then testflush(), then test() again, printing a status
// line after each of the first two runs. Command-line arguments are ignored.
int main(int ac, char **av)
{
    (void)ac;
    (void)av;

    test();
    fputs("Tables in cache!\n", stdout);

    testflush();
    fputs("Tables evicted from cache.\n", stdout);

    test();
    return 0;
}
Update: I understand there might be some problem due to the table access. Here is another piece of code that evicts a single variable instead of the whole table. This one shows a significant increase in clock cycles when using `clflush()`. Does that mean `clflush()` is working properly and the increased time is due to accessing the variable from memory?
#include <stdint.h>
#include <stdio.h>
#define REPEAT 100000
// Evict the cache line that contains the byte at `p` (x86 CLFLUSH).
//
// p: any valid, mapped address inside the line to flush.
//
// `static inline` is used instead of plain `inline`: a plain C99 `inline`
// definition emits no out-of-line copy, so the program fails to link whenever
// the compiler decides not to inline the call (e.g. at -O0).
// The target is passed as a "+m" memory operand so the compiler knows the
// instruction touches *p and does not move accesses to it across the flush.
// `__asm__` (rather than `asm`) also compiles under strict -std=c99/c11.
static inline void clflush(volatile void *p)
{
    __asm__ __volatile__ ("clflush %0" : "+m" (*(volatile char *)p));
}
// Read the CPU time-stamp counter with LFENCE on both sides.
//
// Returns the 64-bit counter assembled from EDX:EAX.
//
// A bare RDTSC is not a serializing instruction, so the processor may execute
// it before earlier loads have completed or after later ones have started,
// which blurs the measurement of a single memory access. The leading LFENCE
// makes RDTSC wait for preceding instructions; the trailing LFENCE keeps
// following instructions from starting before the counter is read.
// NOTE(review): on AMD parts LFENCE is only dispatch-serializing when the
// corresponding MSR bit is set; confirm for the target machine.
// The "memory" clobber stops the compiler from moving memory accesses across
// the timestamp, and `static inline` guarantees a definition exists even when
// the call is not inlined.
static inline uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__ ("lfence\n\t"
                          "rdtsc\n\t"
                          "lfence"
                          : "=a" (lo), "=d" (hi)
                          :
                          : "memory");
    return (uint64_t)lo | ((uint64_t)hi << 32);
}
// Variable whose load is timed in test()/testflush() and whose cache line is
// flushed in testflush(). Declared volatile so every read of `i` written in
// the source is actually performed.
volatile int i;
// Time REPEAT individual reads of the global `i` and print the summed ticks.
//
// Each iteration brackets the single statement `j = i+1` with rdtsc() and
// adds the difference to a running total.
//
// Fixes over the previous version:
//  - `static inline` guarantees a definition when the call is not inlined;
//  - the uint64_t total was printed with %lu, which is only correct where
//    unsigned long is 64 bits wide; it is now cast to unsigned long long and
//    printed with %llu;
//  - the accumulator no longer shares its name with the standard clock()
//    function, and the set-but-unused local `k` is removed.
static inline void test(void)
{
    uint64_t total = 0;
    volatile int j;   // volatile so the store of i+1 is not optimised away

    for (long rep = 0; rep < REPEAT; rep++) {
        uint64_t start = rdtsc();
        j = i + 1;
        uint64_t end = rdtsc();
        total += end - start;
    }

    printf("took %llu ticks\n", (unsigned long long)total);
}
// Time REPEAT individual reads of the global `i`, flushing its cache line
// after every read, and print the summed ticks.
//
// Each iteration brackets the single statement `j = i+1` with rdtsc(); the
// clflush(&i) is issued after the second timestamp, so the flush instruction
// itself is not part of the measured interval.
// NOTE(review): no fence follows the clflush, so whether the eviction has
// completed before the next iteration's load is not established here; verify
// against the CPU's documented ordering rules.
//
// Fixes over the previous version:
//  - `static inline` guarantees a definition when the call is not inlined;
//  - the uint64_t total was printed with %lu, which is only correct where
//    unsigned long is 64 bits wide; it is now cast to unsigned long long and
//    printed with %llu;
//  - the accumulator no longer shares its name with the standard clock()
//    function, and the set-but-unused local `k` is removed.
static inline void testflush(void)
{
    uint64_t total = 0;
    volatile int j;   // volatile so the store of i+1 is not optimised away

    for (long rep = 0; rep < REPEAT; rep++) {
        uint64_t start = rdtsc();
        j = i + 1;
        uint64_t end = rdtsc();
        clflush(&i);
        total += end - start;
    }

    printf("took %llu ticks\n", (unsigned long long)total);
}
// Driver: set i to 5, then run test(), testflush() and test() again, writing
// a separator line before each run. Command-line arguments are ignored.
int main(int ac, char **av)
{
    static const char separator[] =
        "------------------------------------------\n";

    (void)ac;
    (void)av;

    i = 5;

    fputs(separator, stdout);
    test();

    fputs(separator, stdout);
    testflush();

    fputs(separator, stdout);
    test();

    return 0;
}
Upvotes: 3
Views: 9298
Reputation:
Some issues I see with the code.
You stop the timer in `testflush` after calling `clflush`. Therefore you are also timing the cycles needed to execute those instructions, which I don't think is intended.
In your `test` function you have a loop with 10000 iterations. Each iteration may reference one new cache line, but there are only 4 cache lines in `table`. So at least 9996 iterations cause no cache miss anyway.
Thus you are timing 10000 executions of `rand()%256` plus 4 cache loads. Even if a cache load takes a few hundred cycles, the 10000 executions of `rand()%256` will still overshadow that.
These 10000 integers generated must also be written back. I am not sure whether L1->L2 cache bandwidth would be a limiting factor, but it might be.
You also need to run the test a few thousand times and average the results; otherwise the sample variance is far too high.
It is also possible that the CPU speculatively prefetches the cache lines again before you request them. It is allowed to do so, but I don't know how clever current CPUs are.
Upvotes: 1