Accurate memory access time probing with RDTSC and RDTSCP?

Question

I'm trying to make an accurate measurement of memory access to different cache levels, and came up with this code for probing:

/*
 * Timed probe of one 8-byte load from the address passed in RSI (mem).
 *
 * Outputs: t1 (RDI) = 64-bit TSC value read before the load,
 *          t2 (RSI) = 64-bit TSC value read after the load.
 *
 * Each instruction string ends in "\n" so the assembler sees one
 * instruction per line.  RAX is listed as a clobber because
 * rdtsc/rdtscp/cpuid all write it and it is not an output operand.
 *
 * NOTE(review): rdtsc is not ordered against the load that follows it;
 * an lfence directly after rdtsc (and after rdtscp instead of cpuid) is
 * the usual way to fence the measured region -- confirm against the
 * Intel SDM before relying on the numbers.
 */
__asm__ __volatile__(
        /* clear the registers used for the timestamps */
        "xor %%eax, %%eax\n"
        "xor %%edi, %%edi\n"
        "xor %%edx, %%edx\n"
        /* time measurement: t1 = (EDX << 32) | EAX */
        "lfence\n"
        "rdtsc\n"
        "shl $32, %%rdx\n"
        "or %%rdx, %%rax\n"
        "movq %%rax, %%rdi\n"
        /* memory access */
        "movq (%%rsi), %%rbx\n"
        /* time measurement: t2 = (EDX << 32) | EAX; rdtscp also writes ECX */
        "rdtscp\n"
        "shl $32, %%rdx\n"
        "or %%rdx, %%rax\n"
        "movq %%rax, %%rsi\n"
        /* cpuid overwrites EAX, EBX, ECX and EDX, hence the clobbers below */
        "cpuid\n"
        : /* output operands */
        "=S"(t2), "=D"(t1)
        : /* input operands */
        "S" (mem)
        : /* clobber description */
        "rax", "rbx", "rcx", "rdx", "cc", "memory"
    );

However, the L1 and L2 cache accesses differ by just 8 cycles and the results fluctuate too much, so I decided to check how much impact the surrounding code (apart from the actual memory access) has on the timing:

    /*
     * Overhead-only variant of the timed probe: the load from (RSI) is
     * left out, so t2 - t1 covers just the instructions between the two
     * timestamp reads.
     *
     * Outputs: t1 (RDI) = 64-bit TSC value from rdtsc,
     *          t2 (RSI) = 64-bit TSC value from rdtscp.
     * mem is still passed in RSI but is not dereferenced here.
     *
     * Each instruction string ends in "\n" so the assembler sees one
     * instruction per line.  RAX is listed as a clobber because
     * rdtsc/rdtscp/cpuid all write it and it is not an output operand.
     */
    __asm__ __volatile__(
        /* clear the registers used for the timestamps */
        "xor %%eax, %%eax\n"
        "xor %%edi, %%edi\n"
        "xor %%edx, %%edx\n"
        /* time measurement: t1 = (EDX << 32) | EAX */
        "lfence\n"
        "rdtsc\n"
        "shl $32, %%rdx\n"
        "or %%rdx, %%rax\n"
        "movq %%rax, %%rdi\n"
        /* memory access (intentionally disabled for this measurement) */
        // "movq (%%rsi), %%rbx\n"
        /* time measurement: t2 = (EDX << 32) | EAX; rdtscp also writes ECX */
        "rdtscp\n"
        "shl $32, %%rdx\n"
        "or %%rdx, %%rax\n"
        "movq %%rax, %%rsi\n"
        /* cpuid overwrites EAX, EBX, ECX and EDX, hence the clobbers below */
        "cpuid\n"
        : /* output operands */
        "=S"(t2), "=D"(t1)
        : /* input operands */
        "S" (mem)
        : /* clobber description */
        "rax", "rbx", "rcx", "rdx", "cc", "memory"
    );

The results looked like this:

./cache_testing
From Memory: 42
From L3: 46
From L2: 40
From L1: 38

./cache_testing
From Memory: 40
From L3: 38
From L2: 36
From L1: 40

I'm aware that I don't hit the different cache levels on purpose at the moment, but I wonder why the timing fluctuates so much when the memory access is missing. The code is running as SCHED_FIFO with the highest priority, is pinned to one CPU, and shouldn't be descheduled while running. Can anybody tell me whether I can improve my code, and thereby the results, in any way?

Accurate memory access time probing with RDTSC and RDTSCP?

Answers (1)

Related Questions