user2054534
user2054534

Reputation: 181

Finding cache performance

I have the following program, cache.c:

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

/* Benchmark parameters. Sizes and strides are counted in words, i.e. elements of x[]. */
#define CACHE_MIN (1024) /* smallest array size swept by main (in words) */
#define CACHE_MAX (1024*1024) /* largest array size swept (in words); also the length of x[] */
#define STRIDE_MIN 1 /* smallest stride between accessed elements (in words) */
#define STRIDE_MAX 128 /* largest stride (in words) */
#define SAMPLE 10 /* repetition factor for each timed pass, to get a larger time sample */
#define CLK_TCK 60 /* assumed clock ticks per second for times() values; NOTE(review): hard-coded guess - the system's real rate is available from sysconf(_SC_CLK_TCK) */
int x[CACHE_MAX]; /* the array the benchmark strides through */

/*
 * Return the user CPU time consumed by this process so far, in seconds.
 *
 * times() reports CPU time in clock ticks; the tick rate is system-specific,
 * so it is queried at run time with sysconf(_SC_CLK_TCK) instead of being
 * assumed.  The rate is looked up once and cached.
 *
 * Returns 0.0 if times() itself fails.
 */
double get_seconds (void) {
  static long ticks_per_sec = 0; /* cached tick rate; 0 means "not queried yet" */
  struct tms rusage;

  if (ticks_per_sec <= 0) {
    ticks_per_sec = sysconf (_SC_CLK_TCK);
    if (ticks_per_sec <= 0)
      ticks_per_sec = 60; /* query failed: keep the rate this file used to hard-code */
  }

  if (times (&rusage) == (clock_t) -1) /* on failure rusage is not filled in */
    return 0.0;

  return (double) rusage.tms_utime / (double) ticks_per_sec;
}

/*
 * Strided memory-access benchmark.
 *
 * For every array size from CACHE_MIN to CACHE_MAX words (doubling each time)
 * and every stride from STRIDE_MIN to STRIDE_MAX words (doubling each time):
 *   1. repeat a read-modify-write sweep over x[] at that stride until at
 *      least one second has accumulated according to get_seconds();
 *   2. run the same loop nest the same number of times without touching
 *      x[], and subtract that time as loop overhead;
 *   3. print the remaining time divided by the number of accesses, in ns.
 *
 * Always returns 0.
 */
int main (void) {
  register int i, index, stride, limit;
  register int temp = 0; /* dummy accumulator; initialized so the overhead loop never reads an indeterminate value */
  int steps, tsteps, csize;
  double sec0, sec; /* timing variables */
  double accesses;  /* total element accesses timed for one (size, stride) pair */

  for (csize = CACHE_MIN; csize <= CACHE_MAX; csize = csize * 2)
    for (stride = STRIDE_MIN; stride <= STRIDE_MAX; stride = stride * 2) {
      sec = 0; /* initialize timer */
      limit = csize - stride + 1; /* cache size this loop */
      steps = 0;

      do { /* repeat until collect 1 second */
        sec0 = get_seconds (); /* start timer */
        for (i = SAMPLE * stride; i != 0; i = i - 1) /* larger sample */
          for (index = 0; index < limit; index = index + stride)
            x[index] = x[index] + 1; /* cache access */
        steps = steps + 1; /* count while loop iterations */
        sec = sec + (get_seconds () - sec0); /* end timer */
      } while (sec < 1.0); /* until collect 1 second */

      /* Repeat the same loops without the array access to subtract loop overhead.
         Note: an optimizing compiler may remove this loop nest entirely, since
         temp is never used afterwards; check the generated code if in doubt. */
      tsteps = 0; /* used to match number of while iterations */
      do { /* repeat until same number of iterations as above */
        sec0 = get_seconds (); /* start timer */
        for (i = SAMPLE * stride; i != 0; i = i - 1) /* larger sample */
          for (index = 0; index < limit; index = index + stride)
            temp = temp + index; /* dummy code */
        tsteps = tsteps + 1; /* count while iterations */
        sec = sec - (get_seconds () - sec0); /* - overhead */
      } while (tsteps < steps); /* until equal to number of iterations */

      /* Count the accesses in double: the product is on the order of 1e9 and
         can exceed INT_MAX when computed in int. */
      accesses = (double) steps * SAMPLE * stride * ((limit - 1) / stride + 1);

      if (stride == STRIDE_MIN)
        printf ("\n"); /* extra line to separate array sizes */

      /* csize * sizeof (int) has type size_t, so it must be printed with %zu. */
      printf ("Size(bytes): %7zu Stride(bytes): %4zu read+write: %4.0f ns\n",
              csize * sizeof (int), stride * sizeof (int),
              sec * 1e9 / accesses);
    } /* end of both outer for loops */

  return 0;
}

When run, I get this output

Size(bytes):    4096 Stride(bytes):    4 read+write:    1 ns
Size(bytes):    4096 Stride(bytes):    8 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):   16 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):   32 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):   64 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):  128 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):  256 read+write:    0 ns
Size(bytes):    4096 Stride(bytes):  512 read+write:    0 ns

Size(bytes):    8192 Stride(bytes):    4 read+write:    0 ns
Size(bytes):    8192 Stride(bytes):    8 read+write:    1 ns
Size(bytes):    8192 Stride(bytes):   16 read+write:    0 ns
Size(bytes):    8192 Stride(bytes):   32 read+write:    1 ns
Size(bytes):    8192 Stride(bytes):   64 read+write:    0 ns
Size(bytes):    8192 Stride(bytes):  128 read+write:    0 ns
Size(bytes):    8192 Stride(bytes):  256 read+write:    1 ns
Size(bytes):    8192 Stride(bytes):  512 read+write:    0 ns

Size(bytes):   16384 Stride(bytes):    4 read+write:    1 ns
Size(bytes):   16384 Stride(bytes):    8 read+write:    1 ns
Size(bytes):   16384 Stride(bytes):   16 read+write:    1 ns
Size(bytes):   16384 Stride(bytes):   32 read+write:    0 ns
Size(bytes):   16384 Stride(bytes):   64 read+write:    1 ns
Size(bytes):   16384 Stride(bytes):  128 read+write:    0 ns
Size(bytes):   16384 Stride(bytes):  256 read+write:    0 ns
Size(bytes):   16384 Stride(bytes):  512 read+write:    0 ns

Size(bytes):   32768 Stride(bytes):    4 read+write:    1 ns
Size(bytes):   32768 Stride(bytes):    8 read+write:    1 ns
Size(bytes):   32768 Stride(bytes):   16 read+write:    0 ns
Size(bytes):   32768 Stride(bytes):   32 read+write:    0 ns
Size(bytes):   32768 Stride(bytes):   64 read+write:    1 ns
Size(bytes):   32768 Stride(bytes):  128 read+write:    0 ns
Size(bytes):   32768 Stride(bytes):  256 read+write:    0 ns
Size(bytes):   32768 Stride(bytes):  512 read+write:    0 ns

Size(bytes):   65536 Stride(bytes):    4 read+write:    0 ns
Size(bytes):   65536 Stride(bytes):    8 read+write:    0 ns
Size(bytes):   65536 Stride(bytes):   16 read+write:    1 ns
Size(bytes):   65536 Stride(bytes):   32 read+write:    1 ns
Size(bytes):   65536 Stride(bytes):   64 read+write:    2 ns
Size(bytes):   65536 Stride(bytes):  128 read+write:    2 ns
Size(bytes):   65536 Stride(bytes):  256 read+write:    1 ns
Size(bytes):   65536 Stride(bytes):  512 read+write:    1 ns

Size(bytes):  131072 Stride(bytes):    4 read+write:    0 ns
Size(bytes):  131072 Stride(bytes):    8 read+write:    0 ns
Size(bytes):  131072 Stride(bytes):   16 read+write:    0 ns
Size(bytes):  131072 Stride(bytes):   32 read+write:    1 ns
Size(bytes):  131072 Stride(bytes):   64 read+write:    2 ns
Size(bytes):  131072 Stride(bytes):  128 read+write:    2 ns
Size(bytes):  131072 Stride(bytes):  256 read+write:    2 ns
Size(bytes):  131072 Stride(bytes):  512 read+write:    1 ns

Size(bytes):  262144 Stride(bytes):    4 read+write:    0 ns
Size(bytes):  262144 Stride(bytes):    8 read+write:    0 ns
Size(bytes):  262144 Stride(bytes):   16 read+write:    0 ns
Size(bytes):  262144 Stride(bytes):   32 read+write:    1 ns
Size(bytes):  262144 Stride(bytes):   64 read+write:    2 ns
Size(bytes):  262144 Stride(bytes):  128 read+write:    2 ns
Size(bytes):  262144 Stride(bytes):  256 read+write:    2 ns
Size(bytes):  262144 Stride(bytes):  512 read+write:    1 ns

Size(bytes):  524288 Stride(bytes):    4 read+write:    0 ns
Size(bytes):  524288 Stride(bytes):    8 read+write:    0 ns
Size(bytes):  524288 Stride(bytes):   16 read+write:    1 ns
Size(bytes):  524288 Stride(bytes):   32 read+write:    1 ns
Size(bytes):  524288 Stride(bytes):   64 read+write:    3 ns
Size(bytes):  524288 Stride(bytes):  128 read+write:    3 ns
Size(bytes):  524288 Stride(bytes):  256 read+write:    3 ns
Size(bytes):  524288 Stride(bytes):  512 read+write:    3 ns

Size(bytes): 1048576 Stride(bytes):    4 read+write:    1 ns
Size(bytes): 1048576 Stride(bytes):    8 read+write:    1 ns
Size(bytes): 1048576 Stride(bytes):   16 read+write:    1 ns
Size(bytes): 1048576 Stride(bytes):   32 read+write:    1 ns
Size(bytes): 1048576 Stride(bytes):   64 read+write:    3 ns
Size(bytes): 1048576 Stride(bytes):  128 read+write:    3 ns
Size(bytes): 1048576 Stride(bytes):  256 read+write:    3 ns
Size(bytes): 1048576 Stride(bytes):  512 read+write:    3 ns

Size(bytes): 2097152 Stride(bytes):    4 read+write:    1 ns
Size(bytes): 2097152 Stride(bytes):    8 read+write:    1 ns
Size(bytes): 2097152 Stride(bytes):   16 read+write:    1 ns
Size(bytes): 2097152 Stride(bytes):   32 read+write:    1 ns
Size(bytes): 2097152 Stride(bytes):   64 read+write:    3 ns
Size(bytes): 2097152 Stride(bytes):  128 read+write:    3 ns
Size(bytes): 2097152 Stride(bytes):  256 read+write:    3 ns
Size(bytes): 2097152 Stride(bytes):  512 read+write:    3 ns

Size(bytes): 4194304 Stride(bytes):    4 read+write:    1 ns
Size(bytes): 4194304 Stride(bytes):    8 read+write:    1 ns
Size(bytes): 4194304 Stride(bytes):   16 read+write:    1 ns
Size(bytes): 4194304 Stride(bytes):   32 read+write:    2 ns
Size(bytes): 4194304 Stride(bytes):   64 read+write:    3 ns
Size(bytes): 4194304 Stride(bytes):  128 read+write:    3 ns
Size(bytes): 4194304 Stride(bytes):  256 read+write:    3 ns
Size(bytes): 4194304 Stride(bytes):  512 read+write:    3 ns

Now I'm trying to determine how long a cache hit and a cache miss take, as well as the size and the block size of the first-level cache.

Aren't the first-level cache size and block size both just 4 KB? I'm not sure how to find the speed, though. Any ideas?

Upvotes: 1

Views: 298

Answers (1)

ZarathustrA
ZarathustrA

Reputation: 3640

  1. Use thread affinity to pin your measurement thread to a single, specific core. This eliminates the effect of thread migration between processor cores, which leads to wrong results.
  2. Use the time stamp counter and measure the overhead in CPU cycles. It is the most fine-grained timer available on x86 CPUs.
  3. Do not forget to subtract the cost of the time measurement itself from the measured results.
  4. Inspect the code generated by your compiler in a disassembler to ensure that the compiler introduced no unwanted optimisation (for example, keeping a variable in a CPU register instead of in memory).

The sizes of CPU caches and cache lines depend heavily on the particular CPU model and can vary significantly. Check the documentation for the CPU that you use.

Upvotes: 1

Related Questions