Reputation: 185
I want to measure the bandwidth of my DRAM during reading and writing.
Measurement method: repeatedly scan a 128MB array, accessing only the first 4 bytes of each cache line. Reading uses `add eax/ebx, [mem]`; writing uses `mov [mem], eax`.
The expected result: the bandwidth of writing is 1/2 of the bandwidth of reading, because the cache uses a write-back policy.
The actual result: the bandwidth of writing is only 1/20 of the bandwidth of reading.
Is there any problem with my measurement method? Is the bandwidth difference between reading and writing really that huge?
Code:
// File-scope assembly that defines the benchmark buffer `A`.
//
// It switches to the .bss section, requests an alignment of 64 (a byte count,
// assuming the assembler treats `.align` that way, as GAS does for x86 ELF),
// emits the label `A` followed by 128 * 1024 * 1024 zero bytes, and then
// switches back to .text. `A` is only ever referenced by name from the inline
// asm in read() and write(); no C++ declaration of it exists.
//
// NOTE(review): nothing writes to `A` before read() is first timed. On systems
// that back untouched .bss pages lazily, those loads may never reach DRAM,
// which would make the read figure incomparable to the write one -- confirm.
asm (R"(
.bss
.align 64
A: .zero 128 * 1024 * 1024
.text
)");
/// Makes one pass over the 128 MiB buffer `A`, loading 4 bytes at every
/// 64-byte step.
///
/// The loop body is unrolled 8x and the loaded values are added into four
/// different registers (eax/ebx/ecx/edx), so the adds form four separate
/// accumulation chains instead of one. The sums themselves are discarded;
/// only the memory traffic matters.
void read() {
    asm volatile (
        ".intel_syntax noprefix\n"
        "lea rdi, A[rip]\n"                       // rdi: cursor, starts at A
        "lea rsi, [rdi + 128 * 1024 * 1024]\n"    // rsi: one past the end of A
        ".align 64\n"                             // align the loop entry
        "1:\n"
        "add eax, [rdi + 64 * 0]\n"
        "add ebx, [rdi + 64 * 1]\n"
        "add ecx, [rdi + 64 * 2]\n"
        "add edx, [rdi + 64 * 3]\n"
        "add eax, [rdi + 64 * 4]\n"
        "add ebx, [rdi + 64 * 5]\n"
        "add ecx, [rdi + 64 * 6]\n"
        "add edx, [rdi + 64 * 7]\n"
        "add rdi, 64 * 8\n"                       // advance by the 8 steps just read
        "cmp rdi, rsi\n"
        "jne 1b\n"
        ".att_syntax\n"                           // restore the compiler's default dialect
        : /* no outputs */
        : /* no inputs */
        // The asm reads memory that is not described by any operand and
        // modifies the flags, so both are declared alongside the registers.
        : "rdi", "rsi", "eax", "ebx", "ecx", "edx", "cc", "memory"
    );
}
/// Makes one pass over the 128 MiB buffer `A`, storing 4 bytes at every
/// 64-byte step.
///
/// The loop body is unrolled 8x. The value stored is whatever eax happens to
/// hold on entry (the asm never sets it); only the store traffic matters.
void write() {
    asm volatile (
        ".intel_syntax noprefix\n"
        "lea rdi, A[rip]\n"                       // rdi: cursor, starts at A
        "lea rsi, [rdi + 128 * 1024 * 1024]\n"    // rsi: one past the end of A
        ".align 64\n"                             // align the loop entry
        "1:\n"
        // Every store is a 4-byte `eax` store so all eight steps are uniform
        // (the first one previously stored only the single byte `al`).
        "mov [rdi + 64 * 0], eax\n"
        "mov [rdi + 64 * 1], eax\n"
        "mov [rdi + 64 * 2], eax\n"
        "mov [rdi + 64 * 3], eax\n"
        "mov [rdi + 64 * 4], eax\n"
        "mov [rdi + 64 * 5], eax\n"
        "mov [rdi + 64 * 6], eax\n"
        "mov [rdi + 64 * 7], eax\n"
        "add rdi, 64 * 8\n"                       // advance by the 8 steps just written
        "cmp rdi, rsi\n"
        "jne 1b\n"
        ".att_syntax\n"                           // restore the compiler's default dialect
        : /* no outputs */
        : /* no inputs */
        // The asm writes memory that is not described by any operand and
        // modifies the flags, so both are declared alongside the registers.
        : "rdi", "rsi", "cc", "memory"
    );
}
#include <chrono>
#include <iostream>
using namespace std;
/// Times `f` and prints the total wall time of the measured runs.
///
/// Calls `f` 20 times untimed as warm-up, then 200 times between two
/// steady_clock readings, and writes "<name>: <elapsed> ms" to std::cout.
/// The printed figure is the total for all 200 timed calls, not a per-call
/// average.
///
/// @param name  Label printed in front of the result.
/// @param f     Function to time; invoked 220 times in total.
void measure(const char* name, void (*f)()) {
    constexpr int kWarmupRuns = 20;
    constexpr int kTimedRuns = 200;

    for (int i = 0; i < kWarmupRuns; ++i) f();

    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < kTimedRuns; ++i) f();
    const auto end = std::chrono::steady_clock::now();

    // Convert through a floating-point millisecond duration instead of
    // scaling the raw count(): the clock's tick period is implementation-
    // defined, so count() is not guaranteed to be in nanoseconds.
    const std::chrono::duration<double, std::milli> elapsed = end - start;

    // std::endl keeps the flush so each result appears before the next
    // (long-running) measurement starts.
    std::cout << name << ": " << elapsed.count() << " ms" << std::endl;
}
// Entry point: times the load kernel first, then the store kernel, both over
// the same buffer.
// NOTE(review): the load kernel is timed before anything has written the
// buffer; whether that affects what the loads actually measure depends on how
// the OS backs untouched .bss -- confirm.
int main() {
    struct Benchmark {
        const char* label;
        void (*kernel)();
    };
    const Benchmark benchmarks[] = {
        {"reading", read},
        {"writing", write},
    };
    for (const Benchmark& b : benchmarks) {
        measure(b.label, b.kernel);
    }
    return 0;
}
My output:
reading: 113.529 ms
writing: 2254.57 ms
Upvotes: 2
Views: 149