Reputation: 415
I found an unintuitive phenomenon in our production environment, where the server always calls read(fd, buf, 16 * 1024). Measuring with strace -c, I found that the latency is 2µs when the read() system call returns 16KB, but 5µs when read() returns 10B. (NOTE: the server always reads 16KB per read(); the difference is on the client side. One client writes 16KB at a time and the other writes 10B.)
This is counter-intuitive to me: a read() or write() of 16KB should be slower than one of 10B, but I don't know much about the Linux kernel and network stack.
I wrote a minimal reproducible example.
The server-side code:
#include <assert.h>
#include <unistd.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
const int BUFSIZE = 16 * 1024;
int main() {
  // make a socket
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  assert(fd > 0);
  // bind to 0.0.0.0:7731
  struct sockaddr_in sock;
  memset(&sock, 0, sizeof(sock));
  sock.sin_family = AF_INET;
  sock.sin_addr.s_addr = INADDR_ANY;
  sock.sin_port = htons(7731);
  assert(bind(fd, (struct sockaddr*)&sock, sizeof(sock)) == 0);
  assert(listen(fd, 128) == 0);
  // accept a peer; socklen must be initialized before accept()
  struct sockaddr_in peer_sock;
  socklen_t socklen = sizeof(peer_sock);
  int peer_fd = accept(fd, (struct sockaddr*)&peer_sock, &socklen);
  assert(peer_fd > 0);
  // read & echo back; the checksum just forces us to touch every byte read
  char buf[BUFSIZE];
  int nread = 0;
  while (nread = read(peer_fd, buf, sizeof(buf)), nread > 0) {
    int sum = 0;
    for (int i = 0; i < nread; ++i) {
      sum += buf[i];
    }
    assert(write(peer_fd, buf, nread) == nread);
  }
  return 0;
}
The client-side code (in Go):
package main

import (
	"log"
	"net"
)

func main() {
	conn, err := net.Dial("tcp", "127.0.0.1:7731")
	if err != nil {
		log.Fatal(err)
	}
	// 16KB or 10B buf
	buf := make([]byte, 16*1024)
	readBuf := make([]byte, len(buf))
	for i := 0; i < 1000000; i++ {
		// write the full buffer, resuming after any short write
		nwrite := 0
		for nwrite != len(buf) {
			n, err := conn.Write(buf[nwrite:])
			if err != nil {
				log.Fatal(err)
			}
			nwrite += n
		}
		// read the echo back, accumulating until the full buffer arrives
		nread := 0
		for nread != len(buf) {
			n, err := conn.Read(readBuf[nread:])
			if err != nil {
				log.Fatal(err)
			}
			nread += n
		}
		log.Print(i)
	}
}
When the client writes 10B at a time (NOTE: the server still reads 16KB at a time, i.e. read(fd, buf, 16384)), the strace and strace -c output on the server side is:
$ strace ./server
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
read(4, "\0\0\0\0\0\0\0\0\0\0", 16384) = 10
write(4, "\0\0\0\0\0\0\0\0\0\0", 10) = 10
$ strace -c ./server
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 65.00    0.233234           9     24596           write
 35.00    0.125599           5     24598           read
  0.00    0.000000           0         2           open
  0.00    0.000000           0         2           close
  0.00    0.000000           0         2           fstat
  0.00    0.000000           0         5           mmap
  0.00    0.000000           0         4           mprotect
  0.00    0.000000           0         1           munmap
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         3         3 access
  0.00    0.000000           0         1           socket
  0.00    0.000000           0         1           accept
  0.00    0.000000           0         1           bind
  0.00    0.000000           0         1           listen
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00    0.358833                 49220         3 total
When the client writes 16KB at a time, the strace and strace -c output on the server side is:
$ strace ./server
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
write(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
read(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384) = 16384
$ strace -c ./server
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 61.26    0.063949           3     20692           write
 38.74    0.040441           2     20694           read
  0.00    0.000000           0         2           open
  0.00    0.000000           0         2           close
  0.00    0.000000           0         2           fstat
  0.00    0.000000           0         5           mmap
  0.00    0.000000           0         4           mprotect
  0.00    0.000000           0         1           munmap
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         3         3 access
  0.00    0.000000           0         1           socket
  0.00    0.000000           0         1           accept
  0.00    0.000000           0         1           bind
  0.00    0.000000           0         1           listen
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00    0.104390                 41412         3 total
Upvotes: 4
Views: 704
Reputation:
This problem is a cache invalidation issue.
When reading a 16 KiB buffer, the CPU requests a 16 KiB chunk of memory. During that high-latency memory access, more memory than requested (say another 48 KiB) is read into the cache, depending on the hardware cache size and the algorithms used. If the memory in that region then changes, the cached copy becomes invalid and must be reloaded from memory.
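As a quick sanity check of the cache geometry being discussed, you can query it from userspace. This is a sketch of my own, glibc/Linux specific: the _SC_LEVEL1_DCACHE_* sysconf names are glibc extensions and may return 0 or -1 on systems that don't expose them.
#include <stdio.h>
#include <unistd.h>

int main(void) {
    // glibc extensions; not portable beyond Linux/glibc
    long l1   = sysconf(_SC_LEVEL1_DCACHE_SIZE);
    long line = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
    printf("L1d cache: %ld bytes, line size: %ld bytes\n", l1, line);
    return 0;
}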
Background information:
This is the point where network speed comes into play:
The Ethernet Maximum Transmission Unit (MTU) is 1500 bytes; minus 40 bytes of TCP/IPv4 headers, that leaves 1460 bytes of payload per packet (1440 for IPv6, possibly less depending on options).
So a 10 KiB transmission takes 8 packets, and a 16 KiB transmission takes 12, as the sketch below confirms.
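A quick back-of-the-envelope check of those packet counts, a sketch of mine using the 1460-byte TCP/IPv4 payload figure from above and plain ceiling division:
#include <stdio.h>

int main(void) {
    const int payload = 1500 - 40;               // MTU minus TCP/IPv4 headers
    const int sizes[] = { 10 * 1024, 16 * 1024 };
    for (int i = 0; i < 2; ++i) {
        // ceiling division: full packets plus one partial packet if needed
        int packets = (sizes[i] + payload - 1) / payload;
        printf("%5d bytes -> %2d packets\n", sizes[i], packets);
    }
    return 0; // prints: 10240 -> 8 packets, 16384 -> 12 packets
}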
Let us take a look at the processing delay from the client side:
The time it takes to copy the data from memory through the network interface card (NIC) and out to the wire is nearly the same for both sizes, since the timing is set by network transmission times, assuming the data is continuously available.
On the receive end:
The data is copied by Direct Memory Access (DMA) into the kernel buffer area as it is received. Each DMA transfer invalidates the cached portion of the last memory read, requiring a slow reload of the cached data.
Since this cache invalidation occurs more frequently with the 10 KiB data, the process stalls on memory refreshes more often than with the 16 KiB data, leading to significantly more memory delays.
Upvotes: 0
Reputation: 548
Deep inside Linux, the following function is called when reading from a simple buffer. It is pretty straightforward: even if one wants to read more bytes than the buffer holds, that should not influence the time it takes to read. So reading 16k from a 10k buffer should take the same time as reading 10k from a 10k buffer.
/**
* simple_read_from_buffer - copy data from the buffer to user space
* @to: the user space buffer to read to
* @count: the maximum number of bytes to read
* @ppos: the current position in the buffer
* @from: the buffer to read from
* @available: the size of the buffer
*
* The simple_read_from_buffer() function reads up to @count bytes from the
* buffer @from at offset @ppos into the user space address starting at @to.
*
* On success, the number of bytes read is returned and the offset @ppos is
* advanced by this number, or negative value is returned on error.
**/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t ret;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available || !count)
                return 0;
        if (count > available - pos)
                count = available - pos;
        ret = copy_to_user(to, from + pos, count);
        if (ret == count)
                return -EFAULT;
        count -= ret;
        *ppos = pos + count;
        return count;
}
EXPORT_SYMBOL(simple_read_from_buffer);
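You can observe the same short-read semantics from userspace with a minimal sketch of my own (the file path is made up): asking read() for 16 KiB when only 10 bytes are available simply returns 10.
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    // a scratch file holding only 10 bytes (path is hypothetical)
    int fd = open("/tmp/ten_bytes", O_CREAT | O_TRUNC | O_RDWR, 0600);
    assert(fd > 0);
    assert(write(fd, "0123456789", 10) == 10);
    assert(lseek(fd, 0, SEEK_SET) == 0);

    char buf[16 * 1024];
    ssize_t n = read(fd, buf, sizeof(buf));             // ask for 16 KiB
    printf("asked for %zu, got %zd\n", sizeof(buf), n); // prints: got 10
    close(fd);
    return 0;
}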
So if we were to talk about simple buffers, you would be correct in your assumption, that reading 10k from a 16k buffer should take less time than reading 16k from the same 16k buffer.
BUT!!!!!!!!!
If we take a look at the socket file operations, we can see that sockets have the read_iter operation defined.
static const struct file_operations socket_file_ops = {
        .owner          = THIS_MODULE,
        .llseek         = no_llseek,
        .read_iter      = sock_read_iter,
        .write_iter     = sock_write_iter,
        .poll           = sock_poll,
        .unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = compat_sock_ioctl,
#endif
        .mmap           = sock_mmap,
        .release        = sock_close,
        .fasync         = sock_fasync,
        .sendpage       = sock_sendpage,
        .splice_write   = generic_splice_sendpage,
        .splice_read    = sock_splice_read,
        .show_fdinfo    = sock_show_fdinfo,
};
So when this function is called:
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t ret;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        ret = rw_verify_area(READ, file, pos, count);
        if (ret)
                return ret;
        if (count > MAX_RW_COUNT)
                count = MAX_RW_COUNT;

        if (file->f_op->read)
                ret = file->f_op->read(file, buf, count, pos);
        else if (file->f_op->read_iter)
                ret = new_sync_read(file, buf, count, pos);
        else
                ret = -EINVAL;
        if (ret > 0) {
                fsnotify_access(file);
                add_rchar(current, ret);
        }

        inc_syscr(current);
        return ret;
}
the new_sync_read is called instead of the read that would be called if we were talking about a regular buffer. At the end of the line, the file's read_iter is called, which was set to sock_read_iter in the struct above. That calls the sock_recvmsg function, which makes an INET call to inet_recvmsg, which goes down to the TCP implementation in tcp_recvmsg, which then calls other TCP functions that of course do a bunch of black magic involving flags like MSG_WAITALL, indicating that there is also some waiting involved.
The point is, you would be correct in your assumption if we were talking about "regular" buffers. But with TCP and networking, things get more complicated. And it is not hard to imagine that the simplest case for reading 16k from a socket is when a 16k message has already arriveded and is sitting in that socket; anything else gets more complicated.
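To illustrate that last point with a sketch of my own (using a Unix socketpair instead of TCP for brevity): a plain recv() of 16 KiB returns whatever already sits in the socket buffer, here 10 bytes, whereas MSG_WAITALL would keep waiting for the full 16 KiB.
#include <assert.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
    int sv[2];
    assert(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);

    // only 10 bytes are queued on the receiving side
    assert(write(sv[0], "0123456789", 10) == 10);

    char buf[16 * 1024];
    // no MSG_WAITALL: returns the 10 bytes that are already there
    ssize_t n = recv(sv[1], buf, sizeof(buf), 0);
    printf("recv returned %zd bytes\n", n); // prints 10

    close(sv[0]);
    close(sv[1]);
    return 0;
}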
PS: I am no expert; this was more or less the first time I dug into the Linux kernel implementation. Nevertheless, it was fun, but there could be errors in what I am saying.
Upvotes: 2