Reputation: 53
I have four numpy arrays, a, b, c, and d, each with about N=5 million rows. Their shapes are (N, 3) float32 for a and b, and (N, 4) uint8 for c and d.
I want to pack them into a single binary file, interleaved row by row in that order.
This is what I have currently.
from io import BytesIO

buffer = BytesIO()
for row in zip(a, b, c, d):
    buffer.write(row[0].tobytes())
    buffer.write(row[1].tobytes())
    buffer.write(row[2].tobytes())
    buffer.write(row[3].tobytes())

with open(output_path, "wb") as f:
    f.write(buffer.getvalue())
Is there a more time-efficient way to do this than looping over all N rows like I did?
Edit: The binary file is consumed from different languages (e.g. JS, C++), and I need to serialize it row by row (a[i], b[i], c[i], d[i]) to adhere to the file format (.splat).
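For illustration, this is roughly the record layout a reader in any language would see, and how it could be read back in Python with a structured dtype. The field shapes and dtypes here are assumptions (they match the benchmark in the accepted answer), and "output.splat" is a placeholder file name:

import numpy as np

# Sketch of a reader for the row-interleaved layout described above.
record = np.dtype([
    ("a", "<f4", (3,)),  # 12 bytes
    ("b", "<f4", (3,)),  # 12 bytes
    ("c", "u1", (4,)),   # 4 bytes
    ("d", "u1", (4,)),   # 4 bytes -> 32 bytes per row
])

rows = np.fromfile("output.splat", dtype=record)  # placeholder path
# rows["a"] has shape (N, 3), rows["c"] has shape (N, 4), and so on.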
Upvotes: 1
Views: 124
Reputation: 53
Following @ken's answer, I modified the code to make it more generic by accepting the arrays as keyword arguments:
def chunked_structured_save(save_path, chunk_size=100000, **kwargs):
    # Field order (and therefore the on-disk layout) follows the keyword order.
    arr = np.empty(
        chunk_size,
        dtype=np.dtype([(k, v.dtype, v.shape[1:]) for k, v in kwargs.items()]),
    )
    n_rows = len(list(kwargs.values())[0])
    with open(save_path, "wb") as f:
        for i in range(0, n_rows, chunk_size):
            n_elem = n_rows - i if (i + chunk_size) > n_rows else chunk_size
            for k, v in kwargs.items():
                arr[k][:n_elem] = v[i: i + chunk_size]
            arr[:n_elem].tofile(f)

chunked_structured_save(path, a=a, b=b, c=c, d=d)
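As a quick sanity check (a sketch, assuming the same a, b, c, d arrays and path as above), the file can be read back by rebuilding the same structured dtype and comparing:

import numpy as np

# Round-trip check: rebuild the dtype in the same keyword order and compare.
fields = dict(a=a, b=b, c=c, d=d)
dtype = np.dtype([(k, v.dtype, v.shape[1:]) for k, v in fields.items()])
loaded = np.fromfile(path, dtype=dtype)
for k, v in fields.items():
    assert np.array_equal(loaded[k], v)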
Upvotes: 1
Reputation: 3086
This is easy to implement using structured arrays:
def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)  # faster than f.write(arr.tobytes())
This is simple, but arr is a copy of a, b, c, and d, so it requires an additional 160 MB of memory (32 bytes per row, i.e. two float32 triples plus two uint8 quadruples, times 5 million rows). With such a large allocation, the impact on execution speed cannot be ignored.
To address this issue, we can do chunked writing.
def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i: i + chunk_size])
            arr["a"][:n_elem] = a[i: i + chunk_size]
            arr["b"][:n_elem] = b[i: i + chunk_size]
            arr["c"][:n_elem] = c[i: i + chunk_size]
            arr["d"][:n_elem] = d[i: i + chunk_size]
            arr[:n_elem].tofile(f)
It is a little more complicated, but it is superior in both memory usage and execution speed.
Benchmark:
import time
from io import BytesIO
from pathlib import Path

import numpy as np


def baseline(path, a, b, c, d):
    buffer = BytesIO()
    for row in zip(a, b, c, d):
        buffer.write(row[0].tobytes())
        buffer.write(row[1].tobytes())
        buffer.write(row[2].tobytes())
        buffer.write(row[3].tobytes())
    with open(path, "wb") as f:
        f.write(buffer.getvalue())


def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)


def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i: i + chunk_size])
            arr["a"][:n_elem] = a[i: i + chunk_size]
            arr["b"][:n_elem] = b[i: i + chunk_size]
            arr["c"][:n_elem] = c[i: i + chunk_size]
            arr["d"][:n_elem] = d[i: i + chunk_size]
            arr[:n_elem].tofile(f)


def main():
    n = 5_000_000
    a = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    b = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    c = np.arange(n * 4).reshape(n, 4).astype(np.uint8)
    d = np.arange(n * 4).reshape(n, 4).astype(np.uint8)

    candidates = [
        baseline,
        structured_save,
        chunked_structured_save,
    ]
    name_len = max(len(f.__name__) for f in candidates)

    path = Path("temp.bin")
    baseline(path, a, b, c, d)
    expected = path.read_bytes()

    start = time.perf_counter()
    np.savez(path, a=a, b=b, c=c, d=d)
    print(f"{'np.savez (reference)':{name_len}}: {time.perf_counter() - start}")

    for f in candidates:
        started = time.perf_counter()
        f(path, a, b, c, d)
        elapsed = time.perf_counter() - started
        print(f"{f.__name__:{name_len}}: {elapsed}")
        assert path.read_bytes() == expected, f"{f.__name__} failed"


if __name__ == "__main__":
    main()
Result:
np.savez (reference) : 0.8915687190001336
baseline : 5.309623991999615
structured_save : 0.2205286160005926
chunked_structured_save: 0.18220391599970753
Note that the result will vary greatly depending on the environment.
Here is the result on my other machine:
np.savez (reference) : 0.1376536000170745
baseline : 3.5804199000122026
structured_save : 0.4771533999883104
chunked_structured_save: 0.13709589999052696
It should also be noted that none of the above solutions take endianness into account.
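If the file must be little-endian regardless of the host (which cross-language readers typically assume), one option is to force the byte order of each field explicitly. A minimal sketch adapting structured_save above (the name structured_save_le is mine):

import numpy as np

def structured_save_le(path, a, b, c, d):
    # Same idea as structured_save, but every field dtype is forced to
    # little-endian, so the on-disk layout no longer depends on the host.
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype.newbyteorder("<"), a.shape[1:]),
                ("b", b.dtype.newbyteorder("<"), b.shape[1:]),
                ("c", c.dtype.newbyteorder("<"), c.shape[1:]),
                ("d", d.dtype.newbyteorder("<"), d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a  # NumPy byte-swaps on assignment when the orders differ
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)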
Upvotes: 2
Reputation: 37
You should use np.savez by default; it is fast and can use compression. But if you don't want that for some reason (e.g. you want to read the file from another language that cannot import Python functions), you should not serialize the arrays row by row. Instead, flatten each array and write them one after another, like this:
def arr_write(stream, arr):
    shape = np.array([len(arr.shape), *arr.shape], dtype='<i4')
    if arr.dtype == np.dtype('u1'): dtype = b' u1'
    elif arr.dtype == np.dtype('<f4'): dtype = b'<f4'
    else: raise ValueError(f'unknown dtype: {arr.dtype}')
    stream.write(dtype)            # 3-byte dtype tag
    stream.write(shape.tobytes())  # ndim followed by the shape, as int32
    stream.write(arr.ravel().tobytes())

arrays = [a, b, c, d]
with open(file, 'wb') as f:
    f.write(len(arrays).to_bytes(4, 'little'))  # optional
    for arr in arrays: arr_write(f, arr)
This way, all the basic information about the arrays can be inferred from the file.
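For completeness, here is a matching reader sketch; the helper name arr_read is mine, and it assumes the writer above was used with the optional array-count prefix:

import numpy as np

def arr_read(stream):
    # Mirror of arr_write: 3-byte dtype tag, little-endian int32 ndim,
    # then the shape (one int32 per dimension), then the raw C-ordered data.
    tag = stream.read(3)
    dtype = {b' u1': np.dtype('u1'), b'<f4': np.dtype('<f4')}[tag]
    ndim = int.from_bytes(stream.read(4), 'little')
    shape = np.frombuffer(stream.read(4 * ndim), dtype='<i4')
    n_bytes = int(np.prod(shape)) * dtype.itemsize
    data = np.frombuffer(stream.read(n_bytes), dtype=dtype)
    return data.reshape(tuple(shape))  # read-only view; .copy() if you need to write

with open(file, 'rb') as f:
    n_arrays = int.from_bytes(f.read(4), 'little')
    arrays = [arr_read(f) for _ in range(n_arrays)]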
Upvotes: 0