Reputation: 53
I have four numpy arrays, a, b, c, and d, each with about N=5 million rows. Their shapes are (N, 3) float32 for a and b, and (N, 4) uint8 for c and d.
I want to pack them into a single binary file, interleaved row by row in that order.
This is what I have currently.
from io import BytesIO

buffer = BytesIO()
for row in zip(a, b, c, d):
    buffer.write(row[0].tobytes())
    buffer.write(row[1].tobytes())
    buffer.write(row[2].tobytes())
    buffer.write(row[3].tobytes())

with open(output_path, "wb") as f:
    f.write(buffer.getvalue())
Is there a more time-efficient way to do this than looping over all N rows like I did?
Edit: The binary file is consumed from different languages (e.g. JS, C++), and I need to serialize it row by row (a[i], b[i], c[i], d[i]) to adhere to the file format (.splat).
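For illustration, this is roughly the record layout a reader in any language would see, and how it could be read back in Python with a structured dtype. The field shapes and dtypes here are assumptions (they match the benchmark in the accepted answer), and "output.splat" is a placeholder file name:

import numpy as np

# Sketch of a reader for the row-interleaved layout described above.
record = np.dtype([
    ("a", "<f4", (3,)),  # 12 bytes
    ("b", "<f4", (3,)),  # 12 bytes
    ("c", "u1", (4,)),   # 4 bytes
    ("d", "u1", (4,)),   # 4 bytes -> 32 bytes per row
])

rows = np.fromfile("output.splat", dtype=record)  # placeholder path
# rows["a"] has shape (N, 3), rows["c"] has shape (N, 4), and so on.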
Upvotes: 1
Views: 124
Reputation: 53
Following @ken's answer, I modified the code to make it more generic by accepting the arrays as keyword arguments:
def chunked_structured_save(save_path, chunk_size=100000, **kwargs):
    # Field order (and therefore the on-disk layout) follows the keyword order.
    arr = np.empty(
        chunk_size,
        dtype=np.dtype([(k, v.dtype, v.shape[1:]) for k, v in kwargs.items()]),
    )
    n_rows = len(list(kwargs.values())[0])
    with open(save_path, "wb") as f:
        for i in range(0, n_rows, chunk_size):
            n_elem = n_rows - i if (i + chunk_size) > n_rows else chunk_size
            for k, v in kwargs.items():
                arr[k][:n_elem] = v[i: i + chunk_size]
            arr[:n_elem].tofile(f)

chunked_structured_save(path, a=a, b=b, c=c, d=d)
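As a quick sanity check (a sketch, assuming the same a, b, c, d arrays and path as above), the file can be read back by rebuilding the same structured dtype and comparing:

import numpy as np

# Round-trip check: rebuild the dtype in the same keyword order and compare.
fields = dict(a=a, b=b, c=c, d=d)
dtype = np.dtype([(k, v.dtype, v.shape[1:]) for k, v in fields.items()])
loaded = np.fromfile(path, dtype=dtype)
for k, v in fields.items():
    assert np.array_equal(loaded[k], v)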
Upvotes: 1
Reputation: 3086
This is easy to implement using structured arrays:
def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)  # faster than f.write(arr.tobytes())
This is simple, but arr is a copy of a, b, c, and d, so it requires an additional 160 MB of memory (32 bytes per row, i.e. two float32 triples plus two uint8 quadruples, times 5 million rows). With such a large allocation, the impact on execution speed cannot be ignored.
To address this issue, we can do chunked writing.
def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i: i + chunk_size])
            arr["a"][:n_elem] = a[i: i + chunk_size]
            arr["b"][:n_elem] = b[i: i + chunk_size]
            arr["c"][:n_elem] = c[i: i + chunk_size]
            arr["d"][:n_elem] = d[i: i + chunk_size]
            arr[:n_elem].tofile(f)
It is a little more complicated, but it is superior in both memory usage and execution speed.
Benchmark:
import time
from io import BytesIO
from pathlib import Path

import numpy as np


def baseline(path, a, b, c, d):
    buffer = BytesIO()
    for row in zip(a, b, c, d):
        buffer.write(row[0].tobytes())
        buffer.write(row[1].tobytes())
        buffer.write(row[2].tobytes())
        buffer.write(row[3].tobytes())
    with open(path, "wb") as f:
        f.write(buffer.getvalue())


def structured_save(path, a, b, c, d):
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)


def chunked_structured_save(path, a, b, c, d, chunk_size=100_000):
    arr = np.empty(
        chunk_size,
        dtype=np.dtype(
            [
                ("a", a.dtype, a.shape[1:]),
                ("b", b.dtype, b.shape[1:]),
                ("c", c.dtype, c.shape[1:]),
                ("d", d.dtype, d.shape[1:]),
            ]
        ),
    )
    with open(path, "wb") as f:
        for i in range(0, len(a), chunk_size):
            n_elem = len(a[i: i + chunk_size])
            arr["a"][:n_elem] = a[i: i + chunk_size]
            arr["b"][:n_elem] = b[i: i + chunk_size]
            arr["c"][:n_elem] = c[i: i + chunk_size]
            arr["d"][:n_elem] = d[i: i + chunk_size]
            arr[:n_elem].tofile(f)


def main():
    n = 5_000_000
    a = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    b = np.arange(n * 3).reshape(n, 3).astype(np.float32)
    c = np.arange(n * 4).reshape(n, 4).astype(np.uint8)
    d = np.arange(n * 4).reshape(n, 4).astype(np.uint8)

    candidates = [
        baseline,
        structured_save,
        chunked_structured_save,
    ]
    name_len = max(len(f.__name__) for f in candidates)

    path = Path("temp.bin")
    baseline(path, a, b, c, d)
    expected = path.read_bytes()

    start = time.perf_counter()
    np.savez(path, a=a, b=b, c=c, d=d)
    print(f"{'np.savez (reference)':{name_len}}: {time.perf_counter() - start}")

    for f in candidates:
        started = time.perf_counter()
        f(path, a, b, c, d)
        elapsed = time.perf_counter() - started
        print(f"{f.__name__:{name_len}}: {elapsed}")
        assert path.read_bytes() == expected, f"{f.__name__} failed"


if __name__ == "__main__":
    main()
Result:
np.savez (reference) : 0.8915687190001336
baseline : 5.309623991999615
structured_save : 0.2205286160005926
chunked_structured_save: 0.18220391599970753
Note that the result will vary greatly depending on the environment.
Here is the result on my other machine:
np.savez (reference) : 0.1376536000170745
baseline : 3.5804199000122026
structured_save : 0.4771533999883104
chunked_structured_save: 0.13709589999052696
It should also be noted that none of the above solutions take endianness into account.
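If the file must be little-endian regardless of the host (which cross-language readers typically assume), one option is to force the byte order of each field explicitly. A minimal sketch adapting structured_save above (the name structured_save_le is mine):

import numpy as np

def structured_save_le(path, a, b, c, d):
    # Same idea as structured_save, but every field dtype is forced to
    # little-endian, so the on-disk layout no longer depends on the host.
    arr = np.empty(
        len(a),
        dtype=np.dtype(
            [
                ("a", a.dtype.newbyteorder("<"), a.shape[1:]),
                ("b", b.dtype.newbyteorder("<"), b.shape[1:]),
                ("c", c.dtype.newbyteorder("<"), c.shape[1:]),
                ("d", d.dtype.newbyteorder("<"), d.shape[1:]),
            ]
        ),
    )
    arr["a"] = a  # NumPy byte-swaps on assignment when the orders differ
    arr["b"] = b
    arr["c"] = c
    arr["d"] = d
    arr.tofile(path)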
Upvotes: 2
Reputation: 37
You should use np.savez by default; it is fast and can use compression. But if you don't want that for some reason (e.g. you want to read the file from another language that cannot import Python functions), you should not serialize the arrays row by row. Instead, flatten each array and write them one after another, like this:
def arr_write(stream, arr):
    shape = np.array([len(arr.shape), *arr.shape], dtype='<i4')
    if arr.dtype == np.dtype('u1'): dtype = b' u1'
    elif arr.dtype == np.dtype('<f4'): dtype = b'<f4'
    else: raise ValueError(f'unknown dtype: {arr.dtype}')
    stream.write(dtype)            # 3-byte dtype tag
    stream.write(shape.tobytes())  # ndim followed by the shape, as int32
    stream.write(arr.ravel().tobytes())

arrays = [a, b, c, d]
with open(file, 'wb') as f:
    f.write(len(arrays).to_bytes(4, 'little'))  # optional
    for arr in arrays: arr_write(f, arr)
This way, all the basic information about the arrays can be inferred from the file.
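For completeness, here is a matching reader sketch; the helper name arr_read is mine, and it assumes the writer above was used with the optional array-count prefix:

import numpy as np

def arr_read(stream):
    # Mirror of arr_write: 3-byte dtype tag, little-endian int32 ndim,
    # then the shape (one int32 per dimension), then the raw C-ordered data.
    tag = stream.read(3)
    dtype = {b' u1': np.dtype('u1'), b'<f4': np.dtype('<f4')}[tag]
    ndim = int.from_bytes(stream.read(4), 'little')
    shape = np.frombuffer(stream.read(4 * ndim), dtype='<i4')
    n_bytes = int(np.prod(shape)) * dtype.itemsize
    data = np.frombuffer(stream.read(n_bytes), dtype=dtype)
    return data.reshape(tuple(shape))  # read-only view; .copy() if you need to write

with open(file, 'rb') as f:
    n_arrays = int.from_bytes(f.read(4), 'little')
    arrays = [arr_read(f) for _ in range(n_arrays)]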
Upvotes: 0