Reputation: 445
I have N NumPy arrays, each of shape data[n, m, 3]. I want to fit/squeeze/split/slice/reshape them into N' arrays of shape new_data_#[1000, m, 3], where # is the index of the new array. The problem is that n can be smaller or bigger than 1000. When it is smaller, I should fill the remaining capacity of the new array with rows from the next array; when it is bigger than 1000, I should create another new_data_# and put the remaining rows into that one. I don't know how to manage this. Here is some pseudo-code, but it cannot work as written; for example, the while loop may not be necessary. The output can be written to disk or returned in a new data format.
# Pseudo-code sketch from the question; it is not runnable Python
# ("for each", the missing colon after the signature and the "..." placeholder).
# Intent: copy rows of every input array into fixed-size blocks of 1000 rows.
def array2blocks(array_files)
for each N in array_files:
N = data = np.random.rand(n, m, 3)
# one output block with room for 1000 rows
new_data = np.zeros((1000, m, 3), dtype=np.float32)
j=0
index = 0
# NOTE(review): j is never changed in the visible lines, so this loop has no exit condition as written
while j <= new_data.shape[0]:
for i in range(data.shape[0]):
print("--->", data[i,:,:])
print (i)
# NOTE(review): "<=" also admits i == new_data.shape[0], which is one past the last valid row index
if i <= new_data.shape[0]:
# here first we should check the left capacity of new_data and then insert data into it
# new_data[i, :, :] = data[i, :, :] #this overrides previous items so not correct
print(new_data)
else:
print('n>1000')
# name for the next output block, numbered by index
new_data_name = 'new_data' + '_' + str(index)
# here fill rest of the data in the new_data
...
index += 1
#when capacity is full write it to the disk
print(new_data)
UPDATE with Aaron's old answer:
I replaced 1000 with batch_size = 5 to keep the example simple.
def numpyarrays2blocks(array_files, batch_size=5):
    """Re-pack a list of arrays into blocks of exactly ``batch_size`` rows.

    Rows are taken from the inputs in order. When an input array ends
    before a block is full, the block is topped up with rows from the
    next array; when an input has more rows than fit, the surplus spills
    into the following block. Only the very last block can be partially
    filled, and its unused rows are left as zeros.

    Args:
        array_files: Sequence of arrays of shape ``(n_i, ...)`` that agree
            on every dimension except the first. The sequence itself is
            not modified.
        batch_size: Number of rows per output block (default 5).

    Returns:
        A list of ``float32`` arrays of shape ``(batch_size, ...)``; an
        empty list when ``array_files`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the inputs
            cannot be concatenated along the first axis.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    arrays = [np.asarray(a) for a in array_files]
    if not arrays:
        return []

    # Joining everything first is what lets a short array be completed by
    # the rows of the next one instead of being padded on its own.
    rows = np.concatenate(arrays, axis=0)
    total = rows.shape[0]
    block_shape = (batch_size,) + rows.shape[1:]

    new_arrays = []
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)  # exclusive end of this block
        new_data = np.zeros(block_shape, dtype=np.float32)
        # stop - start is the number of rows available for this block;
        # anything beyond it stays zero (only possible in the last block).
        new_data[:stop - start] = rows[start:stop]
        new_arrays.append(new_data)
    return new_arrays
Upvotes: 2
Views: 706
Reputation: 114440
You can concatenate all your original arrays and then split the result:
# Placeholder: replace the ellipsis with the actual list of input arrays.
ars = ... # list of N arrays
# Join every array along the first axis into one long array.
ars = np.concatenate(ars, axis=0)
# Cut at rows 1000, 2000, ...; the final piece holds whatever rows remain,
# so it may be shorter than 1000.
ars = np.split(ars, np.arange(1000, ars.shape[0], 1000))
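The last line can be written as ars = np.split(ars, ars.shape[0] // 1000), but only if you're sure that the total number of rows is a multiple of 1000: an integer second argument to np.split is the number of equal sections (not the section size), and np.split will raise an error if the array cannot be divided evenly. Specifying explicit split points, as with np.arange, allows you to have a shorter final segment.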
Upvotes: 0
Reputation: 1368
- data is used to store the temporary result, and data_start is the index at which the next rows are inserted into data.
- Allocate data if it is None.
- yield data once it is fully filled.
- merge_and_split is a generator, so the memory demand should be low.
import random
from typing import Iterator
import numpy as np
def merge_and_split(arrays, batch_size) -> Iterator:
    """Lazily re-chunk ``arrays`` into blocks of ``batch_size`` rows.

    Rows are consumed from the inputs in order and copied into a block
    buffer. A block is yielded as soon as it is full; a final, partially
    filled block is yielded with its unused rows left as zeros. Because
    this is a generator, all checks below run on the first ``next()``.

    Args:
        arrays: Iterable of arrays that share every dimension except the
            first. It is materialised into a tuple up front.
        batch_size: Number of rows in each yielded block; must be >= 1.

    Yields:
        Arrays of shape ``(batch_size,) + arrays[0].shape[1:]`` with the
        dtype of the first input array (rows of later arrays are cast to
        that dtype on assignment).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        AssertionError: If the inputs disagree on their trailing shape.
    """
    # batch_size == 0 would never consume a row and yield empty blocks forever.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    arrays = tuple(arrays)
    if not arrays:
        # Nothing to merge: produce no blocks instead of failing on arrays[0].
        return

    dtype = arrays[0].dtype
    data_shape = (batch_size,) + arrays[0].shape[1:]
    # Raised explicitly (not via ``assert``) so the check survives
    # ``python -O`` while keeping the exception type callers may rely on.
    if any(a.shape[1:] != data_shape[1:] for a in arrays):
        raise AssertionError("Shape mismatch")

    data = None      # block currently being filled, None when none is open
    data_start = 0   # next free row in ``data``
    for src in arrays:
        src_index = 0             # next unread row in ``src``
        src_avail = src.shape[0]  # rows of ``src`` not yet copied
        while src_avail >= 1:
            if data is None:
                # allocate if None
                data = np.zeros(data_shape, dtype=dtype)
                data_start = 0
            # Copy as many rows as both the block and the source allow.
            num_moved = min(batch_size - data_start, src_avail)
            data[data_start:data_start + num_moved, ...] = src[src_index:src_index + num_moved, ...]
            data_start += num_moved
            src_index += num_moved
            src_avail -= num_moved
            if data_start >= batch_size:
                yield data
                data = None
    # Flush the last block if it was started but never filled.
    if data is not None:
        yield data
def input_arrays():
    """Build ten random integer test arrays of shape ``(rows, 4, 3)``.

    The row count of each array is drawn from a ``random.Random`` seeded
    with 13 and lies between 1 and 5 inclusive; the values are integers
    in ``[0, 10)`` drawn from NumPy's global random state.
    """
    count = 10
    row_rng = random.Random(13)
    generated = []
    for _ in range(count):
        rows = row_rng.randint(1, 5)
        generated.append(np.random.randint(0, 10, size=(rows, 4, 3)))
    return generated
def main():
    """Compare the output of merge_and_split with a plain concatenation.

    Prints the concatenated input shape with the total row count, the
    shape of the stacked output blocks, and whether the first
    ``total_rows`` output rows equal the concatenated input.
    """
    sources = input_arrays()
    blocks = list(merge_and_split(sources, 5))

    expected = np.vstack(sources)
    total_rows = sum(arr.shape[0] for arr in sources)
    print('concatenated', expected.shape, total_rows)

    merged = np.vstack(blocks)
    print(merged.shape)

    # Only the first total_rows rows are compared; rows after that in the
    # stacked output are not part of the input data.
    matches = merged[0:total_rows, ...] == expected
    print(matches.all())


if __name__ == '__main__':
    main()
Upvotes: 1