Bruce
Bruce

Reputation: 445

Generate equal size batches from N numpy arrays

I have N NumPy arrays of shape data[n,m,3]. I want to fit/squeeze/split/slice/reshape them into N' arrays of shape new_data_#[1000,m,3], where # is the index of the new arrays. The problem is that n can be smaller or bigger than 1000. When it is smaller, I should somehow fill the rest of the 1000-row capacity of new_array with rows from the next array, and when it is bigger than 1000 I should create a new new_data_# and add the remainder to that one. I don't know how to manage this. Here is some pseudo-code, but it can't be done this way; for example, the while loop may not be necessary. The output can be written to disk or returned in a new data format.

# NOTE(review): this is the asker's pseudo-code, intentionally not runnable
# (missing ':' after the def, bare '...'); kept verbatim for context.
def array2blocks(array_files)
 for each N in array_files:
    N = data = np.random.rand(n, m, 3)
    new_data = np.zeros((1000, m, 3), dtype=np.float32)
    j=0
    index = 0
    # NOTE(review): `j` is never updated below, so this while would loop
    # forever in real code — presumably it stands for "until new_data is full".
    while j <= new_data.shape[0]:
        for i in range(data.shape[0]):
            print("--->", data[i,:,:])
            print (i)
            if i <= new_data.shape[0]:
                # here first we should check the left capacity of new_data and then insert data into it
                # new_data[i, :, :] = data[i, :, :] #this overrides previous items so not correct
                print(new_data)
            else:
                print('n>1000')
                new_data_name = 'new_data' + '_' + str(index)
                # here fill rest of the data in the new_data
                ...
                index += 1
            #when capacity is full write it to the disk
    print(new_data)

UPDATE with Aaron's old answer: I replaced 1000 with batch_size = 5 to make it simple.

def numpyarrays2blocks(array_files):
    """Re-batch the rows of several (n, m, 3) arrays into (batch_size, m, 3) blocks.

    Rows from consecutive arrays are merged in order: a block that is only
    partially filled by one array is topped up with rows from the next one.
    (The previous version reset ``new_arrays`` inside the per-array loop,
    discarding earlier results, and never carried a partial block over to
    the next array.)  The final block is zero-padded when the total row
    count is not a multiple of ``batch_size``.

    Parameters
    ----------
    array_files : list of np.ndarray
        Input arrays of shape (n, m, 3).  Three demo arrays are appended
        in place, as in the original code — note the argument is mutated.

    Returns
    -------
    list of np.ndarray
        The (batch_size, m, 3) float32 blocks, in row order.
    """
    batch_size = 5  # was 1000 in the question; kept small for the demo
    # Demo inputs, appended to the caller's list as in the original code.
    N1 = np.random.rand(7, 4, 3)
    N2 = np.random.rand(7, 4, 3)
    N3 = np.random.rand(4, 4, 3)
    array_files.append(N1)
    array_files.append(N2)
    array_files.append(N3)

    new_arrays = []  # bug fix: accumulate across ALL arrays, not per array
    block = None     # partially filled block carried across arrays
    filled = 0       # number of rows already copied into `block`
    for data in array_files:
        n, m = data.shape[0], data.shape[1]
        i = 0  # next row of `data` to copy
        while i < n:
            if block is None:
                block = np.zeros((batch_size, m, 3), dtype=np.float32)
                filled = 0
            take = min(batch_size - filled, n - i)  # rows that fit now
            block[filled:filled + take, :, :] = data[i:i + take, :, :]
            filled += take
            i += take
            if filled == batch_size:  # block full -> emit and start fresh
                print('NEW DATA: ', block)
                new_arrays.append(block)
                block = None
    if block is not None:  # zero-padded tail block
        print('NEW DATA: ', block)
        new_arrays.append(block)
    print(new_arrays)
    return new_arrays

Upvotes: 2

Views: 706

Answers (2)

Mad Physicist
Mad Physicist

Reputation: 114440

You can concatenate all your original arrays and then split them:

ars = ... # list of N arrays
# Stack every source array along the row axis into one big array.
ars = np.concatenate(ars, axis=0)
# Cut at rows 1000, 2000, ...; the last piece may be shorter than 1000.
ars = np.split(ars, np.arange(1000, ars.shape[0], 1000))

The last line can be written as ars = np.split(ars, 1000), but only if you're sure that the total number of elements is a multiple of 1000, since np.split will barf otherwise. Specifying explicit split-points, as with np.arange, allows you to have a shorter final segment.

Upvotes: 0

Aaron
Aaron

Reputation: 1368

  1. data is used to store the temporary result, and data_start is the index to insert rows to data.
  2. Allocate data if it is None
  3. yield data if it is fully filled.

merge_and_split is a generator so that the memory demand should be low.

import random
from typing import Iterator

import numpy as np


def merge_and_split(arrays, batch_size) -> Iterator:
    """Yield (batch_size, ...) blocks filled, in order, with the rows of
    every array in *arrays*.

    A block that one array fills only partially is topped up with rows from
    the next array.  The final block is yielded even if incomplete; its
    unused tail rows stay zero.  Being a generator, only one block is held
    in memory at a time.
    """
    sources = tuple(arrays)

    out_dtype = sources[0].dtype
    out_shape = (batch_size,) + sources[0].shape[1:]

    # All inputs must agree on everything but the row count.
    assert all(a.shape[1:] == out_shape[1:] for a in sources), "Shape mismatch"

    block = None   # current output block (None until first rows arrive)
    filled = 0     # rows already written into `block`

    for chunk in sources:
        pos = 0
        remaining = chunk.shape[0]

        while remaining > 0:
            if block is None:
                # Lazily allocate a fresh zeroed block.
                block = np.zeros(out_shape, dtype=out_dtype)
                filled = 0

            # Copy as many rows as both the block and the chunk allow.
            step = min(batch_size - filled, remaining)
            block[filled:filled + step, ...] = chunk[pos:pos + step, ...]

            filled += step
            pos += step
            remaining -= step

            if filled >= batch_size:
                yield block
                block = None

    if block is not None:
        yield block


def input_arrays():
    """Build 10 test arrays of shape (k, 4, 3) with k in [1, 5].

    The row counts come from a seeded RNG so the shapes are reproducible;
    the integer contents (0-9) come from NumPy's global RNG and vary per run.
    """
    count = 10
    rng = random.Random(13)  # fixed seed -> deterministic row counts
    arrays = []
    for _ in range(count):
        rows = rng.randint(1, 5)
        arrays.append(np.random.randint(0, 10, size=(rows, 4, 3)))
    return arrays


def main():
    """Smoke-test merge_and_split: the stacked output blocks must reproduce
    the concatenation of the inputs (ignoring the zero-padded tail)."""
    arrays = input_arrays()

    batches = list(merge_and_split(arrays, 5))

    # Reference: plain concatenation of the inputs and their total row count.
    stacked_input = np.vstack(arrays)
    total_rows = sum(a.shape[0] for a in arrays)
    print('concatenated', stacked_input.shape, total_rows)

    stacked_output = np.vstack(batches)
    print(stacked_output.shape)
    # True when the real (non-padding) rows match the source exactly.
    print((stacked_output[0:total_rows, ...] == stacked_input).all())


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()

Upvotes: 1

Related Questions