Reputation: 601

How can I efficiently convert a 4D numpy array into a pandas DataFrame with indices as columns?

I have a 4D numpy array of shape (4, 155, 240, 240). I would like to create a pandas DataFrame with one row for each element of this array, and five columns: one for each of the four indices, and one for the value in the array. The code I'm using right now looks like this:

import pandas as pd
import numpy as np

# some array of this shape
im = np.zeros((4, 155, 240, 240))

# One Python list per output column; the loop below fills them element by element.
df = {col: [] for col in ['mode', 'x', 'y', 'z', 'val']}
# np.ndenumerate yields an (index_tuple, value) pair for every element of im.
for idx, val in np.ndenumerate(im):
    df['mode'].append(idx[0])
    # NOTE: axis 1 is stored as 'y' and axis 2 as 'x' (not the dict's key order).
    df['y'].append(idx[1])
    df['x'].append(idx[2])
    df['z'].append(idx[3])
    df['val'].append(val)
# Rebind df from the dict of lists to the resulting DataFrame.
df = pd.DataFrame(df)

Is there a way to do this more efficiently, possibly using vectorized operations?

Upvotes: 2

Answers (4)

piRSquared

Reputation: 294258

I'd use a combination of // and %

# Flat position of every element, as a column vector so it broadcasts against s.
r = np.arange(im.size)[:, None]
s = np.array(im.shape)

# im.size // s.cumprod() is the number of elements covered by one step along
# each axis; floor-dividing the flat position by it and reducing modulo the
# axis length gives the index along that axis. The flattened values are
# appended as the last column.
np.column_stack([r // (im.size // s.cumprod()) % s, im.ravel()])

Demo

# Small all-zero integer array for the demonstration.
im = np.zeros((2, 3, 2, 3), dtype=int)

r = np.arange(im.size)[:, None]
s = np.array(im.shape)

# Columns 0-3 are the indices along each axis; column 4 is the element value.
np.column_stack([r // (im.size // s.cumprod()) % s, im.ravel()])

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 2, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0],
       [0, 0, 1, 2, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0],
       [0, 1, 0, 2, 0],
       [0, 1, 1, 0, 0],
       [0, 1, 1, 1, 0],
       [0, 1, 1, 2, 0],
       [0, 2, 0, 0, 0],
       [0, 2, 0, 1, 0],
       [0, 2, 0, 2, 0],
       [0, 2, 1, 0, 0],
       [0, 2, 1, 1, 0],
       [0, 2, 1, 2, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 2, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 1, 0],
       [1, 0, 1, 2, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 0, 1, 0],
       [1, 1, 0, 2, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 1, 0],
       [1, 1, 1, 2, 0],
       [1, 2, 0, 0, 0],
       [1, 2, 0, 1, 0],
       [1, 2, 0, 2, 0],
       [1, 2, 1, 0, 0],
       [1, 2, 1, 1, 0],
       [1, 2, 1, 2, 0]])

Upvotes: 2

Divakar

Reputation: 221574

Here's one array-initialization based approach -

def meshed_df(im):
    """Flatten a 4D array into a DataFrame of indices and values.

    Parameters
    ----------
    im : numpy.ndarray
        Array with exactly four dimensions.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``im`` in C (row-major) order, with the
        columns ``mode`` (axis 0), ``val`` (the element), ``x`` (axis 2),
        ``y`` (axis 1) and ``z`` (axis 3). All columns share one dtype:
        the promotion of ``im.dtype`` with the platform index type.

    Raises
    ------
    ValueError
        If ``im.shape`` does not unpack into exactly four lengths.
    """
    s0, s1, s2, s3 = im.shape
    r0, r1, r2, r3 = np.ogrid[:s0, :s1, :s2, :s3]
    # Promote with the index type so that indices are not wrapped or
    # truncated when ``im`` has a narrow dtype (e.g. uint8 or bool).
    out_dtype = np.result_type(im.dtype, np.intp)
    out = np.empty((s0, s1, s2, s3, 5), dtype=out_dtype)
    # Each open grid broadcasts across the other three axes on assignment.
    out[..., 0] = r0
    out[..., 1] = im
    out[..., 2] = r2  # axis 2 is reported as 'x'
    out[..., 3] = r1  # axis 1 is reported as 'y'
    out[..., 4] = r3
    # A flat list of names is required: a nested list would make pandas
    # build MultiIndex columns instead of plain labels.
    return pd.DataFrame(out.reshape(-1, 5),
                        columns=['mode', 'val', 'x', 'y', 'z'])

Upvotes: 1

akuiper

Reputation: 214957

Seems you need the indices of the elements, for which you can try numpy.meshgrid:

# Build one index grid per axis (indexing="ij" keeps the array's axis order),
# flatten each grid to 1D, append the flattened values, and stack them all as
# the columns of a single 2D array.
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()])

arr
#array([[   0.,    0.,    0.,    0.,    0.],
#       [   0.,    0.,    0.,    1.,    0.],
#       [   0.,    0.,    0.,    2.,    0.],
#       ..., 
#       [   3.,  154.,  239.,  237.,    0.],
#       [   3.,  154.,  239.,  238.,    0.],
#       [   3.,  154.,  239.,  239.,    0.]])

Then construct a data frame from it:

# Name the four index columns in axis order, followed by the value column.
pd.DataFrame(arr, columns = ['mode', 'x', 'y', 'z', 'val'])

Timing comparison with a normal for loop over np.ndenumerate:

# Vectorized construction: meshgrid index columns plus the flattened values.
mesh = pd.DataFrame(np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()]),
                   columns=["mode", "x", "y", "z", "val"])

# Reference construction: one (i, j, k, l, value) tuple per element.
loop = pd.DataFrame([index + (x,) for index, x in np.ndenumerate(im)], columns=["mode", "x", "y", "z", "val"])

# Check that both constructions hold identical values.
(loop.values == mesh.values).all()
# True

%timeit mesh = pd.DataFrame(np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()]), columns=["mode", "x", "y", "z", "val"])
# 1 loop, best of 3: 2.07 s per loop

%timeit loop = pd.DataFrame([index + (x,) for index, x in np.ndenumerate(im)], columns=["mode", "x", "y", "z", "val"])
# 1 loop, best of 3: 1min 2s per loop

Upvotes: 5

unutbu

Reputation: 879591

senderle's cartesian_product_transpose (or cartesian_product) is the fastest way I know to enumerate a cartesian product as a NumPy array.

import numpy as np
import pandas as pd
import functools

# The name im first holds the target shape, then is rebound to an array of
# that shape filled with 0 .. N-1 so that every element has a distinct value.
im = np.array((4, 155, 240, 240))
im = np.arange(im.prod()).reshape(im)

def cartesian_product_transpose(*arrays):
    """Return the Cartesian product of 1D sequences as a 2D array.

    Based on http://stackoverflow.com/a/11146645/190597 (senderle).

    Parameters
    ----------
    *arrays : 1D sequences
        The factors of the product, as accepted by ``np.ix_``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_combinations, len(arrays))``. Row order has the
        last input varying fastest. The dtype is the promotion of the input
        dtypes. With no inputs the result has shape ``(1, 0)``.
    """
    if not arrays:
        # The product of zero sequences is a single empty combination.
        return np.empty((1, 0))
    broadcastable = np.ix_(*arrays)
    broadcasted = np.broadcast_arrays(*broadcastable)
    # np.find_common_type is deprecated since NumPy 1.25 and removed in 2.0;
    # np.result_type applied to the dtypes performs the promotion instead.
    dtype = np.result_type(*(arr.dtype for arr in broadcasted))
    rows, cols = broadcasted[0].size, len(broadcasted)
    # Fill one contiguous row per input, then transpose so that each input
    # becomes a column of the result.
    out = np.empty((cols, rows), dtype=dtype)
    for position, grid in enumerate(broadcasted):
        out[position] = grid.reshape(-1)
    return out.T

# The index columns are the Cartesian product of the per-axis ranges, built
# with the cartesian_product_transpose helper defined in this answer.
df = pd.DataFrame(cartesian_product_transpose(*[np.arange(i) for i in im.shape]),
                  columns=['mode', 'x', 'y', 'z'])
# The product rows have the last axis varying fastest, matching im.ravel().
df['val'] = im.ravel()

Upvotes: 3

How can I efficiently convert a 4D numpy array into a pandas DataFrame with indices as columns?

Answers (4)

Related Questions