Reputation: 601
I have a 4D numpy array of shape (4, 155, 240, 240). I would like to create a pandas DataFrame with one row for each element of this array, and five columns: one for each of the four indices, and one for the value in the array. The code I'm using right now looks like this:
import pandas as pd
import numpy as np
# some array of this shape
im = np.zeros((4, 155, 240, 240))
# One Python list per output column; the loop below fills them element by element.
df = {col: [] for col in ['mode', 'x', 'y', 'z', 'val']}
# np.ndenumerate yields an (index_tuple, value) pair for every element of im.
for idx, val in np.ndenumerate(im):
# Axis 0 is stored as 'mode', axis 1 as 'y', axis 2 as 'x' and axis 3 as 'z'.
df['mode'].append(idx[0])
df['y'].append(idx[1])
df['x'].append(idx[2])
df['z'].append(idx[3])
df['val'].append(val)
# Turn the dict of column lists into the final DataFrame (rebinds the name df).
df = pd.DataFrame(df)
Is there a way to do this more efficiently, possibly using vectorized operations?
Upvotes: 2
Views: 6517
Reputation: 294258
I'd use a combination of //
and %
# Flat positions 0 .. im.size - 1 as a column vector, so they broadcast against
# the per-axis arrays computed from s below.
r = np.arange(im.size)[:, None]
# Axis lengths of im as an array.
s = np.array(im.shape)
# im.size // s.cumprod() is the number of elements covered by one step along
# each axis in C order; floor-dividing the flat position by it and taking the
# result modulo the axis length gives the index along that axis. The flattened
# values are appended as the last column.
np.column_stack([r // (im.size // s.cumprod()) % s, im.ravel()])
Demo
# Small integer array so the full result can be displayed.
im = np.zeros((2, 3, 2, 3), dtype=int)
# Flat positions as a column vector.
r = np.arange(im.size)[:, None]
# Axis lengths (2, 3, 2, 3).
s = np.array(im.shape)
# Per-axis step sizes are im.size // s.cumprod() == [18, 6, 3, 1]; dividing the
# flat position by them and reducing modulo s recovers the four indices.
np.column_stack([r // (im.size // s.cumprod()) % s, im.ravel()])
array([[0, 0, 0, 0, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 2, 0],
[0, 0, 1, 0, 0],
[0, 0, 1, 1, 0],
[0, 0, 1, 2, 0],
[0, 1, 0, 0, 0],
[0, 1, 0, 1, 0],
[0, 1, 0, 2, 0],
[0, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 1, 1, 2, 0],
[0, 2, 0, 0, 0],
[0, 2, 0, 1, 0],
[0, 2, 0, 2, 0],
[0, 2, 1, 0, 0],
[0, 2, 1, 1, 0],
[0, 2, 1, 2, 0],
[1, 0, 0, 0, 0],
[1, 0, 0, 1, 0],
[1, 0, 0, 2, 0],
[1, 0, 1, 0, 0],
[1, 0, 1, 1, 0],
[1, 0, 1, 2, 0],
[1, 1, 0, 0, 0],
[1, 1, 0, 1, 0],
[1, 1, 0, 2, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 2, 0],
[1, 2, 0, 0, 0],
[1, 2, 0, 1, 0],
[1, 2, 0, 2, 0],
[1, 2, 1, 0, 0],
[1, 2, 1, 1, 0],
[1, 2, 1, 2, 0]])
Upvotes: 2
Reputation: 221574
Here's one array-initialization based approach -
def meshed_df(im):
    """Flatten a 4-D array into a DataFrame with one row per element.

    Parameters
    ----------
    im : numpy.ndarray
        Array with exactly four axes.

    Returns
    -------
    pandas.DataFrame
        Columns ``mode, val, x, y, z``: ``mode`` is the axis-0 index, ``val``
        the element value, ``x`` the axis-2 index, ``y`` the axis-1 index and
        ``z`` the axis-3 index. Rows follow the C-order traversal of ``im``.

    Raises
    ------
    ValueError
        If ``im`` does not have exactly four axes (shape unpacking fails).
    """
    s0, s1, s2, s3 = im.shape
    r0, r1, r2, r3 = np.ogrid[:s0, :s1, :s2, :s3]
    # Indices and values share one buffer, so promote the dtype to something
    # that can hold both; using im.dtype alone would wrap or truncate the
    # indices for bool / small-integer inputs.
    dtype = np.result_type(im.dtype, np.intp)
    out = np.empty((s0, s1, s2, s3, 5), dtype=dtype)
    out[..., 0] = r0  # np.arange(s0)[:,None,None,None]
    out[..., 1] = im
    out[..., 2] = r2  # np.arange(s2)[None,None,:,None]
    out[..., 3] = r1  # np.arange(s1)[None,:,None,None]
    out[..., 4] = r3  # np.arange(s3)[None,None,None,:]
    # A flat list of labels gives a plain column Index; a nested list would
    # build a MultiIndex instead.
    return pd.DataFrame(out.reshape(-1, 5),
                        columns=['mode', 'val', 'x', 'y', 'z'])
Upvotes: 1
Reputation: 214957
It seems you need the indices of the elements, which you can generate with numpy.meshgrid
:
# Build one index grid per axis of im (indexing="ij" keeps the grids in im's own
# axis order), flatten each grid, and stack them as columns with the flattened
# values of im as the last column.
arr = np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()])
arr
#array([[ 0., 0., 0., 0., 0.],
# [ 0., 0., 0., 1., 0.],
# [ 0., 0., 0., 2., 0.],
# ...,
# [ 3., 154., 239., 237., 0.],
# [ 3., 154., 239., 238., 0.],
# [ 3., 154., 239., 239., 0.]])
Then construct a data frame from it:
# Label the four index columns (axes 0..3 in order) and the value column.
# NOTE(review): the question's loop stores idx[1] under 'y' and idx[2] under
# 'x', whereas here axis 1 is labelled 'x' and axis 2 'y' — confirm which
# labelling is intended.
pd.DataFrame(arr, columns = ['mode', 'x', 'y', 'z', 'val'])
Timing comparison with a normal for loop over np.ndenumerate
:
# Vectorized version: per-axis index grids from np.meshgrid, flattened and
# stacked next to the flattened values.
mesh = pd.DataFrame(np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()]),
columns=["mode", "x", "y", "z", "val"])
# Reference version: one Python tuple (i0, i1, i2, i3, value) per element.
loop = pd.DataFrame([index + (x,) for index, x in np.ndenumerate(im)], columns=["mode", "x", "y", "z", "val"])
# Element-wise comparison of the two results; True means every cell matches.
(loop.values == mesh.values).all()
# True
%timeit mesh = pd.DataFrame(np.column_stack(list(map(np.ravel, np.meshgrid(*map(np.arange, im.shape), indexing="ij"))) + [im.ravel()]), columns=["mode", "x", "y", "z", "val"])
# 1 loop, best of 3: 2.07 s per loop
%timeit loop = pd.DataFrame([index + (x,) for index, x in np.ndenumerate(im)], columns=["mode", "x", "y", "z", "val"])
# 1 loop, best of 3: 1min 2s per loop
Upvotes: 5
Reputation: 879591
senderle's cartesian_product_transpose
(or cartesian_product
) is the fastest way I know to enumerate a cartesian product as a NumPy array.
import numpy as np
import pandas as pd
import functools
# Sample input: consecutive integers laid out as a (4, 155, 240, 240) array.
im = np.arange(4 * 155 * 240 * 240).reshape(4, 155, 240, 240)
def cartesian_product_transpose(*arrays):
    """
    http://stackoverflow.com/a/11146645/190597 (senderle)

    Return the cartesian product of the given 1-D sequences as a 2-D array
    with one row per combination and one column per input. Rows are ordered
    with the last input varying fastest; the result dtype is the common
    promoted dtype of the inputs.

    Raises IndexError when called with no arrays.
    """
    broadcastable = np.ix_(*arrays)
    broadcasted = np.broadcast_arrays(*broadcastable)
    # Every broadcast array has the full product shape, so its size is the
    # number of combinations.
    rows, cols = broadcasted[0].size, len(broadcasted)
    # np.find_common_type was deprecated in NumPy 1.25 and removed in 2.0;
    # np.result_type performs the dtype promotion instead.
    dtype = np.result_type(*[arr.dtype for arr in broadcasted])
    out = np.empty(rows * cols, dtype=dtype)
    # Write each input's flattened broadcast into its own contiguous slice,
    # then view the buffer as (cols, rows) and transpose.
    start, end = 0, rows
    for a in broadcasted:
        out[start:end] = a.reshape(-1)
        start, end = end, end + rows
    return out.reshape(cols, rows).T
# Enumerate every index combination of im, one row per element. The helper
# defined in this snippet is cartesian_product_transpose; the previously used
# name cartesian_product_broadcasted is not defined here and raised NameError.
df = pd.DataFrame(cartesian_product_transpose(*[np.arange(i) for i in im.shape]),
                  columns=['mode', 'x', 'y', 'z'])
# ravel() walks im in the same C order as the index rows, so values line up.
df['val'] = im.ravel()
Upvotes: 3