Reputation: 772
I have a huge data frame:
df.shape = (106, 3364)
And when I run the following code, I get a MemoryError:
i = df.values.T
j = np.nansum((i - i[:, None]) ** 2, axis=2) ** .5
MemoryError                               Traceback (most recent call last)
in
----> 1 j = np.nansum((i - i[:, None]) ** 2, axis=2) ** .5

MemoryError:
Is there any way to avoid this?
Upvotes: 0
Views: 322
Reputation: 59731
One option is to use Numba. It will consume less memory and actually be faster too.
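To see why the original expression runs out of memory: the broadcast intermediate (i - i[:, None]) has shape (3364, 3364, 106) and must be materialized all at once, while the Numba loops compute each pairwise distance on the fly. A rough back-of-the-envelope check (assuming float64 data):

# Memory needed by the broadcast intermediate (i - i[:, None])
# for df.shape == (106, 3364), assuming 8-byte float64 values
n_cols, n_rows = 3364, 106
print(n_cols * n_cols * n_rows * 8 / 1e9)  # ~9.6 GB for a single temporary array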
import pandas as pd
import numpy as np
import numba as nb

def compute_distances(df):
    # Transpose so each row holds one column of the original frame
    i = df.values.T
    result = np.empty((len(i), len(i)), dtype=i.dtype)
    _compute_distances_nb(i, result)
    return result

@nb.njit(parallel=True)
def _compute_distances_nb(data, result):
    # Pairwise Euclidean distances, skipping NaN terms (like np.nansum)
    for i in nb.prange(data.shape[0]):
        for j in nb.prange(data.shape[0]):
            s = 0
            for k in nb.prange(data.shape[1]):
                d = data[i, k] - data[j, k]
                if not np.isnan(d):
                    s += np.square(d)
            result[i, j] = np.sqrt(s)
# Original method for comparison
def compute_distances_np(df):
    i = df.values.T
    return np.nansum((i - i[:, None]) ** 2, axis=2) ** .5
# Test
np.random.seed(0)
# Make random data
df = pd.DataFrame(np.random.random((100, 500)))
# Put some NaN values
df[np.random.random(df.shape) < .2] = np.nan
# Compute distances
d1 = compute_distances(df)
d2 = compute_distances_np(df)
print(np.allclose(d1, d2))
# True
%timeit compute_distances(df)
# 8.05 ms ± 698 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit compute_distances_np(df)
# 356 ms ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
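At the question's actual size, the output of compute_distances is only a 3364 x 3364 matrix, so the Numba version stays well within memory (again assuming float64 data):

# Size of the 3364 x 3364 distance matrix itself, assuming float64
print(3364 * 3364 * 8 / 1e6)  # ~90.5 MB, versus ~9.6 GB for the broadcast intermediate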
Upvotes: 2