Reputation: 2487
I'm working with fairly large dataframes, and the final step is to remove np.nan, uppercase the strings, strip spaces, remove commas, and replace NaN and NaT with "". I have code that works, but on large datasets it causes a memory crash. Any ideas how to optimize this code? I've tried various techniques.
Below is a small sample dataframe to show where I'm at:
import pandas as pd
import numpy as np
import time

df = pd.DataFrame([(-1, 'bronze', -1.0),
                   ('silv er', '-1.0', 'br,on ze'),
                   ('NaN', 'silver', 'nan'),
                   ('', 'NaT', 'NONE')],
                  columns=('Canada', 'China', 'South Korea'))
def remove_nan_negatives(df):
    t1 = time.time()
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # select numeric columns
    numeric_columns = df.select_dtypes(include=['number']).columns
    df[numeric_columns] = df[numeric_columns].fillna("")
    # select non-numeric (object) columns
    non_numeric_columns = df.select_dtypes(include=[object]).columns
    df[non_numeric_columns] = df[non_numeric_columns].fillna("")
    df[non_numeric_columns] = df[non_numeric_columns].replace("nan", "")
    df[non_numeric_columns] = df[non_numeric_columns].replace("NAN", "")
    df[non_numeric_columns] = df[non_numeric_columns].apply(lambda x: x.astype(str).str.upper())
    df[non_numeric_columns] = df[non_numeric_columns].apply(lambda x: x.str.strip())
    df = df.replace("-1", "")
    df = df.replace("-1.0", "")
    df = df.replace(-1, "")
    df = df.replace(-1.0, "")
    df = df.replace("NaN", "")
    df = df.replace("NAN", "")
    df = df.replace("NaT", "")
    df = df.replace("NAT", "")
    df = df.replace("NONE", "")
    df = df.replace(",", "")
    df = df.fillna(value="")
    t2 = time.time()
    print('time taken', str((t2 - t1)))
    return df
remove_nan_negatives(df)
# time taken 0.04082131385803223
Upvotes: 0
Views: 428
Reputation: 1263
Here's another way of doing it that avoids having to explicitly list everything to remove:
import time
import math
import pandas as pd

def remove_negative(x):
    # blank out negative numbers, pass everything else through
    if (not isinstance(x, int)) and (not isinstance(x, float)):
        return x
    if x < 0.0:
        return ""
    else:
        return x

def remove_uppercase(x):
    # blank out all-uppercase strings such as 'NONE'
    if not isinstance(x, str):
        return x
    if x.isupper():
        return ""
    else:
        return x

def remove_specific(x):
    # blank out a fixed list of sentinel values
    to_remove = [' ', ',', 'NaN', 'NaT']
    if x in to_remove:
        return ""
    else:
        return x

def remove_nan(x):
    # blank out float NaNs
    if isinstance(x, str):
        return x
    if math.isnan(x):
        return ""
    else:
        return x

def remove_stringnums(x):
    # blank out strings that parse as numbers, e.g. '-1.0'
    try:
        float(x)
    except ValueError:
        return x
    else:
        return ""

def remove_nan_negatives(df):
    cols = []
    for col in df.columns:
        cols.append(df[col].apply(remove_negative)
                           .apply(remove_uppercase)
                           .apply(remove_specific)
                           .apply(remove_stringnums)
                           .apply(remove_nan))
    return pd.concat(cols, axis=1)
t1 = time.time()
remove_nan_negatives(df)
t2 = time.time()
print('time taken', str((t2-t1)))
# Canada China South Korea
# 0 bronze
# 1 silv er br,on ze
# 2 silver
# 3
# time taken 0.0038068294525146484
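Since each column here goes through five separate .apply passes, one possible refinement for large frames is to compose the helpers into a single per-cell function, so each column is traversed only once. A minimal sketch of that idea (the clean_cell name is mine, and this variant is not benchmarked):

def clean_cell(x):
    # run the same checks in the same order, in one pass per element
    for f in (remove_negative, remove_uppercase, remove_specific,
              remove_stringnums, remove_nan):
        x = f(x)
    return x

df_clean = df.apply(lambda col: col.map(clean_cell))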
Upvotes: 0
Reputation: 61920
You could try the following function, which removes some of the duplicate work in yours:
def remove_nan_negatives_fast(df):
    # uppercase and strip the object columns in one pass
    non_numeric_columns = df.select_dtypes(include=[object]).columns
    df[non_numeric_columns] = df[non_numeric_columns].apply(lambda x: x.str.upper().str.strip())
    # a single replace call with a dict instead of one call per value
    replacements = {np.inf: "",
                    -np.inf: "",
                    "-1": "",
                    "-1.0": "",
                    -1: "",
                    -1.0: "",
                    "NaN": "",
                    "NaT": "",
                    "nan": "",
                    "NAN": "",
                    "NAT": "",
                    "NONE": "",
                    ",": ""}
    df = df.replace(replacements).fillna("")
    return df
On my machine it gives the following results:
%timeit remove_nan_negatives(df)
19.8 ms ± 3.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit remove_nan_negatives_fast(df1)
5.02 ms ± 386 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
So about a 5x speed-up. Notice that the main gain comes from passing a dictionary to a single replace call, which saves the creation of an intermediate DataFrame for each replaced value.
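Since the original problem was a memory crash rather than raw speed, one further idea (my own sketch, not benchmarked; remove_nan_negatives_lowmem is a hypothetical name) is to clean one column at a time and assign it back, so intermediate objects are only ever one column wide instead of a full copy of the frame:

def remove_nan_negatives_lowmem(df):
    # assumption: the same replacements dict as above
    replacements = {np.inf: "", -np.inf: "", "-1": "", "-1.0": "",
                    -1: "", -1.0: "", "NaN": "", "NaT": "", "nan": "",
                    "NAN": "", "NAT": "", "NONE": "", ",": ""}
    for col in df.columns:
        s = df[col]
        if s.dtype == object:
            # same normalization as above, one column at a time
            s = s.str.upper().str.strip()
        df[col] = s.replace(replacements).fillna("")
    return df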
Upvotes: 1