Reputation: 2898
If I have an existing pandas DataFrame, is there a way to generate Python code which, when executed in another Python script, will reproduce that DataFrame?
e.g.
In[1]: df
Out[1]:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
In[2]: someFunToWriteDfCode(df)
Out[2]:
df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
...: 'income': [40000, 50000, 42000]})
Upvotes: 40
Views: 6076
Reputation: 539
You can dump to a byte buffer and then use base64
def df2str(df):
    """Serialize ``df`` into a base64 text string that can be pasted into code.

    The object is pickled and the pickle bytes are base64-encoded, so the
    result is plain ASCII text that survives copy/paste.

    Parameters
    ----------
    df : object
        The DataFrame to serialize. ``pickle`` is used, so any picklable
        object is accepted.

    Returns
    -------
    str
        Base64 text of the pickled object.
    """
    # Imported locally so the snippet is self-contained.
    import base64
    import pickle

    # pickle.dumps already returns the bytes; no intermediate buffer is needed.
    return base64.b64encode(pickle.dumps(df)).decode('ascii')
def str2df(s):
    """Rebuild the object stored in a base64-encoded pickle string.

    Parameters
    ----------
    s : str or bytes
        Base64 text of a pickle, e.g. the value returned by ``df2str``.

    Returns
    -------
    object
        The unpickled object (a DataFrame when a DataFrame was encoded).
    """
    # Imported locally so the snippet is self-contained.
    import base64
    import pickle

    encoded = s.encode('utf-8') if isinstance(s, str) else s
    # SECURITY: unpickling can execute arbitrary code. Only pass strings that
    # you produced yourself; never decode untrusted input this way.
    return pickle.loads(base64.b64decode(encoded))
example:
# Build a small frame and encode it as text.
df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
df2str(df)
# outputs
# 'gASV3A... ...1Yi4='
and then you can simply restore df using
# Pass the complete string produced by df2str(df); it is abbreviated here.
restored = str2df('gASV3A... ...1Yi4=')
Upvotes: 0
Reputation: 2554
Expanding on other answers a little by adding NaT as a type.
def frame_to_code(frame):
    """Return Python source that rebuilds ``frame`` via ``pd.DataFrame``.

    The frame is converted with ``frame.to_dict()`` and every value is written
    out individually, so missing-value markers are translated without touching
    text that merely contains " nan" or " NaT" (e.g. the string ``'x nan'``).

    Parameters
    ----------
    frame : pandas.DataFrame
        The frame to convert.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame({...})``. Executing it
        requires ``pd`` (pandas) to be in scope.
    """
    # Imported locally so the snippet is self-contained.
    import pandas as pd

    def _literal(value):
        # Source text for one value; recurses into plain containers.
        if value is pd.NaT:
            return "pd.NaT"
        if isinstance(value, float) and value != value:  # NaN is never equal to itself
            return "float('nan')"
        if isinstance(value, (pd.Timestamp, pd.Timedelta)):
            # repr() omits the module prefix, e.g. Timestamp('2020-01-01 00:00:00')
            return f"pd.{value!r}"
        if type(value) is list:
            return "[" + ", ".join(_literal(item) for item in value) + "]"
        if type(value) is tuple:
            inner = ", ".join(_literal(item) for item in value)
            return f"({inner},)" if len(value) == 1 else f"({inner})"
        if type(value) is dict:
            pairs = ", ".join(
                f"{_literal(key)}: {_literal(item)}" for key, item in value.items()
            )
            return "{" + pairs + "}"
        return repr(value)

    return f"df = pd.DataFrame({_literal(frame.to_dict())})"
Upvotes: 1
Reputation: 10606
Supported pd.DataFrame attributes:
'nan'
import numpy as np
import pandas as pd
import re
def _values_to_code(vals):
"""
Code representation of values
Parameters
----------
vals : List
Returns
-------
str :
vals as code string
"""
values = str(vals)
values = re.sub(r" nan(?<![,\]])", " np.nan", values)
return values
def _dtype_to_code(dtype):
"""
Code representation of dtypes
Parameters
----------
dtypes : datatype
dtype to convert. Example: np.float64
Returns
-------
str :
dtype as code string
"""
dtype = str(dtype)
dtype = re.sub(r"float64", " np.float64", dtype)
dtype = re.sub(r"int64", " np.int64", dtype)
return dtype
def df_to_code(df):
    """
    Code representation of a DataFrame

    Each column is emitted as ``np.array(<values>, dtype=<dtype>)`` and the
    index as ``pd.<str(df.index)>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    str :
        A ``pd.DataFrame(...)`` expression. Evaluating it requires ``np`` and
        ``pd`` to be in scope.
    """
    # {col!r} keeps non-string labels (e.g. the integer 0) as they are and
    # quotes string labels correctly even when they contain a quote.
    column_lines = [
        f"\n\t{col!r}: np.array({_values_to_code(df[col].tolist())}, "
        f"dtype={_dtype_to_code(df.dtypes[col])}),"
        for col in df.columns
    ]
    # index: its str() already carries the dtype, e.g. "Index([...], dtype='object')"
    index_code = _values_to_code(df.index)
    return (
        "pd.DataFrame({"
        + "".join(column_lines)
        + "\n}"
        + f", index=pd.{index_code}"
        + ")"
    )
if __name__ == "__main__":
    df = pd.DataFrame({
        'simple_float': np.array([1, 2, 3], dtype=float),
        'simple_int': np.array([4, 5, 6], dtype=int),
        'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
        'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
    }, index=[0, '1', 2])

    # small unittest: the generated code must rebuild an equal DataFrame
    generated = df_to_code(df)
    # Run the generated code in its own namespace so df2 is looked up explicitly.
    namespace = {'np': np, 'pd': pd}
    exec('df2 = ' + generated, namespace)
    assert df.equals(namespace['df2'])
    print(generated)
pd.DataFrame({
'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
'simple_int': np.array([4, 5, 6], dtype= np.int64),
'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
}, index=pd.Index([0, '1', 2], dtype='object'))
You can directly paste the output into a python console and enjoy ;)
>>> import numpy as np
>>> import pandas as pd
>>> pd.DataFrame({
... 'simple_float': np.array([1.0, 2.0, 3.0], dtype= np.float64),
... 'simple_int': np.array([4, 5, 6], dtype= np.int64),
... 'nan_variations': np.array(['np.nan', 'nan', np.nan], dtype=object),
... 'fancy_content': np.array([4, 'x', [1, 2]], dtype=object),
... }, index=pd.Index([0, '1', 2], dtype='object'))
simple_float simple_int nan_variations fancy_content
0 1.0 4 np.nan 4
1 2.0 5 nan x
2 3.0 6 NaN [1, 2]
Upvotes: 2
Reputation: 661
Here's another approach that does not use dicts
import numpy as np
def dataframe_to_code(df):
    """Return Python source that rebuilds ``df`` from a nested list of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    str
        A statement of the form ``df = pd.DataFrame([[...], ...], columns=[...])``.
        Executing it requires ``pd`` (pandas) to be in scope.

    Notes
    -----
    The ``threshold`` and ``floatmode`` arguments used here need NumPy 1.14
    or newer.
    """
    data = np.array2string(
        df.to_numpy(),
        separator=', ',
        # Never abbreviate large arrays with "...": that is not valid code.
        threshold=df.size + 1,
        # Print every float with enough digits to round-trip exactly
        # (the default would round to 8 digits).
        floatmode='unique',
    )
    data = data.replace(" nan", " float('nan')")
    # A nan in first position follows "[" directly and has no leading space.
    data = data.replace("[nan", "[float('nan')")
    cols = df.columns.tolist()
    return f"""df = pd.DataFrame({data}, columns={cols})"""
The data.replace(" nan", " float('nan')") step is optional and was inspired by madoki's excellent answer.
Note that np.array2string only works for NumPy versions 1.11 and higher.
I recommend using https://github.com/psf/black to format the output
Upvotes: 1
Reputation: 3810
I have always used this code, which has helped me a lot:
def gen_code(df):
    """Return a ``pickle.loads(b'...')`` expression that recreates ``df``.

    Parameters
    ----------
    df : object
        The DataFrame (or any picklable object) to embed.

    Returns
    -------
    str
        Source text of a ``pickle.loads`` call on the pickled bytes literal.
        Evaluating it requires ``pickle`` to be imported and, like any
        unpickling, must only be done with text you produced yourself.
    """
    # {!r} requests the bytes literal explicitly; a plain {} goes through
    # str(bytes), which raises BytesWarning when Python runs with -b.
    return 'pickle.loads({!r})'.format(pickle.dumps(df))
import pickle
# Build the "pickle.loads(...)" text for df.
code_string = gen_code(df)
# A bare name on the last line displays its value in an interactive session.
code_string
Now you can copy the output of code_string and paste it into the string variable A, as follows:
# A holds the text copied from code_string, i.e. a "pickle.loads(...)" expression.
A= 'Paste your code_string here'
import pickle
# NOTE(review): eval runs whatever expression A contains and pickle.loads can
# execute arbitrary code - presumably A is always text you generated yourself.
df=eval(A)
This has helped me copy and paste data frames on such platforms.
Upvotes: 1
Reputation: 663
You could try to use the to_dict() method on DataFrame:
# print() call syntax: the Python 2 print statement is a syntax error on Python 3.
print("df = pd.DataFrame( %s )" % (str(df.to_dict())))
If your data contains NaN's, you'll have to replace them with float('nan'):
# print() call syntax: the Python 2 print statement is a syntax error on Python 3.
print("df = pd.DataFrame( %s )" % (str(df.to_dict()).replace(" nan"," float('nan')")))
Upvotes: 35
Reputation: 879
You can first save the dataframe you have, and then load it in another Python script when necessary. You can do this with two packages: pickle and shelve.
pickle
import pickle

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})
# Binary mode is required: pickle writes bytes.
with open('dataframe', 'wb') as pfile:
    pickle.dump(df, pfile)  # save df in a file named "dataframe"
To read the dataframe in another file:
import pickle

# Only unpickle files you created yourself: loading a pickle can run arbitrary code.
with open('dataframe', 'rb') as pfile:
    df2 = pickle.load(pfile)  # read the dataframe stored in file "dataframe"
print(df2)
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
shelve
import shelve

import pandas as pd

df = pd.DataFrame({'user': ['Bob', 'Jane', 'Alice'],
                   'income': [40000, 50000, 42000]})
with shelve.open('dataframe2') as shelf:
    shelf['df'] = df  # store the dataframe in the shelf "dataframe2" under key 'df'
To read the dataframe in another file:
import shelve

# Only open shelves you created yourself: values are stored as pickles.
with shelve.open('dataframe2') as shelf:
    print(shelf['df'])  # read the dataframe stored under key 'df'
Output:
income user
0 40000 Bob
1 50000 Jane
2 42000 Alice
Upvotes: -1