Reputation: 1182
Say I have a pandas dataframe with multiple dtypes and I want to add data from another which might have other types. Is there an easy way to make pandas convert the types or raise an error when, for instance, concatenating the dataframes?
import pandas as pd
old = pd.DataFrame({'one':[1,2,3],'two':[100,200,300]}).astype({'one':'int8', 'two':'Int16'})
new = pd.DataFrame({'one':['11','22','33'],'two':['110','220','330']})
(old.dtypes == new.dtypes).all() # False
# a normal pd.concat([old, new]) at this point would result in 'object' dtypes for
# both columns. I want something like
result = coercive_concat([old, new])
(old.dtypes == result.dtypes).all() # True
(result.dtypes == new.dtypes).all() # False
# and it should raise an error when it can't coerce the types
odd = pd.DataFrame({'one':['eleventeen','22','33'],'two':['110','220','330']})
result = coercive_concat([old, odd]) # Error
Upvotes: 0
Views: 746
Reputation: 1182
This is the current version of my solution (old version at the end of this answer). The function below handles null values and booleans. Nulls are a pain because pandas will switch to float
when appending np.nan
even for pandas Int
types (nulls must be np.NA
to preserve the column type). Booleans are a problem when reading boolean fields out of text files because the nonempty string "False"
is boolean True
.
import pandas as pd
def coerce_types(coerce, template, insert_columns=True):
'''
coerce one pandas Series or DataFrame into the same data types as
another. This function was initially required because pandas cannot
convert string values directly to pd.Int types, but pd.Int types are
the only integer types that have a null value in pandas. The
solution is to first convert to numpy.int types by using lower case
names for integer dtypes in DataFrame.astype() and then convert to
any other type.
See https://stackoverflow.com/q/67646300/875343
'''
coerce = coerce.copy()
if (type(coerce) == pd.Series):
if (type(template) != pd.Series):
raise TypeError('if object to coerce has type pd.Series '
'then template must have type pd.Series')
try:
coerce = coerce.astype(str(template.dtypes))
except:
whereNaN = coerce.isna()
coerce = coerce.fillna(0)
coerce = coerce.astype(str(template.dtypes).lower())
coerce = coerce.astype(str(template.dtypes))
coerce = coerce.mask(whereNaN, pd.NA)
else:
if not all([l in template.columns for l in coerce.columns]):
missing_cols = [l for l in coerce.columns
if l not in template.columns]
raise ValueError('Every column in dataframe to coerce must '
'exist in template dataframe. Column '
'label(s) {} do not exist in the template.'
.format(missing_cols))
common_cols = [l for l in coerce.columns if l in template.columns]
if insert_columns and not all([l in coerce.columns
for l in template.columns]):
for i in range(len(template.columns)):
label = template.columns[i]
if label not in common_cols:
coerce.insert(i, label, pd.NA)
c = template.columns
else:
c = common_cols
coerce = coerce[c]
if 'is_directed' in coerce.columns:
print(coerce['is_directed'])
# non-empty strings are boolean True (the string "False" is
# truthy) so check for columns of type 'object' that need to be
# coerced to boolean and then map "true" and "false" to their
# boolean values (case insensitive).
types_to_coerce = pd.Series(coerce.dtypes, index=c)
template_types = pd.Series(template[c].dtypes, index=c)
where_template_bool = (template_types == bool)
if where_template_bool.any():
bool_labels = template_types.loc[where_template_bool].index.array
where_coerce_also_obj = (types_to_coerce[bool_labels] == 'object')
if where_coerce_also_obj.any():
labels = where_coerce_also_obj.index.array
bool_map = {'true':True, 'false':False}
mapper = lambda x: x.str.lower().map(bool_map)
coerce.loc[:, labels] = coerce[labels].apply(mapper)
if 'is_directed' in coerce.columns:
print(coerce['is_directed'])
try:
coerce = coerce.astype(dict(zip(coerce.columns,
template[c].dtypes.array)))
except:
whereNaN = coerce.isna()
coerce = coerce.fillna(0)
lower_case_types = template[c].dtypes.astype(str).str.lower().array
coerce = coerce.astype(dict(zip(coerce.columns, lower_case_types)))
coerce = coerce.astype(dict(zip(coerce.columns,
template[c].dtypes.array)))
coerce = coerce.mask(whereNaN, pd.NA)
return coerce
The answer from Anurang Dabas is correct but it does not handle null values. Since pandas often defaults to numpy np.nan
(which does not convert to pandas Int
types) instead of the newer pandas pd.NA
type (which does), applying that answer alone will throw errors for null values in some cases, for instance when a column has type 'object'
with some np.nan
values and you try to convert to a pd.Int
type. The function below handles np.nan
values.
import pandas as pd
def coerce_types(change=None, like=None):
if (type(change) == pd.Series):
try:
change = change.astype(str(like.dtypes))
except:
whereNaN = change.isna()
change = change.fillna(0)
change = change.astype(str(like.dtypes).lower())
change = change.astype(str(like.dtypes))
change = change.mask(whereNaN, pd.NA)
else:
try:
change = change.astype(dict(zip(change.columns, like.dtypes.values)))
except:
whereNaN = change.isna()
change = change.fillna(0)
change = change.astype(dict(zip(change.columns, like.dtypes.astype(str).str.lower().values)))
change = change.astype(dict(zip(change.columns, like.dtypes.values)))
change = change.mask(whereNaN, pd.NA)
return change
note that this will convert all null values in the dataframe to pd.NA
and that is not equivalent to np.nan
. You will need to modify this code if you want to preserve the null types from the original dataframe. Example with dataframes:
import pandas as pd
old_types = ['Int8', 'float', 'Int32', 'object']
old_data = pd.DataFrame({'a':1, 'b':1e6, 'c':3, 'd':'value'}, index=[0])
old_data = old_data.astype(dict(zip(old_data.columns, old_types)))
all(old_data.dtypes.values == ['Int8', 'float64', 'Int32', 'object'])
# True
new_data = pd.DataFrame({'a':2, 'b':'555', 'c':4}, index=[0])
all(new_data.dtypes.values == ['int64', 'object', 'int64'])
# True
naive_result = old_data.append(new_data)
all(old_data.dtypes == naive_result.dtypes)
# False
correct_result = coerce_types(naive_result, old_data)
all(old_data.dtypes == correct_result.dtypes)
# True
all(correct_result.dtypes.values == ['Int8', 'float64', 'Int32', 'object'])
# True
Example with series:
import pandas as pd
import numpy as np
old_data = pd.Series(['1', '2', np.nan, 4.0])
old_data.astype('Int8')
# TypeError: object cannot be converted to an IntegerDtype
old_data = coerce_types(old_data, pd.Series([], dtype='Int8'))
print(old_data)
'''
output:
0 1
1 2
2 <NA>
3 4
dtype: Int8
'''
Upvotes: 0
Reputation: 24322
Try:
new=new.astype(dict(zip(new.columns, old.dtypes.astype(str).str.lower().values)))
Now if you print new.dtypes
you will get:
one int8
two int16
dtype: object
Upvotes: 1
Reputation: 470
There is way to treat every column data as string type during the reading of file or creating a dataframe by passing argument dtype=str
This will change NaN as "NULL" for null values
Here is a code snippet which changes the all dtypes as string
df_read = pd.read_csv(savefile, dtype=str)
May be you can go through this similar question asked in stackoverflow . Also , this behaviour varies little with version of pandas you are using. In newer pandas, the code snippet will work fine
I have modified your code little bit below
import pandas as pd
old = pd.DataFrame({'one':[1,2,3],'two':[100,200,300]}, dtype=str)
new = pd.DataFrame({'one':['11','22','33'],'two':['110','220','330']}, dtype=str)
print((old.dtypes == new.dtypes).all()) # This will return True as all data types are string now
Upvotes: 0