Reputation: 431
I'm trying to discover what is the best way (in terms of readability) to make a function that compares any two general pandas DataFrames using the following rules
For example:
>>> df1 Product Price 0 Computer 1200.0 1 Phone 800.0 2 Printer 200.0 3 Desk 350.0
>>> df2 Product Price 0 Phone 800.000000 1 Computer 1200.000001 2 Printer 200.000000 3 Desk 350.000000
my_appr_dataframe_compare(df1, df2)
should return True
, even that they have swapped rows, and the value of a cell is not exactly equals
Another example:
>>> df3 Price Product 0 1200.0 Computer 1 800.0 Phone 2 200.0 Printer 3 350.0 Desk
my_appr_dataframe_compare(df1, df3)
should return True
, even that they have swapped columns
>>> df4 Product Price 0 Computer 1200 1 Phone 800 2 Printer 200 3 Desk 350
my_appr_dataframe_compare(df1, df4)
should report difference on the 'Price1' column type, as one is an int, and the other is a float
Upvotes: 0
Views: 92
Reputation: 431
It gave some work, but I manage to get it done, don't think it's Pythonic though.
import pandas as pd
import numpy as np
from numbers import Real
from typing import Callable
epsilon = 0.000_1
def appr(a: Real, b: Real, factor: Real) -> bool:
factor = abs(factor)
return (a >= b-factor) and (a <= b+factor)
def absolute_appr(a: Real, b: Real) -> bool:
return appr(a, b, epsilon)
def compare_element(e1, e2) -> bool:
if not type(e1).__module__ == type(e2).__module__:
return False
if type(e1).__module__ == np.__name__:
if not e1.dtype.kind == e2.dtype.kind:
print('Kinds of ' + str(e1) + ' (' + str(e1.dtype.kind) + ') and ' +
str(e2) + ' (' + str(e2.dtype.kind) + ') are different')
return False
if e1.dtype.kind == 'f':
if np.isnan(e1) and np.isnan(e2):
return True
return absolute_appr(e1, e2)
return e1 == e2
if not type(e1) is type(e2):
print('Types of ' + str(e1) + ' (' + str(type(e1)) + ') and ' +
str(e2) + ' (' + str(type(e2)) + ') are different')
return False
if isinstance(e1, float):
return absolute_appr(e1, e2)
return e1 == e2
def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
s1 = set(df1.columns.values.tolist()).copy()
s2 = set(df2.columns.values.tolist()).copy()
return s1 == s2
def normalize_dataframe(df: pd.DataFrame) -> list:
df.reset_index(inplace=True)
cl: list = df.columns.values.tolist()
cl.sort()
df.sort_values(by=cl, inplace=True)
df.reset_index(inplace=True, drop=True)
return cl
def approx_compare_dataframe(df1: pd.DataFrame, df2: pd.DataFrame):
df1 = df1.copy()
df2 = df2.copy()
normalize_dataframe(df1)
col_list: list = normalize_dataframe(df2)
if not compare_columns(df1, df2):
return False
for col in col_list:
for row in df1.index:
e1 = df1[col][row]
e2 = df2[col][row]
if not compare_element(e1, e2):
print('On col ' + str(col) + ' and row ' + str(row))
print(str(e1) + ' is not equal to ' + str(e2))
print('DataFrame 1: ')
print(df1)
print('DataFrame 2: ')
print(df2)
return False
return True
Tests as follow:
index1 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns1 = pd.MultiIndex.from_tuples([('speed', 'max'),
('speed', 'min'),
('species', 'type')])
df1 = pd.DataFrame([(389.0, 388.0, 'fly'),
( 24.0, 23.0, 'fly'),
( 80.5, 80.0, 'run'),
(np.nan, -1.0, 'jump')],
index=index1,
columns=columns1)
index2 = pd.MultiIndex.from_tuples([
('mammal', 'lion'),
('mammal', 'monkey'),
('bird', 'falcon'),
('bird', 'parrot'),
],
names=['class', 'name'])
columns2 = pd.MultiIndex.from_tuples([('speed', 'max'),
('speed', 'min'),
('species', 'type')])
df2 = pd.DataFrame([
( 80.5, 80.0, 'run'),
(np.nan, -1.0, 'jump'),
(389.0, 388.0, 'fly'),
( 24.0, 23.0, 'fly'),
],
index=index2,
columns=columns2)
index3 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns3 = pd.MultiIndex.from_tuples([('speed', 'max'),
('speed', 'min'),
('species', 'type')])
df3 = pd.DataFrame([(389.00000001, 388.0, 'fly'),
( 24.0, 23.0, 'fly'),
( 80.5, 80.0, 'run'),
(np.nan, -1.0, 'jump')],
index=index3,
columns=columns3)
index4 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns4 = pd.MultiIndex.from_tuples([
('speed', 'min'),
('speed', 'max'),
('species', 'type')])
df4 = pd.DataFrame([
(388.0, 389.0, 'fly'),
(23.0, 24.0, 'fly'),
(80.0, 80.5, 'run'),
(-1.0, np.nan, 'jump')],
index=index4,
columns=columns4)
index5 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns5 = pd.MultiIndex.from_tuples([
('speed', 'min'),
('speed', 'max'),
('species', 'type')])
df5 = pd.DataFrame([
(388, 389.0, 'fly'),
(23, 24.0, 'fly'),
(80, 80.5, 'run'),
(-1, np.nan, 'jump')],
index=index5,
columns=columns5)
df5 = df5.astype({('speed', 'min'): int})
index6 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns6 = pd.MultiIndex.from_tuples([('speed', 'max'),
('speed', 'min'),
('species', 'type')])
df6 = pd.DataFrame([(388.0, 388.0, 'fly'),
( 24.0, 23.0, 'fly'),
( 80.5, 80.0, 'run'),
(np.nan, -1.0, 'jump')],
index=index6,
columns=columns6)
index7 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
('bird', 'parrot'),
('mammal', 'lion'),
('mammal', 'monkey')],
names=['class', 'name'])
columns7 = pd.MultiIndex.from_tuples([('speed', 'max'),
('speed', 'min'),
('species', 'type')])
df7 = pd.DataFrame([(389.0, 388.0, 'fly'),
( 24.0, 23.0, 'fly'),
( 80.5, 80.0, 'run'),
(np.nan, np.nan, 'jump')],
index=index7,
columns=columns7)
>>> df1
speed species
max min type
class name
bird falcon 389.0 388.0 fly
parrot 24.0 23.0 fly
mammal lion 80.5 80.0 run
monkey NaN -1.0 jump
>>> df2
speed species
max min type
class name
mammal lion 80.5 80.0 run
monkey NaN -1.0 jump
bird falcon 389.0 388.0 fly
parrot 24.0 23.0 fly
>>> df3
speed species
max min type
class name
bird falcon 389.0 388.0 fly
parrot 24.0 23.0 fly
mammal lion 80.5 80.0 run
monkey NaN -1.0 jump
>>> df4
speed species
min max type
class name
bird falcon 388.0 389.0 fly
parrot 23.0 24.0 fly
mammal lion 80.0 80.5 run
monkey -1.0 NaN jump
>>> df5
speed species
min max type
class name
bird falcon 388 389.0 fly
parrot 23 24.0 fly
mammal lion 80 80.5 run
monkey -1 NaN jump
>>> df6
speed species
max min type
class name
bird falcon 388.0 388.0 fly
parrot 24.0 23.0 fly
mammal lion 80.5 80.0 run
monkey NaN -1.0 jump
>>> df7
speed species
max min type
class name
bird falcon 389.0 388.0 fly
parrot 24.0 23.0 fly
mammal lion 80.5 80.0 run
monkey NaN NaN jump
approx_compare_dataframe(df1, df1)
True
approx_compare_dataframe(df1, df2)
True
approx_compare_dataframe(df1, df3)
True
approx_compare_dataframe(df1, df4)
True
approx_compare_dataframe(df1, df5)
Kinds of 388.0 (f) and 388 (i) are different
On col ('speed', 'min') and row 0
388.0 is not equal to 388
DataFrame 1:
class name speed species
max min type
0 bird falcon 389.0 388.0 fly
1 bird parrot 24.0 23.0 fly
2 mammal lion 80.5 80.0 run
3 mammal monkey NaN -1.0 jump
DataFrame 2:
class name speed species
min max type
0 bird falcon 388 389.0 fly
1 bird parrot 23 24.0 fly
2 mammal lion 80 80.5 run
3 mammal monkey -1 NaN jump
False
approx_compare_dataframe(df1, df6)
On col ('speed', 'max') and row 0
389.0 is not equal to 388.0
DataFrame 1:
class name speed species
max min type
0 bird falcon 389.0 388.0 fly
1 bird parrot 24.0 23.0 fly
2 mammal lion 80.5 80.0 run
3 mammal monkey NaN -1.0 jump
DataFrame 2:
class name speed species
max min type
0 bird falcon 388.0 388.0 fly
1 bird parrot 24.0 23.0 fly
2 mammal lion 80.5 80.0 run
3 mammal monkey NaN -1.0 jump
False
approx_compare_dataframe(df1, df7)
On col ('speed', 'min') and row 3
-1.0 is not equal to nan
DataFrame 1:
class name speed species
max min type
0 bird falcon 389.0 388.0 fly
1 bird parrot 24.0 23.0 fly
2 mammal lion 80.5 80.0 run
3 mammal monkey NaN -1.0 jump
DataFrame 2:
class name speed species
max min type
0 bird falcon 389.0 388.0 fly
1 bird parrot 24.0 23.0 fly
2 mammal lion 80.5 80.0 run
3 mammal monkey NaN NaN jump
False
Upvotes: 1
Reputation: 323326
We can try merge with indicator
df1.merge(df2,indicator=True,how='outer')['_merge'].eq('both').all()
False
df1.merge(df3,indicator=True,how='outer')['_merge'].eq('both').all()
True
Sample output
df1.merge(df3,indicator=True,how='outer')
Product Price _merge
0 Computer 1200.0 both
1 Phone 800.0 both
2 Printer 200.0 both
3 Desk 350.0 both
Upvotes: 1