General way to compare two pandas dataframe and its columns type

Question

I'm trying to discover what is the best way (in terms of readability) to make a function that compares any two general pandas DataFrames using the following rules

don't care for the order of columns or rows,
report differences in type of columns,
use a approximate comparison function for the float types,
and report any other obvious difference, like lacking a column, or row

For example:

>>> df1
    Product   Price
0  Computer  1200.0
1     Phone   800.0
2   Printer   200.0
3      Desk   350.0

>>> df2
    Product        Price
0     Phone   800.000000 
1  Computer  1200.000001
2   Printer   200.000000
3      Desk   350.000000

my_appr_dataframe_compare(df1, df2) should return True, even that they have swapped rows, and the value of a cell is not exactly equals

Another example:

>>> df3
      Price   Product
0    1200.0  Computer
1     800.0     Phone
2     200.0   Printer
3     350.0      Desk

my_appr_dataframe_compare(df1, df3) should return True, even that they have swapped columns

>>> df4
    Product   Price
0  Computer    1200
1     Phone     800
2   Printer     200
3      Desk     350

my_appr_dataframe_compare(df1, df4) should report difference on the 'Price1' column type, as one is an int, and the other is a float

vinicius de novaes · Accepted Answer

It gave some work, but I manage to get it done, don't think it's Pythonic though.

import pandas as pd
import numpy as np

from numbers import Real
from typing import Callable

epsilon = 0.000_1


def appr(a: Real, b: Real, factor: Real) -> bool:
    factor = abs(factor)
    return (a >= b-factor) and (a <= b+factor)


def absolute_appr(a: Real, b: Real) -> bool:
    return appr(a, b, epsilon)


def compare_element(e1, e2) -> bool:
    if not type(e1).__module__ == type(e2).__module__:
        return False
    if type(e1).__module__ == np.__name__:
        if not e1.dtype.kind == e2.dtype.kind:
            print('Kinds of ' + str(e1) + ' (' + str(e1.dtype.kind) + ') and ' +
                  str(e2) + ' (' + str(e2.dtype.kind) + ') are different')
            return False
        if e1.dtype.kind == 'f':
            if np.isnan(e1) and np.isnan(e2):
                return True
            return absolute_appr(e1, e2)
        return e1 == e2

    if not type(e1) is type(e2):
        print('Types of ' + str(e1) + ' (' + str(type(e1)) + ') and ' +
              str(e2) + ' (' + str(type(e2)) + ') are different')
        return False
    if isinstance(e1, float):
        return absolute_appr(e1, e2)
    return e1 == e2


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    s1 = set(df1.columns.values.tolist()).copy()
    s2 = set(df2.columns.values.tolist()).copy()
    return s1 == s2


def normalize_dataframe(df: pd.DataFrame) -> list:
    df.reset_index(inplace=True)
    cl: list = df.columns.values.tolist()
    cl.sort()
    df.sort_values(by=cl, inplace=True)
    df.reset_index(inplace=True, drop=True)
    return cl


def approx_compare_dataframe(df1: pd.DataFrame, df2: pd.DataFrame):
    df1 = df1.copy()
    df2 = df2.copy()
    normalize_dataframe(df1)
    col_list: list = normalize_dataframe(df2)
    if not compare_columns(df1, df2):
        return False
    for col in col_list:
        for row in df1.index:
            e1 = df1[col][row]
            e2 = df2[col][row]
            if not compare_element(e1, e2):
                print('On col ' + str(col) + ' and row ' + str(row))
                print(str(e1) + ' is not equal to ' + str(e2))
                print('DataFrame 1: ')
                print(df1)
                print('DataFrame 2: ')
                print(df2)
                return False
    return True

Tests as follow:

index1 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns1 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df1 = pd.DataFrame([(389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index1,
                  columns=columns1)


index2 = pd.MultiIndex.from_tuples([
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey'), 
                                   ('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                    ],
                                  names=['class', 'name'])
columns2 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df2 = pd.DataFrame([
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump'), 
                   (389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                    ],
                  index=index2,
                  columns=columns2)


index3 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns3 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df3 = pd.DataFrame([(389.00000001, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index3,
                  columns=columns3)


index4 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns4 = pd.MultiIndex.from_tuples([
                                     ('speed', 'min'),
                                     ('speed', 'max'),
                                     ('species', 'type')])
df4 = pd.DataFrame([
                   (388.0, 389.0, 'fly'),
                   (23.0,  24.0, 'fly'),
                   (80.0,  80.5, 'run'),
                   (-1.0, np.nan, 'jump')],
                  index=index4,
                  columns=columns4)


index5 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns5 = pd.MultiIndex.from_tuples([
                                     ('speed', 'min'),
                                     ('speed', 'max'),
                                     ('species', 'type')])
df5 = pd.DataFrame([
                   (388, 389.0, 'fly'),
                   (23,  24.0, 'fly'),
                   (80,  80.5, 'run'),
                   (-1, np.nan, 'jump')],
                  index=index5,
                  columns=columns5)

df5 = df5.astype({('speed', 'min'): int})


index6 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns6 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df6 = pd.DataFrame([(388.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index6,
                  columns=columns6)


index7 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns7 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df7 = pd.DataFrame([(389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, np.nan, 'jump')],
                  index=index7,
                  columns=columns7)

>>> df1
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df2
               speed        species
                 max    min    type
class  name                        
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly

>>> df3
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df4
               speed        species
                 min    max    type
class  name                        
bird   falcon  388.0  389.0     fly
       parrot   23.0   24.0     fly
mammal lion     80.0   80.5     run
       monkey   -1.0    NaN    jump

>>> df5
              speed        species
                min    max    type
class  name                       
bird   falcon   388  389.0     fly
       parrot    23   24.0     fly
mammal lion      80   80.5     run
       monkey    -1    NaN    jump

>>> df6

               speed        species
                 max    min    type
class  name                        
bird   falcon  388.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df7
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN    NaN    jump

approx_compare_dataframe(df1, df1)
True

approx_compare_dataframe(df1, df2)
True

approx_compare_dataframe(df1, df3)
True

approx_compare_dataframe(df1, df4)
True

approx_compare_dataframe(df1, df5)
Kinds of 388.0 (f) and 388 (i) are different
On col ('speed', 'min') and row 0
388.0 is not equal to 388
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name speed        species
                    min    max    type
0    bird  falcon   388  389.0     fly
1    bird  parrot    23   24.0     fly
2  mammal    lion    80   80.5     run
3  mammal  monkey    -1    NaN    jump
False

approx_compare_dataframe(df1, df6)
On col ('speed', 'max') and row 0
389.0 is not equal to 388.0
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  388.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
False

approx_compare_dataframe(df1, df7)
On col ('speed', 'min') and row 3
-1.0 is not equal to nan
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN    NaN    jump
False

General way to compare two pandas dataframe and its columns type

Answers (2)

Related Questions