vinicius de novaes
vinicius de novaes

Reputation: 431

General way to compare two pandas dataframe and its columns type

I'm trying to discover what is the best way (in terms of readability) to make a function that compares any two general pandas DataFrames using the following rules

  1. don't care for the order of columns or rows,
  2. report differences in type of columns,
  3. use a approximate comparison function for the float types,
  4. and report any other obvious difference, like lacking a column, or row

For example:

>>> df1
    Product   Price
0  Computer  1200.0
1     Phone   800.0
2   Printer   200.0
3      Desk   350.0
>>> df2
    Product        Price
0     Phone   800.000000 
1  Computer  1200.000001
2   Printer   200.000000
3      Desk   350.000000

my_appr_dataframe_compare(df1, df2) should return True, even that they have swapped rows, and the value of a cell is not exactly equals

Another example:

>>> df3
      Price   Product
0    1200.0  Computer
1     800.0     Phone
2     200.0   Printer
3     350.0      Desk

my_appr_dataframe_compare(df1, df3) should return True, even that they have swapped columns

>>> df4
    Product   Price
0  Computer    1200
1     Phone     800
2   Printer     200
3      Desk     350

my_appr_dataframe_compare(df1, df4) should report difference on the 'Price1' column type, as one is an int, and the other is a float

Upvotes: 0

Views: 92

Answers (2)

vinicius de novaes
vinicius de novaes

Reputation: 431

It gave some work, but I manage to get it done, don't think it's Pythonic though.

import pandas as pd
import numpy as np

from numbers import Real
from typing import Callable

epsilon = 0.000_1


def appr(a: Real, b: Real, factor: Real) -> bool:
    factor = abs(factor)
    return (a >= b-factor) and (a <= b+factor)


def absolute_appr(a: Real, b: Real) -> bool:
    return appr(a, b, epsilon)


def compare_element(e1, e2) -> bool:
    if not type(e1).__module__ == type(e2).__module__:
        return False
    if type(e1).__module__ == np.__name__:
        if not e1.dtype.kind == e2.dtype.kind:
            print('Kinds of ' + str(e1) + ' (' + str(e1.dtype.kind) + ') and ' +
                  str(e2) + ' (' + str(e2.dtype.kind) + ') are different')
            return False
        if e1.dtype.kind == 'f':
            if np.isnan(e1) and np.isnan(e2):
                return True
            return absolute_appr(e1, e2)
        return e1 == e2

    if not type(e1) is type(e2):
        print('Types of ' + str(e1) + ' (' + str(type(e1)) + ') and ' +
              str(e2) + ' (' + str(type(e2)) + ') are different')
        return False
    if isinstance(e1, float):
        return absolute_appr(e1, e2)
    return e1 == e2


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> bool:
    s1 = set(df1.columns.values.tolist()).copy()
    s2 = set(df2.columns.values.tolist()).copy()
    return s1 == s2


def normalize_dataframe(df: pd.DataFrame) -> list:
    df.reset_index(inplace=True)
    cl: list = df.columns.values.tolist()
    cl.sort()
    df.sort_values(by=cl, inplace=True)
    df.reset_index(inplace=True, drop=True)
    return cl


def approx_compare_dataframe(df1: pd.DataFrame, df2: pd.DataFrame):
    df1 = df1.copy()
    df2 = df2.copy()
    normalize_dataframe(df1)
    col_list: list = normalize_dataframe(df2)
    if not compare_columns(df1, df2):
        return False
    for col in col_list:
        for row in df1.index:
            e1 = df1[col][row]
            e2 = df2[col][row]
            if not compare_element(e1, e2):
                print('On col ' + str(col) + ' and row ' + str(row))
                print(str(e1) + ' is not equal to ' + str(e2))
                print('DataFrame 1: ')
                print(df1)
                print('DataFrame 2: ')
                print(df2)
                return False
    return True

Tests as follow:

index1 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns1 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df1 = pd.DataFrame([(389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index1,
                  columns=columns1)


index2 = pd.MultiIndex.from_tuples([
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey'), 
                                   ('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                    ],
                                  names=['class', 'name'])
columns2 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df2 = pd.DataFrame([
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump'), 
                   (389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                    ],
                  index=index2,
                  columns=columns2)


index3 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns3 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df3 = pd.DataFrame([(389.00000001, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index3,
                  columns=columns3)


index4 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns4 = pd.MultiIndex.from_tuples([
                                     ('speed', 'min'),
                                     ('speed', 'max'),
                                     ('species', 'type')])
df4 = pd.DataFrame([
                   (388.0, 389.0, 'fly'),
                   (23.0,  24.0, 'fly'),
                   (80.0,  80.5, 'run'),
                   (-1.0, np.nan, 'jump')],
                  index=index4,
                  columns=columns4)


index5 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns5 = pd.MultiIndex.from_tuples([
                                     ('speed', 'min'),
                                     ('speed', 'max'),
                                     ('species', 'type')])
df5 = pd.DataFrame([
                   (388, 389.0, 'fly'),
                   (23,  24.0, 'fly'),
                   (80,  80.5, 'run'),
                   (-1, np.nan, 'jump')],
                  index=index5,
                  columns=columns5)

df5 = df5.astype({('speed', 'min'): int})


index6 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns6 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df6 = pd.DataFrame([(388.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, -1.0, 'jump')],
                  index=index6,
                  columns=columns6)


index7 = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                   ('bird', 'parrot'),
                                   ('mammal', 'lion'),
                                   ('mammal', 'monkey')],
                                  names=['class', 'name'])
columns7 = pd.MultiIndex.from_tuples([('speed', 'max'),
                                     ('speed', 'min'),
                                     ('species', 'type')])
df7 = pd.DataFrame([(389.0, 388.0, 'fly'),
                   ( 24.0, 23.0, 'fly'),
                   ( 80.5, 80.0, 'run'),
                   (np.nan, np.nan, 'jump')],
                  index=index7,
                  columns=columns7)

>>> df1
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df2
               speed        species
                 max    min    type
class  name                        
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly

>>> df3
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df4
               speed        species
                 min    max    type
class  name                        
bird   falcon  388.0  389.0     fly
       parrot   23.0   24.0     fly
mammal lion     80.0   80.5     run
       monkey   -1.0    NaN    jump

>>> df5
              speed        species
                min    max    type
class  name                       
bird   falcon   388  389.0     fly
       parrot    23   24.0     fly
mammal lion      80   80.5     run
       monkey    -1    NaN    jump

>>> df6

               speed        species
                 max    min    type
class  name                        
bird   falcon  388.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN   -1.0    jump

>>> df7
               speed        species
                 max    min    type
class  name                        
bird   falcon  389.0  388.0     fly
       parrot   24.0   23.0     fly
mammal lion     80.5   80.0     run
       monkey    NaN    NaN    jump

approx_compare_dataframe(df1, df1)
True

approx_compare_dataframe(df1, df2)
True

approx_compare_dataframe(df1, df3)
True

approx_compare_dataframe(df1, df4)
True

approx_compare_dataframe(df1, df5)
Kinds of 388.0 (f) and 388 (i) are different
On col ('speed', 'min') and row 0
388.0 is not equal to 388
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name speed        species
                    min    max    type
0    bird  falcon   388  389.0     fly
1    bird  parrot    23   24.0     fly
2  mammal    lion    80   80.5     run
3  mammal  monkey    -1    NaN    jump
False

approx_compare_dataframe(df1, df6)
On col ('speed', 'max') and row 0
389.0 is not equal to 388.0
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  388.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
False

approx_compare_dataframe(df1, df7)
On col ('speed', 'min') and row 3
-1.0 is not equal to nan
DataFrame 1: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN   -1.0    jump
DataFrame 2: 
    class    name  speed        species
                     max    min    type
0    bird  falcon  389.0  388.0     fly
1    bird  parrot   24.0   23.0     fly
2  mammal    lion   80.5   80.0     run
3  mammal  monkey    NaN    NaN    jump
False

Upvotes: 1

BENY
BENY

Reputation: 323326

We can try merge with indicator

df1.merge(df2,indicator=True,how='outer')['_merge'].eq('both').all()
False
df1.merge(df3,indicator=True,how='outer')['_merge'].eq('both').all()
True

Sample output

df1.merge(df3,indicator=True,how='outer')
    Product   Price _merge
0  Computer  1200.0   both
1     Phone   800.0   both
2   Printer   200.0   both
3      Desk   350.0   both

Upvotes: 1

Related Questions