AIF360 PYTHON - OPTIMIZED PRE-PROCESSING

Question

I have been working with the AIF360 library in Python. I get the error 'MemoryError: Unable to allocate 42.7 TiB for an array with shape (46926397440000,) and data type int8', even though my dataset is very small (at most 1000 rows). The code that raises the error is shown below.

The other pre-processing techniques work as expected, which suggests that the problem is not in the dataset structure or in the metrics.

Does anyone have experience with this library and this function?

Tips are welcome!

**To try it out yourself (on the German credit dataset):** Libraries

!pip install aif360
from aif360.algorithms.preprocessing import LFR
import pandas as pd
import numpy as np
import scipy.optimize as optim

from aif360.algorithms import Transformer as TR
from aif360.algorithms.preprocessing.lfr_helpers import helpers as lfr_helpers
from aif360.algorithms.preprocessing import Reweighing

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing import LFR, Reweighing

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_curve, auc
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
#import tensorflow as tf

from aif360.datasets import GermanDataset, BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.algorithms.preprocessing import OptimPreproc

from IPython.display import Markdown, display

import cvxpy as cp
import numpy as np
from aif360.algorithms.preprocessing.optim_preproc_helpers.opt_tools import OptTools

# Load the credit data from a local CSV file into a pandas DataFrame.
data = pd.read_csv('original_data.csv')
# Preview the first rows (presumably run as a notebook cell, where the
# result of the last expression is displayed).
data.head()

# Example: fairness assessment with 'Gender' as the protected attribute.

# Wrap the raw frame in an AIF360 StandardDataset. 'Creditability' is the
# label (class 1 is the favorable outcome), 'Gender' is the protected
# attribute with value 0 as the privileged class, and the age / nationality
# columns are dropped from the feature set.
dataset_gender = StandardDataset(
    data,
    label_name='Creditability',
    favorable_classes=[1],
    protected_attribute_names=['Gender'],
    privileged_classes=[[0]],
    features_to_drop=['Age', 'Foreign_worker', 'Age_grouped'],
)

# Shuffle and split once at the 0.7 mark into a train part and a test part.
(dataset_orig_train_gender,
 dataset_orig_test_gender) = dataset_gender.split([0.7], shuffle=True)

# Group definitions used by the metric below; they mirror the
# privileged_classes setting above (Gender == 0 is privileged).
unprivileged_groups_gender = [{'Gender': 1}]
privileged_groups_gender = [{'Gender': 0}]

# Dataset-level fairness metrics computed on the training split only.
metric_orig_train_gender = BinaryLabelDatasetMetric(
    dataset_orig_train_gender,
    privileged_groups=privileged_groups_gender,
    unprivileged_groups=unprivileged_groups_gender,
)

# Report statistical parity difference (SPD) and disparate impact (DI).
print("SPD gender= %f"
      % (metric_orig_train_gender.statistical_parity_difference(),))
print("DI gender = %f"
      % (metric_orig_train_gender.disparate_impact(),))

CODE FOR THE OPTIMIZED PRE-PROCESSING:

# Settings handed to OptimPreproc: an epsilon value plus the 'clist' and
# 'dlist' lists.
# NOTE(review): the AIF360 demo for this algorithm also supplies a
# "distortion_fun" entry in this dict -- confirm whether fit() requires it.
optim_options = dict(
    epsilon=0.05,
    clist=[0.99, 1.99, 2.99],
    dlist=[0.1, 0.05, 0],
)

# Build the pre-processor around the OptTools optimizer and fit it on the
# training split.
# NOTE(review): the reported MemoryError presumably originates in fit(),
# which appears to work over combinations of feature values; the AIF360 demo
# runs it on a dataset reduced to a few low-cardinality categorical
# features -- confirm and consider reducing/binning the features first.
OP = OptimPreproc(OptTools, optim_options)
OP = OP.fit(dataset_orig_train_gender)

# Transform the training split (labels included via transform_Y=True), then
# align the transformed dataset with the original training dataset.
dataset_preprocess_train_gender = OP.transform(
    dataset_orig_train_gender,
    transform_Y=True,
)
dataset_preprocess_train_gender = dataset_orig_train_gender.align_datasets(
    dataset_preprocess_train_gender
)

AIF360 PYTHON - OPTIMIZED PRE-PROCESSING

Answers (1)

Related Questions