Reputation: 41
I'm currently experimenting with different multimodal models under missing data. For that, I want to randomly corrupt the data: make 15% of the features missing and add noise to another 15%. I want to do that for this model.
import argparse
import json
import os
import random
import time
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor, __version__
from ray import tune
from sklearn.metrics import log_loss

from ..autogluon.exec import get_metric_names
from ..utils import get_exp_constraint, prepare_ag_dataset
def get_fit_hyperparameters(model_names: str) -> dict:
    """Return the AutoMM hyperparameter dict for a given fit setting.

    Args:
        model_names: One of 'fusion', 'clip', 'swin', 'vit', 'electra',
            'roberta'. NOTE(review): the CLI also advertises 'resnet',
            which is not handled here and would raise -- TODO confirm
            whether a 'resnet' branch is intended.

    Returns:
        A single hyperparameter dict to pass to ``MultiModalPredictor.fit``.
        (The previous ``Tuple[dict, dict]`` annotation was wrong: only one
        dict is ever returned.)

    Raises:
        ValueError: If ``model_names`` is not a supported setting.
    """
    if model_names == 'fusion':
        # the default one shown in table
        hyperparameters = {
            "model.names": ["hf_text", "timm_image", "clip", "categorical_mlp", "numerical_mlp", "fusion_mlp"],
            "optimization.max_epochs": 1000,
            "model.hf_text.checkpoint_name": "roberta-base",
            "model.timm_image.checkpoint_name": "vit_base_patch32_224",
        }
    elif model_names == 'clip':
        # CLIP consumes text+image only, so tabular features are stringified.
        hyperparameters = {
            "model.names": ["clip"],
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'swin':
        # relies on AutoMM's default timm checkpoint: "swin_base_patch4_window7_224"
        hyperparameters = {
            "model.names": ["timm_image"],
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'vit':
        hyperparameters = {
            "model.names": ["timm_image"],
            "model.timm_image.checkpoint_name": "vit_base_patch32_224",
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'electra':
        # relies on AutoMM's default text checkpoint: "google/electra-base-discriminator"
        hyperparameters = {
            "model.names": ["hf_text"],
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'roberta':
        hyperparameters = {
            "model.names": ["hf_text"],
            "model.hf_text.checkpoint_name": "roberta-base",
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    else:
        raise ValueError(f'Not support model_names={model_names}')
    return hyperparameters
def main(args: argparse.Namespace):
    """Train (or load) an AutoMM predictor, evaluate on the test split, and save results.

    Side effects: creates ``args.exp_save_dir``; saves the model checkpoint
    under ``<exp_save_dir>/ag_ckpt``; unless ``--do_load_ckpt`` was given,
    writes one row of results to ``<exp_save_dir>/results.csv``.
    """
    if not os.path.exists(args.exp_save_dir):
        os.makedirs(args.exp_save_dir)
    ts_duration = time.time()
    random.seed(args.seed)
    # load task configure
    # info.json is expected to carry at least 'label', 'eval_metric', 'task'.
    with open(os.path.join(args.dataset_dir, 'info.json')) as fopen:
        info_dict = json.load(fopen)
    col_label = info_dict['label']
    eval_metric = info_dict['eval_metric']
    # load train, dev, test
    train_data, dev_data, test_data, feature_metadata = prepare_ag_dataset(args.dataset_dir)
    # prepare predictor
    model_save_dir = os.path.join(args.exp_save_dir, 'ag_ckpt')
    if args.do_load_ckpt:
        # Resume from a previously saved checkpoint; training is skipped.
        predictor = MultiModalPredictor.load(model_save_dir)
    else:
        hyperparameters = get_fit_hyperparameters(args.fit_setting)
        predictor = MultiModalPredictor(label=col_label,
                                        path=model_save_dir,
                                        eval_metric=eval_metric,
                                        )
        # do train
        ts = time.time()
        predictor.fit(train_data=train_data,
                      tuning_data=dev_data,
                      time_limit=args.fit_time_limit,
                      hyperparameters=hyperparameters,
                      seed=args.seed,
                      )
        te = time.time()
        # training_duration only exists on this branch; it is consumed below
        # inside the matching `if not args.do_load_ckpt` result block.
        training_duration = te - ts
    # do test
    metric_names = get_metric_names(predictor.problem_type)
    ts = time.time()
    test_metric_res = predictor.evaluate(test_data, metrics=metric_names)
    te = time.time()
    predict_duration = te - ts
    if 'log_loss' in test_metric_res:
        # mm_predictor log_loss has some issue
        # Recompute with sklearn from predicted probabilities as a workaround.
        y_pred_proba = predictor.predict_proba(test_data)
        test_metric_res['log_loss'] = log_loss(test_data[col_label], y_pred_proba)
    print(f'Test metrics={test_metric_res}')
    te_duration = time.time()
    if not args.do_load_ckpt:
        # Persist one result row (timings + metrics) for this run.
        result = dict(
            task=info_dict['task'],
            framework=f'AutoMM-{args.fit_setting}',
            constraint=get_exp_constraint(args.fit_time_limit),
            type=predictor.problem_type,
            params=args.__dict__,
            framework_version=__version__,
            utc=datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S'),
            duration=round(te_duration-ts_duration, 1),
            training_duration=round(training_duration, 1),
            predict_duration=round(predict_duration, 1),
            seed=args.seed,
        )
        result.update(test_metric_res)
        exp_result_save_path = os.path.join(args.exp_save_dir, 'results.csv')
        result_df = pd.DataFrame.from_records([result])
        result_df.to_csv(exp_result_save_path, index=False)
if __name__ == '__main__':
    # Command-line entry point: collect experiment arguments, then run main().
    cli = argparse.ArgumentParser(description="AutoGluon Multimodal predictor arguments to set")
    # required arguments
    cli.add_argument(
        '--dataset_dir',
        type=str,
        required=True,
        help='Which dataset to use. Expect a directory contains csvs and images.',
    )
    cli.add_argument(
        '--exp_save_dir',
        type=str,
        required=True,
        help='the directory to save model checkpoints and exp result csv',
    )
    # optional arguments
    # please refer to https://auto.gluon.ai/dev/tutorials/multimodal/beginner_multimodal.html
    cli.add_argument(
        '--fit_setting',
        type=str,
        default='fusion',
        choices=['fusion', 'clip', 'swin', 'electra', 'vit', 'resnet', 'roberta'],
        help="Use which models. `fusion` represents multimodal fusion method AutoMM; `clip` represent txt-img model CLIP. default=fusion.",
    )
    cli.add_argument(
        '--fit_time_limit',
        type=int,
        default=3600,
        help="How long training should run for (wallclock time in seconds). default=3600 (1 hour)",
    )
    cli.add_argument(
        '--seed',
        type=int,
        default=0,
        help="global random seed. default=0",
    )
    cli.add_argument(
        '--do_load_ckpt',
        action='store_true',
        help='a flag. If set, model will be loaded from `exp_save_dir`, and training process will be skipped. default=False.',
    )
    args = cli.parse_args()
    print(f'[INFO] Exp arguments: {args}')
    main(args)
The data I'm using is a table that contains text, images, and tabular features, and I only want to add missingness and noise to the columns that contain text. So far I have tried replacing missing values with the string "MISSING", and for noise I either add random noise values or replace entries with random values drawn from the same column. But the results don't get worse — they somehow stay the same — and I don't understand why, since at least introducing missing features should decrease performance. Is there something I'm doing wrong when perturbing the data? The code is from here: https://github.com/lujiaying/MUG-Bench/tree/master/baselines/automm
def apply_missing_to_rows(data: pd.DataFrame, row_indices: np.ndarray, missing_value='MISSING', columns=None):
    """Return a copy of `data` where selected rows are overwritten with `missing_value`.

    Args:
        data: Input frame; it is NOT modified (a copy is returned).
        row_indices: Index labels of the rows to perturb.
        missing_value: Placeholder written into the selected cells.
        columns: Optional list of column names to perturb. Defaults to None,
            which keeps the original behavior of overwriting EVERY column.
            NOTE(review): wiping all columns also clobbers image paths and
            numeric features in those rows -- to corrupt only the text
            modality (the stated goal), pass the text column names here.

    Returns:
        A perturbed copy of `data`.
    """
    data = data.copy()
    # `data.columns` as a selector is equivalent to the original `:` slice.
    col_sel = data.columns if columns is None else columns
    data.loc[row_indices, col_sel] = missing_value
    return data
def apply_noise_to_rows(data: pd.DataFrame, row_indices: np.ndarray, columns=None):
    """Return a copy of `data` with noise injected into the selected rows.

    Numeric columns receive additive Gaussian noise with sigma equal to the
    column's standard deviation; categorical/object columns are replaced by
    values sampled uniformly from the column's observed (non-null) values.
    Uses the global `np.random` state, so seed with `np.random.seed` for
    reproducibility.

    Args:
        data: Input frame; it is NOT modified (a copy is returned).
        row_indices: Index labels of the rows to perturb.
        columns: Optional list of column names to perturb; defaults to all
            columns (the original behavior). Pass the text column names to
            corrupt only the text modality.

    Returns:
        A perturbed copy of `data`.
    """
    data = data.copy()
    col_sel = data.columns if columns is None else columns
    n = len(row_indices)
    for col in col_sel:
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            std = series.std()
            # Fix: std() is NaN for an all-NaN or single-row column; adding
            # NaN noise would silently wipe the values instead of perturbing.
            if not np.isfinite(std):
                continue
            noise = np.random.normal(0, 1.0 * std, n)
            data.loc[row_indices, col] += noise
        # Fix: pd.api.types.is_categorical_dtype is deprecated since pandas 2.1;
        # check the dtype instance instead.
        elif isinstance(series.dtype, pd.CategoricalDtype) or series.dtype == 'object':
            # Fix: drop nulls from the sampling pool so NaN is never injected
            # as a "replacement" value.
            unique_values = series.dropna().unique()
            if len(unique_values) == 0:
                continue
            random_replacements = np.random.choice(unique_values, n, replace=True)
            data.loc[row_indices, col] = random_replacements
    return data
Upvotes: 0
Views: 40