Reputation: 41
I'm currently experimenting with different multimodal models under missing data. For that, I want to randomly corrupt the data: make 15% of the features missing and add noise to another 15%. I want to do that for this model.
import argparse
import json
import os
import random
import time
from datetime import datetime
from typing import Tuple

import numpy as np
import pandas as pd
from autogluon.multimodal import MultiModalPredictor, __version__
from ray import tune
from sklearn.metrics import log_loss

from ..autogluon.exec import get_metric_names
from ..utils import get_exp_constraint, prepare_ag_dataset
def get_fit_hyperparameters(model_names: str) -> dict:
    """Return the AutoMM hyperparameter dict for a given fit setting.

    Args:
        model_names: One of 'fusion', 'clip', 'swin', 'vit', 'electra',
            'roberta'. NOTE(review): the CLI also advertises 'resnet',
            which is not handled here and would raise -- TODO confirm
            whether a 'resnet' branch is intended.

    Returns:
        A single hyperparameter dict to pass to ``MultiModalPredictor.fit``.
        (The previous ``Tuple[dict, dict]`` annotation was wrong: only one
        dict is ever returned.)

    Raises:
        ValueError: If ``model_names`` is not a supported setting.
    """
    if model_names == 'fusion':
        # the default one shown in table
        hyperparameters = {
            "model.names": ["hf_text", "timm_image", "clip", "categorical_mlp", "numerical_mlp", "fusion_mlp"],
            "optimization.max_epochs": 1000,
            "model.hf_text.checkpoint_name": "roberta-base",
            "model.timm_image.checkpoint_name": "vit_base_patch32_224",
        }
    elif model_names == 'clip':
        # CLIP consumes text+image only, so tabular features are stringified.
        hyperparameters = {
            "model.names": ["clip"],
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'swin':
        # relies on AutoMM's default timm checkpoint: "swin_base_patch4_window7_224"
        hyperparameters = {
            "model.names": ["timm_image"],
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'vit':
        hyperparameters = {
            "model.names": ["timm_image"],
            "model.timm_image.checkpoint_name": "vit_base_patch32_224",
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'electra':
        # relies on AutoMM's default text checkpoint: "google/electra-base-discriminator"
        hyperparameters = {
            "model.names": ["hf_text"],
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    elif model_names == 'roberta':
        hyperparameters = {
            "model.names": ["hf_text"],
            "model.hf_text.checkpoint_name": "roberta-base",
            "data.categorical.convert_to_text": True,
            "data.numerical.convert_to_text": True,
            "optimization.max_epochs": 1000,
        }
    else:
        raise ValueError(f'Not support model_names={model_names}')
    return hyperparameters
def main(args: argparse.Namespace):
    """Train (or load) an AutoMM predictor, evaluate on the test split, and save results.

    Side effects: creates ``args.exp_save_dir``; saves the model checkpoint
    under ``<exp_save_dir>/ag_ckpt``; unless ``--do_load_ckpt`` was given,
    writes one row of results to ``<exp_save_dir>/results.csv``.
    """
    if not os.path.exists(args.exp_save_dir):
        os.makedirs(args.exp_save_dir)
    ts_duration = time.time()
    random.seed(args.seed)
    # load task configure
    # info.json is expected to carry at least 'label', 'eval_metric', 'task'.
    with open(os.path.join(args.dataset_dir, 'info.json')) as fopen:
        info_dict = json.load(fopen)
    col_label = info_dict['label']
    eval_metric = info_dict['eval_metric']
    # load train, dev, test
    train_data, dev_data, test_data, feature_metadata = prepare_ag_dataset(args.dataset_dir)
    # prepare predictor
    model_save_dir = os.path.join(args.exp_save_dir, 'ag_ckpt')
    if args.do_load_ckpt:
        # Resume from a previously saved checkpoint; training is skipped.
        predictor = MultiModalPredictor.load(model_save_dir)
    else:
        hyperparameters = get_fit_hyperparameters(args.fit_setting)
        predictor = MultiModalPredictor(label=col_label,
                                        path=model_save_dir,
                                        eval_metric=eval_metric,
                                        )
        # do train
        ts = time.time()
        predictor.fit(train_data=train_data,
                      tuning_data=dev_data,
                      time_limit=args.fit_time_limit,
                      hyperparameters=hyperparameters,
                      seed=args.seed,
                      )
        te = time.time()
        # training_duration only exists on this branch; it is consumed below
        # inside the matching `if not args.do_load_ckpt` result block.
        training_duration = te - ts
    # do test
    metric_names = get_metric_names(predictor.problem_type)
    ts = time.time()
    test_metric_res = predictor.evaluate(test_data, metrics=metric_names)
    te = time.time()
    predict_duration = te - ts
    if 'log_loss' in test_metric_res:
        # mm_predictor log_loss has some issue
        # Recompute with sklearn from predicted probabilities as a workaround.
        y_pred_proba = predictor.predict_proba(test_data)
        test_metric_res['log_loss'] = log_loss(test_data[col_label], y_pred_proba)
    print(f'Test metrics={test_metric_res}')
    te_duration = time.time()
    if not args.do_load_ckpt:
        # Persist one result row (timings + metrics) for this run.
        result = dict(
            task=info_dict['task'],
            framework=f'AutoMM-{args.fit_setting}',
            constraint=get_exp_constraint(args.fit_time_limit),
            type=predictor.problem_type,
            params=args.__dict__,
            framework_version=__version__,
            utc=datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S'),
            duration=round(te_duration-ts_duration, 1),
            training_duration=round(training_duration, 1),
            predict_duration=round(predict_duration, 1),
            seed=args.seed,
        )
        result.update(test_metric_res)
        exp_result_save_path = os.path.join(args.exp_save_dir, 'results.csv')
        result_df = pd.DataFrame.from_records([result])
        result_df.to_csv(exp_result_save_path, index=False)
if __name__ == '__main__':
    # Command-line entry point: collect experiment arguments, then run main().
    cli = argparse.ArgumentParser(description="AutoGluon Multimodal predictor arguments to set")
    # required arguments
    cli.add_argument(
        '--dataset_dir',
        type=str,
        required=True,
        help='Which dataset to use. Expect a directory contains csvs and images.',
    )
    cli.add_argument(
        '--exp_save_dir',
        type=str,
        required=True,
        help='the directory to save model checkpoints and exp result csv',
    )
    # optional arguments
    # please refer to https://auto.gluon.ai/dev/tutorials/multimodal/beginner_multimodal.html
    cli.add_argument(
        '--fit_setting',
        type=str,
        default='fusion',
        choices=['fusion', 'clip', 'swin', 'electra', 'vit', 'resnet', 'roberta'],
        help="Use which models. `fusion` represents multimodal fusion method AutoMM; `clip` represent txt-img model CLIP. default=fusion.",
    )
    cli.add_argument(
        '--fit_time_limit',
        type=int,
        default=3600,
        help="How long training should run for (wallclock time in seconds). default=3600 (1 hour)",
    )
    cli.add_argument(
        '--seed',
        type=int,
        default=0,
        help="global random seed. default=0",
    )
    cli.add_argument(
        '--do_load_ckpt',
        action='store_true',
        help='a flag. If set, model will be loaded from `exp_save_dir`, and training process will be skipped. default=False.',
    )
    args = cli.parse_args()
    print(f'[INFO] Exp arguments: {args}')
    main(args)
The data I'm using is a table that contains text, images, and tabular features, and I only want to add missingness and noise to the columns that contain text. So far I have tried replacing missing values with the string "MISSING", and for noise I either add random noise values or replace entries with random values drawn from the same column. But the results don't get worse — they somehow stay the same — and I don't understand why, since at least introducing missing features should decrease performance. Is there something I'm doing wrong when perturbing the data? The code is from here: https://github.com/lujiaying/MUG-Bench/tree/master/baselines/automm
def apply_missing_to_rows(data: pd.DataFrame, row_indices: np.ndarray, missing_value='MISSING', columns=None):
    """Return a copy of `data` where selected rows are overwritten with `missing_value`.

    Args:
        data: Input frame; it is NOT modified (a copy is returned).
        row_indices: Index labels of the rows to perturb.
        missing_value: Placeholder written into the selected cells.
        columns: Optional list of column names to perturb. Defaults to None,
            which keeps the original behavior of overwriting EVERY column.
            NOTE(review): wiping all columns also clobbers image paths and
            numeric features in those rows -- to corrupt only the text
            modality (the stated goal), pass the text column names here.

    Returns:
        A perturbed copy of `data`.
    """
    data = data.copy()
    # `data.columns` as a selector is equivalent to the original `:` slice.
    col_sel = data.columns if columns is None else columns
    data.loc[row_indices, col_sel] = missing_value
    return data
def apply_noise_to_rows(data: pd.DataFrame, row_indices: np.ndarray, columns=None):
    """Return a copy of `data` with noise injected into the selected rows.

    Numeric columns receive additive Gaussian noise with sigma equal to the
    column's standard deviation; categorical/object columns are replaced by
    values sampled uniformly from the column's observed (non-null) values.
    Uses the global `np.random` state, so seed with `np.random.seed` for
    reproducibility.

    Args:
        data: Input frame; it is NOT modified (a copy is returned).
        row_indices: Index labels of the rows to perturb.
        columns: Optional list of column names to perturb; defaults to all
            columns (the original behavior). Pass the text column names to
            corrupt only the text modality.

    Returns:
        A perturbed copy of `data`.
    """
    data = data.copy()
    col_sel = data.columns if columns is None else columns
    n = len(row_indices)
    for col in col_sel:
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            std = series.std()
            # Fix: std() is NaN for an all-NaN or single-row column; adding
            # NaN noise would silently wipe the values instead of perturbing.
            if not np.isfinite(std):
                continue
            noise = np.random.normal(0, 1.0 * std, n)
            data.loc[row_indices, col] += noise
        # Fix: pd.api.types.is_categorical_dtype is deprecated since pandas 2.1;
        # check the dtype instance instead.
        elif isinstance(series.dtype, pd.CategoricalDtype) or series.dtype == 'object':
            # Fix: drop nulls from the sampling pool so NaN is never injected
            # as a "replacement" value.
            unique_values = series.dropna().unique()
            if len(unique_values) == 0:
                continue
            random_replacements = np.random.choice(unique_values, n, replace=True)
            data.loc[row_indices, col] = random_replacements
    return data
Upvotes: 0
Views: 40