Denisevi4

Reputation: 81

Quick lightgbm vs xgboost check: is lightgbm making the wrong cut?

I wanted to compare xgboost and lightgbm on a simple dataset, and I see lightgbm making what looks like the wrong cut.

I am using the simple Iris dataset.

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets

def read_train_data():
    """
    Load the sklearn iris data for demonstration and split it into
    train and test frames (you could extend this to read from a
    distributed source such as HDFS or Hive).
    :return: (train_df, test_df) pandas DataFrames
    """
    iris = datasets.load_iris()
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

    separate = 100
    train_df = iris_df.iloc[:separate]
    test_df = iris_df.iloc[separate:]
    return train_df, test_df

def get_dmatrix(df):
    columns = list(df.columns)
    # drop any helper columns named P0..P9 if present (none exist for iris)
    for i in range(10):
        icol = f"P{i}"
        if icol in columns:
            columns.remove(icol)
    # treat the last remaining column as the regression target
    target_column = columns[-1]
    columns = columns[:-1]

    assert target_column not in columns
    print('Target column', target_column)

    x = df[columns]
    y = df[target_column]
    print(x)
    return xgb.DMatrix(data=x, label=y)

def get_lgb_dataset(df):
    columns = list(df.columns)
    # drop any helper columns named P0..P9 if present (none exist for iris)
    for i in range(10):
        icol = f"P{i}"
        if icol in columns:
            columns.remove(icol)
    # treat the last remaining column as the regression target
    target_column = columns[-1]
    columns = columns[:-1]

    assert target_column not in columns
    print('Target column', target_column)

    x = df[columns]
    y = df[target_column]
    print(x)
    return lgb.Dataset(data=x, label=y, params={
        "max_bin": 1000,
        "feature_pre_filter": False
    })

train_df, test_df = read_train_data()
dtrain = get_dmatrix(train_df)
dtest = get_dmatrix(test_df)

ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)
    
n_estimators = 1
params = {
    'max_depth': 1,
    'eta': 1,
    'silent': 1,
    'tree_method': 'hist',
    'objective': 'reg:linear',
    'class_thresholds': (0,),
    'class_thresholds_weights': (0,),
    'min_child_weight': 20,
    'reg_alpha': 0,
    'reg_lambda': 0,
    'base_score': 0,
}

evals_result = {}
bst = xgb.train(
    dtrain=dtrain,
    num_boost_round=int(n_estimators),
    params=params,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    evals_result=evals_result,
)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'max_depth': 1,
    'metric': {'rmse'},
    'num_leaves': 131072,
    'learning_rate': 1.0,
    'verbose': 0,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'deterministic': True,
}
gbm = lgb.train(params,
                ltrain,
                num_boost_round=n_estimators,
                valid_sets=[ltest])

def get_best(df):
    column = df.columns[0]
    target_column = df.columns[1]

    sorted_df = df.sort_values(column).reset_index()
    vals_list = sorted_df[column]

    best_value = None
    best_cut = None

    cuts = []
    vals = []

    previous_cut = None

    # skip the first/last 20 rows so both children hold roughly 20 rows,
    # matching min_child_weight=20 (and lightgbm's default min_data_in_leaf)
    for i in range(20, len(vals_list) - 20):
        if vals_list[i] == vals_list[i + 1]:
            continue
        cut = (vals_list[i] + vals_list[i + 1]) * 0.5
        if cut == previous_cut:
            continue
        previous_cut = cut

        df_left = df[df[column] <= cut]
        df_right = df[df[column] > cut]

        preds_left = df_left[target_column].mean()
        preds_right = df_right[target_column].mean()

        e_left = (df_left[target_column] - preds_left) ** 2
        e_right = (df_right[target_column] - preds_right) ** 2
        # note: this is the root of the *summed* squared errors,
        # i.e. sqrt(n) times the usual per-sample RMSE
        e = np.sqrt(e_left.sum() + e_right.sum())

        cuts.append(cut)
        vals.append(e)

        if best_value is None or e < best_value:
            best_value = e
            best_cut = cut
    return best_cut, best_value, cuts, vals

best_cut, best_value, cuts, vals = get_best(train_df[["petal length (cm)", "petal width (cm)"]])

lightgbm makes a slightly different cut on "petal length (cm)" than xgboost: xgboost cuts at 3.0, while lightgbm cuts at 3.15.

On the training dataset, xgboost's split gives an error of 1.5685661 while lightgbm's gives 1.767285302052378 (both apparently on the same root-sum-of-squares scale as the get_best function above).

I checked the best possible cut on "petal length (cm)" using the get_best function above. In theory the best cut value should be 2.45 (between 1.9 and 3.0), with an error of 1.5685662242952958.
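
(For scale: get_best returns the root of the *summed* squared errors rather than a per-sample RMSE; dividing by sqrt(n) converts between the two. A small check, assuming the 100 training rows used here:)

import numpy as np

# get_best() returns sqrt(sum of squared errors); with n=100 rows,
# dividing by sqrt(100) = 10 gives the per-sample RMSE scale
print(1.5685662242952958 / np.sqrt(100))  # 0.15685662242952958
print(1.767285302052378 / np.sqrt(100))   # 0.1767285302052378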

xgboost with tree_method="hist" basically finds the right split; it just places the cut value at 3.0 instead of midway between 1.9 and 3.0. With tree_method="exact", xgboost gets the cut value of 2.45.
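
A minimal sketch of that check (reusing dtrain and the xgboost params dict from above, before it is reassigned for lightgbm; get_dump() prints the tree as text):

# same single-split model, but with the exact (pre-sorted) algorithm
exact_params = dict(params, tree_method="exact")
bst_exact = xgb.train(dtrain=dtrain, num_boost_round=1, params=exact_params)
print(bst_exact.get_dump()[0])  # the split threshold should now be 2.45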

However, lightgbm misses the right cut: it thinks the split should fall between rows 50 and 51. I am not sure what happened. Maybe row 50 somehow ends up in the same bin as some other data? But I don't know how, because nbins=256 by default and the amount of data is only 100.

Upvotes: 1

Views: 329

Answers (1)

James Lamb

Reputation: 2732

Short Answer

But I don't know how, because nbins=256 by default and the amount of data is only 100.

lightgbm has a parameter, min_data_in_bin, which is used to speed up training by preventing the creation of very small bins (in the most extreme case, bins with a single sample).

The default value of this parameter is 3 (as noted in the documentation). As a general rule, lightgbm's parameter defaults are chosen to provide acceptable performance on medium-sized datasets, and need to be adjusted if you want to train on a very small dataset like the one in this example.

If you set min_data_in_bin=1 and set max_bin to a number greater than the total number of samples, every possible split of every feature will be explored, and lightgbm will find the "best" one.

Note that min_data_in_bin is a Dataset parameter, so it needs to be passed to lgb.Dataset() (not one of the training functions like lgb.train()).
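
For example, a minimal sketch (with x and y standing in for the feature frame and label column built in your code):

import lightgbm as lgb

# Dataset parameters are set here, not in lgb.train()
ds = lgb.Dataset(
    data=x,
    label=y,
    params={
        "min_data_in_bin": 1,        # allow bins holding a single sample
        "max_bin": 1000,             # more bins than rows, so no merging
        "feature_pre_filter": False,
    },
)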

Explanation and Reproducible Example

I created a reproducible example based on your provided code, using lightgbm==3.2.1 installed from PyPI with pip install lightgbm.

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error as mse

def get_lgb_dataset(df):
    columns = list(df.columns)
    for i in range(10):
        icol = f"P{i}"
        if icol in columns:
            columns.remove(icol)
    target_column = columns[-1]
    columns = columns[:-1]
    x = df[columns]
    y = df[target_column]
    return lgb.Dataset(
        data=x,
        label=y,
        params={
            "max_bin": 1000,
            "feature_pre_filter": False
        }
    )

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]

ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'max_depth': 1,
    'metric': {'rmse'},
    'learning_rate': 1.0,
    'deterministic': True,
}
gbm = lgb.train(
    params,
    ltrain,
    num_boost_round=1,
    valid_sets=[ltest]
)

# see chosen split
print(gbm.trees_to_dataframe()[["tree_index", "node_depth", "split_feature", "threshold", "split_gain", "count"]])

# get rmse on the training data
feature_cols = train_df.columns[:-1]
target_col = train_df.columns[-1]

# RMSE on training set
actuals = train_df[target_col].values
preds = gbm.predict(train_df[feature_cols])
print("----")
print(f"RMSE on train set: {np.sqrt(mse(actuals, preds))}")

lightgbm produces a model with a single tree containing one split, petal_length_(cm) <= 3.15. That model has an RMSE of 0.1767 on the training set.

   tree_index  node_depth      split_feature  threshold  split_gain  count
0           0           1  petal_length_(cm)       3.15   28.497101    100
1           0           2               None        NaN         NaN     51
2           0           2               None        NaN         NaN     49
----
RMSE on train set: 0.17672852964158403
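
You can see the mechanism by counting how many training rows share each feature value near the boundary: a value held by fewer than min_data_in_bin=3 rows cannot form its own bin and gets merged with a neighboring one. A small diagnostic, assuming the train_df from the example above:

# rows per distinct petal-length value around the candidate split
counts = train_df["petal length (cm)"].value_counts().sort_index()
print(counts.loc[1.5:3.5])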

I then re-ran the same code, but added min_data_in_bin: 1 to the params passed into lgb.Dataset().

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error as mse

def get_lgb_dataset(df):
    columns = list(df.columns)
    for i in range(10):
        icol = f"P{i}"
        if icol in columns:
            columns.remove(icol)
    target_column = columns[-1]
    columns = columns[:-1]
    x = df[columns]
    y = df[target_column]
    return lgb.Dataset(
        data=x,
        label=y,
        params={
            "max_bin": 1000,
            "min_data_in_bin": 1,
            "feature_pre_filter": False
        }
    )

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]

ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'max_depth': 1,
    'metric': {'rmse'},
    'learning_rate': 1.0,
    'deterministic': True,
}
gbm = lgb.train(
    params,
    ltrain,
    num_boost_round=1,
    valid_sets=[ltest]
)

# see chosen split
print(gbm.trees_to_dataframe()[["tree_index", "node_depth", "split_feature", "threshold", "split_gain", "count"]])

# get rmse on the training data
feature_cols = train_df.columns[:-1]
target_col = train_df.columns[-1]

# RMSE on training set
actuals = train_df[target_col].values
preds = gbm.predict(train_df[feature_cols])
print("----")
print(f"RMSE on train set: {np.sqrt(mse(actuals, preds))}")

As you can see below, lightgbm now chooses the same split that you found to be best for the RMSE objective, and which xgboost chose with tree_method="exact".

   tree_index  node_depth      split_feature  threshold  split_gain  count
0           0           1  petal_length_(cm)       2.45       29.16    100
1           0           2               None        NaN         NaN     50
2           0           2               None        NaN         NaN     50
----
RMSE on train set: 0.15685662242953116

Upvotes: 4
