Reputation: 81
I wanted to compare xgboost and lightgbm on a simple dataset, and I see that lightgbm is making the wrong cut.
I am using the simple Iris dataset.
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import datasets
def read_train_data():
    """
    Read the training and test data.
    We use the sklearn iris data for demonstration; the first 100 rows
    become the training set and the rest the test set.
    You can extend this to read a distributed data source like HDFS, HIVE etc.
    :return: (train_df, test_df) pandas DataFrames
    """
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]
return train_df, test_df
def get_dmatrix(df):
columns = list(df.columns)
for i in range(10):
icol = f"P{i}"
if icol in columns:
columns.remove(icol)
target_column = columns[-1]
columns = columns[:-1]
assert target_column not in columns
print('Target column', target_column)
x = df[columns]
y = df[target_column]
print(x)
return xgb.DMatrix(data=x, label=y)
def get_lgb_dataset(df):
columns = list(df.columns)
for i in range(10):
icol = f"P{i}"
if icol in columns:
columns.remove(icol)
target_column = columns[-1]
columns = columns[:-1]
assert target_column not in columns
print('Target column', target_column)
x = df[columns]
y = df[target_column]
print(x)
return lgb.Dataset(data=x, label=y, params={
"max_bin": 1000,
"feature_pre_filter": False
})
train_df, test_df = read_train_data()
dtrain = get_dmatrix(train_df)
dtest = get_dmatrix(test_df)
ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)
n_estimators = 1
params = {'max_depth': 1,
'eta': 1,
'silent': 1,
"tree_method": "hist",
"objective": "reg:linear",
'class_thresholds': (0,),
'class_thresholds_weights': (0,),
'min_child_weight': 20,
'reg_alpha': 0,
'reg_lambda': 0,
'base_score': 0
}
evals_result = {}
bst = xgb.train(
dtrain=dtrain,
num_boost_round=int(n_estimators),
params=params,
evals=[(dtrain, 'train'), (dtest, 'test')],
evals_result=evals_result
)
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'max_depth': 1,
'metric': {'rmse'},
'num_leaves': 131072,
'learning_rate': 1.0,
'verbose': 0,
'lambda_l1': 0,
'lambda_l2': 0,
'deterministic': True,
}
gbm = lgb.train(params,
ltrain,
num_boost_round=n_estimators,
valid_sets=ltest)
def get_best(df):
column = df.columns[0]
target_column = df.columns[1]
sorted_df = df.sort_values(column).reset_index()
    # plotutil.display(sorted_df.drop(columns=['index']))  # optional display helper, not a standard module
    #return sorted_df
vals_list = sorted_df[column]
best_value = None
best_cut = None
cuts = []
vals = []
previous_cut = None
for i in range(20, len(vals_list)-20):
cut = (vals_list[i] + vals_list[i+1]) * 0.5
if cut == previous_cut:
continue
if vals_list[i] == vals_list[i+1]:
continue
previous_cut = cut
df_left = df[df[column] <= cut]
df_right = df[df[column] > cut]
preds_left = df_left[target_column].mean()
preds_right = df_right[target_column].mean()
e_left = (df_left[target_column] - preds_left)**2
e_right = (df_right[target_column] - preds_right)**2
        # root of the TOTAL squared error over both sides of the cut
        # (differs from the per-sample RMSE by a constant factor of sqrt(n))
        e = np.sqrt(e_left.sum() + e_right.sum())
cuts.append(cut)
vals.append(e)
if best_value is None or e < best_value:
best_value = e
best_cut = cut
return best_cut, best_value, cuts, vals
a = get_best(train_df[["petal length (cm)", "petal width (cm)"]])
lightgbm makes a slightly different cut on "petal length (cm)" than xgboost: xgboost places the cut at 3.0, while lightgbm places it at 3.15.
On the training dataset, xgboost gets an RMSE of 1.5685661, while lightgbm gets 1.767285302052378.
I checked the best possible cut on "petal length (cm)" using the get_best function above. In theory the best cut value should be 2.45 (between 1.9 and 3.0), with RMSE 1.5685662242952958.
xgboost with tree_method="hist" basically finds the right split; it just places the cut value at 3.0 instead of halfway between 1.9 and 3.0. If one uses tree_method="exact", then xgboost gets the cut value of 2.45.
However, lightgbm misses the right cut - it thinks the split should fall between rows 50 and 51. I am not sure what happened. Maybe row 50 somehow gets into the same bin as some other data? But I don't know how because nbins=256 by default and the amount of data is 100.
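For reference, here is one way I look at the cut each booster actually chose (a minimal sketch; it assumes the installed xgboost and lightgbm versions expose Booster.trees_to_dataframe(), and uses the bst and gbm boosters trained above):
# Inspect the single split each library chose
print(bst.trees_to_dataframe()[["Tree", "Feature", "Split"]])                   # xgboost threshold
print(gbm.trees_to_dataframe()[["tree_index", "split_feature", "threshold"]])   # lightgbm threshold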
Upvotes: 1
Views: 329
Reputation: 2732
Short Answer
But I don't know how because nbins=256 by default and the amount of data is 100.
lightgbm has a parameter, min_data_in_bin, which is used to speed up training by preventing the creation of very small bins (in the most extreme case, bins with a single sample). The default value of this parameter is 3 (as noted in the documentation). As a general rule, lightgbm's parameter defaults are chosen to provide acceptable performance on medium-sized datasets, and they need to be adjusted if you want to train on a very small dataset like the one in this example.
If you set min_data_in_bin=1 and set max_bin to a number greater than the total number of samples, every possible split in every feature will be explored, and lightgbm will find the "best" one.
Note that min_data_in_bin is a Dataset parameter, so it needs to be passed to lgb.Dataset() (not to one of the training functions like lgb.train()).
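For example, here is a minimal sketch of where the parameter goes (X and y below are placeholders for a feature frame and a regression target; the full, runnable example is in the next section):
import lightgbm as lgb
# min_data_in_bin and max_bin are bin-construction (Dataset) parameters,
# so they belong in lgb.Dataset(), not in the params passed to lgb.train()
dtrain = lgb.Dataset(
    data=X,   # placeholder feature DataFrame
    label=y,  # placeholder regression target
    params={
        "min_data_in_bin": 1,    # allow bins that hold a single sample
        "max_bin": 1000,         # more bins than rows, so no split candidates are merged away
        "feature_pre_filter": False,
    },
)
booster = lgb.train({"objective": "regression", "max_depth": 1}, dtrain, num_boost_round=1)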
Explanation and Reproducible Example
I created a reproducible example based on your provided code, using lightgbm 3.2.1 installed from PyPI with pip install lightgbm.
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error as mse
def get_lgb_dataset(df):
columns = list(df.columns)
for i in range(10):
icol = f"P{i}"
if icol in columns:
columns.remove(icol)
target_column = columns[-1]
columns = columns[:-1]
x = df[columns]
y = df[target_column]
return lgb.Dataset(
data=x,
label=y,
params={
"max_bin": 1000,
"feature_pre_filter": False
}
)
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]
ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'max_depth': 1,
'metric': {'rmse'},
'learning_rate': 1.0,
'deterministic': True,
}
gbm = lgb.train(
params,
ltrain,
num_boost_round=1,
valid_sets=[ltest]
)
# see chosen split
print(gbm.trees_to_dataframe()[["tree_index", "node_depth", "split_feature", "threshold", "split_gain", "count"]])
# get rmse on the training data
feature_cols = train_df.columns[:-1]
target_col = train_df.columns[-1]
# RMSE on training set
actuals = train_df[target_col].values
preds = gbm.predict(train_df[feature_cols])
print("----")
print(f"RMSE on train set: {np.sqrt(mse(actuals, preds))}")
lightgbm produces a model with 1 tree, with a single split on petal_length_(cm) <= 3.15. That model has an RMSE of 0.1767 on the training set.
tree_index node_depth split_feature threshold split_gain count
0 0 1 petal_length_(cm) 3.15 28.497101 100
1 0 2 None NaN NaN 51
2 0 2 None NaN NaN 49
----
RMSE on train set: 0.17672852964158403
I then re-ran the same code, but added min_data_in_bin: 1 to the params passed into lgb.Dataset().
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error as mse
def get_lgb_dataset(df):
columns = list(df.columns)
for i in range(10):
icol = f"P{i}"
if icol in columns:
columns.remove(icol)
target_column = columns[-1]
columns = columns[:-1]
x = df[columns]
y = df[target_column]
return lgb.Dataset(
data=x,
label=y,
params={
"max_bin": 1000,
"min_data_in_bin": 1,
"feature_pre_filter": False
}
)
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
separate = 100
train_df = iris_df.iloc[:separate]
test_df = iris_df.iloc[separate:]
ltrain = get_lgb_dataset(train_df)
ltest = get_lgb_dataset(test_df)
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'max_depth': 1,
'metric': {'rmse'},
'learning_rate': 1.0,
'deterministic': True,
}
gbm = lgb.train(
params,
ltrain,
num_boost_round=1,
valid_sets=[ltest]
)
# see chosen split
print(gbm.trees_to_dataframe()[["tree_index", "node_depth", "split_feature", "threshold", "split_gain", "count"]])
# get rmse on the training data
feature_cols = train_df.columns[:-1]
target_col = train_df.columns[-1]
# RMSE on training set
actuals = train_df[target_col].values
preds = gbm.predict(train_df[feature_cols])
print("----")
print(f"RMSE on train set: {np.sqrt(mse(actuals, preds))}")
As you can see below, lightgbm now chooses the same split that you found to be best for the RMSE objective, and which xgboost chose when using the exact mode.
tree_index node_depth split_feature threshold split_gain count
0 0 1 petal_length_(cm) 2.45 29.16 100
1 0 2 None NaN NaN 50
2 0 2 None NaN NaN 50
----
RMSE on train set: 0.15685662242953116
Upvotes: 4