Deprecate LabelEncoder in XGBClassifier; Enable cuDF/cuPy inputs in XGBClassifier (#6269)
* Deprecate LabelEncoder in XGBClassifier; skip LabelEncoder for cuDF/cuPy inputs
* Add unit tests for cuDF and cuPy inputs with XGBClassifier
* Fix lint
* Clarify warning
* Move use_label_encoder option to XGBClassifier constructor
* Add a test for cudf.Series
* Add use_label_encoder to XGBRFClassifier doc
* Address reviewer feedback
This commit is contained in:
parent bcfab4d726
commit c8ec62103a
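Taken together, the change steers user code toward pre-encoded integer labels. A minimal sketch of the encouraged usage (assumes a build containing this commit; the random data is illustrative):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = rng.randint(0, 3, size=100)  # labels already encoded as 0, 1, 2

# Opting out of the deprecated scikit-learn LabelEncoder silences the new
# warning; labels must then already be consecutive integers starting at 0.
clf = xgb.XGBClassifier(use_label_encoder=False)
clf.fit(X, y)
print(clf.predict(X[:5]))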
python-package/xgboost/sklearn.py
@@ -7,6 +7,7 @@ import json
 import numpy as np
 from .core import Booster, DMatrix, XGBoostError
 from .training import train
+from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array

 # Do not use class names on scikit-learn directly. Re-define the classes on
 # .compat to guarantee the behavior without scikit-learn
@@ -345,7 +346,7 @@ class XGBModel(XGBModelBase):
         params = self.get_params()
         # Parameters that should not go into native learner.
         wrapper_specific = {
-            'importance_type', 'kwargs', 'missing', 'n_estimators'}
+            'importance_type', 'kwargs', 'missing', 'n_estimators', 'use_label_encoder'}
         filtered = dict()
         for k, v in params.items():
             if k not in wrapper_specific:
@@ -430,6 +431,9 @@ class XGBModel(XGBModelBase):
             if k == 'classes_':
                 self.classes_ = np.array(v)
                 continue
+            if k == 'use_label_encoder':
+                self.use_label_encoder = bool(v)
+                continue
             if k == 'type' and type(self).__name__ != v:
                 msg = 'Current model type: {}, '.format(type(self).__name__) + \
                       'type of model in file: {}'.format(v)
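A consequence of the load path above is that the flag survives a save/load round trip. A hedged sketch (it assumes, as this hunk implies, that use_label_encoder is written into the model's scikit-learn attribute block on save):

import os
import tempfile

import numpy as np
import xgboost as xgb

X = np.random.randn(30, 3)
y = np.random.randint(0, 2, size=30)
clf = xgb.XGBClassifier(use_label_encoder=False).fit(X, y)

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'clf.json')
    clf.save_model(path)
    loaded = xgb.XGBClassifier()
    loaded.load_model(path)
    print(loaded.use_label_encoder)  # expected: False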
@@ -763,21 +767,53 @@ class XGBModel(XGBModelBase):
     ['model', 'objective'], extra_parameters='''
     n_estimators : int
         Number of boosting rounds.
+    use_label_encoder : bool
+        (Deprecated) Use the label encoder from scikit-learn to encode the labels. For new code,
+        we recommend that you set this parameter to False.
 ''')
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
-    def __init__(self, objective="binary:logistic", **kwargs):
+    def __init__(self, objective="binary:logistic", use_label_encoder=True, **kwargs):
+        self.use_label_encoder = use_label_encoder
         super().__init__(objective=objective, **kwargs)

     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
             sample_weight_eval_set=None, feature_weights=None, callbacks=None):
-        # pylint: disable = attribute-defined-outside-init,arguments-differ
+        # pylint: disable = attribute-defined-outside-init,arguments-differ,too-many-statements
+
+        can_use_label_encoder = True
+        label_encoding_check_error = (
+            'The label must consist of integer labels of form 0, 1, 2, ..., [num_class - 1].')
+        label_encoder_deprecation_msg = (
+            'The use of label encoder in XGBClassifier is deprecated and will be ' +
+            'removed in a future release. To remove this warning, do the ' +
+            'following: 1) Pass option use_label_encoder=False when constructing ' +
+            'XGBClassifier object; and 2) Encode your labels (y) as integers ' +
+            'starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].')
+
         evals_result = {}
-        self.classes_ = np.unique(y)
-        self.n_classes_ = len(self.classes_)
+        if _is_cudf_df(y) or _is_cudf_ser(y):
+            import cupy as cp  # pylint: disable=E0401
+            self.classes_ = cp.unique(y.values)
+            self.n_classes_ = len(self.classes_)
+            can_use_label_encoder = False
+            if not cp.array_equal(self.classes_, cp.arange(self.n_classes_)):
+                raise ValueError(label_encoding_check_error)
+        elif _is_cupy_array(y):
+            import cupy as cp  # pylint: disable=E0401
+            self.classes_ = cp.unique(y)
+            self.n_classes_ = len(self.classes_)
+            can_use_label_encoder = False
+            if not cp.array_equal(self.classes_, cp.arange(self.n_classes_)):
+                raise ValueError(label_encoding_check_error)
+        else:
+            self.classes_ = np.unique(y)
+            self.n_classes_ = len(self.classes_)
+            if not self.use_label_encoder and (
+                    not np.array_equal(self.classes_, np.arange(self.n_classes_))):
+                raise ValueError(label_encoding_check_error)

         xgb_options = self.get_xgb_params()

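The validation added above is strict: whenever the encoder is skipped (use_label_encoder=False, or any cuDF/cuPy label input), y must already consist of the integers 0..num_class-1. A sketch of both outcomes:

import numpy as np
import xgboost as xgb

X = np.random.randn(20, 4)

# Accepted: consecutive integer labels starting at 0.
xgb.XGBClassifier(use_label_encoder=False).fit(X, np.array([0, 1] * 10))

# Rejected: {1, 2} is no longer silently re-encoded; fit() raises the
# label_encoding_check_error ValueError defined above.
try:
    xgb.XGBClassifier(use_label_encoder=False).fit(X, np.array([1, 2] * 10))
except ValueError as err:
    print(err)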
@@ -801,8 +837,18 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         else:
             xgb_options.update({"eval_metric": eval_metric})

-        self._le = XGBoostLabelEncoder().fit(y)
-        training_labels = self._le.transform(y)
+        if self.use_label_encoder:
+            if not can_use_label_encoder:
+                raise ValueError('The option use_label_encoder=True is incompatible with inputs ' +
+                                 'of type cuDF or cuPy. Please set use_label_encoder=False when ' +
+                                 'constructing XGBClassifier object. NOTE: ' +
+                                 label_encoder_deprecation_msg)
+            warnings.warn(label_encoder_deprecation_msg, UserWarning)
+            self._le = XGBoostLabelEncoder().fit(y)
+            label_transform = self._le.transform
+        else:
+            label_transform = (lambda x: x)
+        training_labels = label_transform(y)

         if eval_set is not None:
             if sample_weight_eval_set is None:
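With the default use_label_encoder=True, the old behavior is preserved but now emits the deprecation warning, and non-integer labels still work through the encoder. A sketch of observing it:

import warnings

import numpy as np
import xgboost as xgb

X = np.random.randn(20, 4)
y = np.array(['ham', 'spam'] * 10)  # string labels still go through the encoder

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    xgb.XGBClassifier().fit(X, y)  # use_label_encoder defaults to True
print([str(w.message) for w in caught if 'label encoder' in str(w.message)])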
@@ -811,7 +857,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             assert len(sample_weight_eval_set) == len(eval_set)
             evals = list(
                 DMatrix(eval_set[i][0],
-                        label=self._le.transform(eval_set[i][1]),
+                        label=label_transform(eval_set[i][1]),
                         missing=self.missing, weight=sample_weight_eval_set[i],
                         nthread=self.n_jobs)
                 for i in range(len(eval_set))
@@ -919,9 +965,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):

         if hasattr(self, '_le'):
             return self._le.inverse_transform(column_indexes)
-        warnings.warn(
-            'Label encoder is not defined. Returning class probability.')
-        return class_probs
+        return column_indexes

     def predict_proba(self, data, ntree_limit=None, validate_features=False,
                       base_margin=None):
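When no encoder was ever fitted (for example, a Booster trained with xgb.train and loaded into the wrapper), predict() now returns the raw 0-based class indexes instead of warning and falling back to probabilities. A sketch mirroring the test change further down:

import os
import tempfile

import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits

digits = load_digits(n_class=2)
X, y = digits['data'], digits['target']
booster = xgb.train({'objective': 'binary:logistic'},
                    xgb.DMatrix(X, label=y), num_boost_round=4)

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, 'digits.model.json')
    booster.save_model(path)
    cls = xgb.XGBClassifier()
    cls.load_model(path)
    print(np.unique(cls.predict(X)))  # class indexes, e.g. [0 1]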
@@ -1012,6 +1056,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
     extra_parameters='''
     n_estimators : int
         Number of trees in random forest to fit.
+    use_label_encoder : bool
+        (Deprecated) Use the label encoder from scikit-learn to encode the labels. For new code,
+        we recommend that you set this parameter to False.
 ''')
 class XGBRFClassifier(XGBClassifier):
     # pylint: disable=missing-docstring
@@ -1020,11 +1067,13 @@ class XGBRFClassifier(XGBClassifier):
                  subsample=0.8,
                  colsample_bynode=0.8,
                  reg_lambda=1e-5,
+                 use_label_encoder=True,
                  **kwargs):
         super().__init__(learning_rate=learning_rate,
                          subsample=subsample,
                          colsample_bynode=colsample_bynode,
                          reg_lambda=reg_lambda,
+                         use_label_encoder=use_label_encoder,
                          **kwargs)

     def get_xgb_params(self):
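Because XGBRFClassifier forwards the option to XGBClassifier, the same opt-out applies to the random-forest wrapper (illustrative data):

import numpy as np
import xgboost as xgb

X = np.random.randn(50, 4)
y = np.random.randint(0, 2, size=50)
rf = xgb.XGBRFClassifier(n_estimators=10, use_label_encoder=False).fit(X, y)
print(rf.predict(X[:5]))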
tests/python-gpu/test_from_cudf.py
@@ -172,6 +172,34 @@ Arrow specification.'''
     _test_cudf_metainfo(xgb.DeviceQuantileDMatrix)


+@pytest.mark.skipif(**tm.no_cudf())
+@pytest.mark.skipif(**tm.no_cupy())
+@pytest.mark.skipif(**tm.no_sklearn())
+@pytest.mark.skipif(**tm.no_pandas())
+def test_cudf_training_with_sklearn():
+    from cudf import DataFrame as df
+    from cudf import Series as ss
+    import pandas as pd
+    np.random.seed(1)
+    X = pd.DataFrame(np.random.randn(50, 10))
+    y = pd.DataFrame((np.random.randn(50) > 0).astype(np.int8))
+    weights = np.random.random(50) + 1.0
+    cudf_weights = df.from_pandas(pd.DataFrame(weights))
+    base_margin = np.random.random(50)
+    cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
+
+    X_cudf = df.from_pandas(X)
+    y_cudf = df.from_pandas(y)
+    y_cudf_series = ss(data=y.iloc[:, 0])
+
+    for y_obj in [y_cudf, y_cudf_series]:
+        clf = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist', use_label_encoder=False)
+        clf.fit(X_cudf, y_obj, sample_weight=cudf_weights, base_margin=cudf_base_margin,
+                eval_set=[(X_cudf, y_obj)])
+        pred = clf.predict(X_cudf)
+        assert np.array_equal(np.unique(pred), np.array([0, 1]))
+
+
 class IterForDMatrixTest(xgb.core.DataIter):
     '''A data iterator for XGBoost DMatrix.

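Outside the test suite, the new code path lets cuDF objects feed the wrapper directly. A sketch assuming a CUDA machine with cudf installed (values illustrative):

import cudf
import xgboost as xgb

X = cudf.DataFrame({'a': [0.1, 0.2, 0.3, 0.4] * 10,
                    'b': [1.0, 0.5, 0.25, 0.125] * 10})
y = cudf.Series([0, 1, 0, 1] * 10)  # must already be 0-based integer classes

clf = xgb.XGBClassifier(tree_method='gpu_hist', use_label_encoder=False)
clf.fit(X, y)
print(clf.predict(X)[:5])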
tests/python-gpu/test_from_cupy.py
@@ -108,6 +108,25 @@ def _test_cupy_metainfo(DMatrixT):
         dmat_cupy.get_uint_info('group_ptr'))


+@pytest.mark.skipif(**tm.no_cupy())
+@pytest.mark.skipif(**tm.no_sklearn())
+def test_cupy_training_with_sklearn():
+    import cupy as cp
+    np.random.seed(1)
+    cp.random.seed(1)
+    X = cp.random.randn(50, 10, dtype='float32')
+    y = (cp.random.randn(50, dtype='float32') > 0).astype('int8')
+    weights = np.random.random(50) + 1
+    cupy_weights = cp.array(weights)
+    base_margin = np.random.random(50)
+    cupy_base_margin = cp.array(base_margin)
+
+    clf = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist', use_label_encoder=False)
+    clf.fit(X, y, sample_weight=cupy_weights, base_margin=cupy_base_margin, eval_set=[(X, y)])
+    pred = clf.predict(X)
+    assert np.array_equal(np.unique(pred), np.array([0, 1]))
+
+
 class TestFromCupy:
     '''Tests for constructing DMatrix from data structure conforming Apache
 Arrow specification.'''
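The equivalent with a raw cuPy array, under the same CUDA assumptions:

import cupy as cp
import xgboost as xgb

X = cp.random.randn(100, 4, dtype='float32')
y = (cp.random.randn(100, dtype='float32') > 0).astype('int8')

clf = xgb.XGBClassifier(tree_method='gpu_hist', use_label_encoder=False)
clf.fit(X, y)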
tests/python/test_with_sklearn.py
@@ -706,19 +706,17 @@ def save_load_model(model_path):
     from sklearn.datasets import load_digits
     from sklearn.model_selection import KFold

-    digits = load_digits(2)
+    digits = load_digits(n_class=2)
     y = digits['target']
     X = digits['data']
     kf = KFold(n_splits=2, shuffle=True, random_state=rng)
     for train_index, test_index in kf.split(X, y):
-        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
+        xgb_model = xgb.XGBClassifier(use_label_encoder=False).fit(X[train_index], y[train_index])
         xgb_model.save_model(model_path)
-        xgb_model = xgb.XGBClassifier()
+        xgb_model = xgb.XGBClassifier(use_label_encoder=False)
         xgb_model.load_model(model_path)
         assert isinstance(xgb_model.classes_, np.ndarray)
         assert isinstance(xgb_model._Booster, xgb.Booster)
-        assert isinstance(xgb_model._le, XGBoostLabelEncoder)
-        assert isinstance(xgb_model._le.classes_, np.ndarray)
         preds = xgb_model.predict(X[test_index])
         labels = y[test_index]
         err = sum(1 for i in range(len(preds))
@@ -750,7 +748,7 @@ def test_save_load_model():
     from sklearn.datasets import load_digits
     with TemporaryDirectory() as tempdir:
         model_path = os.path.join(tempdir, 'digits.model.json')
-        digits = load_digits(2)
+        digits = load_digits(n_class=2)
         y = digits['target']
         X = digits['data']
         booster = xgb.train({'tree_method': 'hist',
@@ -761,7 +759,7 @@ def test_save_load_model():
         booster.save_model(model_path)
         cls = xgb.XGBClassifier()
         cls.load_model(model_path)
-        predt_1 = cls.predict(X)
+        predt_1 = cls.predict_proba(X)[:, 1]
         assert np.allclose(predt_0, predt_1)

         cls = xgb.XGBModel()
@@ -778,7 +776,7 @@ def test_RFECV():

     # Regression
     X, y = load_boston(return_X_y=True)
-    bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
+    bst = xgb.XGBRegressor(booster='gblinear', learning_rate=0.1,
                            n_estimators=10,
                            objective='reg:squarederror',
                            random_state=0, verbosity=0)
@@ -791,7 +789,7 @@ def test_RFECV():
     bst = xgb.XGBClassifier(booster='gblinear', learning_rate=0.1,
                             n_estimators=10,
                             objective='binary:logistic',
-                            random_state=0, verbosity=0)
+                            random_state=0, verbosity=0, use_label_encoder=False)
     rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='roc_auc')
     rfecv.fit(X, y)

@@ -802,7 +800,7 @@ def test_RFECV():
                             n_estimators=10,
                             objective='multi:softprob',
                             random_state=0, reg_alpha=0.001, reg_lambda=0.01,
-                            scale_pos_weight=0.5, verbosity=0)
+                            scale_pos_weight=0.5, verbosity=0, use_label_encoder=False)
     rfecv = RFECV(estimator=bst, step=1, cv=3, scoring='neg_log_loss')
     rfecv.fit(X, y)

@@ -811,7 +809,7 @@ def test_RFECV():
     rfecv = RFECV(estimator=reg)
     rfecv.fit(X, y)

-    cls = xgb.XGBClassifier()
+    cls = xgb.XGBClassifier(use_label_encoder=False)
     rfecv = RFECV(estimator=cls, step=1, cv=3,
                   scoring='neg_mean_squared_error')
     rfecv.fit(X, y)