diff --git a/demo/guide-python/boost_from_prediction.py b/demo/guide-python/boost_from_prediction.py
index 021aa2231..948b47a9f 100755
--- a/demo/guide-python/boost_from_prediction.py
+++ b/demo/guide-python/boost_from_prediction.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-import numpy as np
 import xgboost as xgb
 
 dtrain = xgb.DMatrix('../data/agaricus.txt.train')
@@ -8,18 +7,19 @@ watchlist = [(dtest, 'eval'), (dtrain, 'train')]
 ###
 # advanced: start from a initial base prediction
 #
-print ('start running example to start from a initial prediction')
+print('start running example to start from an initial prediction')
 # specify parameters via map, definition are same as c++ version
-param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
+param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
 # train xgboost for 1 round
 bst = xgb.train(param, dtrain, 1, watchlist)
 
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
+# Note: we need the margin value instead of transformed prediction in
+# set_base_margin
+# do predict with output_margin=True, will always give you margin values
+# before logistic transformation
 ptrain = bst.predict(dtrain, output_margin=True)
 ptest = bst.predict(dtest, output_margin=True)
 dtrain.set_base_margin(ptrain)
 dtest.set_base_margin(ptest)
-
 print('this is result of running from initial prediction')
-bst = xgb.train(param, dtrain, 1, watchlist)
\ No newline at end of file
+bst = xgb.train(param, dtrain, 1, watchlist)
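Note on the demo above: for `binary:logistic`, the values fed to `set_base_margin` must be the raw scores taken *before* the sigmoid, which is exactly what `output_margin=True` returns. A minimal sketch of that relationship, using only NumPy with illustrative values:

```python
import numpy as np

# predict(..., output_margin=True) returns raw scores x for binary:logistic;
# plain predict() reports sigmoid(x). Passing probabilities to
# set_base_margin instead of margins would skew every later boosting round.
margin = np.array([-1.5, 0.0, 2.0])                      # illustrative raw scores
prob = 1.0 / (1.0 + np.exp(-margin))                     # the transformed prediction
assert np.allclose(margin, np.log(prob / (1.0 - prob)))  # logit inverts the sigmoid
```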
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 4406fef1f..99d09c333 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -434,9 +434,11 @@ class DMatrix(object):
     _feature_names = None  # for previous version's pickle
     _feature_types = None
 
-    def __init__(self, data, label=None, missing=None,
-                 weight=None, silent=False,
-                 feature_names=None, feature_types=None,
+    def __init__(self, data, label=None, weight=None, base_margin=None,
+                 missing=None,
+                 silent=False,
+                 feature_names=None,
+                 feature_types=None,
                  nthread=None):
         """Parameters
         ----------
@@ -492,6 +494,7 @@
         label = _maybe_pandas_label(label)
         label = _maybe_dt_array(label)
         weight = _maybe_dt_array(weight)
+        base_margin = _maybe_dt_array(base_margin)
 
         if isinstance(data, (STRING_TYPES, os_PathLike)):
             handle = ctypes.c_void_p()
@@ -518,19 +521,11 @@
                             ' {}'.format(type(data).__name__))
 
         if label is not None:
-            if isinstance(label, np.ndarray):
-                self.set_label_npy2d(label)
-            elif _use_columnar_initializer(label):
-                self.set_interface_info('label', label)
-            else:
-                self.set_label(label)
+            self.set_label(label)
         if weight is not None:
-            if isinstance(weight, np.ndarray):
-                self.set_weight_npy2d(weight)
-            elif _use_columnar_initializer(label):
-                self.set_interface_info('weight', weight)
-            else:
-                self.set_weight(weight)
+            self.set_weight(weight)
+        if base_margin is not None:
+            self.set_base_margin(base_margin)
 
         self.feature_names = feature_names
         self.feature_types = feature_types
@@ -792,7 +787,12 @@
         label: array like
             The label information to be set into DMatrix
         """
-        self.set_float_info('label', label)
+        if isinstance(label, np.ndarray):
+            self.set_label_npy2d(label)
+        elif _use_columnar_initializer(label):
+            self.set_interface_info('label', label)
+        else:
+            self.set_float_info('label', label)
 
     def set_label_npy2d(self, label):
         """Set label of dmatrix
@@ -820,7 +820,12 @@
         data points within each group, so it doesn't make sense to assign
         weights to individual data points.
         """
-        self.set_float_info('weight', weight)
+        if isinstance(weight, np.ndarray):
+            self.set_weight_npy2d(weight)
+        elif _use_columnar_initializer(weight):
+            self.set_interface_info('weight', weight)
+        else:
+            self.set_float_info('weight', weight)
 
     def set_weight_npy2d(self, weight):
         """ Set weight of each instance
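With the `core.py` changes above, label, weight, and base margin can all be attached when the `DMatrix` is constructed, and `set_label`/`set_weight` now do the ndarray/columnar dispatch themselves. A minimal sketch of the two equivalent spellings (synthetic arrays; shapes and values are illustrative):

```python
import numpy as np
import xgboost as xgb

X = np.random.randn(8, 3)
y = np.array([0, 1, 0, 1, 1, 0, 0, 1], dtype=np.float32)
w = np.ones(8, dtype=np.float32)    # one weight per row
m = np.zeros(8, dtype=np.float32)   # one margin per row (binary objective)

# One-shot form introduced by this patch:
dtrain = xgb.DMatrix(X, label=y, weight=w, base_margin=m)

# Equivalent setter form; the constructor now simply delegates to these:
dtrain2 = xgb.DMatrix(X)
dtrain2.set_label(y)
dtrain2.set_weight(w)
dtrain2.set_base_margin(m)
```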
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 39ad73415..943dc051c 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -288,9 +288,9 @@ class XGBModel(XGBModelBase):
         self._Booster = Booster({'n_jobs': self.n_jobs})
         self._Booster.load_model(fname)
 
-    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
-            early_stopping_rounds=None, verbose=True, xgb_model=None,
-            sample_weight_eval_set=None, callbacks=None):
+    def fit(self, X, y, sample_weight=None, base_margin=None,
+            eval_set=None, eval_metric=None, early_stopping_rounds=None,
+            verbose=True, xgb_model=None, sample_weight_eval_set=None, callbacks=None):
         # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
         """Fit gradient boosting model
 
@@ -302,6 +302,8 @@
             Labels
         sample_weight : array_like
             instance weights
+        base_margin : array_like
+            Global bias for each instance.
         eval_set : list, optional
             A list of (X, y) tuple pairs to use as validation sets, for which
             metrics will be computed.
@@ -346,14 +348,10 @@
 
                 [xgb.callback.reset_learning_rate(custom_rates)]
         """
-        if sample_weight is not None:
-            trainDmatrix = DMatrix(X, label=y, weight=sample_weight,
-                                   missing=self.missing,
-                                   nthread=self.n_jobs)
-        else:
-            trainDmatrix = DMatrix(X, label=y, missing=self.missing,
-                                   nthread=self.n_jobs)
-
+        trainDmatrix = DMatrix(data=X, label=y, weight=sample_weight,
+                               base_margin=base_margin,
+                               missing=self.missing,
+                               nthread=self.n_jobs)
         evals_result = {}
 
         if eval_set is not None:
@@ -404,7 +402,8 @@
             self.best_ntree_limit = self._Booster.best_ntree_limit
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=None, validate_features=True):
+    def predict(self, data, output_margin=False, ntree_limit=None,
+                validate_features=True, base_margin=None):
         """
         Predict with `data`.
 
@@ -442,7 +441,8 @@
         prediction : numpy array
         """
         # pylint: disable=missing-docstring,invalid-name
-        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
+        test_dmatrix = DMatrix(data, base_margin=base_margin,
+                               missing=self.missing, nthread=self.n_jobs)
         # get ntree_limit to use - if none specified, default to
         # best_ntree_limit if defined, otherwise 0.
         if ntree_limit is None:
@@ -621,7 +621,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                               base_score=base_score, random_state=random_state,
                               missing=missing, **kwargs)
 
-    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+    def fit(self, X, y, sample_weight=None, base_margin=None,
+            eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
             sample_weight_eval_set=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
@@ -675,13 +676,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             raise ValueError(
                 'Please reshape the input data X into 2-dimensional matrix.')
         self._features_count = X.shape[1]
-
-        if sample_weight is not None:
-            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
-                                    missing=self.missing, nthread=self.n_jobs)
-        else:
-            train_dmatrix = DMatrix(X, label=training_labels,
-                                    missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                base_margin=base_margin,
+                                missing=self.missing, nthread=self.n_jobs)
         self._Booster = train(xgb_options, train_dmatrix,
                               self.get_num_boosting_rounds(),
                               evals=evals, early_stopping_rounds=early_stopping_rounds,
@@ -706,7 +703,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
     fit.__doc__ = XGBModel.fit.__doc__.replace('Fit gradient boosting model',
                                                'Fit gradient boosting classifier', 1)
 
-    def predict(self, data, output_margin=False, ntree_limit=None, validate_features=True):
+    def predict(self, data, output_margin=False, ntree_limit=None,
+                validate_features=True, base_margin=None):
         """
         Predict with `data`.
 
@@ -729,7 +727,7 @@
 
         Parameters
         ----------
-        data : DMatrix
+        data : array_like
             The dmatrix storing the input.
         output_margin : bool
             Whether to output the raw untransformed margin value.
@@ -743,7 +741,8 @@
         -------
         prediction : numpy array
         """
-        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
+        test_dmatrix = DMatrix(data, base_margin=base_margin,
+                               missing=self.missing, nthread=self.n_jobs)
         if ntree_limit is None:
             ntree_limit = getattr(self, "best_ntree_limit", 0)
         class_probs = self.get_booster().predict(test_dmatrix,
@@ -761,7 +760,8 @@
             column_indexes[class_probs > 0.5] = 1
         return self._le.inverse_transform(column_indexes)
 
-    def predict_proba(self, data, ntree_limit=None, validate_features=True):
+    def predict_proba(self, data, ntree_limit=None, validate_features=True,
+                      base_margin=None):
         """
         Predict the probability of each `data` example being of a given class.
 
@@ -787,7 +787,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         prediction : numpy array
             a numpy array with the probability of each data example being of a given class.
         """
-        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
+        test_dmatrix = DMatrix(data, base_margin=base_margin,
+                               missing=self.missing, nthread=self.n_jobs)
         if ntree_limit is None:
             ntree_limit = getattr(self, "best_ntree_limit", 0)
         class_probs = self.get_booster().predict(test_dmatrix,
@@ -1045,7 +1046,8 @@
         if "rank:" not in self.objective:
             raise ValueError("please use XGBRanker for ranking task")
 
-    def fit(self, X, y, group, sample_weight=None, eval_set=None,
+    def fit(self, X, y, group, sample_weight=None, base_margin=None,
+            eval_set=None,
             sample_weight_eval_set=None, eval_group=None, eval_metric=None,
             early_stopping_rounds=None, verbose=False, xgb_model=None,
             callbacks=None):
@@ -1072,6 +1074,8 @@
             data points within each group, so it doesn't make sense to assign
             weights to individual data points.
 
+        base_margin : array_like
+            Global bias for each instance.
         eval_set : list, optional
             A list of (X, y) tuple pairs to use as validation sets, for which
             metrics will be computed.
@@ -1138,14 +1142,10 @@
                 ret.set_group(group)
                 return ret
 
-        if sample_weight is not None:
-            train_dmatrix = _dmat_init(
-                group, data=X, label=y, weight=sample_weight,
-                missing=self.missing, nthread=self.n_jobs)
-        else:
-            train_dmatrix = _dmat_init(
-                group, data=X, label=y,
-                missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
+                                base_margin=base_margin,
+                                missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_group(group)
 
         evals_result = {}
 
@@ -1192,9 +1192,11 @@
 
         return self
 
-    def predict(self, data, output_margin=False, ntree_limit=0, validate_features=True):
+    def predict(self, data, output_margin=False,
+                ntree_limit=0, validate_features=True, base_margin=None):
 
-        test_dmatrix = DMatrix(data, missing=self.missing)
+        test_dmatrix = DMatrix(data, base_margin=base_margin,
+                               missing=self.missing)
 
         if ntree_limit is None:
             ntree_limit = getattr(self, "best_ntree_limit", 0)
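A caveat worth stating for the sklearn wrapper: the margin is a property of the data, not of the fitted model, so a model trained with `base_margin` must be given the same margin again at `predict`/`predict_proba` time. A hedged sketch of the intended round trip (synthetic data; variable names are illustrative):

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = rng.randint(0, 2, size=100)

# The first model supplies the starting margins.
first = xgb.XGBClassifier(n_estimators=4, random_state=0)
first.fit(X, y)
margin = first.predict(X, output_margin=True)

# The second model boosts on top of those margins; it does not store them,
# so they must be passed again with every prediction call.
second = xgb.XGBClassifier(n_estimators=4, random_state=0)
second.fit(X, y, base_margin=margin)
proba = second.predict_proba(X, base_margin=margin)
```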
""" - test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs) + test_dmatrix = DMatrix(data, base_margin=base_margin, + missing=self.missing, nthread=self.n_jobs) if ntree_limit is None: ntree_limit = getattr(self, "best_ntree_limit", 0) class_probs = self.get_booster().predict(test_dmatrix, @@ -1045,7 +1046,8 @@ class XGBRanker(XGBModel): if "rank:" not in self.objective: raise ValueError("please use XGBRanker for ranking task") - def fit(self, X, y, group, sample_weight=None, eval_set=None, + def fit(self, X, y, group, sample_weight=None, base_margin=None, + eval_set=None, sample_weight_eval_set=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=False, xgb_model=None, callbacks=None): @@ -1072,6 +1074,8 @@ class XGBRanker(XGBModel): data points within each group, so it doesn't make sense to assign weights to individual data points. + base_margin : array_like + Global bias for each instance. eval_set : list, optional A list of (X, y) tuple pairs to use as validation sets, for which metrics will be computed. @@ -1138,14 +1142,10 @@ class XGBRanker(XGBModel): ret.set_group(group) return ret - if sample_weight is not None: - train_dmatrix = _dmat_init( - group, data=X, label=y, weight=sample_weight, - missing=self.missing, nthread=self.n_jobs) - else: - train_dmatrix = _dmat_init( - group, data=X, label=y, - missing=self.missing, nthread=self.n_jobs) + train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight, + base_margin=base_margin, + missing=self.missing, nthread=self.n_jobs) + train_dmatrix.set_group(group) evals_result = {} @@ -1192,9 +1192,11 @@ class XGBRanker(XGBModel): return self - def predict(self, data, output_margin=False, ntree_limit=0, validate_features=True): + def predict(self, data, output_margin=False, + ntree_limit=0, validate_features=True, base_margin=None): - test_dmatrix = DMatrix(data, missing=self.missing) + test_dmatrix = DMatrix(data, base_margin=base_margin, + missing=self.missing) if ntree_limit is None: ntree_limit = getattr(self, "best_ntree_limit", 0) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index be5725a02..089b63d09 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -132,6 +132,21 @@ class TestModels(unittest.TestCase): bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay) assert isinstance(bst, xgb.core.Booster) + def test_boost_from_prediction(self): + # Re-construct dtrain here to avoid modification + margined = xgb.DMatrix(dpath + 'agaricus.txt.train') + bst = xgb.train({'tree_method': 'hist'}, margined, 1) + predt_0 = bst.predict(margined, output_margin=True) + margined.set_base_margin(predt_0) + bst = xgb.train({'tree_method': 'hist'}, margined, 1) + predt_1 = bst.predict(margined) + + assert np.any(np.abs(predt_1 - predt_0) > 1e-6) + + bst = xgb.train({'tree_method': 'hist'}, dtrain, 2) + predt_2 = bst.predict(dtrain) + assert np.all(np.abs(predt_2 - predt_1) < 1e-6) + def test_custom_objective(self): param = {'max_depth': 2, 'eta': 1, 'verbosity': 0} watchlist = [(dtest, 'eval'), (dtrain, 'train')] diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index a098a97ae..33ccecc25 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -695,3 +695,23 @@ def test_XGBClassifier_resume(): assert np.any(pred1 != pred2) assert log_loss1 > log_loss2 + + +def test_boost_from_prediction(): + from sklearn.datasets import load_breast_cancer + X, y = 
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index a098a97ae..33ccecc25 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -695,3 +695,23 @@ def test_XGBClassifier_resume():
 
     assert np.any(pred1 != pred2)
     assert log_loss1 > log_loss2
+
+
+def test_boost_from_prediction():
+    from sklearn.datasets import load_breast_cancer
+    X, y = load_breast_cancer(return_X_y=True)
+    model_0 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4)
+    model_0.fit(X=X, y=y)
+    margin = model_0.predict(X, output_margin=True)
+
+    model_1 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=4)
+    model_1.fit(X=X, y=y, base_margin=margin)
+    predictions_1 = model_1.predict(X, base_margin=margin)
+
+    cls_2 = xgb.XGBClassifier(
+        learning_rate=0.3, random_state=0, n_estimators=8)
+    cls_2.fit(X=X, y=y)
+    predictions_2 = cls_2.predict(X)
+    assert np.all(predictions_1 == predictions_2)
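The sklearn test checks the key equivalence: four rounds boosted on top of a four-round model's margin should match eight rounds trained in one go, so `cls_2` is evaluated with a plain `predict(X)` (passing the margin there again would double-count the first model's contribution). Since `XGBRanker` gains the same parameter, the ranking path can be exercised analogously. A hedged sketch (synthetic relevance data; group sizes and names are illustrative):

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.randn(20, 4)
y = rng.randint(0, 3, size=20)   # graded relevance labels
group = [10, 10]                 # two queries with 10 documents each

ranker = xgb.XGBRanker(n_estimators=4, random_state=0)
margin = rng.randn(20).astype(np.float32)  # e.g. scores from an earlier model
ranker.fit(X, y, group, base_margin=margin)
scores = ranker.predict(X, base_margin=margin)
```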