Revert ntree limit fix (#6616) (#6622)

The old (pre-fix) best_ntree_limit ignores the num_class parameter, which is incorrect. Previously we worked around this in the C++ layer to avoid possible breaking changes in other language bindings, but the Python interpretation stayed incorrect. PR #6616 fixed the Python side to take num_class into account, but did not remove the old workaround, so the tree calculation in the predictor became incorrect; see PredictBatch in CPUPredictor.
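To make the double counting concrete, here is a minimal sketch of the two conventions; the parameter values are hypothetical, and only the formulas come from this change:

    # Hypothetical values for illustration only.
    num_class = 3          # classes in a multiclass objective
    num_parallel_tree = 4  # trees per class per boosting round (a forest)
    best_iteration = 9     # zero-based index of the best boosting round

    # Convention restored by this revert: best_ntree_limit counts boosting
    # rounds scaled by the forest size; the C++ predictor multiplies in
    # num_class itself.
    best_ntree_limit = (best_iteration + 1) * num_parallel_tree  # 40

    # Convention from #6616: count every individual tree, including the
    # per-class copies. With the C++ workaround still in place, num_class
    # ended up applied twice, which broke prediction.
    fixed_ntree_limit = (best_iteration + 1) * num_parallel_tree * num_class  # 120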
Author: Jiaming Yuan (committed by GitHub)
Date: 2021-01-20 04:20:07 +08:00
Parent: a018028471
Commit: d3ec116322
4 changed files with 26 additions and 14 deletions


@@ -142,9 +142,7 @@ def _train_internal(params, dtrain,
         )
     else:
         raise ValueError(f'Unknown booster: {booster}')
-    num_groups = int(config['learner']['learner_model_param']['num_class'])
-    num_groups = 1 if num_groups == 0 else num_groups
-    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups
+    bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree

     # Copy to serialise and unserialise booster to reset state and free
     # training memory
@@ -184,9 +182,10 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         If there's more than one metric in the **eval_metric** parameter given in
         **params**, the last metric will be used for early stopping.
         If early stopping occurs, the model will have three additional fields:
-        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
-        (Use ``bst.best_ntree_limit`` to get the correct value if
-        ``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
+        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. Use
+        ``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
+        ``num_class`` appears in the parameters. ``best_ntree_limit`` is the result of
+        ``num_parallel_tree * best_iteration``.
     evals_result: dict
         This dictionary stores the evaluation results of all the items in watchlist.
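A short usage sketch of the documented fields (the synthetic data and parameter values here are assumptions, not part of this change):

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    X, y = rng.randn(256, 8), rng.randint(0, 3, size=256)
    dtrain = xgb.DMatrix(X[:200], y[:200])
    dvalid = xgb.DMatrix(X[200:], y[200:])

    bst = xgb.train(
        {
            "objective": "multi:softprob",
            "num_class": 3,
            "num_parallel_tree": 4,
            "tree_method": "hist",
        },
        dtrain,
        num_boost_round=32,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=4,
    )
    # With the revert, the limit is boosting layers times the forest size.
    assert bst.best_ntree_limit == (bst.best_iteration + 1) * 4
    preds = bst.predict(dvalid, ntree_limit=bst.best_ntree_limit)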


@ -33,9 +33,15 @@ def run_predict_leaf(predictor):
y = rng.randint(low=0, high=classes, size=rows) y = rng.randint(low=0, high=classes, size=rows)
m = xgb.DMatrix(X, y) m = xgb.DMatrix(X, y)
booster = xgb.train( booster = xgb.train(
{'num_parallel_tree': num_parallel_tree, 'num_class': classes, {
'predictor': predictor, 'tree_method': 'hist'}, m, "num_parallel_tree": num_parallel_tree,
num_boost_round=num_boost_round) "num_class": classes,
"predictor": predictor,
"tree_method": "hist",
},
m,
num_boost_round=num_boost_round,
)
empty = xgb.DMatrix(np.ones(shape=(0, cols))) empty = xgb.DMatrix(np.ones(shape=(0, cols)))
empty_leaf = booster.predict(empty, pred_leaf=True) empty_leaf = booster.predict(empty, pred_leaf=True)
@@ -52,12 +58,19 @@ def run_predict_leaf(predictor):
             end = classes * num_parallel_tree * (j + 1)
             layer = row[start: end]
             for c in range(classes):
-                tree_group = layer[c * num_parallel_tree:
-                                   (c+1) * num_parallel_tree]
+                tree_group = layer[c * num_parallel_tree: (c + 1) * num_parallel_tree]
                 assert tree_group.shape[0] == num_parallel_tree
                 # no subsampling so tree in same forest should output same
                 # leaf.
                 assert np.all(tree_group == tree_group[0])

+    ntree_limit = 2
+    sliced = booster.predict(
+        m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit
+    )
+    first = sliced[0, ...]
+    assert first.shape[0] == classes * num_parallel_tree * ntree_limit
+
     return leaf
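For reference, a sketch of the leaf-matrix layout the new assertions rely on (the values here are assumed; only the arithmetic comes from the test):

    # Each row of a pred_leaf output has one leaf index per tree, grouped as
    # boosting layers x classes x parallel trees.
    classes, num_parallel_tree, num_boost_round = 3, 4, 10

    full_width = num_boost_round * classes * num_parallel_tree  # 120 entries
    # Passing ntree_limit = k * num_parallel_tree keeps the first k boosting
    # layers, matching the assertion above:
    k = 2
    sliced_width = classes * num_parallel_tree * k              # 24 entries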


@ -123,13 +123,13 @@ class TestTrainingContinuation:
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
num_boost_round=7) num_boost_round=7)
assert gbdt_05.best_ntree_limit == ( assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5 gbdt_05.best_iteration + 1) * self.num_parallel_tree
gbdt_05 = xgb.train(xgb_params_03, gbdt_05 = xgb.train(xgb_params_03,
dtrain_5class, dtrain_5class,
num_boost_round=3, num_boost_round=3,
xgb_model=gbdt_05) xgb_model=gbdt_05)
assert gbdt_05.best_ntree_limit == ( assert gbdt_05.best_ntree_limit == (
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5 gbdt_05.best_iteration + 1) * self.num_parallel_tree
res1 = gbdt_05.predict(dtrain_5class) res1 = gbdt_05.predict(dtrain_5class)
res2 = gbdt_05.predict(dtrain_5class, res2 = gbdt_05.predict(dtrain_5class,


@@ -92,7 +92,7 @@ def test_best_ntree_limit():
         )
         if forest:
-            assert cls.best_ntree_limit == rounds * forest * cls.n_classes_
+            assert cls.best_ntree_limit == rounds * forest
         else:
             assert cls.best_ntree_limit == 0
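For context, a sketch of how this expectation arises with the sklearn wrapper; the dataset, parameter values, and variable names are assumptions for illustration, not taken from the diff:

    import xgboost as xgb
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=256, n_features=16,
                               n_informative=8, n_classes=4)
    rounds, forest = 4, 3
    cls = xgb.XGBClassifier(n_estimators=rounds, num_parallel_tree=forest,
                            tree_method="hist")
    cls.fit(X, y, eval_set=[(X, y)], early_stopping_rounds=rounds)
    # After the revert the limit no longer multiplies in n_classes_:
    # it is boosting rounds times the forest size only (assuming no
    # early stop triggers when evaluating on the training data).
    assert cls.best_ntree_limit == rounds * forest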