[breaking] Add prediction function for DMatrix and use inplace predict for dask. (#6668)

* Add a new API function for predicting on `DMatrix`.  This function aligns
with the rest of the `XGBoosterPredictFrom*` functions in the semantics of its
function arguments.
* Purge `ntree_limit` from libxgboost, use iteration instead.
* [dask] Use `inplace_predict` by default for dask sklearn models.
* [dask] Run prediction shape inference on worker instead of client.

The breaking change is in the Python sklearn `apply` function: it is now
consistent with the other prediction functions, where `best_iteration` is used
by default.
This commit is contained in:
Jiaming Yuan
2021-02-08 18:26:32 +08:00
committed by GitHub
parent dbb5208a0a
commit 4656b09d5d
29 changed files with 1134 additions and 604 deletions

View File

@@ -434,7 +434,13 @@ class TestModels:
booster[...:end] = booster
sliced_0 = booster[1:3]
np.testing.assert_allclose(
booster.predict(dtrain, iteration_range=(1, 3)), sliced_0.predict(dtrain)
)
sliced_1 = booster[3:7]
np.testing.assert_allclose(
booster.predict(dtrain, iteration_range=(3, 7)), sliced_1.predict(dtrain)
)
predt_0 = sliced_0.predict(dtrain, output_margin=True)
predt_1 = sliced_1.predict(dtrain, output_margin=True)

View File

@@ -47,30 +47,27 @@ def run_predict_leaf(predictor):
empty_leaf = booster.predict(empty, pred_leaf=True)
assert empty_leaf.shape[0] == 0
leaf = booster.predict(m, pred_leaf=True)
leaf = booster.predict(m, pred_leaf=True, strict_shape=True)
assert leaf.shape[0] == rows
assert leaf.shape[1] == classes * num_parallel_tree * num_boost_round
assert leaf.shape[1] == num_boost_round
assert leaf.shape[2] == classes
assert leaf.shape[3] == num_parallel_tree
for i in range(rows):
row = leaf[i, ...]
for j in range(num_boost_round):
start = classes * num_parallel_tree * j
end = classes * num_parallel_tree * (j + 1)
layer = row[start: end]
for c in range(classes):
tree_group = layer[c * num_parallel_tree: (c + 1) * num_parallel_tree]
for k in range(classes):
tree_group = leaf[i, j, k, :]
assert tree_group.shape[0] == num_parallel_tree
# no subsampling so tree in same forest should output same
# leaf.
# No sampling, all trees within forest are the same
assert np.all(tree_group == tree_group[0])
ntree_limit = 2
sliced = booster.predict(
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
)
first = sliced[0, ...]
assert first.shape[0] == classes * num_parallel_tree * ntree_limit
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
return leaf
@@ -78,6 +75,23 @@ def test_predict_leaf():
run_predict_leaf('cpu_predictor')
def test_predict_shape():
    """Check the dimensionality of ``Booster.predict`` output with ``strict_shape=True``.

    A single-estimator regressor is fitted on the Boston housing data.  The
    test then asserts that:

    * the plain prediction is 2-dimensional with shape ``(n_samples, 1)``;
    * the ``pred_contribs`` output is 3-dimensional and its second axis has
      length 1.
    """
    from sklearn.datasets import load_boston

    X, y = load_boston(return_X_y=True)
    reg = xgb.XGBRegressor(n_estimators=1)
    reg.fit(X, y)
    booster = reg.get_booster()

    # Normal prediction: one row per sample and a single output column.
    predt = booster.predict(xgb.DMatrix(X), strict_shape=True)
    assert predt.ndim == 2
    n_rows, n_cols = predt.shape
    assert n_rows == X.shape[0]
    assert n_cols == 1

    # Feature contributions: only the rank and the second axis are checked.
    contrib = booster.predict(
        xgb.DMatrix(X), pred_contribs=True, strict_shape=True
    )
    assert contrib.ndim == 3
    assert contrib.shape[1] == 1
class TestInplacePredict:
'''Tests for running inplace prediction'''
@classmethod
@@ -92,8 +106,7 @@ class TestInplacePredict:
dtrain = xgb.DMatrix(cls.X, cls.y)
cls.booster = xgb.train({'tree_method': 'hist'},
dtrain, num_boost_round=10)
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
cls.test = xgb.DMatrix(cls.X[:10, ...])

View File

@@ -159,12 +159,9 @@ def test_dask_predict_shape_infer(client: "Client") -> None:
assert prediction.shape[1] == 3
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
from sklearn.datasets import load_breast_cancer
X_, y_ = load_breast_cancer(return_X_y=True)
X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100)
def run_boost_from_prediction(
X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client"
) -> None:
model_0 = xgb.dask.DaskXGBClassifier(
learning_rate=0.3, random_state=0, n_estimators=4,
tree_method=tree_method)
@@ -202,6 +199,30 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
assert margined_res[i] < unmargined_res[i]
@pytest.mark.parametrize("tree_method", ["hist", "approx"])
def test_boost_from_prediction(tree_method: str, client: "Client") -> None:
    """Run the shared boost-from-prediction check on dask dataframes.

    The breast cancer dataset is loaded, features and labels are wrapped into
    dask collections with a chunk size of 100, and the actual checks are
    delegated to ``run_boost_from_prediction``.
    """
    from sklearn.datasets import load_breast_cancer

    features, labels = load_breast_cancer(return_X_y=True)
    X = dd.from_array(features, chunksize=100)
    y = dd.from_array(labels, chunksize=100)
    run_boost_from_prediction(X, y, tree_method, client)
def test_inplace_predict(client: "Client") -> None:
    """Compare dask in-place prediction with ``DaskDMatrix``-based prediction.

    Both prediction paths receive the same booster, the same features and the
    same base margin; the test asserts that their computed outputs are close.
    """
    from sklearn.datasets import load_boston

    features, labels = load_boston(return_X_y=True)
    X = dd.from_array(features, chunksize=32)
    y = dd.from_array(labels, chunksize=32)

    booster = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y).get_booster()

    # The labels are reused as the base margin for both prediction paths.
    margin = y

    predt_inplace = xgb.dask.inplace_predict(
        client, booster, X, base_margin=margin
    ).compute()

    dmatrix = xgb.dask.DaskDMatrix(client, X, base_margin=margin)
    predt_dmatrix = xgb.dask.predict(client, booster, dmatrix).compute()

    np.testing.assert_allclose(predt_inplace, predt_dmatrix)
def test_dask_missing_value_reg(client: "Client") -> None:
X_0 = np.ones((20 // 2, kCols))
X_1 = np.zeros((20 // 2, kCols))
@@ -288,10 +309,13 @@ def test_dask_regressor(model: str, client: "Client") -> None:
assert forest == 2
@pytest.mark.parametrize("model", ["boosting", "rf"])
def test_dask_classifier(model: str, client: "Client") -> None:
X, y, w = generate_array(with_weights=True)
y = (y * 10).astype(np.int32)
def run_dask_classifier(
X: xgb.dask._DaskCollection,
y: xgb.dask._DaskCollection,
w: xgb.dask._DaskCollection,
model: str,
client: "Client",
) -> None:
if model == "boosting":
classifier = xgb.dask.DaskXGBClassifier(
verbosity=1, n_estimators=2, eval_metric="merror"
@@ -306,14 +330,13 @@ def test_dask_classifier(model: str, client: "Client") -> None:
classifier.client = client
classifier.fit(X, y, sample_weight=w, eval_set=[(X, y)])
prediction = classifier.predict(X)
prediction = classifier.predict(X).compute()
assert prediction.ndim == 1
assert prediction.shape[0] == kRows
history = classifier.evals_result()
assert isinstance(prediction, da.Array)
assert isinstance(history, dict)
assert list(history.keys())[0] == "validation_0"
@@ -332,7 +355,7 @@ def test_dask_classifier(model: str, client: "Client") -> None:
assert forest == 2
# Test .predict_proba()
probas = classifier.predict_proba(X)
probas = classifier.predict_proba(X).compute()
assert classifier.n_classes_ == 10
assert probas.ndim == 2
assert probas.shape[0] == kRows
@@ -341,18 +364,33 @@ def test_dask_classifier(model: str, client: "Client") -> None:
cls_booster = classifier.get_booster()
single_node_proba = cls_booster.inplace_predict(X.compute())
np.testing.assert_allclose(single_node_proba, probas.compute())
# test shared by CPU and GPU
if isinstance(single_node_proba, np.ndarray):
np.testing.assert_allclose(single_node_proba, probas)
else:
import cupy
cupy.testing.assert_allclose(single_node_proba, probas)
# Test with dataframe.
X_d = dd.from_dask_array(X)
y_d = dd.from_dask_array(y)
classifier.fit(X_d, y_d)
# Test with dataframe, not shared with GPU as cupy doesn't work well with da.unique.
if isinstance(X, da.Array):
X_d: dd.DataFrame = X.to_dask_dataframe()
assert classifier.n_classes_ == 10
prediction = classifier.predict(X_d).compute()
assert classifier.n_classes_ == 10
prediction_df = classifier.predict(X_d).compute()
assert prediction.ndim == 1
assert prediction.shape[0] == kRows
assert prediction_df.ndim == 1
assert prediction_df.shape[0] == kRows
np.testing.assert_allclose(prediction_df, prediction)
probas = classifier.predict_proba(X).compute()
np.testing.assert_allclose(single_node_proba, probas)
@pytest.mark.parametrize("model", ["boosting", "rf"])
def test_dask_classifier(model: str, client: "Client") -> None:
    """Run the shared dask classifier checks on generated data with weights.

    The generated targets are multiplied by 10 and cast to ``int32`` before
    being handed, together with the features and sample weights, to
    ``run_dask_classifier``.
    """
    X, raw_y, w = generate_array(with_weights=True)
    # NOTE(review): presumably turns continuous targets into integer class
    # labels; confirm against ``generate_array``.
    labels = (raw_y * 10).astype(np.int32)
    run_dask_classifier(X, labels, w, model, client)
@pytest.mark.skipif(**tm.no_sklearn())
@@ -913,9 +951,9 @@ class TestWithDask:
train = xgb.dask.DaskDMatrix(client, dX, dy)
dX = dd.from_array(X)
dX = client.persist(dX, workers={dX: workers[1]})
dX = client.persist(dX, workers=workers[1])
dy = dd.from_array(y)
dy = client.persist(dy, workers={dy: workers[1]})
dy = client.persist(dy, workers=workers[1])
valid = xgb.dask.DaskDMatrix(client, dX, dy)
merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
@@ -1060,6 +1098,16 @@ class TestWithDask:
assert_shape(shap.shape)
assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5)
X = dd.from_dask_array(X).repartition(npartitions=32)
y = dd.from_dask_array(y).repartition(npartitions=32)
shap_df = xgb.dask.predict(
client, booster, X, pred_contribs=True, validate_features=False
).compute()
assert_shape(shap_df.shape)
assert np.allclose(
np.sum(shap_df, axis=len(shap_df.shape) - 1), margin, 1e-5, 1e-5
)
def run_shap_cls_sklearn(self, X: Any, y: Any, client: "Client") -> None:
X, y = da.from_array(X, chunks=(32, -1)), da.from_array(y, chunks=32)
cls = xgb.dask.DaskXGBClassifier(n_estimators=4)