Backport fixes to 1.2 (#6002)

* Fix sklearn doc. (#5980)

* Enforce tree order in JSON. (#5974)

* Make JSON model IO more future proof by using tree id in model loading.

* Fix dask predict shape inference. (#5989)

* [Breaking] Fix .predict() method and add .predict_proba() in xgboost.dask.DaskXGBClassifier (#5986)
This commit is contained in:
Jiaming Yuan
2020-08-11 20:22:31 +08:00
committed by GitHub
parent 7856da5827
commit 936a854baa
5 changed files with 105 additions and 32 deletions

View File

@@ -5,6 +5,7 @@ import sys
import numpy as np
import json
import asyncio
from sklearn.datasets import make_classification
if sys.platform.startswith("win"):
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -36,7 +37,7 @@ def generate_array():
def test_from_dask_dataframe():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
X, y = generate_array()
@@ -74,7 +75,7 @@ def test_from_dask_dataframe():
def test_from_dask_array():
with LocalCluster(n_workers=5, threads_per_worker=5) as cluster:
with LocalCluster(n_workers=kWorkers, threads_per_worker=5) as cluster:
with Client(cluster) as client:
X, y = generate_array()
dtrain = DaskDMatrix(client, X, y)
@@ -104,8 +105,28 @@ def test_from_dask_array():
assert np.all(single_node_predt == from_arr.compute())
def test_dask_predict_shape_infer():
    """Check that the lazily reported prediction shape matches the computed one.

    Trains a 3-class ``multi:softprob`` model on collections built with
    ``dd.from_array`` and asserts that both dimensions reported by the
    un-computed prediction agree with the materialised result.
    """
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y = make_classification(n_samples=1000, n_informative=5,
                                       n_classes=3)
            X_ = dd.from_array(X, chunksize=100)
            y_ = dd.from_array(y, chunksize=100)
            dtrain = xgb.dask.DaskDMatrix(client, data=X_, label=y_)

            model = xgb.dask.train(
                client,
                {"objective": "multi:softprob", "num_class": 3},
                dtrain=dtrain
            )

            preds = xgb.dask.predict(client, model, dtrain)
            # Materialise once and reuse the result for both checks instead
            # of calling .compute() separately for each dimension.
            computed = preds.compute()
            assert preds.shape[0] == computed.shape[0]
            assert preds.shape[1] == computed.shape[1]
def test_dask_missing_value_reg():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
X_0 = np.ones((20 // 2, kCols))
X_1 = np.zeros((20 // 2, kCols))
@@ -144,19 +165,19 @@ def test_dask_missing_value_cls():
missing=0.0)
cls.client = client
cls.fit(X, y, eval_set=[(X, y)])
dd_predt = cls.predict(X).compute()
dd_pred_proba = cls.predict_proba(X).compute()
np_X = X.compute()
np_predt = cls.get_booster().predict(
np_pred_proba = cls.get_booster().predict(
xgb.DMatrix(np_X, missing=0.0))
np.testing.assert_allclose(np_predt, dd_predt)
np.testing.assert_allclose(np_pred_proba, dd_pred_proba)
cls = xgb.dask.DaskXGBClassifier()
assert hasattr(cls, 'missing')
def test_dask_regressor():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
X, y = generate_array()
regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
@@ -178,7 +199,7 @@ def test_dask_regressor():
def test_dask_classifier():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
X, y = generate_array()
y = (y * 10).astype(np.int32)
@@ -201,7 +222,18 @@ def test_dask_classifier():
assert len(list(history['validation_0'])) == 1
assert len(history['validation_0']['merror']) == 2
# Test .predict_proba()
probas = classifier.predict_proba(X)
assert classifier.n_classes_ == 10
assert probas.ndim == 2
assert probas.shape[0] == kRows
assert probas.shape[1] == 10
cls_booster = classifier.get_booster()
single_node_proba = cls_booster.inplace_predict(X.compute())
np.testing.assert_allclose(single_node_proba,
probas.compute())
# Test with dataframe.
X_d = dd.from_dask_array(X)
@@ -218,7 +250,7 @@ def test_dask_classifier():
@pytest.mark.skipif(**tm.no_sklearn())
def test_sklearn_grid_search():
from sklearn.model_selection import GridSearchCV
with LocalCluster(n_workers=4) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
X, y = generate_array()
reg = xgb.dask.DaskXGBRegressor(learning_rate=0.1,
@@ -292,7 +324,9 @@ def run_empty_dmatrix_cls(client, parameters):
evals=[(dtrain, 'validation')],
num_boost_round=2)
predictions = xgb.dask.predict(client=client, model=out,
data=dtrain).compute()
data=dtrain)
assert predictions.shape[1] == n_classes
predictions = predictions.compute()
_check_outputs(out, predictions)
# train has more rows than evals
@@ -315,7 +349,7 @@ def run_empty_dmatrix_cls(client, parameters):
# environment and Exact doesn't support it.
def test_empty_dmatrix_hist():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
parameters = {'tree_method': 'hist'}
run_empty_dmatrix_reg(client, parameters)
@@ -323,7 +357,7 @@ def test_empty_dmatrix_hist():
def test_empty_dmatrix_approx():
with LocalCluster(n_workers=5) as cluster:
with LocalCluster(n_workers=kWorkers) as cluster:
with Client(cluster) as client:
parameters = {'tree_method': 'approx'}
run_empty_dmatrix_reg(client, parameters)
@@ -397,7 +431,13 @@ async def run_dask_classifier_asyncio(scheduler_address):
assert len(list(history['validation_0'])) == 1
assert len(history['validation_0']['merror']) == 2
# Test .predict_proba()
probas = await classifier.predict_proba(X)
assert classifier.n_classes_ == 10
assert probas.ndim == 2
assert probas.shape[0] == kRows
assert probas.shape[1] == 10
# Test with dataframe.
X_d = dd.from_dask_array(X)