Back port fixes to 1.2 (#6002)

* Fix sklearn doc. (#5980)

* Enforce tree order in JSON. (#5974)

* Make JSON model IO more future proof by using tree id in model loading.

* Fix dask predict shape infer. (#5989)

* [Breaking] Fix .predict() method and add .predict_proba() in xgboost.dask.DaskXGBClassifier (#5986)
This commit is contained in:
Jiaming Yuan
2020-08-11 20:22:31 +08:00
committed by GitHub
parent 7856da5827
commit 936a854baa
5 changed files with 105 additions and 32 deletions

View File

@@ -738,7 +738,8 @@ async def _predict_async(client: Client, model, data, *args,
predt = booster.predict(data=local_x,
validate_features=local_x.num_row() != 0,
*args)
ret = (delayed(predt), order)
columns = 1 if len(predt.shape) == 1 else predt.shape[1]
ret = ((delayed(predt), columns), order)
predictions.append(ret)
return predictions
@@ -775,8 +776,10 @@ async def _predict_async(client: Client, model, data, *args,
# See https://docs.dask.org/en/latest/array-creation.html
arrays = []
for i, shape in enumerate(shapes):
arrays.append(da.from_delayed(results[i], shape=(shape[0], ),
dtype=numpy.float32))
arrays.append(da.from_delayed(
results[i][0], shape=(shape[0],)
if results[i][1] == 1 else (shape[0], results[i][1]),
dtype=numpy.float32))
predictions = await da.concatenate(arrays, axis=0)
return predictions
@@ -978,6 +981,7 @@ class DaskScikitLearnBase(XGBModel):
def client(self, clt):
self._client = clt
@xgboost_model_doc("""Implementation of the Scikit-Learn API for XGBoost.""",
['estimators', 'model'])
class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
@@ -1032,9 +1036,6 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
['estimators', 'model']
)
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
# pylint: disable=missing-docstring
_client = None
async def _fit_async(self, X, y,
sample_weights=None,
eval_set=None,
@@ -1078,13 +1079,34 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
return self.client.sync(self._fit_async, X, y, sample_weights,
eval_set, sample_weight_eval_set, verbose)
async def _predict_async(self, data):
async def _predict_proba_async(self, data):
_assert_dask_support()
test_dmatrix = await DaskDMatrix(client=self.client, data=data,
missing=self.missing)
pred_probs = await predict(client=self.client,
model=self.get_booster(), data=test_dmatrix)
return pred_probs
def predict_proba(self, data): # pylint: disable=arguments-differ,missing-docstring
_assert_dask_support()
return self.client.sync(self._predict_proba_async, data)
async def _predict_async(self, data):
_assert_dask_support()
test_dmatrix = await DaskDMatrix(client=self.client, data=data,
missing=self.missing)
pred_probs = await predict(client=self.client,
model=self.get_booster(), data=test_dmatrix)
if self.n_classes_ == 2:
preds = (pred_probs > 0.5).astype(int)
else:
preds = da.argmax(pred_probs, axis=1)
return preds
def predict(self, data): # pylint: disable=arguments-differ
_assert_dask_support()
return self.client.sync(self._predict_async, data)

View File

@@ -77,7 +77,7 @@ __model_doc = '''
gamma : float
Minimum loss reduction required to make a further partition on a leaf
node of the tree.
min_child_weight : int
min_child_weight : float
Minimum sum of instance weight(hessian) needed in a child.
max_delta_step : int
Maximum delta step we allow each tree's weight estimation to be.