[Backport] Rename data to X in predict_proba. (#6555) (#6586)

* [Breaking] Rename `data` to `X` in `predict_proba`. (#6555)

Newer versions of scikit-learn pass the input as a keyword argument, and `X`
is the predefined keyword name.

* Use pip to install latest Python graphviz on Windows CI.

* Suppress the Hypothesis `function_scoped_fixture` health check (when available) in the Dask tests.
This commit is contained in:
Jiaming Yuan 2021-01-10 16:05:17 +08:00 committed by GitHub
parent a78d0d4110
commit 7aec915dcd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 44 additions and 26 deletions

View File

@ -1210,10 +1210,10 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
early_stopping_rounds=early_stopping_rounds, early_stopping_rounds=early_stopping_rounds,
verbose=verbose) verbose=verbose)
async def _predict_proba_async(self, data, output_margin=False, async def _predict_proba_async(self, X, output_margin=False,
base_margin=None): base_margin=None):
test_dmatrix = await DaskDMatrix( test_dmatrix = await DaskDMatrix(
client=self.client, data=data, base_margin=base_margin, client=self.client, data=X, base_margin=base_margin,
missing=self.missing missing=self.missing
) )
pred_probs = await predict(client=self.client, pred_probs = await predict(client=self.client,
@ -1223,11 +1223,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
return pred_probs return pred_probs
# pylint: disable=arguments-differ,missing-docstring # pylint: disable=arguments-differ,missing-docstring
def predict_proba(self, data, output_margin=False, base_margin=None): def predict_proba(self, X, output_margin=False, base_margin=None):
_assert_dask_support() _assert_dask_support()
return self.client.sync( return self.client.sync(
self._predict_proba_async, self._predict_proba_async,
data, X=X,
output_margin=output_margin, output_margin=output_margin,
base_margin=base_margin base_margin=base_margin
) )

View File

@ -995,10 +995,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
return self._le.inverse_transform(column_indexes) return self._le.inverse_transform(column_indexes)
return column_indexes return column_indexes
def predict_proba(self, data, ntree_limit=None, validate_features=False, def predict_proba(self, X, ntree_limit=None, validate_features=False,
base_margin=None): base_margin=None):
""" """ Predict the probability of each `X` example being of a given class.
Predict the probability of each `data` example being of a given class.
.. note:: This function is not thread safe .. note:: This function is not thread safe
@ -1008,21 +1007,22 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
Parameters Parameters
---------- ----------
data : array_like X : array_like
Feature matrix. Feature matrix.
ntree_limit : int ntree_limit : int
Limit number of trees in the prediction; defaults to best_ntree_limit if defined Limit number of trees in the prediction; defaults to best_ntree_limit if
(i.e. it has been trained with early stopping), otherwise 0 (use all trees). defined (i.e. it has been trained with early stopping), otherwise 0 (use all
trees).
validate_features : bool validate_features : bool
When this is True, validate that the Booster's and data's feature_names are identical. When this is True, validate that the Booster's and data's feature_names are
Otherwise, it is assumed that the feature_names are the same. identical. Otherwise, it is assumed that the feature_names are the same.
Returns Returns
------- -------
prediction : numpy array prediction : numpy array
a numpy array with the probability of each data example being of a given class. a numpy array with the probability of each data example being of a given class.
""" """
test_dmatrix = DMatrix(data, base_margin=base_margin, test_dmatrix = DMatrix(X, base_margin=base_margin,
missing=self.missing, nthread=self.n_jobs) missing=self.missing, nthread=self.n_jobs)
if ntree_limit is None: if ntree_limit is None:
ntree_limit = getattr(self, "best_ntree_limit", 0) ntree_limit = getattr(self, "best_ntree_limit", 0)

View File

@ -9,7 +9,6 @@ dependencies:
- scikit-learn - scikit-learn
- pandas - pandas
- pytest - pytest
- python-graphviz
- boto3 - boto3
- hypothesis - hypothesis
- jsonschema - jsonschema
@ -17,3 +16,4 @@ dependencies:
- pip: - pip:
- cupy-cuda101 - cupy-cuda101
- modin[all] - modin[all]
- graphviz

View File

@ -5,8 +5,10 @@ import numpy as np
import asyncio import asyncio
import xgboost import xgboost
import subprocess import subprocess
import hypothesis
from hypothesis import given, strategies, settings, note from hypothesis import given, strategies, settings, note
from hypothesis._settings import duration from hypothesis._settings import duration
from hypothesis import HealthCheck
from test_gpu_updaters import parameter_strategy from test_gpu_updaters import parameter_strategy
if sys.platform.startswith("win"): if sys.platform.startswith("win"):
@ -19,6 +21,11 @@ from test_with_dask import _get_client_workers # noqa
from test_with_dask import generate_array # noqa from test_with_dask import generate_array # noqa
import testing as tm # noqa import testing as tm # noqa
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try: try:
import dask.dataframe as dd import dask.dataframe as dd
@ -161,19 +168,24 @@ class TestDistributedGPU:
run_with_dask_dataframe(dxgb.DaskDMatrix, client) run_with_dask_dataframe(dxgb.DaskDMatrix, client)
run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)
@given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), @given(
dataset=tm.dataset_strategy) params=parameter_strategy,
@settings(deadline=duration(seconds=120)) num_rounds=strategies.integers(1, 20),
dataset=tm.dataset_strategy,
)
@settings(deadline=duration(seconds=120), suppress_health_check=suppress)
@pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster']) @pytest.mark.parametrize(
"local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
)
@pytest.mark.mgpu @pytest.mark.mgpu
def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster): def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster):
with Client(local_cuda_cluster) as client: with Client(local_cuda_cluster) as client:
run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
client) run_gpu_hist(
run_gpu_hist(params, num_rounds, dataset, params, num_rounds, dataset, dxgb.DaskDeviceQuantileDMatrix, client
dxgb.DaskDeviceQuantileDMatrix, client) )
@pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask())

View File

@ -8,7 +8,8 @@ import asyncio
from sklearn.datasets import make_classification from sklearn.datasets import make_classification
import os import os
import subprocess import subprocess
from hypothesis import given, settings, note import hypothesis
from hypothesis import given, settings, note, HealthCheck
from test_updaters import hist_parameter_strategy, exact_parameter_strategy from test_updaters import hist_parameter_strategy, exact_parameter_strategy
if sys.platform.startswith("win"): if sys.platform.startswith("win"):
@ -17,6 +18,12 @@ if tm.no_dask()['condition']:
pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True) pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True)
if hasattr(HealthCheck, 'function_scoped_fixture'):
suppress = [HealthCheck.function_scoped_fixture]
else:
suppress = hypothesis.utils.conventions.not_set
try: try:
from distributed import LocalCluster, Client, get_client from distributed import LocalCluster, Client, get_client
from distributed.utils_test import client, loop, cluster_fixture from distributed.utils_test import client, loop, cluster_fixture
@ -668,14 +675,14 @@ class TestWithDask:
@given(params=hist_parameter_strategy, @given(params=hist_parameter_strategy,
dataset=tm.dataset_strategy) dataset=tm.dataset_strategy)
@settings(deadline=None) @settings(deadline=None, suppress_health_check=suppress)
def test_hist(self, params, dataset, client): def test_hist(self, params, dataset, client):
num_rounds = 30 num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'hist') self.run_updater_test(client, params, num_rounds, dataset, 'hist')
@given(params=exact_parameter_strategy, @given(params=exact_parameter_strategy,
dataset=tm.dataset_strategy) dataset=tm.dataset_strategy)
@settings(deadline=None) @settings(deadline=None, suppress_health_check=suppress)
def test_approx(self, client, params, dataset): def test_approx(self, client, params, dataset):
num_rounds = 30 num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'approx') self.run_updater_test(client, params, num_rounds, dataset, 'approx')
@ -795,7 +802,6 @@ class TestDaskCallbacks:
merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')]) merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
assert len(merged) == 2 assert len(merged) == 2
def test_data_initialization(self): def test_data_initialization(self):
'''Assert each worker has the correct amount of data, and DMatrix initialization doesn't '''Assert each worker has the correct amount of data, and DMatrix initialization doesn't
generate unnecessary copies of data. generate unnecessary copies of data.