From 7aec915dcdd0189b0049d3866d0f469c1f8497d9 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 10 Jan 2021 16:05:17 +0800
Subject: [PATCH] [Backport] Rename `data` to `X` in `predict_proba`. (#6555)
 (#6586)

* [Breaking] Rename `data` to `X` in `predict_proba`. (#6555)

New Scikit-Learn versions pass arguments by keyword, and `X` is the
predefined keyword for the feature matrix.

* Use pip to install the latest Python graphviz on Windows CI.

* Suppress health check.
---
 python-package/xgboost/dask.py          |  8 +++----
 python-package/xgboost/sklearn.py       | 18 ++++++++--------
 tests/ci_build/conda_env/win64_test.yml |  2 +-
 tests/python-gpu/test_gpu_with_dask.py  | 28 ++++++++++++++++++-------
 tests/python/test_with_dask.py          | 14 +++++++++----
 5 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py
index 4000c280a..d9830f924 100644
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -1210,10 +1210,10 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
                                 early_stopping_rounds=early_stopping_rounds,
                                 verbose=verbose)
 
-    async def _predict_proba_async(self, data, output_margin=False,
+    async def _predict_proba_async(self, X, output_margin=False,
                                    base_margin=None):
         test_dmatrix = await DaskDMatrix(
-            client=self.client, data=data, base_margin=base_margin,
+            client=self.client, data=X, base_margin=base_margin,
             missing=self.missing
         )
         pred_probs = await predict(client=self.client,
@@ -1223,11 +1223,11 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
         return pred_probs
 
     # pylint: disable=arguments-differ,missing-docstring
-    def predict_proba(self, data, output_margin=False, base_margin=None):
+    def predict_proba(self, X, output_margin=False, base_margin=None):
         _assert_dask_support()
         return self.client.sync(
             self._predict_proba_async,
-            data,
+            X=X,
             output_margin=output_margin,
             base_margin=base_margin
         )
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 717ab1d3f..0572c77d4 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -995,10 +995,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             return self._le.inverse_transform(column_indexes)
         return column_indexes
 
-    def predict_proba(self, data, ntree_limit=None, validate_features=False,
+    def predict_proba(self, X, ntree_limit=None, validate_features=False,
                       base_margin=None):
-        """
-        Predict the probability of each `data` example being of a given class.
+        """ Predict the probability of each `X` example being of a given class.
 
         .. note:: This function is not thread safe
 
@@ -1008,21 +1007,22 @@
 
         Parameters
         ----------
-        data : array_like
+        X : array_like
             Feature matrix.
         ntree_limit : int
-            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
-            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+            Limit number of trees in the prediction; defaults to best_ntree_limit if
+            defined (i.e. it has been trained with early stopping), otherwise 0 (use all
+            trees).
         validate_features : bool
-            When this is True, validate that the Booster's and data's feature_names are identical.
-            Otherwise, it is assumed that the feature_names are the same.
+            When this is True, validate that the Booster's and data's feature_names are
+            identical. Otherwise, it is assumed that the feature_names are the same.
 
         Returns
         -------
         prediction : numpy array
             a numpy array with the probability of each data example being of a given class.
         """
-        test_dmatrix = DMatrix(data, base_margin=base_margin,
+        test_dmatrix = DMatrix(X, base_margin=base_margin,
                                missing=self.missing, nthread=self.n_jobs)
         if ntree_limit is None:
             ntree_limit = getattr(self, "best_ntree_limit", 0)
diff --git a/tests/ci_build/conda_env/win64_test.yml b/tests/ci_build/conda_env/win64_test.yml
index df06ebff2..f353c8af7 100644
--- a/tests/ci_build/conda_env/win64_test.yml
+++ b/tests/ci_build/conda_env/win64_test.yml
@@ -9,7 +9,6 @@ dependencies:
 - scikit-learn
 - pandas
 - pytest
-- python-graphviz
 - boto3
 - hypothesis
 - jsonschema
@@ -17,3 +16,4 @@ dependencies:
 - pip:
   - cupy-cuda101
   - modin[all]
+  - graphviz
diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py
index a0bafd2ef..e2070af88 100644
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -5,8 +5,10 @@ import numpy as np
 import asyncio
 import xgboost
 import subprocess
+import hypothesis
 from hypothesis import given, strategies, settings, note
 from hypothesis._settings import duration
+from hypothesis import HealthCheck
 from test_gpu_updaters import parameter_strategy
 
 if sys.platform.startswith("win"):
@@ -19,6 +21,11 @@ from test_with_dask import _get_client_workers  # noqa
 from test_with_dask import generate_array  # noqa
 import testing as tm  # noqa
 
+if hasattr(HealthCheck, 'function_scoped_fixture'):
+    suppress = [HealthCheck.function_scoped_fixture]
+else:
+    suppress = hypothesis.utils.conventions.not_set
+
 try:
     import dask.dataframe as dd
 
@@ -161,19 +168,24 @@ class TestDistributedGPU:
             run_with_dask_dataframe(dxgb.DaskDMatrix, client)
             run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client)
 
-    @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20),
-           dataset=tm.dataset_strategy)
-    @settings(deadline=duration(seconds=120))
+    @given(
+        params=parameter_strategy,
+        num_rounds=strategies.integers(1, 20),
+        dataset=tm.dataset_strategy,
+    )
+    @settings(deadline=duration(seconds=120), suppress_health_check=suppress)
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_dask_cuda())
-    @pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster'])
+    @pytest.mark.parametrize(
+        "local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
+    )
     @pytest.mark.mgpu
     def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster):
         with Client(local_cuda_cluster) as client:
-            run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix,
-                         client)
-            run_gpu_hist(params, num_rounds, dataset,
-                         dxgb.DaskDeviceQuantileDMatrix, client)
+            run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client)
+            run_gpu_hist(
+                params, num_rounds, dataset, dxgb.DaskDeviceQuantileDMatrix, client
+            )
 
     @pytest.mark.skipif(**tm.no_cupy())
     @pytest.mark.skipif(**tm.no_dask())
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index ba697ab4d..0e876fcad 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -8,7 +8,8 @@ import asyncio
 from sklearn.datasets import make_classification
 import os
 import subprocess
-from hypothesis import given, settings, note
+import hypothesis
+from hypothesis import given, settings, note, HealthCheck
 from test_updaters import hist_parameter_strategy, exact_parameter_strategy
 
 if sys.platform.startswith("win"):
@@ -17,6 +18,12 @@
 if tm.no_dask()['condition']:
     pytest.skip(msg=tm.no_dask()['reason'], allow_module_level=True)
 
+if hasattr(HealthCheck, 'function_scoped_fixture'):
+    suppress = [HealthCheck.function_scoped_fixture]
+else:
+    suppress = hypothesis.utils.conventions.not_set
+
+
 try:
     from distributed import LocalCluster, Client, get_client
     from distributed.utils_test import client, loop, cluster_fixture
@@ -668,14 +675,14 @@ class TestWithDask:
 
     @given(params=hist_parameter_strategy,
           dataset=tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, suppress_health_check=suppress)
     def test_hist(self, params, dataset, client):
         num_rounds = 30
         self.run_updater_test(client, params, num_rounds, dataset, 'hist')
 
     @given(params=exact_parameter_strategy,
           dataset=tm.dataset_strategy)
-    @settings(deadline=None)
+    @settings(deadline=None, suppress_health_check=suppress)
     def test_approx(self, client, params, dataset):
         num_rounds = 30
         self.run_updater_test(client, params, num_rounds, dataset, 'approx')
@@ -795,7 +802,6 @@ class TestDaskCallbacks:
         merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')])
         assert len(merged) == 2
 
-
     def test_data_initialization(self):
         '''Assert each worker has the correct amount of data,
         and DMatrix initialization doesn't generate unnecessary copies of data.
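
Editor's note (not part of the patch): the rename matters because
scikit-learn tooling addresses the feature matrix by the keyword `X`, so a
`predict_proba` whose first parameter is named `data` fails under keyword
dispatch. A minimal sketch against the patched wrapper; the toy dataset and
hyperparameters are illustrative:

    import numpy as np
    from sklearn.datasets import make_classification
    from xgboost import XGBClassifier

    X_train, y_train = make_classification(n_samples=100, n_features=10,
                                           random_state=0)
    clf = XGBClassifier(n_estimators=10).fit(X_train, y_train)

    # Before this patch, the keyword form raised:
    #   TypeError: predict_proba() got an unexpected keyword argument 'X'
    assert np.allclose(clf.predict_proba(X_train),
                       clf.predict_proba(X=X_train))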
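Editor's note (not part of the patch; the test below is hypothetical): the
`suppress` shim guards against older hypothesis releases, where
`HealthCheck.function_scoped_fixture` does not exist. On newer releases,
@given warns when the wrapped test also takes a function-scoped pytest
fixture (such as the `client` fixture used by these tests); the shim
suppresses exactly that check when it is available and otherwise falls back
to hypothesis's internal "not set" sentinel, which leaves the settings
default untouched:

    import hypothesis
    from hypothesis import HealthCheck, given, settings, strategies

    if hasattr(HealthCheck, 'function_scoped_fixture'):
        suppress = [HealthCheck.function_scoped_fixture]
    else:
        suppress = hypothesis.utils.conventions.not_set

    @given(x=strategies.integers())
    @settings(deadline=None, suppress_health_check=suppress)
    def test_with_function_scoped_fixture(x, tmp_path):
        # tmp_path is a function-scoped pytest fixture; without the
        # suppression, newer hypothesis versions flag this pattern.
        assert isinstance(x, int)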
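Editor's note (not part of the patch; cluster size and data shapes are
illustrative): the Dask wrapper is renamed the same way, so
`DaskXGBClassifier.predict_proba` now also accepts `X` by keyword and
forwards it to the async prediction path:

    import dask.array as da
    from distributed import Client, LocalCluster
    from xgboost.dask import DaskXGBClassifier

    if __name__ == "__main__":
        with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
            X = da.random.random((1000, 10), chunks=(100, 10))
            y = (da.random.random(1000, chunks=100) > 0.5).astype(int)
            clf = DaskXGBClassifier(n_estimators=10)
            clf.fit(X, y)
            proba = clf.predict_proba(X=X)  # keyword form, matching sklearn
            print(proba.compute().shape)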