[breaking] Remove the predictor param, allow fallback to prediction using DMatrix. (#9129)

- A `DeviceOrd` struct is implemented to indicate the device. It will eventually replace the `gpu_id` parameter.
- The `predictor` parameter is removed.
- Fall back to `DMatrix` when `inplace_predict` is not available.
- The heuristic for choosing a predictor is only used during training.
2023-07-03 19:23:54 +08:00
parent 3a0f787703
commit 39390cc2ee
54 changed files with 1049 additions and 778 deletions
--- a/tests/python-gpu/load_pickle.py
+++ b/tests/python-gpu/load_pickle.py
@@ -1,5 +1,5 @@
-'''Loading a pickled model generated by test_pickling.py, only used by
-`test_gpu_with_dask.py`'''
+"""Loading a pickled model generated by test_pickling.py, only used by
+`test_gpu_with_dask.py`"""
 import json
 import os

@@ -12,9 +12,9 @@ from xgboost import testing as tm


 class TestLoadPickle:
-    def test_load_pkl(self):
-        '''Test whether prediction is correct.'''
-        assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1'
+    def test_load_pkl(self) -> None:
+        """Test whether prediction is correct."""
+        assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1"
        bst = load_pickle(model_path)
        x, y = build_dataset()
        if isinstance(bst, xgb.Booster):
@@ -28,46 +28,42 @@ class TestLoadPickle:

        assert len(res) == 10

-    def test_predictor_type_is_auto(self):
-        '''Under invalid CUDA_VISIBLE_DEVICES, predictor should be set to
-        auto'''
-        assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1'
+    def test_context_is_removed(self) -> None:
+        """Under invalid CUDA_VISIBLE_DEVICES, context should reset"""
+        assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1"
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config['learner']['gradient_booster']['gbtree_train_param'][
-            'predictor'] == 'auto'
+        assert config["learner"]["generic_param"]["gpu_id"] == "-1"

-    def test_predictor_type_is_gpu(self):
-        '''When CUDA_VISIBLE_DEVICES is not specified, keep using
-        `gpu_predictor`'''
-        assert 'CUDA_VISIBLE_DEVICES' not in os.environ.keys()
+    def test_context_is_preserved(self) -> None:
+        """Test the device context is preserved after pickling."""
+        assert "CUDA_VISIBLE_DEVICES" not in os.environ.keys()
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config['learner']['gradient_booster']['gbtree_train_param'][
-            'predictor'] == 'gpu_predictor'
+        assert config["learner"]["generic_param"]["gpu_id"] == "0"

-    def test_wrap_gpu_id(self):
-        assert os.environ['CUDA_VISIBLE_DEVICES'] == '0'
+    def test_wrap_gpu_id(self) -> None:
+        assert os.environ["CUDA_VISIBLE_DEVICES"] == "0"
        bst = load_pickle(model_path)
        config = bst.save_config()
        config = json.loads(config)
-        assert config['learner']['generic_param']['gpu_id'] == '0'
+        assert config["learner"]["generic_param"]["gpu_id"] == "0"

        x, y = build_dataset()
        test_x = xgb.DMatrix(x)
        res = bst.predict(test_x)
        assert len(res) == 10

-    def test_training_on_cpu_only_env(self):
-        assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1'
+    def test_training_on_cpu_only_env(self) -> None:
+        assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1"
        rng = np.random.RandomState(1994)
        X = rng.randn(10, 10)
        y = rng.randn(10)
        with tm.captured_output() as (out, err):
            # Test no thrust exception is thrown
            with pytest.raises(xgb.core.XGBoostError):
-                xgb.train({'tree_method': 'gpu_hist'}, xgb.DMatrix(X, y))
+                xgb.train({"tree_method": "gpu_hist"}, xgb.DMatrix(X, y))

-            assert out.getvalue().find('No visible GPU is found') != -1
+            assert out.getvalue().find("No visible GPU is found") != -1
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -203,7 +203,7 @@ class TestQuantileDMatrix:
        np.testing.assert_equal(h_ret.indices, d_ret.indices)

        booster = xgb.train(
-            {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}, dtrain=d_m
+            {"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m
        )

        np.testing.assert_allclose(
--- a/tests/python-gpu/test_from_cupy.py
+++ b/tests/python-gpu/test_from_cupy.py
@@ -221,9 +221,10 @@ Arrow specification.'''
    def test_specified_device(self):
        import cupy as cp
        cp.cuda.runtime.setDevice(0)
-        dtrain = dmatrix_from_cupy(
-            np.float32, xgb.QuantileDMatrix, np.nan)
-        with pytest.raises(xgb.core.XGBoostError):
+        dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan)
+        with pytest.raises(
+            xgb.core.XGBoostError, match="Data is resided on a different device"
+        ):
            xgb.train(
                {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10
            )
--- a/tests/python-gpu/test_gpu_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -1,5 +1,4 @@
-'''Test model IO with pickle.'''
-import json
+"""Test model IO with pickle."""
 import os
 import pickle
 import subprocess
@@ -11,49 +10,48 @@ import xgboost as xgb
 from xgboost import XGBClassifier
 from xgboost import testing as tm

-model_path = './model.pkl'
+model_path = "./model.pkl"

 pytestmark = tm.timeout(30)


 def build_dataset():
    N = 10
-    x = np.linspace(0, N*N, N*N)
+    x = np.linspace(0, N * N, N * N)
    x = x.reshape((N, N))
    y = np.linspace(0, N, N)
    return x, y


 def save_pickle(bst, path):
-    with open(path, 'wb') as fd:
+    with open(path, "wb") as fd:
        pickle.dump(bst, fd)


 def load_pickle(path):
-    with open(path, 'rb') as fd:
+    with open(path, "rb") as fd:
        bst = pickle.load(fd)
    return bst


 class TestPickling:
-    args_template = [
-        "pytest",
-        "--verbose",
-        "-s",
-        "--fulltrace"]
+    args_template = ["pytest", "--verbose", "-s", "--fulltrace"]

    def run_pickling(self, bst) -> None:
        save_pickle(bst, model_path)
        args = [
-            "pytest", "--verbose", "-s", "--fulltrace",
-            "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_load_pkl"
+            "pytest",
+            "--verbose",
+            "-s",
+            "--fulltrace",
+            "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_load_pkl",
        ]
-        command = ''
+        command = ""
        for arg in args:
            command += arg
-            command += ' '
+            command += " "

-        cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'}
+        cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"}
        env = os.environ.copy()
        # Passing new_environment directly to `env' argument results
        # in failure on Windows:
@@ -72,7 +70,7 @@ class TestPickling:
        x, y = build_dataset()
        train_x = xgb.DMatrix(x, label=y)

-        param = {'tree_method': 'gpu_hist', "gpu_id": 0}
+        param = {"tree_method": "gpu_hist", "gpu_id": 0}
        bst = xgb.train(param, train_x)
        self.run_pickling(bst)

@@ -91,43 +89,46 @@ class TestPickling:
        X, y = build_dataset()
        dtrain = xgb.DMatrix(X, y)

-        bst = xgb.train({'tree_method': 'gpu_hist',
-                         'gpu_id': 1},
-                        dtrain, num_boost_round=6)
+        bst = xgb.train(
+            {"tree_method": "gpu_hist", "gpu_id": 1}, dtrain, num_boost_round=6
+        )

-        model_path = 'model.pkl'
+        model_path = "model.pkl"
        save_pickle(bst, model_path)
-        cuda_environment = {'CUDA_VISIBLE_DEVICES': '0'}
+        cuda_environment = {"CUDA_VISIBLE_DEVICES": "0"}
        env = os.environ.copy()
        env.update(cuda_environment)
        args = self.args_template.copy()
        args.append(
-            "./tests/python-gpu/"
-            "load_pickle.py::TestLoadPickle::test_wrap_gpu_id"
+            "./tests/python-gpu/" "load_pickle.py::TestLoadPickle::test_wrap_gpu_id"
        )
        status = subprocess.call(args, env=env)
        assert status == 0
        os.remove(model_path)

-    def test_pickled_predictor(self):
-        x, y = build_dataset()
+    def test_pickled_context(self):
+        x, y = tm.make_sparse_regression(10, 10, sparsity=0.8, as_dense=True)
        train_x = xgb.DMatrix(x, label=y)

-        param = {'tree_method': 'gpu_hist',
-                 'verbosity': 1, 'predictor': 'gpu_predictor'}
+        param = {"tree_method": "gpu_hist", "verbosity": 1}
        bst = xgb.train(param, train_x)
-        config = json.loads(bst.save_config())
-        assert config['learner']['gradient_booster']['gbtree_train_param'][
-            'predictor'] == 'gpu_predictor'
+
+        with tm.captured_output() as (out, err):
+            bst.inplace_predict(x)
+
+        # The warning is redirected to Python callback, so it's printed in stdout
+        # instead of stderr.
+        stdout = out.getvalue()
+        assert stdout.find("mismatched devices") != -1

        save_pickle(bst, model_path)

        args = self.args_template.copy()
-        args.append(
-            "./tests/python-gpu/"
-            "load_pickle.py::TestLoadPickle::test_predictor_type_is_auto")
+        root = tm.project_root(__file__)
+        path = os.path.join(root, "tests", "python-gpu", "load_pickle.py")
+        args.append(path + "::TestLoadPickle::test_context_is_removed")

-        cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'}
+        cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"}
        env = os.environ.copy()
        env.update(cuda_environment)

@@ -138,25 +139,29 @@ class TestPickling:
        args = self.args_template.copy()
        args.append(
            "./tests/python-gpu/"
-            "load_pickle.py::TestLoadPickle::test_predictor_type_is_gpu")
+            "load_pickle.py::TestLoadPickle::test_context_is_preserved"
+        )

        # Load in environment that has GPU.
        env = os.environ.copy()
-        assert 'CUDA_VISIBLE_DEVICES' not in env.keys()
+        assert "CUDA_VISIBLE_DEVICES" not in env.keys()
        status = subprocess.call(args, env=env)
        assert status == 0

        os.remove(model_path)

    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_predict_sklearn_pickle(self):
+    def test_predict_sklearn_pickle(self) -> None:
        from sklearn.datasets import load_digits
+
        x, y = load_digits(return_X_y=True)

-        kwargs = {'tree_method': 'gpu_hist',
-                  'predictor': 'gpu_predictor',
-                  'objective': 'binary:logistic',
-                  'n_estimators': 10}
+        kwargs = {
+            "tree_method": "gpu_hist",
+            "objective": "binary:logistic",
+            "gpu_id": 0,
+            "n_estimators": 10,
+        }

        model = XGBClassifier(**kwargs)
        model.fit(x, y)
@@ -165,24 +170,25 @@ class TestPickling:
        del model

        # load model
-        model: xgb.XGBClassifier = load_pickle("model.pkl")
+        model = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
-        bst.set_param({'predictor': 'cpu_predictor'})
+        tm.set_ordinal(-1, bst)
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)

    def test_training_on_cpu_only_env(self):
-        cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'}
+        cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"}
        env = os.environ.copy()
        env.update(cuda_environment)
        args = self.args_template.copy()
        args.append(
            "./tests/python-gpu/"
-            "load_pickle.py::TestLoadPickle::test_training_on_cpu_only_env")
+            "load_pickle.py::TestLoadPickle::test_training_on_cpu_only_env"
+        )
        status = subprocess.call(args, env=env)
        assert status == 0
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@@ -1,4 +1,5 @@
 import sys
+from copy import copy

 import numpy as np
 import pytest
@@ -11,8 +12,10 @@ from xgboost.compat import PANDAS_INSTALLED
 if PANDAS_INSTALLED:
    from hypothesis.extra.pandas import column, data_frames, range_indexes
 else:
+
    def noop(*args, **kwargs):
        pass
+
    column, data_frames, range_indexes = noop, noop, noop

 sys.path.append("tests/python")
@@ -21,16 +24,20 @@ from test_predict import run_threaded_predict  # noqa

 rng = np.random.RandomState(1994)

-shap_parameter_strategy = strategies.fixed_dictionaries({
-    'max_depth': strategies.integers(1, 11),
-    'max_leaves': strategies.integers(0, 256),
-    'num_parallel_tree': strategies.sampled_from([1, 10]),
-}).filter(lambda x: x['max_depth'] > 0 or x['max_leaves'] > 0)
+shap_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 11),
+        "max_leaves": strategies.integers(0, 256),
+        "num_parallel_tree": strategies.sampled_from([1, 10]),
+    }
+).filter(lambda x: x["max_depth"] > 0 or x["max_leaves"] > 0)

-predict_parameter_strategy = strategies.fixed_dictionaries({
-    'max_depth': strategies.integers(1, 8),
-    'num_parallel_tree': strategies.sampled_from([1, 4]),
-})
+predict_parameter_strategy = strategies.fixed_dictionaries(
+    {
+        "max_depth": strategies.integers(1, 8),
+        "num_parallel_tree": strategies.sampled_from([1, 4]),
+    }
+)

 pytestmark = tm.timeout(20)

@@ -47,43 +54,45 @@ class TestGPUPredict:
        # with 5000 rows is 0.04.
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
-                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
-                                     label=[0, 1] * int(num_rows / 2))
-                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
-                                   label=[0, 1] * int(num_rows / 2))
-                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
-                                    label=[0, 1] * int(num_rows / 2))
-                watchlist = [(dtrain, 'train'), (dval, 'validation')]
+                dtrain = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                dval = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                dtest = xgb.DMatrix(
+                    np.random.randn(num_rows, num_cols),
+                    label=[0, 1] * int(num_rows / 2),
+                )
+                watchlist = [(dtrain, "train"), (dval, "validation")]
                res = {}
                param = {
                    "objective": "binary:logistic",
-                    "predictor": "gpu_predictor",
-                    'eval_metric': 'logloss',
-                    'tree_method': 'gpu_hist',
-                    'max_depth': 1
+                    "eval_metric": "logloss",
+                    "tree_method": "gpu_hist",
+                    "gpu_id": 0,
+                    "max_depth": 1,
                }
-                bst = xgb.train(param, dtrain, iterations, evals=watchlist,
-                                evals_result=res)
-                assert self.non_increasing(res["train"]["logloss"])
+                bst = xgb.train(
+                    param, dtrain, iterations, evals=watchlist, evals_result=res
+                )
+                assert tm.non_increasing(res["train"]["logloss"], tolerance=0.001)
+
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

-                param["predictor"] = "cpu_predictor"
-                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
+                bst.set_param({"gpu_id": -1, "tree_method": "hist"})
+                bst_cpu = copy(bst)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)

-                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train,
-                                           rtol=1e-6)
-                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val,
-                                           rtol=1e-6)
-                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test,
-                                           rtol=1e-6)
-
-    def non_increasing(self, L):
-        return all((y - x) < 0.001 for x, y in zip(L, L[1:]))
+                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train, rtol=1e-6)
+                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val, rtol=1e-6)
+                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test, rtol=1e-6)

    # Test case for a bug where multiple batch predictions made on a
    # test set produce incorrect results
@@ -94,26 +103,22 @@ class TestGPUPredict:

        n = 1000
        X, y = make_regression(n, random_state=rng)
-        X_train, X_test, y_train, y_test = train_test_split(X, y,
-                                                            random_state=123)
+        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
-        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"
+        bst = xgb.train(params, dtrain)

-        params['predictor'] = "gpu_predictor"
-        bst_gpu_predict = xgb.train(params, dtrain)
+        tm.set_ordinal(0, bst)
+        # Don't reuse the DMatrix for prediction, otherwise the result is cached.
+        predict_gpu_0 = bst.predict(xgb.DMatrix(X_test))
+        predict_gpu_1 = bst.predict(xgb.DMatrix(X_test))
+        tm.set_ordinal(-1, bst)
+        predict_cpu = bst.predict(xgb.DMatrix(X_test))

-        params['predictor'] = "cpu_predictor"
-        bst_cpu_predict = xgb.train(params, dtrain)
-
-        predict0 = bst_gpu_predict.predict(dtest)
-        predict1 = bst_gpu_predict.predict(dtest)
-        cpu_predict = bst_cpu_predict.predict(dtest)
-
-        assert np.allclose(predict0, predict1)
-        assert np.allclose(predict0, cpu_predict)
+        assert np.allclose(predict_gpu_0, predict_gpu_1)
+        assert np.allclose(predict_gpu_0, predict_cpu)

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_sklearn(self):
@@ -121,30 +126,31 @@ class TestGPUPredict:
        tr_size = 2500
        X = np.random.rand(m, n)
        y = 200 * np.matmul(X, np.arange(-3, -3 + n))
+        y = y.reshape(y.size)
        X_train, y_train = X[:tr_size, :], y[:tr_size]
        X_test, y_test = X[tr_size:, :], y[tr_size:]

-        # First with cpu_predictor
-        params = {'tree_method': 'gpu_hist',
-                  'predictor': 'cpu_predictor',
-                  'n_jobs': -1,
-                  'seed': 123}
-        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
-        cpu_train_score = m.score(X_train, y_train)
-        cpu_test_score = m.score(X_test, y_test)
-
-        # Now with gpu_predictor
-        params['predictor'] = 'gpu_predictor'
-
+        params = {
+            "tree_method": "gpu_hist",
+            "gpu_id": "0",
+            "n_jobs": -1,
+            "seed": 123,
+        }
        m = xgb.XGBRegressor(**params).fit(X_train, y_train)
        gpu_train_score = m.score(X_train, y_train)
        gpu_test_score = m.score(X_test, y_test)

+        # Now with cpu
+        m = tm.set_ordinal(-1, m)
+        cpu_train_score = m.score(X_train, y_train)
+        cpu_test_score = m.score(X_test, y_test)
+
        assert np.allclose(cpu_train_score, gpu_train_score)
        assert np.allclose(cpu_test_score, gpu_test_score)

    def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
        import cupy as cp
+
        dtrain.set_info(base_margin=base_margin)
        from_inplace = booster.inplace_predict(data=X, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
@@ -152,10 +158,11 @@ class TestGPUPredict:

    def run_inplace_predict_cupy(self, device: int) -> None:
        import cupy as cp
+
        cp.cuda.runtime.setDevice(device)
        rows = 1000
        cols = 10
-        missing = 11            # set to integer for testing
+        missing = 11  # set to integer for testing

        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
@@ -168,7 +175,7 @@ class TestGPUPredict:
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train(
-            {'tree_method': 'gpu_hist', "gpu_id": device}, dtrain, num_boost_round=10
+            {"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10
        )

        test = xgb.DMatrix(X[:10, ...], missing=missing)
@@ -186,7 +193,7 @@ class TestGPUPredict:
        # Don't do this on Windows, see issue #5793
        if sys.platform.startswith("win"):
            pytest.skip(
-                'Multi-threaded in-place prediction with cuPy is not working on Windows'
+                "Multi-threaded in-place prediction with cuPy is not working on Windows"
            )
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)
@@ -205,9 +212,10 @@ class TestGPUPredict:
        )
        reg.fit(X, y)

+        reg = tm.set_ordinal(device, reg)
        gpu_predt = reg.predict(X)
-        reg.set_params(predictor="cpu_predictor")
-        cpu_predt = reg.predict(X)
+        reg = tm.set_ordinal(-1, reg)
+        cpu_predt = reg.predict(cp.asnumpy(X))
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
        cp.cuda.runtime.setDevice(0)

@@ -215,11 +223,11 @@ class TestGPUPredict:
    def test_inplace_predict_cupy(self):
        self.run_inplace_predict_cupy(0)

-    @pytest.mark.xfail
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.mgpu
    def test_inplace_predict_cupy_specified_device(self):
        import cupy as cp
+
        n_devices = cp.cuda.runtime.getDeviceCount()
        for d in range(n_devices):
            self.run_inplace_predict_cupy(d)
@@ -230,6 +238,7 @@ class TestGPUPredict:
        import cudf
        import cupy as cp
        import pandas as pd
+
        rows = 1000
        cols = 10
        rng = np.random.RandomState(1994)
@@ -241,8 +250,7 @@ class TestGPUPredict:

        dtrain = xgb.DMatrix(X, y)

-        booster = xgb.train({'tree_method': 'gpu_hist'},
-                            dtrain, num_boost_round=10)
+        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)
@@ -272,11 +280,12 @@ class TestGPUPredict:
    def test_shap(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
-        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
+        param.update({"tree_method": "gpu_hist", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
+        bst = tm.set_ordinal(0, bst)
        shap = bst.predict(test_dmat, pred_contribs=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
@@ -289,31 +298,35 @@ class TestGPUPredict:
    def test_shap_interactions(self, num_rounds, dataset, param):
        if dataset.name.endswith("-l1"):  # not supported by the exact tree method
            return
-        param.update({"predictor": "gpu_predictor", "gpu_id": 0})
+        param.update({"tree_method": "hist", "gpu_id": 0})
        param = dataset.set_params(param)
        dmat = dataset.get_dmat()
        bst = xgb.train(param, dmat, num_rounds)
        test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin)
+        bst = tm.set_ordinal(0, bst)
        shap = bst.predict(test_dmat, pred_interactions=True)
        margin = bst.predict(test_dmat, output_margin=True)
        assume(len(dataset.y) > 0)
-        assert np.allclose(np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
-                           margin,
-                           1e-3, 1e-3)
+        assert np.allclose(
+            np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)),
+            margin,
+            1e-3,
+            1e-3,
+        )

    def test_shap_categorical(self):
        X, y = tm.make_categorical(100, 20, 7, False)
        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)

-        booster.set_param({"predictor": "gpu_predictor"})
+        booster = tm.set_ordinal(0, booster)
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
            np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3
        )

-        booster.set_param({"predictor": "cpu_predictor"})
+        booster = tm.set_ordinal(-1, booster)
        shap = booster.predict(Xy, pred_contribs=True)
        margin = booster.predict(Xy, output_margin=True)
        np.testing.assert_allclose(
@@ -321,18 +334,20 @@ class TestGPUPredict:
        )

    def test_predict_leaf_basic(self):
-        gpu_leaf = run_predict_leaf('gpu_predictor')
-        cpu_leaf = run_predict_leaf('cpu_predictor')
+        gpu_leaf = run_predict_leaf(0)
+        cpu_leaf = run_predict_leaf(-1)
        np.testing.assert_equal(gpu_leaf, cpu_leaf)

    def run_predict_leaf_booster(self, param, num_rounds, dataset):
        param = dataset.set_params(param)
        m = dataset.get_dmat()
-        booster = xgb.train(param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds)
-        booster.set_param({'predictor': 'cpu_predictor'})
+        booster = xgb.train(
+            param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds
+        )
+        booster = tm.set_ordinal(-1, booster)
        cpu_leaf = booster.predict(m, pred_leaf=True)

-        booster.set_param({'predictor': 'gpu_predictor'})
+        booster = tm.set_ordinal(0, booster)
        gpu_leaf = booster.predict(m, pred_leaf=True)

        np.testing.assert_equal(cpu_leaf, gpu_leaf)
@@ -344,8 +359,8 @@ class TestGPUPredict:
        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
            return

-        param['booster'] = 'gbtree'
-        param['tree_method'] = 'gpu_hist'
+        param["booster"] = "gbtree"
+        param["tree_method"] = "gpu_hist"
        self.run_predict_leaf_booster(param, 10, dataset)

    @given(predict_parameter_strategy, tm.make_dataset_strategy())
@@ -355,42 +370,61 @@ class TestGPUPredict:
        if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
            return

-        param['booster'] = 'dart'
-        param['tree_method'] = 'gpu_hist'
+        param["booster"] = "dart"
+        param["tree_method"] = "gpu_hist"
        self.run_predict_leaf_booster(param, 10, dataset)

    @pytest.mark.skipif(**tm.no_sklearn())
    @pytest.mark.skipif(**tm.no_pandas())
-    @given(df=data_frames([column('x0', elements=strategies.integers(min_value=0, max_value=3)),
-                           column('x1', elements=strategies.integers(min_value=0, max_value=5))],
-                          index=range_indexes(min_size=20, max_size=50)))
+    @given(
+        df=data_frames(
+            [
+                column("x0", elements=strategies.integers(min_value=0, max_value=3)),
+                column("x1", elements=strategies.integers(min_value=0, max_value=5)),
+            ],
+            index=range_indexes(min_size=20, max_size=50),
+        )
+    )
    @settings(deadline=None, max_examples=20, print_blob=True)
    def test_predict_categorical_split(self, df):
        from sklearn.metrics import mean_squared_error

-        df = df.astype('category')
-        x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy()
+        df = df.astype("category")
+        x0, x1 = df["x0"].to_numpy(), df["x1"].to_numpy()
        y = (x0 * 10 - 20) + (x1 - 2)
        dtrain = xgb.DMatrix(df, label=y, enable_categorical=True)

        params = {
-            'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor',
-            'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0, 'eval_metric': 'rmse'
+            "tree_method": "gpu_hist",
+            "max_depth": 3,
+            "learning_rate": 1.0,
+            "base_score": 0.0,
+            "eval_metric": "rmse",
+            "gpu_id": "0",
        }

        eval_history = {}
-        bst = xgb.train(params, dtrain, num_boost_round=5, evals=[(dtrain, 'train')],
-                        verbose_eval=False, evals_result=eval_history)
-
+        bst = xgb.train(
+            params,
+            dtrain,
+            num_boost_round=5,
+            evals=[(dtrain, "train")],
+            verbose_eval=False,
+            evals_result=eval_history,
+        )
+        bst = tm.set_ordinal(0, bst)
        pred = bst.predict(dtrain)
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
-        np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)
+        np.testing.assert_almost_equal(
+            rmse, eval_history["train"]["rmse"][-1], decimal=5
+        )

    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        import cupy as cp
        from sklearn.datasets import make_classification
+
        n_samples = 1000
        X_, y_ = make_classification(
            n_samples=n_samples, n_informative=5, n_classes=n_classes
@@ -403,7 +437,7 @@ class TestGPUPredict:
                "tree_method": "gpu_hist",
                "booster": "dart",
                "rate_drop": 0.5,
-                "objective": "binary:logistic"
+                "objective": "binary:logistic",
            }
        else:
            params = {
@@ -411,15 +445,18 @@ class TestGPUPredict:
                "booster": "dart",
                "rate_drop": 0.5,
                "objective": "multi:softprob",
-                "num_class": n_classes
+                "num_class": n_classes,
            }

        booster = xgb.train(params, Xy, num_boost_round=32)
-        # predictor=auto
+
+        # auto (GPU)
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)
+
+        # CPU
+        booster = tm.set_ordinal(-1, booster)
        cpu_inplace = booster.inplace_predict(X_)
-        booster.set_param({"predictor": "cpu_predictor"})
        cpu_copied = booster.predict(Xy)

        copied = cp.array(copied)
@@ -427,7 +464,8 @@ class TestGPUPredict:
        cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)

-        booster.set_param({"predictor": "gpu_predictor"})
+        # GPU
+        booster = tm.set_ordinal(0, booster)
        inplace = booster.inplace_predict(X)
        copied = booster.predict(Xy)

@@ -437,12 +475,11 @@ class TestGPUPredict:
    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        import cupy as cp
+
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
-        orig = rng.randint(low=0, high=127, size=rows * cols).reshape(
-            rows, cols
-        )
+        orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols)
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)
@@ -450,19 +487,16 @@ class TestGPUPredict:
        predt_orig = booster.inplace_predict(orig)
        # all primitive types in numpy
        for dtype in [
-            cp.signedinteger,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
-            cp.unsignedinteger,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
-            cp.floating,
            cp.half,
            cp.single,
            cp.double,
@@ -472,9 +506,7 @@ class TestGPUPredict:
            cp.testing.assert_allclose(predt, predt_orig)

        # boolean
-        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(
-            rows, cols
-        )
+        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols)
        predt_orig = booster.inplace_predict(orig)
        for dtype in [cp.bool8, cp.bool_]:
            X = cp.array(orig, dtype=dtype)
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -29,7 +29,6 @@ def comp_training_with_rank_objective(
        "booster": "gbtree",
        "tree_method": "gpu_hist",
        "gpu_id": 0,
-        "predictor": "gpu_predictor",
    }

    num_trees = 100
@@ -54,7 +53,6 @@ def comp_training_with_rank_objective(
        "booster": "gbtree",
        "tree_method": "hist",
        "gpu_id": -1,
-        "predictor": "cpu_predictor",
    }
    cpu_params["objective"] = rank_objective
    cpu_params["eval_metric"] = metric_name
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -260,7 +260,6 @@ class TestGPUUpdaters:
                "seed": 66,
                "subsample": 0.5,
                "gamma": 0.2,
-                "predictor": "auto",
                "eval_metric": "auc",
            },
            num_boost_round=150,