[backport] Fix prediction with cat data in sklearn interface. (#7306) (#7312)

* Specify DMatrix parameter for pre-processing dataframe. * Add document about the behaviour of prediction.
2021-10-12 18:49:57 +08:00
parent 36e247aca4
commit 30c1b5c54c
2 changed files with 17 additions and 4 deletions
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -174,7 +174,9 @@ __model_doc = f'''
        Device ordinal.
    validate_parameters : Optional[bool]
        Give warnings for unknown parameter.
-
+    predictor : Optional[str]
+        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
+        gpu_predictor].
    enable_categorical : bool

        .. versionadded:: 1.5.0
@@ -807,7 +809,11 @@ class XGBModel(XGBModelBase):
        # Inplace predict doesn't handle as many data types as DMatrix, but it's
        # sufficient for dask interface where input is simpiler.
        predictor = self.get_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if (
+            not self.enable_categorical
+            and predictor in ("auto", None)
+            and self.booster != "gblinear"
+        ):
            return True
        return False

@@ -834,7 +840,9 @@ class XGBModel(XGBModelBase):
        iteration_range: Optional[Tuple[int, int]] = None,
    ) -> np.ndarray:
        """Predict with `X`.  If the model is trained with early stopping, then `best_iteration`
-        is used automatically.
+        is used automatically.  For tree models, when data is on GPU, like cupy array or
+        cuDF dataframe and `predictor` is not specified, the prediction is run on GPU
+        automatically, otherwise it will run on CPU.

        .. note:: This function is only thread safe for `gbtree` and `dart`.

@@ -862,6 +870,7 @@ class XGBModel(XGBModelBase):
        Returns
        -------
        prediction
+
        """
        iteration_range = _convert_ntree_limit(
            self.get_booster(), ntree_limit, iteration_range
@@ -886,7 +895,10 @@ class XGBModel(XGBModelBase):
                pass

        test = DMatrix(
-            X, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs
+            X, base_margin=base_margin,
+            missing=self.missing,
+            nthread=self.n_jobs,
+            enable_categorical=self.enable_categorical
        )
        return self.get_booster().predict(
            data=test,
--- a/tests/python-gpu/test_gpu_with_sklearn.py
+++ b/tests/python-gpu/test_gpu_with_sklearn.py
@@ -59,6 +59,7 @@ def test_categorical():
    )
    X = pd.DataFrame(X.todense()).astype("category")
    clf.fit(X, y)
+    assert not clf._can_use_inplace_predict()

    with tempfile.TemporaryDirectory() as tempdir:
        model = os.path.join(tempdir, "categorial.json")