Make QuantileDMatrix default to sklearn estimators. (#8220)

2022-09-13 13:52:19 +08:00
parent a2686543a9
commit bdf265076d
6 changed files with 91 additions and 39 deletions
--- a/python-package/xgboost/dask.py
+++ b/python-package/xgboost/dask.py
@@ -726,10 +726,9 @@ def _create_quantile_dmatrix(
    if parts is None:
        msg = f"worker {worker.address} has an empty DMatrix."
        LOGGER.warning(msg)
-        import cupy

        d = QuantileDMatrix(
-            cupy.zeros((0, 0)),
+            numpy.empty((0, 0)),
            feature_names=feature_names,
            feature_types=feature_types,
            max_bin=max_bin,
@@ -1544,15 +1543,21 @@ def inplace_predict(  # pylint: disable=unused-argument


 async def _async_wrap_evaluation_matrices(
-    client: Optional["distributed.Client"], **kwargs: Any
+    client: Optional["distributed.Client"],
+    tree_method: Optional[str],
+    max_bin: Optional[int],
+    **kwargs: Any,
 ) -> Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]]:
    """A switch function for async environment."""

-    def _inner(**kwargs: Any) -> DaskDMatrix:
-        m = DaskDMatrix(client=client, **kwargs)
-        return m
+    def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix:
+        if tree_method in ("hist", "gpu_hist"):
+            return DaskQuantileDMatrix(
+                client=client, ref=ref, max_bin=max_bin, **kwargs
+            )
+        return DaskDMatrix(client=client, **kwargs)

-    train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_inner, **kwargs)
+    train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_dispatch, **kwargs)
    train_dmatrix = await train_dmatrix
    if evals is None:
        return train_dmatrix, evals
@@ -1756,6 +1761,8 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
        params = self.get_xgb_params()
        dtrain, evals = await _async_wrap_evaluation_matrices(
            client=self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
            X=X,
            y=y,
            group=None,
@@ -1851,6 +1858,8 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
        params = self.get_xgb_params()
        dtrain, evals = await _async_wrap_evaluation_matrices(
            self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
            X=X,
            y=y,
            group=None,
@@ -2057,6 +2066,8 @@ class DaskXGBRanker(DaskScikitLearnBase, XGBRankerMixIn):
        params = self.get_xgb_params()
        dtrain, evals = await _async_wrap_evaluation_matrices(
            self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
            X=X,
            y=y,
            group=None,
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -38,6 +38,7 @@ from .core import (
    Booster,
    DMatrix,
    Metric,
+    QuantileDMatrix,
    XGBoostError,
    _convert_ntree_limit,
    _deprecate_positional_args,
@@ -430,7 +431,8 @@ def _wrap_evaluation_matrices(
    enable_categorical: bool,
    feature_types: Optional[FeatureTypes],
 ) -> Tuple[Any, List[Tuple[Any, str]]]:
-    """Convert array_like evaluation matrices into DMatrix.  Perform validation on the way."""
+    """Convert array_like evaluation matrices into DMatrix.  Perform validation on the
+    way."""
    train_dmatrix = create_dmatrix(
        data=X,
        label=y,
@@ -442,6 +444,7 @@ def _wrap_evaluation_matrices(
        missing=missing,
        enable_categorical=enable_categorical,
        feature_types=feature_types,
+        ref=None,
    )

    n_validation = 0 if eval_set is None else len(eval_set)
@@ -491,6 +494,7 @@ def _wrap_evaluation_matrices(
                    missing=missing,
                    enable_categorical=enable_categorical,
                    feature_types=feature_types,
+                    ref=train_dmatrix,
                )
                evals.append(m)
        nevals = len(evals)
@@ -904,6 +908,17 @@ class XGBModel(XGBModelBase):

        return model, metric, params, early_stopping_rounds, callbacks

+    def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
+        # Use `QuantileDMatrix` to save memory.
+        if self.tree_method in ("hist", "gpu_hist"):
+            try:
+                return QuantileDMatrix(
+                    **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
+                )
+            except TypeError:  # `QuantileDMatrix` supports lesser types than DMatrix
+                pass
+        return DMatrix(**kwargs, nthread=self.n_jobs)
+
    def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None:
        if evals_result:
            self.evals_result_ = cast(Dict[str, Dict[str, List[float]]], evals_result)
@@ -996,7 +1011,7 @@ class XGBModel(XGBModelBase):
                base_margin_eval_set=base_margin_eval_set,
                eval_group=None,
                eval_qid=None,
-                create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+                create_dmatrix=self._create_dmatrix,
                enable_categorical=self.enable_categorical,
                feature_types=self.feature_types,
            )
@@ -1479,7 +1494,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
                base_margin_eval_set=base_margin_eval_set,
                eval_group=None,
                eval_qid=None,
-                create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+                create_dmatrix=self._create_dmatrix,
                enable_categorical=self.enable_categorical,
                feature_types=self.feature_types,
            )
@@ -1930,7 +1945,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
                base_margin_eval_set=base_margin_eval_set,
                eval_group=eval_group,
                eval_qid=eval_qid,
-                create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+                create_dmatrix=self._create_dmatrix,
                enable_categorical=self.enable_categorical,
                feature_types=self.feature_types,
            )