Rewrite approx (#7214)

This PR rewrites the `approx` tree method on top of the `hist` codebase, for better performance and code sharing.

The rewrite has many benefits:
- Support for both `max_leaves` and `max_depth`.
- Support for `grow_policy`.
- Support for monotonic constraints.
- Support for feature weights.
- Easier bin configuration via `max_bin`.
- Support for categorical data.
- Faster training on most datasets, often by a large factor.
- Support for the prediction cache.
- Significantly better performance for external memory.
- A unified code base between `approx` and `hist`.
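
As a rough illustration (this snippet is not part of the commit itself), the rewritten `approx` method accepts the same configuration surface as `hist`. A minimal sketch with synthetic data:

```python
# Minimal sketch of the options the rewritten `approx` method supports;
# the data here is synthetic and only for illustration.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X, y = rng.random((1000, 4)), rng.random(1000)
dtrain = xgb.DMatrix(X, label=y)

params = {
    "tree_method": "approx",
    "max_bin": 64,                          # easier bin configuration
    "grow_policy": "lossguide",             # grow_policy support
    "max_leaves": 31,                       # max_leaves alongside max_depth
    "monotone_constraints": "(1,0,-1,0)",   # monotonic constraints
}
booster = xgb.train(params, dtrain, num_boost_round=10)
```
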
Author: Jiaming Yuan (committed via GitHub)
Date: 2022-01-10 21:15:05 +08:00
Commit: 001503186c (parent: ed95e77752)
22 changed files with 635 additions and 264 deletions


python-package/xgboost/sklearn.py:

```diff
@@ -267,6 +267,16 @@ __model_doc = f'''
             callbacks = [xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                                     save_best=True)]
+    max_cat_to_onehot : Optional[int]
+
+        .. versionadded:: 1.6.0
+
+        A threshold for deciding whether XGBoost should use one-hot encoding based
+        splits for categorical data.  When the number of categories is smaller than
+        the threshold, one-hot encoding is chosen; otherwise the categories are
+        partitioned into children nodes.  Only relevant for regression, binary
+        classification, and the `approx` tree method.
+
     kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters
         can be found :doc:`here </parameter>`.
```
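
To make the new parameter concrete, here is a hedged sketch of how `max_cat_to_onehot` would be used through the sklearn wrapper; the DataFrame is illustrative and categorical input requires a pandas categorical dtype plus `enable_categorical=True`:

```python
# Illustrative only: 3 categories is below the threshold of 4, so one-hot
# based splits are chosen; with more categories than the threshold, the
# categories would be partitioned into the children nodes instead.
import pandas as pd
import xgboost as xgb

X = pd.DataFrame({"c": pd.Categorical(["a", "b", "c"] * 100)})
y = [0.0, 1.0, 0.5] * 100

reg = xgb.XGBRegressor(
    tree_method="approx",
    enable_categorical=True,
    max_cat_to_onehot=4,
)
reg.fit(X, y)
```
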
```diff
@@ -483,6 +493,7 @@ class XGBModel(XGBModelBase):
         eval_metric: Optional[Union[str, List[str], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
+        max_cat_to_onehot: Optional[int] = None,
         **kwargs: Any
     ) -> None:
         if not SKLEARN_INSTALLED:
```

```diff
@@ -522,6 +533,7 @@ class XGBModel(XGBModelBase):
         self.eval_metric = eval_metric
         self.early_stopping_rounds = early_stopping_rounds
         self.callbacks = callbacks
+        self.max_cat_to_onehot = max_cat_to_onehot
         if kwargs:
             self.kwargs = kwargs
```
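
Both hunks above follow the scikit-learn estimator convention: every constructor argument is stored verbatim as an attribute of the same name, so that `get_params`/`set_params` (and therefore cloning and grid search) can round-trip it. A toy sketch of the pattern, with an illustrative class name that is not part of XGBoost:

```python
# Toy sketch of the scikit-learn parameter convention; names are illustrative.
class EstimatorSketch:
    def __init__(self, max_cat_to_onehot=None):
        # Store the argument unmodified; sklearn's clone()/get_params() rely
        # on attribute names matching constructor argument names exactly.
        self.max_cat_to_onehot = max_cat_to_onehot

    def get_params(self, deep=True):
        return {"max_cat_to_onehot": self.max_cat_to_onehot}
```
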
```diff
@@ -800,8 +812,8 @@ class XGBModel(XGBModelBase):
             _duplicated("callbacks")
         callbacks = self.callbacks if self.callbacks is not None else callbacks
         # lastly check categorical data support.
-        if self.enable_categorical and params.get("tree_method", None) != "gpu_hist":
+        tree_method = params.get("tree_method", None)
+        if self.enable_categorical and tree_method not in ("gpu_hist", "approx"):
             raise ValueError(
                 "Experimental support for categorical data is not implemented for"
                 " current tree method yet."
```
```diff
@@ -876,8 +888,7 @@ class XGBModel(XGBModelBase):
         feature_weights :
             Weight for each feature, defines the probability of each feature being
             selected when colsample is being used.  All values must be greater than 0,
-            otherwise a `ValueError` is thrown.  Only available for `hist`, `gpu_hist` and
-            `exact` tree methods.
+            otherwise a `ValueError` is thrown.
         callbacks :
             .. deprecated: 1.6.0
```
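
The removed sentence previously limited `feature_weights` to the `hist`, `gpu_hist`, and `exact` tree methods; with the rewrite the restriction no longer applies. A hedged sketch of the parameter through the sklearn `fit` path, with synthetic data (the weights only matter while column sampling is active):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X, y = rng.random((500, 3)), rng.integers(0, 2, 500)

clf = xgb.XGBClassifier(tree_method="approx", colsample_bynode=0.5)
# Feature 2 is five times more likely to be drawn than features 0 and 1
# whenever columns are sampled; all weights must be greater than 0.
clf.fit(X, y, feature_weights=np.array([1.0, 1.0, 5.0]))
```

The identical docstring change is applied to `XGBRanker` in the next hunk.
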
```diff
@@ -1750,8 +1761,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
         feature_weights :
             Weight for each feature, defines the probability of each feature being
             selected when colsample is being used.  All values must be greater than 0,
-            otherwise a `ValueError` is thrown.  Only available for `hist`, `gpu_hist` and
-            `exact` tree methods.
+            otherwise a `ValueError` is thrown.
         callbacks :
             .. deprecated: 1.6.0
```