Support optimal partitioning for GPU hist. (#7652)

* Implement `MaxCategory` in quantile. * Implement partition-based split for GPU evaluation. Currently, it's based on the existing evaluation function. * Extract an evaluator from GPU Hist to store the needed states. * Add some CUDA stream/event utilities. * Update the documentation with references. * Fix a bug in the approx evaluator that occurs when the number of data points is less than the number of categories.
2022-02-15 03:03:12 +08:00
parent 2369d55e9a
commit 0d0abe1845
26 changed files with 1088 additions and 528 deletions
--- a/demo/guide-python/cat_in_the_dat.py
+++ b/demo/guide-python/cat_in_the_dat.py
@@ -61,7 +61,12 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
    return X, y


-params = {"tree_method": "gpu_hist", "use_label_encoder": False, "n_estimators": 32}
+params = {
+    "tree_method": "gpu_hist",
+    "use_label_encoder": False,
+    "n_estimators": 32,
+    "colsample_bylevel": 0.7,
+}


 def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
@@ -70,13 +75,13 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
        X, y, random_state=1994, test_size=0.2
    )
    # Specify `enable_categorical`.
-    clf = xgb.XGBClassifier(**params, enable_categorical=True)
-    clf.fit(
-        X_train,
-        y_train,
-        eval_set=[(X_test, y_test), (X_train, y_train)],
+    clf = xgb.XGBClassifier(
+        **params,
        eval_metric="auc",
+        enable_categorical=True,
+        max_cat_to_onehot=1,    # We use optimal partitioning exclusively
    )
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)])
    clf.save_model(os.path.join(output_dir, "categorical.json"))

    y_score = clf.predict_proba(X_test)[:, 1]  # proba of positive samples
--- a/demo/guide-python/categorical.py
+++ b/demo/guide-python/categorical.py
@@ -3,15 +3,15 @@ Getting started with categorical data
 =====================================

 Experimental support for categorical data.  After 1.5 XGBoost `gpu_hist` tree method has
-experimental support for one-hot encoding based tree split, and in 1.6 `approx` supported
+experimental support for one-hot encoding based tree split, and in 1.6 `approx` support
 was added.

 In before, users need to run an encoder themselves before passing the data into XGBoost,
-which creates a sparse matrix and potentially increase memory usage.  This demo showcases
-the experimental categorical data support, more advanced features are planned.
-
-Also, see :doc:`the tutorial </tutorials/categorical>` for using XGBoost with categorical data
+which creates a sparse matrix and potentially increases memory usage.  This demo
+showcases the experimental categorical data support; more advanced features are planned.

+Also, see :doc:`the tutorial </tutorials/categorical>` for using XGBoost with
+categorical data.

    .. versionadded:: 1.5.0

@@ -55,8 +55,11 @@ def main() -> None:
    # For scikit-learn interface, the input data must be pandas DataFrame or cudf
    # DataFrame with categorical features
    X, y = make_categorical(100, 10, 4, False)
-    # Specify `enable_categorical` to True.
-    reg = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)
+    # Set `enable_categorical` to True. We also use one-hot encoding based split
+    # here for demonstration; see the documentation of `max_cat_to_onehot` for details.
+    reg = xgb.XGBRegressor(
+        tree_method="gpu_hist", enable_categorical=True, max_cat_to_onehot=5
+    )
    reg.fit(X, y, eval_set=[(X, y)])

    # Pass in already encoded data