[pyspark] Fixing xgboost.spark python doc (#8200)

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
2022-08-25 14:41:48 +08:00 · 2022-08-25 14:41:48 +08:00 · 651f0a8889
commit 651f0a8889
parent d03794ce7a
1 changed files with 37 additions and 27 deletions
--- a/python-package/xgboost/spark/estimator.py
+++ b/python-package/xgboost/spark/estimator.py
@ -141,7 +141,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction

    SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.

-    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
+    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
    param for each xgboost worker will be set equal to `spark.task.cpus` config value.


@ -243,15 +243,15 @@ _set_pyspark_xgb_cls_param_attrs(SparkXGBClassifier, SparkXGBClassifierModel)

 class SparkXGBRanker(_SparkXGBEstimator):
    """SparkXGBRanker is a PySpark ML estimator. It implements the XGBoost
-    classification algorithm based on XGBoost python library, and it can be used in
+    ranking algorithm based on XGBoost python library, and it can be used in
    PySpark Pipeline and PySpark ML meta algorithms like
    :py:class:`~pyspark.ml.tuning.CrossValidator`/
    :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
    :py:class:`~pyspark.ml.classification.OneVsRest`

    SparkXGBRanker automatically supports most of the parameters in
-    `xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBClassifier` fit and predict method.
+    `xgboost.XGBRanker` constructor and most of the parameters used in
+    :py:class:`xgboost.XGBRanker` fit and predict method.

    SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
    see doc below for more details.
@ -273,21 +273,21 @@ class SparkXGBRanker(_SparkXGBEstimator):

    callbacks:
        The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc.
+        details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc.
    validation_indicator_col:
-        For params related to `xgboost.XGBClassifier` training with
+        For params related to `xgboost.XGBRanker` training with
        evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
+        set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker`
        fit method.
    weight_col:
        To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
+        :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker`
        fit method.
    xgb_model:
        Set the value to be the instance returned by
-        :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`.
+        :func:`xgboost.spark.SparkXGBRankerModel.get_booster`.
    num_workers:
        Integer that specifies the number of XGBoost workers to use.
        Each XGBoost worker corresponds to one spark task.
@ -313,23 +313,33 @@ class SparkXGBRanker(_SparkXGBEstimator):
    Examples
    --------

-    >>> from xgboost.spark import SparkXGBClassifier
+    >>> from xgboost.spark import SparkXGBRanker
    >>> from pyspark.ml.linalg import Vectors
-    >>> df_train = spark.createDataFrame([
-    ...     (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
-    ...     (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
-    ...     (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
-    ...     (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
-    ... ], ["features", "label", "isVal", "weight"])
-    >>> df_test = spark.createDataFrame([
-    ...     (Vectors.dense(1.0, 2.0, 3.0), ),
-    ... ], ["features"])
-    >>> xgb_classifier = SparkXGBClassifier(max_depth=5, missing=0.0,
-    ...     validation_indicator_col='isVal', weight_col='weight',
-    ...     early_stopping_rounds=1, eval_metric='logloss')
-    >>> xgb_clf_model = xgb_classifier.fit(df_train)
-    >>> xgb_clf_model.transform(df_test).show()
-
+    >>> ranker = SparkXGBRanker(qid_col="qid")
+    >>> df_train = spark.createDataFrame(
+    ...     [
+    ...         (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
+    ...         (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
+    ...         (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
+    ...         (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
+    ...         (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
+    ...         (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
+    ...     ],
+    ...     ["features", "label", "qid"],
+    ... )
+    >>> df_test = spark.createDataFrame(
+    ...     [
+    ...         (Vectors.dense(1.5, 2.0, 3.0), 0),
+    ...         (Vectors.dense(4.5, 5.0, 6.0), 0),
+    ...         (Vectors.dense(9.0, 4.5, 8.0), 0),
+    ...         (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1),
+    ...         (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1),
+    ...         (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1),
+    ...     ],
+    ...     ["features", "qid"],
+    ... )
+    >>> model = ranker.fit(df_train)
+    >>> model.transform(df_test).show()
    """

    def __init__(self, **kwargs):