[pyspark] Fixing xgboost.spark python doc (#8200)

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
Authored by WeichenXu on 2022-08-25 14:41:48 +08:00; committed by GitHub
parent d03794ce7a
commit 651f0a8889


@@ -141,7 +141,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
     SparkXGBClassifier doesn't support `validate_features` and `output_margin` param.
-    SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
+    SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
     param for each xgboost worker will be set equal to `spark.task.cpus` config value.
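
Note on the wording fixed above: in xgboost.spark the per-worker `nthread` is taken from the Spark `spark.task.cpus` setting rather than from an estimator parameter. A minimal sketch of that configuration, where the 4-CPU-per-task value and the estimator arguments are illustrative assumptions, not part of this commit:

    from pyspark.sql import SparkSession
    from xgboost.spark import SparkXGBClassifier

    # Assumed session config: each Spark task gets 4 CPUs, so each XGBoost
    # worker ends up training with nthread=4, as the docstring describes.
    spark = (
        SparkSession.builder
        .config("spark.task.cpus", "4")
        .getOrCreate()
    )

    # nthread is not passed to the estimator; it follows spark.task.cpus.
    classifier = SparkXGBClassifier(num_workers=2, max_depth=5)
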
@@ -243,15 +243,15 @@ _set_pyspark_xgb_cls_param_attrs(SparkXGBClassifier, SparkXGBClassifierModel)
 class SparkXGBRanker(_SparkXGBEstimator):
     """SparkXGBRanker is a PySpark ML estimator. It implements the XGBoost
-    classification algorithm based on XGBoost python library, and it can be used in
+    ranking algorithm based on XGBoost python library, and it can be used in
     PySpark Pipeline and PySpark ML meta algorithms like
     :py:class:`~pyspark.ml.tuning.CrossValidator`/
     :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
     :py:class:`~pyspark.ml.classification.OneVsRest`

     SparkXGBRanker automatically supports most of the parameters in
-    `xgboost.XGBClassifier` constructor and most of the parameters used in
-    :py:class:`xgboost.XGBClassifier` fit and predict method.
+    `xgboost.XGBRanker` constructor and most of the parameters used in
+    :py:class:`xgboost.XGBRanker` fit and predict method.

     SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`,
     see doc below for more details.
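
Since the corrected summary says the estimator plugs into PySpark Pipelines and ML meta algorithms, here is a hedged sketch of that kind of usage; the `VectorAssembler` input columns and the `train_df` DataFrame are assumptions made for illustration, not part of this commit:

    from pyspark.ml import Pipeline
    from pyspark.ml.feature import VectorAssembler
    from xgboost.spark import SparkXGBRanker

    # Assumed raw feature columns "f1" and "f2" assembled into the vector
    # column the ranker reads; qid_col groups rows into query groups.
    assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
    ranker = SparkXGBRanker(features_col="features", label_col="label", qid_col="qid")
    pipeline = Pipeline(stages=[assembler, ranker])
    # model = pipeline.fit(train_df)  # train_df assumed to have f1, f2, label, qid columns
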
@@ -273,21 +273,21 @@ class SparkXGBRanker(_SparkXGBEstimator):
     callbacks:
         The export and import of the callback functions are at best effort. For
-        details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc.
+        details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc.
     validation_indicator_col:
-        For params related to `xgboost.XGBClassifier` training with
+        For params related to `xgboost.XGBRanker` training with
         evaluation dataset's supervision,
-        set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
-        parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
+        set :py:attr:`xgboost.spark.XGBRanker.validation_indicator_col`
+        parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker`
         fit method.
     weight_col:
         To specify the weight of the training and validation dataset, set
-        :py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
-        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
+        :py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
+        `sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker`
         fit method.
     xgb_model:
         Set the value to be the instance returned by
-        :func:`xgboost.spark.SparkXGBClassifierModel.get_booster`.
+        :func:`xgboost.spark.SparkXGBRankerModel.get_booster`.
     num_workers:
         Integer that specifies the number of XGBoost workers to use.
         Each XGBoost worker corresponds to one spark task.
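
The parameters whose cross-references are corrected above are plain constructor arguments on the estimator; a minimal sketch of passing them, where the column names `isVal` and `weight` and the worker count are illustrative assumptions rather than values from this docstring:

    from xgboost.spark import SparkXGBRanker

    # Validation rows are flagged by a boolean column instead of eval_set,
    # and per-row weights replace sample_weight / sample_weight_eval_set.
    ranker = SparkXGBRanker(
        qid_col="qid",
        validation_indicator_col="isVal",  # assumed boolean column in the training DataFrame
        weight_col="weight",               # assumed float column in the training DataFrame
        num_workers=2,                     # one Spark task per XGBoost worker
    )
    # For continued training, xgb_model can be set to the booster returned by
    # a previously fitted model's get_booster(), as the docstring notes.
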
@@ -313,23 +313,33 @@ class SparkXGBRanker(_SparkXGBEstimator):
     Examples
     --------
-    >>> from xgboost.spark import SparkXGBClassifier
+    >>> from xgboost.spark import SparkXGBRanker
     >>> from pyspark.ml.linalg import Vectors
-    >>> df_train = spark.createDataFrame([
-    ...     (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
-    ...     (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
-    ...     (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
-    ...     (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
-    ... ], ["features", "label", "isVal", "weight"])
-    >>> df_test = spark.createDataFrame([
-    ...     (Vectors.dense(1.0, 2.0, 3.0), ),
-    ... ], ["features"])
-    >>> xgb_classifier = SparkXGBClassifier(max_depth=5, missing=0.0,
-    ...     validation_indicator_col='isVal', weight_col='weight',
-    ...     early_stopping_rounds=1, eval_metric='logloss')
-    >>> xgb_clf_model = xgb_classifier.fit(df_train)
-    >>> xgb_clf_model.transform(df_test).show()
+    >>> ranker = SparkXGBRanker(qid_col="qid")
+    >>> df_train = spark.createDataFrame(
+    ...     [
+    ...         (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
+    ...         (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
+    ...         (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
+    ...         (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
+    ...         (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
+    ...         (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
+    ...     ],
+    ...     ["features", "label", "qid"],
+    ... )
+    >>> df_test = spark.createDataFrame(
+    ...     [
+    ...         (Vectors.dense(1.5, 2.0, 3.0), 0),
+    ...         (Vectors.dense(4.5, 5.0, 6.0), 0),
+    ...         (Vectors.dense(9.0, 4.5, 8.0), 0),
+    ...         (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1),
+    ...         (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1),
+    ...         (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1),
+    ...     ],
+    ...     ["features", "qid"],
+    ... )
+    >>> model = ranker.fit(df_train)
+    >>> model.transform(df_test).show()
     """

     def __init__(self, **kwargs):