[pyspark] Fixing xgboost.spark python doc (#8200)

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
This commit is contained in:
WeichenXu 2022-08-25 14:41:48 +08:00 committed by GitHub
parent d03794ce7a
commit 651f0a8889
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -141,7 +141,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction
SparkXGBClassifier doesn't support the `validate_features` and `output_margin` params.
SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread`
SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread`
param for each xgboost worker will be set equal to `spark.task.cpus` config value.
@ -243,15 +243,15 @@ _set_pyspark_xgb_cls_param_attrs(SparkXGBClassifier, SparkXGBClassifierModel)
class SparkXGBRanker(_SparkXGBEstimator):
"""SparkXGBRanker is a PySpark ML estimator. It implements the XGBoost
classification algorithm based on XGBoost python library, and it can be used in
ranking algorithm based on XGBoost python library, and it can be used in
PySpark Pipeline and PySpark ML meta algorithms like
:py:class:`~pyspark.ml.tuning.CrossValidator`/
:py:class:`~pyspark.ml.tuning.TrainValidationSplit`/
:py:class:`~pyspark.ml.classification.OneVsRest`
SparkXGBRanker automatically supports most of the parameters in
`xgboost.XGBClassifier` constructor and most of the parameters used in
:py:class:`xgboost.XGBClassifier` fit and predict method.
`xgboost.XGBRanker` constructor and most of the parameters used in
:py:class:`xgboost.XGBRanker` fit and predict method.
SparkXGBRanker doesn't support setting `gpu_id` but supports another param `use_gpu`,
see doc below for more details.
@ -273,21 +273,21 @@ class SparkXGBRanker(_SparkXGBEstimator):
callbacks:
The export and import of the callback functions are done on a best-effort basis. For
details, see :py:attr:`xgboost.spark.SparkXGBClassifier.callbacks` param doc.
details, see :py:attr:`xgboost.spark.SparkXGBRanker.callbacks` param doc.
validation_indicator_col:
For params related to `xgboost.XGBClassifier` training with
For params related to `xgboost.XGBRanker` training with
evaluation dataset's supervision,
set :py:attr:`xgboost.spark.SparkXGBClassifier.validation_indicator_col`
parameter instead of setting the `eval_set` parameter in `xgboost.XGBClassifier`
set :py:attr:`xgboost.spark.SparkXGBRanker.validation_indicator_col`
parameter instead of setting the `eval_set` parameter in `xgboost.XGBRanker`
fit method.
weight_col:
To specify the weight of the training and validation dataset, set
:py:attr:`xgboost.spark.SparkXGBClassifier.weight_col` parameter instead of setting
`sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBClassifier`
:py:attr:`xgboost.spark.SparkXGBRanker.weight_col` parameter instead of setting
`sample_weight` and `sample_weight_eval_set` parameter in `xgboost.XGBRanker`
fit method.
xgb_model:
Set the value to be the instance returned by
:func:`xgboost.spark.SparkXGBClassifierModel.get_booster`.
:func:`xgboost.spark.SparkXGBRankerModel.get_booster`.
num_workers:
Integer that specifies the number of XGBoost workers to use.
Each XGBoost worker corresponds to one spark task.
@ -313,23 +313,33 @@ class SparkXGBRanker(_SparkXGBEstimator):
Examples
--------
>>> from xgboost.spark import SparkXGBClassifier
>>> from xgboost.spark import SparkXGBRanker
>>> from pyspark.ml.linalg import Vectors
>>> df_train = spark.createDataFrame([
... (Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
... (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
... (Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
... (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
... ], ["features", "label", "isVal", "weight"])
>>> df_test = spark.createDataFrame([
... (Vectors.dense(1.0, 2.0, 3.0), ),
... ], ["features"])
>>> xgb_classifier = SparkXGBClassifier(max_depth=5, missing=0.0,
... validation_indicator_col='isVal', weight_col='weight',
... early_stopping_rounds=1, eval_metric='logloss')
>>> xgb_clf_model = xgb_classifier.fit(df_train)
>>> xgb_clf_model.transform(df_test).show()
>>> ranker = SparkXGBRanker(qid_col="qid")
>>> df_train = spark.createDataFrame(
... [
... (Vectors.dense(1.0, 2.0, 3.0), 0, 0),
... (Vectors.dense(4.0, 5.0, 6.0), 1, 0),
... (Vectors.dense(9.0, 4.0, 8.0), 2, 0),
... (Vectors.sparse(3, {1: 1.0, 2: 5.5}), 0, 1),
... (Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, 1),
... (Vectors.sparse(3, {1: 8.0, 2: 9.5}), 2, 1),
... ],
... ["features", "label", "qid"],
... )
>>> df_test = spark.createDataFrame(
... [
... (Vectors.dense(1.5, 2.0, 3.0), 0),
... (Vectors.dense(4.5, 5.0, 6.0), 0),
... (Vectors.dense(9.0, 4.5, 8.0), 0),
... (Vectors.sparse(3, {1: 1.0, 2: 6.0}), 1),
... (Vectors.sparse(3, {1: 6.0, 2: 7.0}), 1),
... (Vectors.sparse(3, {1: 8.0, 2: 10.5}), 1),
... ],
... ["features", "qid"],
... )
>>> model = ranker.fit(df_train)
>>> model.transform(df_test).show()
"""
def __init__(self, **kwargs):