Update Python API doc (#3619)

* Add XGBRanker to Python API doc

* Show inherited members of XGBRegressor in API doc, since XGBRegressor uses default methods from XGBModel

* Add table of contents to Python API doc

* Skip JVM doc download if not available

* Show inherited members for XGBRegressor and XGBRanker

* Expose XGBRanker to Python XGBoost module directory

* Add docstring to XGBRegressor.predict() and XGBRanker.predict()

* Fix rendering errors in Python docstrings

* Fix lint
Philip Hyunsu Cho 2018-08-22 18:59:30 -07:00 committed by GitHub
parent 4912c1f9c6
commit 4ed8a88240
8 changed files with 168 additions and 74 deletions

View File

@@ -14,6 +14,7 @@
from subprocess import call
from sh.contrib import git
import urllib.request
from urllib.error import HTTPError
from recommonmark.parser import CommonMarkParser
import sys
import re
@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
git_branch = [x for x in git_branch if 'HEAD' not in x]
print('git_branch = {}'.format(git_branch[0]))
try:
    filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
    call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
except HTTPError:
    print('JVM doc not found. Skipping...')

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
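The hunk above wraps the doc-tarball download in a try/except so a missing branch archive no longer aborts the build. A minimal standalone sketch of the same pattern (the URL below is a placeholder, not the real docs bucket):

    import urllib.request
    from urllib.error import HTTPError

    try:
        # urlretrieve raises HTTPError for 4xx/5xx responses
        filename, _ = urllib.request.urlretrieve('https://example.com/docs/some-branch.tar.bz2')
        print('downloaded to {}'.format(filename))
    except HTTPError:
        print('JVM doc not found. Skipping...')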

View File

@@ -274,7 +274,7 @@ and then loading the model in another session:
With regards to ML pipeline save and load, please refer the next section.

Interact with Other Bindings of XGBoost
---------------------------------------
After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by:

.. code-block:: scala

View File

@@ -2,6 +2,10 @@ Python API Reference
====================
This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.

.. contents::
  :backlinks: none
  :local:

Core Data Structure
-------------------
.. automodule:: xgboost.core
@@ -29,9 +33,15 @@ Scikit-Learn API
.. automodule:: xgboost.sklearn
.. autoclass:: xgboost.XGBRegressor
    :members:
    :inherited-members:
    :show-inheritance:
.. autoclass:: xgboost.XGBClassifier
    :members:
    :inherited-members:
    :show-inheritance:
.. autoclass:: xgboost.XGBRanker
    :members:
    :inherited-members:
    :show-inheritance:

Plotting API

View File

@@ -12,7 +12,7 @@ from .core import DMatrix, Booster
from .training import train, cv
from . import rabit # noqa
try:
    from .sklearn import XGBModel, XGBClassifier, XGBRegressor, XGBRanker
    from .plotting import plot_importance, plot_tree, to_graphviz
except ImportError:
    pass
@@ -23,5 +23,5 @@ with open(VERSION_FILE) as f:
__all__ = ['DMatrix', 'Booster',
           'train', 'cv',
           'XGBModel', 'XGBClassifier', 'XGBRegressor', 'XGBRanker',
           'plot_importance', 'plot_tree', 'to_graphviz']
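With XGBRanker exported here, it becomes importable at package level. A quick sketch (assumes the xgboost Python package from this commit is installed):

    from xgboost import XGBRanker

    ranker = XGBRanker(n_estimators=50)  # accepts the usual XGBModel keyword arguments
    print(type(ranker).__name__)         # XGBRanker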

View File

@@ -1376,11 +1376,12 @@ class Booster(object):
    def get_score(self, fmap='', importance_type='weight'):
        """Get feature importance of each feature.
        Importance type can be defined as:

        * 'weight': the number of times a feature is used to split the data across all trees.
        * 'gain': the average gain across all splits the feature is used in.
        * 'cover': the average coverage across all splits the feature is used in.
        * 'total_gain': the total gain across all splits the feature is used in.
        * 'total_cover': the total coverage across all splits the feature is used in.

        Parameters
        ----------
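For reference, a short sketch exercising each documented importance type on a trained Booster (synthetic data; names are illustrative):

    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(100, 5), label=np.random.randint(2, size=100))
    bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)
    for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
        print(imp_type, bst.get_score(importance_type=imp_type))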
@@ -1496,6 +1497,7 @@ class Booster(object):
    def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
        """Get split value histogram of a feature

        Parameters
        ----------
        feature: str
@@ -1506,7 +1508,7 @@ class Booster(object):
            The maximum number of bins.
            Number of bins equals number of unique split values n_unique,
            if bins == None or bins > n_unique.
        as_pandas: bool, default True
            Return pd.DataFrame when pandas is installed.
            If False or pandas is not installed, return numpy ndarray.

View File

@ -28,9 +28,10 @@ def plot_importance(booster, ax=None, height=0.2,
grid : bool, Turn the axes grids on or off. Default is True (On). grid : bool, Turn the axes grids on or off. Default is True (On).
importance_type : str, default "weight" importance_type : str, default "weight"
How the importance is calculated: either "weight", "gain", or "cover" How the importance is calculated: either "weight", "gain", or "cover"
"weight" is the number of times a feature appears in a tree
"gain" is the average gain of splits which use the feature * "weight" is the number of times a feature appears in a tree
"cover" is the average coverage of splits which use the feature * "gain" is the average gain of splits which use the feature
* "cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split where coverage is defined as the number of samples affected by the split
max_num_features : int, default None max_num_features : int, default None
Maximum number of top features displayed on plot. If None, all features will be displayed. Maximum number of top features displayed on plot. If None, all features will be displayed.
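A sketch of the reworked importance_type parameter in use (requires matplotlib; data is synthetic):

    import numpy as np
    import xgboost as xgb
    import matplotlib.pyplot as plt

    dtrain = xgb.DMatrix(np.random.rand(100, 5), label=np.random.randint(2, size=100))
    bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)
    xgb.plot_importance(bst, importance_type='gain', max_num_features=10)
    plt.show()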

View File

@@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
    missing : float, optional
        Value in the data which needs to be present as a missing value. If
        None, defaults to np.nan.
    \*\*kwargs : dict, optional
        Keyword arguments for XGBoost Booster object. Full documentation of parameters can
        be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
        Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
        will result in a TypeError.

        .. note:: \*\*kwargs unsupported by scikit-learn

            \*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
            passed via this argument will interact properly with scikit-learn.

    Note
    ----
@@ -217,6 +219,7 @@ class XGBModel(XGBModelBase):
    def save_model(self, fname):
        """
        Save the model to a file.

        Parameters
        ----------
        fname : string
@@ -227,6 +230,7 @@ class XGBModel(XGBModelBase):
    def load_model(self, fname):
        """
        Load the model from a file.

        Parameters
        ----------
        fname : string or a memory buffer
@@ -336,6 +340,39 @@ class XGBModel(XGBModelBase):
        return self

    def predict(self, data, output_margin=False, ntree_limit=None):
        """
        Predict with `data`.

        .. note:: This function is not thread safe.

            For each booster object, predict can only be called from one thread.
            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
            of model object and then call ``predict()``.

        .. note:: Using ``predict()`` with DART booster

            If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
            some of the trees will be evaluated. This will produce incorrect results if ``data`` is
            not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
            a nonzero value, e.g.

            .. code-block:: python

                preds = bst.predict(dtest, ntree_limit=num_round)

        Parameters
        ----------
        data : DMatrix
            The dmatrix storing the input.
        output_margin : bool
            Whether to output the raw untransformed margin value.
        ntree_limit : int
            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).

        Returns
        -------
        prediction : numpy array
        """
        # pylint: disable=missing-docstring,invalid-name
        test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
        # get ntree_limit to use - if none specified, default to
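A sketch of the copy-per-thread pattern the new note recommends; Booster.copy() is assumed here to be the copying call that the docstring's ``xgb.copy()`` shorthand refers to:

    import threading
    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(100, 5), label=np.random.randint(2, size=100))
    bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=5)

    def worker(booster, data):
        # each thread predicts on its own copy of the model
        print(booster.predict(data)[:3])

    threads = [threading.Thread(target=worker, args=(bst.copy(), dtrain)) for _ in range(2)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()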
@@ -372,10 +409,10 @@ class XGBModel(XGBModelBase):
    def evals_result(self):
        """Return the evaluation results.

        If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
        get evaluation results for all passed eval_sets. When ``eval_metric`` is also
        passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
        passed to the ``fit`` function

        Returns
        -------
@@ -383,6 +420,9 @@ class XGBModel(XGBModelBase):
        Example
        -------

        .. code-block:: python

            param_dist = {'objective':'binary:logistic', 'n_estimators':2}

            clf = xgb.XGBModel(**param_dist)
@@ -395,6 +435,9 @@ class XGBModel(XGBModelBase):
            evals_result = clf.evals_result()

        The variable evals_result will contain:

        .. code-block:: none

            {'validation_0': {'logloss': ['0.604835', '0.531479']},
             'validation_1': {'logloss': ['0.41965', '0.17686']}}
        """
@@ -408,9 +451,11 @@ class XGBModel(XGBModelBase):
    @property
    def feature_importances_(self):
        """
        Feature importances property

        Returns
        -------
        feature_importances_ : array of shape ``[n_features]``

        """
        b = self.get_booster()
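The property returns one importance value per feature, e.g. (a sketch on synthetic data):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 5), np.random.randint(2, size=100)
    model = xgb.XGBRegressor(n_estimators=5).fit(X, y)
    print(model.feature_importances_.shape)  # (5,)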
@@ -422,9 +467,8 @@
class XGBClassifier(XGBModel, XGBClassifierBase):
    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
    __doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
        + '\n'.join(XGBModel.__doc__.split('\n')[2:])

    def __init__(self, max_depth=3, learning_rate=0.1,
                 n_estimators=100, silent=True,
@@ -610,10 +654,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
    def predict_proba(self, data, ntree_limit=None):
        """
        Predict the probability of each `data` example being of a given class.

        .. note:: This function is not thread safe

            For each booster object, predict can only be called from one thread.
            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
            of model object and then call predict

        Parameters
        ----------
        data : DMatrix
@@ -621,6 +668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        ntree_limit : int
            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).

        Returns
        -------
        prediction : numpy array
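A sketch of the documented behaviour on the scikit-learn wrapper (synthetic binary data, so two probability columns):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 5), np.random.randint(2, size=100)
    clf = xgb.XGBClassifier(n_estimators=5).fit(X, y)
    print(clf.predict_proba(X).shape)  # (100, 2)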
@@ -652,6 +700,9 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        Example
        -------

        .. code-block:: python

            param_dist = {'objective':'binary:logistic', 'n_estimators':2}

            clf = xgb.XGBClassifier(**param_dist)
@@ -663,7 +714,10 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
            evals_result = clf.evals_result()

        The variable ``evals_result`` will contain

        .. code-block:: none

            {'validation_0': {'logloss': ['0.604835', '0.531479']},
             'validation_1': {'logloss': ['0.41965', '0.17686']}}
        """
@@ -677,8 +731,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
class XGBRegressor(XGBModel, XGBRegressorBase):
    # pylint: disable=missing-docstring
    __doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
        + '\n'.join(XGBModel.__doc__.split('\n')[2:])


class XGBRanker(XGBModel):
@@ -731,14 +785,16 @@ class XGBRanker(XGBModel):
    missing : float, optional
        Value in the data which needs to be present as a missing value. If
        None, defaults to np.nan.
    \*\*kwargs : dict, optional
        Keyword arguments for XGBoost Booster object. Full documentation of parameters can
        be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
        Attempting to set a parameter via the constructor args and \*\*kwargs dict
        simultaneously will result in a TypeError.

        .. note:: \*\*kwargs unsupported by scikit-learn

            \*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
            passed via this argument will interact properly with scikit-learn.

    Note
    ----
@@ -750,16 +806,25 @@ class XGBRanker(XGBModel):
    For example, if your original data look like:

    +-------+-----------+---------------+
    |   qid |   label   |   features    |
    +-------+-----------+---------------+
    |   1   |   0       |   x_1         |
    +-------+-----------+---------------+
    |   1   |   1       |   x_2         |
    +-------+-----------+---------------+
    |   1   |   0       |   x_3         |
    +-------+-----------+---------------+
    |   2   |   0       |   x_4         |
    +-------+-----------+---------------+
    |   2   |   1       |   x_5         |
    +-------+-----------+---------------+
    |   2   |   1       |   x_6         |
    +-------+-----------+---------------+
    |   2   |   1       |   x_7         |
    +-------+-----------+---------------+

    then your group array should be ``[3, 4]``.
    """

    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
@@ -908,3 +973,5 @@ class XGBRanker(XGBModel):
        return self.get_booster().predict(test_dmatrix,
                                          output_margin=output_margin,
                                          ntree_limit=ntree_limit)

    predict.__doc__ = XGBModel.predict.__doc__
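A sketch tying the docstring's table to code: seven rows split across two queries, so the group array is [3, 4] (data values are synthetic):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(7, 10)
    y = np.array([0, 1, 0, 0, 1, 1, 1])
    group = [3, 4]  # 3 rows for qid 1, 4 rows for qid 2

    ranker = xgb.XGBRanker(n_estimators=10)
    ranker.fit(X, y, group)
    scores = ranker.predict(X)  # now documented via XGBModel.predict.__doc__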

View File

@@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
        and/or num_class appears in the parameters)
    evals_result: dict
        This dictionary stores the evaluation results of all the items in watchlist.

        Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
        a parameter containing ('eval_metric': 'logloss'), the **evals_result**
        returns

        .. code-block:: none

            {'train': {'logloss': ['0.48253', '0.35953']},
             'eval': {'logloss': ['0.480385', '0.357756']}}

    verbose_eval : bool or int
        Requires at least one item in evals.
        If **verbose_eval** is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If **verbose_eval** is an integer then the evaluation metric on the validation set
        is printed at every given **verbose_eval** boosting stage. The last boosting stage
        / the boosting stage found by using **early_stopping_rounds** is also printed.
        Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function (deprecated - use callback API instead)
        List of learning rate for each boosting round
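A sketch of the evals_result/verbose_eval behaviour described above (synthetic data):

    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(100, 5), label=np.random.randint(2, size=100))
    dtest = xgb.DMatrix(np.random.rand(50, 5), label=np.random.randint(2, size=50))

    evals_result = {}
    bst = xgb.train({'objective': 'binary:logistic', 'eval_metric': 'logloss'},
                    dtrain, num_boost_round=8,
                    evals=[(dtest, 'eval'), (dtrain, 'train')],
                    evals_result=evals_result,
                    verbose_eval=4)  # prints every 4th round plus the last
    print(evals_result['eval']['logloss'])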
@@ -328,10 +334,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
    folds : a KFold or StratifiedKFold instance or list of fold indices
        Sklearn KFolds or StratifiedKFolds object.
        Alternatively may explicitly pass sample indices for each fold.
        For ``n`` folds, ``folds`` should be a length ``n`` list of tuples.
        Each tuple is ``(in,out)`` where ``in`` is a list of indices to be used
        as the training samples for the ``n`` th fold and ``out`` is a list of
        indices to be used as the testing samples for the ``n`` th fold.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
    obj : function
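A sketch of passing explicit (in, out) index tuples as folds; scikit-learn's KFold yields exactly that shape:

    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import KFold

    X, y = np.random.rand(100, 5), np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)

    folds = list(KFold(n_splits=3).split(X))  # [(in_idx, out_idx), ...]
    res = xgb.cv({'objective': 'binary:logistic'}, dtrain,
                 num_boost_round=5, folds=folds)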
@@ -363,7 +369,11 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback module.
        Example:

        .. code-block:: none

            [xgb.callback.reset_learning_rate(custom_rates)]
    shuffle : bool
        Shuffle data before creating folds.
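A sketch of the predefined callback named in the example (synthetic data; one learning rate per boosting round):

    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(np.random.rand(100, 5), label=np.random.randint(2, size=100))
    custom_rates = [0.3, 0.2, 0.1, 0.05, 0.01]
    res = xgb.cv({'objective': 'binary:logistic'}, dtrain, num_boost_round=5,
                 callbacks=[xgb.callback.reset_learning_rate(custom_rates)])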