Update Python API doc (#3619)

* Show inherited members of XGBRegressor in API doc, since XGBRegressor uses default methods from XGBModel

* Add table of contents to Python API doc

* Skip JVM doc download if not available

* Show inherited members for XGBRegressor

* Add docstring to XGBRegressor.predict()

* Fix rendering errors in Python docstrings

* Fix lint
Philip Cho 2018-09-05 12:15:23 -07:00
parent d1c250f8cf
commit 953ed1a99b
6 changed files with 134 additions and 61 deletions

doc/conf.py

@@ -14,6 +14,7 @@
 from subprocess import call
 from sh.contrib import git
 import urllib.request
+from urllib.error import HTTPError
 from recommonmark.parser import CommonMarkParser
 import sys
 import re

@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
 git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
 git_branch = [x for x in git_branch if 'HEAD' not in x]
 print('git_branch = {}'.format(git_branch[0]))
-filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
-call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
+try:
+    filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
+    call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
+except HTTPError:
+    print('JVM doc not found. Skipping...')

 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
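For reference (not part of this commit): urllib.request.urlretrieve raises urllib.error.HTTPError when the requested S3 object does not exist, which is what the new try/except turns into a skip. A minimal standalone sketch of the same pattern, using a hypothetical fetch_jvm_docs helper:

    import urllib.request
    from urllib.error import HTTPError

    def fetch_jvm_docs(branch):
        # Mirrors the conf.py logic above; returns None instead of failing the build.
        url = 'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(branch)
        try:
            filename, _ = urllib.request.urlretrieve(url)
            return filename
        except HTTPError:
            print('JVM doc not found. Skipping...')
            return None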

doc/python/python_api.rst

@@ -2,6 +2,10 @@ Python API Reference
 ====================

 This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.

+.. contents::
+  :backlinks: none
+  :local:
+
 Core Data Structure
 -------------------
 .. automodule:: xgboost.core

@@ -29,9 +33,11 @@ Scikit-Learn API
 .. automodule:: xgboost.sklearn

 .. autoclass:: xgboost.XGBRegressor
     :members:
+    :inherited-members:
     :show-inheritance:

 .. autoclass:: xgboost.XGBClassifier
     :members:
+    :inherited-members:
     :show-inheritance:

 Plotting API
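Why :inherited-members: matters here (a sketch, not part of the commit): XGBRegressor defines almost nothing of its own; fit(), predict(), and friends resolve to XGBModel, so without this option autodoc would render a nearly empty class page. Assuming an xgboost build from this era:

    from xgboost.sklearn import XGBModel, XGBRegressor

    # predict() is inherited from XGBModel, not overridden by XGBRegressor
    print(XGBRegressor.predict is XGBModel.predict)  # expected: True
    print('predict' in vars(XGBRegressor))           # expected: False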

python-package/xgboost/core.py

@@ -1212,9 +1212,10 @@ class Booster(object):
     def get_score(self, fmap='', importance_type='weight'):
         """Get feature importance of each feature.

         Importance type can be defined as:
-        'weight' - the number of times a feature is used to split the data across all trees.
-        'gain' - the average gain of the feature when it is used in trees
-        'cover' - the average coverage of the feature when it is used in trees
+
+        * 'weight': the number of times a feature is used to split the data across all trees.
+        * 'gain': the average gain across all splits the feature is used in.
+        * 'cover': the average coverage across all splits the feature is used in.

         Parameters
         ----------

@@ -1317,6 +1318,7 @@
     def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
         """Get split value histogram of a feature
+
         Parameters
         ----------
         feature: str

@@ -1327,7 +1329,7 @@
             The maximum number of bins.
             Number of bins equals number of unique split values n_unique,
             if bins == None or bins > n_unique.
-        as_pandas : bool, default True
+        as_pandas: bool, default True
             Return pd.DataFrame when pandas is installed.
             If False or pandas is not installed, return numpy ndarray.
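A hedged usage sketch of the get_score() importance types documented above (synthetic data; parameters are illustrative only):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 5)
    y = np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=10)

    # 'weight', 'gain' and 'cover' as described in the reworked docstring
    for imp_type in ('weight', 'gain', 'cover'):
        print(imp_type, bst.get_score(importance_type=imp_type))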

python-package/xgboost/plotting.py

@@ -28,10 +28,11 @@ def plot_importance(booster, ax=None, height=0.2,
     grid : bool, Turn the axes grids on or off. Default is True (On).
     importance_type : str, default "weight"
         How the importance is calculated: either "weight", "gain", or "cover"
-        "weight" is the number of times a feature appears in a tree
-        "gain" is the average gain of splits which use the feature
-        "cover" is the average coverage of splits which use the feature
-            where coverage is defined as the number of samples affected by the split
+
+        * "weight" is the number of times a feature appears in a tree
+        * "gain" is the average gain of splits which use the feature
+        * "cover" is the average coverage of splits which use the feature
+          where coverage is defined as the number of samples affected by the split
     max_num_features : int, default None
         Maximum number of top features displayed on plot. If None, all features will be displayed.
     height : float, default 0.2
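Correspondingly, a minimal sketch of plot_importance() with the importance_type parameter documented above (assumes matplotlib is installed; data is synthetic):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 5), np.random.randint(2, size=100)
    bst = xgb.train({'objective': 'binary:logistic'}, xgb.DMatrix(X, label=y), num_boost_round=10)

    # plot_importance returns a matplotlib Axes
    ax = xgb.plot_importance(bst, importance_type='gain', max_num_features=10)
    ax.figure.savefig('importance.png')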

python-package/xgboost/sklearn.py

@@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
        None, defaults to np.nan.
-    **kwargs : dict, optional
+    \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object. Full documentation of parameters can
-        be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md.
-        Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
+        be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
+        Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
         will result in a TypeError.
-        Note:
-            **kwargs is unsupported by Sklearn. We do not guarantee that parameters passed via
-            this argument will interact properly with Sklearn.
+
+        .. note:: \*\*kwargs unsupported by scikit-learn
+
+            \*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
+            passed via this argument will interact properly with scikit-learn.

     Note
     ----
@@ -237,7 +239,7 @@ class XGBModel(XGBModelBase):
             instance weights on the i-th validation set.
         eval_metric : str, callable, optional
             If a str, should be a built-in evaluation metric to use. See
-            doc/parameter.md. If callable, a custom evaluation metric. The call
+            doc/parameter.rst. If callable, a custom evaluation metric. The call
             signature is func(y_predicted, y_true) where y_true will be a
             DMatrix object such that you may need to call the get_label
             method. It must return a str, value pair where the str is a name
@@ -314,6 +316,38 @@ class XGBModel(XGBModelBase):
         return self

     def predict(self, data, output_margin=False, ntree_limit=0):
+        """
+        Predict with `data`.
+
+        .. note:: This function is not thread safe.
+
+          For each booster object, predict can only be called from one thread.
+          If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
+          of model object and then call ``predict()``.
+
+        .. note:: Using ``predict()`` with DART booster
+
+          If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
+          some of the trees will be evaluated. This will produce incorrect results if ``data`` is
+          not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
+          a nonzero value, e.g.
+
+          .. code-block:: python
+
+            preds = bst.predict(dtest, ntree_limit=num_round)
+
+        Parameters
+        ----------
+        data : DMatrix
+            The dmatrix storing the input.
+        output_margin : bool
+            Whether to output the raw untransformed margin value.
+        ntree_limit : int
+            Limit number of trees in the prediction; defaults to 0 (use all trees).
+
+        Returns
+        -------
+        prediction : numpy array
+        """
         # pylint: disable=missing-docstring,invalid-name
         test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
         return self.get_booster().predict(test_dmatrix,
@@ -346,10 +380,10 @@ class XGBModel(XGBModelBase):
     def evals_result(self):
         """Return the evaluation results.

-        If eval_set is passed to the `fit` function, you can call evals_result() to
-        get evaluation results for all passed eval_sets. When eval_metric is also
-        passed to the `fit` function, the evals_result will contain the eval_metrics
-        passed to the `fit` function
+        If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
+        get evaluation results for all passed eval_sets. When ``eval_metric`` is also
+        passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
+        passed to the ``fit`` function

         Returns
         -------
@@ -357,20 +391,26 @@ class XGBModel(XGBModelBase):
         Example
         -------
-        param_dist = {'objective':'binary:logistic', 'n_estimators':2}
-        clf = xgb.XGBModel(**param_dist)
-        clf.fit(X_train, y_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                eval_metric='logloss',
-                verbose=True)
-        evals_result = clf.evals_result()
+
+        .. code-block:: python
+
+            param_dist = {'objective':'binary:logistic', 'n_estimators':2}
+
+            clf = xgb.XGBModel(**param_dist)
+            clf.fit(X_train, y_train,
+                    eval_set=[(X_train, y_train), (X_test, y_test)],
+                    eval_metric='logloss',
+                    verbose=True)
+            evals_result = clf.evals_result()
+
         The variable evals_result will contain:
-        {'validation_0': {'logloss': ['0.604835', '0.531479']},
-        'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+        .. code-block:: none
+
+            {'validation_0': {'logloss': ['0.604835', '0.531479']},
+             'validation_1': {'logloss': ['0.41965', '0.17686']}}
         """
         if self.evals_result_:
             evals_result = self.evals_result_
@@ -382,9 +422,11 @@ class XGBModel(XGBModelBase):
     @property
     def feature_importances_(self):
         """
+        Feature importances property
+
         Returns
         -------
-        feature_importances_ : array of shape = [n_features]
+        feature_importances_ : array of shape ``[n_features]``
         """
         b = self.get_booster()
@@ -396,9 +438,8 @@ class XGBModel(XGBModelBase):
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
-    __doc__ = """Implementation of the scikit-learn API for XGBoost classification.
-
-    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
+    __doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
+        + '\n'.join(XGBModel.__doc__.split('\n')[2:])

     def __init__(self, max_depth=3, learning_rate=0.1,
                  n_estimators=100, silent=True,
@@ -439,7 +480,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
             instance weights on the i-th validation set.
         eval_metric : str, callable, optional
             If a str, should be a built-in evaluation metric to use. See
-            doc/parameter.md. If callable, a custom evaluation metric. The call
+            doc/parameter.rst. If callable, a custom evaluation metric. The call
             signature is func(y_predicted, y_true) where y_true will be a
             DMatrix object such that you may need to call the get_label
             method. It must return a str, value pair where the str is a name
@@ -567,10 +608,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
     def predict_proba(self, data, ntree_limit=0):
         """
         Predict the probability of each `data` example being of a given class.
-        NOTE: This function is not thread safe.
-              For each booster object, predict can only be called from one thread.
-              If you want to run prediction using multiple thread, call xgb.copy() to make copies
-              of model object and then call predict
+
+        .. note:: This function is not thread safe
+
+            For each booster object, predict can only be called from one thread.
+            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
+            of model object and then call predict
+
         Parameters
         ----------
         data : DMatrix
@@ -606,20 +650,26 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         Example
         -------
-        param_dist = {'objective':'binary:logistic', 'n_estimators':2}
-        clf = xgb.XGBClassifier(**param_dist)
-        clf.fit(X_train, y_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                eval_metric='logloss',
-                verbose=True)
-        evals_result = clf.evals_result()
-        The variable evals_result will contain:
-        {'validation_0': {'logloss': ['0.604835', '0.531479']},
-        'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+        .. code-block:: python
+
+            param_dist = {'objective':'binary:logistic', 'n_estimators':2}
+
+            clf = xgb.XGBClassifier(**param_dist)
+            clf.fit(X_train, y_train,
+                    eval_set=[(X_train, y_train), (X_test, y_test)],
+                    eval_metric='logloss',
+                    verbose=True)
+            evals_result = clf.evals_result()
+
+        The variable ``evals_result`` will contain
+
+        .. code-block:: none
+
+            {'validation_0': {'logloss': ['0.604835', '0.531479']},
+             'validation_1': {'logloss': ['0.41965', '0.17686']}}
         """
         if self.evals_result_:
             evals_result = self.evals_result_
@@ -631,5 +681,5 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
 class XGBRegressor(XGBModel, XGBRegressorBase):
     # pylint: disable=missing-docstring
-    __doc__ = """Implementation of the scikit-learn API for XGBoost regression.
-    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
+    __doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
+        + '\n'.join(XGBModel.__doc__.split('\n')[2:])

python-package/xgboost/training.py

@@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
         and/or num_class appears in the parameters)
     evals_result: dict
         This dictionary stores the evaluation results of all the items in watchlist.
+
         Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
-        a parameter containing ('eval_metric': 'logloss')
-        Returns: {'train': {'logloss': ['0.48253', '0.35953']},
-                  'eval': {'logloss': ['0.480385', '0.357756']}}
+        a parameter containing ('eval_metric': 'logloss'), the **evals_result**
+        returns
+
+        .. code-block:: none
+
+            {'train': {'logloss': ['0.48253', '0.35953']},
+             'eval': {'logloss': ['0.480385', '0.357756']}}
+
     verbose_eval : bool or int
         Requires at least one item in evals.
-        If `verbose_eval` is True then the evaluation metric on the validation set is
+        If **verbose_eval** is True then the evaluation metric on the validation set is
         printed at each boosting stage.
-        If `verbose_eval` is an integer then the evaluation metric on the validation set
-        is printed at every given `verbose_eval` boosting stage. The last boosting stage
-        / the boosting stage found by using `early_stopping_rounds` is also printed.
-        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
+        If **verbose_eval** is an integer then the evaluation metric on the validation set
+        is printed at every given **verbose_eval** boosting stage. The last boosting stage
+        / the boosting stage found by using **early_stopping_rounds** is also printed.
+        Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric
         is printed every 4 boosting stages, instead of every boosting stage.
     learning_rates: list or function (deprecated - use callback API instead)
         List of learning rate for each boosting round
@@ -341,8 +347,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
     callbacks : list of callback functions
         List of callback functions that are applied at end of each iteration.
         It is possible to use predefined callbacks by using xgb.callback module.
-        Example: [xgb.callback.reset_learning_rate(custom_rates)]
+        Example:
+
+        .. code-block:: none
+
+            [xgb.callback.reset_learning_rate(custom_rates)]
+
     shuffle : bool
         Shuffle data before creating folds.

     Returns
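Finally, a short sketch exercising the train() parameters whose docs are reflowed above (evals_result and verbose_eval); all values are illustrative:

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 5), np.random.randint(2, size=100)
    dtrain = xgb.DMatrix(X[:80], label=y[:80])
    dtest = xgb.DMatrix(X[80:], label=y[80:])

    evals_result = {}
    bst = xgb.train({'objective': 'binary:logistic', 'eval_metric': 'logloss'},
                    dtrain, num_boost_round=8,
                    evals=[(dtest, 'eval'), (dtrain, 'train')],
                    evals_result=evals_result,
                    verbose_eval=4)  # print eval metrics every 4 rounds
    print(evals_result['eval']['logloss'])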