Update Python API doc (#3619)

* Show inherited members of XGBRegressor in API doc, since XGBRegressor uses default methods from XGBModel

* Add table of contents to Python API doc

* Skip JVM doc download if not available

* Show inherited members for XGBRegressor

* Add docstring to XGBRegressor.predict()

* Fix rendering errors in Python docstrings

* Fix lint
Author: Philip Cho
Date:   2018-09-05 12:15:23 -07:00
Parent: d1c250f8cf
Commit: 953ed1a99b
6 changed files with 134 additions and 61 deletions

doc/conf.py

@@ -14,6 +14,7 @@
from subprocess import call
from sh.contrib import git
import urllib.request
from urllib.error import HTTPError
from recommonmark.parser import CommonMarkParser
import sys
import re
@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
git_branch = [x for x in git_branch if 'HEAD' not in x]
print('git_branch = {}'.format(git_branch[0]))
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
try:
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
except HTTPError:
print('JVM doc not found. Skipping...')
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
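
For illustration only (not part of the diff): the download-and-extract fallback introduced above, as a standalone sketch. The S3 URL layout and tmp/jvm path come from the diff; the branch name is a placeholder, since conf.py derives it from `git branch -r --contains HEAD`.

    import urllib.request
    from urllib.error import HTTPError
    from subprocess import call

    branch = 'master'  # placeholder
    url = 'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(branch)
    try:
        filename, _ = urllib.request.urlretrieve(url)
        call('mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
    except HTTPError:
        # No pre-built JVM docs uploaded for this branch; build the rest of the docs anyway
        print('JVM doc not found. Skipping...')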

doc/python/python_api.rst

@@ -2,6 +2,10 @@ Python API Reference
====================
This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.
.. contents::
:backlinks: none
:local:
Core Data Structure
-------------------
.. automodule:: xgboost.core
@@ -29,9 +33,11 @@ Scikit-Learn API
.. automodule:: xgboost.sklearn
.. autoclass:: xgboost.XGBRegressor
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: xgboost.XGBClassifier
:members:
:inherited-members:
:show-inheritance:
Plotting API

python-package/xgboost/core.py

@@ -1212,9 +1212,10 @@ class Booster(object):
def get_score(self, fmap='', importance_type='weight'):
"""Get feature importance of each feature.
Importance type can be defined as:
'weight' - the number of times a feature is used to split the data across all trees.
'gain' - the average gain of the feature when it is used in trees
'cover' - the average coverage of the feature when it is used in trees
* 'weight': the number of times a feature is used to split the data across all trees.
* 'gain': the average gain across all splits the feature is used in.
* 'cover': the average coverage across all splits the feature is used in.
Parameters
----------
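
Illustration (not part of the diff): a minimal example of reading the importance types documented above from a trained booster; ``X_train`` and ``y_train`` stand for your own data.

    import xgboost as xgb

    dtrain = xgb.DMatrix(X_train, label=y_train)
    bst = xgb.train({'max_depth': 3}, dtrain, num_boost_round=10)

    # Split counts per feature vs. average gain per split
    print(bst.get_score(importance_type='weight'))
    print(bst.get_score(importance_type='gain'))
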
@@ -1317,6 +1318,7 @@ class Booster(object):
def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
"""Get split value histogram of a feature
Parameters
----------
feature: str
@@ -1327,7 +1329,7 @@
The maximum number of bins.
Number of bins equals number of unique split values n_unique,
if bins == None or bins > n_unique.
as_pandas : bool, default True
as_pandas: bool, default True
Return pd.DataFrame when pandas is installed.
If False or pandas is not installed, return numpy ndarray.
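
Illustration (not part of the diff): a quick call, assuming a trained booster ``bst`` whose first feature carries the default name ``f0``.

    # Distribution of split thresholds used for feature 'f0' across all trees,
    # capped at 10 bins; returns a DataFrame when pandas is installed
    hist = bst.get_split_value_histogram('f0', bins=10)
    print(hist)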

python-package/xgboost/plotting.py

@@ -28,10 +28,11 @@ def plot_importance(booster, ax=None, height=0.2,
grid : bool, Turn the axes grids on or off. Default is True (On).
importance_type : str, default "weight"
How the importance is calculated: either "weight", "gain", or "cover"
"weight" is the number of times a feature appears in a tree
"gain" is the average gain of splits which use the feature
"cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split
* "weight" is the number of times a feature appears in a tree
* "gain" is the average gain of splits which use the feature
* "cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split
max_num_features : int, default None
Maximum number of top features displayed on plot. If None, all features will be displayed.
height : float, default 0.2
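
For illustration only (not part of the diff): a sketch of these options in use; ``bst`` is any trained booster and matplotlib must be installed.

    import matplotlib.pyplot as plt
    import xgboost as xgb

    ax = xgb.plot_importance(bst, importance_type='gain',
                             max_num_features=10, height=0.4)
    plt.show()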

python-package/xgboost/sklearn.py

@@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
**kwargs : dict, optional
\*\*kwargs : dict, optional
Keyword arguments for XGBoost Booster object. Full documentation of parameters can
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md.
Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
will result in a TypeError.
Note:
**kwargs is unsupported by Sklearn. We do not guarantee that parameters passed via
this argument will interact properly with Sklearn.
.. note:: \*\*kwargs unsupported by scikit-learn
\*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
passed via this argument will interact properly with scikit-learn.
Note
----
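
Illustration (not part of the diff): a hedged sketch of forwarding an extra booster parameter through ``**kwargs``; the chosen parameter is only an example.

    import xgboost as xgb

    # 'tree_method' is an ordinary booster parameter passed through **kwargs
    reg = xgb.XGBRegressor(max_depth=4, n_estimators=100,
                           **{'tree_method': 'hist'})
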
@@ -237,7 +239,7 @@ class XGBModel(XGBModelBase):
instance weights on the i-th validation set.
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
doc/parameter.rst. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
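
Illustration (not part of the diff): a hedged sketch of a custom metric matching the callable signature described above; all names are placeholders.

    import numpy as np

    def rmsle(y_predicted, y_true):
        # y_true arrives as a DMatrix, so fetch the labels first
        labels = y_true.get_label()
        return 'rmsle', float(np.sqrt(np.mean(
            (np.log1p(y_predicted) - np.log1p(labels)) ** 2)))

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric=rmsle)
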
@@ -314,6 +316,38 @@ class XGBModel(XGBModelBase):
return self
def predict(self, data, output_margin=False, ntree_limit=0):
"""
Predict with `data`.
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
of model object and then call ``predict()``.
.. note:: Using ``predict()`` with DART booster
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
a nonzero value, e.g.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------
data : DMatrix
The dmatrix storing the input.
output_margin : bool
Whether to output the raw untransformed margin value.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
Returns
-------
prediction : numpy array
"""
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
return self.get_booster().predict(test_dmatrix,
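
Following the thread-safety note above, one way to predict from several threads is to give each worker its own booster copy; a hedged sketch with placeholder names (not part of the diff).

    import xgboost as xgb

    bst = model.get_booster()          # `model` is a fitted XGBRegressor/XGBClassifier
    # One independent booster handle per worker thread
    boosters = [bst.copy() for _ in range(4)]
    preds = boosters[0].predict(xgb.DMatrix(X_test))
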
@@ -346,10 +380,10 @@ class XGBModel(XGBModelBase):
def evals_result(self):
"""Return the evaluation results.
If eval_set is passed to the `fit` function, you can call evals_result() to
get evaluation results for all passed eval_sets. When eval_metric is also
passed to the `fit` function, the evals_result will contain the eval_metrics
passed to the `fit` function
If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
get evaluation results for all passed eval_sets. When ``eval_metric`` is also
passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
passed to the ``fit`` function
Returns
-------
@@ -357,20 +391,26 @@ class XGBModel(XGBModelBase):
Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
clf = xgb.XGBModel(**param_dist)
.. code-block:: python
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
evals_result = clf.evals_result()
clf = xgb.XGBModel(**param_dist)
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
evals_result = clf.evals_result()
The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
.. code-block:: none
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@@ -382,9 +422,11 @@ class XGBModel(XGBModelBase):
@property
def feature_importances_(self):
"""
Feature importances property
Returns
-------
feature_importances_ : array of shape = [n_features]
feature_importances_ : array of shape ``[n_features]``
"""
b = self.get_booster()
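
Illustration (not part of the diff): ranking features with this property; ``model``, ``X_train`` and ``y_train`` are placeholders.

    import numpy as np

    model.fit(X_train, y_train)
    # Feature indices ordered from most to least important
    ranking = np.argsort(model.feature_importances_)[::-1]
    print(ranking[:5])
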
@@ -396,9 +438,8 @@ class XGBModel(XGBModelBase):
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
__doc__ = """Implementation of the scikit-learn API for XGBoost classification.
""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])
def __init__(self, max_depth=3, learning_rate=0.1,
n_estimators=100, silent=True,
@@ -439,7 +480,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
instance weights on the i-th validation set.
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
doc/parameter.rst. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
@@ -567,10 +608,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict_proba(self, data, ntree_limit=0):
"""
Predict the probability of each `data` example being of a given class.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call xgb.copy() to make copies
of model object and then call predict
.. note:: This function is not thread safe
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
of model object and then call predict
Parameters
----------
data : DMatrix
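
A minimal usage sketch with placeholder data (not part of the diff).

    import xgboost as xgb

    clf = xgb.XGBClassifier(n_estimators=50)
    clf.fit(X_train, y_train)
    # One probability column per class; rows sum to 1
    proba = clf.predict_proba(X_test)
    print(proba.shape)   # (n_samples, n_classes)
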
@@ -606,20 +650,26 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
clf = xgb.XGBClassifier(**param_dist)
.. code-block:: python
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
evals_result = clf.evals_result()
clf = xgb.XGBClassifier(**param_dist)
The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
evals_result = clf.evals_result()
The variable ``evals_result`` will contain
.. code-block:: none
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@@ -631,5 +681,5 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
class XGBRegressor(XGBModel, XGBRegressorBase):
# pylint: disable=missing-docstring
__doc__ = """Implementation of the scikit-learn API for XGBoost regression.
""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])

python-package/xgboost/training.py

@@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
and/or num_class appears in the parameters)
evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist.
Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
a parameter containing ('eval_metric': 'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}
a parameter containing ('eval_metric': 'logloss'), the **evals_result**
returns
.. code-block:: none
{'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
If **verbose_eval** is True then the evaluation metric on the validation set is
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
If **verbose_eval** is an integer then the evaluation metric on the validation set
is printed at every given **verbose_eval** boosting stage. The last boosting stage
/ the boosting stage found by using **early_stopping_rounds** is also printed.
Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function (deprecated - use callback API instead)
List of learning rate for each boosting round
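
Illustration (not part of the diff): a hedged example combining ``evals_result`` and ``verbose_eval`` as documented above; ``params``, ``dtrain`` and ``dtest`` are placeholders.

    import xgboost as xgb

    evals_result = {}
    bst = xgb.train(params, dtrain,
                    num_boost_round=20,
                    evals=[(dtrain, 'train'), (dtest, 'eval')],
                    evals_result=evals_result,
                    verbose_eval=4)      # print metrics every 4 boosting stages
    # evals_result now maps 'train' and 'eval' to per-round metric histories
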
@@ -341,8 +347,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using xgb.callback module.
Example: [xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Example:
.. code-block:: none
[xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Shuffle data before creating folds.
Returns
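
A hedged sketch of passing a predefined callback to ``cv`` (not part of the diff); ``custom_rates`` is a user-supplied list with one learning rate per boosting round, ``params`` and ``dtrain`` are placeholders.

    import xgboost as xgb

    custom_rates = [0.3] * 25 + [0.1] * 25      # illustrative schedule for 50 rounds
    cv_results = xgb.cv(params, dtrain,
                        num_boost_round=50, nfold=5, shuffle=True,
                        callbacks=[xgb.callback.reset_learning_rate(custom_rates)])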