Update Python API doc (#3619)

* Show inherited members of XGBRegressor in API doc, since XGBRegressor uses default methods from XGBModel

* Add table of contents to Python API doc

* Skip JVM doc download if not available

* Show inherited members for XGBRegressor

* Add docstring to XGBRegressor.predict()

* Fix rendering errors in Python docstrings

* Fix lint
Author: Philip Cho
Date:   2018-09-05 12:15:23 -07:00
Parent: d1c250f8cf
Commit: 953ed1a99b
6 changed files with 134 additions and 61 deletions

doc/conf.py

@@ -14,6 +14,7 @@
from subprocess import call
from sh.contrib import git
import urllib.request
from urllib.error import HTTPError
from recommonmark.parser import CommonMarkParser
import sys
import re
@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
git_branch = [x for x in git_branch if 'HEAD' not in x]
print('git_branch = {}'.format(git_branch[0]))
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
try:
filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
except HTTPError:
print('JVM doc not found. Skipping...')
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
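
For illustration only (not part of the diff): the download-and-extract fallback introduced above, as a standalone sketch. The S3 URL layout and tmp/jvm path come from the diff; the branch name is a placeholder, since conf.py derives it from `git branch -r --contains HEAD`.

    import urllib.request
    from urllib.error import HTTPError
    from subprocess import call

    branch = 'master'  # placeholder
    url = 'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(branch)
    try:
        filename, _ = urllib.request.urlretrieve(url)
        call('mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
    except HTTPError:
        # No pre-built JVM docs uploaded for this branch; build the rest of the docs anyway
        print('JVM doc not found. Skipping...')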

doc/python/python_api.rst

@@ -2,6 +2,10 @@ Python API Reference
====================
This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.
.. contents::
:backlinks: none
:local:
Core Data Structure
-------------------
.. automodule:: xgboost.core
@@ -29,9 +33,11 @@ Scikit-Learn API
.. automodule:: xgboost.sklearn
.. autoclass:: xgboost.XGBRegressor
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: xgboost.XGBClassifier
:members:
:inherited-members:
:show-inheritance:
Plotting API

python-package/xgboost/core.py

@@ -1212,9 +1212,10 @@ class Booster(object):
def get_score(self, fmap='', importance_type='weight'):
"""Get feature importance of each feature.
Importance type can be defined as:
'weight' - the number of times a feature is used to split the data across all trees.
'gain' - the average gain of the feature when it is used in trees
'cover' - the average coverage of the feature when it is used in trees
* 'weight': the number of times a feature is used to split the data across all trees.
* 'gain': the average gain across all splits the feature is used in.
* 'cover': the average coverage across all splits the feature is used in.
Parameters
----------
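
Illustration (not part of the diff): a minimal example of reading the importance types documented above from a trained booster; ``X_train`` and ``y_train`` stand for your own data.

    import xgboost as xgb

    dtrain = xgb.DMatrix(X_train, label=y_train)
    bst = xgb.train({'max_depth': 3}, dtrain, num_boost_round=10)

    # Split counts per feature vs. average gain per split
    print(bst.get_score(importance_type='weight'))
    print(bst.get_score(importance_type='gain'))
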
@@ -1317,6 +1318,7 @@ class Booster(object):
def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
"""Get split value histogram of a feature
Parameters
----------
feature: str
@@ -1327,7 +1329,7 @@
The maximum number of bins.
Number of bins equals number of unique split values n_unique,
if bins == None or bins > n_unique.
as_pandas : bool, default True
as_pandas: bool, default True
Return pd.DataFrame when pandas is installed.
If False or pandas is not installed, return numpy ndarray.
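
Illustration (not part of the diff): a quick call, assuming a trained booster ``bst`` whose first feature carries the default name ``f0``.

    # Distribution of split thresholds used for feature 'f0' across all trees,
    # capped at 10 bins; returns a DataFrame when pandas is installed
    hist = bst.get_split_value_histogram('f0', bins=10)
    print(hist)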

python-package/xgboost/plotting.py

@@ -28,10 +28,11 @@ def plot_importance(booster, ax=None, height=0.2,
grid : bool, Turn the axes grids on or off. Default is True (On).
importance_type : str, default "weight"
How the importance is calculated: either "weight", "gain", or "cover"
"weight" is the number of times a feature appears in a tree
"gain" is the average gain of splits which use the feature
"cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split
* "weight" is the number of times a feature appears in a tree
* "gain" is the average gain of splits which use the feature
* "cover" is the average coverage of splits which use the feature
where coverage is defined as the number of samples affected by the split
max_num_features : int, default None
Maximum number of top features displayed on plot. If None, all features will be displayed.
height : float, default 0.2
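
For illustration only (not part of the diff): a sketch of these options in use; ``bst`` is any trained booster and matplotlib must be installed.

    import matplotlib.pyplot as plt
    import xgboost as xgb

    ax = xgb.plot_importance(bst, importance_type='gain',
                             max_num_features=10, height=0.4)
    plt.show()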

python-package/xgboost/sklearn.py

@@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
missing : float, optional
Value in the data which needs to be present as a missing value. If
None, defaults to np.nan.
**kwargs : dict, optional
\*\*kwargs : dict, optional
Keyword arguments for XGBoost Booster object. Full documentation of parameters can
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md.
Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
will result in a TypeError.
Note:
**kwargs is unsupported by Sklearn. We do not guarantee that parameters passed via
this argument will interact properly with Sklearn.
.. note:: \*\*kwargs unsupported by scikit-learn
\*\*kwargs is unsupported by scikit-learn. We do not guarantee that parameters
passed via this argument will interact properly with scikit-learn.
Note
----
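
Illustration (not part of the diff): a hedged sketch of forwarding an extra booster parameter through ``**kwargs``; the chosen parameter is only an example.

    import xgboost as xgb

    # 'tree_method' is an ordinary booster parameter passed through **kwargs
    reg = xgb.XGBRegressor(max_depth=4, n_estimators=100,
                           **{'tree_method': 'hist'})
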
@@ -237,7 +239,7 @@ class XGBModel(XGBModelBase):
instance weights on the i-th validation set.
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
doc/parameter.rst. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
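
Illustration (not part of the diff): a hedged sketch of a custom metric matching the callable signature described above; all names are placeholders.

    import numpy as np

    def rmsle(y_predicted, y_true):
        # y_true arrives as a DMatrix, so fetch the labels first
        labels = y_true.get_label()
        return 'rmsle', float(np.sqrt(np.mean(
            (np.log1p(y_predicted) - np.log1p(labels)) ** 2)))

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric=rmsle)
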
@@ -314,6 +316,38 @@ class XGBModel(XGBModelBase):
return self
def predict(self, data, output_margin=False, ntree_limit=0):
"""
Predict with `data`.
.. note:: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
of model object and then call ``predict()``.
.. note:: Using ``predict()`` with DART booster
If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
some of the trees will be evaluated. This will produce incorrect results if ``data`` is
not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
a nonzero value, e.g.
.. code-block:: python
preds = bst.predict(dtest, ntree_limit=num_round)
Parameters
----------
data : DMatrix
The dmatrix storing the input.
output_margin : bool
Whether to output the raw untransformed margin value.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
Returns
-------
prediction : numpy array
"""
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
return self.get_booster().predict(test_dmatrix,
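
Following the thread-safety note above, one way to predict from several threads is to give each worker its own booster copy; a hedged sketch with placeholder names (not part of the diff).

    import xgboost as xgb

    bst = model.get_booster()          # `model` is a fitted XGBRegressor/XGBClassifier
    # One independent booster handle per worker thread
    boosters = [bst.copy() for _ in range(4)]
    preds = boosters[0].predict(xgb.DMatrix(X_test))
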
@@ -346,10 +380,10 @@ class XGBModel(XGBModelBase):
def evals_result(self):
"""Return the evaluation results.
If eval_set is passed to the `fit` function, you can call evals_result() to
get evaluation results for all passed eval_sets. When eval_metric is also
passed to the `fit` function, the evals_result will contain the eval_metrics
passed to the `fit` function
If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
get evaluation results for all passed eval_sets. When ``eval_metric`` is also
passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
passed to the ``fit`` function
Returns
-------
@@ -357,20 +391,26 @@ class XGBModel(XGBModelBase):
Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
clf = xgb.XGBModel(**param_dist)
.. code-block:: python
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
evals_result = clf.evals_result()
clf = xgb.XGBModel(**param_dist)
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
evals_result = clf.evals_result()
The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
.. code-block:: none
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@@ -382,9 +422,11 @@ class XGBModel(XGBModelBase):
@property
def feature_importances_(self):
"""
Feature importances property
Returns
-------
feature_importances_ : array of shape = [n_features]
feature_importances_ : array of shape ``[n_features]``
"""
b = self.get_booster()
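
Illustration (not part of the diff): ranking features with this property; ``model``, ``X_train`` and ``y_train`` are placeholders.

    import numpy as np

    model.fit(X_train, y_train)
    # Feature indices ordered from most to least important
    ranking = np.argsort(model.feature_importances_)[::-1]
    print(ranking[:5])
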
@@ -396,9 +438,8 @@ class XGBModel(XGBModelBase):
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,too-many-arguments,invalid-name
__doc__ = """Implementation of the scikit-learn API for XGBoost classification.
""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])
def __init__(self, max_depth=3, learning_rate=0.1,
n_estimators=100, silent=True,
@@ -439,7 +480,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
instance weights on the i-th validation set.
eval_metric : str, callable, optional
If a str, should be a built-in evaluation metric to use. See
doc/parameter.md. If callable, a custom evaluation metric. The call
doc/parameter.rst. If callable, a custom evaluation metric. The call
signature is func(y_predicted, y_true) where y_true will be a
DMatrix object such that you may need to call the get_label
method. It must return a str, value pair where the str is a name
@@ -567,10 +608,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
def predict_proba(self, data, ntree_limit=0):
"""
Predict the probability of each `data` example being of a given class.
NOTE: This function is not thread safe.
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call xgb.copy() to make copies
of model object and then call predict
.. note:: This function is not thread safe
For each booster object, predict can only be called from one thread.
If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
of model object and then call predict
Parameters
----------
data : DMatrix
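
A minimal usage sketch with placeholder data (not part of the diff).

    import xgboost as xgb

    clf = xgb.XGBClassifier(n_estimators=50)
    clf.fit(X_train, y_train)
    # One probability column per class; rows sum to 1
    proba = clf.predict_proba(X_test)
    print(proba.shape)   # (n_samples, n_classes)
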
@@ -606,20 +650,26 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
Example
-------
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
clf = xgb.XGBClassifier(**param_dist)
.. code-block:: python
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
param_dist = {'objective':'binary:logistic', 'n_estimators':2}
evals_result = clf.evals_result()
clf = xgb.XGBClassifier(**param_dist)
The variable evals_result will contain:
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
clf.fit(X_train, y_train,
eval_set=[(X_train, y_train), (X_test, y_test)],
eval_metric='logloss',
verbose=True)
evals_result = clf.evals_result()
The variable ``evals_result`` will contain
.. code-block:: none
{'validation_0': {'logloss': ['0.604835', '0.531479']},
'validation_1': {'logloss': ['0.41965', '0.17686']}}
"""
if self.evals_result_:
evals_result = self.evals_result_
@@ -631,5 +681,5 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
class XGBRegressor(XGBModel, XGBRegressorBase):
# pylint: disable=missing-docstring
__doc__ = """Implementation of the scikit-learn API for XGBoost regression.
""" + '\n'.join(XGBModel.__doc__.split('\n')[2:])
__doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
+ '\n'.join(XGBModel.__doc__.split('\n')[2:])

python-package/xgboost/training.py

@@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
and/or num_class appears in the parameters)
evals_result: dict
This dictionary stores the evaluation results of all the items in watchlist.
Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and
a parameter containing ('eval_metric': 'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}
a parameter containing ('eval_metric': 'logloss'), the **evals_result**
returns
.. code-block:: none
{'train': {'logloss': ['0.48253', '0.35953']},
'eval': {'logloss': ['0.480385', '0.357756']}}
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
If **verbose_eval** is True then the evaluation metric on the validation set is
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
If **verbose_eval** is an integer then the evaluation metric on the validation set
is printed at every given **verbose_eval** boosting stage. The last boosting stage
/ the boosting stage found by using **early_stopping_rounds** is also printed.
Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function (deprecated - use callback API instead)
List of learning rate for each boosting round
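
Illustration (not part of the diff): a hedged example combining ``evals_result`` and ``verbose_eval`` as documented above; ``params``, ``dtrain`` and ``dtest`` are placeholders.

    import xgboost as xgb

    evals_result = {}
    bst = xgb.train(params, dtrain,
                    num_boost_round=20,
                    evals=[(dtrain, 'train'), (dtest, 'eval')],
                    evals_result=evals_result,
                    verbose_eval=4)      # print metrics every 4 boosting stages
    # evals_result now maps 'train' and 'eval' to per-round metric histories
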
@@ -341,8 +347,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using xgb.callback module.
Example: [xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Example:
.. code-block:: none
[xgb.callback.reset_learning_rate(custom_rates)]
shuffle : bool
Shuffle data before creating folds.
Returns
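
A hedged sketch of passing a predefined callback to ``cv`` (not part of the diff); ``custom_rates`` is a user-supplied list with one learning rate per boosting round, ``params`` and ``dtrain`` are placeholders.

    import xgboost as xgb

    custom_rates = [0.3] * 25 + [0.1] * 25      # illustrative schedule for 50 rounds
    cv_results = xgb.cv(params, dtrain,
                        num_boost_round=50, nfold=5, shuffle=True,
                        callbacks=[xgb.callback.reset_learning_rate(custom_rates)])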