diff --git a/doc/conf.py b/doc/conf.py
index 7efdf2070..759a785a4 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -14,6 +14,7 @@ from subprocess import call
 from sh.contrib import git
 import urllib.request
+from urllib.error import HTTPError
 from recommonmark.parser import CommonMarkParser
 import sys
 import re
@@ -24,8 +25,11 @@ import guzzle_sphinx_theme
 git_branch = [re.sub(r'origin/', '', x.lstrip(' ')) for x in str(git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')]
 git_branch = [x for x in git_branch if 'HEAD' not in x]
 print('git_branch = {}'.format(git_branch[0]))
-filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
-call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
+try:
+    filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(git_branch[0]))
+    call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
+except HTTPError:
+    print('JVM doc not found. Skipping...')
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst
index 8abe24196..4e6fb9d9d 100644
--- a/doc/jvm/xgboost4j_spark_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_tutorial.rst
@@ -274,7 +274,7 @@ and then loading the model in another session:
 With regards to ML pipeline save and load, please refer the next section.
 
 Interact with Other Bindings of XGBoost
-------------------------------------
+---------------------------------------
 After we train a model with XGBoost4j-Spark on massive dataset, sometimes we want to do model serving in single machine or integrate it with other single node libraries for further processing. XGBoost4j-Spark supports export model to local by:
 
 .. code-block:: scala
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
index b430bba39..b6a3d650e 100644
--- a/doc/python/python_api.rst
+++ b/doc/python/python_api.rst
@@ -2,6 +2,10 @@ Python API Reference
 ====================
 This page gives the Python API reference of xgboost, please also refer to Python Package Introduction for more information about python package.
 
+.. contents::
+  :backlinks: none
+  :local:
+
 Core Data Structure
 -------------------
 .. automodule:: xgboost.core
@@ -29,9 +33,15 @@ Scikit-Learn API
 .. automodule:: xgboost.sklearn
 .. autoclass:: xgboost.XGBRegressor
     :members:
+    :inherited-members:
     :show-inheritance:
 .. autoclass:: xgboost.XGBClassifier
     :members:
+    :inherited-members:
+    :show-inheritance:
+.. autoclass:: xgboost.XGBRanker
+    :members:
+    :inherited-members:
     :show-inheritance:
 
 Plotting API
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
index 06086444e..faebbd772 100644
--- a/python-package/xgboost/__init__.py
+++ b/python-package/xgboost/__init__.py
@@ -12,7 +12,7 @@ from .core import DMatrix, Booster
 from .training import train, cv
 from . import rabit  # noqa
 try:
-    from .sklearn import XGBModel, XGBClassifier, XGBRegressor
+    from .sklearn import XGBModel, XGBClassifier, XGBRegressor, XGBRanker
     from .plotting import plot_importance, plot_tree, to_graphviz
 except ImportError:
     pass
@@ -23,5 +23,5 @@ with open(VERSION_FILE) as f:
     __version__ = f.read().strip()
 
 __all__ = ['DMatrix', 'Booster', 'train', 'cv',
-           'XGBModel', 'XGBClassifier', 'XGBRegressor',
+           'XGBModel', 'XGBClassifier', 'XGBRegressor', 'XGBRanker',
            'plot_importance', 'plot_tree', 'to_graphviz']
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 86b70a01a..117f672f3 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1376,11 +1376,12 @@ class Booster(object):
     def get_score(self, fmap='', importance_type='weight'):
         """Get feature importance of each feature.
         Importance type can be defined as:
-            'weight' - the number of times a feature is used to split the data across all trees.
-            'gain' - the average gain across all splits the feature is used in.
-            'cover' - the average coverage across all splits the feature is used in.
-            'total_gain' - the total gain across all splits the feature is used in.
-            'total_cover' - the total coverage across all splits the feature is used in.
+
+        * 'weight': the number of times a feature is used to split the data across all trees.
+        * 'gain': the average gain across all splits the feature is used in.
+        * 'cover': the average coverage across all splits the feature is used in.
+        * 'total_gain': the total gain across all splits the feature is used in.
+        * 'total_cover': the total coverage across all splits the feature is used in.
 
         Parameters
         ----------
@@ -1496,6 +1497,7 @@ class Booster(object):
 
     def get_split_value_histogram(self, feature, fmap='', bins=None, as_pandas=True):
         """Get split value histogram of a feature
+
         Parameters
         ----------
         feature: str
@@ -1506,7 +1508,7 @@ class Booster(object):
             The maximum number of bins.
             Number of bins equals number of unique split values n_unique,
             if bins == None or bins > n_unique.
-        as_pandas : bool, default True
+        as_pandas: bool, default True
             Return pd.DataFrame when pandas is installed.
             If False or pandas is not installed, return numpy ndarray.
 
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
index 59c657c55..99bc31675 100644
--- a/python-package/xgboost/plotting.py
+++ b/python-package/xgboost/plotting.py
@@ -28,10 +28,11 @@ def plot_importance(booster, ax=None, height=0.2,
     grid : bool, Turn the axes grids on or off.  Default is True (On).
     importance_type : str, default "weight"
         How the importance is calculated: either "weight", "gain", or "cover"
-        "weight" is the number of times a feature appears in a tree
-        "gain" is the average gain of splits which use the feature
-        "cover" is the average coverage of splits which use the feature
-            where coverage is defined as the number of samples affected by the split
+
+        * "weight" is the number of times a feature appears in a tree
+        * "gain" is the average gain of splits which use the feature
+        * "cover" is the average coverage of splits which use the feature
+          where coverage is defined as the number of samples affected by the split
     max_num_features : int, default None
         Maximum number of top features displayed on plot. If None, all features will be displayed.
     height : float, default 0.2
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index f78bf9439..f37415ac4 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -99,14 +99,16 @@ class XGBModel(XGBModelBase):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
-    **kwargs : dict, optional
+    \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
-        Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
+        Attempting to set a parameter via the constructor args and \*\*kwargs dict simultaneously
         will result in a TypeError.
-        Note:
-            **kwargs is unsupported by Sklearn.  We do not guarantee that parameters passed via
-            this argument will interact properly with Sklearn.
+
+        .. note:: \*\*kwargs unsupported by scikit-learn
+
+            \*\*kwargs is unsupported by scikit-learn.  We do not guarantee that parameters
+            passed via this argument will interact properly with scikit-learn.
 
     Note
     ----
@@ -217,6 +219,7 @@ class XGBModel(XGBModelBase):
     def save_model(self, fname):
         """
         Save the model to a file.
+
         Parameters
         ----------
         fname : string
@@ -227,6 +230,7 @@ class XGBModel(XGBModelBase):
     def load_model(self, fname):
         """
         Load the model from a file.
+
         Parameters
         ----------
         fname : string or a memory buffer
@@ -336,6 +340,39 @@ class XGBModel(XGBModelBase):
         return self
 
     def predict(self, data, output_margin=False, ntree_limit=None):
+        """
+        Predict with `data`.
+
+        .. note:: This function is not thread safe.
+
+          For each booster object, predict can only be called from one thread.
+          If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
+          of model object and then call ``predict()``.
+
+        .. note:: Using ``predict()`` with DART booster
+
+          If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
+          some of the trees will be evaluated. This will produce incorrect results if ``data`` is
+          not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
+          a nonzero value, e.g.
+
+          .. code-block:: python
+
+            preds = bst.predict(dtest, ntree_limit=num_round)
+
+        Parameters
+        ----------
+        data : DMatrix
+            The dmatrix storing the input.
+        output_margin : bool
+            Whether to output the raw untransformed margin value.
+        ntree_limit : int
+            Limit number of trees in the prediction; defaults to best_ntree_limit if defined
+            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+        Returns
+        -------
+        prediction : numpy array
+        """
         # pylint: disable=missing-docstring,invalid-name
         test_dmatrix = DMatrix(data, missing=self.missing, nthread=self.n_jobs)
         # get ntree_limit to use - if none specified, default to
@@ -372,10 +409,10 @@ class XGBModel(XGBModelBase):
     def evals_result(self):
         """Return the evaluation results.
 
-        If eval_set is passed to the `fit` function, you can call evals_result() to
-        get evaluation results for all passed eval_sets. When eval_metric is also
-        passed to the `fit` function, the evals_result will contain the eval_metrics
-        passed to the `fit` function
+        If ``eval_set`` is passed to the `fit` function, you can call ``evals_result()`` to
+        get evaluation results for all passed eval_sets. When ``eval_metric`` is also
+        passed to the ``fit`` function, the ``evals_result`` will contain the ``eval_metrics``
+        passed to the ``fit`` function
 
         Returns
         -------
@@ -383,20 +420,26 @@ class XGBModel(XGBModelBase):
 
         Example
         -------
-        param_dist = {'objective':'binary:logistic', 'n_estimators':2}
-        clf = xgb.XGBModel(**param_dist)
+        .. code-block:: python
 
-        clf.fit(X_train, y_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                eval_metric='logloss',
-                verbose=True)
+            param_dist = {'objective':'binary:logistic', 'n_estimators':2}
 
-        evals_result = clf.evals_result()
+            clf = xgb.XGBModel(**param_dist)
+
+            clf.fit(X_train, y_train,
+                    eval_set=[(X_train, y_train), (X_test, y_test)],
+                    eval_metric='logloss',
+                    verbose=True)
+
+            evals_result = clf.evals_result()
 
         The variable evals_result will contain:
-        {'validation_0': {'logloss': ['0.604835', '0.531479']},
-        'validation_1': {'logloss': ['0.41965', '0.17686']}}
+
+        .. code-block:: none
+
+            {'validation_0': {'logloss': ['0.604835', '0.531479']},
+            'validation_1': {'logloss': ['0.41965', '0.17686']}}
         """
         if self.evals_result_:
             evals_result = self.evals_result_
@@ -408,9 +451,11 @@ class XGBModel(XGBModelBase):
     @property
     def feature_importances_(self):
         """
+        Feature importances property
+
        Returns
        -------
-        feature_importances_ : array of shape = [n_features]
+        feature_importances_ : array of shape ``[n_features]``
         """
         b = self.get_booster()
@@ -422,9 +467,8 @@ class XGBModel(XGBModelBase):
 
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
-    __doc__ = """Implementation of the scikit-learn API for XGBoost classification.
-
-    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
+    __doc__ = "Implementation of the scikit-learn API for XGBoost classification.\n\n" \
+        + '\n'.join(XGBModel.__doc__.split('\n')[2:])
 
     def __init__(self, max_depth=3, learning_rate=0.1,
                  n_estimators=100, silent=True,
@@ -610,10 +654,13 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
     def predict_proba(self, data, ntree_limit=None):
         """
         Predict the probability of each `data` example being of a given class.
-        NOTE: This function is not thread safe.
-        For each booster object, predict can only be called from one thread.
-        If you want to run prediction using multiple thread, call xgb.copy() to make copies
-        of model object and then call predict
+
+        .. note:: This function is not thread safe
+
+            For each booster object, predict can only be called from one thread.
+            If you want to run prediction using multiple thread, call ``xgb.copy()`` to make copies
+            of model object and then call predict
+
         Parameters
         ----------
         data : DMatrix
@@ -621,6 +668,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
         ntree_limit : int
             Limit number of trees in the prediction; defaults to best_ntree_limit if defined
            (i.e. it has been trained with early stopping), otherwise 0 (use all trees).
+
         Returns
         -------
         prediction : numpy array
@@ -652,20 +700,26 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
 
         Example
         -------
-        param_dist = {'objective':'binary:logistic', 'n_estimators':2}
-        clf = xgb.XGBClassifier(**param_dist)
+        .. code-block:: python
 
-        clf.fit(X_train, y_train,
-                eval_set=[(X_train, y_train), (X_test, y_test)],
-                eval_metric='logloss',
-                verbose=True)
+            param_dist = {'objective':'binary:logistic', 'n_estimators':2}
 
-        evals_result = clf.evals_result()
+            clf = xgb.XGBClassifier(**param_dist)
 
-        The variable evals_result will contain:
-        {'validation_0': {'logloss': ['0.604835', '0.531479']},
-        'validation_1': {'logloss': ['0.41965', '0.17686']}}
+            clf.fit(X_train, y_train,
+                    eval_set=[(X_train, y_train), (X_test, y_test)],
+                    eval_metric='logloss',
+                    verbose=True)
+
+            evals_result = clf.evals_result()
+
+        The variable ``evals_result`` will contain
+
+        .. code-block:: none
+
+            {'validation_0': {'logloss': ['0.604835', '0.531479']},
+            'validation_1': {'logloss': ['0.41965', '0.17686']}}
         """
         if self.evals_result_:
             evals_result = self.evals_result_
@@ -677,8 +731,8 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
 
 class XGBRegressor(XGBModel, XGBRegressorBase):
     # pylint: disable=missing-docstring
-    __doc__ = """Implementation of the scikit-learn API for XGBoost regression.
-    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
+    __doc__ = "Implementation of the scikit-learn API for XGBoost regression.\n\n"\
+        + '\n'.join(XGBModel.__doc__.split('\n')[2:])
 
 
 class XGBRanker(XGBModel):
@@ -731,14 +785,16 @@ class XGBRanker(XGBModel):
     missing : float, optional
         Value in the data which needs to be present as a missing value. If
         None, defaults to np.nan.
-    **kwargs : dict, optional
+    \*\*kwargs : dict, optional
         Keyword arguments for XGBoost Booster object.  Full documentation of parameters can
         be found here: https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst.
-        Attempting to set a parameter via the constructor args and **kwargs dict simultaneously
-        will result in a TypeError.
-        Note:
-            **kwargs is unsupported by Sklearn.  We do not guarantee that parameters passed via
-            this argument will interact properly with Sklearn.
+        Attempting to set a parameter via the constructor args and \*\*kwargs dict
+        simultaneously will result in a TypeError.
+
+        .. note:: \*\*kwargs unsupported by scikit-learn
+
+            \*\*kwargs is unsupported by scikit-learn.  We do not guarantee that parameters
+            passed via this argument will interact properly with scikit-learn.
 
     Note
     ----
@@ -750,16 +806,25 @@ class XGBRanker(XGBModel):
 
     For example, if your original data look like:
 
+    +-------+-----------+---------------+
     |   qid |   label   |   features    |
+    +-------+-----------+---------------+
     |   1   |   0       |   x_1         |
+    +-------+-----------+---------------+
     |   1   |   1       |   x_2         |
+    +-------+-----------+---------------+
     |   1   |   0       |   x_3         |
+    +-------+-----------+---------------+
     |   2   |   0       |   x_4         |
+    +-------+-----------+---------------+
     |   2   |   1       |   x_5         |
+    +-------+-----------+---------------+
     |   2   |   1       |   x_6         |
+    +-------+-----------+---------------+
     |   2   |   1       |   x_7         |
+    +-------+-----------+---------------+
 
-    then your group array should be [3, 4].
+    then your group array should be ``[3, 4]``.
""" def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, @@ -908,3 +973,5 @@ class XGBRanker(XGBModel): return self.get_booster().predict(test_dmatrix, output_margin=output_margin, ntree_limit=ntree_limit) + + predict.__doc__ = XGBModel.predict.__doc__ diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 3216aa8a2..eb568da65 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -147,18 +147,24 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, and/or num_class appears in the parameters) evals_result: dict This dictionary stores the evaluation results of all the items in watchlist. + Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and - a parameter containing ('eval_metric': 'logloss') - Returns: {'train': {'logloss': ['0.48253', '0.35953']}, - 'eval': {'logloss': ['0.480385', '0.357756']}} + a parameter containing ('eval_metric': 'logloss'), the **evals_result** + returns + + .. code-block:: none + + {'train': {'logloss': ['0.48253', '0.35953']}, + 'eval': {'logloss': ['0.480385', '0.357756']}} + verbose_eval : bool or int Requires at least one item in evals. - If `verbose_eval` is True then the evaluation metric on the validation set is + If **verbose_eval** is True then the evaluation metric on the validation set is printed at each boosting stage. - If `verbose_eval` is an integer then the evaluation metric on the validation set - is printed at every given `verbose_eval` boosting stage. The last boosting stage - / the boosting stage found by using `early_stopping_rounds` is also printed. - Example: with verbose_eval=4 and at least one item in evals, an evaluation metric + If **verbose_eval** is an integer then the evaluation metric on the validation set + is printed at every given **verbose_eval** boosting stage. The last boosting stage + / the boosting stage found by using **early_stopping_rounds** is also printed. + Example: with ``verbose_eval=4`` and at least one item in evals, an evaluation metric is printed every 4 boosting stages, instead of every boosting stage. learning_rates: list or function (deprecated - use callback API instead) List of learning rate for each boosting round @@ -328,10 +334,10 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None folds : a KFold or StratifiedKFold instance or list of fold indices Sklearn KFolds or StratifiedKFolds object. Alternatively may explicitly pass sample indices for each fold. - For `n` folds, `folds` should be a length `n` list of tuples. - Each tuple is `(in,out)` where `in` is a list of indices to be used - as the training samples for the `n`th fold and `out` is a list of - indices to be used as the testing samples for the `n`th fold. + For ``n`` folds, ``folds`` should be a length ``n`` list of tuples. + Each tuple is ``(in,out)`` where ``in`` is a list of indices to be used + as the training samples for the ``n`` th fold and ``out`` is a list of + indices to be used as the testing samples for the ``n`` th fold. metrics : string or list of strings Evaluation metrics to be watched in CV. obj : function @@ -363,8 +369,12 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None callbacks : list of callback functions List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using xgb.callback module. 
-        Example: [xgb.callback.reset_learning_rate(custom_rates)]
-    shuffle : bool
+        Example:
+
+        .. code-block:: none
+
+            [xgb.callback.reset_learning_rate(custom_rates)]
+    shuffle : bool
         Shuffle data before creating folds.
 
     Returns
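
For reference, the ``group`` array documented in the new ``XGBRanker`` docstring can be exercised as below. This is an illustrative sketch, not part of the patch: it assumes ``XGBRanker.fit`` accepts the per-query group sizes as a third ``group`` argument, matching the ``[3, 4]`` example in the docstring, and it uses synthetic data.

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)

    # Seven rows split into two queries, mirroring the qid/label table
    # in the XGBRanker docstring: query 1 has 3 rows, query 2 has 4 rows.
    X = rng.rand(7, 5)                     # feature vectors x_1 ... x_7
    y = np.array([0, 1, 0, 0, 1, 1, 1])    # per-row relevance labels
    group = [3, 4]                         # rows per query, in order

    ranker = xgb.XGBRanker(n_estimators=10, max_depth=3)
    ranker.fit(X, y, group)                # assumed call shape; see note above
    print(ranker.predict(X))               # per-row ranking scores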
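
The reworked ``Booster.get_score`` docstring lists five importance types. A self-contained sketch of how they can be compared on a small booster follows; the data is synthetic and purely illustrative, and the ``DMatrix``/``train`` calls are the standard low-level API rather than anything added by this patch.

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)
    y = (X[:, 0] > 0.5).astype(int)

    dtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train({'max_depth': 3, 'objective': 'binary:logistic'},
                    dtrain, num_boost_round=10)

    # The five importance types named in the updated get_score docstring.
    for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
        print(imp_type, bst.get_score(importance_type=imp_type))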