Merge pull request #5 from dmlc/master

Update to latest version
Johan Manders
2015-11-03 20:37:20 +01:00
95 changed files with 2074 additions and 962 deletions

View File

@@ -11,6 +11,8 @@
pushd xgboost
#remove the pre-compiled .so and trigger the system's on-the-fly compiling
make clean
if make python; then
echo "Successfully build multi-thread xgboost"
else

View File

@@ -249,7 +249,7 @@ class DMatrix(object):
csr = scipy.sparse.csr_matrix(data)
self._init_from_csr(csr)
except:
raise TypeError('can not intialize DMatrix from {}'.format(type(data).__name__))
raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__))
if label is not None:
self.set_label(label)
if weight is not None:
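The fix above only touches the spelling of the TypeError message; the fallback behavior is unchanged: input that is not handled directly is coerced through scipy.sparse.csr_matrix inside the try block, and the TypeError fires only when that coercion fails. A minimal sketch of both paths, assuming (as in this version) that plain Python lists fall through to the coercion:

    import numpy as np
    import xgboost as xgb

    # A list of lists is not handled directly, so it is coerced via
    # scipy.sparse.csr_matrix inside the try block shown above.
    dtrain = xgb.DMatrix([[1, 2, 3], [4, 5, 6]], label=np.array([0, 1]))

    # Something csr_matrix cannot digest triggers the (now correctly
    # spelled) TypeError: "can not initialize DMatrix from object".
    xgb.DMatrix(object())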

View File

@@ -36,9 +36,10 @@ def find_lib_path():
else:
dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
#From GitHub issues, most installation errors come from machines w/o compilers
if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
raise XGBoostLibraryNotFound(
'Cannot find XGBoost Library in the candidate path, ' +
'did you run build.sh in root path?\n'
'did you install compilers and run build.sh in root path?\n'
'List of candidates:\n' + ('\n'.join(dll_path)))
return lib_path
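The new guard also preserves the XGBOOST_BUILD_DOC escape hatch visible in the hunk: when that environment variable is set, a missing compiled library makes find_lib_path() return an empty list instead of raising. A hedged sketch of how a documentation build might use it (whether the rest of the import then succeeds is outside this diff):

    import os

    # With this set, find_lib_path() no longer raises XGBoostLibraryNotFound
    # on machines without a compiled libxgboostwrapper.so.
    os.environ['XGBOOST_BUILD_DOC'] = '1'
    import xgboost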

View File

@@ -7,11 +7,12 @@ from __future__ import absolute_import
import re
import numpy as np
from .core import Booster
from .sklearn import XGBModel
from io import BytesIO
def plot_importance(booster, ax=None, height=0.2,
xlim=None, title='Feature importance',
xlim=None, ylim=None, title='Feature importance',
xlabel='F score', ylabel='Features',
grid=True, **kwargs):
@@ -19,14 +20,16 @@ def plot_importance(booster, ax=None, height=0.2,
Parameters
----------
booster : Booster or dict
Booster instance, or dict taken by Booster.get_fscore()
booster : Booster, XGBModel or dict
Booster or XGBModel instance, or dict returned by Booster.get_fscore()
ax : matplotlib Axes, default None
Target axes instance. If None, new figure and axes will be created.
height : float, default 0.2
Bar height, passed to ax.barh()
xlim : tuple, default None
Tuple passed to axes.xlim()
ylim : tuple, default None
Tuple passed to axes.ylim()
title : str, default "Feature importance"
Axes title. To disable, pass None.
xlabel : str, default "F score"
@@ -46,12 +49,14 @@ def plot_importance(booster, ax=None, height=0.2,
except ImportError:
raise ImportError('You must install matplotlib to plot importance')
if isinstance(booster, Booster):
if isinstance(booster, XGBModel):
importance = booster.booster().get_fscore()
elif isinstance(booster, Booster):
importance = booster.get_fscore()
elif isinstance(booster, dict):
importance = booster
else:
raise ValueError('tree must be Booster or dict instance')
raise ValueError('tree must be Booster, XGBModel or dict instance')
if len(importance) == 0:
raise ValueError('Booster.get_fscore() returned no feature importances')
@@ -73,12 +78,19 @@ def plot_importance(booster, ax=None, height=0.2,
ax.set_yticklabels(labels)
if xlim is not None:
if not isinstance(xlim, tuple) or len(xlim, 2):
if not isinstance(xlim, tuple) or len(xlim) != 2:
raise ValueError('xlim must be a tuple of 2 elements')
else:
xlim = (0, max(values) * 1.1)
ax.set_xlim(xlim)
if ylim is not None:
if not isinstance(ylim, tuple) or len(ylim) != 2:
raise ValueError('ylim must be a tuple of 2 elements')
else:
ylim = (-1, len(importance))
ax.set_ylim(ylim)
if title is not None:
ax.set_title(title)
if xlabel is not None:
@@ -142,8 +154,8 @@ def to_graphviz(booster, num_trees=0, rankdir='UT',
Parameters
----------
booster : Booster
Booster instance
booster : Booster, XGBModel
Booster or XGBModel instance
num_trees : int, default 0
Specify the ordinal number of target tree
rankdir : str, default "UT"
@@ -165,8 +177,11 @@ def to_graphviz(booster, num_trees=0, rankdir='UT',
except ImportError:
raise ImportError('You must install graphviz to plot tree')
if not isinstance(booster, Booster):
raise ValueError('booster must be Booster instance')
if not isinstance(booster, (Booster, XGBModel)):
raise ValueError('booster must be Booster or XGBModel instance')
if isinstance(booster, XGBModel):
booster = booster.booster()
tree = booster.get_dump()[num_trees]
tree = tree.split()
@@ -193,8 +208,8 @@ def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs):
Parameters
----------
booster : Booster
Booster instance
booster : Booster, XGBModel
Booster or XGBModel instance
num_trees : int, default 0
Specify the ordinal number of target tree
rankdir : str, default "UT"
@@ -216,7 +231,6 @@ def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs):
except ImportError:
raise ImportError('You must install matplotlib to plot tree')
if ax is None:
_, ax = plt.subplots(1, 1)
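Taken together, the plotting hunks let both helpers accept a fitted sklearn-style model directly instead of requiring the underlying Booster, and add ylim as the counterpart to xlim. A short usage sketch, assuming the helpers are exported at package level and matplotlib/graphviz are installed (the toy data is illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 5)
    y = np.random.randint(2, size=100)
    model = xgb.XGBClassifier(n_estimators=10).fit(X, y)

    # Both calls now accept the XGBModel itself; previously they required
    # model.booster(). ylim works like xlim and defaults to (-1, n_features).
    xgb.plot_importance(model, ylim=(-1, 5))
    xgb.plot_tree(model, num_trees=0)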

View File

@@ -54,6 +54,14 @@ class XGBModel(XGBModelBase):
Subsample ratio of the training instance.
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
colsample_bylevel : float
Subsample ratio of columns for each split, in each level.
reg_alpha : float (xgb's alpha)
L1 regularization term on weights
reg_lambda : float (xgb's lambda)
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
base_score : float
The initial prediction score of all instances, global bias.
@@ -66,7 +74,8 @@ class XGBModel(XGBModelBase):
def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
silent=True, objective="reg:linear",
nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
subsample=1, colsample_bytree=1,
subsample=1, colsample_bytree=1, colsample_bylevel=1,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
base_score=0.5, seed=0, missing=None):
if not SKLEARN_INSTALLED:
raise XGBoostError('sklearn needs to be installed in order to use this module')
@@ -82,6 +91,10 @@ class XGBModel(XGBModelBase):
self.max_delta_step = max_delta_step
self.subsample = subsample
self.colsample_bytree = colsample_bytree
self.colsample_bylevel = colsample_bylevel
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.scale_pos_weight = scale_pos_weight
self.base_score = base_score
self.seed = seed
@@ -190,7 +203,7 @@ class XGBModel(XGBModelBase):
if evals_result:
for val in evals_result.items():
evals_result_key = val[1].keys()[0]
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
@@ -199,10 +212,12 @@ class XGBModel(XGBModelBase):
self.best_iteration = self._Booster.best_iteration
return self
def predict(self, data):
def predict(self, data, output_margin=False, ntree_limit=0):
# pylint: disable=missing-docstring,invalid-name
test_dmatrix = DMatrix(data, missing=self.missing)
return self.booster().predict(test_dmatrix)
return self.booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit)
def evals_result(self):
"""Return the evaluation results.
@@ -251,14 +266,16 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
n_estimators=100, silent=True,
objective="binary:logistic",
nthread=-1, gamma=0, min_child_weight=1,
max_delta_step=0, subsample=1, colsample_bytree=1,
max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
base_score=0.5, seed=0, missing=None):
super(XGBClassifier, self).__init__(max_depth, learning_rate,
n_estimators, silent, objective,
nthread, gamma, min_child_weight,
max_delta_step, subsample,
colsample_bytree,
base_score, seed, missing)
colsample_bytree, colsample_bylevel,
reg_alpha, reg_lambda,
scale_pos_weight, base_score, seed, missing)
def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True):
@@ -341,7 +358,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
if evals_result:
for val in evals_result.items():
evals_result_key = val[1].keys()[0]
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
@@ -351,9 +368,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
return self
def predict(self, data):
def predict(self, data, output_margin=False, ntree_limit=0):
test_dmatrix = DMatrix(data, missing=self.missing)
class_probs = self.booster().predict(test_dmatrix)
class_probs = self.booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
@@ -361,9 +380,11 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
def predict_proba(self, data):
def predict_proba(self, data, output_margin=False, ntree_limit=0):
test_dmatrix = DMatrix(data, missing=self.missing)
class_probs = self.booster().predict(test_dmatrix)
class_probs = self.booster().predict(test_dmatrix,
output_margin=output_margin,
ntree_limit=ntree_limit)
if self.objective == "multi:softprob":
return class_probs
else:
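With these changes the sklearn wrapper exposes the remaining core learning parameters and forwards the Booster's prediction options. A sketch of the new surface (the dataset is illustrative):

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(200, 10)
    y = np.random.randint(2, size=200)
    clf = xgb.XGBClassifier(n_estimators=50,
                            colsample_bylevel=0.8,  # new: per-level column subsampling
                            reg_alpha=0.1,          # new: L1 term (xgb's alpha)
                            reg_lambda=1.0,         # new: L2 term (xgb's lambda)
                            scale_pos_weight=2.0)   # new: positive/negative balance
    clf.fit(X, y)

    # predict and predict_proba now forward output_margin and ntree_limit
    # to Booster.predict; here only the first 25 trees are used.
    probs = clf.predict_proba(X, ntree_limit=25)
    preds = clf.predict(X, ntree_limit=25)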

View File

@@ -10,7 +10,8 @@ import numpy as np
from .core import Booster, STRING_TYPES
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
early_stopping_rounds=None, evals_result=None, verbose_eval=True):
maximize=False, early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, xgb_model=None):
# pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
"""Train a booster with given parameters.
@@ -29,6 +30,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
Customized objective function.
feval : function
Customized evaluation function.
maximize : bool
Whether to maximize feval.
early_stopping_rounds: int
Activates early stopping. Validation error needs to decrease at least
every <early_stopping_rounds> round(s) to continue training.
@@ -46,13 +49,27 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
verbose_eval : bool
If `verbose_eval` then the evaluation metric on the validation set, if
given, is printed at each boosting stage.
learning_rates: list or function
Learning rate for each boosting round (yields learning rate decay).
- list l: eta = l[boosting round]
- function f: eta = f(boosting round, num_boost_round)
xgb_model : file name of stored xgb model or 'Booster' instance
Xgb model to be loaded before training (allows training continuation).
Returns
-------
booster : a trained booster model
"""
evals = list(evals)
bst = Booster(params, [dtrain] + [d[0] for d in evals])
ntrees = 0
if xgb_model is not None:
if not isinstance(xgb_model, STRING_TYPES):
xgb_model = xgb_model.save_raw()
bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model)
ntrees = len(bst.get_dump())
else:
bst = Booster(params, [dtrain] + [d[0] for d in evals])
if evals_result is not None:
if not isinstance(evals_result, dict):
@@ -65,6 +82,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
if not early_stopping_rounds:
for i in range(num_boost_round):
bst.update(dtrain, i, obj)
ntrees += 1
if len(evals) != 0:
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, STRING_TYPES):
@@ -78,7 +96,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
for key in evals_name:
evals_idx = evals_name.index(key)
res_per_eval = len(res) / len(evals_name)
res_per_eval = len(res) // len(evals_name)
for r in range(res_per_eval):
res_item = res[(evals_idx*res_per_eval) + r]
res_key = res_item[0]
@@ -87,6 +105,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
evals_result[key][res_key].append(res_val)
else:
evals_result[key][res_key] = [res_val]
bst.best_iteration = (ntrees - 1)
return bst
else:
@@ -94,7 +113,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
if len(evals) < 1:
raise ValueError('For early stopping you need at least one set in evals.')
sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
if verbose_eval:
sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
evals[-1][1], early_stopping_rounds))
# is params a list of tuples? are we using multiple eval metrics?
@@ -110,6 +130,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
maximize_metrics = ('auc', 'map', 'ndcg')
if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
maximize_score = True
if feval is not None:
maximize_score = maximize
if maximize_score:
best_score = 0.0
@@ -117,10 +139,19 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
best_score = float('inf')
best_msg = ''
best_score_i = 0
best_score_i = ntrees
if isinstance(learning_rates, list) and len(learning_rates) != num_boost_round:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
for i in range(num_boost_round):
if learning_rates is not None:
if isinstance(learning_rates, list):
bst.set_param({'eta': learning_rates[i]})
else:
bst.set_param({'eta': learning_rates(i, num_boost_round)})
bst.update(dtrain, i, obj)
ntrees += 1
bst_eval_set = bst.eval_set(evals, i, feval)
if isinstance(bst_eval_set, STRING_TYPES):
@@ -135,7 +166,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg)
for key in evals_name:
evals_idx = evals_name.index(key)
res_per_eval = len(res) / len(evals_name)
res_per_eval = len(res) // len(evals_name)
for r in range(res_per_eval):
res_item = res[(evals_idx*res_per_eval) + r]
res_key = res_item[0]
@@ -149,7 +180,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
if (maximize_score and score > best_score) or \
(not maximize_score and score < best_score):
best_score = score
best_score_i = i
best_score_i = (ntrees - 1)
best_msg = msg
elif i - best_score_i >= early_stopping_rounds:
sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))