From f9e157011fdb7fb022543fbd6eb004477260df11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zygmunt=20Zaj=C4=85c?=
Date: Mon, 30 Mar 2015 19:53:47 +0200
Subject: [PATCH 1/5] early stopping for Python wrapper

---
 wrapper/xgboost.py | 72 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 5 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index affda3ca7..5a5d59b11 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -1,7 +1,10 @@
+# coding: utf-8
+
 """
 xgboost: eXtreme Gradient Boosting library
 
 Authors: Tianqi Chen, Bing Xu
+Early stopping by Zygmunt Zając
 """
 from __future__ import absolute_import
@@ -529,6 +532,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
         Data to be trained.
     num_boost_round: int
         Number of boosting iterations.
+        If negative, train until validation error hasn't decreased in -num_boost_round rounds.
+        Requires at least one item in evals. If there's more than one, will use the last.
     watchlist : list of pairs (DMatrix, string)
         List of items to be evaluated during training, this allows user to watch performance on the validation set.
@@ -541,16 +546,73 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
     -------
     booster : a trained booster model
     """
+
+    if num_boost_round < 0 and len(evals) < 1:
+        raise ValueError('For early stopping you need at least one set in evals.')
+
     evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
-    for i in range(num_boost_round):
-        bst.update(dtrain, i, obj)
-        if len(evals) != 0:
+
+    if num_boost_round >= 0:
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            if len(evals) != 0:
+                bst_eval_set = bst.eval_set(evals, i, feval)
+                if isinstance(bst_eval_set, string_types):
+                    sys.stderr.write(bst_eval_set + '\n')
+                else:
+                    sys.stderr.write(bst_eval_set.decode() + '\n')
+    else:
+        # early stopping
+
+        # TODO: return model from the best iteration
+        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], -num_boost_round))
+
+        # is params a list of tuples? are we using multiple eval metrics?
+        if type(params) == list:
+            if len(params) != len(dict(params).items()):
+                raise ValueError('Check your params. Early stopping works with single eval metric only.')
+            params = dict(params)
+
+        # either minimize loss or maximize AUC/MAP/NDCG
+        maximize_score = False
+        if 'eval_metric' in params:
+            maximize_metrics = ('auc', 'map', 'ndcg')
+            if filter( lambda x: params['eval_metric'].startswith(x), maximize_metrics ):
+                maximize_score = True
+
+        if maximize_score:
+            best_score = 0.0
+        else:
+            best_score = float('inf')
+
+        best_msg = ''
+        best_score_i = 0
+        i = 0
+
+        while True:
+            bst.update(dtrain, i, obj)
             bst_eval_set = bst.eval_set(evals, i, feval)
+
             if isinstance(bst_eval_set, string_types):
-                sys.stderr.write(bst_eval_set + '\n')
+                msg = bst_eval_set
             else:
-                sys.stderr.write(bst_eval_set.decode() + '\n')
+                msg = bst_eval_set.decode()
+
+            sys.stderr.write(msg + '\n')
+
+            score = float(msg.rsplit( ':', 1 )[1])
+            if (maximize_score and score > best_score) or \
+                (not maximize_score and score < best_score):
+                best_score = score
+                best_score_i = i
+                best_msg = msg
+            elif i - best_score_i >= -num_boost_round:
+                sys.stderr.write("Stopping. Best iteration:\n{}".format(best_msg))
+                break
+
+            i += 1
+
     return bst
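A usage sketch for the API patch 1 introduces: a negative num_boost_round trains until the metric on the last eval set has not improved for that many rounds. File names and parameter values below are illustrative placeholders, not part of the patch:

    import xgboost as xgb

    dtrain = xgb.DMatrix('train.svm.txt')  # placeholder LibSVM training file
    dval = xgb.DMatrix('val.svm.txt')      # placeholder validation file
    params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}

    # Boost until validation logloss has not improved for 50 consecutive
    # rounds; early stopping watches the last pair in evals ('val' here).
    bst = xgb.train(params, dtrain, num_boost_round=-50,
                    evals=[(dtrain, 'train'), (dval, 'val')])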
From 79948586978a5d9736e8c1372d0f608294476e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zygmunt=20Zaj=C4=85c?=
Date: Mon, 30 Mar 2015 19:58:25 +0200
Subject: [PATCH 2/5] early stopping for Python wrapper

---
 wrapper/xgboost.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 5a5d59b11..a2ecacbe2 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -577,8 +577,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
         # either minimize loss or maximize AUC/MAP/NDCG
         maximize_score = False
         if 'eval_metric' in params:
-            maximize_metrics = ('auc', 'map', 'ndcg')
-            if filter( lambda x: params['eval_metric'].startswith(x), maximize_metrics ):
+            maximize_metrics = ('auc', 'map', 'ndcg')
+            if filter( lambda x: params['eval_metric'].startswith(x), maximize_metrics ):
                 maximize_score = True
 
         if maximize_score:

From 39093bc432e5981f2c11b53617690a5214ad22dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zygmunt=20Zaj=C4=85c?=
Date: Mon, 30 Mar 2015 19:59:09 +0200
Subject: [PATCH 3/5] early stopping for Python wrapper

---
 wrapper/xgboost.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index a2ecacbe2..da891ab6c 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -578,7 +578,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
         maximize_score = False
         if 'eval_metric' in params:
             maximize_metrics = ('auc', 'map', 'ndcg')
-            if filter( lambda x: params['eval_metric'].startswith(x), maximize_metrics ):
+            if filter(lambda x: params['eval_metric'].startswith(x), maximize_metrics):
                 maximize_score = True
 
         if maximize_score:
@@ -601,7 +601,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
 
             sys.stderr.write(msg + '\n')
 
-            score = float(msg.rsplit( ':', 1 )[1])
+            score = float(msg.rsplit(':', 1)[1])
             if (maximize_score and score > best_score) or \
                 (not maximize_score and score < best_score):
                 best_score = score
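One caveat about the metric check that patches 2 and 3 reformat: under Python 3, filter() returns a lazy iterator that is truthy even when nothing matches, so maximize_score would be set for every eval_metric. A version-agnostic sketch of an equivalent test, offered as a suggestion rather than as part of this series:

    # True only if eval_metric actually starts with a maximized metric name.
    maximize_metrics = ('auc', 'map', 'ndcg')
    maximize_score = any(params['eval_metric'].startswith(x)
                         for x in maximize_metrics)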
From d7f9499f88ee16e098dd6f9d6a79983376d53c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zygmunt=20Zaj=C4=85c?=
Date: Thu, 2 Apr 2015 19:43:30 +0200
Subject: [PATCH 4/5] early_stopping_rounds for train() in Python wrapper :fire:

---
 wrapper/xgboost.py | 45 ++++++++++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index da891ab6c..dd39acc9a 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -520,7 +520,7 @@ class Booster(object):
         return fmap
 
 
-def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
+def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, early_stopping_rounds=None):
     """
     Train a booster with given parameters.
 
@@ -532,28 +532,31 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
         Data to be trained.
     num_boost_round: int
         Number of boosting iterations.
-        If negative, train until validation error hasn't decreased in -num_boost_round rounds.
-        Requires at least one item in evals. If there's more than one, will use the last.
     watchlist : list of pairs (DMatrix, string)
         List of items to be evaluated during training, this allows user to watch performance on the validation set.
-    obj : function
+    obj : function
         Customized objective function.
     feval : function
         Customized evaluation function.
+    early_stopping_rounds: int
+        Activates early stopping. Validation error needs to decrease at least
+        every <early_stopping_rounds> round(s) to continue training.
+        Requires at least one item in evals.
+        If there's more than one, will use the last.
+        Returns the model from the last iteration (not the best one).
+        If early stopping occurs, the model will have two additional fields:
+        bst.best_score and bst.best_iteration.
 
     Returns
     -------
     booster : a trained booster model
     """
 
-    if num_boost_round < 0 and len(evals) < 1:
-        raise ValueError('For early stopping you need at least one set in evals.')
-
     evals = list(evals)
     bst = Booster(params, [dtrain] + [d[0] for d in evals])
 
-    if num_boost_round >= 0:
+    if not early_stopping_rounds:
         for i in range(num_boost_round):
             bst.update(dtrain, i, obj)
             if len(evals) != 0:
@@ -562,11 +565,15 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
                     sys.stderr.write(bst_eval_set + '\n')
                 else:
                     sys.stderr.write(bst_eval_set.decode() + '\n')
+        return bst
+
     else:
         # early stopping
 
-        # TODO: return model from the best iteration
-        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], -num_boost_round))
+        if len(evals) < 1:
+            raise ValueError('For early stopping you need at least one set in evals.')
+
+        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(evals[-1][1], early_stopping_rounds))
 
         # is params a list of tuples? are we using multiple eval metrics?
         if type(params) == list:
@@ -588,9 +595,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
 
         best_msg = ''
         best_score_i = 0
-        i = 0
 
-        while True:
+        for i in range(num_boost_round):
             bst.update(dtrain, i, obj)
             bst_eval_set = bst.eval_set(evals, i, feval)
@@ -607,14 +613,15 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None):
                 best_score = score
                 best_score_i = i
                 best_msg = msg
-            elif i - best_score_i >= -num_boost_round:
-                sys.stderr.write("Stopping. Best iteration:\n{}".format(best_msg))
-                break
-
-            i += 1
-
-    return bst
+            elif i - best_score_i >= early_stopping_rounds:
+                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
+                bst.best_score = best_score
+                bst.best_iteration = best_score_i
+                return bst
+
+        return bst
+
 
 class CVPack(object):
     def __init__(self, dtrain, dtest, param):

From d17cdd639fe2c4f2a601650a694b83058aff9415 Mon Sep 17 00:00:00 2001
From: Jamie Hall
Date: Thu, 2 Apr 2015 20:33:07 -0700
Subject: [PATCH 5/5] bugfix

---
 wrapper/xgboost.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py
index 8ce5821f4..4a1e7c895 100644
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -829,7 +829,7 @@ class XGBClassifier(XGBModel, ClassifierMixin):
     def predict_proba(self, X):
         testDmatrix = DMatrix(X)
         class_probs = self._Booster.predict(testDmatrix)
-        if self._yspace == "multiclass":
+        if self.objective == "multi:softprob":
             return class_probs
         else:
             classone_probs = class_probs
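A usage sketch for the final early_stopping_rounds API from patch 4; file names and parameter values are illustrative placeholders:

    import xgboost as xgb

    dtrain = xgb.DMatrix('train.svm.txt')  # placeholder LibSVM training file
    dval = xgb.DMatrix('val.svm.txt')      # placeholder validation file
    params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}

    # AUC is a maximized metric, so training stops once validation AUC has
    # not increased for 10 consecutive rounds (or after 1000 rounds total).
    bst = xgb.train(params, dtrain, num_boost_round=1000,
                    evals=[(dtrain, 'train'), (dval, 'val')],
                    early_stopping_rounds=10)

    # Set only when early stopping actually triggered:
    print(bst.best_score, bst.best_iteration)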