From ad3a0bbab8eb813cfca897945bba01c688ff59d3 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Wed, 12 Sep 2018 08:43:41 -0500 Subject: [PATCH] Add the missing max_delta_step (#3668) * add max_delta_step to SplitEvaluator * test for max_delta_step * missing x2 factor for L1 term * remove gamma from ElasticNet --- R-package/tests/testthat/test_basic.R | 14 ++++++++++++ src/tree/split_evaluator.cc | 31 ++++++++++++++++++--------- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index e7a6679d3..c9cf73581 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -223,3 +223,17 @@ test_that("train and predict with non-strict classes", { expect_error(pr <- predict(bst, train_dense), regexp = NA) expect_equal(pr0, pr) }) + +test_that("max_delta_step works", { + dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) + watchlist <- list(train = dtrain) + param <- list(objective = "binary:logistic", eval_metric="logloss", max_depth = 2, nthread = 2, eta = 0.5) + nrounds = 5 + # model with no restriction on max_delta_step + bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1) + # model with restricted max_delta_step + bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1) + # the no-restriction model is expected to have consistently lower loss during the initial iterations + expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) + expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8) +}) diff --git a/src/tree/split_evaluator.cc b/src/tree/split_evaluator.cc index 1c161d7ea..8b67ab107 100644 --- a/src/tree/split_evaluator.cc +++ b/src/tree/split_evaluator.cc @@ -63,7 +63,10 @@ bst_float SplitEvaluator::ComputeSplitScore(bst_uint nodeid, struct ElasticNetParams : public dmlc::Parameter { bst_float
reg_lambda; bst_float reg_alpha; - bst_float reg_gamma; + // maximum delta update we can add in weight estimation + // this parameter can be used to stabilize update + // default=0 means no constraint on weight delta + float max_delta_step; DMLC_DECLARE_PARAMETER(ElasticNetParams) { DMLC_DECLARE_FIELD(reg_lambda) @@ -74,13 +77,13 @@ struct ElasticNetParams : public dmlc::Parameter { .set_lower_bound(0.0) .set_default(0.0) .describe("L1 regularization on leaf weight"); - DMLC_DECLARE_FIELD(reg_gamma) - .set_lower_bound(0.0) - .set_default(0.0) - .describe("Cost incurred by adding a new leaf node to the tree"); + DMLC_DECLARE_FIELD(max_delta_step) + .set_lower_bound(0.0f) + .set_default(0.0f) + .describe("Maximum delta step we allow each tree's weight estimate to be. "\ + "If the value is set to 0, it means there is no constraint"); DMLC_DECLARE_ALIAS(reg_lambda, lambda); DMLC_DECLARE_ALIAS(reg_alpha, alpha); - DMLC_DECLARE_ALIAS(reg_gamma, gamma); } }; @@ -127,17 +130,25 @@ class ElasticNet final : public SplitEvaluator { const override { auto loss = weight * (2.0 * stats.sum_grad + stats.sum_hess * weight + params_.reg_lambda * weight) - + params_.reg_alpha * std::abs(weight); + + 2.0 * params_.reg_alpha * std::abs(weight); return -loss; } bst_float ComputeScore(bst_uint parentID, const GradStats &stats) const { - return Sqr(ThresholdL1(stats.sum_grad)) / (stats.sum_hess + params_.reg_lambda); + if (params_.max_delta_step == 0.0f) { + return Sqr(ThresholdL1(stats.sum_grad)) / (stats.sum_hess + params_.reg_lambda); + } else { + return ComputeScore(parentID, stats, ComputeWeight(parentID, stats)); + } } bst_float ComputeWeight(bst_uint parentID, const GradStats& stats) const override { - return -ThresholdL1(stats.sum_grad) / (stats.sum_hess + params_.reg_lambda); + bst_float w = -ThresholdL1(stats.sum_grad) / (stats.sum_hess + params_.reg_lambda); + if (params_.max_delta_step != 0.0f && std::abs(w) > params_.max_delta_step) { + w = 
std::copysign(params_.max_delta_step, w); + } + return w; } private: @@ -155,7 +166,7 @@ class ElasticNet final : public SplitEvaluator { }; XGBOOST_REGISTER_SPLIT_EVALUATOR(ElasticNet, "elastic_net") -.describe("Use an elastic net regulariser and a cost per leaf node") +.describe("Use an elastic net regulariser") .set_body([](std::unique_ptr inner) { return new ElasticNet(std::move(inner)); });