Merge duplicated linear updater parameters. (#4013)

* Merge duplicated linear updater parameters.

* Split up coordinate descent parameter.
Jiaming Yuan 2018-12-22 13:21:49 +08:00 committed by GitHub
parent f75a21af25
commit 85939c6a6e
7 changed files with 151 additions and 208 deletions
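
In short: the three near-identical parameter structs (CoordinateTrainParam, GPUCoordinateTrainParam, ShotgunTrainParam) collapse into a single LinearTrainParam in the new src/linear/param.h, and the coordinate-descent-only top_k field is split out into a small CoordinateParam. A minimal usage sketch of the merged surface — hedged: the updater name and parameter keys are taken from the hunks below, while the wrapper function is illustrative only:

// Hedged sketch, not part of the diff: the coordinate updater now accepts the
// shared keys (eta/lambda/alpha/feature_selector) through LinearTrainParam and
// the coordinate-only key top_k through CoordinateParam.
#include <xgboost/linear_updater.h>
#include <memory>

void ConfigureCoordinateUpdater() {
  std::unique_ptr<xgboost::LinearUpdater> updater(
      xgboost::LinearUpdater::Create("coord_descent"));
  updater->Init({{"eta", "0.5"},
                 {"lambda", "0.1"},                // alias of reg_lambda
                 {"feature_selector", "thrifty"},
                 {"top_k", "10"}});                // consumed by CoordinateParam
}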

src/linear/coordinate_common.h

@@ -8,11 +8,24 @@
#include <utility>
#include <vector>
#include <limits>
#include "./param.h"
#include "../common/random.h"
namespace xgboost {
namespace linear {
struct CoordinateParam : public dmlc::Parameter<CoordinateParam> {
int top_k;
DMLC_DECLARE_PARAMETER(CoordinateParam) {
DMLC_DECLARE_FIELD(top_k)
.set_lower_bound(0)
.set_default(0)
.describe("The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
}
};
/**
* \brief Calculate change in weight for a given feature. Applies l1/l2 penalty normalised by the
* number of training instances.
@@ -442,17 +455,6 @@ class ThriftyFeatureSelector : public FeatureSelector {
std::vector<std::pair<double, double>> gpair_sums_;
};
/**
* \brief A set of available FeatureSelectors
*/
enum FeatureSelectorEnum {
kCyclic = 0,
kShuffle,
kThrifty,
kGreedy,
kRandom
};
inline FeatureSelector *FeatureSelector::Create(int choice) {
switch (choice) {
case kCyclic:
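
For orientation: CoordinateParam is declared here so both the CPU and GPU coordinate updaters can share it, while the FeatureSelectorEnum it used to sit next to moves into param.h (shown below). A hedged sketch of how the factory and the top_k plumbing fit together; the Setup call is commented out because model, gpair, and p_fmat are assumed from the surrounding updater code:

// Hedged sketch: Create() dispatches on FeatureSelectorEnum; top_k is only
// meaningful for the 'thrifty' selector, where 0 means "use every feature".
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "coordinate_common.h"  // assumed include path, as in the updaters below

void SelectorSketch() {
  xgboost::linear::CoordinateParam cparam;
  std::vector<std::pair<std::string, std::string>> kwargs{{"top_k", "10"}};
  cparam.InitAllowUnknown(kwargs);
  std::unique_ptr<xgboost::linear::FeatureSelector> selector(
      xgboost::linear::FeatureSelector::Create(xgboost::linear::kThrifty));
  // selector->Setup(*model, gpair, p_fmat, alpha_denorm, lambda_denorm, cparam.top_k);
}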

src/linear/linear_updater.cc

@@ -3,6 +3,7 @@
*/
#include <xgboost/linear_updater.h>
#include <dmlc/registry.h>
#include "./param.h"
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);
@@ -22,6 +23,8 @@ LinearUpdater* LinearUpdater::Create(const std::string& name) {
namespace xgboost {
namespace linear {
DMLC_REGISTER_PARAMETER(LinearTrainParam);
// List of files that will be force-linked in static builds.
DMLC_REGISTRY_LINK_TAG(updater_shotgun);
DMLC_REGISTRY_LINK_TAG(updater_coordinate);
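
These link tags pair with the DMLC_REGISTRY_FILE_TAG declarations in the individual translation units (visible in the hunks below); without the pairing, a statically linked binary could discard those object files and name-based lookup would fail at runtime. A hedged sketch of the lookup this registration enables:

// Hedged sketch: the registry maps string names to factories, so callers need
// no compile-time dependency on the concrete updater classes.
#include <xgboost/linear_updater.h>
#include <memory>

void LookupSketch() {
  std::unique_ptr<xgboost::LinearUpdater> u(
      xgboost::LinearUpdater::Create("shotgun"));
}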

src/linear/param.h (new file, 77 lines)

@@ -0,0 +1,77 @@
/*!
* Copyright 2018 by Contributors
* \file param.h
* \brief training parameters.
*/
#ifndef XGBOOST_LINEAR_PARAM_H_
#define XGBOOST_LINEAR_PARAM_H_
#include <dmlc/parameter.h>
namespace xgboost {
namespace linear {
/**
* \brief A set of available FeatureSelectors
*/
enum FeatureSelectorEnum {
kCyclic = 0,
kShuffle,
kThrifty,
kGreedy,
kRandom
};
struct LinearTrainParam : public dmlc::Parameter<LinearTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int n_gpus;
int gpu_id;
// declare parameters
DMLC_DECLARE_PARAMETER(LinearTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
"Number of devices to use.");
DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
"Primary device ordinal.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
} // namespace linear
} // namespace xgboost
#endif // XGBOOST_LINEAR_PARAM_H_
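
A short worked example of DenormalizePenalties (values chosen for illustration): the user-facing reg_lambda/reg_alpha are rescaled by the total instance weight at every update, so the effective regularization tracks dataset size rather than staying fixed. The aliases keep the familiar eta/lambda/alpha spellings working:

// Hedged sketch; assumes this header is included as "./param.h".
#include <string>
#include <utility>
#include <vector>
#include "./param.h"

void PenaltySketch() {
  xgboost::linear::LinearTrainParam param;
  std::vector<std::pair<std::string, std::string>> kwargs{
      {"lambda", "1.0"}, {"alpha", "0.5"}};  // aliases of reg_lambda/reg_alpha
  param.InitAllowUnknown(kwargs);
  param.DenormalizePenalties(100.0);  // e.g. 100 rows of unit weight
  // Now param.reg_lambda_denorm == 100.0f and param.reg_alpha_denorm == 50.0f.
}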

src/linear/updater_coordinate.cc

@@ -4,66 +4,17 @@
*/
#include <xgboost/linear_updater.h>
#include "./param.h"
#include "../common/timer.h"
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
// training parameter
struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int top_k;
// declare parameters
DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(top_k)
.set_lower_bound(0)
.set_default(0)
.describe("The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
/**
* \class CoordinateUpdater
*
@@ -75,33 +26,37 @@ class CoordinateUpdater : public LinearUpdater {
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
const std::vector<std::pair<std::string, std::string> > rest {
tparam_.InitAllowUnknown(args)
};
cparam_.InitAllowUnknown(rest);
selector.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor.Init("CoordinateUpdater");
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
param.DenormalizePenalties(sum_instance_weight);
tparam_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->param.num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup,
in_gpair->ConstHostVector(), p_fmat);
auto dbias = static_cast<float>(param.learning_rate *
auto dbias = static_cast<float>(tparam_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, ngroup,
dbias, &in_gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, param.reg_alpha_denorm,
param.reg_lambda_denorm, param.top_k);
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->NextFeature
(i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
}
@@ -116,20 +71,20 @@ class CoordinateUpdater : public LinearUpdater {
auto gradient =
GetGradientParallel(group_idx, ngroup, fidx, *in_gpair, p_fmat);
auto dw = static_cast<float>(
param.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
tparam_.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
}
CoordinateParam cparam_;
// training parameter
CoordinateTrainParam param;
LinearTrainParam tparam_;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
};
DMLC_REGISTER_PARAMETER(CoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
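
The Init change above is the heart of the split: dmlc's InitAllowUnknown returns the key/value pairs it did not recognize, so LinearTrainParam consumes the shared keys first and the remainder (currently just top_k) initializes CoordinateParam. Restated as a hedged, stripped-down fragment:

// Hedged restatement of CoordinateUpdater::Init's two-stage parameter parse.
void Init(const std::vector<std::pair<std::string, std::string>>& args) {
  // Stage 1: shared keys (eta, lambda, alpha, feature_selector, ...);
  // anything LinearTrainParam does not recognize comes back in `rest`.
  auto rest = tparam_.InitAllowUnknown(args);
  // Stage 2: coordinate-descent-only keys (top_k).
  cparam_.InitAllowUnknown(rest);
}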

src/linear/updater_gpu_coordinate.cu

@@ -9,6 +9,7 @@
#include "../common/common.h"
#include "../common/device_helpers.cuh"
#include "../common/timer.h"
#include "./param.h"
#include "coordinate_common.h"
namespace xgboost {
@@ -16,64 +17,6 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
// training parameter
struct GPUCoordinateTrainParam
: public dmlc::Parameter<GPUCoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int top_k;
int n_gpus;
int gpu_id;
// declare parameters
DMLC_DECLARE_PARAMETER(GPUCoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(top_k).set_lower_bound(0).set_default(0).describe(
"The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
"Number of devices to use.");
DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
"Primary device ordinal.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each
* update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
void RescaleIndices(size_t ridx_begin, dh::DVec<Entry> *data) {
auto d_data = data->Data();
dh::LaunchN(data->DeviceIdx(), data->Size(),
@@ -93,7 +36,7 @@ class DeviceShard {
public:
DeviceShard(int device_id, const SparsePage &batch,
bst_uint row_begin, bst_uint row_end,
const GPUCoordinateTrainParam &param,
const LinearTrainParam &param,
const gbm::GBLinearModelParam &model_param)
: device_id_(device_id),
ridx_begin_(row_begin),
@@ -199,8 +142,8 @@ class GPUCoordinateUpdater : public LinearUpdater {
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string>> &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
tparam_.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor.Init("GPUCoordinateUpdater");
}
@@ -208,7 +151,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
const gbm::GBLinearModelParam &model_param) {
if (!shards.empty()) return;
dist_ = GPUDistribution::Block(GPUSet::All(param.gpu_id, param.n_gpus,
dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus,
p_fmat->Info().num_row_));
auto devices = dist_.Devices();
@@ -237,13 +180,13 @@ class GPUCoordinateUpdater : public LinearUpdater {
[&](int i, std::unique_ptr<DeviceShard>& shard) {
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(devices.DeviceId(i), batch, row_segments[i],
row_segments[i + 1], param, model_param));
row_segments[i + 1], tparam_, model_param));
});
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
param.DenormalizePenalties(sum_instance_weight);
tparam_.DenormalizePenalties(sum_instance_weight);
monitor.Start("LazyInitShards");
this->LazyInitShards(p_fmat, model->param);
monitor.Stop("LazyInitShards");
@@ -260,15 +203,15 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor.Stop("UpdateBias");
// prepare for updating the weights
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm,
param.top_k);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
coord_param_.top_k);
monitor.Start("UpdateFeature");
for (auto group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
auto fidx = selector->NextFeature(
i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), model);
}
@@ -287,7 +230,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
});
auto dbias = static_cast<float>(
param.learning_rate *
tparam_.learning_rate *
CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
model->bias()[group_idx] += dbias;
@@ -310,10 +253,10 @@ class GPUCoordinateUpdater : public LinearUpdater {
fidx);
});
auto dw = static_cast<float>(param.learning_rate *
auto dw = static_cast<float>(tparam_.learning_rate *
CoordinateDelta(grad.GetGrad(), grad.GetHess(),
w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
@@ -322,7 +265,8 @@ class GPUCoordinateUpdater : public LinearUpdater {
}
// training parameter
GPUCoordinateTrainParam param;
LinearTrainParam tparam_;
CoordinateParam coord_param_;
GPUDistribution dist_;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
@@ -330,7 +274,6 @@ class GPUCoordinateUpdater : public LinearUpdater {
std::vector<std::unique_ptr<DeviceShard>> shards;
};
DMLC_REGISTER_PARAMETER(GPUCoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
.describe(
"Update linear model according to coordinate descent algorithm. GPU "

src/linear/updater_shotgun.cc

@@ -11,54 +11,16 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
// training parameter
struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
// declare parameters
DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.describe("Feature selection or ordering method.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
class ShotgunUpdater : public LinearUpdater {
public:
// set training parameter
void Init(const std::vector<std::pair<std::string, std::string> > &args) override {
param_.InitAllowUnknown(args);
if (param_.feature_selector != kCyclic &&
param_.feature_selector != kShuffle) {
LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
<< "Supported options are: {cyclic, shuffle}";
}
selector_.reset(FeatureSelector::Create(param_.feature_selector));
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
@@ -119,13 +81,11 @@ class ShotgunUpdater : public LinearUpdater {
protected:
// training parameters
ShotgunTrainParam param_;
LinearTrainParam param_;
std::unique_ptr<FeatureSelector> selector_;
};
DMLC_REGISTER_PARAMETER(ShotgunTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
.describe(
"Update linear model according to shotgun coordinate descent "

tests/cpp/linear/test_linear.cc

@@ -3,11 +3,9 @@
#include "../helpers.h"
#include "xgboost/gbm.h"
typedef std::pair<std::string, std::string> arg;
TEST(Linear, shotgun) {
typedef std::pair<std::string, std::string> arg;
auto mat = xgboost::CreateDMatrix(10, 10, 0);
{
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun"));
updater->Init({{"eta", "1."}});
@@ -21,11 +19,16 @@ TEST(Linear, shotgun) {
ASSERT_EQ(model.bias()[0], 5.0f);
}
{
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun"));
EXPECT_ANY_THROW(updater->Init({{"feature_selector", "random"}}));
}
delete mat;
}
TEST(Linear, coordinate) {
typedef std::pair<std::string, std::string> arg;
auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("coord_descent"));