From 85939c6a6ea630c213852f485b13f70d47820c72 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 22 Dec 2018 13:21:49 +0800
Subject: [PATCH] Merge duplicated linear updater parameters. (#4013)

* Merge duplicated linear updater parameters.

* Split up coordinate descent parameter.
---
 src/linear/coordinate_common.h       | 24 ++++----
 src/linear/linear_updater.cc         |  3 +
 src/linear/param.h                   | 77 ++++++++++++++++++++++++
 src/linear/updater_coordinate.cc     | 81 ++++++------------------
 src/linear/updater_gpu_coordinate.cu | 89 +++++-----------------------
 src/linear/updater_shotgun.cc        | 52 ++--------------
 tests/cpp/linear/test_linear.cc      | 33 ++++++-----
 7 files changed, 151 insertions(+), 208 deletions(-)
 create mode 100644 src/linear/param.h

diff --git a/src/linear/coordinate_common.h b/src/linear/coordinate_common.h
index 8b7617c76..2fae87036 100644
--- a/src/linear/coordinate_common.h
+++ b/src/linear/coordinate_common.h
@@ -8,11 +8,24 @@
 #include 
 #include 
 #include 
+
+#include "./param.h"
 #include "../common/random.h"
 
 namespace xgboost {
 namespace linear {
 
+struct CoordinateParam : public dmlc::Parameter<CoordinateParam> {
+  int top_k;
+  DMLC_DECLARE_PARAMETER(CoordinateParam) {
+    DMLC_DECLARE_FIELD(top_k)
+        .set_lower_bound(0)
+        .set_default(0)
+        .describe("The number of top features to select in 'thrifty' feature_selector. "
+                  "The value of zero means using all the features.");
+  }
+};
+
 /**
  * \brief Calculate change in weight for a given feature. Applies l1/l2 penalty normalised by the
  *        number of training instances.
@@ -442,17 +455,6 @@ class ThriftyFeatureSelector : public FeatureSelector {
   std::vector<std::pair<double, double>> gpair_sums_;
 };
 
-/**
- * \brief A set of available FeatureSelector's
- */
-enum FeatureSelectorEnum {
-  kCyclic = 0,
-  kShuffle,
-  kThrifty,
-  kGreedy,
-  kRandom
-};
-
 inline FeatureSelector *FeatureSelector::Create(int choice) {
   switch (choice) {
     case kCyclic:
diff --git a/src/linear/linear_updater.cc b/src/linear/linear_updater.cc
index 4e12cd865..fa7771eda 100644
--- a/src/linear/linear_updater.cc
+++ b/src/linear/linear_updater.cc
@@ -3,6 +3,7 @@
  */
 #include <xgboost/linear_updater.h>
 #include <dmlc/registry.h>
+#include "./param.h"
 
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);
@@ -22,6 +23,8 @@ LinearUpdater* LinearUpdater::Create(const std::string& name) {
 namespace xgboost {
 namespace linear {
 
+DMLC_REGISTER_PARAMETER(LinearTrainParam);
+
 // List of files that will be force linked in static links.
 DMLC_REGISTRY_LINK_TAG(updater_shotgun);
 DMLC_REGISTRY_LINK_TAG(updater_coordinate);
diff --git a/src/linear/param.h b/src/linear/param.h
new file mode 100644
index 000000000..b86b91841
--- /dev/null
+++ b/src/linear/param.h
@@ -0,0 +1,77 @@
+/*!
+ * Copyright 2018 by Contributors
+ * \file param.h
+ * \brief training parameters.
+ */
+#ifndef XGBOOST_LINEAR_PARAM_H_
+#define XGBOOST_LINEAR_PARAM_H_
+#include <dmlc/parameter.h>
+
+namespace xgboost {
+namespace linear {
+/**
+ * \brief A set of available FeatureSelector's
+ */
+enum FeatureSelectorEnum {
+  kCyclic = 0,
+  kShuffle,
+  kThrifty,
+  kGreedy,
+  kRandom
+};
+
+struct LinearTrainParam : public dmlc::Parameter<LinearTrainParam> {
+  /*! \brief learning_rate */
+  float learning_rate;
+  /*! \brief regularization weight for L2 norm */
+  float reg_lambda;
+  /*! \brief regularization weight for L1 norm */
+  float reg_alpha;
+  int feature_selector;
+  int n_gpus;
+  int gpu_id;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(LinearTrainParam) {
+    DMLC_DECLARE_FIELD(learning_rate)
+        .set_lower_bound(0.0f)
+        .set_default(0.5f)
+        .describe("Learning rate of each update.");
+    DMLC_DECLARE_FIELD(reg_lambda)
+        .set_lower_bound(0.0f)
+        .set_default(0.0f)
+        .describe("L2 regularization on weights.");
+    DMLC_DECLARE_FIELD(reg_alpha)
+        .set_lower_bound(0.0f)
+        .set_default(0.0f)
+        .describe("L1 regularization on weights.");
+    DMLC_DECLARE_FIELD(feature_selector)
+        .set_default(kCyclic)
+        .add_enum("cyclic", kCyclic)
+        .add_enum("shuffle", kShuffle)
+        .add_enum("thrifty", kThrifty)
+        .add_enum("greedy", kGreedy)
+        .add_enum("random", kRandom)
+        .describe("Feature selection or ordering method.");
+    DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
+        "Number of devices to use.");
+    DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
+        "Primary device ordinal.");
+    // alias of parameters
+    DMLC_DECLARE_ALIAS(learning_rate, eta);
+    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
+    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
+  }
+  /*! \brief Denormalizes the regularization penalties - to be called at each update */
+  void DenormalizePenalties(double sum_instance_weight) {
+    reg_lambda_denorm = reg_lambda * sum_instance_weight;
+    reg_alpha_denorm = reg_alpha * sum_instance_weight;
+  }
+  // denormalized regularization penalties
+  float reg_lambda_denorm;
+  float reg_alpha_denorm;
+};
+
+}  // namespace linear
+}  // namespace xgboost
+
+#endif  // XGBOOST_LINEAR_PARAM_H_
diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc
index dd5984ffe..8eea3fce6 100644
--- a/src/linear/updater_coordinate.cc
+++ b/src/linear/updater_coordinate.cc
@@ -4,66 +4,17 @@
  */
 #include <xgboost/linear_updater.h>
+#include "./param.h"
 #include "../common/timer.h"
 #include "coordinate_common.h"
 
 namespace xgboost {
 namespace linear {
 
+DMLC_REGISTER_PARAMETER(CoordinateParam);
 DMLC_REGISTRY_FILE_TAG(updater_coordinate);
 
 // training parameter
-struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
-  /*! \brief learning_rate */
-  float learning_rate;
-  /*! \brief regularization weight for L2 norm */
-  float reg_lambda;
-  /*! \brief regularization weight for L1 norm */
-  float reg_alpha;
-  int feature_selector;
-  int top_k;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
-    DMLC_DECLARE_FIELD(learning_rate)
-        .set_lower_bound(0.0f)
-        .set_default(0.5f)
-        .describe("Learning rate of each update.");
-    DMLC_DECLARE_FIELD(reg_lambda)
-        .set_lower_bound(0.0f)
-        .set_default(0.0f)
-        .describe("L2 regularization on weights.");
-    DMLC_DECLARE_FIELD(reg_alpha)
-        .set_lower_bound(0.0f)
-        .set_default(0.0f)
-        .describe("L1 regularization on weights.");
-    DMLC_DECLARE_FIELD(feature_selector)
-        .set_default(kCyclic)
-        .add_enum("cyclic", kCyclic)
-        .add_enum("shuffle", kShuffle)
-        .add_enum("thrifty", kThrifty)
-        .add_enum("greedy", kGreedy)
-        .add_enum("random", kRandom)
-        .describe("Feature selection or ordering method.");
-    DMLC_DECLARE_FIELD(top_k)
-        .set_lower_bound(0)
-        .set_default(0)
-        .describe("The number of top features to select in 'thrifty' feature_selector. "
-                  "The value of zero means using all the features.");
-    // alias of parameters
-    DMLC_DECLARE_ALIAS(learning_rate, eta);
-    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
-    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
-  }
-  /*! \brief Denormalizes the regularization penalties - to be called at each update */
-  void DenormalizePenalties(double sum_instance_weight) {
-    reg_lambda_denorm = reg_lambda * sum_instance_weight;
-    reg_alpha_denorm = reg_alpha * sum_instance_weight;
-  }
-  // denormalizated regularization penalties
-  float reg_lambda_denorm;
-  float reg_alpha_denorm;
-};
-
 /**
  * \class CoordinateUpdater
  *
@@ -75,33 +26,37 @@ class CoordinateUpdater : public LinearUpdater {
   // set training parameter
   void Init(
       const std::vector<std::pair<std::string, std::string> > &args) override {
-    param.InitAllowUnknown(args);
-    selector.reset(FeatureSelector::Create(param.feature_selector));
+    const std::vector<std::pair<std::string, std::string> > rest {
+      tparam_.InitAllowUnknown(args)
+    };
+    cparam_.InitAllowUnknown(rest);
+    selector.reset(FeatureSelector::Create(tparam_.feature_selector));
     monitor.Init("CoordinateUpdater");
   }
   void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
               gbm::GBLinearModel *model, double sum_instance_weight) override {
-    param.DenormalizePenalties(sum_instance_weight);
+    tparam_.DenormalizePenalties(sum_instance_weight);
     const int ngroup = model->param.num_output_group;
     // update bias
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
       auto grad = GetBiasGradientParallel(group_idx, ngroup,
                                           in_gpair->ConstHostVector(), p_fmat);
-      auto dbias = static_cast<float>(param.learning_rate *
+      auto dbias = static_cast<float>(tparam_.learning_rate *
                                CoordinateDeltaBias(grad.first, grad.second));
       model->bias()[group_idx] += dbias;
       UpdateBiasResidualParallel(group_idx, ngroup, dbias,
                                  &in_gpair->HostVector(), p_fmat);
     }
     // prepare for updating the weights
-    selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, param.reg_alpha_denorm,
-                    param.reg_lambda_denorm, param.top_k);
+    selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
+                    tparam_.reg_alpha_denorm,
+                    tparam_.reg_lambda_denorm, cparam_.top_k);
     // update weights
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
       for (unsigned i = 0U; i < model->param.num_feature; i++) {
         int fidx = selector->NextFeature
           (i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-           param.reg_alpha_denorm, param.reg_lambda_denorm);
+           tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
       }
@@ -116,20 +71,20 @@ class CoordinateUpdater : public LinearUpdater {
     auto gradient = GetGradientParallel(group_idx, ngroup, fidx, *in_gpair, p_fmat);
     auto dw = static_cast<float>(
-        param.learning_rate *
-        CoordinateDelta(gradient.first, gradient.second, w, param.reg_alpha_denorm,
-                        param.reg_lambda_denorm));
+        tparam_.learning_rate *
+        CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
+                        tparam_.reg_lambda_denorm));
     w += dw;
     UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
   }
 
+  CoordinateParam cparam_;
   // training parameter
-  CoordinateTrainParam param;
+  LinearTrainParam tparam_;
   std::unique_ptr<FeatureSelector> selector;
   common::Monitor monitor;
 };
 
-DMLC_REGISTER_PARAMETER(CoordinateTrainParam);
 XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
     .describe("Update linear model according to coordinate descent algorithm.")
     .set_body([]() { return new CoordinateUpdater(); });
diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu
index 311ea970e..47c58198f 100644
--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -9,6 +9,7 @@
 #include "../common/common.h"
 #include "../common/device_helpers.cuh"
 #include "../common/timer.h"
+#include "./param.h" #include "coordinate_common.h" namespace xgboost { @@ -16,64 +17,6 @@ namespace linear { DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); -// training parameter -struct GPUCoordinateTrainParam - : public dmlc::Parameter { - /*! \brief learning_rate */ - float learning_rate; - /*! \brief regularization weight for L2 norm */ - float reg_lambda; - /*! \brief regularization weight for L1 norm */ - float reg_alpha; - int feature_selector; - int top_k; - int n_gpus; - int gpu_id; - // declare parameters - DMLC_DECLARE_PARAMETER(GPUCoordinateTrainParam) { - DMLC_DECLARE_FIELD(learning_rate) - .set_lower_bound(0.0f) - .set_default(1.0f) - .describe("Learning rate of each update."); - DMLC_DECLARE_FIELD(reg_lambda) - .set_lower_bound(0.0f) - .set_default(0.0f) - .describe("L2 regularization on weights."); - DMLC_DECLARE_FIELD(reg_alpha) - .set_lower_bound(0.0f) - .set_default(0.0f) - .describe("L1 regularization on weights."); - DMLC_DECLARE_FIELD(feature_selector) - .set_default(kCyclic) - .add_enum("cyclic", kCyclic) - .add_enum("shuffle", kShuffle) - .add_enum("thrifty", kThrifty) - .add_enum("greedy", kGreedy) - .add_enum("random", kRandom) - .describe("Feature selection or ordering method."); - DMLC_DECLARE_FIELD(top_k).set_lower_bound(0).set_default(0).describe( - "The number of top features to select in 'thrifty' feature_selector. " - "The value of zero means using all the features."); - DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe( - "Number of devices to use."); - DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe( - "Primary device ordinal."); - // alias of parameters - DMLC_DECLARE_ALIAS(learning_rate, eta); - DMLC_DECLARE_ALIAS(reg_lambda, lambda); - DMLC_DECLARE_ALIAS(reg_alpha, alpha); - } - /*! \brief Denormalizes the regularization penalties - to be called at each - * update */ - void DenormalizePenalties(double sum_instance_weight) { - reg_lambda_denorm = reg_lambda * sum_instance_weight; - reg_alpha_denorm = reg_alpha * sum_instance_weight; - } - // denormalizated regularization penalties - float reg_lambda_denorm; - float reg_alpha_denorm; -}; - void RescaleIndices(size_t ridx_begin, dh::DVec *data) { auto d_data = data->Data(); dh::LaunchN(data->DeviceIdx(), data->Size(), @@ -93,7 +36,7 @@ class DeviceShard { public: DeviceShard(int device_id, const SparsePage &batch, bst_uint row_begin, bst_uint row_end, - const GPUCoordinateTrainParam ¶m, + const LinearTrainParam ¶m, const gbm::GBLinearModelParam &model_param) : device_id_(device_id), ridx_begin_(row_begin), @@ -199,8 +142,8 @@ class GPUCoordinateUpdater : public LinearUpdater { // set training parameter void Init( const std::vector> &args) override { - param.InitAllowUnknown(args); - selector.reset(FeatureSelector::Create(param.feature_selector)); + tparam_.InitAllowUnknown(args); + selector.reset(FeatureSelector::Create(tparam_.feature_selector)); monitor.Init("GPUCoordinateUpdater"); } @@ -208,7 +151,7 @@ class GPUCoordinateUpdater : public LinearUpdater { const gbm::GBLinearModelParam &model_param) { if (!shards.empty()) return; - dist_ = GPUDistribution::Block(GPUSet::All(param.gpu_id, param.n_gpus, + dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus, p_fmat->Info().num_row_)); auto devices = dist_.Devices(); @@ -237,13 +180,13 @@ class GPUCoordinateUpdater : public LinearUpdater { [&](int i, std::unique_ptr& shard) { shard = std::unique_ptr( new DeviceShard(devices.DeviceId(i), batch, row_segments[i], - row_segments[i + 1], param, model_param)); + row_segments[i + 1], 
+                              tparam_, model_param));
         });
   }
 
   void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
               gbm::GBLinearModel *model, double sum_instance_weight) override {
-    param.DenormalizePenalties(sum_instance_weight);
+    tparam_.DenormalizePenalties(sum_instance_weight);
     monitor.Start("LazyInitShards");
     this->LazyInitShards(p_fmat, model->param);
     monitor.Stop("LazyInitShards");
@@ -260,15 +203,15 @@ class GPUCoordinateUpdater : public LinearUpdater {
     monitor.Stop("UpdateBias");
     // prepare for updating the weights
     selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
-                    param.reg_alpha_denorm, param.reg_lambda_denorm,
-                    param.top_k);
+                    tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
+                    coord_param_.top_k);
     monitor.Start("UpdateFeature");
     for (auto group_idx = 0; group_idx < model->param.num_output_group;
          ++group_idx) {
       for (auto i = 0U; i < model->param.num_feature; i++) {
         auto fidx = selector->NextFeature(
             i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
-            param.reg_alpha_denorm, param.reg_lambda_denorm);
+            tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), model);
       }
@@ -287,7 +230,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
         });
 
     auto dbias = static_cast<float>(
-        param.learning_rate *
+        tparam_.learning_rate *
         CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
     model->bias()[group_idx] += dbias;
 
@@ -310,10 +253,10 @@ class GPUCoordinateUpdater : public LinearUpdater {
                                          fidx);
         });
 
-    auto dw = static_cast<float>(param.learning_rate *
+    auto dw = static_cast<float>(tparam_.learning_rate *
                                  CoordinateDelta(grad.GetGrad(), grad.GetHess(),
-                                                 w, param.reg_alpha_denorm,
-                                                 param.reg_lambda_denorm));
+                                                 w, tparam_.reg_alpha_denorm,
+                                                 tparam_.reg_lambda_denorm));
     w += dw;
     dh::ExecuteIndexShards(&shards,
                            [&](int idx, std::unique_ptr<DeviceShard>& shard) {
@@ -322,7 +265,8 @@
   }
 
   // training parameter
-  GPUCoordinateTrainParam param;
+  LinearTrainParam tparam_;
+  CoordinateParam coord_param_;
   GPUDistribution dist_;
   std::unique_ptr<FeatureSelector> selector;
   common::Monitor monitor;
@@ -330,7 +274,6 @@
   std::vector<std::unique_ptr<DeviceShard>> shards;
 };
 
-DMLC_REGISTER_PARAMETER(GPUCoordinateTrainParam);
 XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
     .describe(
         "Update linear model according to coordinate descent algorithm. GPU "
diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc
index 36b2acc5c..327acc94a 100644
--- a/src/linear/updater_shotgun.cc
+++ b/src/linear/updater_shotgun.cc
@@ -11,54 +11,16 @@ namespace linear {
 
 DMLC_REGISTRY_FILE_TAG(updater_shotgun);
 
-// training parameter
-struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
-  /*! \brief learning_rate */
-  float learning_rate;
-  /*! \brief regularization weight for L2 norm */
-  float reg_lambda;
-  /*! \brief regularization weight for L1 norm */
-  float reg_alpha;
-  int feature_selector;
-  // declare parameters
-  DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
-    DMLC_DECLARE_FIELD(learning_rate)
-        .set_lower_bound(0.0f)
-        .set_default(0.5f)
-        .describe("Learning rate of each update.");
-    DMLC_DECLARE_FIELD(reg_lambda)
-        .set_lower_bound(0.0f)
-        .set_default(0.0f)
-        .describe("L2 regularization on weights.");
-    DMLC_DECLARE_FIELD(reg_alpha)
-        .set_lower_bound(0.0f)
-        .set_default(0.0f)
-        .describe("L1 regularization on weights.");
-    DMLC_DECLARE_FIELD(feature_selector)
-        .set_default(kCyclic)
-        .add_enum("cyclic", kCyclic)
-        .add_enum("shuffle", kShuffle)
-        .describe("Feature selection or ordering method.");
-    // alias of parameters
-    DMLC_DECLARE_ALIAS(learning_rate, eta);
-    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
-    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
-  }
-  /*! \brief Denormalizes the regularization penalties - to be called at each update */
-  void DenormalizePenalties(double sum_instance_weight) {
-    reg_lambda_denorm = reg_lambda * sum_instance_weight;
-    reg_alpha_denorm = reg_alpha * sum_instance_weight;
-  }
-  // denormalizated regularization penalties
-  float reg_lambda_denorm;
-  float reg_alpha_denorm;
-};
-
 class ShotgunUpdater : public LinearUpdater {
  public:
   // set training parameter
   void Init(const std::vector<std::pair<std::string, std::string> > &args) override {
     param_.InitAllowUnknown(args);
+    if (param_.feature_selector != kCyclic &&
+        param_.feature_selector != kShuffle) {
+      LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
+                 << "Supported options are: {cyclic, shuffle}";
+    }
     selector_.reset(FeatureSelector::Create(param_.feature_selector));
   }
   void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
@@ -119,13 +81,11 @@
 
  protected:
   // training parameters
-  ShotgunTrainParam param_;
+  LinearTrainParam param_;
 
   std::unique_ptr<FeatureSelector> selector_;
 };
 
-DMLC_REGISTER_PARAMETER(ShotgunTrainParam);
-
 XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
     .describe(
         "Update linear model according to shotgun coordinate descent "
diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc
index 5bcc0e771..2a479a313 100644
--- a/tests/cpp/linear/test_linear.cc
+++ b/tests/cpp/linear/test_linear.cc
@@ -3,29 +3,32 @@
 #include "../helpers.h"
 #include "xgboost/gbm.h"
 
-typedef std::pair<std::string, std::string> arg;
-
 TEST(Linear, shotgun) {
-  typedef std::pair<std::string, std::string> arg;
   auto mat = xgboost::CreateDMatrix(10, 10, 0);
-  auto updater = std::unique_ptr<xgboost::LinearUpdater>(
-      xgboost::LinearUpdater::Create("shotgun"));
-  updater->Init({{"eta", "1."}});
-  xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
-      (*mat)->Info().num_row_, xgboost::GradientPair(-5, 1.0));
-  xgboost::gbm::GBLinearModel model;
-  model.param.num_feature = (*mat)->Info().num_col_;
-  model.param.num_output_group = 1;
-  model.LazyInitModel();
-  updater->Update(&gpair, (*mat).get(), &model, gpair.Size());
+  {
+    auto updater = std::unique_ptr<xgboost::LinearUpdater>(
+        xgboost::LinearUpdater::Create("shotgun"));
+    updater->Init({{"eta", "1."}});
+    xgboost::HostDeviceVector<xgboost::GradientPair> gpair(
+        (*mat)->Info().num_row_, xgboost::GradientPair(-5, 1.0));
+    xgboost::gbm::GBLinearModel model;
+    model.param.num_feature = (*mat)->Info().num_col_;
+    model.param.num_output_group = 1;
+    model.LazyInitModel();
+    updater->Update(&gpair, (*mat).get(), &model, gpair.Size());
 
-  ASSERT_EQ(model.bias()[0], 5.0f);
+    ASSERT_EQ(model.bias()[0], 5.0f);
+  }
+  {
+    auto updater = std::unique_ptr<xgboost::LinearUpdater>(
+        xgboost::LinearUpdater::Create("shotgun"));
+    EXPECT_ANY_THROW(updater->Init({{"feature_selector", "random"}}));
+  }
   delete mat;
 }
 
 TEST(Linear, coordinate) {
-  typedef std::pair<std::string, std::string> arg;
   auto mat = xgboost::CreateDMatrix(10, 10, 0);
   auto updater = std::unique_ptr<xgboost::LinearUpdater>(
       xgboost::LinearUpdater::Create("coord_descent"));