Merge duplicated linear updater parameters. (#4013)

* Merge duplicated linear updater parameters.

* Split up coordinate descent parameter.
Jiaming Yuan 2018-12-22 13:21:49 +08:00 committed by GitHub
parent f75a21af25
commit 85939c6a6e
7 changed files with 151 additions and 208 deletions
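
In short: the three near-identical parameter structs (CoordinateTrainParam, GPUCoordinateTrainParam, ShotgunTrainParam) collapse into a single LinearTrainParam in the new src/linear/param.h, and the coordinate-descent-only top_k field is split out into a small CoordinateParam. A minimal usage sketch of the merged surface — hedged: the updater name and parameter keys are taken from the hunks below, while the wrapper function is illustrative only:

// Hedged sketch, not part of the diff: the coordinate updater now accepts the
// shared keys (eta/lambda/alpha/feature_selector) through LinearTrainParam and
// the coordinate-only key top_k through CoordinateParam.
#include <xgboost/linear_updater.h>
#include <memory>

void ConfigureCoordinateUpdater() {
  std::unique_ptr<xgboost::LinearUpdater> updater(
      xgboost::LinearUpdater::Create("coord_descent"));
  updater->Init({{"eta", "0.5"},
                 {"lambda", "0.1"},                // alias of reg_lambda
                 {"feature_selector", "thrifty"},
                 {"top_k", "10"}});                // consumed by CoordinateParam
}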

src/linear/coordinate_common.h

@@ -8,11 +8,24 @@
#include <utility>
#include <vector>
#include <limits>
#include "./param.h"
#include "../common/random.h"
namespace xgboost {
namespace linear {
struct CoordinateParam : public dmlc::Parameter<CoordinateParam> {
int top_k;
DMLC_DECLARE_PARAMETER(CoordinateParam) {
DMLC_DECLARE_FIELD(top_k)
.set_lower_bound(0)
.set_default(0)
.describe("The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
}
};
/**
* \brief Calculate change in weight for a given feature. Applies l1/l2 penalty normalised by the
* number of training instances.
@@ -442,17 +455,6 @@ class ThriftyFeatureSelector : public FeatureSelector {
std::vector<std::pair<double, double>> gpair_sums_;
};
/**
* \brief A set of available FeatureSelectors
*/
enum FeatureSelectorEnum {
kCyclic = 0,
kShuffle,
kThrifty,
kGreedy,
kRandom
};
inline FeatureSelector *FeatureSelector::Create(int choice) {
switch (choice) {
case kCyclic:
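
For orientation: CoordinateParam is declared here so both the CPU and GPU coordinate updaters can share it, while the FeatureSelectorEnum it used to sit next to moves into param.h (shown below). A hedged sketch of how the factory and the top_k plumbing fit together; the Setup call is commented out because model, gpair, and p_fmat are assumed from the surrounding updater code:

// Hedged sketch: Create() dispatches on FeatureSelectorEnum; top_k is only
// meaningful for the 'thrifty' selector, where 0 means "use every feature".
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "coordinate_common.h"  // assumed include path, as in the updaters below

void SelectorSketch() {
  xgboost::linear::CoordinateParam cparam;
  std::vector<std::pair<std::string, std::string>> kwargs{{"top_k", "10"}};
  cparam.InitAllowUnknown(kwargs);
  std::unique_ptr<xgboost::linear::FeatureSelector> selector(
      xgboost::linear::FeatureSelector::Create(xgboost::linear::kThrifty));
  // selector->Setup(*model, gpair, p_fmat, alpha_denorm, lambda_denorm, cparam.top_k);
}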

src/linear/linear_updater.cc

@@ -3,6 +3,7 @@
*/
#include <xgboost/linear_updater.h>
#include <dmlc/registry.h>
#include "./param.h"
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);
@@ -22,6 +23,8 @@ LinearUpdater* LinearUpdater::Create(const std::string& name) {
namespace xgboost {
namespace linear {
DMLC_REGISTER_PARAMETER(LinearTrainParam);
// List of files that will be force-linked in static builds.
DMLC_REGISTRY_LINK_TAG(updater_shotgun);
DMLC_REGISTRY_LINK_TAG(updater_coordinate);
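
These link tags pair with the DMLC_REGISTRY_FILE_TAG declarations in the individual translation units (visible in the hunks below); without the pairing, a statically linked binary could discard those object files and name-based lookup would fail at runtime. A hedged sketch of the lookup this registration enables:

// Hedged sketch: the registry maps string names to factories, so callers need
// no compile-time dependency on the concrete updater classes.
#include <xgboost/linear_updater.h>
#include <memory>

void LookupSketch() {
  std::unique_ptr<xgboost::LinearUpdater> u(
      xgboost::LinearUpdater::Create("shotgun"));
}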

src/linear/param.h (new file, 77 lines)

@@ -0,0 +1,77 @@
/*!
* Copyright 2018 by Contributors
* \file param.h
* \brief training parameters.
*/
#ifndef XGBOOST_LINEAR_PARAM_H_
#define XGBOOST_LINEAR_PARAM_H_
#include <dmlc/parameter.h>
namespace xgboost {
namespace linear {
/**
* \brief A set of available FeatureSelectors
*/
enum FeatureSelectorEnum {
kCyclic = 0,
kShuffle,
kThrifty,
kGreedy,
kRandom
};
struct LinearTrainParam : public dmlc::Parameter<LinearTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int n_gpus;
int gpu_id;
// declare parameters
DMLC_DECLARE_PARAMETER(LinearTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
"Number of devices to use.");
DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
"Primary device ordinal.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
} // namespace linear
} // namespace xgboost
#endif // XGBOOST_LINEAR_PARAM_H_
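
A short worked example of DenormalizePenalties (values chosen for illustration): the user-facing reg_lambda/reg_alpha are rescaled by the total instance weight at every update, so the effective regularization tracks dataset size rather than staying fixed. The aliases keep the familiar eta/lambda/alpha spellings working:

// Hedged sketch; assumes this header is included as "./param.h".
#include <string>
#include <utility>
#include <vector>
#include "./param.h"

void PenaltySketch() {
  xgboost::linear::LinearTrainParam param;
  std::vector<std::pair<std::string, std::string>> kwargs{
      {"lambda", "1.0"}, {"alpha", "0.5"}};  // aliases of reg_lambda/reg_alpha
  param.InitAllowUnknown(kwargs);
  param.DenormalizePenalties(100.0);  // e.g. 100 rows of unit weight
  // Now param.reg_lambda_denorm == 100.0f and param.reg_alpha_denorm == 50.0f.
}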

src/linear/updater_coordinate.cc

@@ -4,66 +4,17 @@
*/
#include <xgboost/linear_updater.h>
#include "./param.h"
#include "../common/timer.h"
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTER_PARAMETER(CoordinateParam);
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
// training parameter
struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int top_k;
// declare parameters
DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(top_k)
.set_lower_bound(0)
.set_default(0)
.describe("The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
/**
* \class CoordinateUpdater
*
@@ -75,33 +26,37 @@ class CoordinateUpdater : public LinearUpdater {
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
const std::vector<std::pair<std::string, std::string> > rest {
tparam_.InitAllowUnknown(args)
};
cparam_.InitAllowUnknown(rest);
selector.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor.Init("CoordinateUpdater");
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
param.DenormalizePenalties(sum_instance_weight);
tparam_.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->param.num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup,
in_gpair->ConstHostVector(), p_fmat);
auto dbias = static_cast<float>(param.learning_rate *
auto dbias = static_cast<float>(tparam_.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, ngroup,
dbias, &in_gpair->HostVector(), p_fmat);
}
// prepare for updating the weights
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, param.reg_alpha_denorm,
param.reg_lambda_denorm, param.top_k);
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm, cparam_.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->NextFeature
(i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
}
@@ -116,20 +71,20 @@ class CoordinateUpdater : public LinearUpdater {
auto gradient =
GetGradientParallel(group_idx, ngroup, fidx, *in_gpair, p_fmat);
auto dw = static_cast<float>(
param.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
tparam_.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
}
CoordinateParam cparam_;
// training parameter
CoordinateTrainParam param;
LinearTrainParam tparam_;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
};
DMLC_REGISTER_PARAMETER(CoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
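
The Init change above is the heart of the split: dmlc's InitAllowUnknown returns the key/value pairs it did not recognize, so LinearTrainParam consumes the shared keys first and the remainder (currently just top_k) initializes CoordinateParam. Restated as a hedged, stripped-down fragment:

// Hedged restatement of CoordinateUpdater::Init's two-stage parameter parse.
void Init(const std::vector<std::pair<std::string, std::string>>& args) {
  // Stage 1: shared keys (eta, lambda, alpha, feature_selector, ...);
  // anything LinearTrainParam does not recognize comes back in `rest`.
  auto rest = tparam_.InitAllowUnknown(args);
  // Stage 2: coordinate-descent-only keys (top_k).
  cparam_.InitAllowUnknown(rest);
}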

src/linear/updater_gpu_coordinate.cu

@@ -9,6 +9,7 @@
#include "../common/common.h"
#include "../common/device_helpers.cuh"
#include "../common/timer.h"
#include "./param.h"
#include "coordinate_common.h"
namespace xgboost {
@@ -16,64 +17,6 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
// training parameter
struct GPUCoordinateTrainParam
: public dmlc::Parameter<GPUCoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
int top_k;
int n_gpus;
int gpu_id;
// declare parameters
DMLC_DECLARE_PARAMETER(GPUCoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(top_k).set_lower_bound(0).set_default(0).describe(
"The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
DMLC_DECLARE_FIELD(n_gpus).set_default(1).describe(
"Number of devices to use.");
DMLC_DECLARE_FIELD(gpu_id).set_default(0).describe(
"Primary device ordinal.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each
* update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
void RescaleIndices(size_t ridx_begin, dh::DVec<Entry> *data) {
auto d_data = data->Data();
dh::LaunchN(data->DeviceIdx(), data->Size(),
@@ -93,7 +36,7 @@ class DeviceShard {
public:
DeviceShard(int device_id, const SparsePage &batch,
bst_uint row_begin, bst_uint row_end,
const GPUCoordinateTrainParam &param,
const LinearTrainParam &param,
const gbm::GBLinearModelParam &model_param)
: device_id_(device_id),
ridx_begin_(row_begin),
@@ -199,8 +142,8 @@ class GPUCoordinateUpdater : public LinearUpdater {
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string>> &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
tparam_.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(tparam_.feature_selector));
monitor.Init("GPUCoordinateUpdater");
}
@@ -208,7 +151,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
const gbm::GBLinearModelParam &model_param) {
if (!shards.empty()) return;
dist_ = GPUDistribution::Block(GPUSet::All(param.gpu_id, param.n_gpus,
dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus,
p_fmat->Info().num_row_));
auto devices = dist_.Devices();
@@ -237,13 +180,13 @@ class GPUCoordinateUpdater : public LinearUpdater {
[&](int i, std::unique_ptr<DeviceShard>& shard) {
shard = std::unique_ptr<DeviceShard>(
new DeviceShard(devices.DeviceId(i), batch, row_segments[i],
row_segments[i + 1], param, model_param));
row_segments[i + 1], tparam_, model_param));
});
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
param.DenormalizePenalties(sum_instance_weight);
tparam_.DenormalizePenalties(sum_instance_weight);
monitor.Start("LazyInitShards");
this->LazyInitShards(p_fmat, model->param);
monitor.Stop("LazyInitShards");
@@ -260,15 +203,15 @@ class GPUCoordinateUpdater : public LinearUpdater {
monitor.Stop("UpdateBias");
// prepare for updating the weights
selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm,
param.top_k);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
coord_param_.top_k);
monitor.Start("UpdateFeature");
for (auto group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
auto fidx = selector->NextFeature(
i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), model);
}
@@ -287,7 +230,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
});
auto dbias = static_cast<float>(
param.learning_rate *
tparam_.learning_rate *
CoordinateDeltaBias(grad.GetGrad(), grad.GetHess()));
model->bias()[group_idx] += dbias;
@@ -310,10 +253,10 @@ class GPUCoordinateUpdater : public LinearUpdater {
fidx);
});
auto dw = static_cast<float>(param.learning_rate *
auto dw = static_cast<float>(tparam_.learning_rate *
CoordinateDelta(grad.GetGrad(), grad.GetHess(),
w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
w, tparam_.reg_alpha_denorm,
tparam_.reg_lambda_denorm));
w += dw;
dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
@@ -322,7 +265,8 @@ class GPUCoordinateUpdater : public LinearUpdater {
}
// training parameter
GPUCoordinateTrainParam param;
LinearTrainParam tparam_;
CoordinateParam coord_param_;
GPUDistribution dist_;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
@@ -330,7 +274,6 @@ class GPUCoordinateUpdater : public LinearUpdater {
std::vector<std::unique_ptr<DeviceShard>> shards;
};
DMLC_REGISTER_PARAMETER(GPUCoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")
.describe(
"Update linear model according to coordinate descent algorithm. GPU "

src/linear/updater_shotgun.cc

@@ -11,54 +11,16 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
// training parameter
struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
// declare parameters
DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.describe("Feature selection or ordering method.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
class ShotgunUpdater : public LinearUpdater {
public:
// set training parameter
void Init(const std::vector<std::pair<std::string, std::string> > &args) override {
param_.InitAllowUnknown(args);
if (param_.feature_selector != kCyclic &&
param_.feature_selector != kShuffle) {
LOG(FATAL) << "Unsupported feature selector for shotgun updater.\n"
<< "Supported options are: {cyclic, shuffle}";
}
selector_.reset(FeatureSelector::Create(param_.feature_selector));
}
void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
@@ -119,13 +81,11 @@ class ShotgunUpdater : public LinearUpdater {
protected:
// training parameters
ShotgunTrainParam param_;
LinearTrainParam param_;
std::unique_ptr<FeatureSelector> selector_;
};
DMLC_REGISTER_PARAMETER(ShotgunTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
.describe(
"Update linear model according to shotgun coordinate descent "

tests/cpp/linear/test_linear.cc

@@ -3,11 +3,9 @@
#include "../helpers.h"
#include "xgboost/gbm.h"
typedef std::pair<std::string, std::string> arg;
TEST(Linear, shotgun) {
typedef std::pair<std::string, std::string> arg;
auto mat = xgboost::CreateDMatrix(10, 10, 0);
{
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun"));
updater->Init({{"eta", "1."}});
@@ -21,11 +19,16 @@ TEST(Linear, shotgun) {
ASSERT_EQ(model.bias()[0], 5.0f);
}
{
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("shotgun"));
EXPECT_ANY_THROW(updater->Init({{"feature_selector", "random"}}));
}
delete mat;
}
TEST(Linear, coordinate) {
typedef std::pair<std::string, std::string> arg;
auto mat = xgboost::CreateDMatrix(10, 10, 0);
auto updater = std::unique_ptr<xgboost::LinearUpdater>(
xgboost::LinearUpdater::Create("coord_descent"));