Additional improvements for gblinear (#3134)

* fix rebase conflict

* [core] additional gblinear improvements

* [R] callback for gblinear coefficients history

* force eta=1 for gblinear python tests

* add top_k to GreedyFeatureSelector

* set eta=1 in shotgun test

* [core] fix SparsePage processing in gblinear; col-wise multithreading in greedy updater

* set sorted flag within TryInitColData

* gblinear tests: use scale, add external memory test

* fix multiclass for greedy updater

* fix whitespace

* fix typo
Author: Vadim Khotilovich
Date: 2018-03-13 01:27:13 -05:00
Committed by: GitHub
Parent: a1b48afa41
Commit: 706be4e5d4
18 changed files with 750 additions and 260 deletions

src/data/sparse_page_dmatrix.cc

@@ -119,7 +119,7 @@ ColIterator(const std::vector<bst_uint>& fset) {
}
bool SparsePageDMatrix::TryInitColData() {
bool SparsePageDMatrix::TryInitColData(bool sorted) {
// load meta data.
std::vector<std::string> cache_shards = common::Split(cache_info_, ':');
{
@@ -140,6 +140,8 @@ bool SparsePageDMatrix::TryInitColData() {
files.push_back(std::move(fdata));
}
col_iter_.reset(new ColPageIter(std::move(files)));
// warning: no attempt to check here whether the cached data was sorted
col_iter_->sorted = sorted;
return true;
}
@@ -147,7 +149,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
float pkeep,
size_t max_row_perbatch, bool sorted) {
if (HaveColAccess(sorted)) return;
if (TryInitColData()) return;
if (TryInitColData(sorted)) return;
const MetaInfo& info = this->info();
if (max_row_perbatch == std::numeric_limits<size_t>::max()) {
max_row_perbatch = kMaxRowPerBatch;
@@ -291,8 +293,7 @@ void SparsePageDMatrix::InitColAccess(const std::vector<bool>& enabled,
fo.reset(nullptr);
}
// initialize column data
CHECK(TryInitColData());
col_iter_->sorted = sorted;
CHECK(TryInitColData(sorted));
}
} // namespace data
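The net effect of these changes: the caller's sorted requirement now reaches col_iter_ on both the cache-hit path (TryInitColData) and the rebuild path (InitColAccess), instead of only after a full rebuild. A minimal sketch of the resulting call flow, assuming the xgboost headers; EnsureColAccess is a hypothetical helper, not part of this commit:

    #include <limits>
    #include <vector>

    void EnsureColAccess(xgboost::data::SparsePageDMatrix* dmat, bool sorted) {
      std::vector<bool> enabled(dmat->info().num_col, true);
      // Cache hit: TryInitColData(sorted) tags the cached column iterator directly.
      // Cache miss: InitColAccess rebuilds the column pages, then CHECKs
      // TryInitColData(sorted), so the flag is set on that path too.
      dmat->InitColAccess(enabled, 1.0f, std::numeric_limits<size_t>::max(), sorted);
    }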

src/data/sparse_page_dmatrix.h

@@ -116,7 +116,7 @@ class SparsePageDMatrix : public DMatrix {
* \brief Try to initialize column data.
* \return true if the data already exists, false if it does not.
*/
bool TryInitColData();
bool TryInitColData(bool sorted);
// source data pointer.
std::unique_ptr<DataSource> source_;
// the cache prefix

src/gbm/gblinear.cc

@@ -21,14 +21,12 @@ namespace gbm {
DMLC_REGISTRY_FILE_TAG(gblinear);
// training parameter
// training parameters
struct GBLinearTrainParam : public dmlc::Parameter<GBLinearTrainParam> {
/*! \brief learning_rate */
std::string updater;
// flag to print out detailed breakdown of runtime
int debug_verbose;
float tolerance;
// declare parameters
size_t max_row_perbatch;
int debug_verbose;
DMLC_DECLARE_PARAMETER(GBLinearTrainParam) {
DMLC_DECLARE_FIELD(updater)
.set_default("shotgun")
@@ -37,6 +35,9 @@ struct GBLinearTrainParam : public dmlc::Parameter<GBLinearTrainParam> {
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("Stop if largest weight update is smaller than this number.");
DMLC_DECLARE_FIELD(max_row_perbatch)
.set_default(std::numeric_limits<size_t>::max())
.describe("Maximum rows per batch.");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
@@ -84,12 +85,10 @@ class GBLinear : public GradientBooster {
if (!p_fmat->HaveColAccess(false)) {
std::vector<bool> enabled(p_fmat->info().num_col, true);
p_fmat->InitColAccess(enabled, 1.0f, std::numeric_limits<size_t>::max(),
false);
p_fmat->InitColAccess(enabled, 1.0f, param.max_row_perbatch, false);
}
model.LazyInitModel();
this->LazySumWeights(p_fmat);
if (!this->CheckConvergence()) {
@@ -191,40 +190,7 @@ class GBLinear : public GradientBooster {
std::vector<std::string> DumpModel(const FeatureMap& fmap,
bool with_stats,
std::string format) const override {
const int ngroup = model.param.num_output_group;
const unsigned nfeature = model.param.num_feature;
std::stringstream fo("");
if (format == "json") {
fo << " { \"bias\": [" << std::endl;
for (int gid = 0; gid < ngroup; ++gid) {
if (gid != 0) fo << "," << std::endl;
fo << " " << model.bias()[gid];
}
fo << std::endl << " ]," << std::endl
<< " \"weight\": [" << std::endl;
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
if (i != 0 || gid != 0) fo << "," << std::endl;
fo << " " << model[i][gid];
}
}
fo << std::endl << " ]" << std::endl << " }";
} else {
fo << "bias:\n";
for (int gid = 0; gid < ngroup; ++gid) {
fo << model.bias()[gid] << std::endl;
}
fo << "weight:\n";
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
fo << model[i][gid] << std::endl;
}
}
}
std::vector<std::string> v;
v.push_back(fo.str());
return v;
return model.DumpModel(fmap, with_stats, format);
}
protected:
@@ -272,9 +238,12 @@ class GBLinear : public GradientBooster {
bool CheckConvergence() {
if (param.tolerance == 0.0f) return false;
if (is_converged) return true;
if (previous_model.weight.size() != model.weight.size()) return false;
if (previous_model.weight.size() != model.weight.size()) {
previous_model = model;
return false;
}
float largest_dw = 0.0;
for (auto i = 0; i < model.weight.size(); i++) {
for (size_t i = 0; i < model.weight.size(); i++) {
largest_dw = std::max(
largest_dw, std::abs(model.weight[i] - previous_model.weight[i]));
}
@@ -287,7 +256,7 @@ class GBLinear : public GradientBooster {
void LazySumWeights(DMatrix *p_fmat) {
if (!sum_weight_complete) {
auto &info = p_fmat->info();
for (int i = 0; i < info.num_row; i++) {
for (size_t i = 0; i < info.num_row; i++) {
sum_instance_weight += info.GetWeight(i);
}
sum_weight_complete = true;
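Per the tolerance description above ("Stop if largest weight update is smaller than this number"), the convergence test reduces to

    \text{converged} \iff \max_i \left| w_i^{(t)} - w_i^{(t-1)} \right| < \text{tolerance}

with the test disabled when tolerance = 0 and deferred on the first pass, when previous_model is first synced to model.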

src/gbm/gblinear_model.h

@@ -4,7 +4,9 @@
#pragma once
#include <dmlc/io.h>
#include <dmlc/parameter.h>
#include <xgboost/feature_map.h>
#include <vector>
#include <string>
#include <cstring>
namespace xgboost {
@@ -68,6 +70,44 @@ class GBLinearModel {
inline const bst_float* operator[](size_t i) const {
return &weight[i * param.num_output_group];
}
std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const {
const int ngroup = param.num_output_group;
const unsigned nfeature = param.num_feature;
std::stringstream fo("");
if (format == "json") {
fo << " { \"bias\": [" << std::endl;
for (int gid = 0; gid < ngroup; ++gid) {
if (gid != 0) fo << "," << std::endl;
fo << " " << this->bias()[gid];
}
fo << std::endl << " ]," << std::endl
<< " \"weight\": [" << std::endl;
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
if (i != 0 || gid != 0) fo << "," << std::endl;
fo << " " << (*this)[i][gid];
}
}
fo << std::endl << " ]" << std::endl << " }";
} else {
fo << "bias:\n";
for (int gid = 0; gid < ngroup; ++gid) {
fo << this->bias()[gid] << std::endl;
}
fo << "weight:\n";
for (unsigned i = 0; i < nfeature; ++i) {
for (int gid = 0; gid < ngroup; ++gid) {
fo << (*this)[i][gid] << std::endl;
}
}
}
std::vector<std::string> v;
v.push_back(fo.str());
return v;
}
};
} // namespace gbm
} // namespace xgboost
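For concreteness, a sketch of the JSON shape DumpModel emits for a model with one output group and two features (values illustrative, whitespace approximate):

    { "bias": [
        0.5
      ],
      "weight": [
        0.1,
        -0.2
      ] }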

src/linear/coordinate_common.h

@@ -7,6 +7,7 @@
#include <string>
#include <utility>
#include <vector>
#include <limits>
#include "../common/random.h"
namespace xgboost {
@@ -19,26 +20,21 @@ namespace linear {
* \param sum_grad The sum gradient.
* \param sum_hess The sum hess.
* \param w The weight.
* \param reg_lambda Unnormalised L2 penalty.
* \param reg_alpha Unnormalised L1 penalty.
* \param sum_instance_weight The sum instance weights, used to normalise l1/l2 penalty.
* \param reg_lambda Unnormalised L2 penalty.
*
* \return The weight update.
*/
inline double CoordinateDelta(double sum_grad, double sum_hess, double w,
double reg_lambda, double reg_alpha,
double sum_instance_weight) {
reg_alpha *= sum_instance_weight;
reg_lambda *= sum_instance_weight;
double reg_alpha, double reg_lambda) {
if (sum_hess < 1e-5f) return 0.0f;
double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
const double sum_grad_l2 = sum_grad + reg_lambda * w;
const double sum_hess_l2 = sum_hess + reg_lambda;
const double tmp = w - sum_grad_l2 / sum_hess_l2;
if (tmp >= 0) {
return std::max(
-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
return std::max(-(sum_grad_l2 + reg_alpha) / sum_hess_l2, -w);
} else {
return std::min(
-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
return std::min(-(sum_grad_l2 - reg_alpha) / sum_hess_l2, -w);
}
}
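In the notation of the refactored signature (sum gradient G, sum Hessian H, current weight w, and the already-denormalized penalties \alpha and \lambda), the elastic-net coordinate update computed above is

    \Delta w =
    \begin{cases}
      \max\!\left(-\dfrac{G + \lambda w + \alpha}{H + \lambda},\; -w\right) & \text{if } w - \dfrac{G + \lambda w}{H + \lambda} \ge 0 \\[1ex]
      \min\!\left(-\dfrac{G + \lambda w - \alpha}{H + \lambda},\; -w\right) & \text{otherwise}
    \end{cases}

with \Delta w = 0 whenever H < 10^{-5}. The clamp at -w is the usual L1 soft-threshold: a weight shrinks to exactly zero rather than crossing sign.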
@@ -50,7 +46,6 @@ inline double CoordinateDelta(double sum_grad, double sum_hess, double w,
*
* \return The weight update.
*/
inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
return -sum_grad / sum_hess;
}
@@ -66,15 +61,14 @@ inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradient(
int group_idx, int num_group, int fidx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
inline std::pair<double, double> GetGradient(int group_idx, int num_group, int fidx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
ColBatch::Inst col = batch[0];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
@@ -88,7 +82,7 @@ inline std::pair<double, double> GetGradient(
}
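The recurring pattern in these helpers: the column iterator is now asked for the single feature fidx, which then sits at position 0 of every batch, rather than streaming all columns and indexing by fidx. A sketch of the pattern, using the types from the diff:

    std::vector<bst_uint> fset = {static_cast<bst_uint>(fidx)};
    dmlc::DataIter<ColBatch>* iter = p_fmat->ColIterator(fset);
    while (iter->Next()) {
      const ColBatch& batch = iter->Value();
      ColBatch::Inst col = batch[0];  // the one requested column
      // ... accumulate gradient/Hessian sums over col[0 .. col.length) ...
    }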
/**
* \brief Get the gradient with respect to a single feature. Multithreaded.
* \brief Get the gradient with respect to a single feature. Row-wise multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
@@ -98,16 +92,14 @@ inline std::pair<double, double> GetGradient(
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradientParallel(
int group_idx, int num_group, int fidx,
const std::vector<bst_gpair> &gpair, DMatrix *p_fmat) {
inline std::pair<double, double> GetGradientParallel(int group_idx, int num_group, int fidx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
ColBatch::Inst col = batch[0];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint j = 0; j < ndata; ++j) {
@@ -122,7 +114,7 @@ inline std::pair<double, double> GetGradientParallel(
}
/**
* \brief Get the gradient with respect to the bias. Multithreaded.
* \brief Get the gradient with respect to the bias. Row-wise multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
@@ -131,10 +123,9 @@ inline std::pair<double, double> GetGradientParallel(
*
* \return The gradient and diagonal Hessian entry for the bias.
*/
inline std::pair<double, double> GetBiasGradientParallel(
int group_idx, int num_group, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
inline std::pair<double, double> GetBiasGradientParallel(int group_idx, int num_group,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
const RowSet &rowset = p_fmat->buffered_rowset();
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
@@ -159,15 +150,14 @@ inline std::pair<double, double> GetBiasGradientParallel(
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
float dw, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dw == 0.0f) return;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator({static_cast<bst_uint>(fidx)});
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
ColBatch::Inst col = batch[0];
// update grad value
const bst_omp_uint num_row = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
@@ -188,9 +178,7 @@ inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateBiasResidualParallel(int group_idx, int num_group,
float dbias,
inline void UpdateBiasResidualParallel(int group_idx, int num_group, float dbias,
std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dbias == 0.0f) return;
@@ -205,114 +193,292 @@ inline void UpdateBiasResidualParallel(int group_idx, int num_group,
}
/**
* \class FeatureSelector
*
* \brief Abstract class for stateful feature selection in coordinate descent
* algorithms.
* \brief Abstract class for stateful feature selection or ordering
* in coordinate descent algorithms.
*/
class FeatureSelector {
public:
static FeatureSelector *Create(std::string name);
/*! \brief factory method */
static FeatureSelector *Create(int choice);
/*! \brief virtual destructor */
virtual ~FeatureSelector() {}
/**
* \brief Setting up the selector state prior to looping through features.
*
* \param model The model.
* \param gpair The gpair.
* \param p_fmat The feature matrix.
* \param alpha Regularisation alpha.
* \param lambda Regularisation lambda.
* \param param A parameter with algorithm-dependent use.
*/
virtual void Setup(const gbm::GBLinearModel &model,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat,
float alpha, float lambda, int param) {}
/**
* \brief Select next coordinate to update.
*
* \param iteration The iteration.
* \param model The model.
* \param group_idx Zero-based index of the group.
* \param gpair The gpair.
* \param p_fmat The feature matrix.
* \param alpha Regularisation alpha.
* \param lambda Regularisation lambda.
* \param sum_instance_weight The sum instance weight.
* \param iteration The iteration in a loop through features.
* \param model The model.
* \param group_idx Zero-based index of the group.
* \param gpair The gpair.
* \param p_fmat The feature matrix.
* \param alpha Regularisation alpha.
* \param lambda Regularisation lambda.
*
* \return The index of the selected feature. -1 indicates the bias term.
* \return The index of the selected feature. -1 indicates none selected.
*/
virtual int SelectNextFeature(int iteration,
const gbm::GBLinearModel &model,
int group_idx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) = 0;
virtual int NextFeature(int iteration,
const gbm::GBLinearModel &model,
int group_idx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) = 0;
};
/**
* \class CyclicFeatureSelector
*
* \brief Deterministic selection by cycling through coordinates one at a time.
* \brief Deterministic selection by cycling through features one at a time.
*/
class CyclicFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
return iteration % model.param.num_feature;
}
};
/**
* \class RandomFeatureSelector
*
* \brief A random coordinate selector.
* \brief Similar to Cyclic but with random feature shuffling prior to each update.
* \note Its randomness is controllable by setting a random seed.
*/
class ShuffleFeatureSelector : public FeatureSelector {
public:
void Setup(const gbm::GBLinearModel &model,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda, int param) override {
if (feat_index.size() == 0) {
feat_index.resize(model.param.num_feature);
std::iota(feat_index.begin(), feat_index.end(), 0);
}
std::shuffle(feat_index.begin(), feat_index.end(), common::GlobalRandom());
}
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
return feat_index[iteration % model.param.num_feature];
}
protected:
std::vector<bst_uint> feat_index;
};
/**
* \brief A random (with replacement) coordinate selector.
* \note Its randomness is controllable by setting a random seed.
*/
class RandomFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
return common::GlobalRandom()() % model.param.num_feature;
}
};
/**
* \class GreedyFeatureSelector
*
* \brief Select coordinate with the greatest gradient magnitude.
* \note It has O(num_feature^2) complexity. It is fully deterministic.
*
* \note It allows restricting the selection to top_k features per group with
* the largest magnitude of univariate weight change, by passing the top_k value
* through the `param` argument of Setup(). That would reduce the complexity to
* O(num_feature*top_k).
*/
class GreedyFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
// Find best
void Setup(const gbm::GBLinearModel &model,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda, int param) override {
top_k = static_cast<bst_uint>(param);
const bst_uint ngroup = model.param.num_output_group;
if (param <= 0) top_k = std::numeric_limits<bst_uint>::max();
if (counter.size() == 0) {
counter.resize(ngroup);
gpair_sums.resize(model.param.num_feature * ngroup);
}
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
counter[gid] = 0u;
}
}
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
// k-th selected feature for a group
auto k = counter[group_idx]++;
// stop after either reaching top-K or going through all the features in a group
if (k >= top_k || counter[group_idx] == model.param.num_feature) return -1;
const int ngroup = model.param.num_output_group;
const bst_omp_uint nfeat = model.param.num_feature;
// Calculate univariate gradient sums
std::fill(gpair_sums.begin(), gpair_sums.end(), std::make_pair(0., 0.));
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const ColBatch::Inst col = batch[i];
const bst_uint ndata = col.length;
auto &sums = gpair_sums[group_idx * nfeat + i];
for (bst_uint j = 0u; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * ngroup + group_idx];
if (p.GetHess() < 0.f) continue;
sums.first += p.GetGrad() * v;
sums.second += p.GetHess() * v * v;
}
}
}
// Find a feature with the largest magnitude of weight change
int best_fidx = 0;
double best_weight_update = 0.0f;
for (auto fidx = 0U; fidx < model.param.num_feature; fidx++) {
const float w = model[fidx][group_idx];
auto gradient = GetGradientParallel(
group_idx, model.param.num_output_group, fidx, gpair, p_fmat);
float dw = static_cast<float>(
CoordinateDelta(gradient.first, gradient.second, w, lambda, alpha,
sum_instance_weight));
if (std::abs(dw) > std::abs(best_weight_update)) {
for (bst_omp_uint fidx = 0; fidx < nfeat; ++fidx) {
auto &s = gpair_sums[group_idx * nfeat + fidx];
float dw = std::abs(static_cast<bst_float>(
CoordinateDelta(s.first, s.second, model[fidx][group_idx], alpha, lambda)));
if (dw > best_weight_update) {
best_weight_update = dw;
best_fidx = fidx;
}
}
return best_fidx;
}
protected:
bst_uint top_k;
std::vector<bst_uint> counter;
std::vector<std::pair<double, double>> gpair_sums;
};
inline FeatureSelector *FeatureSelector::Create(std::string name) {
if (name == "cyclic") {
return new CyclicFeatureSelector();
} else if (name == "random") {
return new RandomFeatureSelector();
} else if (name == "greedy") {
return new GreedyFeatureSelector();
} else {
LOG(FATAL) << name << ": unknown coordinate selector";
/**
* \brief Thrifty, approximately-greedy feature selector.
*
* \note Prior to cyclic updates, reorders features in descending magnitude of
* their univariate weight changes. This operation is multithreaded and is a
* linear complexity approximation of the quadratic greedy selection.
*
* \note It allows restricting the selection to top_k features per group with
* the largest magnitude of univariate weight change, by passing the top_k value
* through the `param` argument of Setup().
*/
class ThriftyFeatureSelector : public FeatureSelector {
public:
void Setup(const gbm::GBLinearModel &model,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda, int param) override {
top_k = static_cast<bst_uint>(param);
if (param <= 0) top_k = std::numeric_limits<bst_uint>::max();
const bst_uint ngroup = model.param.num_output_group;
const bst_omp_uint nfeat = model.param.num_feature;
if (deltaw.size() == 0) {
deltaw.resize(nfeat * ngroup);
sorted_idx.resize(nfeat * ngroup);
counter.resize(ngroup);
gpair_sums.resize(nfeat * ngroup);
}
// Calculate univariate gradient sums
std::fill(gpair_sums.begin(), gpair_sums.end(), std::make_pair(0., 0.));
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// column-parallel is usually faster than row-parallel
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const ColBatch::Inst col = batch[i];
const bst_uint ndata = col.length;
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
auto &sums = gpair_sums[gid * nfeat + i];
for (bst_uint j = 0u; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.f) continue;
sums.first += p.GetGrad() * v;
sums.second += p.GetHess() * v * v;
}
}
}
}
// rank by descending weight magnitude within the groups
std::fill(deltaw.begin(), deltaw.end(), 0.f);
std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
bst_float *pdeltaw = &deltaw[0];
for (bst_uint gid = 0u; gid < ngroup; ++gid) {
// Calculate univariate weight changes
for (bst_omp_uint i = 0; i < nfeat; ++i) {
auto ii = gid * nfeat + i;
auto &s = gpair_sums[ii];
deltaw[ii] = static_cast<bst_float>(CoordinateDelta(
s.first, s.second, model[i][gid], alpha, lambda));
}
// sort in descending order of deltaw abs values
auto start = sorted_idx.begin() + gid * nfeat;
std::sort(start, start + nfeat,
[pdeltaw](size_t i, size_t j) {
return std::abs(*(pdeltaw + i)) > std::abs(*(pdeltaw + j));
});
counter[gid] = 0u;
}
}
int NextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda) override {
// k-th selected feature for a group
auto k = counter[group_idx]++;
// stop after either reaching top-K or going through all the features in a group
if (k >= top_k || counter[group_idx] == model.param.num_feature) return -1;
// note that sorted_idx stores the "long" indices
const size_t grp_offset = group_idx * model.param.num_feature;
return static_cast<int>(sorted_idx[grp_offset + k] - grp_offset);
}
protected:
bst_uint top_k;
std::vector<bst_float> deltaw;
std::vector<size_t> sorted_idx;
std::vector<bst_uint> counter;
std::vector<std::pair<double, double>> gpair_sums;
};
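A quick worked example of the "long" indices (hypothetical sizes): with num_feature = 3 and two groups, sorted_idx holds 6 entries; group 1 owns positions 3..5, whose stored values lie in [3, 6), so subtracting grp_offset = 3 recovers the within-group feature ids 0..2.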
/**
* \brief The set of available FeatureSelectors.
*/
enum FeatureSelectorEnum {
kCyclic = 0,
kShuffle,
kThrifty,
kGreedy,
kRandom
};
inline FeatureSelector *FeatureSelector::Create(int choice) {
switch (choice) {
case kCyclic:
return new CyclicFeatureSelector();
case kShuffle:
return new ShuffleFeatureSelector();
case kThrifty:
return new ThriftyFeatureSelector();
case kGreedy:
return new GreedyFeatureSelector();
case kRandom:
return new RandomFeatureSelector();
default:
LOG(FATAL) << "unknown coordinate selector: " << choice;
}
return nullptr;
}
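The selector protocol, as the updaters below drive it (a simplified sketch of the loop in CoordinateUpdater::Update):

    std::unique_ptr<FeatureSelector> selector(FeatureSelector::Create(kThrifty));
    selector->Setup(*model, gpair, p_fmat, alpha_denorm, lambda_denorm, /*param=*/top_k);
    for (unsigned i = 0; i < model->param.num_feature; ++i) {
      int fidx = selector->NextFeature(i, *model, group_idx, gpair, p_fmat,
                                       alpha_denorm, lambda_denorm);
      if (fidx < 0) break;  // selector exhausted: top_k reached or all features seen
      // ... compute CoordinateDelta for fidx and apply the weight update ...
    }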

src/linear/updater_coordinate.cc

@@ -20,8 +20,8 @@ struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
std::string feature_selector;
float maximum_weight;
int feature_selector;
int top_k;
int debug_verbose;
// declare parameters
DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
@@ -38,17 +38,35 @@ struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default("cyclic")
.describe(
"Feature selection algorithm, one of cyclic/random/greedy");
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.add_enum("thrifty", kThrifty)
.add_enum("greedy", kGreedy)
.add_enum("random", kRandom)
.describe("Feature selection or ordering method.");
DMLC_DECLARE_FIELD(top_k)
.set_lower_bound(0)
.set_default(0)
.describe("The number of top features to select in 'thrifty' feature_selector. "
"The value of zero means using all the features.");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
.describe("flag to print out detailed breakdown of runtime");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
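Denormalization rescales the penalties by the total instance weight once per update:

    \lambda_{\mathrm{denorm}} = \lambda \sum_i w_i^{\mathrm{inst}}, \qquad
    \alpha_{\mathrm{denorm}} = \alpha \sum_i w_i^{\mathrm{inst}},

so a given reg_lambda or reg_alpha keeps a comparable effect as the weighted number of instances grows, and CoordinateDelta no longer needs sum_instance_weight as an argument.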
/**
@@ -66,47 +84,47 @@ class CoordinateUpdater : public LinearUpdater {
selector.reset(FeatureSelector::Create(param.feature_selector));
monitor.Init("CoordinateUpdater", param.debug_verbose);
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
// Calculate bias
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
auto grad = GetBiasGradientParallel(
group_idx, model->param.num_output_group, *in_gpair, p_fmat);
auto dbias = static_cast<float>(
param.learning_rate * CoordinateDeltaBias(grad.first, grad.second));
param.DenormalizePenalties(sum_instance_weight);
const int ngroup = model->param.num_output_group;
// update bias
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
auto grad = GetBiasGradientParallel(group_idx, ngroup, *in_gpair, p_fmat);
auto dbias = static_cast<float>(param.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, model->param.num_output_group,
dbias, in_gpair, p_fmat);
UpdateBiasResidualParallel(group_idx, ngroup, dbias, in_gpair, p_fmat);
}
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->SelectNextFeature(
i, *model, group_idx, *in_gpair, p_fmat, param.reg_alpha,
param.reg_lambda, sum_instance_weight);
this->UpdateFeature(fidx, group_idx, in_gpair, p_fmat, model,
sum_instance_weight);
// prepare for updating the weights
selector->Setup(*model, *in_gpair, p_fmat, param.reg_alpha_denorm,
param.reg_lambda_denorm, param.top_k);
// update weights
for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
for (unsigned i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->NextFeature(i, *model, group_idx, *in_gpair, p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
if (fidx < 0) break;
this->UpdateFeature(fidx, group_idx, in_gpair, p_fmat, model);
}
}
}
void UpdateFeature(int fidx, int group_idx, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) {
inline void UpdateFeature(int fidx, int group_idx, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat, gbm::GBLinearModel *model) {
const int ngroup = model->param.num_output_group;
bst_float &w = (*model)[fidx][group_idx];
monitor.Start("GetGradientParallel");
auto gradient = GetGradientParallel(
group_idx, model->param.num_output_group, fidx, *in_gpair, p_fmat);
auto gradient = GetGradientParallel(group_idx, ngroup, fidx, *in_gpair, p_fmat);
monitor.Stop("GetGradientParallel");
auto dw = static_cast<float>(
param.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
CoordinateDelta(gradient.first, gradient.second, w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
w += dw;
monitor.Start("UpdateResidualParallel");
UpdateResidualParallel(fidx, group_idx, model->param.num_output_group, dw,
in_gpair, p_fmat);
UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
monitor.Stop("UpdateResidualParallel");
}
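Since the new knobs are plain DMLC parameters, they are set through the updater's usual Init args; a sketch with illustrative values:

    std::vector<std::pair<std::string, std::string>> args = {
        {"feature_selector", "thrifty"},  // cyclic | shuffle | thrifty | greedy | random
        {"top_k", "10"},                  // 0 means: use all features
        {"learning_rate", "0.5"},
    };
    updater->Init(args);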

src/linear/updater_shotgun.cc

@@ -19,11 +19,12 @@ struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
int feature_selector;
// declare parameters
DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.set_default(0.5f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
@@ -33,75 +34,79 @@ struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default(kCyclic)
.add_enum("cyclic", kCyclic)
.add_enum("shuffle", kShuffle)
.describe("Feature selection or ordering method.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
/*! \brief Denormalizes the regularization penalties - to be called at each update */
void DenormalizePenalties(double sum_instance_weight) {
reg_lambda_denorm = reg_lambda * sum_instance_weight;
reg_alpha_denorm = reg_alpha * sum_instance_weight;
}
// denormalized regularization penalties
float reg_lambda_denorm;
float reg_alpha_denorm;
};
class ShotgunUpdater : public LinearUpdater {
public:
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
void Init(const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
param.DenormalizePenalties(sum_instance_weight);
std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model->param.num_output_group;
const RowSet &rowset = p_fmat->buffered_rowset();
// for all the output group
// update bias
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
sum_grad += p.GetGrad();
sum_hess += p.GetHess();
}
}
// remove bias effect
bst_float dw = static_cast<bst_float>(
param.learning_rate * CoordinateDeltaBias(sum_grad, sum_hess));
model->bias()[gid] += dw;
// update grad value
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
p += bst_gpair(p.GetHess() * dw, 0);
}
}
auto grad = GetBiasGradientParallel(gid, ngroup, *in_gpair, p_fmat);
auto dbias = static_cast<bst_float>(param.learning_rate *
CoordinateDeltaBias(grad.first, grad.second));
model->bias()[gid] += dbias;
UpdateBiasResidualParallel(gid, ngroup, dbias, in_gpair, p_fmat);
}
// lock-free parallel updates of weights
selector->Setup(*model, *in_gpair, p_fmat, param.reg_alpha_denorm, param.reg_lambda_denorm, 0);
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
// number of features
const ColBatch &batch = iter->Value();
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
int ii = selector->NextFeature(i, *model, 0, *in_gpair, p_fmat,
param.reg_alpha_denorm, param.reg_lambda_denorm);
if (ii < 0) continue;
const bst_uint fid = batch.col_index[ii];
ColBatch::Inst col = batch[ii];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {
const bst_float v = col[j].fvalue;
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
const bst_float v = col[j].fvalue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
bst_float &w = (*model)[fid][gid];
bst_float dw = static_cast<bst_float>(
param.learning_rate *
CoordinateDelta(sum_grad, sum_hess, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
CoordinateDelta(sum_grad, sum_hess, w, param.reg_alpha_denorm,
param.reg_lambda_denorm));
if (dw == 0.f) continue;
w += dw;
// update grad value
// update grad values
for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
@@ -112,8 +117,11 @@ class ShotgunUpdater : public LinearUpdater {
}
}
// training parameter
protected:
// training parameters
ShotgunTrainParam param;
std::unique_ptr<FeatureSelector> selector;
};
DMLC_REGISTER_PARAMETER(ShotgunTrainParam);