Refactor linear modelling and add new coordinate descent updater (#3103)

* Refactor linear modelling and add new coordinate descent updater

* Allow unsorted column iterator

* Add prediction caching to gblinear
Rory Mitchell
2018-02-17 09:17:01 +13:00
committed by GitHub
parent 9ffe8596f2
commit 10eb05a63a
23 changed files with 1252 additions and 271 deletions

View File: src/linear/coordinate_common.h

@@ -0,0 +1,321 @@
/*!
* Copyright 2018 by Contributors
* \author Rory Mitchell
*/
#pragma once
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "../common/random.h"
namespace xgboost {
namespace linear {
/**
* \brief Calculate change in weight for a given feature. Applies l1/l2 penalty normalised by the
* number of training instances.
*
* \param sum_grad The sum of gradients for the feature.
* \param sum_hess The sum of Hessian values for the feature.
* \param w The weight.
* \param reg_lambda Unnormalised L2 penalty.
* \param reg_alpha Unnormalised L1 penalty.
* \param sum_instance_weight The sum of instance weights, used to normalise the l1/l2 penalties.
*
* \return The weight update.
*/
inline double CoordinateDelta(double sum_grad, double sum_hess, double w,
double reg_lambda, double reg_alpha,
double sum_instance_weight) {
reg_alpha *= sum_instance_weight;
reg_lambda *= sum_instance_weight;
if (sum_hess < 1e-5f) return 0.0f;
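// Closed-form minimiser of the local quadratic model with an elastic-net
// penalty. 'tmp' is the weight after an L2-penalised (but L1-free) step and
// decides which side of the L1 kink the solution lies on; the max/min
// against -w clamps the step so a weight crossing zero lands exactly on it.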
double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
if (tmp >= 0) {
return std::max(
-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
} else {
return std::min(
-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
}
}
/**
* \brief Calculate update to bias.
*
* \param sum_grad The sum of gradients.
* \param sum_hess The sum of Hessian values.
*
* \return The weight update.
*/
inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
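// Unregularised Newton step for the intercept: the minimiser of the local
// quadratic model g * db + 0.5 * h * db^2 is db = -g / h.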
return -sum_grad / sum_hess;
}
/**
* \brief Get the gradient with respect to a single feature.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param fidx The target feature.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradient(
int group_idx, int num_group, int fidx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * num_group + group_idx];
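// instances flagged with a negative Hessian are excluded from the update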
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Get the gradient with respect to a single feature. Multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param fidx The target feature.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradientParallel(
int group_idx, int num_group, int fidx,
const std::vector<bst_gpair> &gpair, DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
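// each thread accumulates private partial sums, which the OpenMP reduction
// clause combines when the loop exits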
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * num_group + group_idx];
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Get the gradient with respect to the bias. Multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for the bias.
*/
inline std::pair<double, double> GetBiasGradientParallel(
int group_idx, int num_group, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
const RowSet &rowset = p_fmat->buffered_rowset();
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
auto &p = gpair[rowset[i] * num_group + group_idx];
if (p.GetHess() >= 0.0f) {
sum_grad += p.GetGrad();
sum_hess += p.GetHess();
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Updates the gradient vector with respect to a change in weight.
*
* \param fidx The feature index.
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param dw The change in weight.
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
float dw, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dw == 0.0f) return;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
// update grad value
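// after w += dw each touched instance's gradient shifts by
// hess * fvalue * dw under the quadratic approximation; the Hessian entry
// itself is unchanged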
const bst_omp_uint num_row = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < num_row; ++j) {
bst_gpair &p = (*in_gpair)[col[j].index * num_group + group_idx];
if (p.GetHess() < 0.0f) continue;
p += bst_gpair(p.GetHess() * col[j].fvalue * dw, 0);
}
}
}
/**
* \brief Updates the gradient vector based on a change in the bias.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param dbias The change in bias.
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateBiasResidualParallel(int group_idx, int num_group,
float dbias,
std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dbias == 0.0f) return;
const RowSet &rowset = p_fmat->buffered_rowset();
const bst_omp_uint ndata = static_cast<bst_omp_uint>(p_fmat->info().num_row);
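// the bias acts as a feature equal to 1 for every row, so each gradient
// shifts by hess * dbias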
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &g = (*in_gpair)[rowset[i] * num_group + group_idx];
if (g.GetHess() < 0.0f) continue;
g += bst_gpair(g.GetHess() * dbias, 0);
}
}
/**
* \class FeatureSelector
*
* \brief Abstract class for stateful feature selection in coordinate descent
* algorithms.
*/
class FeatureSelector {
public:
static FeatureSelector *Create(std::string name);
/*! \brief virtual destructor */
virtual ~FeatureSelector() {}
/**
* \brief Select next coordinate to update.
*
* \param iteration The iteration.
* \param model The model.
* \param group_idx Zero-based index of the group.
* \param gpair The gradient vector.
* \param p_fmat The feature matrix.
* \param alpha Regularisation alpha.
* \param lambda Regularisation lambda.
* \param sum_instance_weight The sum of instance weights, used to normalise the l1/l2 penalties.
*
* \return The index of the selected feature. -1 indicates the bias term.
*/
virtual int SelectNextFeature(int iteration,
const gbm::GBLinearModel &model,
int group_idx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) = 0;
};
/**
* \class CyclicFeatureSelector
*
* \brief Deterministic selection by cycling through coordinates one at a time.
*/
class CyclicFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
return iteration % model.param.num_feature;
}
};
/**
* \class RandomFeatureSelector
*
* \brief A random coordinate selector.
*/
class RandomFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
return common::GlobalRandom()() % model.param.num_feature;
}
};
/**
* \class GreedyFeatureSelector
*
* \brief Greedy selection of the coordinate whose proposed weight update has the greatest magnitude.
*/
class GreedyFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
// Find best
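// each selection scans every feature, with one gradient pass over the
// matrix per feature, so a full update round costs O(num_feature^2)
// gradient passes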
int best_fidx = 0;
double best_weight_update = 0.0f;
for (auto fidx = 0U; fidx < model.param.num_feature; fidx++) {
const float w = model[fidx][group_idx];
auto gradient = GetGradientParallel(
group_idx, model.param.num_output_group, fidx, gpair, p_fmat);
float dw = static_cast<float>(
CoordinateDelta(gradient.first, gradient.second, w, lambda, alpha,
sum_instance_weight));
if (std::abs(dw) > std::abs(best_weight_update)) {
best_weight_update = dw;
best_fidx = fidx;
}
}
return best_fidx;
}
};
inline FeatureSelector *FeatureSelector::Create(std::string name) {
if (name == "cyclic") {
return new CyclicFeatureSelector();
} else if (name == "random") {
return new RandomFeatureSelector();
} else if (name == "greedy") {
return new GreedyFeatureSelector();
} else {
LOG(FATAL) << name << ": unknown coordinate selector";
}
return nullptr;
}
} // namespace linear
} // namespace xgboost
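
For intuition, here is a minimal sketch (not part of the commit) of how CoordinateDelta behaves on toy values: with no penalty it reduces to the plain Newton step, while a large enough L1 penalty clamps the update at -w, zeroing the weight exactly.

// illustrative sketch only; builds only inside the xgboost source tree
#include <cassert>
#include <cmath>
#include "coordinate_common.h"

int main() {
  const double sum_grad = 2.0, sum_hess = 4.0, w = 0.5;
  // no regularisation: the plain Newton step -sum_grad / sum_hess = -0.5
  const double dw = xgboost::linear::CoordinateDelta(
      sum_grad, sum_hess, w, /*reg_lambda=*/0.0, /*reg_alpha=*/0.0,
      /*sum_instance_weight=*/1.0);
  assert(std::abs(dw - (-0.5)) < 1e-12);
  // heavy L1 penalty: the step is clamped at -w, so the weight becomes zero
  assert(xgboost::linear::CoordinateDelta(sum_grad, sum_hess, w, 0.0, 100.0,
                                          1.0) == -w);
  return 0;
}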

View File: src/linear/linear_updater.cc

@@ -0,0 +1,29 @@
/*!
* Copyright 2018
*/
#include <xgboost/linear_updater.h>
#include <dmlc/registry.h>
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);
} // namespace dmlc
namespace xgboost {
LinearUpdater* LinearUpdater::Create(const std::string& name) {
auto *e = ::dmlc::Registry< ::xgboost::LinearUpdaterReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown linear updater " << name;
}
return (e->body)();
}
} // namespace xgboost
namespace xgboost {
namespace linear {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(updater_shotgun);
DMLC_REGISTRY_LINK_TAG(updater_coordinate);
} // namespace linear
} // namespace xgboost
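
A minimal sketch (not part of the commit) of how a caller obtains and drives a registered updater; the DMatrix, gradient vector, and model are assumed to be prepared by the surrounding gblinear training loop:

// sketch only: assumes a populated DMatrix, gradient vector, and gblinear
// model supplied by the caller, and that the gblinear model header is
// available alongside <xgboost/linear_updater.h>
#include <memory>
#include <vector>
#include <xgboost/linear_updater.h>

void RunOneUpdate(xgboost::DMatrix *dmat,
                  std::vector<xgboost::bst_gpair> *gpair,
                  xgboost::gbm::GBLinearModel *model,
                  double sum_instance_weight) {
  std::unique_ptr<xgboost::LinearUpdater> updater(
      xgboost::LinearUpdater::Create("coord_descent"));
  updater->Init({{"learning_rate", "0.5"}, {"feature_selector", "cyclic"}});
  updater->Update(gpair, dmat, model, sum_instance_weight);
}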

View File: src/linear/updater_coordinate.cc

@@ -0,0 +1,124 @@
/*!
* Copyright 2018 by Contributors
* \author Rory Mitchell
*/
#include <xgboost/linear_updater.h>
#include "../common/timer.h"
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
// training parameter
struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
/*! \brief feature selection algorithm */
std::string feature_selector;
/*! \brief flag to print out detailed breakdown of runtime */
int debug_verbose;
// declare parameters
DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default("cyclic")
.describe(
"Feature selection algorithm, one of cyclic/random/greedy");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
.describe("flag to print out detailed breakdown of runtime");
// alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
};
/**
* \class CoordinateUpdater
*
* \brief Coordinate descent algorithm that updates one feature per iteration
*/
class CoordinateUpdater : public LinearUpdater {
public:
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
monitor.Init("CoordinateUpdater", param.debug_verbose);
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
// Calculate bias
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
auto grad = GetBiasGradientParallel(
group_idx, model->param.num_output_group, *in_gpair, p_fmat);
auto dbias = static_cast<float>(
param.learning_rate * CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, model->param.num_output_group,
dbias, in_gpair, p_fmat);
}
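// weight updates: one coordinate step per selected feature per group;
// UpdateFeature refreshes the residual gradients after each step, so later
// coordinates see the effect of earlier ones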
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->SelectNextFeature(
i, *model, group_idx, *in_gpair, p_fmat, param.reg_alpha,
param.reg_lambda, sum_instance_weight);
this->UpdateFeature(fidx, group_idx, in_gpair, p_fmat, model,
sum_instance_weight);
}
}
}
void UpdateFeature(int fidx, int group_idx, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) {
bst_float &w = (*model)[fidx][group_idx];
monitor.Start("GetGradientParallel");
auto gradient = GetGradientParallel(
group_idx, model->param.num_output_group, fidx, *in_gpair, p_fmat);
monitor.Stop("GetGradientParallel");
auto dw = static_cast<float>(
param.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
w += dw;
monitor.Start("UpdateResidualParallel");
UpdateResidualParallel(fidx, group_idx, model->param.num_output_group, dw,
in_gpair, p_fmat);
monitor.Stop("UpdateResidualParallel");
}
// training parameter
CoordinateTrainParam param;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
};
DMLC_REGISTER_PARAMETER(CoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
} // namespace linear
} // namespace xgboost

View File: src/linear/updater_shotgun.cc

@@ -0,0 +1,127 @@
/*!
* Copyright 2018 by Contributors
* \author Tianqi Chen, Rory Mitchell
*/
#include <xgboost/linear_updater.h>
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
// training parameter
struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
// declare parameters
DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
};
class ShotgunUpdater : public LinearUpdater {
public:
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model->param.num_output_group;
const RowSet &rowset = p_fmat->buffered_rowset();
// for all the output group
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
sum_grad += p.GetGrad();
sum_hess += p.GetHess();
}
}
// take a Newton step on the bias
bst_float dw = static_cast<bst_float>(
param.learning_rate * CoordinateDeltaBias(sum_grad, sum_hess));
model->bias()[gid] += dw;
// remove the bias effect from the gradients
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
p += bst_gpair(p.GetHess() * dw, 0);
}
}
}
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// number of features in this column batch
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
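// 'shotgun' = lock-free parallel coordinate descent: features are updated
// concurrently, and the racing writes to gpair are tolerated in exchange
// for parallel throughput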
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {
const bst_float v = col[j].fvalue;
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
bst_float &w = (*model)[fid][gid];
bst_float dw = static_cast<bst_float>(
param.learning_rate *
CoordinateDelta(sum_grad, sum_hess, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
w += dw;
// update grad value
for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
p += bst_gpair(p.GetHess() * col[j].fvalue * dw, 0);
}
}
}
}
}
// training parameter
ShotgunTrainParam param;
};
DMLC_REGISTER_PARAMETER(ShotgunTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
.describe(
"Update linear model according to shotgun coordinate descent "
"algorithm.")
.set_body([]() { return new ShotgunUpdater(); });
} // namespace linear
} // namespace xgboost