Refactor linear modelling and add new coordinate descent updater (#3103)

* Refactor linear modelling and add new coordinate descent updater

* Allow unsorted column iterator

* Add prediction caching to gblinear
Rory Mitchell
2018-02-17 09:17:01 +13:00
committed by GitHub
parent 9ffe8596f2
commit 10eb05a63a
23 changed files with 1252 additions and 271 deletions

View File: src/linear/coordinate_common.h

@@ -0,0 +1,321 @@
/*!
* Copyright 2018 by Contributors
* \author Rory Mitchell
*/
#pragma once
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "../common/random.h"
namespace xgboost {
namespace linear {
/**
* \brief Calculate change in weight for a given feature. Applies l1/l2 penalty normalised by the
* number of training instances.
*
* \param sum_grad The sum of gradients for the feature.
* \param sum_hess The sum of Hessian values for the feature.
* \param w The weight.
* \param reg_lambda Unnormalised L2 penalty.
* \param reg_alpha Unnormalised L1 penalty.
* \param sum_instance_weight The sum of instance weights, used to normalise the l1/l2 penalties.
*
* \return The weight update.
*/
inline double CoordinateDelta(double sum_grad, double sum_hess, double w,
double reg_lambda, double reg_alpha,
double sum_instance_weight) {
reg_alpha *= sum_instance_weight;
reg_lambda *= sum_instance_weight;
if (sum_hess < 1e-5f) return 0.0f;
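// Closed-form minimiser of the local quadratic model with an elastic-net
// penalty. 'tmp' is the weight after an L2-penalised (but L1-free) step and
// decides which side of the L1 kink the solution lies on; the max/min
// against -w clamps the step so a weight crossing zero lands exactly on it.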
double tmp = w - (sum_grad + reg_lambda * w) / (sum_hess + reg_lambda);
if (tmp >= 0) {
return std::max(
-(sum_grad + reg_lambda * w + reg_alpha) / (sum_hess + reg_lambda), -w);
} else {
return std::min(
-(sum_grad + reg_lambda * w - reg_alpha) / (sum_hess + reg_lambda), -w);
}
}
/**
* \brief Calculate update to bias.
*
* \param sum_grad The sum of gradients.
* \param sum_hess The sum of Hessian values.
*
* \return The weight update.
*/
inline double CoordinateDeltaBias(double sum_grad, double sum_hess) {
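// Unregularised Newton step for the intercept: the minimiser of the local
// quadratic model g * db + 0.5 * h * db^2 is db = -g / h.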
return -sum_grad / sum_hess;
}
/**
* \brief Get the gradient with respect to a single feature.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param fidx The target feature.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradient(
int group_idx, int num_group, int fidx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * num_group + group_idx];
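// instances flagged with a negative Hessian are excluded from the update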
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Get the gradient with respect to a single feature. Multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param fidx The target feature.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for a given feature.
*/
inline std::pair<double, double> GetGradientParallel(
int group_idx, int num_group, int fidx,
const std::vector<bst_gpair> &gpair, DMatrix *p_fmat) {
double sum_grad = 0.0, sum_hess = 0.0;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
const bst_omp_uint ndata = static_cast<bst_omp_uint>(col.length);
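// each thread accumulates private partial sums, which the OpenMP reduction
// clause combines when the loop exits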
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint j = 0; j < ndata; ++j) {
const bst_float v = col[j].fvalue;
auto &p = gpair[col[j].index * num_group + group_idx];
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Get the gradient with respect to the bias. Multithreaded.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param gpair Gradients.
* \param p_fmat The feature matrix.
*
* \return The gradient and diagonal Hessian entry for the bias.
*/
inline std::pair<double, double> GetBiasGradientParallel(
int group_idx, int num_group, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat) {
const RowSet &rowset = p_fmat->buffered_rowset();
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
auto &p = gpair[rowset[i] * num_group + group_idx];
if (p.GetHess() >= 0.0f) {
sum_grad += p.GetGrad();
sum_hess += p.GetHess();
}
}
return std::make_pair(sum_grad, sum_hess);
}
/**
* \brief Updates the gradient vector with respect to a change in weight.
*
* \param fidx The feature index.
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param dw The change in weight.
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateResidualParallel(int fidx, int group_idx, int num_group,
float dw, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dw == 0.0f) return;
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
ColBatch::Inst col = batch[fidx];
// update grad value
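// after w += dw each touched instance's gradient shifts by
// hess * fvalue * dw under the quadratic approximation; the Hessian entry
// itself is unchanged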
const bst_omp_uint num_row = static_cast<bst_omp_uint>(col.length);
#pragma omp parallel for schedule(static)
for (bst_omp_uint j = 0; j < num_row; ++j) {
bst_gpair &p = (*in_gpair)[col[j].index * num_group + group_idx];
if (p.GetHess() < 0.0f) continue;
p += bst_gpair(p.GetHess() * col[j].fvalue * dw, 0);
}
}
}
/**
* \brief Updates the gradient vector based on a change in the bias.
*
* \param group_idx Zero-based index of the group.
* \param num_group Number of groups.
* \param dbias The change in bias.
* \param in_gpair The gradient vector to be updated.
* \param p_fmat The input feature matrix.
*/
inline void UpdateBiasResidualParallel(int group_idx, int num_group,
float dbias,
std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat) {
if (dbias == 0.0f) return;
const RowSet &rowset = p_fmat->buffered_rowset();
const bst_omp_uint ndata = static_cast<bst_omp_uint>(p_fmat->info().num_row);
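// the bias acts as a feature equal to 1 for every row, so each gradient
// shifts by hess * dbias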
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &g = (*in_gpair)[rowset[i] * num_group + group_idx];
if (g.GetHess() < 0.0f) continue;
g += bst_gpair(g.GetHess() * dbias, 0);
}
}
/**
* \class FeatureSelector
*
* \brief Abstract class for stateful feature selection in coordinate descent
* algorithms.
*/
class FeatureSelector {
public:
static FeatureSelector *Create(std::string name);
/*! \brief virtual destructor */
virtual ~FeatureSelector() {}
/**
* \brief Select next coordinate to update.
*
* \param iteration The iteration.
* \param model The model.
* \param group_idx Zero-based index of the group.
* \param gpair The gradient vector.
* \param p_fmat The feature matrix.
* \param alpha Regularisation alpha.
* \param lambda Regularisation lambda.
* \param sum_instance_weight The sum of instance weights, used to normalise the l1/l2 penalties.
*
* \return The index of the selected feature. -1 indicates the bias term.
*/
virtual int SelectNextFeature(int iteration,
const gbm::GBLinearModel &model,
int group_idx,
const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) = 0;
};
/**
* \class CyclicFeatureSelector
*
* \brief Deterministic selection by cycling through coordinates one at a time.
*/
class CyclicFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
return iteration % model.param.num_feature;
}
};
/**
* \class RandomFeatureSelector
*
* \brief A random coordinate selector.
*/
class RandomFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
return common::GlobalRandom()() % model.param.num_feature;
}
};
/**
* \class GreedyFeatureSelector
*
* \brief Greedy selection of the coordinate whose proposed weight update has the greatest magnitude.
*/
class GreedyFeatureSelector : public FeatureSelector {
public:
int SelectNextFeature(int iteration, const gbm::GBLinearModel &model,
int group_idx, const std::vector<bst_gpair> &gpair,
DMatrix *p_fmat, float alpha, float lambda,
double sum_instance_weight) override {
// Find best
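// each selection scans every feature, with one gradient pass over the
// matrix per feature, so a full update round costs O(num_feature^2)
// gradient passes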
int best_fidx = 0;
double best_weight_update = 0.0f;
for (auto fidx = 0U; fidx < model.param.num_feature; fidx++) {
const float w = model[fidx][group_idx];
auto gradient = GetGradientParallel(
group_idx, model.param.num_output_group, fidx, gpair, p_fmat);
float dw = static_cast<float>(
CoordinateDelta(gradient.first, gradient.second, w, lambda, alpha,
sum_instance_weight));
if (std::abs(dw) > std::abs(best_weight_update)) {
best_weight_update = dw;
best_fidx = fidx;
}
}
return best_fidx;
}
};
inline FeatureSelector *FeatureSelector::Create(std::string name) {
if (name == "cyclic") {
return new CyclicFeatureSelector();
} else if (name == "random") {
return new RandomFeatureSelector();
} else if (name == "greedy") {
return new GreedyFeatureSelector();
} else {
LOG(FATAL) << name << ": unknown coordinate selector";
}
return nullptr;
}
} // namespace linear
} // namespace xgboost
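
For intuition, here is a minimal sketch (not part of the commit) of how CoordinateDelta behaves on toy values: with no penalty it reduces to the plain Newton step, while a large enough L1 penalty clamps the update at -w, zeroing the weight exactly.

// illustrative sketch only; builds only inside the xgboost source tree
#include <cassert>
#include <cmath>
#include "coordinate_common.h"

int main() {
  const double sum_grad = 2.0, sum_hess = 4.0, w = 0.5;
  // no regularisation: the plain Newton step -sum_grad / sum_hess = -0.5
  const double dw = xgboost::linear::CoordinateDelta(
      sum_grad, sum_hess, w, /*reg_lambda=*/0.0, /*reg_alpha=*/0.0,
      /*sum_instance_weight=*/1.0);
  assert(std::abs(dw - (-0.5)) < 1e-12);
  // heavy L1 penalty: the step is clamped at -w, so the weight becomes zero
  assert(xgboost::linear::CoordinateDelta(sum_grad, sum_hess, w, 0.0, 100.0,
                                          1.0) == -w);
  return 0;
}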

View File: src/linear/linear_updater.cc

@@ -0,0 +1,29 @@
/*!
* Copyright 2018
*/
#include <xgboost/linear_updater.h>
#include <dmlc/registry.h>
namespace dmlc {
DMLC_REGISTRY_ENABLE(::xgboost::LinearUpdaterReg);
} // namespace dmlc
namespace xgboost {
LinearUpdater* LinearUpdater::Create(const std::string& name) {
auto *e = ::dmlc::Registry< ::xgboost::LinearUpdaterReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown linear updater " << name;
}
return (e->body)();
}
} // namespace xgboost
namespace xgboost {
namespace linear {
// List of files that will be force linked in static links.
DMLC_REGISTRY_LINK_TAG(updater_shotgun);
DMLC_REGISTRY_LINK_TAG(updater_coordinate);
} // namespace linear
} // namespace xgboost
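
A minimal sketch (not part of the commit) of how a caller obtains and drives a registered updater; the DMatrix, gradient vector, and model are assumed to be prepared by the surrounding gblinear training loop:

// sketch only: assumes a populated DMatrix, gradient vector, and gblinear
// model supplied by the caller, and that the gblinear model header is
// available alongside <xgboost/linear_updater.h>
#include <memory>
#include <vector>
#include <xgboost/linear_updater.h>

void RunOneUpdate(xgboost::DMatrix *dmat,
                  std::vector<xgboost::bst_gpair> *gpair,
                  xgboost::gbm::GBLinearModel *model,
                  double sum_instance_weight) {
  std::unique_ptr<xgboost::LinearUpdater> updater(
      xgboost::LinearUpdater::Create("coord_descent"));
  updater->Init({{"learning_rate", "0.5"}, {"feature_selector", "cyclic"}});
  updater->Update(gpair, dmat, model, sum_instance_weight);
}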

View File: src/linear/updater_coordinate.cc

@@ -0,0 +1,124 @@
/*!
* Copyright 2018 by Contributors
* \author Rory Mitchell
*/
#include <xgboost/linear_updater.h>
#include "../common/timer.h"
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_coordinate);
// training parameter
struct CoordinateTrainParam : public dmlc::Parameter<CoordinateTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
/*! \brief feature selection algorithm */
std::string feature_selector;
/*! \brief flag to print out detailed breakdown of runtime */
int debug_verbose;
// declare parameters
DMLC_DECLARE_PARAMETER(CoordinateTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
DMLC_DECLARE_FIELD(feature_selector)
.set_default("cyclic")
.describe(
"Feature selection algorithm, one of cyclic/random/greedy");
DMLC_DECLARE_FIELD(debug_verbose)
.set_lower_bound(0)
.set_default(0)
.describe("flag to print out detailed breakdown of runtime");
// alias of parameters
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
};
/**
* \class CoordinateUpdater
*
* \brief Coordinate descent algorithm that updates one feature per iteration
*/
class CoordinateUpdater : public LinearUpdater {
public:
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
selector.reset(FeatureSelector::Create(param.feature_selector));
monitor.Init("CoordinateUpdater", param.debug_verbose);
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
// Calculate bias
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
auto grad = GetBiasGradientParallel(
group_idx, model->param.num_output_group, *in_gpair, p_fmat);
auto dbias = static_cast<float>(
param.learning_rate * CoordinateDeltaBias(grad.first, grad.second));
model->bias()[group_idx] += dbias;
UpdateBiasResidualParallel(group_idx, model->param.num_output_group,
dbias, in_gpair, p_fmat);
}
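// weight updates: one coordinate step per selected feature per group;
// UpdateFeature refreshes the residual gradients after each step, so later
// coordinates see the effect of earlier ones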
for (int group_idx = 0; group_idx < model->param.num_output_group;
++group_idx) {
for (auto i = 0U; i < model->param.num_feature; i++) {
int fidx = selector->SelectNextFeature(
i, *model, group_idx, *in_gpair, p_fmat, param.reg_alpha,
param.reg_lambda, sum_instance_weight);
this->UpdateFeature(fidx, group_idx, in_gpair, p_fmat, model,
sum_instance_weight);
}
}
}
void UpdateFeature(int fidx, int group_idx, std::vector<bst_gpair> *in_gpair,
DMatrix *p_fmat, gbm::GBLinearModel *model,
double sum_instance_weight) {
bst_float &w = (*model)[fidx][group_idx];
monitor.Start("GetGradientParallel");
auto gradient = GetGradientParallel(
group_idx, model->param.num_output_group, fidx, *in_gpair, p_fmat);
monitor.Stop("GetGradientParallel");
auto dw = static_cast<float>(
param.learning_rate *
CoordinateDelta(gradient.first, gradient.second, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
w += dw;
monitor.Start("UpdateResidualParallel");
UpdateResidualParallel(fidx, group_idx, model->param.num_output_group, dw,
in_gpair, p_fmat);
monitor.Stop("UpdateResidualParallel");
}
// training parameter
CoordinateTrainParam param;
std::unique_ptr<FeatureSelector> selector;
common::Monitor monitor;
};
DMLC_REGISTER_PARAMETER(CoordinateTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")
.describe("Update linear model according to coordinate descent algorithm.")
.set_body([]() { return new CoordinateUpdater(); });
} // namespace linear
} // namespace xgboost

View File: src/linear/updater_shotgun.cc

@@ -0,0 +1,127 @@
/*!
* Copyright 2018 by Contributors
* \author Tianqi Chen, Rory Mitchell
*/
#include <xgboost/linear_updater.h>
#include "coordinate_common.h"
namespace xgboost {
namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_shotgun);
// training parameter
struct ShotgunTrainParam : public dmlc::Parameter<ShotgunTrainParam> {
/*! \brief learning_rate */
float learning_rate;
/*! \brief regularization weight for L2 norm */
float reg_lambda;
/*! \brief regularization weight for L1 norm */
float reg_alpha;
// declare parameters
DMLC_DECLARE_PARAMETER(ShotgunTrainParam) {
DMLC_DECLARE_FIELD(learning_rate)
.set_lower_bound(0.0f)
.set_default(1.0f)
.describe("Learning rate of each update.");
DMLC_DECLARE_FIELD(reg_lambda)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L2 regularization on weights.");
DMLC_DECLARE_FIELD(reg_alpha)
.set_lower_bound(0.0f)
.set_default(0.0f)
.describe("L1 regularization on weights.");
// alias of parameters
DMLC_DECLARE_ALIAS(learning_rate, eta);
DMLC_DECLARE_ALIAS(reg_lambda, lambda);
DMLC_DECLARE_ALIAS(reg_alpha, alpha);
}
};
class ShotgunUpdater : public LinearUpdater {
public:
// set training parameter
void Init(
const std::vector<std::pair<std::string, std::string> > &args) override {
param.InitAllowUnknown(args);
}
void Update(std::vector<bst_gpair> *in_gpair, DMatrix *p_fmat,
gbm::GBLinearModel *model, double sum_instance_weight) override {
std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model->param.num_output_group;
const RowSet &rowset = p_fmat->buffered_rowset();
// for all the output group
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+ : sum_grad, sum_hess)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
sum_grad += p.GetGrad();
sum_hess += p.GetHess();
}
}
// take a Newton step on the bias
bst_float dw = static_cast<bst_float>(
param.learning_rate * CoordinateDeltaBias(sum_grad, sum_hess));
model->bias()[gid] += dw;
// remove the bias effect from the gradients
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.GetHess() >= 0.0f) {
p += bst_gpair(p.GetHess() * dw, 0);
}
}
}
dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
const ColBatch &batch = iter->Value();
// number of features in this column batch
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
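// 'shotgun' = lock-free parallel coordinate descent: features are updated
// concurrently, and the racing writes to gpair are tolerated in exchange
// for parallel throughput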
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {
const bst_float v = col[j].fvalue;
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
sum_grad += p.GetGrad() * v;
sum_hess += p.GetHess() * v * v;
}
bst_float &w = (*model)[fid][gid];
bst_float dw = static_cast<bst_float>(
param.learning_rate *
CoordinateDelta(sum_grad, sum_hess, w, param.reg_lambda,
param.reg_alpha, sum_instance_weight));
w += dw;
// update grad value
for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.GetHess() < 0.0f) continue;
p += bst_gpair(p.GetHess() * col[j].fvalue * dw, 0);
}
}
}
}
}
// training parameter
ShotgunTrainParam param;
};
DMLC_REGISTER_PARAMETER(ShotgunTrainParam);
XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun")
.describe(
"Update linear model according to shotgun coordinate descent "
"algorithm.")
.set_body([]() { return new ShotgunUpdater(); });
} // namespace linear
} // namespace xgboost