[GBM] Finish migrate all gbms

This commit is contained in:
tqchen 2016-01-02 18:05:40 -08:00
parent e4567bbc47
commit 9042b9e2c7
5 changed files with 343 additions and 391 deletions

View File

@ -2,7 +2,7 @@
* Copyright (c) 2015 by Contributors * Copyright (c) 2015 by Contributors
* \file c_api.h * \file c_api.h
* \author Tianqi Chen * \author Tianqi Chen
* \brief C Style API of XGBoost, used for interfacing with other languages. * \brief C API of XGBoost, used for interfacing with other languages.
*/ */
#ifndef XGBOOST_C_API_H_ #ifndef XGBOOST_C_API_H_
#define XGBOOST_C_API_H_ #define XGBOOST_C_API_H_

View File

@ -27,8 +27,7 @@ class GradientBooster {
virtual ~GradientBooster() {} virtual ~GradientBooster() {}
/*! /*!
* \brief Set the configuration of gradient boosting. * \brief Set the configuration of gradient boosting.
* * User must call configure once before InitModel and Training.
* User must call configure before training.
* *
* \param cfg configurations on both training and model parameters. * \param cfg configurations on both training and model parameters.
*/ */

View File

@ -1,52 +1,106 @@
/*! /*!
* Copyright by Contributors * Copyright 2014 by Contributors
* \file gblinear-inl.hpp * \file gblinear.cc
* \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net
* the update rule is parallel coordinate descent (shotgun) * the update rule is parallel coordinate descent (shotgun)
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#ifndef XGBOOST_GBM_GBLINEAR_INL_HPP_ #include <dmlc/logging.h>
#define XGBOOST_GBM_GBLINEAR_INL_HPP_ #include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/gbm.h>
#include <vector> #include <vector>
#include <string> #include <string>
#include <sstream> #include <sstream>
#include <cstring>
#include <algorithm> #include <algorithm>
#include "./gbm.h"
#include "../tree/updater.h"
namespace xgboost { namespace xgboost {
namespace gbm { namespace gbm {
/*! \brief model parameters of the linear booster */
struct GBLinearModelParam : public dmlc::Parameter<GBLinearModelParam> {
  /*! \brief number of feature dimensions */
  unsigned num_feature;
  /*! \brief number of output groups */
  int num_output_group;
  /*! \brief reserved field */
  int reserved[32];
  /*! \brief constructor, zero-fills every byte of the structure */
  GBLinearModelParam() {
    std::memset(this, 0, sizeof(*this));
  }
  // declare the fields that can be set from the configuration
  DMLC_DECLARE_PARAMETER(GBLinearModelParam) {
    DMLC_DECLARE_FIELD(num_feature)
        .set_lower_bound(0)
        .describe("Number of features used in classification.");
    DMLC_DECLARE_FIELD(num_output_group)
        .set_lower_bound(1)
        .set_default(1)
        .describe("Number of output groups in the setting.");
  }
};
/*! \brief training parameters of the linear booster */
struct GBLinearTrainParam : public dmlc::Parameter<GBLinearTrainParam> {
  /*! \brief learning_rate */
  float learning_rate;
  /*! \brief regularization weight for L2 norm */
  float reg_lambda;
  /*! \brief regularization weight for L1 norm */
  float reg_alpha;
  /*! \brief regularization weight for L2 norm in bias */
  float reg_lambda_bias;
  // declare parameters
  DMLC_DECLARE_PARAMETER(GBLinearTrainParam) {
    DMLC_DECLARE_FIELD(learning_rate)
        .set_lower_bound(0.0f)
        .set_default(1.0f)
        .describe("Learning rate of each update.");
    DMLC_DECLARE_FIELD(reg_lambda)
        .set_lower_bound(0.0f)
        .set_default(0.0f)
        .describe("L2 regularization on weights.");
    DMLC_DECLARE_FIELD(reg_alpha)
        .set_lower_bound(0.0f)
        .set_default(0.0f)
        .describe("L1 regularization on weights.");
    DMLC_DECLARE_FIELD(reg_lambda_bias)
        .set_lower_bound(0.0f)
        .set_default(0.0f)
        .describe("L2 regularization on bias.");
    // short names accepted as aliases of the fields above
    DMLC_DECLARE_ALIAS(learning_rate, eta);
    DMLC_DECLARE_ALIAS(reg_lambda, lambda);
    DMLC_DECLARE_ALIAS(reg_alpha, alpha);
    DMLC_DECLARE_ALIAS(reg_lambda_bias, lambda_bias);
  }
  /*!
   * \brief given the original weight, calculate the delta to apply to it
   * \param sum_grad gradient sum passed in by the caller
   * \param sum_hess hessian sum passed in by the caller
   * \param w original weight
   * \return the delta; it is 0 when sum_hess is below 1e-5, and it is
   *  clamped against -w so that w + delta does not change sign relative
   *  to the L2-only step
   */
  inline double CalcDelta(double sum_grad, double sum_hess, double w) const {
    if (sum_hess < 1e-5f) return 0.0f;
    // terms shared by the L2-only probe and both L1 branches
    const double denom = sum_hess + reg_lambda;
    const double grad_l2 = sum_grad + reg_lambda * w;
    // weight that the L2-only step would produce; its sign selects the branch
    const double probe = w - grad_l2 / denom;
    if (probe >= 0) {
      return std::max(-(grad_l2 + reg_alpha) / denom, -w);
    }
    return std::min(-(grad_l2 - reg_alpha) / denom, -w);
  }
  /*!
   * \brief given the original bias, calculate the delta to apply to it
   * \param sum_grad gradient sum passed in by the caller
   * \param sum_hess hessian sum passed in by the caller
   * \param w original bias
   * \return the delta; unlike CalcDelta there is no guard on a small
   *  denominator and no L1 term
   */
  inline double CalcDeltaBias(double sum_grad, double sum_hess, double w) const {
    const double numer = sum_grad + reg_lambda_bias * w;
    const double denom = sum_hess + reg_lambda_bias;
    return -numer / denom;
  }
};
/*! /*!
* \brief gradient boosted linear model * \brief gradient boosted linear model
* \tparam FMatrix the data type updater taking
*/ */
class GBLinear : public IGradBooster { class GBLinear : public GradientBooster {
public: public:
virtual ~GBLinear(void) { void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
}
// set model parameters
virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strncmp(name, "bst:", 4)) {
param.SetParam(name + 4, val);
}
if (model.weight.size() == 0) { if (model.weight.size() == 0) {
model.param.SetParam(name, val); model.param.InitAllowUnknown(cfg);
} }
param.InitAllowUnknown(cfg);
} }
virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*) void LoadModel(dmlc::Stream* fi) override {
model.LoadModel(fi); model.LoadModel(fi);
} }
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*) void SaveModel(dmlc::Stream* fo) const override {
model.SaveModel(fo); model.SaveModel(fo);
} }
virtual void InitModel(void) { void InitModel() override {
model.InitModel(); model.InitModel();
} }
virtual void DoBoost(IFMatrix *p_fmat, virtual void DoBoost(DMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) { std::vector<bst_gpair> *in_gpair) {
std::vector<bst_gpair> &gpair = *in_gpair; std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
@ -75,7 +129,7 @@ class GBLinear : public IGradBooster {
} }
} }
} }
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator(); dmlc::DataIter<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) { while (iter->Next()) {
// number of features // number of features
const ColBatch &batch = iter->Value(); const ColBatch &batch = iter->Value();
@ -108,22 +162,20 @@ class GBLinear : public IGradBooster {
} }
} }
virtual void Predict(IFMatrix *p_fmat, void Predict(DMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, std::vector<float> *out_preds,
std::vector<float> *out_preds, unsigned ntree_limit) override {
unsigned ntree_limit = 0) { CHECK_EQ(ntree_limit, 0)
utils::Check(ntree_limit == 0, << "GBLinear::Predict ntrees is only valid for gbtree predictor";
"GBLinear::Predict ntrees is only valid for gbtree predictor");
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
preds.resize(0); preds.resize(0);
// start collecting the prediction // start collecting the prediction
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator(); dmlc::DataIter<RowBatch> *iter = p_fmat->RowIterator();
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
while (iter->Next()) { while (iter->Next()) {
const RowBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
utils::Assert(batch.base_rowid * ngroup == preds.size(), CHECK_EQ(batch.base_rowid * ngroup, preds.size());
"base_rowid is not set correctly");
// output convention: nrow * k, where nrow is number of rows // output convention: nrow * k, where nrow is number of rows
// k is number of group // k is number of group
preds.resize(preds.size() + batch.size * ngroup); preds.resize(preds.size() + batch.size * ngroup);
@ -139,22 +191,22 @@ class GBLinear : public IGradBooster {
} }
} }
} }
virtual void Predict(const SparseBatch::Inst &inst, void Predict(const SparseBatch::Inst &inst,
std::vector<float> *out_preds, std::vector<float> *out_preds,
unsigned ntree_limit, unsigned ntree_limit,
unsigned root_index) { unsigned root_index) override {
const int ngroup = model.param.num_output_group; const int ngroup = model.param.num_output_group;
for (int gid = 0; gid < ngroup; ++gid) { for (int gid = 0; gid < ngroup; ++gid) {
this->Pred(inst, BeginPtr(*out_preds)); this->Pred(inst, dmlc::BeginPtr(*out_preds));
} }
} }
virtual void PredictLeaf(IFMatrix *p_fmat, void PredictLeaf(DMatrix *p_fmat,
const BoosterInfo &info, std::vector<float> *out_preds,
std::vector<float> *out_preds, unsigned ntree_limit) override {
unsigned ntree_limit = 0) { LOG(FATAL) << "gblinear does not support predict leaf index";
utils::Error("gblinear does not support predict leaf index");
} }
virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) {
std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) override {
std::stringstream fo(""); std::stringstream fo("");
fo << "bias:\n"; fo << "bias:\n";
for (int i = 0; i < model.param.num_output_group; ++i) { for (int i = 0; i < model.param.num_output_group; ++i) {
@ -182,76 +234,11 @@ class GBLinear : public IGradBooster {
preds[gid] = psum; preds[gid] = psum;
} }
} }
// training parameter of the linear booster
struct ParamTrain {
  /*! \brief learning_rate */
  float learning_rate;
  /*! \brief regularization weight for L2 norm */
  float reg_lambda;
  /*! \brief regularization weight for L1 norm */
  float reg_alpha;
  /*! \brief regularization weight for L2 norm in bias */
  float reg_lambda_bias;
  // constructor, fills in the default values
  ParamTrain(void)
      : learning_rate(1.0f),
        reg_lambda(0.0f),
        reg_alpha(0.0f),
        reg_lambda_bias(0.0f) {
  }
  /*!
   * \brief set one parameter by name, unknown names are ignored
   * \param name parameter name, either the short or the real name
   * \param val value of the parameter as text, parsed with atof
   */
  inline void SetParam(const char *name, const char *val) {
    using namespace std;
    // pick the field the name refers to; both the short (sync) name
    // and the real name map to the same field
    float *target = NULL;
    if (!strcmp("eta", name) || !strcmp("learning_rate", name)) {
      target = &learning_rate;
    } else if (!strcmp("lambda", name) || !strcmp("reg_lambda", name)) {
      target = &reg_lambda;
    } else if (!strcmp("alpha", name) || !strcmp("reg_alpha", name)) {
      target = &reg_alpha;
    } else if (!strcmp("lambda_bias", name) || !strcmp("reg_lambda_bias", name)) {
      target = &reg_lambda_bias;
    }
    // the value is only parsed when the name matched a field
    if (target != NULL) {
      *target = static_cast<float>(atof(val));
    }
  }
  /*!
   * \brief given the original weight, calculate the delta to apply to it
   * \param sum_grad gradient sum passed in by the caller
   * \param sum_hess hessian sum passed in by the caller
   * \param w original weight
   * \return the delta; 0 when sum_hess is below 1e-5, otherwise clamped
   *  against -w
   */
  inline double CalcDelta(double sum_grad, double sum_hess, double w) {
    if (sum_hess < 1e-5f) return 0.0f;
    // terms shared by the L2-only probe and both L1 branches
    const double denom = sum_hess + reg_lambda;
    const double grad_l2 = sum_grad + reg_lambda * w;
    // weight that the L2-only step would produce; its sign selects the branch
    const double probe = w - grad_l2 / denom;
    if (probe >= 0) {
      return std::max(-(grad_l2 + reg_alpha) / denom, -w);
    }
    return std::min(-(grad_l2 - reg_alpha) / denom, -w);
  }
  /*!
   * \brief given the original bias, calculate the delta to apply to it
   * \param sum_grad gradient sum passed in by the caller
   * \param sum_hess hessian sum passed in by the caller
   * \param w original bias
   * \return the delta of the bias
   */
  inline double CalcDeltaBias(double sum_grad, double sum_hess, double w) {
    const double numer = sum_grad + reg_lambda_bias * w;
    const double denom = sum_hess + reg_lambda_bias;
    return -numer / denom;
  }
};
// model for linear booster // model for linear booster
class Model { class Model {
public: public:
// model parameter of the linear booster
struct Param {
  // number of feature dimension
  unsigned num_feature;
  // number of output group
  int num_output_group;
  // reserved field
  int reserved[32];
  // constructor, fills in the defaults and zeroes the reserved field
  Param(void) : num_feature(0), num_output_group(1) {
    std::memset(reserved, 0, sizeof(reserved));
  }
  /*!
   * \brief set one parameter by name, unknown names are ignored
   * \param name parameter name; the feature count is only accepted
   *  under the prefixed name "bst:num_feature"
   * \param val value of the parameter as text, parsed with atoi
   */
  inline void SetParam(const char *name, const char *val) {
    using namespace std;
    if (strcmp(name, "bst:num_feature") == 0) {
      num_feature = static_cast<unsigned>(atoi(val));
    }
    if (strcmp(name, "num_output_group") == 0) {
      num_output_group = atoi(val);
    }
  }
};
// parameter // parameter
Param param; GBLinearModelParam param;
// weight for each of feature, bias is the last one // weight for each of feature, bias is the last one
std::vector<float> weight; std::vector<float> weight;
// initialize the model parameter // initialize the model parameter
@ -261,14 +248,14 @@ class GBLinear : public IGradBooster {
std::fill(weight.begin(), weight.end(), 0.0f); std::fill(weight.begin(), weight.end(), 0.0f);
} }
// save the model to file // save the model to file
inline void SaveModel(utils::IStream &fo) const { // NOLINT(*) inline void SaveModel(dmlc::Stream* fo) const {
fo.Write(&param, sizeof(Param)); fo->Write(&param, sizeof(param));
fo.Write(weight); fo->Write(weight);
} }
// load model from file // load model from file
inline void LoadModel(utils::IStream &fi) { // NOLINT(*) inline void LoadModel(dmlc::Stream* fi) {
utils::Assert(fi.Read(&param, sizeof(Param)) != 0, "Load LinearBooster"); CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param));
fi.Read(&weight); fi->Read(&weight);
} }
// model bias // model bias
inline float* bias(void) { inline float* bias(void) {
@ -282,11 +269,20 @@ class GBLinear : public IGradBooster {
// model field // model field
Model model; Model model;
// training parameter // training parameter
ParamTrain param; GBLinearTrainParam param;
// Per feature: shuffle index of each feature index // Per feature: shuffle index of each feature index
std::vector<bst_uint> feat_index; std::vector<bst_uint> feat_index;
}; };
// register the parameter structures of the linear booster, then register
// the booster itself under the name "gblinear"; the registered body is a
// factory that returns a newly allocated GBLinear
DMLC_REGISTER_PARAMETER(GBLinearModelParam);
DMLC_REGISTER_PARAMETER(GBLinearTrainParam);
XGBOOST_REGISTER_GBM(GBLinear, "gblinear")
.describe("Linear booster, implement generalized linear model.")
.set_body([]() {
return new GBLinear();
});
} // namespace gbm } // namespace gbm
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_GBM_GBLINEAR_INL_HPP_

View File

@ -1,138 +1,201 @@
/*! /*!
* Copyright by Contributors * Copyright 2014 by Contributors
* \file gbtree-inl.hpp * \file gbtree.cc
* \brief gradient boosted tree implementation * \brief gradient boosted tree implementation.
* \author Tianqi Chen * \author Tianqi Chen
*/ */
#ifndef XGBOOST_GBM_GBTREE_INL_HPP_ #include <dmlc/logging.h>
#define XGBOOST_GBM_GBTREE_INL_HPP_ #include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/gbm.h>
#include <xgboost/tree_updater.h>
#include <vector> #include <vector>
#include <memory>
#include <utility> #include <utility>
#include <string> #include <string>
#include <limits> #include <limits>
#include "./gbm.h"
#include "../utils/omp.h"
#include "../tree/updater.h"
namespace xgboost { namespace xgboost {
namespace gbm { namespace gbm {
/*!
* \brief gradient boosted tree /*! \brief training parameters */
*/ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
class GBTree : public IGradBooster { /*! \brief number of threads */
int nthread;
/*!
* \brief number of parallel trees constructed each iteration
* use this option to support boosted random forest
*/
int num_parallel_tree;
/*! \brief tree updater sequence */
std::string updater_seq;
// declare parameters
DMLC_DECLARE_PARAMETER(GBTreeTrainParam) {
DMLC_DECLARE_FIELD(nthread).set_lower_bound(0).set_default(0)
.describe("Number of threads used for training.");
DMLC_DECLARE_FIELD(num_parallel_tree).set_lower_bound(1).set_default(1)
.describe("Number of parallel trees constructed during each iteration."\
" This option is used to support boosted random forest");
DMLC_DECLARE_FIELD(updater_seq).set_default("grow_colmaker,prune")
.describe("Tree updater sequence.");
}
};
/*! \brief model parameters */
struct GBTreeModelParam : public dmlc::Parameter<GBTreeModelParam> {
  /*! \brief number of trees */
  int num_trees;
  /*! \brief number of roots */
  int num_roots;
  /*! \brief number of features to be used by trees */
  int num_feature;
  /*! \brief pad this space, for backward compatibility reason.*/
  int pad_32bit;
  /*! \brief size of prediction buffer allocated used for buffering */
  int64_t num_pbuffer;
  /*!
   * \brief how many output group a single instance can produce
   *  this affects the behavior of number of output we have:
   *    suppose we have n instance and k group, output will be k * n
   */
  int num_output_group;
  /*! \brief size of leaf vector needed in tree */
  int size_leaf_vector;
  /*! \brief reserved parameters */
  int reserved[32];
  /*! \brief constructor, zero-fills every byte of the structure */
  GBTreeModelParam() {
    std::memset(this, 0, sizeof(GBTreeModelParam));
    // the structure size is pinned: 4 ints, one int64 (2 ints),
    // 2 ints and the 32 reserved ints
    static_assert(sizeof(GBTreeModelParam) == (4 + 2 + 2 + 32) * sizeof(int),
                  "64/32 bit compatibility issue");
  }
  // declare parameters, only declare those that need to be set.
  DMLC_DECLARE_PARAMETER(GBTreeModelParam) {
    DMLC_DECLARE_FIELD(num_output_group).set_lower_bound(1).set_default(1)
        .describe("Number of output groups to be predicted,"
                  " used for multi-class classification.");
    // the description used to be a copy of the updater_seq text
    DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1)
        .describe("Number of tree roots.");
    DMLC_DECLARE_FIELD(num_feature).set_lower_bound(0)
        .describe("Number of features used for training and prediction.");
    DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
        .describe("Reserved option for vector tree.");
  }
  /*!
   * \return size of prediction buffer actually needed, computed as
   *  num_output_group * num_pbuffer * (size_leaf_vector + 1)
   */
  inline size_t PredBufferSize() const {
    return num_output_group * num_pbuffer * (size_leaf_vector + 1);
  }
  /*!
   * \brief get the buffer offset given a buffer index and group id
   * \param buffer_index index into the prediction buffer; a negative
   *  value yields -1, otherwise it is checked to be below num_pbuffer
   * \param bst_group id of the output group
   * \return calculated buffer offset, or -1 when buffer_index is negative
   */
  inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
    if (buffer_index < 0) return -1;
    CHECK_LT(buffer_index, num_pbuffer);
    return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
  }
};
// gradient boosted trees
class GBTree : public GradientBooster {
public: public:
GBTree(void) { void Configure(const std::vector<std::pair<std::string, std::string> >& cfg) override {
} this->cfg = cfg;
virtual ~GBTree(void) { // initialize model parameters if not yet been initialized.
this->Clear(); if (trees.size() == 0) {
} mparam.InitAllowUnknown(cfg);
virtual void SetParam(const char *name, const char *val) {
using namespace std;
if (!strncmp(name, "bst:", 4)) {
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
// set into updaters, if already initialized
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->SetParam(name+4, val);
}
} }
if (!strcmp(name, "silent")) { // initialize the updaters only when needed.
this->SetParam("bst:silent", val); std::string updater_seq = tparam.updater_seq;
tparam.InitAllowUnknown(cfg);
if (updater_seq != tparam.updater_seq) updaters.clear();
for (const auto& up : updaters) {
up->Init(cfg);
} }
tparam.SetParam(name, val);
if (trees.size() == 0) mparam.SetParam(name, val);
} }
virtual void LoadModel(utils::IStream &fi, bool with_pbuffer) { // NOLINT(*) void LoadModel(dmlc::Stream* fi) override {
this->Clear(); CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam))
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0, << "GBTree: invalid model file";
"GBTree: invalid model file"); trees.clear();
trees.resize(mparam.num_trees); for (int i = 0; i < mparam.num_trees; ++i) {
for (size_t i = 0; i < trees.size(); ++i) { std::unique_ptr<RegTree> ptr(new RegTree());
trees[i] = new tree::RegTree(); ptr->LoadModel(fi);
trees[i]->LoadModel(fi); trees.push_back(std::move(ptr));
} }
tree_info.resize(mparam.num_trees); tree_info.resize(mparam.num_trees);
if (mparam.num_trees != 0) { if (mparam.num_trees != 0) {
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0, CHECK_EQ(fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * mparam.num_trees),
"GBTree: invalid model file"); sizeof(int) * mparam.num_trees);
}
if (mparam.num_pbuffer != 0 && with_pbuffer) {
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
"GBTree: invalid model file");
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
"GBTree: invalid model file");
} }
this->ResetPredBuffer(0);
} }
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const { // NOLINT(*) void SaveModel(dmlc::Stream* fo) const override {
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree"); CHECK_EQ(mparam.num_trees, static_cast<int>(trees.size()));
if (with_pbuffer) { // not save predict buffer.
fo.Write(&mparam, sizeof(ModelParam)); GBTreeModelParam p = mparam;
} else { p.num_pbuffer = 0;
ModelParam p = mparam; fo->Write(&p, sizeof(p));
p.num_pbuffer = 0;
fo.Write(&p, sizeof(ModelParam));
}
for (size_t i = 0; i < trees.size(); ++i) { for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fo); trees[i]->SaveModel(fo);
} }
if (tree_info.size() != 0) { if (tree_info.size() != 0) {
fo.Write(BeginPtr(tree_info), sizeof(int) * tree_info.size()); fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size());
}
if (mparam.num_pbuffer != 0 && with_pbuffer) {
fo.Write(BeginPtr(pred_buffer), pred_buffer.size() * sizeof(float));
fo.Write(BeginPtr(pred_counter), pred_counter.size() * sizeof(unsigned));
} }
} }
// initialize the predict buffer void InitModel() override {
virtual void InitModel(void) { CHECK(mparam.num_trees == 0 && trees.size() == 0)
pred_buffer.clear(); pred_counter.clear(); << "Model has already been initialized.";
pred_buffer.clear();
pred_counter.clear();
pred_buffer.resize(mparam.PredBufferSize(), 0.0f); pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
pred_counter.resize(mparam.PredBufferSize(), 0); pred_counter.resize(mparam.PredBufferSize(), 0);
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
} }
virtual void ResetPredBuffer(size_t num_pbuffer) { void ResetPredBuffer(size_t num_pbuffer) override {
mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer); mparam.num_pbuffer = static_cast<int64_t>(num_pbuffer);
pred_buffer.clear(); pred_counter.clear(); pred_buffer.clear();
pred_counter.clear();
pred_buffer.resize(mparam.PredBufferSize(), 0.0f); pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
pred_counter.resize(mparam.PredBufferSize(), 0); pred_counter.resize(mparam.PredBufferSize(), 0);
} }
virtual bool AllowLazyCheckPoint(void) const { bool AllowLazyCheckPoint() const override {
return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1); return mparam.num_output_group == 1 ||
tparam.updater_seq.find("distcol") != std::string::npos;
} }
virtual void DoBoost(IFMatrix *p_fmat, void DoBoost(DMatrix* p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, std::vector<bst_gpair>* in_gpair) override {
std::vector<bst_gpair> *in_gpair) { const std::vector<bst_gpair>& gpair = *in_gpair;
const std::vector<bst_gpair> &gpair = *in_gpair; std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
std::vector<std::vector<tree::RegTree*> > new_trees;
if (mparam.num_output_group == 1) { if (mparam.num_output_group == 1) {
new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0)); std::vector<std::unique_ptr<RegTree> > ret;
BoostNewTrees(gpair, p_fmat, buffer_offset, 0, &ret);
new_trees.push_back(std::move(ret));
} else { } else {
const int ngroup = mparam.num_output_group; const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0, CHECK_EQ(gpair.size() % ngroup, 0)
"must have exactly ngroup*nrow gpairs"); << "must have exactly ngroup*nrow gpairs";
std::vector<bst_gpair> tmp(gpair.size()/ngroup); std::vector<bst_gpair> tmp(gpair.size() / ngroup);
for (int gid = 0; gid < ngroup; ++gid) { for (int gid = 0; gid < ngroup; ++gid) {
bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size()); bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid]; tmp[i] = gpair[i * ngroup + gid];
} }
new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid)); std::vector<std::unique_ptr<RegTree> > ret;
BoostNewTrees(tmp, p_fmat, buffer_offset, gid, &ret);
new_trees.push_back(std::move(ret));
} }
} }
for (int gid = 0; gid < mparam.num_output_group; ++gid) { for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->CommitModel(new_trees[gid], gid); this->CommitModel(std::move(new_trees[gid]), gid);
} }
} }
virtual void Predict(IFMatrix *p_fmat, void Predict(DMatrix* p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, std::vector<float>* out_preds,
std::vector<float> *out_preds, unsigned ntree_limit) override {
unsigned ntree_limit = 0) { const MetaInfo& info = p_fmat->info();
int nthread; int nthread;
#pragma omp parallel #pragma omp parallel
{ {
@ -140,10 +203,11 @@ class GBTree : public IGradBooster {
} }
InitThreadTemp(nthread); InitThreadTemp(nthread);
std::vector<float> &preds = *out_preds; std::vector<float> &preds = *out_preds;
const size_t stride = info.num_row * mparam.num_output_group; const size_t stride = p_fmat->info().num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1)); preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction // start collecting the prediction
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator(); dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const RowBatch &batch = iter->Value(); const RowBatch &batch = iter->Value();
@ -152,9 +216,9 @@ class GBTree : public IGradBooster {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
tree::RegTree::FVec &feats = thread_temp[tid]; RegTree::FVec &feats = thread_temp[tid];
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i); int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
utils::Assert(static_cast<size_t>(ridx) < info.num_row, "data row index exceed bound"); CHECK_LT(static_cast<size_t>(ridx), info.num_row);
// loop over output groups // loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) { for (int gid = 0; gid < mparam.num_output_group; ++gid) {
this->Pred(batch[i], this->Pred(batch[i],
@ -166,12 +230,12 @@ class GBTree : public IGradBooster {
} }
} }
} }
virtual void Predict(const SparseBatch::Inst &inst, void Predict(const SparseBatch::Inst& inst,
std::vector<float> *out_preds, std::vector<float>* out_preds,
unsigned ntree_limit, unsigned ntree_limit,
unsigned root_index) { unsigned root_index) override {
if (thread_temp.size() == 0) { if (thread_temp.size() == 0) {
thread_temp.resize(1, tree::RegTree::FVec()); thread_temp.resize(1, RegTree::FVec());
thread_temp[0].Init(mparam.num_feature); thread_temp[0].Init(mparam.num_feature);
} }
out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1)); out_preds->resize(mparam.num_output_group * (mparam.size_leaf_vector+1));
@ -182,120 +246,99 @@ class GBTree : public IGradBooster {
ntree_limit); ntree_limit);
} }
} }
virtual void PredictLeaf(IFMatrix *p_fmat, void PredictLeaf(DMatrix* p_fmat,
const BoosterInfo &info, std::vector<float>* out_preds,
std::vector<float> *out_preds, unsigned ntree_limit) {
unsigned ntree_limit) {
int nthread; int nthread;
#pragma omp parallel #pragma omp parallel
{ {
nthread = omp_get_num_threads(); nthread = omp_get_num_threads();
} }
InitThreadTemp(nthread); InitThreadTemp(nthread);
this->PredPath(p_fmat, info, out_preds, ntree_limit); this->PredPath(p_fmat, out_preds, ntree_limit);
} }
virtual std::vector<std::string> DumpModel(const utils::FeatMap& fmap, int option) { std::vector<std::string> Dump2Text(const FeatureMap& fmap, int option) {
std::vector<std::string> dump; std::vector<std::string> dump;
for (size_t i = 0; i < trees.size(); i++) { for (size_t i = 0; i < trees.size(); i++) {
dump.push_back(trees[i]->DumpModel(fmap, option&1)); dump.push_back(trees[i]->Dump2Text(fmap, option & 1));
} }
return dump; return dump;
} }
protected: protected:
// clear the model
inline void Clear(void) {
for (size_t i = 0; i < trees.size(); ++i) {
delete trees[i];
}
for (size_t i = 0; i < updaters.size(); ++i) {
delete updaters[i];
}
updaters.clear();
trees.clear();
pred_buffer.clear();
pred_counter.clear();
}
// initialize updater before using them // initialize updater before using them
inline void InitUpdater(void) { inline void InitUpdater() {
if (tparam.updater_initialized != 0) return; if (updaters.size() != 0) return;
for (size_t i = 0; i < updaters.size(); ++i) {
delete updaters[i];
}
updaters.clear();
std::string tval = tparam.updater_seq; std::string tval = tparam.updater_seq;
char *pstr; char *pstr;
pstr = std::strtok(&tval[0], ","); pstr = std::strtok(&tval[0], ",");
while (pstr != NULL) { while (pstr != nullptr) {
updaters.push_back(tree::CreateUpdater(pstr)); std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr));
for (size_t j = 0; j < cfg.size(); ++j) { up->Init(this->cfg);
// set parameters updaters.push_back(std::move(up));
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str()); pstr = std::strtok(nullptr, ",");
}
pstr = std::strtok(NULL, ",");
} }
tparam.updater_initialized = 1;
} }
// do group specific group // do group specific group
inline std::vector<tree::RegTree*> inline void
BoostNewTrees(const std::vector<bst_gpair> &gpair, BoostNewTrees(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat, DMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
const BoosterInfo &info, int bst_group,
int bst_group) { std::vector<std::unique_ptr<RegTree> >* ret) {
std::vector<tree::RegTree *> new_trees;
this->InitUpdater(); this->InitUpdater();
std::vector<RegTree*> new_trees;
ret->clear();
// create the trees // create the trees
for (int i = 0; i < tparam.num_parallel_tree; ++i) { for (int i = 0; i < tparam.num_parallel_tree; ++i) {
new_trees.push_back(new tree::RegTree()); std::unique_ptr<RegTree> ptr(new RegTree());
for (size_t j = 0; j < cfg.size(); ++j) { ptr->param.InitAllowUnknown(this->cfg);
new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str()); ptr->InitModel();
} new_trees.push_back(ptr.get());
new_trees.back()->InitModel(); ret->push_back(std::move(ptr));
} }
// update the trees // update the trees
for (size_t i = 0; i < updaters.size(); ++i) { for (auto& up : updaters) {
updaters[i]->Update(gpair, p_fmat, info, new_trees); up->Update(gpair, p_fmat, new_trees);
} }
// optimization, update buffer, if possible // optimization, update buffer, if possible
// this is only under distributed column mode // this is only under distributed column mode
// for safety check of lazy checkpoint // for safety check of lazy checkpoint
if ( if (buffer_offset >= 0 &&
buffer_offset >= 0 &&
new_trees.size() == 1 && updaters.size() > 0 && new_trees.size() == 1 && updaters.size() > 0 &&
updaters.back()->GetLeafPosition() != NULL) { updaters.back()->GetLeafPosition() != nullptr) {
utils::Check(info.num_row == p_fmat->buffered_rowset().size(), CHECK_EQ(p_fmat->info().num_row, p_fmat->buffered_rowset().size());
"distributed mode is not compatible with prob_buffer_row");
this->UpdateBufferByPosition(p_fmat, this->UpdateBufferByPosition(p_fmat,
buffer_offset, bst_group, buffer_offset,
bst_group,
*new_trees[0], *new_trees[0],
updaters.back()->GetLeafPosition()); updaters.back()->GetLeafPosition());
} }
return new_trees;
} }
// commit new trees all at once // commit new trees all at once
inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) { inline void CommitModel(std::vector<std::unique_ptr<RegTree> >&& new_trees,
int bst_group) {
for (size_t i = 0; i < new_trees.size(); ++i) { for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(new_trees[i]); trees.push_back(std::move(new_trees[i]));
tree_info.push_back(bst_group); tree_info.push_back(bst_group);
} }
mparam.num_trees += static_cast<int>(new_trees.size()); mparam.num_trees += static_cast<int>(new_trees.size());
} }
// update buffer by pre-cached position // update buffer by pre-cached position
inline void UpdateBufferByPosition(IFMatrix *p_fmat, inline void UpdateBufferByPosition(DMatrix *p_fmat,
int64_t buffer_offset, int64_t buffer_offset,
int bst_group, int bst_group,
const tree::RegTree &new_tree, const RegTree &new_tree,
const int* leaf_position) { const int* leaf_position) {
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset(); const std::vector<bst_uint>& rowset = p_fmat->buffered_rowset();
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size()); const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < ndata; ++i) { for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i]; const bst_uint ridx = rowset[i];
const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group); const int64_t bid = mparam.BufferOffset(buffer_offset + ridx, bst_group);
const int tid = leaf_position[ridx]; const int tid = leaf_position[ridx];
utils::Assert(pred_counter[bid] == trees.size(), "cached buffer not up to date"); CHECK_EQ(pred_counter[bid], trees.size());
utils::Assert(tid >= 0, "invalid leaf position"); CHECK_GE(tid, 0);
pred_buffer[bid] += new_tree[tid].leaf_value(); pred_buffer[bid] += new_tree[tid].leaf_value();
for (int i = 0; i < mparam.size_leaf_vector; ++i) { for (int i = 0; i < mparam.size_leaf_vector; ++i) {
pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i]; pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
@ -308,8 +351,9 @@ class GBTree : public IGradBooster {
int64_t buffer_index, int64_t buffer_index,
int bst_group, int bst_group,
unsigned root_index, unsigned root_index,
tree::RegTree::FVec *p_feats, RegTree::FVec *p_feats,
float *out_pred, size_t stride, float *out_pred,
size_t stride,
unsigned ntree_limit) { unsigned ntree_limit) {
size_t itop = 0; size_t itop = 0;
float psum = 0.0f; float psum = 0.0f;
@ -354,28 +398,28 @@ class GBTree : public IGradBooster {
} }
} }
// predict independent leaf index // predict independent leaf index
inline void PredPath(IFMatrix *p_fmat, inline void PredPath(DMatrix *p_fmat,
const BoosterInfo &info,
std::vector<float> *out_preds, std::vector<float> *out_preds,
unsigned ntree_limit) { unsigned ntree_limit) {
const MetaInfo& info = p_fmat->info();
// number of valid trees // number of valid trees
if (ntree_limit == 0 || ntree_limit > trees.size()) { if (ntree_limit == 0 || ntree_limit > trees.size()) {
ntree_limit = static_cast<unsigned>(trees.size()); ntree_limit = static_cast<unsigned>(trees.size());
} }
std::vector<float> &preds = *out_preds; std::vector<float>& preds = *out_preds;
preds.resize(info.num_row * ntree_limit); preds.resize(info.num_row * ntree_limit);
// start collecting the prediction // start collecting the prediction
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator(); dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
iter->BeforeFirst(); iter->BeforeFirst();
while (iter->Next()) { while (iter->Next()) {
const RowBatch &batch = iter->Value(); const RowBatch& batch = iter->Value();
// parallel over local batch // parallel over local batch
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size); const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) { for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
size_t ridx = static_cast<size_t>(batch.base_rowid + i); size_t ridx = static_cast<size_t>(batch.base_rowid + i);
tree::RegTree::FVec &feats = thread_temp[tid]; RegTree::FVec &feats = thread_temp[tid];
feats.Fill(batch[i]); feats.Fill(batch[i]);
for (unsigned j = 0; j < ntree_limit; ++j) { for (unsigned j = 0; j < ntree_limit; ++j) {
int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx)); int tid = trees[j]->GetLeafIndex(feats, info.GetRoot(ridx));
@ -389,7 +433,7 @@ class GBTree : public IGradBooster {
inline void InitThreadTemp(int nthread) { inline void InitThreadTemp(int nthread) {
int prev_thread_temp_size = thread_temp.size(); int prev_thread_temp_size = thread_temp.size();
if (prev_thread_temp_size < nthread) { if (prev_thread_temp_size < nthread) {
thread_temp.resize(nthread, tree::RegTree::FVec()); thread_temp.resize(nthread, RegTree::FVec());
for (int i = prev_thread_temp_size; i < nthread; ++i) { for (int i = prev_thread_temp_size; i < nthread; ++i) {
thread_temp[i].Init(mparam.num_feature); thread_temp[i].Init(mparam.num_feature);
} }
@ -397,109 +441,12 @@ class GBTree : public IGradBooster {
} }
// --- data structure --- // --- data structure ---
/*! \brief training parameters */
struct TrainParam {
/*! \brief number of threads */
int nthread;
/*!
* \brief number of parallel trees constructed each iteration
* use this option to support boosted random forest
*/
int num_parallel_tree;
/*! \brief whether updater is already initialized */
int updater_initialized;
/*! \brief distributed column mode */
int distcol_mode;
/*! \brief tree updater sequence */
std::string updater_seq;
// construction
TrainParam(void) {
nthread = 0;
updater_seq = "grow_colmaker,prune";
num_parallel_tree = 1;
updater_initialized = 0;
distcol_mode = 0;
}
inline void SetParam(const char *name, const char *val){
using namespace std;
if (!strcmp(name, "updater") &&
strcmp(updater_seq.c_str(), val) != 0) {
updater_seq = val;
updater_initialized = 0;
}
if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
distcol_mode = 1;
}
if (!strcmp(name, "nthread")) {
omp_set_num_threads(nthread = atoi(val));
}
if (!strcmp(name, "num_parallel_tree")) {
num_parallel_tree = atoi(val);
}
}
};
/*! \brief model parameters */
struct ModelParam {
/*! \brief number of trees */
int num_trees;
/*! \brief number of root: default 0, means single tree */
int num_roots;
/*! \brief number of features to be used by trees */
int num_feature;
/*! \brief size of prediction buffer allocated used for buffering */
int64_t num_pbuffer;
/*!
* \brief how many output group a single instance can produce
* this affects the behavior of number of output we have:
* suppose we have n instance and k group, output will be k*n
*/
int num_output_group;
/*! \brief size of leaf vector needed in tree */
int size_leaf_vector;
/*! \brief reserved parameters */
int reserved[31];
/*! \brief constructor */
ModelParam(void) {
std::memset(this, 0, sizeof(ModelParam));
num_trees = 0;
num_roots = num_feature = 0;
num_pbuffer = 0;
num_output_group = 1;
size_leaf_vector = 0;
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
using namespace std;
if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
if (!strcmp("num_output_group", name)) num_output_group = atol(val);
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
if (!strcmp("bst:size_leaf_vector", name)) size_leaf_vector = atoi(val);
}
/*! \return size of prediction buffer actually needed */
inline size_t PredBufferSize(void) const {
return num_output_group * num_pbuffer * (size_leaf_vector + 1);
}
/*!
* \brief get the buffer offset given a buffer index and group id
* \return calculated buffer offset
*/
inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
if (buffer_index < 0) return -1;
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
return (buffer_index + num_pbuffer * bst_group) * (size_leaf_vector + 1);
}
};
// training parameter // training parameter
TrainParam tparam; GBTreeTrainParam tparam;
// model parameter // model parameter
ModelParam mparam; GBTreeModelParam mparam;
/*! \brief vector of trees stored in the model */ /*! \brief vector of trees stored in the model */
std::vector<tree::RegTree*> trees; std::vector<std::unique_ptr<RegTree> > trees;
/*! \brief some information indicator of the tree, reserved */ /*! \brief some information indicator of the tree, reserved */
std::vector<int> tree_info; std::vector<int> tree_info;
/*! \brief prediction buffer */ /*! \brief prediction buffer */
@ -508,13 +455,22 @@ class GBTree : public IGradBooster {
std::vector<unsigned> pred_counter; std::vector<unsigned> pred_counter;
// ----training fields---- // ----training fields----
// configurations for tree // configurations for tree
std::vector< std::pair<std::string, std::string> > cfg; std::vector<std::pair<std::string, std::string> > cfg;
// temporal storage for per thread // temporal storage for per thread
std::vector<tree::RegTree::FVec> thread_temp; std::vector<RegTree::FVec> thread_temp;
// the updaters that can be applied to each of tree // the updaters that can be applied to each of tree
std::vector<tree::IUpdater*> updaters; std::vector<std::unique_ptr<TreeUpdater> > updaters;
}; };
// register the ojective functions
DMLC_REGISTER_PARAMETER(GBTreeModelParam);
DMLC_REGISTER_PARAMETER(GBTreeTrainParam);
XGBOOST_REGISTER_GBM(GBTree, "gbtree")
.describe("Tree booster, gradient boosted trees.")
.set_body([]() {
return new GBTree();
});
} // namespace gbm } // namespace gbm
} // namespace xgboost } // namespace xgboost
#endif // XGBOOST_GBM_GBTREE_INL_HPP_

View File

@ -6,6 +6,7 @@
*/ */
#include <dmlc/logging.h> #include <dmlc/logging.h>
#include <dmlc/omp.h> #include <dmlc/omp.h>
#include <dmlc/parameter.h>
#include <xgboost/objective.h> #include <xgboost/objective.h>
#include <vector> #include <vector>
#include <algorithm> #include <algorithm>