mv code into src
This commit is contained in:
82
src/gbm/gbm.h
Normal file
82
src/gbm/gbm.h
Normal file
@@ -0,0 +1,82 @@
|
||||
#ifndef XGBOOST_GBM_GBM_H_
|
||||
#define XGBOOST_GBM_GBM_H_
|
||||
/*!
|
||||
* \file gbm.h
|
||||
* \brief interface of gradient booster, that learns through gradient statistics
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for gradient booster */
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief interface of gradient boosting model
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class IGradBooster {
|
||||
public:
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam(const char *name, const char *val) = 0;
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
virtual void LoadModel(utils::IStream &fi) = 0;
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void SaveModel(utils::IStream &fo) const = 0;
|
||||
/*!
|
||||
* \brief initialize the model
|
||||
*/
|
||||
virtual void InitModel(void) = 0;
|
||||
/*!
|
||||
* \brief peform update to the model(boosting)
|
||||
* \param gpair the gradient pair statistics of the data
|
||||
* \param fmat feature matrix that provide access to features
|
||||
* \param root_index pre-partitioned root_index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index) = 0;
|
||||
/*!
|
||||
* \brief generate predictions for given feature matrix
|
||||
* \param fmat feature matrix
|
||||
* \param buffer_offset buffer index offset of these instances, if equals -1
|
||||
* this means we do not have buffer index allocated to the gbm
|
||||
* a buffer index is assigned to each instance that requires repeative prediction
|
||||
* the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
|
||||
* \param root_index pre-partitioned root_index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
* \param out_preds output vector to hold the predictions
|
||||
*/
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
int64_t buffer_offset,
|
||||
const std::vector<unsigned> &root_index,
|
||||
std::vector<float> *out_preds) = 0;
|
||||
// destrcutor
|
||||
virtual ~IGradBooster(void){}
|
||||
};
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#include "gbtree-inl.hpp"
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
template<typename FMatrix>
|
||||
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
|
||||
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
|
||||
utils::Error("unknown booster type: %s", name);
|
||||
return NULL;
|
||||
}
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_GBM_GBM_H_
|
||||
365
src/gbm/gbtree-inl.hpp
Normal file
365
src/gbm/gbtree-inl.hpp
Normal file
@@ -0,0 +1,365 @@
|
||||
#ifndef XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
#define XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
/*!
|
||||
* \file gbtree-inl.hpp
|
||||
* \brief gradient boosted tree implementation
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "./gbm.h"
|
||||
#include "../tree/updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief gradient boosted tree
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class GBTree : public IGradBooster<FMatrix> {
|
||||
public:
|
||||
virtual ~GBTree(void) {
|
||||
this->Clear();
|
||||
}
|
||||
virtual void SetParam(const char *name, const char *val) {
|
||||
if (!strncmp(name, "bst:", 4)) {
|
||||
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
|
||||
// set into updaters, if already intialized
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
updaters[i]->SetParam(name+4, val);
|
||||
}
|
||||
}
|
||||
if (!strcmp(name, "silent")) {
|
||||
this->SetParam("bst:silent", val);
|
||||
}
|
||||
tparam.SetParam(name, val);
|
||||
if (trees.size() == 0) mparam.SetParam(name, val);
|
||||
}
|
||||
virtual void LoadModel(utils::IStream &fi) {
|
||||
this->Clear();
|
||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
trees.resize(mparam.num_trees);
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i] = new tree::RegTree();
|
||||
trees[i]->LoadModel(fi);
|
||||
}
|
||||
tree_info.resize(mparam.num_trees);
|
||||
if (mparam.num_trees != 0) {
|
||||
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
|
||||
"GBTree: invalid model file");
|
||||
}
|
||||
if (mparam.num_pbuffer != 0) {
|
||||
pred_buffer.resize(mparam.PredBufferSize());
|
||||
pred_counter.resize(mparam.PredBufferSize());
|
||||
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
}
|
||||
}
|
||||
virtual void SaveModel(utils::IStream &fo) const {
|
||||
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i]->SaveModel(fo);
|
||||
}
|
||||
if (tree_info.size() != 0) {
|
||||
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
|
||||
}
|
||||
if (mparam.num_pbuffer != 0) {
|
||||
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
|
||||
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
|
||||
}
|
||||
}
|
||||
// initialize the predic buffer
|
||||
virtual void InitModel(void) {
|
||||
pred_buffer.clear(); pred_counter.clear();
|
||||
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
|
||||
pred_counter.resize(mparam.PredBufferSize(), 0);
|
||||
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
|
||||
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
|
||||
}
|
||||
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index) {
|
||||
if (mparam.num_output_group == 1) {
|
||||
this->BoostNewTrees(gpair, fmat, root_index, 0);
|
||||
} else {
|
||||
const int ngroup = mparam.num_output_group;
|
||||
utils::Check(gpair.size() % ngroup == 0,
|
||||
"must have exactly ngroup*nrow gpairs");
|
||||
std::vector<bst_gpair> tmp(gpair.size()/ngroup);
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (size_t i = 0; i < tmp.size(); ++i) {
|
||||
tmp[i] = gpair[i * ngroup + gid];
|
||||
}
|
||||
this->BoostNewTrees(tmp, fmat, root_index, gid);
|
||||
}
|
||||
}
|
||||
}
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
int64_t buffer_offset,
|
||||
const std::vector<unsigned> &root_index,
|
||||
std::vector<float> *out_preds) {
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
this->InitThreadTemp(nthread);
|
||||
std::vector<float> &preds = *out_preds;
|
||||
preds.resize(0);
|
||||
// start collecting the prediction
|
||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
|
||||
"base_rowid is not set correctly");
|
||||
// output convention: nrow * k, where nrow is number of rows
|
||||
// k is number of group
|
||||
preds.resize(preds.size() + batch.size * mparam.num_output_group);
|
||||
// parallel over local batch
|
||||
const unsigned nsize = static_cast<unsigned>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned i = 0; i < nsize; ++i) {
|
||||
const int tid = omp_get_thread_num();
|
||||
std::vector<float> &feats = thread_temp[tid];
|
||||
const size_t ridx = batch.base_rowid + i;
|
||||
const unsigned root_idx = root_index.size() == 0 ? 0 : root_index[ridx];
|
||||
// loop over output groups
|
||||
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
|
||||
preds[ridx * mparam.num_output_group + gid] =
|
||||
this->Pred(batch[i],
|
||||
buffer_offset < 0 ? -1 : buffer_offset+ridx,
|
||||
gid, root_idx, &feats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// clear the model
|
||||
inline void Clear(void) {
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
delete trees[i];
|
||||
}
|
||||
trees.clear();
|
||||
pred_buffer.clear();
|
||||
pred_counter.clear();
|
||||
}
|
||||
// initialize updater before using them
|
||||
inline void InitUpdater(void) {
|
||||
if (tparam.updater_initialized != 0) return;
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
delete updaters[i];
|
||||
}
|
||||
updaters.clear();
|
||||
std::string tval = tparam.updater_seq;
|
||||
char *saveptr, *pstr;
|
||||
pstr = strtok_r(&tval[0], ",", &saveptr);
|
||||
while (pstr != NULL) {
|
||||
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
|
||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||
// set parameters
|
||||
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
|
||||
}
|
||||
pstr = strtok_r(NULL, ",", &saveptr);
|
||||
}
|
||||
tparam.updater_initialized = 1;
|
||||
}
|
||||
// do group specific group
|
||||
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
int bst_group) {
|
||||
this->InitUpdater();
|
||||
// create the trees
|
||||
std::vector<tree::RegTree *> new_trees;
|
||||
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
||||
new_trees.push_back(new tree::RegTree());
|
||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||
new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
|
||||
}
|
||||
new_trees.back()->InitModel();
|
||||
}
|
||||
// update the trees
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
updaters[i]->Update(gpair, fmat, root_index, new_trees);
|
||||
}
|
||||
// push back to model
|
||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||
trees.push_back(new_trees[i]);
|
||||
tree_info.push_back(bst_group);
|
||||
}
|
||||
mparam.num_trees += tparam.num_parallel_tree;
|
||||
}
|
||||
// make a prediction for a single instance
|
||||
inline float Pred(const SparseBatch::Inst &inst,
|
||||
int64_t buffer_index,
|
||||
int bst_group,
|
||||
unsigned root_index,
|
||||
std::vector<float> *p_feats) {
|
||||
size_t itop = 0;
|
||||
float psum = 0.0f;
|
||||
const int bid = mparam.BufferOffset(buffer_index, bst_group);
|
||||
// load buffered results if any
|
||||
if (bid >= 0) {
|
||||
itop = pred_counter[bid];
|
||||
psum = pred_buffer[bid];
|
||||
}
|
||||
if (itop != trees.size()) {
|
||||
FillThreadTemp(inst, p_feats);
|
||||
for (size_t i = itop; i < trees.size(); ++i) {
|
||||
if (tree_info[i] == bst_group) {
|
||||
psum += trees[i]->Predict(*p_feats, root_index);
|
||||
}
|
||||
}
|
||||
DropThreadTemp(inst, p_feats);
|
||||
}
|
||||
// updated the buffered results
|
||||
if (bid >= 0) {
|
||||
pred_counter[bid] = static_cast<unsigned>(trees.size());
|
||||
pred_buffer[bid] = psum;
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
// initialize thread local space for prediction
|
||||
inline void InitThreadTemp(int nthread) {
|
||||
thread_temp.resize(nthread);
|
||||
for (size_t i = 0; i < thread_temp.size(); ++i) {
|
||||
thread_temp[i].resize(mparam.num_feature);
|
||||
std::fill(thread_temp[i].begin(), thread_temp[i].end(), NAN);
|
||||
}
|
||||
}
|
||||
// fill in a thread local dense vector using a sparse instance
|
||||
inline static void FillThreadTemp(const SparseBatch::Inst &inst,
|
||||
std::vector<float> *p_feats) {
|
||||
std::vector<float> &feats = *p_feats;
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
feats[inst[i].findex] = inst[i].fvalue;
|
||||
}
|
||||
}
|
||||
// clear up a thread local dense vector
|
||||
inline static void DropThreadTemp(const SparseBatch::Inst &inst,
|
||||
std::vector<float> *p_feats) {
|
||||
std::vector<float> &feats = *p_feats;
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
feats[inst[i].findex] = NAN;
|
||||
}
|
||||
}
|
||||
// --- data structure ---
|
||||
/*! \brief training parameters */
|
||||
struct TrainParam {
|
||||
/*! \brief number of threads */
|
||||
int nthread;
|
||||
/*!
|
||||
* \brief number of parallel trees constructed each iteration
|
||||
* use this option to support boosted random forest
|
||||
*/
|
||||
int num_parallel_tree;
|
||||
/*! \brief whether updater is already initialized */
|
||||
int updater_initialized;
|
||||
/*! \brief tree updater sequence */
|
||||
std::string updater_seq;
|
||||
// construction
|
||||
TrainParam(void) {
|
||||
nthread = 0;
|
||||
updater_seq = "grow_colmaker,prune";
|
||||
num_parallel_tree = 1;
|
||||
updater_initialized = 0;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp(name, "updater") &&
|
||||
strcmp(updater_seq.c_str(), val) != 0) {
|
||||
updater_seq = val;
|
||||
updater_initialized = 0;
|
||||
}
|
||||
if (!strcmp(name, "nthread")) {
|
||||
omp_set_num_threads(nthread);
|
||||
nthread = atoi(val);
|
||||
}
|
||||
if (!strcmp(name, "num_parallel_tree")) {
|
||||
num_parallel_tree = atoi(val);
|
||||
}
|
||||
}
|
||||
};
|
||||
/*! \brief model parameters */
|
||||
struct ModelParam {
|
||||
/*! \brief number of trees */
|
||||
int num_trees;
|
||||
/*! \brief number of root: default 0, means single tree */
|
||||
int num_roots;
|
||||
/*! \brief number of features to be used by trees */
|
||||
int num_feature;
|
||||
/*! \brief size of predicton buffer allocated used for buffering */
|
||||
int64_t num_pbuffer;
|
||||
/*!
|
||||
* \brief how many output group a single instance can produce
|
||||
* this affects the behavior of number of output we have:
|
||||
* suppose we have n instance and k group, output will be k*n
|
||||
*/
|
||||
int num_output_group;
|
||||
/*! \brief reserved parameters */
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
num_trees = 0;
|
||||
num_roots = num_feature = 0;
|
||||
num_pbuffer = 0;
|
||||
num_output_group = 1;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
|
||||
if (!strcmp("num_output_group", name)) num_output_group = atol(val);
|
||||
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
/*! \return size of prediction buffer actually needed */
|
||||
inline size_t PredBufferSize(void) const {
|
||||
return num_output_group * num_pbuffer;
|
||||
}
|
||||
/*!
|
||||
* \brief get the buffer offset given a buffer index and group id
|
||||
* \return calculated buffer offset
|
||||
*/
|
||||
inline size_t BufferOffset(int64_t buffer_index, int bst_group) const {
|
||||
if (buffer_index < 0) return -1;
|
||||
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
|
||||
return buffer_index + num_pbuffer * bst_group;
|
||||
}
|
||||
};
|
||||
// training parameter
|
||||
TrainParam tparam;
|
||||
// model parameter
|
||||
ModelParam mparam;
|
||||
/*! \brief vector of trees stored in the model */
|
||||
std::vector<tree::RegTree*> trees;
|
||||
/*! \brief some information indicator of the tree, reserved */
|
||||
std::vector<int> tree_info;
|
||||
/*! \brief prediction buffer */
|
||||
std::vector<float> pred_buffer;
|
||||
/*! \brief prediction buffer counter, remember the prediction */
|
||||
std::vector<unsigned> pred_counter;
|
||||
// ----training fields----
|
||||
// configurations for tree
|
||||
std::vector< std::pair<std::string, std::string> > cfg;
|
||||
// temporal storage for per thread
|
||||
std::vector< std::vector<float> > thread_temp;
|
||||
// the updaters that can be applied to each of tree
|
||||
std::vector< tree::IUpdater<FMatrix>* > updaters;
|
||||
};
|
||||
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
Reference in New Issue
Block a user