mv code into src

This commit is contained in:
tqchen
2014-08-15 21:04:23 -07:00
parent 3589e8252f
commit 34dd409c5b
25 changed files with 1 additions and 28 deletions

293
src/data.h Normal file
View File

@@ -0,0 +1,293 @@
#ifndef XGBOOST_UNITY_DATA_H
#define XGBOOST_UNITY_DATA_H
/*!
* \file data.h
* \brief the input data structure for gradient boosting
* \author Tianqi Chen
*/
#include <cstdio>
#include <vector>
#include <limits>
#include <algorithm>
#include "utils/io.h"
#include "utils/utils.h"
#include "utils/iterator.h"
#include "utils/matrix_csr.h"
namespace xgboost {
/*!
* \brief unsigned interger type used in boost,
* used for feature index and row index
*/
typedef unsigned bst_uint;
/*! \brief float type, used for storing statistics */
typedef float bst_float;
const float rt_eps = 1e-5f;
// min gap between feature values to allow a split happen
const float rt_2eps = rt_eps * 2.0f;
/*! \brief gradient statistics pair usually needed in gradient boosting */
struct bst_gpair{
/*! \brief gradient statistics */
bst_float grad;
/*! \brief second order gradient statistics */
bst_float hess;
bst_gpair(void) {}
bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {}
};
/*! \brief read-only sparse instance batch in CSR format */
struct SparseBatch {
/*! \brief an entry of sparse vector */
struct Entry {
/*! \brief feature index */
bst_uint findex;
/*! \brief feature value */
bst_float fvalue;
// default constructor
Entry(void) {}
Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
/*! \brief reversely compare feature values */
inline static bool CmpValue(const Entry &a, const Entry &b) {
return a.fvalue < b.fvalue;
}
};
/*! \brief an instance of sparse vector in the batch */
struct Inst {
/*! \brief pointer to the elements*/
const Entry *data;
/*! \brief length of the instance */
const bst_uint length;
/*! \brief constructor */
Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
/*! \brief get i-th pair in the sparse vector*/
inline const Entry& operator[](size_t i) const {
return data[i];
}
};
/*! \brief batch size */
size_t size;
/*! \brief the offset of rowid of this batch */
size_t base_rowid;
/*! \brief array[size+1], row pointer of each of the elements */
const size_t *row_ptr;
/*! \brief array[row_ptr.back()], content of the sparse element */
const Entry *data_ptr;
/*! \brief get i-th row from the batch */
inline Inst operator[](size_t i) const {
return Inst(data_ptr + row_ptr[i], row_ptr[i+1] - row_ptr[i]);
}
};
/**
* \brief This is a interface convention via template, defining the way to access features,
* column access rule is defined by template, for efficiency purpose,
* row access is defined by iterator of sparse batches
* \tparam Derived type of actual implementation
*/
template<typename Derived>
class FMatrixInterface {
public:
/*! \brief example iterator over one column */
struct ColIter{
/*!
* \brief move to next position
* \return whether there is element in next position
*/
inline bool Next(void);
/*! \return row index of current position */
inline bst_uint rindex(void) const;
/*! \return feature value in current position */
inline bst_float fvalue(void) const;
};
/*! \brief backward iterator over column */
struct ColBackIter : public ColIter {};
public:
// column access is needed by some of tree construction algorithms
/*!
* \brief get column iterator, the columns must be sorted by feature value
* \param cidx column index
* \return column iterator
*/
inline ColIter GetSortedCol(size_t cidx) const;
/*!
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
* \param cidx column index
* \return reverse column iterator
*/
inline ColBackIter GetReverseSortedCol(size_t cidx) const;
/*!
* \brief get number of columns
* \return number of columns
*/
inline size_t NumCol(void) const;
/*!
* \brief check if column access is supported, if not, initialize column access
* \param max_rows maximum number of rows allowed in constructor
*/
inline void InitColAccess(void);
/*! \return whether column access is enabled */
inline bool HaveColAccess(void) const;
/*! \breif return #entries-in-col */
inline size_t GetColSize(size_t cidx) const;
/*!
* \breif return #entries-in-col / #rows
* \param cidx column index
* this function is used to help speedup,
* doese not necessarily implement it if not sure, return 0.0;
* \return column density
*/
inline float GetColDensity(size_t cidx) const;
/*! \brief get the row iterator associated with FMatrix */
virtual utils::IIterator<SparseBatch>* RowIterator(void) const = 0;
};
/*!
* \brief sparse matrix that support column access, CSC
*/
class FMatrixS : public FMatrixInterface<FMatrixS>{
public:
typedef SparseBatch::Entry Entry;
/*! \brief row iterator */
struct ColIter{
const Entry *dptr_, *end_;
ColIter(const Entry* begin, const Entry* end)
:dptr_(begin), end_(end) {}
inline bool Next(void) {
if (dptr_ == end_) {
return false;
} else {
++dptr_; return true;
}
}
inline bst_uint rindex(void) const {
return dptr_->findex;
}
inline bst_float fvalue(void) const {
return dptr_->fvalue;
}
};
/*! \brief reverse column iterator */
struct ColBackIter : public ColIter {
ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
// shadows ColIter::Next
inline bool Next(void) {
if (dptr_ == end_) {
return false;
} else {
--dptr_; return true;
}
}
};
/*! \brief constructor */
explicit FMatrixS(utils::IIterator<SparseBatch> *base_iter)
: iter_(base_iter) {}
// destructor
virtual ~FMatrixS(void) {
delete iter_;
}
/*! \return whether column access is enabled */
inline bool HaveColAccess(void) const {
return col_ptr_.size() != 0;
}
/*! \brief get number of colmuns */
inline size_t NumCol(void) const {
utils::Check(this->HaveColAccess(), "NumCol:need column access");
return col_ptr_.size() - 1;
}
/*! \brief get col sorted iterator */
inline ColIter GetSortedCol(size_t cidx) const {
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
return ColIter(&col_data_[col_ptr_[cidx]] - 1,
&col_data_[col_ptr_[cidx + 1]] - 1);
}
/*!
* \brief get reversed col iterator,
* this function will be deprecated at some point
*/
inline ColBackIter GetReverseSortedCol(size_t cidx) const {
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
return ColBackIter(&col_data_[col_ptr_[cidx + 1]],
&col_data_[col_ptr_[cidx]]);
}
/*! \brief get col size */
inline size_t GetColSize(size_t cidx) const {
return col_ptr_[cidx+1] - col_ptr_[cidx];
}
/*! \brief get column density */
inline float GetColDensity(size_t cidx) const {
size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
}
virtual void InitColAccess(void) {
if (this->HaveColAccess()) return;
const size_t max_nrow = std::numeric_limits<bst_uint>::max();
this->InitColData(max_nrow);
}
/*! \brief get the row iterator associated with FMatrix */
virtual utils::IIterator<SparseBatch>* RowIterator(void) const {
return iter_;
}
protected:
/*!
* \brief intialize column data
* \param max_nrow maximum number of rows supported
*/
inline void InitColData(size_t max_nrow) {
// note: this part of code is serial, todo, parallelize this transformer
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0);
// start working
iter_->BeforeFirst();
num_buffered_row_ = 0;
while (iter_->Next()) {
const SparseBatch &batch = iter_->Value();
if (batch.base_rowid >= max_nrow) break;
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) {
SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < batch.size; ++j) {
builder.AddBudget(inst[j].findex);
}
}
}
builder.InitStorage();
iter_->BeforeFirst();
while (iter_->Next()) {
const SparseBatch &batch = iter_->Value();
if (batch.base_rowid >= max_nrow) break;
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
for (size_t i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < batch.size; ++j) {
builder.PushElem(inst[j].findex,
Entry((bst_uint)(batch.base_rowid+j),
inst[j].fvalue));
}
}
}
// sort columns
unsigned ncol = static_cast<unsigned>(this->NumCol());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ncol; ++i) {
std::sort(&col_data_[col_ptr_[i]],
&col_data_[col_ptr_[i + 1]], Entry::CmpValue);
}
}
private:
// --- data structure used to support InitColAccess --
utils::IIterator<SparseBatch> *iter_;
/*! \brief number */
size_t num_buffered_row_;
/*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_;
/*! \brief column datas in CSC format */
std::vector<SparseBatch::Entry> col_data_;
};
} // namespace xgboost
#endif

82
src/gbm/gbm.h Normal file
View File

@@ -0,0 +1,82 @@
#ifndef XGBOOST_GBM_GBM_H_
#define XGBOOST_GBM_GBM_H_
/*!
* \file gbm.h
* \brief interface of gradient booster, that learns through gradient statistics
* \author Tianqi Chen
*/
#include <vector>
#include "../data.h"
namespace xgboost {
/*! \brief namespace for gradient booster */
namespace gbm {
/*!
* \brief interface of gradient boosting model
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class IGradBooster {
public:
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief load model from stream
* \param fi input stream
*/
virtual void LoadModel(utils::IStream &fi) = 0;
/*!
* \brief save model to stream
* \param fo output stream
*/
virtual void SaveModel(utils::IStream &fo) const = 0;
/*!
* \brief initialize the model
*/
virtual void InitModel(void) = 0;
/*!
* \brief peform update to the model(boosting)
* \param gpair the gradient pair statistics of the data
* \param fmat feature matrix that provide access to features
* \param root_index pre-partitioned root_index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved
*/
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
FMatrix &fmat,
const std::vector<unsigned> &root_index) = 0;
/*!
* \brief generate predictions for given feature matrix
* \param fmat feature matrix
* \param buffer_offset buffer index offset of these instances, if equals -1
* this means we do not have buffer index allocated to the gbm
* a buffer index is assigned to each instance that requires repeative prediction
* the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
* \param root_index pre-partitioned root_index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved
* \param out_preds output vector to hold the predictions
*/
virtual void Predict(const FMatrix &fmat,
int64_t buffer_offset,
const std::vector<unsigned> &root_index,
std::vector<float> *out_preds) = 0;
// destrcutor
virtual ~IGradBooster(void){}
};
} // namespace gbm
} // namespace xgboost
#include "gbtree-inl.hpp"
namespace xgboost {
namespace gbm {
template<typename FMatrix>
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
utils::Error("unknown booster type: %s", name);
return NULL;
}
} // namespace gbm
} // namespace xgboost
#endif // XGBOOST_GBM_GBM_H_

365
src/gbm/gbtree-inl.hpp Normal file
View File

@@ -0,0 +1,365 @@
#ifndef XGBOOST_GBM_GBTREE_INL_HPP_
#define XGBOOST_GBM_GBTREE_INL_HPP_
/*!
* \file gbtree-inl.hpp
* \brief gradient boosted tree implementation
* \author Tianqi Chen
*/
#include <vector>
#include <utility>
#include <string>
#include "./gbm.h"
#include "../tree/updater.h"
namespace xgboost {
namespace gbm {
/*!
* \brief gradient boosted tree
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class GBTree : public IGradBooster<FMatrix> {
public:
virtual ~GBTree(void) {
this->Clear();
}
virtual void SetParam(const char *name, const char *val) {
if (!strncmp(name, "bst:", 4)) {
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
// set into updaters, if already intialized
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->SetParam(name+4, val);
}
}
if (!strcmp(name, "silent")) {
this->SetParam("bst:silent", val);
}
tparam.SetParam(name, val);
if (trees.size() == 0) mparam.SetParam(name, val);
}
virtual void LoadModel(utils::IStream &fi) {
this->Clear();
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"GBTree: invalid model file");
trees.resize(mparam.num_trees);
for (size_t i = 0; i < trees.size(); ++i) {
trees[i] = new tree::RegTree();
trees[i]->LoadModel(fi);
}
tree_info.resize(mparam.num_trees);
if (mparam.num_trees != 0) {
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
"GBTree: invalid model file");
}
if (mparam.num_pbuffer != 0) {
pred_buffer.resize(mparam.PredBufferSize());
pred_counter.resize(mparam.PredBufferSize());
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
"GBTree: invalid model file");
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
"GBTree: invalid model file");
}
}
virtual void SaveModel(utils::IStream &fo) const {
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
fo.Write(&mparam, sizeof(ModelParam));
for (size_t i = 0; i < trees.size(); ++i) {
trees[i]->SaveModel(fo);
}
if (tree_info.size() != 0) {
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
}
if (mparam.num_pbuffer != 0) {
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
}
}
// initialize the predic buffer
virtual void InitModel(void) {
pred_buffer.clear(); pred_counter.clear();
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
pred_counter.resize(mparam.PredBufferSize(), 0);
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
}
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
FMatrix &fmat,
const std::vector<unsigned> &root_index) {
if (mparam.num_output_group == 1) {
this->BoostNewTrees(gpair, fmat, root_index, 0);
} else {
const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0,
"must have exactly ngroup*nrow gpairs");
std::vector<bst_gpair> tmp(gpair.size()/ngroup);
for (int gid = 0; gid < ngroup; ++gid) {
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < tmp.size(); ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
this->BoostNewTrees(tmp, fmat, root_index, gid);
}
}
}
virtual void Predict(const FMatrix &fmat,
int64_t buffer_offset,
const std::vector<unsigned> &root_index,
std::vector<float> *out_preds) {
int nthread;
#pragma omp parallel
{
nthread = omp_get_num_threads();
}
this->InitThreadTemp(nthread);
std::vector<float> &preds = *out_preds;
preds.resize(0);
// start collecting the prediction
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const SparseBatch &batch = iter->Value();
utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
"base_rowid is not set correctly");
// output convention: nrow * k, where nrow is number of rows
// k is number of group
preds.resize(preds.size() + batch.size * mparam.num_output_group);
// parallel over local batch
const unsigned nsize = static_cast<unsigned>(batch.size);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
std::vector<float> &feats = thread_temp[tid];
const size_t ridx = batch.base_rowid + i;
const unsigned root_idx = root_index.size() == 0 ? 0 : root_index[ridx];
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
preds[ridx * mparam.num_output_group + gid] =
this->Pred(batch[i],
buffer_offset < 0 ? -1 : buffer_offset+ridx,
gid, root_idx, &feats);
}
}
}
}
protected:
// clear the model
inline void Clear(void) {
for (size_t i = 0; i < trees.size(); ++i) {
delete trees[i];
}
trees.clear();
pred_buffer.clear();
pred_counter.clear();
}
// initialize updater before using them
inline void InitUpdater(void) {
if (tparam.updater_initialized != 0) return;
for (size_t i = 0; i < updaters.size(); ++i) {
delete updaters[i];
}
updaters.clear();
std::string tval = tparam.updater_seq;
char *saveptr, *pstr;
pstr = strtok_r(&tval[0], ",", &saveptr);
while (pstr != NULL) {
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
for (size_t j = 0; j < cfg.size(); ++j) {
// set parameters
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
}
pstr = strtok_r(NULL, ",", &saveptr);
}
tparam.updater_initialized = 1;
}
// do group specific group
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
FMatrix &fmat,
const std::vector<unsigned> &root_index,
int bst_group) {
this->InitUpdater();
// create the trees
std::vector<tree::RegTree *> new_trees;
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
new_trees.push_back(new tree::RegTree());
for (size_t j = 0; j < cfg.size(); ++j) {
new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
}
new_trees.back()->InitModel();
}
// update the trees
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, fmat, root_index, new_trees);
}
// push back to model
for (size_t i = 0; i < new_trees.size(); ++i) {
trees.push_back(new_trees[i]);
tree_info.push_back(bst_group);
}
mparam.num_trees += tparam.num_parallel_tree;
}
// make a prediction for a single instance
inline float Pred(const SparseBatch::Inst &inst,
int64_t buffer_index,
int bst_group,
unsigned root_index,
std::vector<float> *p_feats) {
size_t itop = 0;
float psum = 0.0f;
const int bid = mparam.BufferOffset(buffer_index, bst_group);
// load buffered results if any
if (bid >= 0) {
itop = pred_counter[bid];
psum = pred_buffer[bid];
}
if (itop != trees.size()) {
FillThreadTemp(inst, p_feats);
for (size_t i = itop; i < trees.size(); ++i) {
if (tree_info[i] == bst_group) {
psum += trees[i]->Predict(*p_feats, root_index);
}
}
DropThreadTemp(inst, p_feats);
}
// updated the buffered results
if (bid >= 0) {
pred_counter[bid] = static_cast<unsigned>(trees.size());
pred_buffer[bid] = psum;
}
return psum;
}
// initialize thread local space for prediction
inline void InitThreadTemp(int nthread) {
thread_temp.resize(nthread);
for (size_t i = 0; i < thread_temp.size(); ++i) {
thread_temp[i].resize(mparam.num_feature);
std::fill(thread_temp[i].begin(), thread_temp[i].end(), NAN);
}
}
// fill in a thread local dense vector using a sparse instance
inline static void FillThreadTemp(const SparseBatch::Inst &inst,
std::vector<float> *p_feats) {
std::vector<float> &feats = *p_feats;
for (bst_uint i = 0; i < inst.length; ++i) {
feats[inst[i].findex] = inst[i].fvalue;
}
}
// clear up a thread local dense vector
inline static void DropThreadTemp(const SparseBatch::Inst &inst,
std::vector<float> *p_feats) {
std::vector<float> &feats = *p_feats;
for (bst_uint i = 0; i < inst.length; ++i) {
feats[inst[i].findex] = NAN;
}
}
// --- data structure ---
/*! \brief training parameters */
struct TrainParam {
/*! \brief number of threads */
int nthread;
/*!
* \brief number of parallel trees constructed each iteration
* use this option to support boosted random forest
*/
int num_parallel_tree;
/*! \brief whether updater is already initialized */
int updater_initialized;
/*! \brief tree updater sequence */
std::string updater_seq;
// construction
TrainParam(void) {
nthread = 0;
updater_seq = "grow_colmaker,prune";
num_parallel_tree = 1;
updater_initialized = 0;
}
inline void SetParam(const char *name, const char *val){
if (!strcmp(name, "updater") &&
strcmp(updater_seq.c_str(), val) != 0) {
updater_seq = val;
updater_initialized = 0;
}
if (!strcmp(name, "nthread")) {
omp_set_num_threads(nthread);
nthread = atoi(val);
}
if (!strcmp(name, "num_parallel_tree")) {
num_parallel_tree = atoi(val);
}
}
};
/*! \brief model parameters */
struct ModelParam {
/*! \brief number of trees */
int num_trees;
/*! \brief number of root: default 0, means single tree */
int num_roots;
/*! \brief number of features to be used by trees */
int num_feature;
/*! \brief size of predicton buffer allocated used for buffering */
int64_t num_pbuffer;
/*!
* \brief how many output group a single instance can produce
* this affects the behavior of number of output we have:
* suppose we have n instance and k group, output will be k*n
*/
int num_output_group;
/*! \brief reserved parameters */
int reserved[32];
/*! \brief constructor */
ModelParam(void) {
num_trees = 0;
num_roots = num_feature = 0;
num_pbuffer = 0;
num_output_group = 1;
memset(reserved, 0, sizeof(reserved));
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
if (!strcmp("num_output_group", name)) num_output_group = atol(val);
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
/*! \return size of prediction buffer actually needed */
inline size_t PredBufferSize(void) const {
return num_output_group * num_pbuffer;
}
/*!
* \brief get the buffer offset given a buffer index and group id
* \return calculated buffer offset
*/
inline size_t BufferOffset(int64_t buffer_index, int bst_group) const {
if (buffer_index < 0) return -1;
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
return buffer_index + num_pbuffer * bst_group;
}
};
// training parameter
TrainParam tparam;
// model parameter
ModelParam mparam;
/*! \brief vector of trees stored in the model */
std::vector<tree::RegTree*> trees;
/*! \brief some information indicator of the tree, reserved */
std::vector<int> tree_info;
/*! \brief prediction buffer */
std::vector<float> pred_buffer;
/*! \brief prediction buffer counter, remember the prediction */
std::vector<unsigned> pred_counter;
// ----training fields----
// configurations for tree
std::vector< std::pair<std::string, std::string> > cfg;
// temporal storage for per thread
std::vector< std::vector<float> > thread_temp;
// the updaters that can be applied to each of tree
std::vector< tree::IUpdater<FMatrix>* > updaters;
};
} // namespace gbm
} // namespace xgboost
#endif // XGBOOST_GBM_GBTREE_INL_HPP_

84
src/learner/dmatrix.h Normal file
View File

@@ -0,0 +1,84 @@
#ifndef XGBOOST_LEARNER_DMATRIX_H_
#define XGBOOST_LEARNER_DMATRIX_H_
/*!
* \file dmatrix.h
* \brief meta data and template data structure
* used for regression/classification/ranking
* \author Tianqi Chen
*/
#include "../data.h"
namespace xgboost {
namespace learner {
/*!
* \brief meta information needed in training, including label, weight
*/
struct MetaInfo {
/*! \brief label of each instance */
std::vector<float> labels;
/*!
* \brief the index of begin and end of a group
* needed when the learning task is ranking
*/
std::vector<bst_uint> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*!
* \brief specified root index of each instance,
* can be used for multi task setting
*/
std::vector<unsigned> root_index;
/*! \brief get weight of each instances */
inline float GetWeight(size_t i) const {
if(weights.size() != 0) {
return weights[i];
} else {
return 1.0f;
}
}
/*! \brief get root index of i-th instance */
inline float GetRoot(size_t i) const {
if(root_index.size() != 0) {
return static_cast<float>(root_index[i]);
} else {
return 0;
}
}
inline void SaveBinary(utils::IStream &fo) {
fo.Write(labels);
fo.Write(group_ptr);
fo.Write(weights);
fo.Write(root_index);
}
inline void LoadBinary(utils::IStream &fi) {
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
}
};
/*!
* \brief data object used for learning,
* \tparam FMatrix type of feature data source
*/
template<typename FMatrix>
struct DMatrix {
/*! \brief meta information about the dataset */
MetaInfo info;
/*! \brief number of rows in the DMatrix */
size_t num_row;
/*! \brief feature matrix about data content */
FMatrix fmat;
/*!
* \brief cache pointer to verify if the data structure is cached in some learner
* used to verify if DMatrix is cached
*/
void *cache_learner_ptr_;
/*! \brief default constructor */
DMatrix(void) : cache_learner_ptr_(NULL) {}
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_DMATRIX_H_

View File

@@ -0,0 +1,346 @@
#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
/*!
* \file xgboost_evaluation-inl.hpp
* \brief evaluation metrics for regression and classification and rank
* \author Kailong Chen, Tianqi Chen
*/
#include <vector>
#include <utility>
#include <string>
#include <climits>
#include <algorithm>
#include "./evaluation.h"
#include "./helper_utils.h"
namespace xgboost {
namespace learner {
/*!
* \brief base class of elementwise evaluation
* \tparam Derived the name of subclass
*/
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(),
"label and prediction size not match");
const unsigned ndata = static_cast<unsigned>(preds.size());
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+:sum, wsum) schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
const float wt = info.GetWeight(i);
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
}
return Derived::GetFinal(sum, wsum);
}
/*!
* \brief to be implemented by subclass,
* get evaluation result from one row
* \param label label of current instance
* \param pred prediction value of current instance
* \param weight weight of current instance
*/
inline static float EvalRow(float label, float pred);
/*!
* \brief to be overide by subclas, final trasnformation
* \param esum the sum statistics returned by EvalRow
* \param wsum sum of weight
*/
inline static float GetFinal(float esum, float wsum) {
return esum / wsum;
}
};
/*! \brief RMSE */
struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
virtual const char *Name(void) const {
return "rmse";
}
inline static float EvalRow(float label, float pred) {
float diff = label - pred;
return diff * diff;
}
inline static float GetFinal(float esum, float wsum) {
return std::sqrt(esum / wsum);
}
};
/*! \brief logloss */
struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
virtual const char *Name(void) const {
return "logloss";
}
inline static float EvalRow(float y, float py) {
return - y * std::log(py) - (1.0f - y) * std::log(1 - py);
}
};
/*! \brief error */
struct EvalError : public EvalEWiseBase<EvalError> {
virtual const char *Name(void) const {
return "error";
}
inline static float EvalRow(float label, float pred) {
// assume label is in [0,1]
return pred > 0.5f ? 1.0f - label : label;
}
};
/*! \brief match error */
struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
virtual const char *Name(void) const {
return "merror";
}
inline static float EvalRow(float label, float pred) {
return static_cast<int>(pred) != static_cast<int>(label);
}
};
/*! \brief AMS: also records best threshold */
struct EvalAMS : public IEvaluator {
public:
explicit EvalAMS(const char *name) {
name_ = name;
// note: ams@0 will automatically select which ratio to go
utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
rec[i] = std::make_pair(preds[i], i);
}
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
if (ntop == 0) ntop = ndata;
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const float wt = info.weights[ridx];
if (info.labels[ridx] > 0.5f) {
s_tp += wt;
} else {
b_fp += wt;
}
if (rec[i].first != rec[i+1].first) {
double ams = sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
if (tams < ams) {
thresindex = i;
tams = ams;
}
}
}
if (ntop == ndata) {
fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
return tams;
} else {
return sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
}
}
virtual const char *Name(void) const {
return name_.c_str();
}
private:
std::string name_;
float ratio_;
};
/*! \brief Area under curve, for both classification and rank */
struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.back() == preds.size(),
"EvalAuc: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
// sum statictis
double sum_auc = 0.0f;
#pragma omp parallel reduction(+:sum_auc)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], j));
}
std::sort(rec.begin(), rec.end(), CmpFirst);
// calculate AUC
double sum_pospair = 0.0;
double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
for (size_t j = 0; j < rec.size(); ++j) {
const float wt = info.GetWeight(rec[j].second);
const float ctr = info.labels[rec[j].second];
// keep bucketing predictions in same bucket
if (j != 0 && rec[j].first != rec[j - 1].first) {
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
buf_neg = buf_pos = 0.0f;
}
buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
}
sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
sum_npos += buf_pos; sum_nneg += buf_neg;
// check weird conditions
utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
"AUC: the dataset only contains pos or neg samples");
// this is the AUC
sum_auc += sum_pospair / (sum_npos*sum_nneg);
}
}
// return average AUC over list
return static_cast<float>(sum_auc) / ngroup;
}
virtual const char *Name(void) const {
return "auc";
}
};
/*! \brief Evaluate rank list */
struct EvalRankList : public IEvaluator {
public:
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
const std::vector<unsigned> &gptr = info.group_ptr;
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
utils::Assert(gptr.back() == preds.size(),
"EvalRanklist: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
// sum statistics
double sum_metric = 0.0f;
#pragma omp parallel reduction(+:sum_metric)
{
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
}
sum_metric += this->EvalMetric(rec);
}
}
return static_cast<float>(sum_metric) / ngroup;
}
virtual const char *Name(void) const {
return name_.c_str();
}
protected:
explicit EvalRankList(const char *name) {
name_ = name;
minus_ = false;
if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
topn_ = UINT_MAX;
}
if (name[strlen(name) - 1] == '-') {
minus_ = true;
}
}
/*! \return evaluation metric, given the pair_sort record, (pred,label) */
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0;
protected:
unsigned topn_;
std::string name_;
bool minus_;
};
/*! \brief Precison at N, for both classification and rank */
struct EvalPrecision : public EvalRankList{
public:
explicit EvalPrecision(const char *name) : EvalRankList(name) {}
protected:
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
// calculate Preicsion
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned nhit = 0;
for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
nhit += (rec[j].second != 0);
}
return static_cast<float>(nhit) / topn_;
}
};
/*! \brief NDCG */
struct EvalNDCG : public EvalRankList{
public:
explicit EvalNDCG(const char *name) : EvalRankList(name) {}
protected:
inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
double sumdcg = 0.0;
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
const unsigned rel = rec[i].second;
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / logf(i + 2);
}
}
return static_cast<float>(sumdcg);
}
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
std::stable_sort(rec.begin(), rec.end(), CmpFirst);
float dcg = this->CalcDCG(rec);
std::stable_sort(rec.begin(), rec.end(), CmpSecond);
float idcg = this->CalcDCG(rec);
if (idcg == 0.0f) {
if (minus_) {
return 0.0f;
} else {
return 1.0f;
}
}
return dcg/idcg;
}
};
/*! \brief Precison at N, for both classification and rank */
struct EvalMAP : public EvalRankList {
public:
explicit EvalMAP(const char *name) : EvalRankList(name) {}
protected:
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
std::sort(rec.begin(), rec.end(), CmpFirst);
unsigned nhits = 0;
double sumap = 0.0;
for (size_t i = 0; i < rec.size(); ++i) {
if (rec[i].second != 0) {
nhits += 1;
if (i < this->topn_) {
sumap += static_cast<float>(nhits) / (i+1);
}
}
}
if (nhits != 0) {
sumap /= nhits;
return static_cast<float>(sumap);
} else {
if (minus_) {
return 0.0f;
} else {
return 1.0f;
}
}
}
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_EVALUATION_INL_HPP_

82
src/learner/evaluation.h Normal file
View File

@@ -0,0 +1,82 @@
#ifndef XGBOOST_LEARNER_EVALUATION_H_
#define XGBOOST_LEARNER_EVALUATION_H_
/*!
* \file evaluation.h
* \brief interface of evaluation function supported in xgboost
* \author Tianqi Chen, Kailong Chen
*/
#include <string>
#include <vector>
#include "../utils/utils.h"
namespace xgboost {
namespace learner {
/*! \brief evaluator that evaluates the loss metrics */
struct IEvaluator{
/*!
* \brief evaluate a specific metric
* \param preds prediction
* \param info information, including label etc.
*/
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const = 0;
/*! \return name of metric */
virtual const char *Name(void) const = 0;
/*! \brief virtual destructor */
virtual ~IEvaluator(void) {}
};
} // namespace learner
} // namespace xgboost
// include implementations of evaluation functions
#include "evaluation-inl.hpp"
// factory function
namespace xgboost {
namespace learner {
inline IEvaluator* CreateEvaluator(const char *name) {
if (!strcmp(name, "rmse")) return new EvalRMSE();
if (!strcmp(name, "error")) return new EvalError();
if (!strcmp(name, "merror")) return new EvalMatchError();
if (!strcmp(name, "logloss")) return new EvalLogLoss();
if (!strcmp(name, "auc")) return new EvalAuc();
if (!strncmp(name, "ams@",4)) return new EvalAMS(name);
if (!strncmp(name, "pre@", 4)) return new EvalPrecision(name);
if (!strncmp(name, "map", 3)) return new EvalMAP(name);
if (!strncmp(name, "ndcg", 3)) return new EvalNDCG(name);
utils::Error("unknown evaluation metric type: %s", name);
return NULL;
}
/*! \brief a set of evaluators */
class EvalSet{
public:
inline void AddEval(const char *name) {
for (size_t i = 0; i < evals_.size(); ++i) {
if (!strcmp(name, evals_[i]->Name())) return;
}
evals_.push_back(CreateEvaluator(name));
}
~EvalSet(void) {
for (size_t i = 0; i < evals_.size(); ++i) {
delete evals_[i];
}
}
inline std::string Eval(const char *evname,
const std::vector<float> &preds,
const MetaInfo &info) const {
std::string result = "";
for (size_t i = 0; i < evals_.size(); ++i) {
float res = evals_[i]->Eval(preds, info);
char tmp[1024];
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
result += tmp;
}
return result;
}
private:
std::vector<const IEvaluator*> evals_;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_EVALUATION_H_

View File

@@ -0,0 +1,50 @@
#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
#define XGBOOST_LEARNER_HELPER_UTILS_H_
/*!
* \file helper_utils.h
* \brief useful helper functions
* \author Tianqi Chen, Kailong Chen
*/
#include <utility>
#include <vector>
#include <algorithm>
namespace xgboost {
namespace learner {
// simple helper function to do softmax
inline static void Softmax(std::vector<float>* p_rec) {
std::vector<float> &rec = *p_rec;
float wmax = rec[0];
for (size_t i = 1; i < rec.size(); ++i) {
wmax = std::max(rec[i], wmax);
}
double wsum = 0.0f;
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] = std::exp(rec[i]-wmax);
wsum += rec[i];
}
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] /= static_cast<float>(wsum);
}
}
// simple helper function to do softmax
inline static int FindMaxIndex(const std::vector<float>& rec) {
size_t mxid = 0;
for (size_t i = 1; i < rec.size(); ++i) {
if (rec[i] > rec[mxid] + 1e-6f) {
mxid = i;
}
}
return static_cast<int>(mxid);
}
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.first > b.first;
}
inline static bool CmpSecond(const std::pair<float, unsigned> &a,
const std::pair<float, unsigned> &b) {
return a.second > b.second;
}
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_HELPER_UTILS_H_

296
src/learner/learner-inl.hpp Normal file
View File

@@ -0,0 +1,296 @@
#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
#define XGBOOST_LEARNER_LEARNER_INL_HPP_
/*!
* \file learner-inl.hpp
* \brief learning algorithm
* \author Tianqi Chen
*/
#include <algorithm>
#include <vector>
#include <utility>
#include <string>
#include "./objective.h"
#include "./evaluation.h"
#include "../gbm/gbm.h"
namespace xgboost {
/*! \brief namespace for learning algorithm */
namespace learner {
/*!
* \brief learner that takes do gradient boosting on specific objective functions
* and do training and prediction
*/
template<typename FMatrix>
class BoostLearner {
public:
BoostLearner(void) {
obj_ = NULL;
gbm_ = NULL;
name_obj_ = "reg:linear";
name_gbm_ = "gbtree";
}
~BoostLearner(void) {
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
}
/*!
* \brief add internal cache space for mat, this can speedup prediction for matrix,
* please cache prediction for training and eval data
* warning: if the model is loaded from file from some previous training history
* set cache data must be called with exactly SAME
* data matrices to continue training otherwise it will cause error
* \param mats array of pointers to matrix whose prediction result need to be cached
*/
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
// estimate feature bound
unsigned num_feature = 0;
// assign buffer index
size_t buffer_size = 0;
utils::Assert(cache_.size() == 0, "can only call cache data once");
for (size_t i = 0; i < mats.size(); ++i) {
bool dupilicate = false;
for (size_t j = 0; j < i; ++j) {
if (mats[i] == mats[j]) dupilicate = true;
}
if (dupilicate) continue;
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
buffer_size += mats[i]->num_row;
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
}
char str_temp[25];
if (num_feature > mparam.num_feature) {
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
this->SetParam("bst:num_feature", str_temp);
}
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
this->SetParam("num_pbuffer", str_temp);
if (!silent) {
printf("buffer_size=%ld\n", buffer_size);
}
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (gbm_ == NULL) {
if (!strcmp(name, "objective")) name_obj_ = val;
if (!strcmp(name, "booster")) name_gbm_ = val;
mparam.SetParam(name, val);
}
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
}
/*!
* \brief initialize the model
*/
inline void InitModel(void) {
this->InitObjGBM();
// adapt the base score
mparam.base_score = obj_->ProbToMargin(mparam.base_score);
gbm_->InitModel();
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel(utils::IStream &fi) {
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
"BoostLearner: wrong model format");
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
// delete existing gbm if any
if (obj_ != NULL) delete obj_;
if (gbm_ != NULL) delete gbm_;
this->InitObjGBM();
gbm_->LoadModel(fi);
}
/*!
* \brief load model from file
* \param fname file name
*/
inline void LoadModel(const char *fname) {
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
this->LoadModel(fi);
fi.Close();
}
inline void SaveModel(utils::IStream &fo) const {
fo.Write(&mparam, sizeof(ModelParam));
fo.Write(&name_obj_);
fo.Write(&name_gbm_);
gbm_->SaveModel(fo);
}
/*!
* \brief save model into file
* \param fname file name
*/
inline void SaveModel(const char *fname) const {
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
this->SaveModel(fo);
fo.Close();
}
/*!
* \brief update the model for one iteration
* \param iter current iteration number
* \param p_train pointer to the data matrix
*/
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
this->PredictRaw(preds_, *p_train);
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
}
/*!
* \brief evaluate the model for specific iteration
* \param iter iteration number
* \param evals datas i want to evaluate
* \param evname name of each dataset
* \return a string corresponding to the evaluation result
*/
inline std::string EvalOneIter(int iter,
const std::vector<const DMatrix<FMatrix>*> &evals,
const std::vector<std::string> &evname) {
std::string res;
char tmp[256];
snprintf(tmp, sizeof(tmp), "[%d]", iter);
res = tmp;
for (size_t i = 0; i < evals.size(); ++i) {
this->PredictRaw(*evals[i], &preds_);
obj_->EvalTransform(&preds_);
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
}
return res;
}
/*!
* \brief simple evaluation function, with a specified metric
* \param data input data
* \param metric name of metric
* \return a pair of <evaluation name, result>
*/
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
if (metric == "auto") metric = obj_->DefaultEvalMetric();
IEvaluator *ev = CreateEvaluator(metric.c_str());
this->PredictRaw(data, &preds_);
obj_->EvalTransform(&preds_);
float res = ev->Eval(preds_, data.info);
delete ev;
return std::make_pair(metric, res);
}
/*!
* \brief get prediction
* \param data input data
* \param out_preds output vector that stores the prediction
*/
inline void Predict(const DMatrix<FMatrix> &data,
std::vector<float> *out_preds) const {
this->PredictRaw(data, out_preds);
obj_->PredTransform(out_preds);
}
protected:
/*!
* \brief initialize the objective function and GBM,
* if not yet done
*/
inline void InitObjGBM(void) {
if (obj_ != NULL) return;
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
obj_ = CreateObjFunction(name_obj_.c_str());
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
for (size_t i = 0; i < cfg_.size(); ++i) {
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
}
evaluator_.AddEval(obj_->DefaultEvalMetric());
}
/*!
* \brief get un-transformed prediction
* \param data training data matrix
* \param out_preds output vector that stores the prediction
*/
inline void PredictRaw(const DMatrix<FMatrix> &data,
std::vector<float> *out_preds) {
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
data.info, out_preds);
}
/*! \brief training parameter for regression */
struct ModelParam{
/* \brief global bias */
float base_score;
/* \brief number of features */
unsigned num_feature;
/* \brief number of class, if it is multi-class classification */
int num_class;
/*! \brief reserved field */
int reserved[32];
/*! \brief constructor */
ModelParam(void) {
base_score = 0.5f;
num_feature = 0;
num_class = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
if (!strcmp("num_class", name)) num_class = atoi(val);
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
}
};
// data fields
// silent during training
int silent;
// evaluation set
EvalSet evaluator_;
// model parameter
ModelParam mparam;
// gbm model that back everything
gbm::IGradBooster<FMatrix> *gbm_;
// name of gbm model used for training
std::string name_gbm_;
// objective fnction
IObjFunction *obj_;
// name of objective function
std::string name_obj_;
// configurations
std::vector< std::pair<std::string, std::string> > cfg_;
// temporal storages for prediciton
std::vector<float> preds_;
// gradient pairs
std::vector<bst_gpair> gpair_;
private:
// cache entry object that helps handle feature caching
struct CacheEntry {
const DMatrix<FMatrix> *mat_;
size_t buffer_offset_;
size_t num_row_;
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
};
// find internal bufer offset for certain matrix, if not exist, return -1
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.num_row) {
return cache_[i].buffer_offset_;
}
}
}
return -1;
}
// data structure field
/*! \brief the entries indicates that we have internal prediction cache */
std::vector<CacheEntry> cache_;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_LEARNER_INL_HPP_

View File

@@ -0,0 +1,137 @@
#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
/*!
* \file objective-inl.hpp
* \brief objective function implementations
* \author Tianqi Chen, Kailong Chen
*/
#include <vector>
#include "./objective.h"
namespace xgboost {
namespace learner {
/*! \brief defines functions to calculate some commonly used functions */
struct LossType {
/*! \brief indicate which type we are using */
int loss_type;
// list of constants
static const int kLinearSquare = 0;
static const int kLogisticNeglik = 1;
static const int kLogisticClassify = 2;
static const int kLogisticRaw = 3;
/*!
* \brief transform the linear sum to prediction
* \param x linear sum of boosting ensemble
* \return transformed prediction
*/
inline float PredTransform(float x) const {
switch (loss_type) {
case kLogisticRaw:
case kLinearSquare: return x;
case kLogisticClassify:
case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate first order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return first order gradient
*/
inline float FirstOrderGradient(float predt, float label) const {
switch (loss_type) {
case kLinearSquare: return predt - label;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
case kLogisticClassify:
case kLogisticNeglik: return predt - label;
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief calculate second order gradient of loss, given transformed prediction
* \param predt transformed prediction
* \param label true label
* \return second order gradient
*/
inline float SecondOrderGradient(float predt, float label) const {
switch (loss_type) {
case kLinearSquare: return 1.0f;
case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
case kLogisticClassify:
case kLogisticNeglik: return predt * (1 - predt);
default: utils::Error("unknown loss_type"); return 0.0f;
}
}
/*!
* \brief transform probability value back to margin
*/
inline float ProbToMargin(float base_score) const {
if (loss_type == kLogisticRaw ||
loss_type == kLogisticClassify ||
loss_type == kLogisticNeglik ) {
utils::Check(base_score > 0.0f && base_score < 1.0f,
"base_score must be in (0,1) for logistic loss");
base_score = -logf(1.0f / base_score - 1.0f);
}
return base_score;
}
/*! \brief get default evaluation metric for the objective */
inline const char *DefaultEvalMetric(void) const {
if (loss_type == kLogisticClassify) return "error";
if (loss_type == kLogisticRaw) return "auc";
return "rmse";
}
};
/*! \brief objective function that only need to */
class RegLossObj : public IObjFunction{
public:
explicit RegLossObj(int loss_type) {
loss.loss_type = loss_type;
scale_pos_weight = 1.0f;
}
virtual ~RegLossObj(void) {}
virtual void SetParam(const char *name, const char *val) {
if (!strcmp("scale_pos_weight", name)) {
scale_pos_weight = static_cast<float>(atof(val));
}
}
virtual void GetGradient(const std::vector<float>& preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) {
utils::Check(preds.size() == info.labels.size(),
"labels are not correctly provided");
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
// start calculating gradient
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
float p = loss.PredTransform(preds[j]);
float w = info.GetWeight(j);
if (info.labels[j] == 1.0f) w *= scale_pos_weight;
gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
loss.SecondOrderGradient(p, info.labels[j]) * w);
}
}
virtual const char* DefaultEvalMetric(void) {
return loss.DefaultEvalMetric();
}
virtual void PredTransform(std::vector<float> *io_preds) {
std::vector<float> &preds = *io_preds;
const unsigned ndata = static_cast<unsigned>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
preds[j] = loss.PredTransform(preds[j]);
}
}
protected:
float scale_pos_weight;
LossType loss;
};
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_

79
src/learner/objective.h Normal file
View File

@@ -0,0 +1,79 @@
#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
#define XGBOOST_LEARNER_OBJECTIVE_H_
/*!
* \file objective.h
* \brief interface of objective function used for gradient boosting
* \author Tianqi Chen, Kailong Chen
*/
#include "dmatrix.h"
namespace xgboost {
namespace learner {
/*! \brief interface of objective function */
class IObjFunction{
public:
/*! \brief virtual destructor */
virtual ~IObjFunction(void){}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief get gradient over each of predictions, given existing information
* \param preds prediction of current round
* \param info information about labels, weights, groups in rank
* \param iter current iteration number
* \param out_gpair output of get gradient, saves gradient and second order gradient in
*/
virtual void GetGradient(const std::vector<float>& preds,
const MetaInfo &info,
int iter,
std::vector<bst_gpair> *out_gpair) = 0;
/*! \return the default evaluation metric for the objective */
virtual const char* DefaultEvalMetric(void) = 0;
// the following functions are optional, most of time default implementation is good enough
/*!
* \brief transform prediction values, this is only called when Prediction is called
* \param io_preds prediction values, saves to this vector as well
*/
virtual void PredTransform(std::vector<float> *io_preds){}
/*!
* \brief transform prediction values, this is only called when Eval is called,
* usually it redirect to PredTransform
* \param io_preds prediction values, saves to this vector as well
*/
virtual void EvalTransform(std::vector<float> *io_preds) {
this->PredTransform(io_preds);
}
/*!
* \brief transform probability value back to margin
* this is used to transform user-set base_score back to margin
* used by gradient boosting
* \return transformed value
*/
virtual float ProbToMargin(float base_score) {
return base_score;
}
};
} // namespace learner
} // namespace xgboost
// this are implementations of objective functions
#include "objective-inl.hpp"
// factory function
namespace xgboost {
namespace learner {
/*! \brief factory funciton to create objective function by name */
inline IObjFunction* CreateObjFunction(const char *name) {
if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
utils::Error("unknown objective function type: %s", name);
return NULL;
}
} // namespace learner
} // namespace xgboost
#endif // XGBOOST_LEARNER_OBJECTIVE_H_

492
src/tree/model.h Normal file
View File

@@ -0,0 +1,492 @@
#ifndef XGBOOST_TREE_MODEL_H_
#define XGBOOST_TREE_MODEL_H_
/*!
* \file model.h
* \brief model structure for tree
* \author Tianqi Chen
*/
#include <string>
#include <cstring>
#include <sstream>
#include <limits>
#include <algorithm>
#include <vector>
#include <cmath>
#include "../utils/io.h"
#include "../utils/fmap.h"
#include "../utils/utils.h"
namespace xgboost {
namespace tree {
/*!
* \brief template class of TreeModel
* \tparam TSplitCond data type to indicate split condition
* \tparam TNodeStat auxiliary statistics of node to help tree building
*/
template<typename TSplitCond, typename TNodeStat>
class TreeModel {
public:
/*! \brief data type to indicate split condition */
typedef TNodeStat NodeStat;
/*! \brief auxiliary statistics of node to help tree building */
typedef TSplitCond SplitCond;
/*! \brief parameters of the tree */
struct Param{
/*! \brief number of start root */
int num_roots;
/*! \brief total number of nodes */
int num_nodes;
/*!\brief number of deleted nodes */
int num_deleted;
/*! \brief maximum depth, this is a statistics of the tree */
int max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
/*! \brief reserved part */
int reserved[32];
/*! \brief constructor */
Param(void) {
max_depth = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
if (!strcmp("num_roots", name)) num_roots = atoi(val);
if (!strcmp("num_feature", name)) num_feature = atoi(val);
}
};
/*! \brief tree node */
class Node{
public:
/*! \brief index of left child */
inline int cleft(void) const {
return this->cleft_;
}
/*! \brief index of right child */
inline int cright(void) const {
return this->cright_;
}
/*! \brief index of default child when feature is missing */
inline int cdefault(void) const {
return this->default_left() ? this->cleft() : this->cright();
}
/*! \brief feature index of split condition */
inline unsigned split_index(void) const {
return sindex_ & ((1U << 31) - 1U);
}
/*! \brief when feature is unknown, whether goes to left child */
inline bool default_left(void) const {
return (sindex_ >> 31) != 0;
}
/*! \brief whether current node is leaf node */
inline bool is_leaf(void) const {
return cleft_ == -1;
}
/*! \brief get leaf value of leaf node */
inline float leaf_value(void) const {
return (this->info_).leaf_value;
}
/*! \brief get split condition of the node */
inline TSplitCond split_cond(void) const {
return (this->info_).split_cond;
}
/*! \brief get parent of the node */
inline int parent(void) const {
return parent_ & ((1U << 31) - 1);
}
/*! \brief whether current node is left child */
inline bool is_left_child(void) const {
return (parent_ & (1U << 31)) != 0;
}
/*! \brief whether current node is root */
inline bool is_root(void) const {
return parent_ == -1;
}
/*!
* \brief set the right child
* \param nide node id to right child
*/
inline void set_right_child(int nid) {
this->cright_ = nid;
}
/*!
* \brief set split condition of current node
* \param split_index feature index to split
* \param split_cond split condition
* \param default_left the default direction when feature is unknown
*/
inline void set_split(unsigned split_index, TSplitCond split_cond,
bool default_left = false) {
if (default_left) split_index |= (1U << 31);
this->sindex_ = split_index;
(this->info_).split_cond = split_cond;
}
/*!
* \brief set the leaf value of the node
* \param value leaf value
* \param right right index, could be used to store
* additional information
*/
inline void set_leaf(float value, int right = -1) {
(this->info_).leaf_value = value;
this->cleft_ = -1;
this->cright_ = right;
}
private:
friend class TreeModel<TSplitCond, TNodeStat>;
/*!
* \brief in leaf node, we have weights, in non-leaf nodes,
* we have split condition
*/
union Info{
float leaf_value;
TSplitCond split_cond;
};
// pointer to parent, highest bit is used to
// indicate whether it's a left child or not
int parent_;
// pointer to left, right
int cleft_, cright_;
// split feature index, left split or right split depends on the highest bit
unsigned sindex_;
// extra info
Info info_;
// set parent
inline void set_parent(int pidx, bool is_left_child = true) {
if (is_left_child) pidx |= (1U << 31);
this->parent_ = pidx;
}
};
protected:
// vector of nodes
std::vector<Node> nodes;
// stats of nodes
std::vector<TNodeStat> stats;
// free node space, used during training process
std::vector<int> deleted_nodes;
// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
inline int AllocNode(void) {
if (param.num_deleted != 0) {
int nd = deleted_nodes.back();
deleted_nodes.pop_back();
--param.num_deleted;
return nd;
}
int nd = param.num_nodes++;
utils::Check(param.num_nodes < std::numeric_limits<int>::max(),
"number of nodes in the tree exceed 2^31");
nodes.resize(param.num_nodes);
stats.resize(param.num_nodes);
return nd;
}
// delete a tree node
inline void DeleteNode(int nid) {
utils::Assert(nid >= param.num_roots, "can not delete root");
deleted_nodes.push_back(nid);
nodes[nid].set_parent(-1);
++param.num_deleted;
}
public:
/*!
* \brief change a non leaf node to a leaf node, delete its children
* \param rid node id of the node
* \param new leaf value
*/
inline void ChangeToLeaf(int rid, float value) {
utils::Assert(nodes[nodes[rid].cleft() ].is_leaf(),
"can not delete a non termial child");
utils::Assert(nodes[nodes[rid].cright()].is_leaf(),
"can not delete a non termial child");
this->DeleteNode(nodes[rid].cleft());
this->DeleteNode(nodes[rid].cright());
nodes[rid].set_leaf(value);
}
/*!
* \brief collapse a non leaf node to a leaf node, delete its children
* \param rid node id of the node
* \param new leaf value
*/
inline void CollapseToLeaf(int rid, float value) {
if (nodes[rid].is_leaf()) return;
if (!nodes[nodes[rid].cleft() ].is_leaf()) {
CollapseToLeaf(nodes[rid].cleft(), 0.0f);
}
if (!nodes[nodes[rid].cright() ].is_leaf()) {
CollapseToLeaf(nodes[rid].cright(), 0.0f);
}
this->ChangeToLeaf(rid, value);
}
public:
/*! \brief model parameter */
Param param;
/*! \brief constructor */
TreeModel(void) {
param.num_nodes = 1;
param.num_roots = 1;
param.num_deleted = 0;
nodes.resize(1);
}
/*! \brief get node given nid */
inline Node &operator[](int nid) {
return nodes[nid];
}
/*! \brief get node given nid */
inline const Node &operator[](int nid) const {
return nodes[nid];
}
/*! \brief get node statistics given nid */
inline NodeStat &stat(int nid) {
return stats[nid];
}
/*! \brief initialize the model */
inline void InitModel(void) {
param.num_nodes = param.num_roots;
nodes.resize(param.num_nodes);
stats.resize(param.num_nodes);
for (int i = 0; i < param.num_nodes; i ++) {
nodes[i].set_leaf(0.0f);
nodes[i].set_parent(-1);
}
}
/*!
* \brief load model from stream
* \param fi input stream
*/
inline void LoadModel(utils::IStream &fi) {
utils::Check(fi.Read(&param, sizeof(Param)) > 0,
"TreeModel: wrong format");
nodes.resize(param.num_nodes); stats.resize(param.num_nodes);
utils::Check(fi.Read(&nodes[0], sizeof(Node) * nodes.size()) > 0,
"TreeModel: wrong format");
utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
"TreeModel: wrong format");
// chg deleted nodes
deleted_nodes.resize(0);
for (int i = param.num_roots; i < param.num_nodes; i ++) {
if (nodes[i].is_root()) deleted_nodes.push_back(i);
}
utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
"number of deleted nodes do not match");
}
/*!
* \brief save model to stream
* \param fo output stream
*/
inline void SaveModel(utils::IStream &fo) const {
utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
"Tree::SaveModel");
utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
"Tree::SaveModel");
fo.Write(&param, sizeof(Param));
fo.Write(&nodes[0], sizeof(Node) * nodes.size());
fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
}
/*!
* \brief add child nodes to node
* \param nid node id to add childs
*/
inline void AddChilds(int nid) {
int pleft = this->AllocNode();
int pright = this->AllocNode();
nodes[nid].cleft_ = pleft;
nodes[nid].cright_ = pright;
nodes[nodes[nid].cleft() ].set_parent(nid, true);
nodes[nodes[nid].cright()].set_parent(nid, false);
}
/*!
* \brief only add a right child to a leaf node
* \param node id to add right child
*/
inline void AddRightChild(int nid) {
int pright = this->AllocNode();
nodes[nid].right = pright;
nodes[nodes[nid].right].set_parent(nid, false);
}
/*!
* \brief get current depth
* \param nid node id
* \param pass_rchild whether right child is not counted in depth
*/
inline int GetDepth(int nid, bool pass_rchild = false) const {
int depth = 0;
while (!nodes[nid].is_root()) {
if (!pass_rchild || nodes[nid].is_left_child()) ++depth;
nid = nodes[nid].parent();
}
return depth;
}
/*!
* \brief get maximum depth
* \param nid node id
*/
inline int MaxDepth(int nid) const {
if (nodes[nid].is_leaf()) return 0;
return std::max(MaxDepth(nodes[nid].cleft())+1,
MaxDepth(nodes[nid].cright())+1);
}
/*!
* \brief get maximum depth
*/
inline int MaxDepth(void) {
int maxd = 0;
for (int i = 0; i < param.num_roots; ++i) {
maxd = std::max(maxd, MaxDepth(i));
}
return maxd;
}
/*! \brief number of extra nodes besides the root */
inline int num_extra_nodes(void) const {
return param.num_nodes - param.num_roots - param.num_deleted;
}
/*!
* \brief dump model to text string
* \param fmap feature map of feature types
* \param with_stats whether dump out statistics as well
* \return the string of dumped model
*/
inline std::string DumpModel(const utils::FeatMap& fmap, bool with_stats) {
std::stringstream fo("");
for (int i = 0; i < param.num_roots; ++i) {
this->Dump(i, fo, fmap, 0, with_stats);
}
return fo.str();
}
private:
void Dump(int nid, std::stringstream &fo,
const utils::FeatMap& fmap, int depth, bool with_stats) {
for (int i = 0; i < depth; ++i) {
fo << '\t';
}
if (nodes[nid].is_leaf()) {
fo << nid << ":leaf=" << nodes[nid].leaf_value();
if (with_stats) {
stat(nid).Print(fo, true);
}
fo << '\n';
} else {
// right then left,
TSplitCond cond = nodes[nid].split_cond();
const unsigned split_index = nodes[nid].split_index();
if (split_index < fmap.size()) {
switch (fmap.type(split_index)) {
case utils::FeatMap::kIndicator: {
int nyes = nodes[nid].default_left() ?
nodes[nid].cright() : nodes[nid].cleft();
fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
<< ",no=" << nodes[nid].cdefault();
break;
}
case utils::FeatMap::kInteger: {
fo << nid << ":[" << fmap.name(split_index) << "<"
<< int(float(cond)+1.0f)
<< "] yes=" << nodes[nid].cleft()
<< ",no=" << nodes[nid].cright()
<< ",missing=" << nodes[nid].cdefault();
break;
}
case utils::FeatMap::kFloat:
case utils::FeatMap::kQuantitive: {
fo << nid << ":[" << fmap.name(split_index) << "<"<< float(cond)
<< "] yes=" << nodes[nid].cleft()
<< ",no=" << nodes[nid].cright()
<< ",missing=" << nodes[nid].cdefault();
break;
}
default: utils::Error("unknown fmap type");
}
} else {
fo << nid << ":[f" << split_index << "<"<< float(cond)
<< "] yes=" << nodes[nid].cleft()
<< ",no=" << nodes[nid].cright()
<< ",missing=" << nodes[nid].cdefault();
}
if (with_stats) {
fo << ' ';
stat(nid).Print(fo, false);
}
fo << '\n';
this->Dump(nodes[nid].cleft(), fo, fmap, depth+1, with_stats);
this->Dump(nodes[nid].cright(), fo, fmap, depth+1, with_stats);
}
}
};
/*! \brief node statistics used in regression tree */
struct RTreeNodeStat{
/*! \brief loss chg caused by current split */
float loss_chg;
/*! \brief sum of hessian values, used to measure coverage of data */
float sum_hess;
/*! \brief weight of current node */
float base_weight;
/*! \brief number of child that is leaf node known up to now */
int leaf_child_cnt;
/*! \brief print information of current stats to fo */
inline void Print(std::stringstream &fo, bool is_leaf) const {
if (!is_leaf) {
fo << "gain=" << loss_chg << ",cover=" << sum_hess;
} else {
fo << "cover=" << sum_hess;
}
}
};
/*! \brief define regression tree to be the most common tree model */
class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
public:
/*!
* \brief get the leaf index
* \param feats dense feature vector, if the feature is missing the field is set to NaN
* \param root_gid starting root index of the instance
* \return the leaf index of the given feature
*/
inline int GetLeafIndex(const std::vector<float> &feat, unsigned root_id = 0) const {
// start from groups that belongs to current data
int pid = static_cast<int>(root_id);
// tranverse tree
while (!(*this)[ pid ].is_leaf()) {
unsigned split_index = (*this)[pid].split_index();
const float fvalue = feat[split_index];
pid = this->GetNext(pid, fvalue, std::isnan(fvalue));
}
return pid;
}
/*!
* \brief get the prediction of regression tree, only accepts dense feature vector
* \param feats dense feature vector, if the feature is missing the field is set to NaN
* \param root_gid starting root index of the instance
* \return the leaf index of the given feature
*/
inline float Predict(const std::vector<float> &feat, unsigned root_id = 0) const {
int pid = this->GetLeafIndex(feat, root_id);
return (*this)[pid].leaf_value();
}
private:
/*! \brief get next position of the tree given current pid */
inline int GetNext(int pid, float fvalue, bool is_unknown) const {
float split_value = (*this)[pid].split_cond();
if (is_unknown) {
return (*this)[pid].cdefault();
} else {
if (fvalue < split_value) {
return (*this)[pid].cleft();
} else {
return (*this)[pid].cright();
}
}
}
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_MODEL_H_

262
src/tree/param.h Normal file
View File

@@ -0,0 +1,262 @@
#ifndef XGBOOST_TREE_PARAM_H_
#define XGBOOST_TREE_PARAM_H_
/*!
* \file param.h
* \brief training parameters, statistics used to support tree construction
* \author Tianqi Chen
*/
#include <cstring>
#include "../data.h"
namespace xgboost {
namespace tree {
/*! \brief core statistics used for tree construction */
struct GradStats {
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief constructor */
GradStats(void) {
this->Clear();
}
/*! \brief clear the statistics */
inline void Clear(void) {
sum_grad = sum_hess = 0.0f;
}
/*! \brief add statistics to the data */
inline void Add(double grad, double hess) {
sum_grad += grad; sum_hess += hess;
}
/*! \brief add statistics to the data */
inline void Add(const bst_gpair& b) {
this->Add(b.grad, b.hess);
}
/*! \brief add statistics to the data */
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
/*! \brief substract the statistics by b */
inline GradStats Substract(const GradStats &b) const {
GradStats res;
res.sum_grad = this->sum_grad - b.sum_grad;
res.sum_hess = this->sum_hess - b.sum_hess;
return res;
}
/*! \return whether the statistics is not used yet */
inline bool Empty(void) const {
return sum_hess == 0.0;
}
};
/*! \brief training parameters for regression tree */
struct TrainParam{
// learning step size for a time
float learning_rate;
// minimum loss change required for a split
float min_split_loss;
// maximum depth of a tree
int max_depth;
//----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child
float min_child_weight;
// weight decay parameter used to control leaf fitting
float reg_lambda;
// reg method
int reg_method;
// default direction choice
int default_direction;
// whether we want to do subsample
float subsample;
// whether to subsample columns each split, in each level
float colsample_bylevel;
// whether to subsample columns during tree construction
float colsample_bytree;
// speed optimization for dense column
float opt_dense_col;
// number of threads to be used for tree construction,
// if OpenMP is enabled, if equals 0, use system default
int nthread;
/*! \brief constructor */
TrainParam(void) {
learning_rate = 0.3f;
min_child_weight = 1.0f;
max_depth = 6;
reg_lambda = 1.0f;
reg_method = 2;
default_direction = 0;
subsample = 1.0f;
colsample_bytree = 1.0f;
colsample_bylevel = 1.0f;
opt_dense_col = 1.0f;
nthread = 0;
}
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
// sync-names
if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "reg_method")) reg_method = static_cast<float>(atof(val));
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
if (!strcmp(name, "nthread")) nthread = atoi(val);
if (!strcmp(name, "default_direction")) {
if (!strcmp(val, "learn")) default_direction = 0;
if (!strcmp(val, "left")) default_direction = 1;
if (!strcmp(val, "right")) default_direction = 2;
}
}
// calculate the cost of loss function
inline double CalcGain(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) {
return 0.0;
}
switch (reg_method) {
case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess;
case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda);
case 3 : return
Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) /
(sum_hess + 0.5 * reg_lambda);
default: return Sqr(sum_grad) / sum_hess;
}
}
// calculate weight given the statistics
inline double CalcWeight(double sum_grad, double sum_hess) const {
if (sum_hess < min_child_weight) {
return 0.0;
} else {
switch (reg_method) {
case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
case 2: return - sum_grad / (sum_hess + reg_lambda);
case 3: return
- ThresholdL1(sum_grad, 0.5 * reg_lambda) /
(sum_hess + 0.5 * reg_lambda);
default: return - sum_grad / sum_hess;
}
}
}
/*! \brief whether need forward small to big search: default right */
inline bool need_forward_search(float col_density = 0.0f) const {
return this->default_direction == 2 ||
(default_direction == 0 && (col_density < opt_dense_col));
}
/*! \brief whether need backward big to small search: default left */
inline bool need_backward_search(float col_density = 0.0f) const {
return this->default_direction != 2;
}
/*! \brief given the loss change, whether we need to invode prunning */
inline bool need_prune(double loss_chg, int depth) const {
return loss_chg < this->min_split_loss;
}
/*! \brief whether we can split with current hessian */
inline bool cannot_split(double sum_hess, int depth) const {
return sum_hess < this->min_child_weight * 2.0;
}
// code support for template data
inline double CalcWeight(const GradStats &d) const {
return this->CalcWeight(d.sum_grad, d.sum_hess);
}
inline double CalcGain(const GradStats &d) const {
return this->CalcGain(d.sum_grad, d.sum_hess);
}
protected:
// functions for L1 cost
inline static double ThresholdL1(double w, double lambda) {
if (w > +lambda) return w - lambda;
if (w < -lambda) return w + lambda;
return 0.0;
}
inline static double Sqr(double a) {
return a * a;
}
};
/*!
* \brief statistics that is helpful to store
* and represent a split solution for the tree
*/
struct SplitEntry{
/*! \brief loss change after split this node */
bst_float loss_chg;
/*! \brief split index */
unsigned sindex;
/*! \brief split value */
float split_value;
/*! \brief constructor */
SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
/*!
* \brief decides whether a we can replace current entry with the statistics given
* This function gives better priority to lower index when loss_chg equals
* not the best way, but helps to give consistent result during multi-thread execution
* \param loss_chg the loss reduction get through the split
* \param split_index the feature index where the split is on
*/
inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const {
if (this->split_index() <= split_index) {
return loss_chg > this->loss_chg;
} else {
return !(this->loss_chg > loss_chg);
}
}
/*!
* \brief update the split entry, replace it if e is better
* \param e candidate split solution
* \return whether the proposed split is better and can replace current split
*/
inline bool Update(const SplitEntry &e) {
if (this->NeedReplace(e.loss_chg, e.split_index())) {
this->loss_chg = e.loss_chg;
this->sindex = e.sindex;
this->split_value = e.split_value;
return true;
} else {
return false;
}
}
/*!
* \brief update the split entry, replace it if e is better
* \param loss_chg loss reduction of new candidate
* \param split_index feature index to split on
* \param split_value the split point
* \param default_left whether the missing value goes to left
* \return whether the proposed split is better and can replace current split
*/
inline bool Update(bst_float loss_chg, unsigned split_index,
float split_value, bool default_left) {
if (this->NeedReplace(loss_chg, split_index)) {
this->loss_chg = loss_chg;
if (default_left) split_index |= (1U << 31);
this->sindex = split_index;
this->split_value = split_value;
return true;
} else {
return false;
}
}
/*!\return feature index to split on */
inline unsigned split_index(void) const {
return sindex & ((1U << 31) - 1U);
}
/*!\return whether missing value goes to left branch */
inline bool default_left(void) const {
return (sindex >> 31) != 0;
}
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_PARAM_H_

70
src/tree/updater.h Normal file
View File

@@ -0,0 +1,70 @@
#ifndef XGBOOST_TREE_UPDATER_H_
#define XGBOOST_TREE_UPDATER_H_
/*!
* \file updater.h
* \brief interface to update the tree
* \author Tianqi Chen
*/
#include <vector>
#include "../data.h"
#include "./model.h"
namespace xgboost {
namespace tree {
/*!
* \brief interface of tree update module, that performs update of a tree
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class IUpdater {
public:
/*!
* \brief set parameters from outside
* \param name name of the parameter
* \param val value of the parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*!
* \brief peform update to the tree models
* \param gpair the gradient pair statistics of the data
* \param fmat feature matrix that provide access to features
* \param root_index pre-partitioned root_index of each instance,
* root_index.size() can be 0 which indicates that no pre-partition involved
* \param trees pointer to the trese to be updated, upater will change the content of the tree
* note: all the trees in the vector are updated, with the same statistics,
* but maybe different random seeds, usually one tree is passed in at a time,
* there can be multiple trees when we train random forest style model
*/
virtual void Update(const std::vector<bst_gpair> &gpair,
FMatrix &fmat,
const std::vector<unsigned> &root_index,
const std::vector<RegTree*> &trees) = 0;
// destructor
virtual ~IUpdater(void) {}
};
} // namespace tree
} // namespace xgboost
#include "./updater_prune-inl.hpp"
#include "./updater_colmaker-inl.hpp"
namespace xgboost {
namespace tree {
/*!
* \brief create a updater based on name
* \param name name of updater
* \return return the updater instance
*/
template<typename FMatrix>
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;
}
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_H_

View File

@@ -0,0 +1,357 @@
#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
/*!
* \file updater_colmaker-inl.hpp
* \brief use columnwise update to construct a tree
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include "./param.h"
#include "./updater.h"
#include "../utils/omp.h"
#include "../utils/random.h"
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishs */
template<typename FMatrix, typename TStats>
class ColMaker: public IUpdater<FMatrix> {
public:
virtual ~ColMaker(void) {}
// set training parameter
virtual void SetParam(const char *name, const char *val) {
param.SetParam(name, val);
}
virtual void Update(const std::vector<bst_gpair> &gpair,
FMatrix &fmat,
const std::vector<unsigned> &root_index,
const std::vector<RegTree*> &trees) {
for (size_t i = 0; i < trees.size(); ++i) {
Builder builder(param);
builder.Update(gpair, fmat, root_index, trees[i]);
}
}
private:
// training parameter
TrainParam param;
// data structure
/*! \brief per thread x per node entry to store tmp data */
struct ThreadEntry {
/*! \brief statistics of data*/
TStats stats;
/*! \brief last feature value scanned */
float last_fvalue;
/*! \brief current best solution */
SplitEntry best;
// constructor
ThreadEntry(void) {
stats.Clear();
}
};
struct NodeEntry {
/*! \brief statics for node entry */
TStats stats;
/*! \brief loss of this node, without split */
bst_float root_gain;
/*! \brief weight calculated related to current data */
float weight;
/*! \brief current best solution */
SplitEntry best;
// constructor
NodeEntry(void) : root_gain(0.0f), weight(0.0f){
stats.Clear();
}
};
// actual builder that runs the algorithm
struct Builder{
public:
// constructor
explicit Builder(const TrainParam &param) : param(param) {}
// update one tree, growing
virtual void Update(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
const std::vector<unsigned> &root_index,
RegTree *p_tree) {
this->InitData(gpair, fmat, root_index, *p_tree);
this->InitNewNode(qexpand, gpair, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand);
this->InitNewNode(qexpand, gpair, *p_tree);
// if nothing left to be expand, break
if (qexpand.size() == 0) break;
}
// set all the rest expanding nodes to leaf
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
(*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
}
// remember auxiliary statistics in the tree node
for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
p_tree->stat(nid).base_weight = snode[nid].weight;
p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
}
}
private:
// initialize temp data structure
inline void InitData(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
{// setup position
position.resize(gpair.size());
if (root_index.size() == 0) {
std::fill(position.begin(), position.end(), 0);
} else {
for (size_t i = 0; i < root_index.size(); ++i) {
position[i] = root_index[i];
utils::Assert(root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting");
}
}
// mark delete for the deleted datas
for (size_t i = 0; i < gpair.size(); ++i) {
if (gpair[i].hess < 0.0f) position[i] = -1;
}
// mark subsample
if (param.subsample < 1.0f) {
for (size_t i = 0; i < gpair.size(); ++i) {
if (gpair[i].hess < 0.0f) continue;
if (random::SampleBinary(param.subsample) == 0) position[i] = -1;
}
}
}
{
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
for (unsigned i = 0; i < ncol; ++i) {
if (fmat.GetColSize(i) != 0) feat_index.push_back(i);
}
unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
random::Shuffle(feat_index);
utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
feat_index.resize(n);
}
{// setup temp space for each thread
#pragma omp parallel
{
this->nthread = omp_get_num_threads();
}
// reserve a small space
stemp.clear();
stemp.resize(this->nthread, std::vector<ThreadEntry>());
for (size_t i = 0; i < stemp.size(); ++i) {
stemp[i].clear(); stemp[i].reserve(256);
}
snode.reserve(256);
}
{// expand query
qexpand.reserve(256); qexpand.clear();
for (int i = 0; i < tree.param.num_roots; ++i) {
qexpand.push_back(i);
}
}
}
/*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair,
const RegTree &tree) {
{// setup statistics space for each tree node
for (size_t i = 0; i < stemp.size(); ++i) {
stemp[i].resize(tree.param.num_nodes, ThreadEntry());
}
snode.resize(tree.param.num_nodes, NodeEntry());
}
// setup position
const unsigned ndata = static_cast<unsigned>(position.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
const int tid = omp_get_thread_num();
if (position[i] < 0) continue;
stemp[tid][position[i]].stats.Add(gpair[i]);
}
// sum the per thread statistics together
for (size_t j = 0; j < qexpand.size(); ++j) {
const int nid = qexpand[j];
TStats stats; stats.Clear();
for (size_t tid = 0; tid < stemp.size(); ++tid) {
stats.Add(stemp[tid][nid].stats);
}
// update node statistics
snode[nid].stats = stats;
snode[nid].root_gain = param.CalcGain(stats);
snode[nid].weight = param.CalcWeight(stats);
}
}
/*! \brief update queue expand add in new leaves */
inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
std::vector<int> &qexpand = *p_qexpand;
std::vector<int> newnodes;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
if (!tree[ nid ].is_leaf()) {
newnodes.push_back(tree[nid].cleft());
newnodes.push_back(tree[nid].cright());
}
}
// use new nodes for qexpand
qexpand = newnodes;
}
// enumerate the split values of specific feature
template<typename Iter>
inline void EnumerateSplit(Iter it, unsigned fid,
const std::vector<bst_gpair> &gpair,
std::vector<ThreadEntry> &temp,
bool is_forward_search) {
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
while (it.Next()) {
const bst_uint ridx = it.rindex();
const int nid = position[ridx];
if (nid < 0) continue;
// start working
const float fvalue = it.fvalue();
// get the statistics of nid
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
if (e.stats.Empty()) {
e.stats.Add(gpair[ridx]);
e.last_fvalue = fvalue;
} else {
// try to find a split
if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
TStats c = snode[nid].stats.Substract(e.stats);
if (c.sum_hess >= param.min_child_weight) {
double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
}
}
// update the statistics
e.stats.Add(gpair[ridx]);
e.last_fvalue = fvalue;
}
}
// finish updating all statistics, check if it is possible to include all sum statistics
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
ThreadEntry &e = temp[nid];
TStats c = snode[nid].stats.Substract(e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
const double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
const float delta = is_forward_search ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
}
}
}
// find splits at current level, do split per level
inline void FindSplit(int depth, const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const FMatrix &fmat,
RegTree *p_tree) {
std::vector<unsigned> feat_set = feat_index;
if (param.colsample_bylevel != 1.0f) {
random::Shuffle(feat_set);
unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
feat_set.resize(n);
}
// start enumeration
const unsigned nsize = static_cast<unsigned>(feat_set.size());
#pragma omp parallel for schedule(dynamic, 1)
for (unsigned i = 0; i < nsize; ++i) {
const unsigned fid = feat_set[i];
const int tid = omp_get_thread_num();
if (param.need_forward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, stemp[tid], true);
}
if (param.need_backward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, stemp[tid], false);
}
}
// after this each thread's stemp will get the best candidates, aggregate results
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
NodeEntry &e = snode[nid];
for (int tid = 0; tid < this->nthread; ++tid) {
e.best.Update(stemp[tid][nid].best);
}
// now we know the solution in snode[nid], set split
if (e.best.loss_chg > rt_eps) {
p_tree->AddChilds(nid);
(*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
} else {
(*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
}
}
}
// reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
// step 1, set default direct nodes to default, and leaf nodes to -1
const unsigned ndata = static_cast<unsigned>(position.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
const int nid = position[i];
if (nid >= 0) {
if (tree[nid].is_leaf()) {
position[i] = -1;
} else {
// push to default branch, correct latter
position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
}
}
}
// step 2, classify the non-default data into right places
std::vector<unsigned> fsplits;
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
if (!tree[nid].is_leaf()) fsplits.push_back(tree[nid].split_index());
}
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// start put things into right place
const unsigned nfeats = static_cast<unsigned>(fsplits.size());
#pragma omp parallel for schedule(dynamic, 1)
for (unsigned i = 0; i < nfeats; ++i) {
const unsigned fid = fsplits[i];
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
const bst_uint ridx = it.rindex();
int nid = position[ridx];
if (nid == -1) continue;
// go back to parent, correct those who are not default
nid = tree[nid].parent();
if (tree[nid].split_index() == fid) {
if (it.fvalue() < tree[nid].split_cond()) {
position[ridx] = tree[nid].cleft();
} else {
position[ridx] = tree[nid].cright();
}
}
}
}
}
//--data fields--
const TrainParam &param;
// number of omp thread used during training
int nthread;
// Per feature: shuffle index of each feature index
std::vector<unsigned> feat_index;
// Instance Data: current node position in the tree of each instance
std::vector<int> position;
// PerThread x PerTreeNode: statistics for per thread construction
std::vector< std::vector<ThreadEntry> > stemp;
/*! \brief TreeNode Data: statistics for each constructed node */
std::vector<NodeEntry> snode;
/*! \brief queue of nodes to be expanded */
std::vector<int> qexpand;
};
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_

View File

@@ -0,0 +1,67 @@
#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
/*!
* \file updater_prune-inl.hpp
* \brief prune a tree given the statistics
* \author Tianqi Chen
*/
#include <vector>
#include "./param.h"
#include "./updater.h"
namespace xgboost {
namespace tree {
/*! \brief pruner that prunes a tree after growing finishs */
template<typename FMatrix>
class TreePruner: public IUpdater<FMatrix> {
public:
virtual ~TreePruner(void) {}
// set training parameter
virtual void SetParam(const char *name, const char *val) {
param.SetParam(name, val);
}
// update the tree, do pruning
virtual void Update(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
const std::vector<unsigned> &root_index,
const std::vector<RegTree*> &trees) {
for (size_t i = 0; i < trees.size(); ++i) {
this->DoPrune(*trees[i]);
}
}
private:
// try to prune off current leaf
inline void TryPruneLeaf(RegTree &tree, int nid, int depth) {
if (tree[nid].is_root()) return;
int pid = tree[nid].parent();
RegTree::NodeStat &s = tree.stat(pid);
++s.leaf_child_cnt;
if (s.leaf_child_cnt >= 2 && param.need_prune(s.loss_chg, depth - 1)) {
// need to be pruned
tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
// tail recursion
this->TryPruneLeaf(tree, pid, depth - 1);
}
}
/*! \brief do prunning of a tree */
inline void DoPrune(RegTree &tree) {
// initialize auxiliary statistics
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
tree.stat(nid).leaf_child_cnt = 0;
}
for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
if (tree[nid].is_leaf()) {
this->TryPruneLeaf(tree, nid, tree.GetDepth(nid));
}
}
}
private:
// training parameter
TrainParam param;
};
} // namespace tree
} // namespace xgboost
#endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_

196
src/utils/config.h Normal file
View File

@@ -0,0 +1,196 @@
#ifndef XGBOOST_UTILS_CONFIG_H_
#define XGBOOST_UTILS_CONFIG_H_
/*!
* \file config.h
* \brief helper class to load in configures from file
* \author Tianqi Chen
*/
#include <cstdio>
#include <cstring>
#include <string>
#include <istream>
#include <fstream>
#include "./utils.h"
namespace xgboost {
namespace utils {
/*!
* \brief base implementation of config reader
*/
class ConfigReaderBase {
public:
/*!
* \brief get current name, called after Next returns true
* \return current parameter name
*/
inline const char *name(void) const {
return s_name;
}
/*!
* \brief get current value, called after Next returns true
* \return current parameter value
*/
inline const char *val(void) const {
return s_val;
}
/*!
* \brief move iterator to next position
* \return true if there is value in next position
*/
inline bool Next(void) {
while (!this->IsEnd()) {
GetNextToken(s_name);
if (s_name[0] == '=') return false;
if (GetNextToken( s_buf ) || s_buf[0] != '=') return false;
if (GetNextToken( s_val ) || s_val[0] == '=') return false;
return true;
}
return false;
}
// called before usage
inline void Init(void) {
ch_buf = this->GetChar();
}
protected:
/*!
* \brief to be implemented by subclass,
* get next token, return EOF if end of file
*/
virtual char GetChar(void) = 0;
/*! \brief to be implemented by child, check if end of stream */
virtual bool IsEnd(void) = 0;
private:
char ch_buf;
char s_name[100000], s_val[100000], s_buf[100000];
inline void SkipLine(void) {
do {
ch_buf = this->GetChar();
} while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
}
inline void ParseStr(char tok[]) {
int i = 0;
while ((ch_buf = this->GetChar()) != EOF) {
switch (ch_buf) {
case '\\': tok[i++] = this->GetChar(); break;
case '\"': tok[i++] = '\0'; return;
case '\r':
case '\n': Error("ConfigReader: unterminated string");
default: tok[i++] = ch_buf;
}
}
Error("ConfigReader: unterminated string");
}
inline void ParseStrML(char tok[]) {
int i = 0;
while ((ch_buf = this->GetChar()) != EOF) {
switch (ch_buf) {
case '\\': tok[i++] = this->GetChar(); break;
case '\'': tok[i++] = '\0'; return;
default: tok[i++] = ch_buf;
}
}
Error("unterminated string");
}
// return newline
inline bool GetNextToken(char tok[]) {
int i = 0;
bool new_line = false;
while (ch_buf != EOF) {
switch (ch_buf) {
case '#' : SkipLine(); new_line = true; break;
case '\"':
if (i == 0) {
ParseStr(tok); ch_buf = this->GetChar(); return new_line;
} else {
Error("ConfigReader: token followed directly by string");
}
case '\'':
if (i == 0) {
ParseStrML( tok ); ch_buf = this->GetChar(); return new_line;
} else {
Error("ConfigReader: token followed directly by string");
}
case '=':
if (i == 0) {
ch_buf = this->GetChar();
tok[0] = '=';
tok[1] = '\0';
} else {
tok[i] = '\0';
}
return new_line;
case '\r':
case '\n':
if (i == 0) new_line = true;
case '\t':
case ' ' :
ch_buf = this->GetChar();
if (i > 0) {
tok[i] = '\0';
return new_line;
}
break;
default:
tok[i++] = ch_buf;
ch_buf = this->GetChar();
break;
}
}
return true;
}
};
/*!
* \brief an iterator use stream base, allows use all types of istream
*/
class ConfigStreamReader: public ConfigReaderBase {
public:
/*!
* \brief constructor
* \param istream input stream
*/
explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}
protected:
virtual char GetChar(void) {
return fin.get();
}
/*! \brief to be implemented by child, check if end of stream */
virtual bool IsEnd(void) {
return fin.eof();
}
private:
std::istream &fin;
};
/*!
* \brief an iterator that iterates over a configure file and gets the configures
*/
class ConfigIterator: public ConfigStreamReader {
public:
/*!
* \brief constructor
* \param fname name of configure file
*/
explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
fi.open(fname);
if (fi.fail()) {
utils::Error("cannot open file %s", fname);
}
ConfigReaderBase::Init();
}
/*! \brief destructor */
~ConfigIterator(void) {
fi.close();
}
private:
std::ifstream fi;
};
} // namespace utils
} // namespace xgboost
#endif // XGBOOST_UTILS_CONFIG_H_

80
src/utils/fmap.h Normal file
View File

@@ -0,0 +1,80 @@
#ifndef XGBOOST_UTILS_FMAP_H_
#define XGBOOST_UTILS_FMAP_H_
/*!
* \file fmap.h
* \brief helper class that holds the feature names and interpretations
* \author Tianqi Chen
*/
#include <vector>
#include <string>
#include <cstring>
#include "./utils.h"
namespace xgboost {
namespace utils {
/*! \brief helper class that holds the feature names and interpretations */
class FeatMap {
public:
enum Type {
kIndicator = 0,
kQuantitive = 1,
kInteger = 2,
kFloat = 3
};
// function definitions
/*! \brief load feature map from text format */
inline void LoadText(const char *fname) {
FILE *fi = utils::FopenCheck(fname, "r");
this->LoadText(fi);
fclose(fi);
}
/*! \brief load feature map from text format */
inline void LoadText(FILE *fi) {
int fid;
char fname[1256], ftype[1256];
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
this->PushBack(fid, fname, ftype);
}
}
/*!\brief push back feature map */
inline void PushBack(int fid, const char *fname, const char *ftype) {
utils::Check(fid == static_cast<int>(names_.size()), "invalid fmap format");
names_.push_back(std::string(fname));
types_.push_back(GetType(ftype));
}
inline void Clear(void) {
names_.clear(); types_.clear();
}
/*! \brief number of known features */
size_t size(void) const {
return names_.size();
}
/*! \brief return name of specific feature */
const char* name(size_t idx) const {
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
return names_[idx].c_str();
}
/*! \brief return type of specific feature */
const Type& type(size_t idx) const {
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
return types_[idx];
}
private:
inline static Type GetType(const char *tname) {
if (!strcmp("i", tname)) return kIndicator;
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;
if (!strcmp("float", tname)) return kFloat;
utils::Error("unknown feature type, use i for indicator and q for quantity");
return kIndicator;
}
/*! \brief name of the feature */
std::vector<std::string> names_;
/*! \brief type of the feature */
std::vector<Type> types_;
};
} // namespace utils
} // namespace xgboost
#endif // XGBOOST_FMAP_H_

104
src/utils/io.h Normal file
View File

@@ -0,0 +1,104 @@
#ifndef XGBOOST_UTILS_IO_H
#define XGBOOST_UTILS_IO_H
#include <cstdio>
#include <vector>
#include <string>
#include "./utils.h"
/*!
* \file io.h
* \brief general stream interface for serialization, I/O
* \author Tianqi Chen
*/
namespace xgboost {
namespace utils {
/*!
* \brief interface of stream I/O, used to serialize model
*/
class IStream {
public:
/*!
* \brief read data from stream
* \param ptr pointer to memory buffer
* \param size size of block
* \return usually is the size of data readed
*/
virtual size_t Read(void *ptr, size_t size) = 0;
/*!
* \brief write data to stream
* \param ptr pointer to memory buffer
* \param size size of block
*/
virtual void Write(const void *ptr, size_t size) = 0;
/*! \brief virtual destructor */
virtual ~IStream(void) {}
public:
// helper functions to write various of data structures
/*!
* \brief binary serialize a vector
* \param vec vector to be serialized
*/
template<typename T>
inline void Write(const std::vector<T> &vec) {
uint64_t sz = vec.size();
this->Write(&sz, sizeof(sz));
this->Write(&vec[0], sizeof(T) * sz);
}
/*!
* \brief binary load a vector
* \param out_vec vector to be loaded
* \return whether load is successfull
*/
template<typename T>
inline bool Read(std::vector<T> *out_vec) {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_vec->resize(sz);
if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
return true;
}
/*!
* \brief binary serialize a string
* \param str the string to be serialized
*/
inline void Write(const std::string &str) {
uint64_t sz = str.length();
this->Write(&sz, sizeof(sz));
this->Write(&str[0], sizeof(char) * sz);
}
/*!
* \brief binary load a string
* \param out_str string to be loaded
* \return whether load is successful
*/
inline bool Read(std::string *out_str) {
uint64_t sz;
if (this->Read(&sz, sizeof(sz)) == 0) return false;
out_str->resize(sz);
if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
return true;
}
};
/*! \brief implementation of file i/o stream */
class FileStream : public IStream {
private:
FILE *fp;
public:
explicit FileStream(FILE *fp) {
this->fp = fp;
}
virtual size_t Read(void *ptr, size_t size) {
return fread(ptr, size, 1, fp);
}
virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp);
}
inline void Close(void) {
fclose(fp);
}
};
} // namespace utils
} // namespace xgboost
#endif

40
src/utils/iterator.h Normal file
View File

@@ -0,0 +1,40 @@
#ifndef XGBOOST_UTILS_ITERATOR_H
#define XGBOOST_UTILS_ITERATOR_H
#include <cstdio>
/*!
* \file iterator.h
* \brief itertator interface
* \author Tianqi Chen
*/
namespace xgboost {
namespace utils {
/*!
* \brief iterator interface
* \tparam DType data type
*/
template<typename DType>
class IIterator {
public:
/*!
* \brief set the parameter
* \param name name of parameter
* \param val value of parameter
*/
virtual void SetParam(const char *name, const char *val) = 0;
/*! \brief initalize the iterator so that we can use the iterator */
virtual void Init(void) = 0;
/*! \brief set before first of the item */
virtual void BeforeFirst(void) = 0;
/*! \brief move to next item */
virtual bool Next(void) = 0;
/*! \brief get current data */
virtual const DType &Value(void) const = 0;
public:
/*! \brief constructor */
virtual ~IIterator(void) {}
};
} // namespace utils
} // namespace xgboost
#endif

123
src/utils/matrix_csr.h Normal file
View File

@@ -0,0 +1,123 @@
#ifndef XGBOOST_UTILS_MATRIX_CSR_H_
#define XGBOOST_UTILS_MATRIX_CSR_H_
/*!
* \file matrix_csr.h
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
* \author Tianqi Chen
*/
#include <vector>
#include <algorithm>
#include "./utils.h"
namespace xgboost {
namespace utils {
/*!
* \brief a class used to help construct CSR format matrix,
* can be used to convert row major CSR to column major CSR
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
* \tparam whether enabling the usage of aclist, this option must be enabled manually
*/
template<typename IndexType, bool UseAcList = false>
struct SparseCSRMBuilder {
private:
/*! \brief dummy variable used in the indicator matrix construction */
std::vector<size_t> dummy_aclist;
/*! \brief pointer to each of the row */
std::vector<size_t> &rptr;
/*! \brief index of nonzero entries in each row */
std::vector<IndexType> &findex;
/*! \brief a list of active rows, used when many rows are empty */
std::vector<size_t> &aclist;
public:
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex)
:rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
Assert(!UseAcList, "enabling bug");
}
/*! \brief use with caution! rptr must be cleaned before use */
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
std::vector<IndexType> &p_findex,
std::vector<size_t> &p_aclist)
:rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
Assert(UseAcList, "must manually enable the option use aclist");
}
public:
/*!
* \brief step 1: initialize the number of rows in the data, not necessary exact
* \nrows number of rows in the matrix, can be smaller than expected
*/
inline void InitBudget(size_t nrows = 0) {
if (!UseAcList) {
rptr.clear();
rptr.resize(nrows + 1, 0);
} else {
Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
this->Cleanup();
}
}
/*!
* \brief step 2: add budget to each rows, this function is called when aclist is used
* \param row_id the id of the row
* \param nelem number of element budget add to this row
*/
inline void AddBudget(size_t row_id, size_t nelem = 1) {
if (rptr.size() < row_id + 2) {
rptr.resize(row_id + 2, 0);
}
if (UseAcList) {
if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
}
rptr[row_id + 1] += nelem;
}
/*! \brief step 3: initialize the necessary storage */
inline void InitStorage(void) {
// initialize rptr to be beginning of each segment
size_t start = 0;
if (!UseAcList) {
for (size_t i = 1; i < rptr.size(); i++) {
size_t rlen = rptr[i];
rptr[i] = start;
start += rlen;
}
} else {
// case with active list
std::sort(aclist.begin(), aclist.end());
for (size_t i = 0; i < aclist.size(); i++) {
size_t ridx = aclist[i];
size_t rlen = rptr[ridx + 1];
rptr[ridx + 1] = start;
// set previous rptr to right position if previous feature is not active
if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
start += rlen;
}
}
findex.resize(start);
}
/*!
* \brief step 4:
* used in indicator matrix construction, add new
* element to each row, the number of calls shall be exactly same as add_budget
*/
inline void PushElem(size_t row_id, IndexType col_id) {
size_t &rp = rptr[row_id + 1];
findex[rp++] = col_id;
}
/*!
* \brief step 5: only needed when aclist is used
* clean up the rptr for next usage
*/
inline void Cleanup(void) {
Assert(UseAcList, "this function can only be called use AcList");
for (size_t i = 0; i < aclist.size(); i++) {
const size_t ridx = aclist[i];
rptr[ridx] = 0; rptr[ridx + 1] = 0;
}
aclist.clear();
}
};
} // namespace utils
} // namespace xgboost
#endif

16
src/utils/omp.h Normal file
View File

@@ -0,0 +1,16 @@
#ifndef XGBOOST_UTILS_OMP_H_
#define XGBOOST_UTILS_OMP_H_
/*!
* \file omp.h
* \brief header to handle OpenMP compatibility issues
* \author Tianqi Chen
*/
#if defined(_OPENMP)
#include <omp.h>
#else
#warning "OpenMP is not available, compile to single thread code"
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
inline void omp_set_num_threads(int nthread) {}
#endif
#endif // XGBOOST_UTILS_OMP_H_

102
src/utils/random.h Normal file
View File

@@ -0,0 +1,102 @@
#ifndef XGBOOST_UTILS_RANDOM_H_
#define XGBOOST_UTILS_RANDOM_H_
/*!
* \file xgboost_random.h
* \brief PRNG to support random number generation
* \author Tianqi Chen: tianqi.tchen@gmail.com
*
* Use standard PRNG from stdlib
*/
#include <cmath>
#include <cstdlib>
#include <vector>
#include <algorithm>
#include "./utils.h"
/*! namespace of PRNG */
namespace xgboost {
namespace random {
/*! \brief seed the PRNG */
inline void Seed(uint32_t seed) {
srand(seed);
}
/*! \brief return a real number uniform in [0,1) */
inline double NextDouble(void) {
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX)+1.0);
}
/*! \brief return a real numer uniform in (0,1) */
inline double NextDouble2(void) {
return (static_cast<double>(rand()) + 1.0) / (static_cast<double>(RAND_MAX)+2.0);
}
/*! \brief return a random number */
inline uint32_t NextUInt32(void) {
return (uint32_t)rand();
}
/*! \brief return a random number in n */
inline uint32_t NextUInt32(uint32_t n) {
return (uint32_t)floor(NextDouble() * n);
}
/*! \brief return x~N(0,1) */
inline double SampleNormal() {
double x, y, s;
do {
x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0;
s = x*x + y*y;
} while (s >= 1.0 || s == 0.0);
return x * sqrt(-2.0 * log(s) / s);
}
/*! \brief return iid x,y ~N(0,1) */
inline void SampleNormal2D(double &xx, double &yy) {
double x, y, s;
do {
x = 2 * NextDouble2() - 1.0;
y = 2 * NextDouble2() - 1.0;
s = x*x + y*y;
} while (s >= 1.0 || s == 0.0);
double t = sqrt(-2.0 * log(s) / s);
xx = x * t;
yy = y * t;
}
/*! \brief return x~N(mu,sigma^2) */
inline double SampleNormal(double mu, double sigma) {
return SampleNormal() * sigma + mu;
}
/*! \brief return 1 with probability p, coin flip */
inline int SampleBinary(double p) {
return NextDouble() < p;
}
template<typename T>
inline void Shuffle(T *data, size_t sz) {
if (sz == 0) return;
for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
std::swap(data[i], data[NextUInt32(i + 1)]);
}
}
// random shuffle the data inside, require PRNG
template<typename T>
inline void Shuffle(std::vector<T> &data) {
Shuffle(&data[0], data.size());
}
/*! \brief random number generator with independent random number seed*/
struct Random{
/*! \brief set random number seed */
inline void Seed(unsigned sd) {
this->rseed = sd;
}
/*! \brief return a real number uniform in [0,1) */
inline double RandDouble(void) {
return static_cast<double>( rand_r( &rseed ) ) / (static_cast<double>( RAND_MAX )+1.0);
}
// random number seed
unsigned rseed;
};
} // namespace random
} // namespace xgboost
#endif // XGBOOST_UTILS_RANDOM_H_

94
src/utils/utils.h Normal file
View File

@@ -0,0 +1,94 @@
#ifndef XGBOOST_UTILS_UTILS_H_
#define XGBOOST_UTILS_UTILS_H_
/*!
* \file utils.h
* \brief simple utils to support the code
* \author Tianqi Chen
*/
#define _CRT_SECURE_NO_WARNINGS
#ifdef _MSC_VER
#define fopen64 fopen
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit"
#endif
#endif
#ifdef __APPLE__
#define off64_t off_t
#define fopen64 fopen
#endif
#define _FILE_OFFSET_BITS 64
extern "C" {
#include <sys/types.h>
};
#endif
#ifdef _MSC_VER
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long uint64_t;
typedef long int64_t;
#else
#include <inttypes.h>
#endif
#include <cstdio>
#include <cstdarg>
#include <cstdlib>
namespace xgboost {
/*! \brief namespace for helper utils of the project */
namespace utils {
/*! \brief assert an condition is true, use this to handle debug information */
inline void Assert(bool exp, const char *fmt, ...) {
if (!exp) {
va_list args;
va_start(args, fmt);
fprintf(stderr, "AssertError:");
vfprintf(stderr, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
}
}
/*!\brief same as assert, but this is intended to be used as message for user*/
inline void Check(bool exp, const char *fmt, ...) {
if (!exp) {
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
}
}
/*! \brief report error message, same as check */
inline void Error(const char *fmt, ...) {
{
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
}
}
/*! \brief replace fopen, report error when the file open fails */
inline FILE *FopenCheck(const char *fname, const char *flag) {
FILE *fp = fopen64(fname, flag);
Check(fp != NULL, "can not open file \"%s\"\n", fname);
return fp;
}
} // namespace utils
} // namespace xgboost
#endif // XGBOOST_UTILS_UTILS_H_