mv code into src
This commit is contained in:
293
src/data.h
Normal file
293
src/data.h
Normal file
@@ -0,0 +1,293 @@
|
||||
#ifndef XGBOOST_UNITY_DATA_H
|
||||
#define XGBOOST_UNITY_DATA_H
|
||||
/*!
|
||||
* \file data.h
|
||||
* \brief the input data structure for gradient boosting
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
#include "utils/io.h"
|
||||
#include "utils/utils.h"
|
||||
#include "utils/iterator.h"
|
||||
#include "utils/matrix_csr.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*!
 * \brief unsigned integer type used in boost,
 *        used for feature index and row index
 */
typedef unsigned bst_uint;
/*! \brief float type, used for storing statistics */
typedef float bst_float;
/*! \brief small positive tolerance used in numeric comparisons */
const float rt_eps = 1e-5f;
// min gap between feature values to allow a split happen
const float rt_2eps = rt_eps * 2.0f;
|
||||
|
||||
/*! \brief gradient statistics pair usually needed in gradient boosting */
|
||||
struct bst_gpair{
|
||||
/*! \brief gradient statistics */
|
||||
bst_float grad;
|
||||
/*! \brief second order gradient statistics */
|
||||
bst_float hess;
|
||||
bst_gpair(void) {}
|
||||
bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {}
|
||||
};
|
||||
|
||||
/*! \brief read-only sparse instance batch in CSR format */
|
||||
struct SparseBatch {
|
||||
/*! \brief an entry of sparse vector */
|
||||
struct Entry {
|
||||
/*! \brief feature index */
|
||||
bst_uint findex;
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
// default constructor
|
||||
Entry(void) {}
|
||||
Entry(bst_uint findex, bst_float fvalue) : findex(findex), fvalue(fvalue) {}
|
||||
/*! \brief reversely compare feature values */
|
||||
inline static bool CmpValue(const Entry &a, const Entry &b) {
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
};
|
||||
/*! \brief an instance of sparse vector in the batch */
|
||||
struct Inst {
|
||||
/*! \brief pointer to the elements*/
|
||||
const Entry *data;
|
||||
/*! \brief length of the instance */
|
||||
const bst_uint length;
|
||||
/*! \brief constructor */
|
||||
Inst(const Entry *data, bst_uint length) : data(data), length(length) {}
|
||||
/*! \brief get i-th pair in the sparse vector*/
|
||||
inline const Entry& operator[](size_t i) const {
|
||||
return data[i];
|
||||
}
|
||||
};
|
||||
/*! \brief batch size */
|
||||
size_t size;
|
||||
/*! \brief the offset of rowid of this batch */
|
||||
size_t base_rowid;
|
||||
/*! \brief array[size+1], row pointer of each of the elements */
|
||||
const size_t *row_ptr;
|
||||
/*! \brief array[row_ptr.back()], content of the sparse element */
|
||||
const Entry *data_ptr;
|
||||
/*! \brief get i-th row from the batch */
|
||||
inline Inst operator[](size_t i) const {
|
||||
return Inst(data_ptr + row_ptr[i], row_ptr[i+1] - row_ptr[i]);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* \brief This is a interface convention via template, defining the way to access features,
|
||||
* column access rule is defined by template, for efficiency purpose,
|
||||
* row access is defined by iterator of sparse batches
|
||||
* \tparam Derived type of actual implementation
|
||||
*/
|
||||
template<typename Derived>
|
||||
class FMatrixInterface {
|
||||
public:
|
||||
/*! \brief example iterator over one column */
|
||||
struct ColIter{
|
||||
/*!
|
||||
* \brief move to next position
|
||||
* \return whether there is element in next position
|
||||
*/
|
||||
inline bool Next(void);
|
||||
/*! \return row index of current position */
|
||||
inline bst_uint rindex(void) const;
|
||||
/*! \return feature value in current position */
|
||||
inline bst_float fvalue(void) const;
|
||||
};
|
||||
/*! \brief backward iterator over column */
|
||||
struct ColBackIter : public ColIter {};
|
||||
public:
|
||||
// column access is needed by some of tree construction algorithms
|
||||
/*!
|
||||
* \brief get column iterator, the columns must be sorted by feature value
|
||||
* \param cidx column index
|
||||
* \return column iterator
|
||||
*/
|
||||
inline ColIter GetSortedCol(size_t cidx) const;
|
||||
/*!
|
||||
* \brief get column backward iterator, starts from biggest fvalue, and iterator back
|
||||
* \param cidx column index
|
||||
* \return reverse column iterator
|
||||
*/
|
||||
inline ColBackIter GetReverseSortedCol(size_t cidx) const;
|
||||
/*!
|
||||
* \brief get number of columns
|
||||
* \return number of columns
|
||||
*/
|
||||
inline size_t NumCol(void) const;
|
||||
/*!
|
||||
* \brief check if column access is supported, if not, initialize column access
|
||||
* \param max_rows maximum number of rows allowed in constructor
|
||||
*/
|
||||
inline void InitColAccess(void);
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess(void) const;
|
||||
/*! \breif return #entries-in-col */
|
||||
inline size_t GetColSize(size_t cidx) const;
|
||||
/*!
|
||||
* \breif return #entries-in-col / #rows
|
||||
* \param cidx column index
|
||||
* this function is used to help speedup,
|
||||
* doese not necessarily implement it if not sure, return 0.0;
|
||||
* \return column density
|
||||
*/
|
||||
inline float GetColDensity(size_t cidx) const;
|
||||
/*! \brief get the row iterator associated with FMatrix */
|
||||
virtual utils::IIterator<SparseBatch>* RowIterator(void) const = 0;
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief sparse matrix that support column access, CSC
|
||||
*/
|
||||
class FMatrixS : public FMatrixInterface<FMatrixS>{
|
||||
public:
|
||||
typedef SparseBatch::Entry Entry;
|
||||
/*! \brief row iterator */
|
||||
struct ColIter{
|
||||
const Entry *dptr_, *end_;
|
||||
ColIter(const Entry* begin, const Entry* end)
|
||||
:dptr_(begin), end_(end) {}
|
||||
inline bool Next(void) {
|
||||
if (dptr_ == end_) {
|
||||
return false;
|
||||
} else {
|
||||
++dptr_; return true;
|
||||
}
|
||||
}
|
||||
inline bst_uint rindex(void) const {
|
||||
return dptr_->findex;
|
||||
}
|
||||
inline bst_float fvalue(void) const {
|
||||
return dptr_->fvalue;
|
||||
}
|
||||
};
|
||||
/*! \brief reverse column iterator */
|
||||
struct ColBackIter : public ColIter {
|
||||
ColBackIter(const Entry* dptr, const Entry* end) : ColIter(dptr, end) {}
|
||||
// shadows ColIter::Next
|
||||
inline bool Next(void) {
|
||||
if (dptr_ == end_) {
|
||||
return false;
|
||||
} else {
|
||||
--dptr_; return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
/*! \brief constructor */
|
||||
explicit FMatrixS(utils::IIterator<SparseBatch> *base_iter)
|
||||
: iter_(base_iter) {}
|
||||
// destructor
|
||||
virtual ~FMatrixS(void) {
|
||||
delete iter_;
|
||||
}
|
||||
/*! \return whether column access is enabled */
|
||||
inline bool HaveColAccess(void) const {
|
||||
return col_ptr_.size() != 0;
|
||||
}
|
||||
/*! \brief get number of colmuns */
|
||||
inline size_t NumCol(void) const {
|
||||
utils::Check(this->HaveColAccess(), "NumCol:need column access");
|
||||
return col_ptr_.size() - 1;
|
||||
}
|
||||
/*! \brief get col sorted iterator */
|
||||
inline ColIter GetSortedCol(size_t cidx) const {
|
||||
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
|
||||
return ColIter(&col_data_[col_ptr_[cidx]] - 1,
|
||||
&col_data_[col_ptr_[cidx + 1]] - 1);
|
||||
}
|
||||
/*!
|
||||
* \brief get reversed col iterator,
|
||||
* this function will be deprecated at some point
|
||||
*/
|
||||
inline ColBackIter GetReverseSortedCol(size_t cidx) const {
|
||||
utils::Assert(cidx < this->NumCol(), "col id exceed bound");
|
||||
return ColBackIter(&col_data_[col_ptr_[cidx + 1]],
|
||||
&col_data_[col_ptr_[cidx]]);
|
||||
}
|
||||
/*! \brief get col size */
|
||||
inline size_t GetColSize(size_t cidx) const {
|
||||
return col_ptr_[cidx+1] - col_ptr_[cidx];
|
||||
}
|
||||
/*! \brief get column density */
|
||||
inline float GetColDensity(size_t cidx) const {
|
||||
size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]);
|
||||
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_;
|
||||
}
|
||||
virtual void InitColAccess(void) {
|
||||
if (this->HaveColAccess()) return;
|
||||
const size_t max_nrow = std::numeric_limits<bst_uint>::max();
|
||||
this->InitColData(max_nrow);
|
||||
}
|
||||
/*! \brief get the row iterator associated with FMatrix */
|
||||
virtual utils::IIterator<SparseBatch>* RowIterator(void) const {
|
||||
return iter_;
|
||||
}
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief intialize column data
|
||||
* \param max_nrow maximum number of rows supported
|
||||
*/
|
||||
inline void InitColData(size_t max_nrow) {
|
||||
// note: this part of code is serial, todo, parallelize this transformer
|
||||
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
|
||||
builder.InitBudget(0);
|
||||
// start working
|
||||
iter_->BeforeFirst();
|
||||
num_buffered_row_ = 0;
|
||||
while (iter_->Next()) {
|
||||
const SparseBatch &batch = iter_->Value();
|
||||
if (batch.base_rowid >= max_nrow) break;
|
||||
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
|
||||
for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) {
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < batch.size; ++j) {
|
||||
builder.AddBudget(inst[j].findex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
builder.InitStorage();
|
||||
|
||||
iter_->BeforeFirst();
|
||||
while (iter_->Next()) {
|
||||
const SparseBatch &batch = iter_->Value();
|
||||
if (batch.base_rowid >= max_nrow) break;
|
||||
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid);
|
||||
for (size_t i = 0; i < nbatch; ++i) {
|
||||
SparseBatch::Inst inst = batch[i];
|
||||
for (bst_uint j = 0; j < batch.size; ++j) {
|
||||
builder.PushElem(inst[j].findex,
|
||||
Entry((bst_uint)(batch.base_rowid+j),
|
||||
inst[j].fvalue));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sort columns
|
||||
unsigned ncol = static_cast<unsigned>(this->NumCol());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned i = 0; i < ncol; ++i) {
|
||||
std::sort(&col_data_[col_ptr_[i]],
|
||||
&col_data_[col_ptr_[i + 1]], Entry::CmpValue);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// --- data structure used to support InitColAccess --
|
||||
utils::IIterator<SparseBatch> *iter_;
|
||||
/*! \brief number */
|
||||
size_t num_buffered_row_;
|
||||
/*! \brief column pointer of CSC format */
|
||||
std::vector<size_t> col_ptr_;
|
||||
/*! \brief column datas in CSC format */
|
||||
std::vector<SparseBatch::Entry> col_data_;
|
||||
};
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
82
src/gbm/gbm.h
Normal file
82
src/gbm/gbm.h
Normal file
@@ -0,0 +1,82 @@
|
||||
#ifndef XGBOOST_GBM_GBM_H_
|
||||
#define XGBOOST_GBM_GBM_H_
|
||||
/*!
|
||||
* \file gbm.h
|
||||
* \brief interface of gradient booster, that learns through gradient statistics
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for gradient booster */
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief interface of gradient boosting model
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class IGradBooster {
|
||||
public:
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam(const char *name, const char *val) = 0;
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
virtual void LoadModel(utils::IStream &fi) = 0;
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
virtual void SaveModel(utils::IStream &fo) const = 0;
|
||||
/*!
|
||||
* \brief initialize the model
|
||||
*/
|
||||
virtual void InitModel(void) = 0;
|
||||
/*!
|
||||
* \brief peform update to the model(boosting)
|
||||
* \param gpair the gradient pair statistics of the data
|
||||
* \param fmat feature matrix that provide access to features
|
||||
* \param root_index pre-partitioned root_index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
*/
|
||||
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index) = 0;
|
||||
/*!
|
||||
* \brief generate predictions for given feature matrix
|
||||
* \param fmat feature matrix
|
||||
* \param buffer_offset buffer index offset of these instances, if equals -1
|
||||
* this means we do not have buffer index allocated to the gbm
|
||||
* a buffer index is assigned to each instance that requires repeative prediction
|
||||
* the size of buffer is set by convention using IGradBooster.SetParam("num_pbuffer","size")
|
||||
* \param root_index pre-partitioned root_index of each instance,
|
||||
* root_index.size() can be 0 which indicates that no pre-partition involved
|
||||
* \param out_preds output vector to hold the predictions
|
||||
*/
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
int64_t buffer_offset,
|
||||
const std::vector<unsigned> &root_index,
|
||||
std::vector<float> *out_preds) = 0;
|
||||
// destrcutor
|
||||
virtual ~IGradBooster(void){}
|
||||
};
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#include "gbtree-inl.hpp"
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
template<typename FMatrix>
|
||||
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
|
||||
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
|
||||
utils::Error("unknown booster type: %s", name);
|
||||
return NULL;
|
||||
}
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_GBM_GBM_H_
|
||||
365
src/gbm/gbtree-inl.hpp
Normal file
365
src/gbm/gbtree-inl.hpp
Normal file
@@ -0,0 +1,365 @@
|
||||
#ifndef XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
#define XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
/*!
|
||||
* \file gbtree-inl.hpp
|
||||
* \brief gradient boosted tree implementation
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "./gbm.h"
|
||||
#include "../tree/updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief gradient boosted tree
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class GBTree : public IGradBooster<FMatrix> {
|
||||
public:
|
||||
virtual ~GBTree(void) {
|
||||
this->Clear();
|
||||
}
|
||||
virtual void SetParam(const char *name, const char *val) {
|
||||
if (!strncmp(name, "bst:", 4)) {
|
||||
cfg.push_back(std::make_pair(std::string(name+4), std::string(val)));
|
||||
// set into updaters, if already intialized
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
updaters[i]->SetParam(name+4, val);
|
||||
}
|
||||
}
|
||||
if (!strcmp(name, "silent")) {
|
||||
this->SetParam("bst:silent", val);
|
||||
}
|
||||
tparam.SetParam(name, val);
|
||||
if (trees.size() == 0) mparam.SetParam(name, val);
|
||||
}
|
||||
virtual void LoadModel(utils::IStream &fi) {
|
||||
this->Clear();
|
||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
trees.resize(mparam.num_trees);
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i] = new tree::RegTree();
|
||||
trees[i]->LoadModel(fi);
|
||||
}
|
||||
tree_info.resize(mparam.num_trees);
|
||||
if (mparam.num_trees != 0) {
|
||||
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
|
||||
"GBTree: invalid model file");
|
||||
}
|
||||
if (mparam.num_pbuffer != 0) {
|
||||
pred_buffer.resize(mparam.PredBufferSize());
|
||||
pred_counter.resize(mparam.PredBufferSize());
|
||||
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
|
||||
"GBTree: invalid model file");
|
||||
}
|
||||
}
|
||||
virtual void SaveModel(utils::IStream &fo) const {
|
||||
utils::Assert(mparam.num_trees == static_cast<int>(trees.size()), "GBTree");
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
trees[i]->SaveModel(fo);
|
||||
}
|
||||
if (tree_info.size() != 0) {
|
||||
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
|
||||
}
|
||||
if (mparam.num_pbuffer != 0) {
|
||||
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
|
||||
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
|
||||
}
|
||||
}
|
||||
// initialize the predic buffer
|
||||
virtual void InitModel(void) {
|
||||
pred_buffer.clear(); pred_counter.clear();
|
||||
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
|
||||
pred_counter.resize(mparam.PredBufferSize(), 0);
|
||||
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
|
||||
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
|
||||
}
|
||||
virtual void DoBoost(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index) {
|
||||
if (mparam.num_output_group == 1) {
|
||||
this->BoostNewTrees(gpair, fmat, root_index, 0);
|
||||
} else {
|
||||
const int ngroup = mparam.num_output_group;
|
||||
utils::Check(gpair.size() % ngroup == 0,
|
||||
"must have exactly ngroup*nrow gpairs");
|
||||
std::vector<bst_gpair> tmp(gpair.size()/ngroup);
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (size_t i = 0; i < tmp.size(); ++i) {
|
||||
tmp[i] = gpair[i * ngroup + gid];
|
||||
}
|
||||
this->BoostNewTrees(tmp, fmat, root_index, gid);
|
||||
}
|
||||
}
|
||||
}
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
int64_t buffer_offset,
|
||||
const std::vector<unsigned> &root_index,
|
||||
std::vector<float> *out_preds) {
|
||||
int nthread;
|
||||
#pragma omp parallel
|
||||
{
|
||||
nthread = omp_get_num_threads();
|
||||
}
|
||||
this->InitThreadTemp(nthread);
|
||||
std::vector<float> &preds = *out_preds;
|
||||
preds.resize(0);
|
||||
// start collecting the prediction
|
||||
utils::IIterator<SparseBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const SparseBatch &batch = iter->Value();
|
||||
utils::Assert(batch.base_rowid * mparam.num_output_group == preds.size(),
|
||||
"base_rowid is not set correctly");
|
||||
// output convention: nrow * k, where nrow is number of rows
|
||||
// k is number of group
|
||||
preds.resize(preds.size() + batch.size * mparam.num_output_group);
|
||||
// parallel over local batch
|
||||
const unsigned nsize = static_cast<unsigned>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (unsigned i = 0; i < nsize; ++i) {
|
||||
const int tid = omp_get_thread_num();
|
||||
std::vector<float> &feats = thread_temp[tid];
|
||||
const size_t ridx = batch.base_rowid + i;
|
||||
const unsigned root_idx = root_index.size() == 0 ? 0 : root_index[ridx];
|
||||
// loop over output groups
|
||||
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
|
||||
preds[ridx * mparam.num_output_group + gid] =
|
||||
this->Pred(batch[i],
|
||||
buffer_offset < 0 ? -1 : buffer_offset+ridx,
|
||||
gid, root_idx, &feats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// clear the model
|
||||
inline void Clear(void) {
|
||||
for (size_t i = 0; i < trees.size(); ++i) {
|
||||
delete trees[i];
|
||||
}
|
||||
trees.clear();
|
||||
pred_buffer.clear();
|
||||
pred_counter.clear();
|
||||
}
|
||||
// initialize updater before using them
|
||||
inline void InitUpdater(void) {
|
||||
if (tparam.updater_initialized != 0) return;
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
delete updaters[i];
|
||||
}
|
||||
updaters.clear();
|
||||
std::string tval = tparam.updater_seq;
|
||||
char *saveptr, *pstr;
|
||||
pstr = strtok_r(&tval[0], ",", &saveptr);
|
||||
while (pstr != NULL) {
|
||||
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
|
||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||
// set parameters
|
||||
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
|
||||
}
|
||||
pstr = strtok_r(NULL, ",", &saveptr);
|
||||
}
|
||||
tparam.updater_initialized = 1;
|
||||
}
|
||||
// do group specific group
|
||||
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
||||
FMatrix &fmat,
|
||||
const std::vector<unsigned> &root_index,
|
||||
int bst_group) {
|
||||
this->InitUpdater();
|
||||
// create the trees
|
||||
std::vector<tree::RegTree *> new_trees;
|
||||
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
||||
new_trees.push_back(new tree::RegTree());
|
||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||
new_trees.back()->param.SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
|
||||
}
|
||||
new_trees.back()->InitModel();
|
||||
}
|
||||
// update the trees
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
updaters[i]->Update(gpair, fmat, root_index, new_trees);
|
||||
}
|
||||
// push back to model
|
||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||
trees.push_back(new_trees[i]);
|
||||
tree_info.push_back(bst_group);
|
||||
}
|
||||
mparam.num_trees += tparam.num_parallel_tree;
|
||||
}
|
||||
// make a prediction for a single instance
|
||||
inline float Pred(const SparseBatch::Inst &inst,
|
||||
int64_t buffer_index,
|
||||
int bst_group,
|
||||
unsigned root_index,
|
||||
std::vector<float> *p_feats) {
|
||||
size_t itop = 0;
|
||||
float psum = 0.0f;
|
||||
const int bid = mparam.BufferOffset(buffer_index, bst_group);
|
||||
// load buffered results if any
|
||||
if (bid >= 0) {
|
||||
itop = pred_counter[bid];
|
||||
psum = pred_buffer[bid];
|
||||
}
|
||||
if (itop != trees.size()) {
|
||||
FillThreadTemp(inst, p_feats);
|
||||
for (size_t i = itop; i < trees.size(); ++i) {
|
||||
if (tree_info[i] == bst_group) {
|
||||
psum += trees[i]->Predict(*p_feats, root_index);
|
||||
}
|
||||
}
|
||||
DropThreadTemp(inst, p_feats);
|
||||
}
|
||||
// updated the buffered results
|
||||
if (bid >= 0) {
|
||||
pred_counter[bid] = static_cast<unsigned>(trees.size());
|
||||
pred_buffer[bid] = psum;
|
||||
}
|
||||
return psum;
|
||||
}
|
||||
// initialize thread local space for prediction
|
||||
inline void InitThreadTemp(int nthread) {
|
||||
thread_temp.resize(nthread);
|
||||
for (size_t i = 0; i < thread_temp.size(); ++i) {
|
||||
thread_temp[i].resize(mparam.num_feature);
|
||||
std::fill(thread_temp[i].begin(), thread_temp[i].end(), NAN);
|
||||
}
|
||||
}
|
||||
// fill in a thread local dense vector using a sparse instance
|
||||
inline static void FillThreadTemp(const SparseBatch::Inst &inst,
|
||||
std::vector<float> *p_feats) {
|
||||
std::vector<float> &feats = *p_feats;
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
feats[inst[i].findex] = inst[i].fvalue;
|
||||
}
|
||||
}
|
||||
// clear up a thread local dense vector
|
||||
inline static void DropThreadTemp(const SparseBatch::Inst &inst,
|
||||
std::vector<float> *p_feats) {
|
||||
std::vector<float> &feats = *p_feats;
|
||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||
feats[inst[i].findex] = NAN;
|
||||
}
|
||||
}
|
||||
// --- data structure ---
|
||||
/*! \brief training parameters */
|
||||
struct TrainParam {
|
||||
/*! \brief number of threads */
|
||||
int nthread;
|
||||
/*!
|
||||
* \brief number of parallel trees constructed each iteration
|
||||
* use this option to support boosted random forest
|
||||
*/
|
||||
int num_parallel_tree;
|
||||
/*! \brief whether updater is already initialized */
|
||||
int updater_initialized;
|
||||
/*! \brief tree updater sequence */
|
||||
std::string updater_seq;
|
||||
// construction
|
||||
TrainParam(void) {
|
||||
nthread = 0;
|
||||
updater_seq = "grow_colmaker,prune";
|
||||
num_parallel_tree = 1;
|
||||
updater_initialized = 0;
|
||||
}
|
||||
inline void SetParam(const char *name, const char *val){
|
||||
if (!strcmp(name, "updater") &&
|
||||
strcmp(updater_seq.c_str(), val) != 0) {
|
||||
updater_seq = val;
|
||||
updater_initialized = 0;
|
||||
}
|
||||
if (!strcmp(name, "nthread")) {
|
||||
omp_set_num_threads(nthread);
|
||||
nthread = atoi(val);
|
||||
}
|
||||
if (!strcmp(name, "num_parallel_tree")) {
|
||||
num_parallel_tree = atoi(val);
|
||||
}
|
||||
}
|
||||
};
|
||||
/*! \brief model parameters */
|
||||
struct ModelParam {
|
||||
/*! \brief number of trees */
|
||||
int num_trees;
|
||||
/*! \brief number of root: default 0, means single tree */
|
||||
int num_roots;
|
||||
/*! \brief number of features to be used by trees */
|
||||
int num_feature;
|
||||
/*! \brief size of predicton buffer allocated used for buffering */
|
||||
int64_t num_pbuffer;
|
||||
/*!
|
||||
* \brief how many output group a single instance can produce
|
||||
* this affects the behavior of number of output we have:
|
||||
* suppose we have n instance and k group, output will be k*n
|
||||
*/
|
||||
int num_output_group;
|
||||
/*! \brief reserved parameters */
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
num_trees = 0;
|
||||
num_roots = num_feature = 0;
|
||||
num_pbuffer = 0;
|
||||
num_output_group = 1;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("num_pbuffer", name)) num_pbuffer = atol(val);
|
||||
if (!strcmp("num_output_group", name)) num_output_group = atol(val);
|
||||
if (!strcmp("bst:num_roots", name)) num_roots = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
/*! \return size of prediction buffer actually needed */
|
||||
inline size_t PredBufferSize(void) const {
|
||||
return num_output_group * num_pbuffer;
|
||||
}
|
||||
/*!
|
||||
* \brief get the buffer offset given a buffer index and group id
|
||||
* \return calculated buffer offset
|
||||
*/
|
||||
inline size_t BufferOffset(int64_t buffer_index, int bst_group) const {
|
||||
if (buffer_index < 0) return -1;
|
||||
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
|
||||
return buffer_index + num_pbuffer * bst_group;
|
||||
}
|
||||
};
|
||||
// training parameter
|
||||
TrainParam tparam;
|
||||
// model parameter
|
||||
ModelParam mparam;
|
||||
/*! \brief vector of trees stored in the model */
|
||||
std::vector<tree::RegTree*> trees;
|
||||
/*! \brief some information indicator of the tree, reserved */
|
||||
std::vector<int> tree_info;
|
||||
/*! \brief prediction buffer */
|
||||
std::vector<float> pred_buffer;
|
||||
/*! \brief prediction buffer counter, remember the prediction */
|
||||
std::vector<unsigned> pred_counter;
|
||||
// ----training fields----
|
||||
// configurations for tree
|
||||
std::vector< std::pair<std::string, std::string> > cfg;
|
||||
// temporal storage for per thread
|
||||
std::vector< std::vector<float> > thread_temp;
|
||||
// the updaters that can be applied to each of tree
|
||||
std::vector< tree::IUpdater<FMatrix>* > updaters;
|
||||
};
|
||||
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_GBM_GBTREE_INL_HPP_
|
||||
84
src/learner/dmatrix.h
Normal file
84
src/learner/dmatrix.h
Normal file
@@ -0,0 +1,84 @@
|
||||
#ifndef XGBOOST_LEARNER_DMATRIX_H_
|
||||
#define XGBOOST_LEARNER_DMATRIX_H_
|
||||
/*!
|
||||
* \file dmatrix.h
|
||||
* \brief meta data and template data structure
|
||||
* used for regression/classification/ranking
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief meta information needed in training, including label, weight
|
||||
*/
|
||||
struct MetaInfo {
|
||||
/*! \brief label of each instance */
|
||||
std::vector<float> labels;
|
||||
/*!
|
||||
* \brief the index of begin and end of a group
|
||||
* needed when the learning task is ranking
|
||||
*/
|
||||
std::vector<bst_uint> group_ptr;
|
||||
/*! \brief weights of each instance, optional */
|
||||
std::vector<float> weights;
|
||||
/*!
|
||||
* \brief specified root index of each instance,
|
||||
* can be used for multi task setting
|
||||
*/
|
||||
std::vector<unsigned> root_index;
|
||||
/*! \brief get weight of each instances */
|
||||
inline float GetWeight(size_t i) const {
|
||||
if(weights.size() != 0) {
|
||||
return weights[i];
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
/*! \brief get root index of i-th instance */
|
||||
inline float GetRoot(size_t i) const {
|
||||
if(root_index.size() != 0) {
|
||||
return static_cast<float>(root_index[i]);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
inline void SaveBinary(utils::IStream &fo) {
|
||||
fo.Write(labels);
|
||||
fo.Write(group_ptr);
|
||||
fo.Write(weights);
|
||||
fo.Write(root_index);
|
||||
}
|
||||
inline void LoadBinary(utils::IStream &fi) {
|
||||
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
|
||||
utils::Check(fi.Read(&root_index), "MetaInfo: invalid format");
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief data object used for learning,
|
||||
* \tparam FMatrix type of feature data source
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
struct DMatrix {
|
||||
/*! \brief meta information about the dataset */
|
||||
MetaInfo info;
|
||||
/*! \brief number of rows in the DMatrix */
|
||||
size_t num_row;
|
||||
/*! \brief feature matrix about data content */
|
||||
FMatrix fmat;
|
||||
/*!
|
||||
* \brief cache pointer to verify if the data structure is cached in some learner
|
||||
* used to verify if DMatrix is cached
|
||||
*/
|
||||
void *cache_learner_ptr_;
|
||||
/*! \brief default constructor */
|
||||
DMatrix(void) : cache_learner_ptr_(NULL) {}
|
||||
};
|
||||
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_DMATRIX_H_
|
||||
346
src/learner/evaluation-inl.hpp
Normal file
346
src/learner/evaluation-inl.hpp
Normal file
@@ -0,0 +1,346 @@
|
||||
#ifndef XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
#define XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
/*!
|
||||
* \file xgboost_evaluation-inl.hpp
|
||||
* \brief evaluation metrics for regression and classification and rank
|
||||
* \author Kailong Chen, Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include <climits>
|
||||
#include <algorithm>
|
||||
#include "./evaluation.h"
|
||||
#include "./helper_utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief base class of elementwise evaluation
|
||||
* \tparam Derived the name of subclass
|
||||
*/
|
||||
template<typename Derived>
struct EvalEWiseBase : public IEvaluator {
  // Weighted aggregation of a per-instance metric over the whole dataset
  // (CRTP: the Derived class supplies the static EvalRow/GetFinal hooks).
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(),
                 "label and prediction size not match");
    const unsigned ndata = static_cast<unsigned>(preds.size());
    float sum = 0.0, wsum = 0.0;
    // sum and wsum are combined across threads by the OpenMP reduction
    #pragma omp parallel for reduction(+:sum, wsum) schedule(static)
    for (unsigned i = 0; i < ndata; ++i) {
      const float wt = info.GetWeight(i);
      sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
      wsum += wt;
    }
    return Derived::GetFinal(sum, wsum);
  }
  /*!
   * \brief to be implemented by subclass,
   * get evaluation result from one row
   * \param label label of current instance
   * \param pred prediction value of current instance
   * \return the (unweighted) per-row statistic accumulated into esum
   */
  inline static float EvalRow(float label, float pred);
  /*!
   * \brief to be overridden by subclass, final transformation
   * (default: weighted average of the per-row statistics)
   * \param esum the sum statistics returned by EvalRow
   * \param wsum sum of weight
   */
  inline static float GetFinal(float esum, float wsum) {
    return esum / wsum;
  }
};
|
||||
|
||||
/*! \brief RMSE */
|
||||
struct EvalRMSE : public EvalEWiseBase<EvalRMSE> {
  virtual const char *Name(void) const {
    return "rmse";
  }
  // per-instance contribution: squared residual
  inline static float EvalRow(float label, float pred) {
    const float residual = pred - label;
    return residual * residual;
  }
  // final transform: square root of the weighted mean squared residual
  inline static float GetFinal(float esum, float wsum) {
    return std::sqrt(esum / wsum);
  }
};
|
||||
|
||||
/*! \brief logloss */
|
||||
struct EvalLogLoss : public EvalEWiseBase<EvalLogLoss> {
  virtual const char *Name(void) const {
    return "logloss";
  }
  // negative log-likelihood of a Bernoulli label y given probability py
  inline static float EvalRow(float y, float py) {
    const float pos_term = y * std::log(py);
    const float neg_term = (1.0f - y) * std::log(1.0f - py);
    return -(pos_term + neg_term);
  }
};
|
||||
|
||||
/*! \brief error */
|
||||
struct EvalError : public EvalEWiseBase<EvalError> {
  virtual const char *Name(void) const {
    return "error";
  }
  // binary classification error; label assumed in [0,1],
  // predictions above 0.5 count as positive
  inline static float EvalRow(float label, float pred) {
    if (pred > 0.5f) {
      return 1.0f - label;
    } else {
      return label;
    }
  }
};
|
||||
|
||||
/*! \brief match error */
|
||||
struct EvalMatchError : public EvalEWiseBase<EvalMatchError> {
  virtual const char *Name(void) const {
    return "merror";
  }
  // multiclass error: compare the integer parts of prediction and label
  inline static float EvalRow(float label, float pred) {
    const int pred_class = static_cast<int>(pred);
    const int label_class = static_cast<int>(label);
    return pred_class == label_class ? 0.0f : 1.0f;
  }
};
|
||||
|
||||
/*! \brief AMS: also records best threshold */
|
||||
struct EvalAMS : public IEvaluator {
 public:
  /*!
   * \brief construct from a metric name of the form "ams@<ratio>",
   * the ratio selects which fraction of top predictions to scan
   */
  explicit EvalAMS(const char *name) {
    name_ = name;
    // note: ams@0 will automatically select which ratio to go
    utils::Check(sscanf(name, "ams@%f", &ratio_) == 1, "invalid ams format");
  }
  // Computes AMS (approximate median significance); when scanning the whole
  // dataset it also reports the best threshold ratio found to stderr.
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    const unsigned ndata = static_cast<unsigned>(preds.size());
    utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
    std::vector< std::pair<float, unsigned> > rec(ndata);

    #pragma omp parallel for schedule(static)
    for (unsigned i = 0; i < ndata; ++i) {
      rec[i] = std::make_pair(preds[i], i);
    }
    // scan candidates from highest to lowest prediction score
    std::sort(rec.begin(), rec.end(), CmpFirst);
    unsigned ntop = static_cast<unsigned>(ratio_ * ndata);
    if (ntop == 0) ntop = ndata;
    // br: regularization term added to the background estimate
    const double br = 10.0;
    unsigned thresindex = 0;
    // s_tp: weighted true positives; b_fp: weighted false positives
    double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
    // NOTE(review): the ndata-1 bound keeps rec[i+1] in range below, so the
    // last instance is never counted; ndata-1 also underflows if ndata == 0
    for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
      const unsigned ridx = rec[i].second;
      const float wt = info.weights[ridx];
      if (info.labels[ridx] > 0.5f) {
        s_tp += wt;
      } else {
        b_fp += wt;
      }
      // only evaluate AMS at boundaries between distinct prediction values
      if (rec[i].first != rec[i+1].first) {
        double ams = sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
        if (tams < ams) {
          thresindex = i;
          tams = ams;
        }
      }
    }
    if (ntop == ndata) {
      // full scan: report where the best threshold sits and the best AMS
      fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
      return tams;
    } else {
      // fixed cutoff: AMS of taking exactly the top ntop predictions
      return sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
    }
  }
  virtual const char *Name(void) const {
    return name_.c_str();
  }

 private:
  // full metric name, e.g. "ams@0.15"
  std::string name_;
  // fraction of instances treated as selected positives
  float ratio_;
};
|
||||
|
||||
/*! \brief Area under curve, for both classification and rank */
|
||||
struct EvalAuc : public IEvaluator {
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
    // when no group information is given, treat all the data as one group
    std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
    const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
    utils::Check(gptr.back() == preds.size(),
                 "EvalAuc: group structure must match number of prediction");
    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
    // sum statistics
    double sum_auc = 0.0f;
    #pragma omp parallel reduction(+:sum_auc)
    {
      // each thread takes a local rec
      std::vector< std::pair<float, unsigned> > rec;
      #pragma omp for schedule(static)
      for (unsigned k = 0; k < ngroup; ++k) {
        rec.clear();
        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
          rec.push_back(std::make_pair(preds[j], j));
        }
        // descending by prediction score
        std::sort(rec.begin(), rec.end(), CmpFirst);
        // calculate AUC as the weighted fraction of correctly ordered
        // pos/neg pairs; fractional labels act as per-instance positive rates
        double sum_pospair = 0.0;
        double sum_npos = 0.0, sum_nneg = 0.0, buf_pos = 0.0, buf_neg = 0.0;
        for (size_t j = 0; j < rec.size(); ++j) {
          const float wt = info.GetWeight(rec[j].second);
          const float ctr = info.labels[rec[j].second];
          // keep bucketing predictions in same bucket
          if (j != 0 && rec[j].first != rec[j - 1].first) {
            // tied predictions within a bucket contribute half credit
            sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
            sum_npos += buf_pos; sum_nneg += buf_neg;
            buf_neg = buf_pos = 0.0f;
          }
          buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt;
        }
        // flush the final bucket
        sum_pospair += buf_neg * (sum_npos + buf_pos *0.5);
        sum_npos += buf_pos; sum_nneg += buf_neg;
        // check weird conditions
        utils::Check(sum_npos > 0.0 && sum_nneg > 0.0,
                     "AUC: the dataset only contains pos or neg samples");
        // this is the AUC
        sum_auc += sum_pospair / (sum_npos*sum_nneg);
      }
    }
    // return average AUC over the groups
    return static_cast<float>(sum_auc) / ngroup;
  }
  virtual const char *Name(void) const {
    return "auc";
  }
};
|
||||
|
||||
/*! \brief Evaluate rank list */
|
||||
struct EvalRankList : public IEvaluator {
 public:
  // Averages a subclass-defined per-group metric over all query groups.
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const {
    utils::Check(preds.size() == info.labels.size(),
                 "label size predict size not match");
    const std::vector<unsigned> &gptr = info.group_ptr;
    utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
    utils::Assert(gptr.back() == preds.size(),
                  "EvalRanklist: group structure must match number of prediction");
    const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
    // sum statistics
    double sum_metric = 0.0f;
    #pragma omp parallel reduction(+:sum_metric)
    {
      // each thread takes a local rec
      std::vector< std::pair<float, unsigned> > rec;
      #pragma omp for schedule(static)
      for (unsigned k = 0; k < ngroup; ++k) {
        rec.clear();
        // collect (prediction, integer relevance label) for this group
        for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
          rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
        }
        sum_metric += this->EvalMetric(rec);
      }
    }
    return static_cast<float>(sum_metric) / ngroup;
  }
  virtual const char *Name(void) const {
    return name_.c_str();
  }

 protected:
  /*!
   * \brief construct from a name of the form "<metric>[@topn][-]",
   * e.g. "ndcg@10-"; "@topn" truncates the ranked list, and a trailing
   * '-' is recorded in minus_ (used by subclasses for degenerate lists)
   */
  explicit EvalRankList(const char *name) {
    name_ = name;
    minus_ = false;
    // NOTE(review): "[-]?" is not a valid scanf conversion sequence, but
    // sscanf returns once %u has matched, so topn is still parsed correctly
    if (sscanf(name, "%*[^@]@%u[-]?", &topn_) != 1) {
      topn_ = UINT_MAX;
    }
    if (name[strlen(name) - 1] == '-') {
      minus_ = true;
    }
  }
  /*! \return evaluation metric, given the pair_sort record, (pred,label) */
  virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &pair_sort) const = 0;

 protected:
  // cutoff position; UINT_MAX when no "@topn" suffix was given
  unsigned topn_;
  // full metric name as configured
  std::string name_;
  // whether the configured name carried a trailing '-'
  bool minus_;
};
|
||||
|
||||
/*! \brief Precison at N, for both classification and rank */
|
||||
struct EvalPrecision : public EvalRankList{
|
||||
public:
|
||||
explicit EvalPrecision(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
// calculate Preicsion
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
unsigned nhit = 0;
|
||||
for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) {
|
||||
nhit += (rec[j].second != 0);
|
||||
}
|
||||
return static_cast<float>(nhit) / topn_;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief NDCG */
|
||||
struct EvalNDCG : public EvalRankList{
|
||||
public:
|
||||
explicit EvalNDCG(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
inline float CalcDCG(const std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
double sumdcg = 0.0;
|
||||
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
|
||||
const unsigned rel = rec[i].second;
|
||||
if (rel != 0) {
|
||||
sumdcg += ((1 << rel) - 1) / logf(i + 2);
|
||||
}
|
||||
}
|
||||
return static_cast<float>(sumdcg);
|
||||
}
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
std::stable_sort(rec.begin(), rec.end(), CmpFirst);
|
||||
float dcg = this->CalcDCG(rec);
|
||||
std::stable_sort(rec.begin(), rec.end(), CmpSecond);
|
||||
float idcg = this->CalcDCG(rec);
|
||||
if (idcg == 0.0f) {
|
||||
if (minus_) {
|
||||
return 0.0f;
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
return dcg/idcg;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief Precison at N, for both classification and rank */
|
||||
struct EvalMAP : public EvalRankList {
|
||||
public:
|
||||
explicit EvalMAP(const char *name) : EvalRankList(name) {}
|
||||
|
||||
protected:
|
||||
virtual float EvalMetric(std::vector< std::pair<float, unsigned> > &rec) const {
|
||||
std::sort(rec.begin(), rec.end(), CmpFirst);
|
||||
unsigned nhits = 0;
|
||||
double sumap = 0.0;
|
||||
for (size_t i = 0; i < rec.size(); ++i) {
|
||||
if (rec[i].second != 0) {
|
||||
nhits += 1;
|
||||
if (i < this->topn_) {
|
||||
sumap += static_cast<float>(nhits) / (i+1);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (nhits != 0) {
|
||||
sumap /= nhits;
|
||||
return static_cast<float>(sumap);
|
||||
} else {
|
||||
if (minus_) {
|
||||
return 0.0f;
|
||||
} else {
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_EVALUATION_INL_HPP_
|
||||
82
src/learner/evaluation.h
Normal file
82
src/learner/evaluation.h
Normal file
@@ -0,0 +1,82 @@
|
||||
#ifndef XGBOOST_LEARNER_EVALUATION_H_
|
||||
#define XGBOOST_LEARNER_EVALUATION_H_
|
||||
/*!
|
||||
* \file evaluation.h
|
||||
* \brief interface of evaluation function supported in xgboost
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "../utils/utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief evaluator that evaluates the loss metrics */
|
||||
struct IEvaluator{
  /*!
   * \brief evaluate a specific metric
   * \param preds prediction vector, one entry per instance
   * \param info information, including label etc.
   * \return the metric value over the whole dataset
   */
  virtual float Eval(const std::vector<float> &preds,
                     const MetaInfo &info) const = 0;
  /*! \return name of metric */
  virtual const char *Name(void) const = 0;
  /*! \brief virtual destructor: evaluators are deleted through base pointers */
  virtual ~IEvaluator(void) {}
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
|
||||
// include implementations of evaluation functions
|
||||
#include "evaluation-inl.hpp"
|
||||
// factory function
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*!
 * \brief factory: create an evaluator from a metric name;
 * the caller owns the returned object
 */
inline IEvaluator* CreateEvaluator(const char *name) {
  // metrics identified by their exact name
  if (strcmp(name, "rmse") == 0) return new EvalRMSE();
  if (strcmp(name, "error") == 0) return new EvalError();
  if (strcmp(name, "merror") == 0) return new EvalMatchError();
  if (strcmp(name, "logloss") == 0) return new EvalLogLoss();
  if (strcmp(name, "auc") == 0) return new EvalAuc();
  // metrics identified by prefix; the full name carries their parameters
  if (strncmp(name, "ams@", 4) == 0) return new EvalAMS(name);
  if (strncmp(name, "pre@", 4) == 0) return new EvalPrecision(name);
  if (strncmp(name, "map", 3) == 0) return new EvalMAP(name);
  if (strncmp(name, "ndcg", 3) == 0) return new EvalNDCG(name);
  utils::Error("unknown evaluation metric type: %s", name);
  return NULL;
}
|
||||
|
||||
/*! \brief a set of evaluators */
|
||||
class EvalSet{
|
||||
public:
|
||||
inline void AddEval(const char *name) {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
if (!strcmp(name, evals_[i]->Name())) return;
|
||||
}
|
||||
evals_.push_back(CreateEvaluator(name));
|
||||
}
|
||||
~EvalSet(void) {
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
delete evals_[i];
|
||||
}
|
||||
}
|
||||
inline std::string Eval(const char *evname,
|
||||
const std::vector<float> &preds,
|
||||
const MetaInfo &info) const {
|
||||
std::string result = "";
|
||||
for (size_t i = 0; i < evals_.size(); ++i) {
|
||||
float res = evals_[i]->Eval(preds, info);
|
||||
char tmp[1024];
|
||||
snprintf(tmp, sizeof(tmp), "\t%s-%s:%f", evname, evals_[i]->Name(), res);
|
||||
result += tmp;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<const IEvaluator*> evals_;
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_EVALUATION_H_
|
||||
50
src/learner/helper_utils.h
Normal file
50
src/learner/helper_utils.h
Normal file
@@ -0,0 +1,50 @@
|
||||
#ifndef XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
#define XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
/*!
|
||||
* \file helper_utils.h
|
||||
* \brief useful helper functions
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
// simple helper function to do softmax
|
||||
// Numerically stable in-place softmax: subtract the maximum before
// exponentiating so exp never overflows, then normalize to sum 1.
// \param p_rec vector transformed in place; an empty vector is a no-op
inline static void Softmax(std::vector<float>* p_rec) {
  std::vector<float> &rec = *p_rec;
  // BUGFIX: rec[0] below was undefined behavior for an empty input
  if (rec.empty()) return;
  float wmax = rec[0];
  for (size_t i = 1; i < rec.size(); ++i) {
    wmax = std::max(rec[i], wmax);
  }
  // accumulate in double to limit rounding error over long vectors
  double wsum = 0.0;
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] = std::exp(rec[i] - wmax);
    wsum += rec[i];
  }
  for (size_t i = 0; i < rec.size(); ++i) {
    rec[i] /= static_cast<float>(wsum);
  }
}
|
||||
// simple helper function to do softmax
|
||||
inline static int FindMaxIndex(const std::vector<float>& rec) {
|
||||
size_t mxid = 0;
|
||||
for (size_t i = 1; i < rec.size(); ++i) {
|
||||
if (rec[i] > rec[mxid] + 1e-6f) {
|
||||
mxid = i;
|
||||
}
|
||||
}
|
||||
return static_cast<int>(mxid);
|
||||
}
|
||||
|
||||
// Sort predicate: descending order on the first (score) component.
inline static bool CmpFirst(const std::pair<float, unsigned> &a,
                            const std::pair<float, unsigned> &b) {
  return b.first < a.first;
}
|
||||
// Sort predicate: descending order on the second (label/index) component.
inline static bool CmpSecond(const std::pair<float, unsigned> &a,
                             const std::pair<float, unsigned> &b) {
  return b.second < a.second;
}
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_HELPER_UTILS_H_
|
||||
296
src/learner/learner-inl.hpp
Normal file
296
src/learner/learner-inl.hpp
Normal file
@@ -0,0 +1,296 @@
|
||||
#ifndef XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
#define XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
/*!
|
||||
* \file learner-inl.hpp
|
||||
* \brief learning algorithm
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "./objective.h"
|
||||
#include "./evaluation.h"
|
||||
#include "../gbm/gbm.h"
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for learning algorithm */
|
||||
namespace learner {
|
||||
/*!
|
||||
* \brief learner that takes do gradient boosting on specific objective functions
|
||||
* and do training and prediction
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class BoostLearner {
|
||||
public:
|
||||
BoostLearner(void) {
|
||||
obj_ = NULL;
|
||||
gbm_ = NULL;
|
||||
name_obj_ = "reg:linear";
|
||||
name_gbm_ = "gbtree";
|
||||
}
|
||||
~BoostLearner(void) {
|
||||
if (obj_ != NULL) delete obj_;
|
||||
if (gbm_ != NULL) delete gbm_;
|
||||
}
|
||||
/*!
|
||||
* \brief add internal cache space for mat, this can speedup prediction for matrix,
|
||||
* please cache prediction for training and eval data
|
||||
* warning: if the model is loaded from file from some previous training history
|
||||
* set cache data must be called with exactly SAME
|
||||
* data matrices to continue training otherwise it will cause error
|
||||
* \param mats array of pointers to matrix whose prediction result need to be cached
|
||||
*/
|
||||
inline void SetCacheData(const std::vector<DMatrix<FMatrix>*>& mats) {
|
||||
// estimate feature bound
|
||||
unsigned num_feature = 0;
|
||||
// assign buffer index
|
||||
size_t buffer_size = 0;
|
||||
utils::Assert(cache_.size() == 0, "can only call cache data once");
|
||||
for (size_t i = 0; i < mats.size(); ++i) {
|
||||
bool dupilicate = false;
|
||||
for (size_t j = 0; j < i; ++j) {
|
||||
if (mats[i] == mats[j]) dupilicate = true;
|
||||
}
|
||||
if (dupilicate) continue;
|
||||
// set mats[i]'s cache learner pointer to this
|
||||
mats[i]->cache_learner_ptr_ = this;
|
||||
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->num_row));
|
||||
buffer_size += mats[i]->num_row;
|
||||
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->num_col));
|
||||
}
|
||||
char str_temp[25];
|
||||
if (num_feature > mparam.num_feature) {
|
||||
snprintf(str_temp, sizeof(str_temp), "%u", num_feature);
|
||||
this->SetParam("bst:num_feature", str_temp);
|
||||
}
|
||||
snprintf(str_temp, sizeof(str_temp), "%lu", buffer_size);
|
||||
this->SetParam("num_pbuffer", str_temp);
|
||||
if (!silent) {
|
||||
printf("buffer_size=%ld\n", buffer_size);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp(name, "silent")) silent = atoi(val);
|
||||
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
|
||||
if (gbm_ == NULL) {
|
||||
if (!strcmp(name, "objective")) name_obj_ = val;
|
||||
if (!strcmp(name, "booster")) name_gbm_ = val;
|
||||
mparam.SetParam(name, val);
|
||||
}
|
||||
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
|
||||
}
|
||||
/*!
|
||||
* \brief initialize the model
|
||||
*/
|
||||
inline void InitModel(void) {
|
||||
this->InitObjGBM();
|
||||
// adapt the base score
|
||||
mparam.base_score = obj_->ProbToMargin(mparam.base_score);
|
||||
gbm_->InitModel();
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi) {
|
||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||
"BoostLearner: wrong model format");
|
||||
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
|
||||
utils::Check(fi.Read(&name_gbm_), "BoostLearner: wrong model format");
|
||||
// delete existing gbm if any
|
||||
if (obj_ != NULL) delete obj_;
|
||||
if (gbm_ != NULL) delete gbm_;
|
||||
this->InitObjGBM();
|
||||
gbm_->LoadModel(fi);
|
||||
}
|
||||
/*!
|
||||
* \brief load model from file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void LoadModel(const char *fname) {
|
||||
utils::FileStream fi(utils::FopenCheck(fname, "rb"));
|
||||
this->LoadModel(fi);
|
||||
fi.Close();
|
||||
}
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
fo.Write(&mparam, sizeof(ModelParam));
|
||||
fo.Write(&name_obj_);
|
||||
fo.Write(&name_gbm_);
|
||||
gbm_->SaveModel(fo);
|
||||
}
|
||||
/*!
|
||||
* \brief save model into file
|
||||
* \param fname file name
|
||||
*/
|
||||
inline void SaveModel(const char *fname) const {
|
||||
utils::FileStream fo(utils::FopenCheck(fname, "wb"));
|
||||
this->SaveModel(fo);
|
||||
fo.Close();
|
||||
}
|
||||
/*!
|
||||
* \brief update the model for one iteration
|
||||
* \param iter current iteration number
|
||||
* \param p_train pointer to the data matrix
|
||||
*/
|
||||
inline void UpdateOneIter(int iter, DMatrix<FMatrix> *p_train) {
|
||||
this->PredictRaw(preds_, *p_train);
|
||||
obj_->GetGradient(preds_, p_train->info, iter, &gpair_);
|
||||
gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index);
|
||||
}
|
||||
/*!
|
||||
* \brief evaluate the model for specific iteration
|
||||
* \param iter iteration number
|
||||
* \param evals datas i want to evaluate
|
||||
* \param evname name of each dataset
|
||||
* \return a string corresponding to the evaluation result
|
||||
*/
|
||||
inline std::string EvalOneIter(int iter,
|
||||
const std::vector<const DMatrix<FMatrix>*> &evals,
|
||||
const std::vector<std::string> &evname) {
|
||||
std::string res;
|
||||
char tmp[256];
|
||||
snprintf(tmp, sizeof(tmp), "[%d]", iter);
|
||||
res = tmp;
|
||||
for (size_t i = 0; i < evals.size(); ++i) {
|
||||
this->PredictRaw(*evals[i], &preds_);
|
||||
obj_->EvalTransform(&preds_);
|
||||
res += evaluator_.Eval(evname[i].c_str(), preds_, evals[i]->info);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
/*!
|
||||
* \brief simple evaluation function, with a specified metric
|
||||
* \param data input data
|
||||
* \param metric name of metric
|
||||
* \return a pair of <evaluation name, result>
|
||||
*/
|
||||
std::pair<std::string, float> Evaluate(const DMatrix<FMatrix> &data, std::string metric) {
|
||||
if (metric == "auto") metric = obj_->DefaultEvalMetric();
|
||||
IEvaluator *ev = CreateEvaluator(metric.c_str());
|
||||
this->PredictRaw(data, &preds_);
|
||||
obj_->EvalTransform(&preds_);
|
||||
float res = ev->Eval(preds_, data.info);
|
||||
delete ev;
|
||||
return std::make_pair(metric, res);
|
||||
}
|
||||
/*!
|
||||
* \brief get prediction
|
||||
* \param data input data
|
||||
* \param out_preds output vector that stores the prediction
|
||||
*/
|
||||
inline void Predict(const DMatrix<FMatrix> &data,
|
||||
std::vector<float> *out_preds) const {
|
||||
this->PredictRaw(data, out_preds);
|
||||
obj_->PredTransform(out_preds);
|
||||
}
|
||||
|
||||
protected:
|
||||
/*!
|
||||
* \brief initialize the objective function and GBM,
|
||||
* if not yet done
|
||||
*/
|
||||
inline void InitObjGBM(void) {
|
||||
if (obj_ != NULL) return;
|
||||
utils::Assert(gbm_ == NULL, "GBM and obj should be NULL");
|
||||
obj_ = CreateObjFunction(name_obj_.c_str());
|
||||
gbm_ = gbm::CreateGradBooster<FMatrix>(name_gbm_.c_str());
|
||||
for (size_t i = 0; i < cfg_.size(); ++i) {
|
||||
obj_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||
gbm_->SetParam(cfg_[i].first.c_str(), cfg_[i].second.c_str());
|
||||
}
|
||||
evaluator_.AddEval(obj_->DefaultEvalMetric());
|
||||
}
|
||||
/*!
|
||||
* \brief get un-transformed prediction
|
||||
* \param data training data matrix
|
||||
* \param out_preds output vector that stores the prediction
|
||||
*/
|
||||
inline void PredictRaw(const DMatrix<FMatrix> &data,
|
||||
std::vector<float> *out_preds) {
|
||||
gbm_->Predict(data.fmat, this->FindBufferOffset(data),
|
||||
data.info, out_preds);
|
||||
}
|
||||
|
||||
/*! \brief training parameter for regression */
|
||||
struct ModelParam{
|
||||
/* \brief global bias */
|
||||
float base_score;
|
||||
/* \brief number of features */
|
||||
unsigned num_feature;
|
||||
/* \brief number of class, if it is multi-class classification */
|
||||
int num_class;
|
||||
/*! \brief reserved field */
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
ModelParam(void) {
|
||||
base_score = 0.5f;
|
||||
num_feature = 0;
|
||||
num_class = 0;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("base_score", name)) base_score = static_cast<float>(atof(val));
|
||||
if (!strcmp("num_class", name)) num_class = atoi(val);
|
||||
if (!strcmp("bst:num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
};
|
||||
// data fields
|
||||
// silent during training
|
||||
int silent;
|
||||
// evaluation set
|
||||
EvalSet evaluator_;
|
||||
// model parameter
|
||||
ModelParam mparam;
|
||||
// gbm model that back everything
|
||||
gbm::IGradBooster<FMatrix> *gbm_;
|
||||
// name of gbm model used for training
|
||||
std::string name_gbm_;
|
||||
// objective fnction
|
||||
IObjFunction *obj_;
|
||||
// name of objective function
|
||||
std::string name_obj_;
|
||||
// configurations
|
||||
std::vector< std::pair<std::string, std::string> > cfg_;
|
||||
// temporal storages for prediciton
|
||||
std::vector<float> preds_;
|
||||
// gradient pairs
|
||||
std::vector<bst_gpair> gpair_;
|
||||
|
||||
private:
|
||||
// cache entry object that helps handle feature caching
|
||||
struct CacheEntry {
|
||||
const DMatrix<FMatrix> *mat_;
|
||||
size_t buffer_offset_;
|
||||
size_t num_row_;
|
||||
CacheEntry(const DMatrix<FMatrix> *mat, size_t buffer_offset, size_t num_row)
|
||||
:mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {}
|
||||
};
|
||||
// find internal bufer offset for certain matrix, if not exist, return -1
|
||||
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
|
||||
for (size_t i = 0; i < cache_.size(); ++i) {
|
||||
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
|
||||
if (cache_[i].num_row_ == mat.num_row) {
|
||||
return cache_[i].buffer_offset_;
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
// data structure field
|
||||
/*! \brief the entries indicates that we have internal prediction cache */
|
||||
std::vector<CacheEntry> cache_;
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_LEARNER_INL_HPP_
|
||||
137
src/learner/objective-inl.hpp
Normal file
137
src/learner/objective-inl.hpp
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
#define XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
/*!
|
||||
* \file objective-inl.hpp
|
||||
* \brief objective function implementations
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "./objective.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief defines functions to calculate some commonly used functions */
|
||||
struct LossType {
  /*! \brief indicate which type we are using */
  int loss_type;
  // list of constants selecting the loss family
  static const int kLinearSquare = 0;
  static const int kLogisticNeglik = 1;
  static const int kLogisticClassify = 2;
  static const int kLogisticRaw = 3;
  /*!
   * \brief transform the linear sum to prediction
   * \param x linear sum of boosting ensemble
   * \return transformed prediction
   */
  inline float PredTransform(float x) const {
    switch (loss_type) {
      // squared loss and raw logistic report the untransformed margin
      case kLogisticRaw:
      case kLinearSquare: return x;
      // the other logistic variants report the sigmoid probability
      case kLogisticClassify:
      case kLogisticNeglik: return 1.0f / (1.0f + expf(-x));
      default: utils::Error("unknown loss_type"); return 0.0f;
    }
  }
  /*!
   * \brief calculate first order gradient of loss, given transformed prediction
   * \param predt transformed prediction
   * \param label true label
   * \return first order gradient
   */
  inline float FirstOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return predt - label;
      // kLogisticRaw receives a margin: map it through the sigmoid first,
      // then fall through (intentionally) to the logistic gradient below
      case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt - label;
      default: utils::Error("unknown loss_type"); return 0.0f;
    }
  }
  /*!
   * \brief calculate second order gradient of loss, given transformed prediction
   * \param predt transformed prediction
   * \param label true label
   * \return second order gradient
   */
  inline float SecondOrderGradient(float predt, float label) const {
    switch (loss_type) {
      case kLinearSquare: return 1.0f;
      // see FirstOrderGradient: deliberate fall-through after the sigmoid
      case kLogisticRaw: predt = 1.0f / (1.0f + expf(-predt));
      case kLogisticClassify:
      case kLogisticNeglik: return predt * (1 - predt);
      default: utils::Error("unknown loss_type"); return 0.0f;
    }
  }
  /*!
   * \brief transform probability value back to margin
   */
  inline float ProbToMargin(float base_score) const {
    if (loss_type == kLogisticRaw ||
        loss_type == kLogisticClassify ||
        loss_type == kLogisticNeglik ) {
      utils::Check(base_score > 0.0f && base_score < 1.0f,
                   "base_score must be in (0,1) for logistic loss");
      // inverse of the sigmoid: logit(base_score)
      base_score = -logf(1.0f / base_score - 1.0f);
    }
    return base_score;
  }
  /*! \brief get default evaluation metric for the objective */
  inline const char *DefaultEvalMetric(void) const {
    if (loss_type == kLogisticClassify) return "error";
    if (loss_type == kLogisticRaw) return "auc";
    return "rmse";
  }
};
|
||||
|
||||
/*! \brief objective function that only need to */
|
||||
class RegLossObj : public IObjFunction{
 public:
  /*!
   * \brief construct with one of the LossType::k* constants
   * \param loss_type which loss family to use
   */
  explicit RegLossObj(int loss_type) {
    loss.loss_type = loss_type;
    scale_pos_weight = 1.0f;
  }
  virtual ~RegLossObj(void) {}
  virtual void SetParam(const char *name, const char *val) {
    if (!strcmp("scale_pos_weight", name)) {
      // extra weight multiplier applied to positive (label == 1) instances
      scale_pos_weight = static_cast<float>(atof(val));
    }
  }
  // Fills out_gpair with per-instance first/second order gradients of the
  // configured loss, scaled by the instance weight.
  virtual void GetGradient(const std::vector<float>& preds,
                           const MetaInfo &info,
                           int iter,
                           std::vector<bst_gpair> *out_gpair) {
    utils::Check(preds.size() == info.labels.size(),
                 "labels are not correctly provided");
    std::vector<bst_gpair> &gpair = *out_gpair;
    gpair.resize(preds.size());
    // start calculating gradient
    const unsigned ndata = static_cast<unsigned>(preds.size());
    #pragma omp parallel for schedule(static)
    for (unsigned j = 0; j < ndata; ++j) {
      // transform the raw margin to prediction space before taking gradients
      float p = loss.PredTransform(preds[j]);
      float w = info.GetWeight(j);
      // up-weight positive instances when scale_pos_weight is configured
      if (info.labels[j] == 1.0f) w *= scale_pos_weight;
      gpair[j] = bst_gpair(loss.FirstOrderGradient(p, info.labels[j]) * w,
                           loss.SecondOrderGradient(p, info.labels[j]) * w);
    }
  }
  virtual const char* DefaultEvalMetric(void) {
    return loss.DefaultEvalMetric();
  }
  // Transform raw margins into final predictions, in place.
  virtual void PredTransform(std::vector<float> *io_preds) {
    std::vector<float> &preds = *io_preds;
    const unsigned ndata = static_cast<unsigned>(preds.size());
    #pragma omp parallel for schedule(static)
    for (unsigned j = 0; j < ndata; ++j) {
      preds[j] = loss.PredTransform(preds[j]);
    }
  }

 protected:
  // extra weight multiplier for positive instances
  float scale_pos_weight;
  // the loss family definition (transforms and gradients)
  LossType loss;
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_OBJECTIVE_INL_HPP_
|
||||
79
src/learner/objective.h
Normal file
79
src/learner/objective.h
Normal file
@@ -0,0 +1,79 @@
|
||||
#ifndef XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
#define XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
/*!
|
||||
* \file objective.h
|
||||
* \brief interface of objective function used for gradient boosting
|
||||
* \author Tianqi Chen, Kailong Chen
|
||||
*/
|
||||
#include "dmatrix.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief interface of objective function */
|
||||
class IObjFunction{
|
||||
public:
|
||||
/*! \brief virtual destructor */
|
||||
virtual ~IObjFunction(void){}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
virtual void SetParam(const char *name, const char *val) = 0;
|
||||
/*!
|
||||
* \brief get gradient over each of predictions, given existing information
|
||||
* \param preds prediction of current round
|
||||
* \param info information about labels, weights, groups in rank
|
||||
* \param iter current iteration number
|
||||
* \param out_gpair output of get gradient, saves gradient and second order gradient in
|
||||
*/
|
||||
virtual void GetGradient(const std::vector<float>& preds,
|
||||
const MetaInfo &info,
|
||||
int iter,
|
||||
std::vector<bst_gpair> *out_gpair) = 0;
|
||||
/*! \return the default evaluation metric for the objective */
|
||||
virtual const char* DefaultEvalMetric(void) = 0;
|
||||
// the following functions are optional, most of time default implementation is good enough
|
||||
/*!
|
||||
* \brief transform prediction values, this is only called when Prediction is called
|
||||
* \param io_preds prediction values, saves to this vector as well
|
||||
*/
|
||||
virtual void PredTransform(std::vector<float> *io_preds){}
|
||||
/*!
|
||||
* \brief transform prediction values, this is only called when Eval is called,
|
||||
* usually it redirect to PredTransform
|
||||
* \param io_preds prediction values, saves to this vector as well
|
||||
*/
|
||||
virtual void EvalTransform(std::vector<float> *io_preds) {
|
||||
this->PredTransform(io_preds);
|
||||
}
|
||||
/*!
|
||||
* \brief transform probability value back to margin
|
||||
* this is used to transform user-set base_score back to margin
|
||||
* used by gradient boosting
|
||||
* \return transformed value
|
||||
*/
|
||||
virtual float ProbToMargin(float base_score) {
|
||||
return base_score;
|
||||
}
|
||||
};
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
|
||||
// this are implementations of objective functions
|
||||
#include "objective-inl.hpp"
|
||||
// factory function
|
||||
namespace xgboost {
|
||||
namespace learner {
|
||||
/*! \brief factory funciton to create objective function by name */
|
||||
inline IObjFunction* CreateObjFunction(const char *name) {
|
||||
if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare);
|
||||
if (!strcmp("reg:logistic", name)) return new RegLossObj(LossType::kLogisticNeglik);
|
||||
if (!strcmp("binary:logistic", name)) return new RegLossObj(LossType::kLogisticClassify);
|
||||
if (!strcmp("binary:logitraw", name)) return new RegLossObj(LossType::kLogisticRaw);
|
||||
utils::Error("unknown objective function type: %s", name);
|
||||
return NULL;
|
||||
}
|
||||
} // namespace learner
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_LEARNER_OBJECTIVE_H_
|
||||
492
src/tree/model.h
Normal file
492
src/tree/model.h
Normal file
@@ -0,0 +1,492 @@
|
||||
#ifndef XGBOOST_TREE_MODEL_H_
|
||||
#define XGBOOST_TREE_MODEL_H_
|
||||
/*!
|
||||
* \file model.h
|
||||
* \brief model structure for tree
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <limits>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/fmap.h"
|
||||
#include "../utils/utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/*!
|
||||
* \brief template class of TreeModel
|
||||
* \tparam TSplitCond data type to indicate split condition
|
||||
* \tparam TNodeStat auxiliary statistics of node to help tree building
|
||||
*/
|
||||
template<typename TSplitCond, typename TNodeStat>
|
||||
class TreeModel {
|
||||
public:
|
||||
/*! \brief data type to indicate split condition */
|
||||
typedef TNodeStat NodeStat;
|
||||
/*! \brief auxiliary statistics of node to help tree building */
|
||||
typedef TSplitCond SplitCond;
|
||||
/*! \brief parameters of the tree */
|
||||
struct Param{
|
||||
/*! \brief number of start root */
|
||||
int num_roots;
|
||||
/*! \brief total number of nodes */
|
||||
int num_nodes;
|
||||
/*!\brief number of deleted nodes */
|
||||
int num_deleted;
|
||||
/*! \brief maximum depth, this is a statistics of the tree */
|
||||
int max_depth;
|
||||
/*! \brief number of features used for tree construction */
|
||||
int num_feature;
|
||||
/*! \brief reserved part */
|
||||
int reserved[32];
|
||||
/*! \brief constructor */
|
||||
Param(void) {
|
||||
max_depth = 0;
|
||||
memset(reserved, 0, sizeof(reserved));
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
if (!strcmp("num_roots", name)) num_roots = atoi(val);
|
||||
if (!strcmp("num_feature", name)) num_feature = atoi(val);
|
||||
}
|
||||
};
|
||||
/*! \brief tree node */
|
||||
class Node{
|
||||
public:
|
||||
/*! \brief index of left child */
|
||||
inline int cleft(void) const {
|
||||
return this->cleft_;
|
||||
}
|
||||
/*! \brief index of right child */
|
||||
inline int cright(void) const {
|
||||
return this->cright_;
|
||||
}
|
||||
/*! \brief index of default child when feature is missing */
|
||||
inline int cdefault(void) const {
|
||||
return this->default_left() ? this->cleft() : this->cright();
|
||||
}
|
||||
/*! \brief feature index of split condition */
|
||||
inline unsigned split_index(void) const {
|
||||
return sindex_ & ((1U << 31) - 1U);
|
||||
}
|
||||
/*! \brief when feature is unknown, whether goes to left child */
|
||||
inline bool default_left(void) const {
|
||||
return (sindex_ >> 31) != 0;
|
||||
}
|
||||
/*! \brief whether current node is leaf node */
|
||||
inline bool is_leaf(void) const {
|
||||
return cleft_ == -1;
|
||||
}
|
||||
/*! \brief get leaf value of leaf node */
|
||||
inline float leaf_value(void) const {
|
||||
return (this->info_).leaf_value;
|
||||
}
|
||||
/*! \brief get split condition of the node */
|
||||
inline TSplitCond split_cond(void) const {
|
||||
return (this->info_).split_cond;
|
||||
}
|
||||
/*! \brief get parent of the node */
|
||||
inline int parent(void) const {
|
||||
return parent_ & ((1U << 31) - 1);
|
||||
}
|
||||
/*! \brief whether current node is left child */
|
||||
inline bool is_left_child(void) const {
|
||||
return (parent_ & (1U << 31)) != 0;
|
||||
}
|
||||
/*! \brief whether current node is root */
|
||||
inline bool is_root(void) const {
|
||||
return parent_ == -1;
|
||||
}
|
||||
/*!
|
||||
* \brief set the right child
|
||||
* \param nide node id to right child
|
||||
*/
|
||||
inline void set_right_child(int nid) {
|
||||
this->cright_ = nid;
|
||||
}
|
||||
/*!
|
||||
* \brief set split condition of current node
|
||||
* \param split_index feature index to split
|
||||
* \param split_cond split condition
|
||||
* \param default_left the default direction when feature is unknown
|
||||
*/
|
||||
inline void set_split(unsigned split_index, TSplitCond split_cond,
|
||||
bool default_left = false) {
|
||||
if (default_left) split_index |= (1U << 31);
|
||||
this->sindex_ = split_index;
|
||||
(this->info_).split_cond = split_cond;
|
||||
}
|
||||
/*!
|
||||
* \brief set the leaf value of the node
|
||||
* \param value leaf value
|
||||
* \param right right index, could be used to store
|
||||
* additional information
|
||||
*/
|
||||
inline void set_leaf(float value, int right = -1) {
|
||||
(this->info_).leaf_value = value;
|
||||
this->cleft_ = -1;
|
||||
this->cright_ = right;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class TreeModel<TSplitCond, TNodeStat>;
|
||||
/*!
|
||||
* \brief in leaf node, we have weights, in non-leaf nodes,
|
||||
* we have split condition
|
||||
*/
|
||||
union Info{
|
||||
float leaf_value;
|
||||
TSplitCond split_cond;
|
||||
};
|
||||
// pointer to parent, highest bit is used to
|
||||
// indicate whether it's a left child or not
|
||||
int parent_;
|
||||
// pointer to left, right
|
||||
int cleft_, cright_;
|
||||
// split feature index, left split or right split depends on the highest bit
|
||||
unsigned sindex_;
|
||||
// extra info
|
||||
Info info_;
|
||||
// set parent
|
||||
inline void set_parent(int pidx, bool is_left_child = true) {
|
||||
if (is_left_child) pidx |= (1U << 31);
|
||||
this->parent_ = pidx;
|
||||
}
|
||||
};
|
||||
|
||||
protected:
|
||||
// vector of nodes
|
||||
std::vector<Node> nodes;
|
||||
// stats of nodes
|
||||
std::vector<TNodeStat> stats;
|
||||
// free node space, used during training process
|
||||
std::vector<int> deleted_nodes;
|
||||
// allocate a new node,
|
||||
// !!!!!! NOTE: may cause BUG here, nodes.resize
|
||||
inline int AllocNode(void) {
|
||||
if (param.num_deleted != 0) {
|
||||
int nd = deleted_nodes.back();
|
||||
deleted_nodes.pop_back();
|
||||
--param.num_deleted;
|
||||
return nd;
|
||||
}
|
||||
int nd = param.num_nodes++;
|
||||
utils::Check(param.num_nodes < std::numeric_limits<int>::max(),
|
||||
"number of nodes in the tree exceed 2^31");
|
||||
nodes.resize(param.num_nodes);
|
||||
stats.resize(param.num_nodes);
|
||||
return nd;
|
||||
}
|
||||
// delete a tree node
|
||||
inline void DeleteNode(int nid) {
|
||||
utils::Assert(nid >= param.num_roots, "can not delete root");
|
||||
deleted_nodes.push_back(nid);
|
||||
nodes[nid].set_parent(-1);
|
||||
++param.num_deleted;
|
||||
}
|
||||
|
||||
public:
|
||||
/*!
|
||||
* \brief change a non leaf node to a leaf node, delete its children
|
||||
* \param rid node id of the node
|
||||
* \param new leaf value
|
||||
*/
|
||||
inline void ChangeToLeaf(int rid, float value) {
|
||||
utils::Assert(nodes[nodes[rid].cleft() ].is_leaf(),
|
||||
"can not delete a non termial child");
|
||||
utils::Assert(nodes[nodes[rid].cright()].is_leaf(),
|
||||
"can not delete a non termial child");
|
||||
this->DeleteNode(nodes[rid].cleft());
|
||||
this->DeleteNode(nodes[rid].cright());
|
||||
nodes[rid].set_leaf(value);
|
||||
}
|
||||
/*!
|
||||
* \brief collapse a non leaf node to a leaf node, delete its children
|
||||
* \param rid node id of the node
|
||||
* \param new leaf value
|
||||
*/
|
||||
inline void CollapseToLeaf(int rid, float value) {
|
||||
if (nodes[rid].is_leaf()) return;
|
||||
if (!nodes[nodes[rid].cleft() ].is_leaf()) {
|
||||
CollapseToLeaf(nodes[rid].cleft(), 0.0f);
|
||||
}
|
||||
if (!nodes[nodes[rid].cright() ].is_leaf()) {
|
||||
CollapseToLeaf(nodes[rid].cright(), 0.0f);
|
||||
}
|
||||
this->ChangeToLeaf(rid, value);
|
||||
}
|
||||
|
||||
public:
|
||||
/*! \brief model parameter */
|
||||
Param param;
|
||||
/*! \brief constructor */
|
||||
TreeModel(void) {
|
||||
param.num_nodes = 1;
|
||||
param.num_roots = 1;
|
||||
param.num_deleted = 0;
|
||||
nodes.resize(1);
|
||||
}
|
||||
/*! \brief get node given nid */
|
||||
inline Node &operator[](int nid) {
|
||||
return nodes[nid];
|
||||
}
|
||||
/*! \brief get node given nid */
|
||||
inline const Node &operator[](int nid) const {
|
||||
return nodes[nid];
|
||||
}
|
||||
/*! \brief get node statistics given nid */
|
||||
inline NodeStat &stat(int nid) {
|
||||
return stats[nid];
|
||||
}
|
||||
/*! \brief initialize the model */
|
||||
inline void InitModel(void) {
|
||||
param.num_nodes = param.num_roots;
|
||||
nodes.resize(param.num_nodes);
|
||||
stats.resize(param.num_nodes);
|
||||
for (int i = 0; i < param.num_nodes; i ++) {
|
||||
nodes[i].set_leaf(0.0f);
|
||||
nodes[i].set_parent(-1);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief load model from stream
|
||||
* \param fi input stream
|
||||
*/
|
||||
inline void LoadModel(utils::IStream &fi) {
|
||||
utils::Check(fi.Read(¶m, sizeof(Param)) > 0,
|
||||
"TreeModel: wrong format");
|
||||
nodes.resize(param.num_nodes); stats.resize(param.num_nodes);
|
||||
utils::Check(fi.Read(&nodes[0], sizeof(Node) * nodes.size()) > 0,
|
||||
"TreeModel: wrong format");
|
||||
utils::Check(fi.Read(&stats[0], sizeof(NodeStat) * stats.size()) > 0,
|
||||
"TreeModel: wrong format");
|
||||
// chg deleted nodes
|
||||
deleted_nodes.resize(0);
|
||||
for (int i = param.num_roots; i < param.num_nodes; i ++) {
|
||||
if (nodes[i].is_root()) deleted_nodes.push_back(i);
|
||||
}
|
||||
utils::Assert(static_cast<int>(deleted_nodes.size()) == param.num_deleted,
|
||||
"number of deleted nodes do not match");
|
||||
}
|
||||
/*!
|
||||
* \brief save model to stream
|
||||
* \param fo output stream
|
||||
*/
|
||||
inline void SaveModel(utils::IStream &fo) const {
|
||||
utils::Assert(param.num_nodes == static_cast<int>(nodes.size()),
|
||||
"Tree::SaveModel");
|
||||
utils::Assert(param.num_nodes == static_cast<int>(stats.size()),
|
||||
"Tree::SaveModel");
|
||||
fo.Write(¶m, sizeof(Param));
|
||||
fo.Write(&nodes[0], sizeof(Node) * nodes.size());
|
||||
fo.Write(&stats[0], sizeof(NodeStat) * nodes.size());
|
||||
}
|
||||
/*!
|
||||
* \brief add child nodes to node
|
||||
* \param nid node id to add childs
|
||||
*/
|
||||
inline void AddChilds(int nid) {
|
||||
int pleft = this->AllocNode();
|
||||
int pright = this->AllocNode();
|
||||
nodes[nid].cleft_ = pleft;
|
||||
nodes[nid].cright_ = pright;
|
||||
nodes[nodes[nid].cleft() ].set_parent(nid, true);
|
||||
nodes[nodes[nid].cright()].set_parent(nid, false);
|
||||
}
|
||||
/*!
|
||||
* \brief only add a right child to a leaf node
|
||||
* \param node id to add right child
|
||||
*/
|
||||
inline void AddRightChild(int nid) {
|
||||
int pright = this->AllocNode();
|
||||
nodes[nid].right = pright;
|
||||
nodes[nodes[nid].right].set_parent(nid, false);
|
||||
}
|
||||
/*!
|
||||
* \brief get current depth
|
||||
* \param nid node id
|
||||
* \param pass_rchild whether right child is not counted in depth
|
||||
*/
|
||||
inline int GetDepth(int nid, bool pass_rchild = false) const {
|
||||
int depth = 0;
|
||||
while (!nodes[nid].is_root()) {
|
||||
if (!pass_rchild || nodes[nid].is_left_child()) ++depth;
|
||||
nid = nodes[nid].parent();
|
||||
}
|
||||
return depth;
|
||||
}
|
||||
/*!
|
||||
* \brief get maximum depth
|
||||
* \param nid node id
|
||||
*/
|
||||
inline int MaxDepth(int nid) const {
|
||||
if (nodes[nid].is_leaf()) return 0;
|
||||
return std::max(MaxDepth(nodes[nid].cleft())+1,
|
||||
MaxDepth(nodes[nid].cright())+1);
|
||||
}
|
||||
/*!
|
||||
* \brief get maximum depth
|
||||
*/
|
||||
inline int MaxDepth(void) {
|
||||
int maxd = 0;
|
||||
for (int i = 0; i < param.num_roots; ++i) {
|
||||
maxd = std::max(maxd, MaxDepth(i));
|
||||
}
|
||||
return maxd;
|
||||
}
|
||||
/*! \brief number of extra nodes besides the root */
|
||||
inline int num_extra_nodes(void) const {
|
||||
return param.num_nodes - param.num_roots - param.num_deleted;
|
||||
}
|
||||
/*!
|
||||
* \brief dump model to text string
|
||||
* \param fmap feature map of feature types
|
||||
* \param with_stats whether dump out statistics as well
|
||||
* \return the string of dumped model
|
||||
*/
|
||||
inline std::string DumpModel(const utils::FeatMap& fmap, bool with_stats) {
|
||||
std::stringstream fo("");
|
||||
for (int i = 0; i < param.num_roots; ++i) {
|
||||
this->Dump(i, fo, fmap, 0, with_stats);
|
||||
}
|
||||
return fo.str();
|
||||
}
|
||||
|
||||
private:
|
||||
void Dump(int nid, std::stringstream &fo,
|
||||
const utils::FeatMap& fmap, int depth, bool with_stats) {
|
||||
for (int i = 0; i < depth; ++i) {
|
||||
fo << '\t';
|
||||
}
|
||||
if (nodes[nid].is_leaf()) {
|
||||
fo << nid << ":leaf=" << nodes[nid].leaf_value();
|
||||
if (with_stats) {
|
||||
stat(nid).Print(fo, true);
|
||||
}
|
||||
fo << '\n';
|
||||
} else {
|
||||
// right then left,
|
||||
TSplitCond cond = nodes[nid].split_cond();
|
||||
const unsigned split_index = nodes[nid].split_index();
|
||||
if (split_index < fmap.size()) {
|
||||
switch (fmap.type(split_index)) {
|
||||
case utils::FeatMap::kIndicator: {
|
||||
int nyes = nodes[nid].default_left() ?
|
||||
nodes[nid].cright() : nodes[nid].cleft();
|
||||
fo << nid << ":[" << fmap.name(split_index) << "] yes=" << nyes
|
||||
<< ",no=" << nodes[nid].cdefault();
|
||||
break;
|
||||
}
|
||||
case utils::FeatMap::kInteger: {
|
||||
fo << nid << ":[" << fmap.name(split_index) << "<"
|
||||
<< int(float(cond)+1.0f)
|
||||
<< "] yes=" << nodes[nid].cleft()
|
||||
<< ",no=" << nodes[nid].cright()
|
||||
<< ",missing=" << nodes[nid].cdefault();
|
||||
break;
|
||||
}
|
||||
case utils::FeatMap::kFloat:
|
||||
case utils::FeatMap::kQuantitive: {
|
||||
fo << nid << ":[" << fmap.name(split_index) << "<"<< float(cond)
|
||||
<< "] yes=" << nodes[nid].cleft()
|
||||
<< ",no=" << nodes[nid].cright()
|
||||
<< ",missing=" << nodes[nid].cdefault();
|
||||
break;
|
||||
}
|
||||
default: utils::Error("unknown fmap type");
|
||||
}
|
||||
} else {
|
||||
fo << nid << ":[f" << split_index << "<"<< float(cond)
|
||||
<< "] yes=" << nodes[nid].cleft()
|
||||
<< ",no=" << nodes[nid].cright()
|
||||
<< ",missing=" << nodes[nid].cdefault();
|
||||
}
|
||||
if (with_stats) {
|
||||
fo << ' ';
|
||||
stat(nid).Print(fo, false);
|
||||
}
|
||||
fo << '\n';
|
||||
this->Dump(nodes[nid].cleft(), fo, fmap, depth+1, with_stats);
|
||||
this->Dump(nodes[nid].cright(), fo, fmap, depth+1, with_stats);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief node statistics used in regression tree */
struct RTreeNodeStat{
  /*! \brief loss chg caused by current split */
  float loss_chg;
  /*! \brief sum of hessian values, used to measure coverage of data */
  float sum_hess;
  /*! \brief weight of current node */
  float base_weight;
  /*! \brief number of child that is leaf node known up to now */
  int leaf_child_cnt;
  /*! \brief print information of current stats to fo */
  inline void Print(std::stringstream &fo, bool is_leaf) const {
    if (is_leaf) {
      // leaves only report coverage
      fo << "cover=" << sum_hess;
    } else {
      fo << "gain=" << loss_chg << ",cover=" << sum_hess;
    }
  }
};
|
||||
|
||||
/*! \brief define regression tree to be the most common tree model */
|
||||
class RegTree: public TreeModel<bst_float, RTreeNodeStat>{
|
||||
public:
|
||||
/*!
|
||||
* \brief get the leaf index
|
||||
* \param feats dense feature vector, if the feature is missing the field is set to NaN
|
||||
* \param root_gid starting root index of the instance
|
||||
* \return the leaf index of the given feature
|
||||
*/
|
||||
inline int GetLeafIndex(const std::vector<float> &feat, unsigned root_id = 0) const {
|
||||
// start from groups that belongs to current data
|
||||
int pid = static_cast<int>(root_id);
|
||||
// tranverse tree
|
||||
while (!(*this)[ pid ].is_leaf()) {
|
||||
unsigned split_index = (*this)[pid].split_index();
|
||||
const float fvalue = feat[split_index];
|
||||
pid = this->GetNext(pid, fvalue, std::isnan(fvalue));
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
/*!
|
||||
* \brief get the prediction of regression tree, only accepts dense feature vector
|
||||
* \param feats dense feature vector, if the feature is missing the field is set to NaN
|
||||
* \param root_gid starting root index of the instance
|
||||
* \return the leaf index of the given feature
|
||||
*/
|
||||
inline float Predict(const std::vector<float> &feat, unsigned root_id = 0) const {
|
||||
int pid = this->GetLeafIndex(feat, root_id);
|
||||
return (*this)[pid].leaf_value();
|
||||
}
|
||||
private:
|
||||
/*! \brief get next position of the tree given current pid */
|
||||
inline int GetNext(int pid, float fvalue, bool is_unknown) const {
|
||||
float split_value = (*this)[pid].split_cond();
|
||||
if (is_unknown) {
|
||||
return (*this)[pid].cdefault();
|
||||
} else {
|
||||
if (fvalue < split_value) {
|
||||
return (*this)[pid].cleft();
|
||||
} else {
|
||||
return (*this)[pid].cright();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_TREE_MODEL_H_
|
||||
262
src/tree/param.h
Normal file
262
src/tree/param.h
Normal file
@@ -0,0 +1,262 @@
|
||||
#ifndef XGBOOST_TREE_PARAM_H_
|
||||
#define XGBOOST_TREE_PARAM_H_
|
||||
/*!
|
||||
* \file param.h
|
||||
* \brief training parameters, statistics used to support tree construction
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <cstring>
|
||||
#include "../data.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
/*! \brief core statistics used for tree construction */
|
||||
struct GradStats {
|
||||
/*! \brief sum gradient statistics */
|
||||
double sum_grad;
|
||||
/*! \brief sum hessian statistics */
|
||||
double sum_hess;
|
||||
/*! \brief constructor */
|
||||
GradStats(void) {
|
||||
this->Clear();
|
||||
}
|
||||
/*! \brief clear the statistics */
|
||||
inline void Clear(void) {
|
||||
sum_grad = sum_hess = 0.0f;
|
||||
}
|
||||
/*! \brief add statistics to the data */
|
||||
inline void Add(double grad, double hess) {
|
||||
sum_grad += grad; sum_hess += hess;
|
||||
}
|
||||
/*! \brief add statistics to the data */
|
||||
inline void Add(const bst_gpair& b) {
|
||||
this->Add(b.grad, b.hess);
|
||||
}
|
||||
/*! \brief add statistics to the data */
|
||||
inline void Add(const GradStats &b) {
|
||||
this->Add(b.sum_grad, b.sum_hess);
|
||||
}
|
||||
/*! \brief substract the statistics by b */
|
||||
inline GradStats Substract(const GradStats &b) const {
|
||||
GradStats res;
|
||||
res.sum_grad = this->sum_grad - b.sum_grad;
|
||||
res.sum_hess = this->sum_hess - b.sum_hess;
|
||||
return res;
|
||||
}
|
||||
/*! \return whether the statistics is not used yet */
|
||||
inline bool Empty(void) const {
|
||||
return sum_hess == 0.0;
|
||||
}
|
||||
};
|
||||
|
||||
/*! \brief training parameters for regression tree */
|
||||
struct TrainParam{
|
||||
// learning step size for a time
|
||||
float learning_rate;
|
||||
// minimum loss change required for a split
|
||||
float min_split_loss;
|
||||
// maximum depth of a tree
|
||||
int max_depth;
|
||||
//----- the rest parameters are less important ----
|
||||
// minimum amount of hessian(weight) allowed in a child
|
||||
float min_child_weight;
|
||||
// weight decay parameter used to control leaf fitting
|
||||
float reg_lambda;
|
||||
// reg method
|
||||
int reg_method;
|
||||
// default direction choice
|
||||
int default_direction;
|
||||
// whether we want to do subsample
|
||||
float subsample;
|
||||
// whether to subsample columns each split, in each level
|
||||
float colsample_bylevel;
|
||||
// whether to subsample columns during tree construction
|
||||
float colsample_bytree;
|
||||
// speed optimization for dense column
|
||||
float opt_dense_col;
|
||||
// number of threads to be used for tree construction,
|
||||
// if OpenMP is enabled, if equals 0, use system default
|
||||
int nthread;
|
||||
/*! \brief constructor */
|
||||
TrainParam(void) {
|
||||
learning_rate = 0.3f;
|
||||
min_child_weight = 1.0f;
|
||||
max_depth = 6;
|
||||
reg_lambda = 1.0f;
|
||||
reg_method = 2;
|
||||
default_direction = 0;
|
||||
subsample = 1.0f;
|
||||
colsample_bytree = 1.0f;
|
||||
colsample_bylevel = 1.0f;
|
||||
opt_dense_col = 1.0f;
|
||||
nthread = 0;
|
||||
}
|
||||
/*!
|
||||
* \brief set parameters from outside
|
||||
* \param name name of the parameter
|
||||
* \param val value of the parameter
|
||||
*/
|
||||
inline void SetParam(const char *name, const char *val) {
|
||||
// sync-names
|
||||
if (!strcmp(name, "gamma")) min_split_loss = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "eta")) learning_rate = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "lambda")) reg_lambda = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "learning_rate")) learning_rate = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "reg_method")) reg_method = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "opt_dense_col")) opt_dense_col = static_cast<float>(atof(val));
|
||||
if (!strcmp(name, "max_depth")) max_depth = atoi(val);
|
||||
if (!strcmp(name, "nthread")) nthread = atoi(val);
|
||||
if (!strcmp(name, "default_direction")) {
|
||||
if (!strcmp(val, "learn")) default_direction = 0;
|
||||
if (!strcmp(val, "left")) default_direction = 1;
|
||||
if (!strcmp(val, "right")) default_direction = 2;
|
||||
}
|
||||
}
|
||||
// calculate the cost of loss function
|
||||
inline double CalcGain(double sum_grad, double sum_hess) const {
|
||||
if (sum_hess < min_child_weight) {
|
||||
return 0.0;
|
||||
}
|
||||
switch (reg_method) {
|
||||
case 1 : return Sqr(ThresholdL1(sum_grad, reg_lambda)) / sum_hess;
|
||||
case 2 : return Sqr(sum_grad) / (sum_hess + reg_lambda);
|
||||
case 3 : return
|
||||
Sqr(ThresholdL1(sum_grad, 0.5 * reg_lambda)) /
|
||||
(sum_hess + 0.5 * reg_lambda);
|
||||
default: return Sqr(sum_grad) / sum_hess;
|
||||
}
|
||||
}
|
||||
// calculate weight given the statistics
|
||||
inline double CalcWeight(double sum_grad, double sum_hess) const {
|
||||
if (sum_hess < min_child_weight) {
|
||||
return 0.0;
|
||||
} else {
|
||||
switch (reg_method) {
|
||||
case 1: return - ThresholdL1(sum_grad, reg_lambda) / sum_hess;
|
||||
case 2: return - sum_grad / (sum_hess + reg_lambda);
|
||||
case 3: return
|
||||
- ThresholdL1(sum_grad, 0.5 * reg_lambda) /
|
||||
(sum_hess + 0.5 * reg_lambda);
|
||||
default: return - sum_grad / sum_hess;
|
||||
}
|
||||
}
|
||||
}
|
||||
/*! \brief whether need forward small to big search: default right */
|
||||
inline bool need_forward_search(float col_density = 0.0f) const {
|
||||
return this->default_direction == 2 ||
|
||||
(default_direction == 0 && (col_density < opt_dense_col));
|
||||
}
|
||||
/*! \brief whether need backward big to small search: default left */
|
||||
inline bool need_backward_search(float col_density = 0.0f) const {
|
||||
return this->default_direction != 2;
|
||||
}
|
||||
/*! \brief given the loss change, whether we need to invode prunning */
|
||||
inline bool need_prune(double loss_chg, int depth) const {
|
||||
return loss_chg < this->min_split_loss;
|
||||
}
|
||||
/*! \brief whether we can split with current hessian */
|
||||
inline bool cannot_split(double sum_hess, int depth) const {
|
||||
return sum_hess < this->min_child_weight * 2.0;
|
||||
}
|
||||
// code support for template data
|
||||
inline double CalcWeight(const GradStats &d) const {
|
||||
return this->CalcWeight(d.sum_grad, d.sum_hess);
|
||||
}
|
||||
inline double CalcGain(const GradStats &d) const {
|
||||
return this->CalcGain(d.sum_grad, d.sum_hess);
|
||||
}
|
||||
|
||||
protected:
|
||||
// functions for L1 cost
|
||||
inline static double ThresholdL1(double w, double lambda) {
|
||||
if (w > +lambda) return w - lambda;
|
||||
if (w < -lambda) return w + lambda;
|
||||
return 0.0;
|
||||
}
|
||||
inline static double Sqr(double a) {
|
||||
return a * a;
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief statistics that is helpful to store
|
||||
* and represent a split solution for the tree
|
||||
*/
|
||||
struct SplitEntry{
|
||||
/*! \brief loss change after split this node */
|
||||
bst_float loss_chg;
|
||||
/*! \brief split index */
|
||||
unsigned sindex;
|
||||
/*! \brief split value */
|
||||
float split_value;
|
||||
/*! \brief constructor */
|
||||
SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {}
|
||||
/*!
|
||||
* \brief decides whether a we can replace current entry with the statistics given
|
||||
* This function gives better priority to lower index when loss_chg equals
|
||||
* not the best way, but helps to give consistent result during multi-thread execution
|
||||
* \param loss_chg the loss reduction get through the split
|
||||
* \param split_index the feature index where the split is on
|
||||
*/
|
||||
inline bool NeedReplace(bst_float loss_chg, unsigned split_index) const {
|
||||
if (this->split_index() <= split_index) {
|
||||
return loss_chg > this->loss_chg;
|
||||
} else {
|
||||
return !(this->loss_chg > loss_chg);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief update the split entry, replace it if e is better
|
||||
* \param e candidate split solution
|
||||
* \return whether the proposed split is better and can replace current split
|
||||
*/
|
||||
inline bool Update(const SplitEntry &e) {
|
||||
if (this->NeedReplace(e.loss_chg, e.split_index())) {
|
||||
this->loss_chg = e.loss_chg;
|
||||
this->sindex = e.sindex;
|
||||
this->split_value = e.split_value;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
/*!
|
||||
* \brief update the split entry, replace it if e is better
|
||||
* \param loss_chg loss reduction of new candidate
|
||||
* \param split_index feature index to split on
|
||||
* \param split_value the split point
|
||||
* \param default_left whether the missing value goes to left
|
||||
* \return whether the proposed split is better and can replace current split
|
||||
*/
|
||||
inline bool Update(bst_float loss_chg, unsigned split_index,
|
||||
float split_value, bool default_left) {
|
||||
if (this->NeedReplace(loss_chg, split_index)) {
|
||||
this->loss_chg = loss_chg;
|
||||
if (default_left) split_index |= (1U << 31);
|
||||
this->sindex = split_index;
|
||||
this->split_value = split_value;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
/*!\return feature index to split on */
|
||||
inline unsigned split_index(void) const {
|
||||
return sindex & ((1U << 31) - 1U);
|
||||
}
|
||||
/*!\return whether missing value goes to left branch */
|
||||
inline bool default_left(void) const {
|
||||
return (sindex >> 31) != 0;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_TREE_PARAM_H_
|
||||
70
src/tree/updater.h
Normal file
70
src/tree/updater.h
Normal file
@@ -0,0 +1,70 @@
|
||||
#ifndef XGBOOST_TREE_UPDATER_H_
|
||||
#define XGBOOST_TREE_UPDATER_H_
|
||||
/*!
|
||||
* \file updater.h
|
||||
* \brief interface to update the tree
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
|
||||
#include "../data.h"
|
||||
#include "./model.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/*!
 * \brief interface of a tree update module that performs one update step
 *        on a set of regression trees
 * \tparam FMatrix the feature matrix type the updater consumes
 */
template<typename FMatrix>
class IUpdater {
 public:
  /*!
   * \brief set parameters from outside
   * \param name name of the parameter
   * \param val value of the parameter
   */
  virtual void SetParam(const char *name, const char *val) = 0;
  /*!
   * \brief perform update to the tree models
   * \param gpair the gradient pair statistics of the data
   * \param fmat feature matrix that provides access to features
   * \param root_index pre-partitioned root index of each instance;
   *        root_index.size() can be 0, which indicates that no pre-partition is involved
   * \param trees pointers to the trees to be updated; the updater will change the
   *        content of the trees.
   *        note: all the trees in the vector are updated with the same statistics,
   *        but maybe different random seeds; usually one tree is passed in at a time,
   *        there can be multiple trees when we train a random-forest style model
   */
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      FMatrix &fmat,
                      const std::vector<unsigned> &root_index,
                      const std::vector<RegTree*> &trees) = 0;
  // virtual destructor: concrete updaters are deleted through this interface
  virtual ~IUpdater(void) {}
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
#include "./updater_prune-inl.hpp"
|
||||
#include "./updater_colmaker-inl.hpp"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/*!
 * \brief create an updater based on registered name
 * \param name name of updater: "prune" or "grow_colmaker"
 * \return newly allocated updater instance; the caller takes ownership
 */
template<typename FMatrix>
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
  // NOTE(review): strcmp needs <cstring>, which this header does not include
  // directly - presumably pulled in transitively; confirm
  if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
  if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
  utils::Error("unknown updater:%s", name);
  // reached only if utils::Error returns (it is expected to abort);
  // keeps the compiler satisfied about the return path
  return NULL;
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_TREE_UPDATER_H_
|
||||
357
src/tree/updater_colmaker-inl.hpp
Normal file
357
src/tree/updater_colmaker-inl.hpp
Normal file
@@ -0,0 +1,357 @@
|
||||
#ifndef XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
|
||||
#define XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
|
||||
/*!
|
||||
* \file updater_colmaker-inl.hpp
|
||||
* \brief use columnwise update to construct a tree
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "./param.h"
|
||||
#include "./updater.h"
|
||||
#include "../utils/omp.h"
|
||||
#include "../utils/random.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/*! \brief column-wise tree grower: builds one tree level by level by
 *         enumerating candidate splits over sorted feature columns */
template<typename FMatrix, typename TStats>
class ColMaker: public IUpdater<FMatrix> {
 public:
  virtual ~ColMaker(void) {}
  // set training parameter
  virtual void SetParam(const char *name, const char *val) {
    param.SetParam(name, val);
  }
  // grow each tree in the batch, using a fresh Builder per tree
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      FMatrix &fmat,
                      const std::vector<unsigned> &root_index,
                      const std::vector<RegTree*> &trees) {

    for (size_t i = 0; i < trees.size(); ++i) {
      Builder builder(param);
      builder.Update(gpair, fmat, root_index, trees[i]);
    }
  }

 private:
  // training parameter
  TrainParam param;
  // data structure
  /*! \brief per thread x per node entry to store tmp data */
  struct ThreadEntry {
    /*! \brief statistics of data */
    TStats stats;
    /*! \brief last feature value scanned */
    float last_fvalue;
    /*! \brief current best solution */
    SplitEntry best;
    // constructor
    ThreadEntry(void) {
      stats.Clear();
    }
  };
  /*! \brief per-node accumulated statistics and best split found so far */
  struct NodeEntry {
    /*! \brief statistics for node entry */
    TStats stats;
    /*! \brief loss of this node, without split */
    bst_float root_gain;
    /*! \brief weight calculated related to current data */
    float weight;
    /*! \brief current best solution */
    SplitEntry best;
    // constructor
    NodeEntry(void) : root_gain(0.0f), weight(0.0f){
      stats.Clear();
    }
  };
  // actual builder that runs the algorithm
  struct Builder{
   public:
    // constructor
    explicit Builder(const TrainParam &param) : param(param) {}
    // update one tree, growing
    virtual void Update(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
                        const std::vector<unsigned> &root_index,
                        RegTree *p_tree) {
      this->InitData(gpair, fmat, root_index, *p_tree);
      this->InitNewNode(qexpand, gpair, *p_tree);

      // grow the tree one level at a time, up to max_depth
      for (int depth = 0; depth < param.max_depth; ++depth) {
        this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree);
        this->ResetPosition(this->qexpand, fmat, *p_tree);
        this->UpdateQueueExpand(*p_tree, &this->qexpand);
        this->InitNewNode(qexpand, gpair, *p_tree);
        // if nothing left to be expand, break
        if (qexpand.size() == 0) break;
      }
      // set all the rest expanding nodes to leaf
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        (*p_tree)[nid].set_leaf(snode[nid].weight * param.learning_rate);
      }
      // remember auxiliary statistics in the tree node
      for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) {
        p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
        p_tree->stat(nid).base_weight = snode[nid].weight;
        p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
      }
    }

   private:
    // initialize temp data structure
    inline void InitData(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
                         const std::vector<unsigned> &root_index, const RegTree &tree) {
      utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
      {// setup position
        position.resize(gpair.size());
        if (root_index.size() == 0) {
          // no pre-partition: every instance starts at root node 0
          std::fill(position.begin(), position.end(), 0);
        } else {
          for (size_t i = 0; i < root_index.size(); ++i) {
            position[i] = root_index[i];
            utils::Assert(root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting");
          }
        }
        // mark delete for the deleted datas (negative hessian flags removal)
        for (size_t i = 0; i < gpair.size(); ++i) {
          if (gpair[i].hess < 0.0f) position[i] = -1;
        }
        // mark subsample: randomly drop rows when subsample < 1
        if (param.subsample < 1.0f) {
          for (size_t i = 0; i < gpair.size(); ++i) {
            if (gpair[i].hess < 0.0f) continue;
            if (random::SampleBinary(param.subsample) == 0) position[i] = -1;
          }
        }
      }

      {
        // initialize feature index: keep only non-empty columns,
        // then column-subsample once for this tree
        unsigned ncol = static_cast<unsigned>(fmat.NumCol());
        for (unsigned i = 0; i < ncol; ++i) {
          if (fmat.GetColSize(i) != 0) feat_index.push_back(i);
        }
        unsigned n = static_cast<unsigned>(param.colsample_bytree * feat_index.size());
        random::Shuffle(feat_index);
        utils::Check(n > 0, "colsample_bytree is too small that no feature can be included");
        feat_index.resize(n);
      }
      {// setup temp space for each thread
#pragma omp parallel
        {
          this->nthread = omp_get_num_threads();
        }
        // reserve a small space
        stemp.clear();
        stemp.resize(this->nthread, std::vector<ThreadEntry>());
        for (size_t i = 0; i < stemp.size(); ++i) {
          stemp[i].clear(); stemp[i].reserve(256);
        }
        snode.reserve(256);
      }
      {// expand query: initially every root node is expandable
        qexpand.reserve(256); qexpand.clear();
        for (int i = 0; i < tree.param.num_roots; ++i) {
          qexpand.push_back(i);
        }
      }
    }
    /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
    inline void InitNewNode(const std::vector<int> &qexpand,
                            const std::vector<bst_gpair> &gpair,
                            const RegTree &tree) {
      {// setup statistics space for each tree node
        for (size_t i = 0; i < stemp.size(); ++i) {
          stemp[i].resize(tree.param.num_nodes, ThreadEntry());
        }
        snode.resize(tree.param.num_nodes, NodeEntry());
      }
      // accumulate gradient statistics per node, sharded across threads
      const unsigned ndata = static_cast<unsigned>(position.size());
#pragma omp parallel for schedule(static)
      for (unsigned i = 0; i < ndata; ++i) {
        const int tid = omp_get_thread_num();
        if (position[i] < 0) continue;
        stemp[tid][position[i]].stats.Add(gpair[i]);
      }
      // sum the per thread statistics together
      for (size_t j = 0; j < qexpand.size(); ++j) {
        const int nid = qexpand[j];
        TStats stats; stats.Clear();
        for (size_t tid = 0; tid < stemp.size(); ++tid) {
          stats.Add(stemp[tid][nid].stats);
        }
        // update node statistics
        snode[nid].stats = stats;
        snode[nid].root_gain = param.CalcGain(stats);
        snode[nid].weight = param.CalcWeight(stats);
      }
    }
    /*! \brief update queue expand, add in new leaves */
    inline void UpdateQueueExpand(const RegTree &tree, std::vector<int> *p_qexpand) {
      std::vector<int> &qexpand = *p_qexpand;
      std::vector<int> newnodes;
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        if (!tree[ nid ].is_leaf()) {
          // node was split this level: expand its children on the next level
          newnodes.push_back(tree[nid].cleft());
          newnodes.push_back(tree[nid].cright());
        }
      }
      // use new nodes for qexpand
      qexpand = newnodes;
    }
    // enumerate the split values of specific feature
    template<typename Iter>
    inline void EnumerateSplit(Iter it, unsigned fid,
                               const std::vector<bst_gpair> &gpair,
                               std::vector<ThreadEntry> &temp,
                               bool is_forward_search) {
      // clear all the temp statistics
      for (size_t j = 0; j < qexpand.size(); ++j) {
        temp[qexpand[j]].stats.Clear();
      }
      // scan the sorted column, keeping a running prefix statistic per node
      while (it.Next()) {
        const bst_uint ridx = it.rindex();
        const int nid = position[ridx];
        if (nid < 0) continue;
        // start working
        const float fvalue = it.fvalue();
        // get the statistics of nid
        ThreadEntry &e = temp[nid];
        // test if first hit, this is fine, because we set 0 during init
        if (e.stats.Empty()) {
          e.stats.Add(gpair[ridx]);
          e.last_fvalue = fvalue;
        } else {
          // try to find a split, only between sufficiently distinct values
          if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
            // c = statistics of the complement side of the candidate split
            TStats c = snode[nid].stats.Substract(e.stats);
            if (c.sum_hess >= param.min_child_weight) {
              double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
              // split point is the midpoint of two adjacent feature values
              e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
            }
          }
          // update the statistics
          e.stats.Add(gpair[ridx]);
          e.last_fvalue = fvalue;
        }
      }
      // finish updating all statistics, check if it is possible to include all sum statistics
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        ThreadEntry &e = temp[nid];
        TStats c = snode[nid].stats.Substract(e.stats);
        if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
          const double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
          // place the cut just past the last scanned value
          const float delta = is_forward_search ? rt_eps : -rt_eps;
          e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
        }
      }
    }
    // find splits at current level, do split per level
    inline void FindSplit(int depth, const std::vector<int> &qexpand,
                          const std::vector<bst_gpair> &gpair, const FMatrix &fmat,
                          RegTree *p_tree) {
      // per-level column subsampling on top of the per-tree subsample
      std::vector<unsigned> feat_set = feat_index;
      if (param.colsample_bylevel != 1.0f) {
        random::Shuffle(feat_set);
        unsigned n = static_cast<unsigned>(param.colsample_bylevel * feat_index.size());
        utils::Check(n > 0, "colsample_bylevel is too small that no feature can be included");
        feat_set.resize(n);
      }
      // start enumeration: each thread scans whole columns independently
      const unsigned nsize = static_cast<unsigned>(feat_set.size());
#pragma omp parallel for schedule(dynamic, 1)
      for (unsigned i = 0; i < nsize; ++i) {
        const unsigned fid = feat_set[i];
        const int tid = omp_get_thread_num();
        if (param.need_forward_search(fmat.GetColDensity(fid))) {
          this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, stemp[tid], true);
        }
        if (param.need_backward_search(fmat.GetColDensity(fid))) {
          this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, stemp[tid], false);
        }
      }
      // after this each thread's stemp will get the best candidates, aggregate results
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        NodeEntry &e = snode[nid];
        for (int tid = 0; tid < this->nthread; ++tid) {
          e.best.Update(stemp[tid][nid].best);
        }
        // now we know the solution in snode[nid], set split
        if (e.best.loss_chg > rt_eps) {
          p_tree->AddChilds(nid);
          (*p_tree)[nid].set_split(e.best.split_index(), e.best.split_value, e.best.default_left());
        } else {
          // no sufficiently good split: turn this node into a leaf
          (*p_tree)[nid].set_leaf(e.weight * param.learning_rate);
        }
      }
    }
    // reset position of each data points after split is created in the tree
    inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
      // step 1, set default direct nodes to default, and leaf nodes to -1
      const unsigned ndata = static_cast<unsigned>(position.size());
#pragma omp parallel for schedule(static)
      for (unsigned i = 0; i < ndata; ++i) {
        const int nid = position[i];
        if (nid >= 0) {
          if (tree[nid].is_leaf()) {
            position[i] = -1;
          } else {
            // push to default branch, correct latter
            position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
          }
        }
      }
      // step 2, classify the non-default data into right places
      std::vector<unsigned> fsplits;
      for (size_t i = 0; i < qexpand.size(); ++i) {
        const int nid = qexpand[i];
        if (!tree[nid].is_leaf()) fsplits.push_back(tree[nid].split_index());
      }
      // deduplicate the features used by this level's splits
      std::sort(fsplits.begin(), fsplits.end());
      fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
      // start put things into right place
      const unsigned nfeats = static_cast<unsigned>(fsplits.size());
#pragma omp parallel for schedule(dynamic, 1)
      for (unsigned i = 0; i < nfeats; ++i) {
        const unsigned fid = fsplits[i];
        for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
          const bst_uint ridx = it.rindex();
          int nid = position[ridx];
          if (nid == -1) continue;
          // go back to parent, correct those who are not default
          nid = tree[nid].parent();
          if (tree[nid].split_index() == fid) {
            if (it.fvalue() < tree[nid].split_cond()) {
              position[ridx] = tree[nid].cleft();
            } else {
              position[ridx] = tree[nid].cright();
            }
          }
        }
      }
    }
    //--data fields--
    const TrainParam &param;
    // number of omp thread used during training
    int nthread;
    // Per feature: shuffled subset of usable feature indices for this tree
    std::vector<unsigned> feat_index;
    // Instance Data: current node position in the tree of each instance; -1 = dropped
    std::vector<int> position;
    // PerThread x PerTreeNode: statistics for per thread construction
    std::vector< std::vector<ThreadEntry> > stemp;
    /*! \brief TreeNode Data: statistics for each constructed node */
    std::vector<NodeEntry> snode;
    /*! \brief queue of nodes to be expanded */
    std::vector<int> qexpand;
  };
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_TREE_UPDATER_COLMAKER_INL_HPP_
|
||||
67
src/tree/updater_prune-inl.hpp
Normal file
67
src/tree/updater_prune-inl.hpp
Normal file
@@ -0,0 +1,67 @@
|
||||
#ifndef XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
|
||||
#define XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
|
||||
/*!
|
||||
* \file updater_prune-inl.hpp
|
||||
* \brief prune a tree given the statistics
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include "./param.h"
|
||||
#include "./updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
/*! \brief pruner that prunes a tree after growing finishes */
template<typename FMatrix>
class TreePruner: public IUpdater<FMatrix> {
 public:
  virtual ~TreePruner(void) {}
  // set training parameter
  virtual void SetParam(const char *name, const char *val) {
    param.SetParam(name, val);
  }
  // update the tree, do pruning; gpair/fmat/root_index are not used here
  virtual void Update(const std::vector<bst_gpair> &gpair, FMatrix &fmat,
                      const std::vector<unsigned> &root_index,
                      const std::vector<RegTree*> &trees) {
    for (size_t i = 0; i < trees.size(); ++i) {
      this->DoPrune(*trees[i]);
    }
  }

 private:
  // try to prune off current leaf: once both children of the parent are
  // known to be leaves and the recorded loss change does not justify the
  // split, collapse the parent into a leaf and continue upwards
  inline void TryPruneLeaf(RegTree &tree, int nid, int depth) {
    if (tree[nid].is_root()) return;
    int pid = tree[nid].parent();
    RegTree::NodeStat &s = tree.stat(pid);
    // record that one more child of pid turned out to be a leaf
    ++s.leaf_child_cnt;

    if (s.leaf_child_cnt >= 2 && param.need_prune(s.loss_chg, depth - 1)) {
      // need to be pruned
      tree.ChangeToLeaf(pid, param.learning_rate * s.base_weight);
      // tail recursion: the parent is now a leaf, so try pruning its parent
      this->TryPruneLeaf(tree, pid, depth - 1);
    }
  }
  /*! \brief do pruning of a tree, bottom-up from every current leaf */
  inline void DoPrune(RegTree &tree) {
    // initialize auxiliary statistics
    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
      tree.stat(nid).leaf_child_cnt = 0;
    }
    for (int nid = 0; nid < tree.param.num_nodes; ++nid) {
      if (tree[nid].is_leaf()) {
        this->TryPruneLeaf(tree, nid, tree.GetDepth(nid));
      }
    }
  }

 private:
  // training parameter
  TrainParam param;
};
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_TREE_UPDATER_PRUNE_INL_HPP_
|
||||
196
src/utils/config.h
Normal file
196
src/utils/config.h
Normal file
@@ -0,0 +1,196 @@
|
||||
#ifndef XGBOOST_UTILS_CONFIG_H_
|
||||
#define XGBOOST_UTILS_CONFIG_H_
|
||||
/*!
|
||||
* \file config.h
|
||||
* \brief helper class to load in configures from file
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <istream>
|
||||
#include <fstream>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief base implementation of config reader:
 *        tokenizes "name = value" pairs; supports '#' line comments,
 *        "..." quoted values (single line) and '...' quoted values (may span lines)
 */
class ConfigReaderBase {
 public:
  /*!
   * \brief get current name, called after Next returns true
   * \return current parameter name
   */
  inline const char *name(void) const {
    return s_name;
  }
  /*!
   * \brief get current value, called after Next returns true
   * \return current parameter value
   */
  inline const char *val(void) const {
    return s_val;
  }
  /*!
   * \brief move iterator to next position
   * \return true if there is value in next position
   */
  inline bool Next(void) {
    while (!this->IsEnd()) {
      // expect the triple <name> '=' <value> on one logical line;
      // GetNextToken returns true when a newline was crossed, which would
      // split the triple apart and makes the entry invalid
      GetNextToken(s_name);
      if (s_name[0] == '=') return false;
      if (GetNextToken( s_buf ) || s_buf[0] != '=') return false;
      if (GetNextToken( s_val ) || s_val[0] == '=') return false;
      return true;
    }
    return false;
  }
  // called before usage: primes the one-character lookahead buffer
  inline void Init(void) {
    ch_buf = this->GetChar();
  }

 protected:
  /*!
   * \brief to be implemented by subclass,
   * get next character, return EOF if end of file
   */
  virtual char GetChar(void) = 0;
  /*! \brief to be implemented by child, check if end of stream */
  virtual bool IsEnd(void) = 0;

 private:
  // one-character lookahead
  char ch_buf;
  // token buffers; note: filling is not bounds-checked, a token longer than
  // 100000 characters would overflow - assumed never to occur in practice
  char s_name[100000], s_val[100000], s_buf[100000];

  // skip the remainder of the current line (used for '#' comments)
  inline void SkipLine(void) {
    do {
      ch_buf = this->GetChar();
    } while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
  }

  // parse a double-quoted string into tok; '\' escapes the next character,
  // the closing quote must appear before end of line
  inline void ParseStr(char tok[]) {
    int i = 0;
    while ((ch_buf = this->GetChar()) != EOF) {
      switch (ch_buf) {
        case '\\': tok[i++] = this->GetChar(); break;
        case '\"': tok[i++] = '\0'; return;
        case '\r':
        case '\n': Error("ConfigReader: unterminated string");
        default: tok[i++] = ch_buf;
      }
    }
    Error("ConfigReader: unterminated string");
  }
  // parse a single-quoted string into tok; unlike ParseStr it may span lines
  inline void ParseStrML(char tok[]) {
    int i = 0;
    while ((ch_buf = this->GetChar()) != EOF) {
      switch (ch_buf) {
        case '\\': tok[i++] = this->GetChar(); break;
        case '\'': tok[i++] = '\0'; return;
        default: tok[i++] = ch_buf;
      }
    }
    Error("unterminated string");
  }
  // scan the next whitespace-delimited token into tok;
  // return value is true when a newline was crossed before the token
  inline bool GetNextToken(char tok[]) {
    int i = 0;
    bool new_line = false;
    while (ch_buf != EOF) {
      switch (ch_buf) {
        case '#' : SkipLine(); new_line = true; break;
        case '\"':
          if (i == 0) {
            ParseStr(tok); ch_buf = this->GetChar(); return new_line;
          } else {
            // Error is expected not to return; otherwise control would
            // fall through into the next case label
            Error("ConfigReader: token followed directly by string");
          }
        case '\'':
          if (i == 0) {
            ParseStrML( tok ); ch_buf = this->GetChar(); return new_line;
          } else {
            Error("ConfigReader: token followed directly by string");
          }
        case '=':
          // '=' is a token by itself and also terminates the current token
          if (i == 0) {
            ch_buf = this->GetChar();
            tok[0] = '=';
            tok[1] = '\0';
          } else {
            tok[i] = '\0';
          }
          return new_line;
        case '\r':
        case '\n':
          if (i == 0) new_line = true;
          // intentional fallthrough: a newline is also treated as whitespace
        case '\t':
        case ' ' :
          ch_buf = this->GetChar();
          if (i > 0) {
            // whitespace terminates the token being built
            tok[i] = '\0';
            return new_line;
          }
          break;
        default:
          tok[i++] = ch_buf;
          ch_buf = this->GetChar();
          break;
      }
    }
    return true;
  }
};
|
||||
/*!
 * \brief stream-based config reader; works with any std::istream
 */
class ConfigStreamReader: public ConfigReaderBase {
 public:
  /*!
   * \brief constructor
   * \param fin input stream; must outlive this reader
   */
  explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}

 protected:
  // NOTE(review): istream::get() returns int; narrowing it to char can make
  // the EOF sentinel ambiguous on platforms with unsigned char - confirm
  virtual char GetChar(void) {
    return fin.get();
  }
  /*! \brief check whether the end of the stream has been reached */
  virtual bool IsEnd(void) {
    return fin.eof();
  }

 private:
  // underlying input stream (held by reference, not owned)
  std::istream &fin;
};
|
||||
|
||||
/*!
 * \brief an iterator that iterates over a configure file and gets the configures
 */
class ConfigIterator: public ConfigStreamReader {
 public:
  /*!
   * \brief constructor
   * \param fname name of configure file
   */
  explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
    // passing the not-yet-opened member stream to the base is fine: the
    // base class only stores a reference to it
    fi.open(fname);
    if (fi.fail()) {
      utils::Error("cannot open file %s", fname);
    }
    // prime the single-character lookahead of the tokenizer
    ConfigReaderBase::Init();
  }
  /*! \brief destructor; closes the underlying file */
  ~ConfigIterator(void) {
    fi.close();
  }

 private:
  // owned file stream referenced by the base class
  std::ifstream fi;
};
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_UTILS_CONFIG_H_
|
||||
80
src/utils/fmap.h
Normal file
80
src/utils/fmap.h
Normal file
@@ -0,0 +1,80 @@
|
||||
#ifndef XGBOOST_UTILS_FMAP_H_
|
||||
#define XGBOOST_UTILS_FMAP_H_
|
||||
/*!
|
||||
* \file fmap.h
|
||||
* \brief helper class that holds the feature names and interpretations
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*! \brief helper class that holds the feature names and interpretations */
|
||||
class FeatMap {
|
||||
public:
|
||||
enum Type {
|
||||
kIndicator = 0,
|
||||
kQuantitive = 1,
|
||||
kInteger = 2,
|
||||
kFloat = 3
|
||||
};
|
||||
// function definitions
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(const char *fname) {
|
||||
FILE *fi = utils::FopenCheck(fname, "r");
|
||||
this->LoadText(fi);
|
||||
fclose(fi);
|
||||
}
|
||||
/*! \brief load feature map from text format */
|
||||
inline void LoadText(FILE *fi) {
|
||||
int fid;
|
||||
char fname[1256], ftype[1256];
|
||||
while (fscanf(fi, "%d\t%[^\t]\t%s\n", &fid, fname, ftype) == 3) {
|
||||
this->PushBack(fid, fname, ftype);
|
||||
}
|
||||
}
|
||||
/*!\brief push back feature map */
|
||||
inline void PushBack(int fid, const char *fname, const char *ftype) {
|
||||
utils::Check(fid == static_cast<int>(names_.size()), "invalid fmap format");
|
||||
names_.push_back(std::string(fname));
|
||||
types_.push_back(GetType(ftype));
|
||||
}
|
||||
inline void Clear(void) {
|
||||
names_.clear(); types_.clear();
|
||||
}
|
||||
/*! \brief number of known features */
|
||||
size_t size(void) const {
|
||||
return names_.size();
|
||||
}
|
||||
/*! \brief return name of specific feature */
|
||||
const char* name(size_t idx) const {
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return names_[idx].c_str();
|
||||
}
|
||||
/*! \brief return type of specific feature */
|
||||
const Type& type(size_t idx) const {
|
||||
utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound");
|
||||
return types_[idx];
|
||||
}
|
||||
|
||||
private:
|
||||
inline static Type GetType(const char *tname) {
|
||||
if (!strcmp("i", tname)) return kIndicator;
|
||||
if (!strcmp("q", tname)) return kQuantitive;
|
||||
if (!strcmp("int", tname)) return kInteger;
|
||||
if (!strcmp("float", tname)) return kFloat;
|
||||
utils::Error("unknown feature type, use i for indicator and q for quantity");
|
||||
return kIndicator;
|
||||
}
|
||||
/*! \brief name of the feature */
|
||||
std::vector<std::string> names_;
|
||||
/*! \brief type of the feature */
|
||||
std::vector<Type> types_;
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_FMAP_H_
|
||||
104
src/utils/io.h
Normal file
104
src/utils/io.h
Normal file
@@ -0,0 +1,104 @@
|
||||
#ifndef XGBOOST_UTILS_IO_H
|
||||
#define XGBOOST_UTILS_IO_H
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "./utils.h"
|
||||
/*!
|
||||
* \file io.h
|
||||
* \brief general stream interface for serialization, I/O
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief interface of stream I/O, used to serialize model
 */
class IStream {
 public:
  /*!
   * \brief read data from stream
   * \param ptr pointer to memory buffer
   * \param size size of block
   * \return usually is the size of data readed
   */
  virtual size_t Read(void *ptr, size_t size) = 0;
  /*!
   * \brief write data to stream
   * \param ptr pointer to memory buffer
   * \param size size of block
   */
  virtual void Write(const void *ptr, size_t size) = 0;
  /*! \brief virtual destructor */
  virtual ~IStream(void) {}

 public:
  // helper functions to write various of data structures
  // NOTE(review): uint64_t needs <cstdint>; currently relies on transitive
  // includes - confirm
  /*!
   * \brief binary serialize a vector (length prefix followed by raw elements)
   * \param vec vector to be serialized
   */
  template<typename T>
  inline void Write(const std::vector<T> &vec) {
    uint64_t sz = vec.size();
    this->Write(&sz, sizeof(sz));
    // guard: &vec[0] is undefined on an empty vector
    if (sz != 0) {
      this->Write(&vec[0], sizeof(T) * sz);
    }
  }
  /*!
   * \brief binary load a vector written by Write(vector)
   * \param out_vec vector to be loaded
   * \return whether load is successful
   */
  template<typename T>
  inline bool Read(std::vector<T> *out_vec) {
    uint64_t sz;
    if (this->Read(&sz, sizeof(sz)) == 0) return false;
    out_vec->resize(sz);
    // skip the payload read when empty: &(*out_vec)[0] would be undefined,
    // and a zero-byte Read reporting 0 must not make an empty vector fail
    if (sz != 0) {
      if (this->Read(&(*out_vec)[0], sizeof(T) * sz) == 0) return false;
    }
    return true;
  }
  /*!
   * \brief binary serialize a string (length prefix followed by characters)
   * \param str the string to be serialized
   */
  inline void Write(const std::string &str) {
    uint64_t sz = str.length();
    this->Write(&sz, sizeof(sz));
    if (sz != 0) {
      this->Write(&str[0], sizeof(char) * sz);
    }
  }
  /*!
   * \brief binary load a string written by Write(string)
   * \param out_str string to be loaded
   * \return whether load is successful
   */
  inline bool Read(std::string *out_str) {
    uint64_t sz;
    if (this->Read(&sz, sizeof(sz)) == 0) return false;
    out_str->resize(sz);
    if (sz != 0) {
      if (this->Read(&(*out_str)[0], sizeof(char) * sz) == 0) return false;
    }
    return true;
  }
};
|
||||
|
||||
/*! \brief implementation of file i/o stream */
|
||||
class FileStream : public IStream {
|
||||
private:
|
||||
FILE *fp;
|
||||
public:
|
||||
explicit FileStream(FILE *fp) {
|
||||
this->fp = fp;
|
||||
}
|
||||
virtual size_t Read(void *ptr, size_t size) {
|
||||
return fread(ptr, size, 1, fp);
|
||||
}
|
||||
virtual void Write(const void *ptr, size_t size) {
|
||||
fwrite(ptr, size, 1, fp);
|
||||
}
|
||||
inline void Close(void) {
|
||||
fclose(fp);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
40
src/utils/iterator.h
Normal file
40
src/utils/iterator.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef XGBOOST_UTILS_ITERATOR_H
|
||||
#define XGBOOST_UTILS_ITERATOR_H
|
||||
#include <cstdio>
|
||||
/*!
 * \file iterator.h
 * \brief iterator interface
 * \author Tianqi Chen
 */
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief iterator interface
 * \tparam DType data type
 */
template<typename DType>
class IIterator {
 public:
  /*!
   * \brief set the parameter
   * \param name name of parameter
   * \param val value of parameter
   */
  virtual void SetParam(const char *name, const char *val) = 0;
  /*! \brief initialize the iterator so that we can use the iterator */
  virtual void Init(void) = 0;
  /*! \brief set before first of the item */
  virtual void BeforeFirst(void) = 0;
  /*! \brief move to next item */
  virtual bool Next(void) = 0;
  /*! \brief get current data */
  virtual const DType &Value(void) const = 0;
 public:
  /*! \brief virtual destructor: iterators are deleted through this interface */
  virtual ~IIterator(void) {}
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
|
||||
123
src/utils/matrix_csr.h
Normal file
123
src/utils/matrix_csr.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#ifndef XGBOOST_UTILS_MATRIX_CSR_H_
|
||||
#define XGBOOST_UTILS_MATRIX_CSR_H_
|
||||
/*!
|
||||
* \file matrix_csr.h
|
||||
* \brief this file defines some easy to use STL based class for in memory sparse CSR matrix
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "./utils.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace utils {
|
||||
/*!
 * \brief a class used to help construct CSR format matrix,
 *  can be used to convert row major CSR to column major CSR
 *
 *  usage protocol: InitBudget -> AddBudget (once per element) ->
 *  InitStorage -> PushElem (exactly as many calls as budget added)
 *  [-> Cleanup, only when the active list is used]
 * \tparam IndexType type of index used to store the index position, usually unsigned or size_t
 * \tparam UseAcList whether enabling the usage of aclist (active-row list,
 *  useful when many rows are empty); this option must be enabled manually
 *  by using the three-argument constructor
 */
template<typename IndexType, bool UseAcList = false>
struct SparseCSRMBuilder {
 private:
  /*! \brief dummy variable used in the indicator matrix construction
   *  (bound to aclist when the caller does not supply one) */
  std::vector<size_t> dummy_aclist;
  /*! \brief pointer to each of the row */
  std::vector<size_t> &rptr;
  /*! \brief index of nonzero entries in each row */
  std::vector<IndexType> &findex;
  /*! \brief a list of active rows, used when many rows are empty */
  std::vector<size_t> &aclist;

 public:
  /*! \brief construct a builder without an active list; only valid when UseAcList is false */
  SparseCSRMBuilder(std::vector<size_t> &p_rptr,
                    std::vector<IndexType> &p_findex)
      :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
    Assert(!UseAcList, "enabling bug");
  }
  /*! \brief use with caution! rptr must be cleaned before use */
  SparseCSRMBuilder(std::vector<size_t> &p_rptr,
                    std::vector<IndexType> &p_findex,
                    std::vector<size_t> &p_aclist)
      :rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
    Assert(UseAcList, "must manually enable the option use aclist");
  }

 public:
  /*!
   * \brief step 1: initialize the number of rows in the data, not necessary exact
   * \param nrows number of rows in the matrix, can be smaller than expected
   */
  inline void InitBudget(size_t nrows = 0) {
    if (!UseAcList) {
      rptr.clear();
      rptr.resize(nrows + 1, 0);
    } else {
      // with an active list rptr persists across uses: it must already be
      // sized, and only the rows touched last round are reset by Cleanup()
      Assert(nrows + 1 == rptr.size(), "rptr must be initialized already");
      this->Cleanup();
    }
  }
  /*!
   * \brief step 2: add budget to each rows
   * \param row_id the id of the row
   * \param nelem number of element budget add to this row
   */
  inline void AddBudget(size_t row_id, size_t nelem = 1) {
    if (rptr.size() < row_id + 2) {
      rptr.resize(row_id + 2, 0);
    }
    if (UseAcList) {
      // first budget for this row: remember it as active
      if (rptr[row_id + 1] == 0) aclist.push_back(row_id);
    }
    // rptr[row_id + 1] temporarily stores the element count of row row_id
    rptr[row_id + 1] += nelem;
  }
  /*! \brief step 3: initialize the necessary storage */
  inline void InitStorage(void) {
    // initialize rptr to be beginning of each segment: turn the per-row
    // counts stored in rptr[i] into exclusive prefix sums, so that
    // rptr[row_id + 1] becomes the write cursor for row row_id (PushElem
    // then advances it until it equals the true end of the row)
    size_t start = 0;
    if (!UseAcList) {
      for (size_t i = 1; i < rptr.size(); i++) {
        size_t rlen = rptr[i];
        rptr[i] = start;
        start += rlen;
      }
    } else {
      // case with active list: only rows that received budget are touched
      std::sort(aclist.begin(), aclist.end());
      for (size_t i = 0; i < aclist.size(); i++) {
        size_t ridx = aclist[i];
        size_t rlen = rptr[ridx + 1];
        rptr[ridx + 1] = start;
        // set previous rptr to right position if previous feature is not active
        if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start;
        start += rlen;
      }
    }
    findex.resize(start);
  }
  /*!
   * \brief step 4:
   * used in indicator matrix construction, add new
   * element to each row, the number of calls shall be exactly same as add_budget
   */
  inline void PushElem(size_t row_id, IndexType col_id) {
    // rptr[row_id + 1] is the current write cursor of row row_id; after the
    // row is full it naturally equals the start offset of the next row
    size_t &rp = rptr[row_id + 1];
    findex[rp++] = col_id;
  }
  /*!
   * \brief step 5: only needed when aclist is used,
   * clean up the rptr entries of the active rows for next usage
   */
  inline void Cleanup(void) {
    Assert(UseAcList, "this function can only be called use AcList");
    for (size_t i = 0; i < aclist.size(); i++) {
      const size_t ridx = aclist[i];
      rptr[ridx] = 0; rptr[ridx + 1] = 0;
    }
    aclist.clear();
  }
};
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
16
src/utils/omp.h
Normal file
16
src/utils/omp.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#ifndef XGBOOST_UTILS_OMP_H_
#define XGBOOST_UTILS_OMP_H_
/*!
 * \file omp.h
 * \brief header to handle OpenMP compatibility issues;
 *  when OpenMP is unavailable it provides single-threaded stubs so the
 *  rest of the code can call the omp_* runtime API unconditionally
 * \author Tianqi Chen
 */
#if defined(_OPENMP)
#include <omp.h>
#else
#warning "OpenMP is not available, compile to single thread code"
// single-threaded fallbacks mirroring the OpenMP runtime API:
// exactly one thread with id 0; requests to change the count are ignored
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
inline void omp_set_num_threads(int nthread) {}
#endif
#endif  // XGBOOST_UTILS_OMP_H_
|
||||
102
src/utils/random.h
Normal file
102
src/utils/random.h
Normal file
@@ -0,0 +1,102 @@
|
||||
#ifndef XGBOOST_UTILS_RANDOM_H_
|
||||
#define XGBOOST_UTILS_RANDOM_H_
|
||||
/*!
 * \file random.h
 * \brief PRNG to support random number generation
 * \author Tianqi Chen: tianqi.tchen@gmail.com
 *
 * Use standard PRNG from stdlib
 */
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include "./utils.h"
|
||||
|
||||
/*! namespace of PRNG */
|
||||
namespace xgboost {
|
||||
namespace random {
|
||||
|
||||
/*! \brief seed the global PRNG with the given value */
inline void Seed(uint32_t seed) {
  srand(seed);
}
/*! \brief draw a real number uniformly from the half-open interval [0,1) */
inline double NextDouble(void) {
  const double denom = static_cast<double>(RAND_MAX) + 1.0;
  return static_cast<double>(rand()) / denom;
}
/*! \brief draw a real number uniformly from the open interval (0,1) */
inline double NextDouble2(void) {
  const double denom = static_cast<double>(RAND_MAX) + 2.0;
  return (static_cast<double>(rand()) + 1.0) / denom;
}
/*! \brief draw a random 32-bit unsigned integer */
inline uint32_t NextUInt32(void) {
  return static_cast<uint32_t>(rand());
}
/*! \brief draw an integer uniformly from [0, n) */
inline uint32_t NextUInt32(uint32_t n) {
  return static_cast<uint32_t>(std::floor(NextDouble() * n));
}
|
||||
/*! \brief return x~N(0,1) */
|
||||
inline double SampleNormal() {
|
||||
double x, y, s;
|
||||
do {
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
|
||||
return x * sqrt(-2.0 * log(s) / s);
|
||||
}
|
||||
|
||||
/*! \brief return iid x,y ~N(0,1) */
|
||||
inline void SampleNormal2D(double &xx, double &yy) {
|
||||
double x, y, s;
|
||||
do {
|
||||
x = 2 * NextDouble2() - 1.0;
|
||||
y = 2 * NextDouble2() - 1.0;
|
||||
s = x*x + y*y;
|
||||
} while (s >= 1.0 || s == 0.0);
|
||||
double t = sqrt(-2.0 * log(s) / s);
|
||||
xx = x * t;
|
||||
yy = y * t;
|
||||
}
|
||||
/*! \brief return x~N(mu,sigma^2) */
|
||||
inline double SampleNormal(double mu, double sigma) {
|
||||
return SampleNormal() * sigma + mu;
|
||||
}
|
||||
/*! \brief return 1 with probability p, coin flip */
|
||||
inline int SampleBinary(double p) {
|
||||
return NextDouble() < p;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline void Shuffle(T *data, size_t sz) {
|
||||
if (sz == 0) return;
|
||||
for (uint32_t i = (uint32_t)sz - 1; i > 0; i--){
|
||||
std::swap(data[i], data[NextUInt32(i + 1)]);
|
||||
}
|
||||
}
|
||||
// random shuffle the data inside, require PRNG
|
||||
template<typename T>
|
||||
inline void Shuffle(std::vector<T> &data) {
|
||||
Shuffle(&data[0], data.size());
|
||||
}
|
||||
|
||||
/*! \brief random number generator carrying its own independent seed,
 *  so multiple instances produce independent, reproducible streams */
struct Random{
  /*! \brief set random number seed */
  inline void Seed(unsigned sd) {
    rseed = sd;
  }
  /*! \brief return a real number uniform in [0,1) */
  inline double RandDouble(void) {
    // rand_r advances this instance's private state only
    const double denom = static_cast<double>(RAND_MAX) + 1.0;
    return static_cast<double>(rand_r(&rseed)) / denom;
  }
  // per-instance random number seed (state consumed by rand_r)
  unsigned rseed;
};
|
||||
} // namespace random
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_UTILS_RANDOM_H_
|
||||
94
src/utils/utils.h
Normal file
94
src/utils/utils.h
Normal file
@@ -0,0 +1,94 @@
|
||||
#ifndef XGBOOST_UTILS_UTILS_H_
|
||||
#define XGBOOST_UTILS_UTILS_H_
|
||||
/*!
|
||||
* \file utils.h
|
||||
* \brief simple utils to support the code
|
||||
* \author Tianqi Chen
|
||||
*/
|
||||
// silence MSVC deprecation warnings for fopen and friends
#define _CRT_SECURE_NO_WARNINGS
#ifdef _MSC_VER
// MSVC has no fopen64; plain fopen is used instead
#define fopen64 fopen
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
#warning "FILE OFFSET BITS defined to be 32 bit"
#endif
#endif

#ifdef __APPLE__
// macOS: off_t is already 64-bit and there is no separate fopen64
#define off64_t off_t
#define fopen64 fopen
#endif

// NOTE(review): this define may come after system headers have already been
// included by the translation unit -- confirm large-file support still takes
// effect where it matters
#define _FILE_OFFSET_BITS 64
extern "C" {
#include <sys/types.h>
};
#endif
|
||||
|
||||
#ifdef _MSC_VER
// MSVC (pre-VS2010) lacks <inttypes.h>/<stdint.h>; define the fixed-width
// types by hand. NOTE: Windows is LLP64, so `long` is 32-bit even on x64 --
// the 64-bit types must use __int64, not long.
typedef unsigned char uint8_t;
typedef unsigned short int uint16_t;
typedef unsigned int uint32_t;
typedef unsigned __int64 uint64_t;
typedef __int64 int64_t;
#else
#include <inttypes.h>
#endif
|
||||
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace xgboost {
|
||||
/*! \brief namespace for helper utils of the project */
|
||||
namespace utils {
|
||||
|
||||
/*! \brief assert that a condition is true; on failure print a formatted
 *  debug message prefixed with "AssertError:" to stderr and terminate */
inline void Assert(bool exp, const char *fmt, ...) {
  if (exp) return;  // fast path: nothing to do when the condition holds
  va_list args;
  va_start(args, fmt);
  fprintf(stderr, "AssertError:");
  vfprintf(stderr, fmt, args);
  va_end(args);
  fprintf(stderr, "\n");
  exit(-1);
}
|
||||
|
||||
/*!\brief same as Assert, but the message is intended for the end user
 *  (no "AssertError:" prefix); prints to stderr and terminates on failure */
inline void Check(bool exp, const char *fmt, ...) {
  if (exp) return;  // condition holds: no message, no exit
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fprintf(stderr, "\n");
  exit(-1);
}
|
||||
|
||||
/*! \brief unconditionally report a formatted error message to stderr and
 *  terminate the process; same output shape as a failed Check */
inline void Error(const char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  va_end(args);
  fprintf(stderr, "\n");
  exit(-1);
}
|
||||
|
||||
/*! \brief replace fopen, report error when the file open fails */
|
||||
inline FILE *FopenCheck(const char *fname, const char *flag) {
|
||||
FILE *fp = fopen64(fname, flag);
|
||||
Check(fp != NULL, "can not open file \"%s\"\n", fname);
|
||||
return fp;
|
||||
}
|
||||
|
||||
} // namespace utils
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_UTILS_UTILS_H_
|
||||
Reference in New Issue
Block a user