complete refactor data.h, now relies on iterator to access column
This commit is contained in:
@@ -18,8 +18,7 @@ namespace gbm {
|
||||
* \brief gradient boosted linear model
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class GBLinear : public IGradBooster<FMatrix> {
|
||||
class GBLinear : public IGradBooster {
|
||||
public:
|
||||
virtual ~GBLinear(void) {
|
||||
}
|
||||
@@ -41,13 +40,12 @@ class GBLinear : public IGradBooster<FMatrix> {
|
||||
virtual void InitModel(void) {
|
||||
model.InitModel();
|
||||
}
|
||||
virtual void DoBoost(const FMatrix &fmat,
|
||||
virtual void DoBoost(IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
std::vector<bst_gpair> *in_gpair) {
|
||||
this->InitFeatIndex(fmat);
|
||||
std::vector<bst_gpair> &gpair = *in_gpair;
|
||||
const int ngroup = model.param.num_output_group;
|
||||
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
|
||||
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
|
||||
// for all the output group
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
@@ -72,42 +70,46 @@ class GBLinear : public IGradBooster<FMatrix> {
|
||||
}
|
||||
}
|
||||
}
|
||||
// number of features
|
||||
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
||||
const bst_uint fid = feat_index[i];
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
|
||||
const float v = it.fvalue();
|
||||
bst_gpair &p = gpair[it.rindex() * ngroup + gid];
|
||||
if (p.hess < 0.0f) continue;
|
||||
sum_grad += p.grad * v;
|
||||
sum_hess += p.hess * v * v;
|
||||
}
|
||||
float &w = model[fid][gid];
|
||||
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
|
||||
w += dw;
|
||||
// update grad value
|
||||
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
|
||||
bst_gpair &p = gpair[it.rindex() * ngroup + gid];
|
||||
if (p.hess < 0.0f) continue;
|
||||
p.grad += p.hess * it.fvalue() * dw;
|
||||
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
|
||||
while (iter->Next()) {
|
||||
// number of features
|
||||
const ColBatch &batch = iter->Value();
|
||||
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nfeat; ++i) {
|
||||
const bst_uint fid = batch.col_index[i];
|
||||
ColBatch::Inst col = batch[i];
|
||||
for (int gid = 0; gid < ngroup; ++gid) {
|
||||
double sum_grad = 0.0, sum_hess = 0.0;
|
||||
for (bst_uint j = 0; j < col.length; ++j) {
|
||||
const float v = col[j].fvalue;
|
||||
bst_gpair &p = gpair[col[j].index * ngroup + gid];
|
||||
if (p.hess < 0.0f) continue;
|
||||
sum_grad += p.grad * v;
|
||||
sum_hess += p.hess * v * v;
|
||||
}
|
||||
float &w = model[fid][gid];
|
||||
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
|
||||
w += dw;
|
||||
// update grad value
|
||||
for (bst_uint j = 0; j < col.length; ++j) {
|
||||
bst_gpair &p = gpair[col[j].index * ngroup + gid];
|
||||
if (p.hess < 0.0f) continue;
|
||||
p.grad += p.hess * col[j].fvalue * dw;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
virtual void Predict(IFMatrix *p_fmat,
|
||||
int64_t buffer_offset,
|
||||
const BoosterInfo &info,
|
||||
std::vector<float> *out_preds) {
|
||||
std::vector<float> &preds = *out_preds;
|
||||
preds.resize(0);
|
||||
// start collecting the prediction
|
||||
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
|
||||
const int ngroup = model.param.num_output_group;
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
@@ -134,18 +136,6 @@ class GBLinear : public IGradBooster<FMatrix> {
|
||||
}
|
||||
|
||||
protected:
|
||||
inline void InitFeatIndex(const FMatrix &fmat) {
|
||||
if (feat_index.size() != 0) return;
|
||||
// initialize feature index
|
||||
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
|
||||
feat_index.reserve(ncol);
|
||||
for (unsigned i = 0; i < ncol; ++i) {
|
||||
if (fmat.GetColSize(i) != 0) {
|
||||
feat_index.push_back(i);
|
||||
}
|
||||
}
|
||||
random::Shuffle(feat_index);
|
||||
}
|
||||
inline void Pred(const RowBatch::Inst &inst, float *preds) {
|
||||
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
|
||||
float psum = model.bias()[gid];
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
*/
|
||||
#include <vector>
|
||||
#include "../data.h"
|
||||
#include "../utils/io.h"
|
||||
#include "../utils/fmap.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -14,9 +15,7 @@ namespace xgboost {
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief interface of gradient boosting model
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class IGradBooster {
|
||||
public:
|
||||
/*!
|
||||
@@ -41,17 +40,17 @@ class IGradBooster {
|
||||
virtual void InitModel(void) = 0;
|
||||
/*!
|
||||
* \brief perform update to the model (boosting)
|
||||
* \param fmat feature matrix that provide access to features
|
||||
* \param p_fmat feature matrix that provide access to features
|
||||
* \param info meta information about training
|
||||
* \param in_gpair address of the gradient pair statistics of the data
|
||||
* the booster may change content of gpair
|
||||
*/
|
||||
virtual void DoBoost(const FMatrix &fmat,
|
||||
virtual void DoBoost(IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
std::vector<bst_gpair> *in_gpair) = 0;
|
||||
/*!
|
||||
* \brief generate predictions for given feature matrix
|
||||
* \param fmat feature matrix
|
||||
* \param p_fmat feature matrix
|
||||
* \param buffer_offset buffer index offset of these instances, if equals -1
|
||||
* this means we do not have buffer index allocated to the gbm
|
||||
* a buffer index is assigned to each instance that requires repeated prediction
|
||||
@@ -59,7 +58,7 @@ class IGradBooster {
|
||||
* \param info extra side information that may be needed for prediction
|
||||
* \param out_preds output vector to hold the predictions
|
||||
*/
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
virtual void Predict(IFMatrix *p_fmat,
|
||||
int64_t buffer_offset,
|
||||
const BoosterInfo &info,
|
||||
std::vector<float> *out_preds) = 0;
|
||||
@@ -73,21 +72,11 @@ class IGradBooster {
|
||||
// destructor
|
||||
virtual ~IGradBooster(void){}
|
||||
};
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
|
||||
#include "gbtree-inl.hpp"
|
||||
#include "gblinear-inl.hpp"
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
template<typename FMatrix>
|
||||
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
|
||||
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
|
||||
if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
|
||||
utils::Error("unknown booster type: %s", name);
|
||||
return NULL;
|
||||
}
|
||||
/*!
|
||||
* \brief create a gradient booster from given name
|
||||
* \param name name of gradient booster
|
||||
*/
|
||||
IGradBooster* CreateGradBooster(const char *name);
|
||||
} // namespace gbm
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_GBM_GBM_H_
|
||||
|
||||
@@ -9,16 +9,15 @@
|
||||
#include <utility>
|
||||
#include <string>
|
||||
#include "./gbm.h"
|
||||
#include "../utils/omp.h"
|
||||
#include "../tree/updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace gbm {
|
||||
/*!
|
||||
* \brief gradient boosted tree
|
||||
* \tparam FMatrix the data type updater taking
|
||||
*/
|
||||
template<typename FMatrix>
|
||||
class GBTree : public IGradBooster<FMatrix> {
|
||||
class GBTree : public IGradBooster {
|
||||
public:
|
||||
virtual ~GBTree(void) {
|
||||
this->Clear();
|
||||
@@ -82,12 +81,12 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
|
||||
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
|
||||
}
|
||||
virtual void DoBoost(const FMatrix &fmat,
|
||||
virtual void DoBoost(IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
std::vector<bst_gpair> *in_gpair) {
|
||||
const std::vector<bst_gpair> &gpair = *in_gpair;
|
||||
if (mparam.num_output_group == 1) {
|
||||
this->BoostNewTrees(gpair, fmat, info, 0);
|
||||
this->BoostNewTrees(gpair, p_fmat, info, 0);
|
||||
} else {
|
||||
const int ngroup = mparam.num_output_group;
|
||||
utils::Check(gpair.size() % ngroup == 0,
|
||||
@@ -99,11 +98,11 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
tmp[i] = gpair[i * ngroup + gid];
|
||||
}
|
||||
this->BoostNewTrees(tmp, fmat, info, gid);
|
||||
this->BoostNewTrees(tmp, p_fmat, info, gid);
|
||||
}
|
||||
}
|
||||
}
|
||||
virtual void Predict(const FMatrix &fmat,
|
||||
virtual void Predict(IFMatrix *p_fmat,
|
||||
int64_t buffer_offset,
|
||||
const BoosterInfo &info,
|
||||
std::vector<float> *out_preds) {
|
||||
@@ -121,7 +120,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
const size_t stride = info.num_row * mparam.num_output_group;
|
||||
preds.resize(stride * (mparam.size_leaf_vector+1));
|
||||
// start collecting the prediction
|
||||
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
|
||||
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const RowBatch &batch = iter->Value();
|
||||
@@ -172,7 +171,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
char *pstr;
|
||||
pstr = strtok(&tval[0], ",");
|
||||
while (pstr != NULL) {
|
||||
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
|
||||
updaters.push_back(tree::CreateUpdater(pstr));
|
||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||
// set parameters
|
||||
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
|
||||
@@ -183,7 +182,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
}
|
||||
// boost new trees for one specific output group
|
||||
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
||||
const FMatrix &fmat,
|
||||
IFMatrix *p_fmat,
|
||||
const BoosterInfo &info,
|
||||
int bst_group) {
|
||||
this->InitUpdater();
|
||||
@@ -198,7 +197,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
}
|
||||
// update the trees
|
||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||
updaters[i]->Update(gpair, fmat, info, new_trees);
|
||||
updaters[i]->Update(gpair, p_fmat, info, new_trees);
|
||||
}
|
||||
// push back to model
|
||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||
@@ -361,7 +360,7 @@ class GBTree : public IGradBooster<FMatrix> {
|
||||
// temporary storage, one entry per thread
|
||||
std::vector<tree::RegTree::FVec> thread_temp;
|
||||
// the updaters that can be applied to each of tree
|
||||
std::vector< tree::IUpdater<FMatrix>* > updaters;
|
||||
std::vector<tree::IUpdater*> updaters;
|
||||
};
|
||||
|
||||
} // namespace gbm
|
||||
|
||||
Reference in New Issue
Block a user