complete refactor of data.h, now relies on iterator to access columns

This commit is contained in:
tqchen@graphlab.com
2014-08-27 17:00:21 -07:00
parent a59f8945dc
commit 605269133e
15 changed files with 216 additions and 492 deletions

View File

@@ -18,8 +18,7 @@ namespace gbm {
* \brief gradient boosted linear model
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class GBLinear : public IGradBooster<FMatrix> {
class GBLinear : public IGradBooster {
public:
virtual ~GBLinear(void) {
}
@@ -41,13 +40,12 @@ class GBLinear : public IGradBooster<FMatrix> {
virtual void InitModel(void) {
model.InitModel();
}
virtual void DoBoost(const FMatrix &fmat,
virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) {
this->InitFeatIndex(fmat);
std::vector<bst_gpair> &gpair = *in_gpair;
const int ngroup = model.param.num_output_group;
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
const std::vector<bst_uint> &rowset = p_fmat->buffered_rowset();
// for all the output group
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
@@ -72,42 +70,46 @@ class GBLinear : public IGradBooster<FMatrix> {
}
}
}
// number of features
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = feat_index[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
const float v = it.fvalue();
bst_gpair &p = gpair[it.rindex() * ngroup + gid];
if (p.hess < 0.0f) continue;
sum_grad += p.grad * v;
sum_hess += p.hess * v * v;
}
float &w = model[fid][gid];
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
w += dw;
// update grad value
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
bst_gpair &p = gpair[it.rindex() * ngroup + gid];
if (p.hess < 0.0f) continue;
p.grad += p.hess * it.fvalue() * dw;
utils::IIterator<ColBatch> *iter = p_fmat->ColIterator();
while (iter->Next()) {
// number of features
const ColBatch &batch = iter->Value();
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = batch.col_index[i];
ColBatch::Inst col = batch[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
for (bst_uint j = 0; j < col.length; ++j) {
const float v = col[j].fvalue;
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue;
sum_grad += p.grad * v;
sum_hess += p.hess * v * v;
}
float &w = model[fid][gid];
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
w += dw;
// update grad value
for (bst_uint j = 0; j < col.length; ++j) {
bst_gpair &p = gpair[col[j].index * ngroup + gid];
if (p.hess < 0.0f) continue;
p.grad += p.hess * col[j].fvalue * dw;
}
}
}
}
}
virtual void Predict(const FMatrix &fmat,
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<float> *out_preds) {
std::vector<float> &preds = *out_preds;
preds.resize(0);
// start collecting the prediction
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
iter->BeforeFirst();
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
const int ngroup = model.param.num_output_group;
while (iter->Next()) {
const RowBatch &batch = iter->Value();
@@ -134,18 +136,6 @@ class GBLinear : public IGradBooster<FMatrix> {
}
protected:
/*!
 * \brief fill feat_index with the indices of the non-empty columns of fmat;
 *  does nothing when feat_index already has entries
 * \param fmat feature matrix queried for its column count and per-column sizes
 */
inline void InitFeatIndex(const FMatrix &fmat) {
// already populated by an earlier call, keep the existing index
if (feat_index.size() != 0) return;
// initialize feature index
unsigned ncol = static_cast<unsigned>(fmat.NumCol());
// at most ncol entries will be pushed
feat_index.reserve(ncol);
for (unsigned i = 0; i < ncol; ++i) {
// only columns whose reported size is non-zero are recorded
if (fmat.GetColSize(i) != 0) {
feat_index.push_back(i);
}
}
// shuffle the recorded feature indices
random::Shuffle(feat_index);
}
inline void Pred(const RowBatch::Inst &inst, float *preds) {
for (int gid = 0; gid < model.param.num_output_group; ++gid) {
float psum = model.bias()[gid];

View File

@@ -7,6 +7,7 @@
*/
#include <vector>
#include "../data.h"
#include "../utils/io.h"
#include "../utils/fmap.h"
namespace xgboost {
@@ -14,9 +15,7 @@ namespace xgboost {
namespace gbm {
/*!
* \brief interface of gradient boosting model
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class IGradBooster {
public:
/*!
@@ -41,17 +40,17 @@ class IGradBooster {
virtual void InitModel(void) = 0;
/*!
* \brief perform update to the model(boosting)
* \param fmat feature matrix that provides access to features
* \param p_fmat feature matrix that provides access to features
* \param info meta information about training
* \param in_gpair address of the gradient pair statistics of the data
* the booster may change content of gpair
*/
virtual void DoBoost(const FMatrix &fmat,
virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) = 0;
/*!
* \brief generate predictions for given feature matrix
* \param fmat feature matrix
* \param p_fmat feature matrix
* \param buffer_offset buffer index offset of these instances, if equals -1
* this means we do not have buffer index allocated to the gbm
* a buffer index is assigned to each instance that requires repeated prediction
@@ -59,7 +58,7 @@ class IGradBooster {
* \param info extra side information that may be needed for prediction
* \param out_preds output vector to hold the predictions
*/
virtual void Predict(const FMatrix &fmat,
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<float> *out_preds) = 0;
@@ -73,21 +72,11 @@ class IGradBooster {
// destructor
virtual ~IGradBooster(void){}
};
} // namespace gbm
} // namespace xgboost
#include "gbtree-inl.hpp"
#include "gblinear-inl.hpp"
namespace xgboost {
namespace gbm {
/*!
 * \brief create a gradient booster from given name
 * \tparam FMatrix feature matrix type the created booster is instantiated with
 * \param name name of gradient booster, "gbtree" or "gblinear"
 * \return pointer to a booster allocated with new for a recognized name;
 *  for any other name utils::Error is called and NULL is returned if that call returns
 *  NOTE(review): the caller presumably owns and deletes the returned pointer - confirm
 */
template<typename FMatrix>
inline IGradBooster<FMatrix>* CreateGradBooster(const char *name) {
// strcmp returns 0 on an exact match
if (!strcmp("gbtree", name)) return new GBTree<FMatrix>();
if (!strcmp("gblinear", name)) return new GBLinear<FMatrix>();
// no known booster matched the requested name
utils::Error("unknown booster type: %s", name);
return NULL;
}
/*!
* \brief create a gradient booster from given name
* \param name name of gradient booster
*/
IGradBooster* CreateGradBooster(const char *name);
} // namespace gbm
} // namespace xgboost
#endif // XGBOOST_GBM_GBM_H_

View File

@@ -9,16 +9,15 @@
#include <utility>
#include <string>
#include "./gbm.h"
#include "../utils/omp.h"
#include "../tree/updater.h"
namespace xgboost {
namespace gbm {
/*!
* \brief gradient boosted tree
* \tparam FMatrix the data type updater taking
*/
template<typename FMatrix>
class GBTree : public IGradBooster<FMatrix> {
class GBTree : public IGradBooster {
public:
virtual ~GBTree(void) {
this->Clear();
@@ -82,12 +81,12 @@ class GBTree : public IGradBooster<FMatrix> {
utils::Assert(mparam.num_trees == 0, "GBTree: model already initialized");
utils::Assert(trees.size() == 0, "GBTree: model already initialized");
}
virtual void DoBoost(const FMatrix &fmat,
virtual void DoBoost(IFMatrix *p_fmat,
const BoosterInfo &info,
std::vector<bst_gpair> *in_gpair) {
const std::vector<bst_gpair> &gpair = *in_gpair;
if (mparam.num_output_group == 1) {
this->BoostNewTrees(gpair, fmat, info, 0);
this->BoostNewTrees(gpair, p_fmat, info, 0);
} else {
const int ngroup = mparam.num_output_group;
utils::Check(gpair.size() % ngroup == 0,
@@ -99,11 +98,11 @@ class GBTree : public IGradBooster<FMatrix> {
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
this->BoostNewTrees(tmp, fmat, info, gid);
this->BoostNewTrees(tmp, p_fmat, info, gid);
}
}
}
virtual void Predict(const FMatrix &fmat,
virtual void Predict(IFMatrix *p_fmat,
int64_t buffer_offset,
const BoosterInfo &info,
std::vector<float> *out_preds) {
@@ -121,7 +120,7 @@ class GBTree : public IGradBooster<FMatrix> {
const size_t stride = info.num_row * mparam.num_output_group;
preds.resize(stride * (mparam.size_leaf_vector+1));
// start collecting the prediction
utils::IIterator<RowBatch> *iter = fmat.RowIterator();
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
const RowBatch &batch = iter->Value();
@@ -172,7 +171,7 @@ class GBTree : public IGradBooster<FMatrix> {
char *pstr;
pstr = strtok(&tval[0], ",");
while (pstr != NULL) {
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
updaters.push_back(tree::CreateUpdater(pstr));
for (size_t j = 0; j < cfg.size(); ++j) {
// set parameters
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
@@ -183,7 +182,7 @@ class GBTree : public IGradBooster<FMatrix> {
}
// do group specific boosting
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
IFMatrix *p_fmat,
const BoosterInfo &info,
int bst_group) {
this->InitUpdater();
@@ -198,7 +197,7 @@ class GBTree : public IGradBooster<FMatrix> {
}
// update the trees
for (size_t i = 0; i < updaters.size(); ++i) {
updaters[i]->Update(gpair, fmat, info, new_trees);
updaters[i]->Update(gpair, p_fmat, info, new_trees);
}
// push back to model
for (size_t i = 0; i < new_trees.size(); ++i) {
@@ -361,7 +360,7 @@ class GBTree : public IGradBooster<FMatrix> {
// temporal storage for per thread
std::vector<tree::RegTree::FVec> thread_temp;
// the updaters that can be applied to each of tree
std::vector< tree::IUpdater<FMatrix>* > updaters;
std::vector<tree::IUpdater*> updaters;
};
} // namespace gbm