Change buffered-row subsampling from a maximum row count to a per-row keep probability

This commit is contained in:
tqchen@graphlab.com 2014-08-19 12:07:52 -07:00
parent 91e70c76ff
commit 9caccd3b36
3 changed files with 69 additions and 55 deletions

View File

@ -14,6 +14,7 @@
#include "utils/io.h" #include "utils/io.h"
#include "utils/utils.h" #include "utils/utils.h"
#include "utils/iterator.h" #include "utils/iterator.h"
#include "utils/random.h"
#include "utils/matrix_csr.h" #include "utils/matrix_csr.h"
namespace xgboost { namespace xgboost {
@ -184,7 +185,6 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
/*! \brief constructor */ /*! \brief constructor */
FMatrixS(void) { FMatrixS(void) {
iter_ = NULL; iter_ = NULL;
num_buffered_row_ = 0;
} }
// destructor // destructor
~FMatrixS(void) { ~FMatrixS(void) {
@ -200,8 +200,8 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
return col_ptr_.size() - 1; return col_ptr_.size() - 1;
} }
/*! \brief get number of buffered rows */ /*! \brief get number of buffered rows */
inline size_t NumBufferedRow(void) const { inline const std::vector<bst_uint> buffered_rowset(void) const {
return num_buffered_row_; return buffered_rowset_;
} }
/*! \brief get col sorted iterator */ /*! \brief get col sorted iterator */
inline ColIter GetSortedCol(size_t cidx) const { inline ColIter GetSortedCol(size_t cidx) const {
@ -224,12 +224,12 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
} }
/*! \brief get column density */ /*! \brief get column density */
inline float GetColDensity(size_t cidx) const { inline float GetColDensity(size_t cidx) const {
size_t nmiss = num_buffered_row_ - (col_ptr_[cidx+1] - col_ptr_[cidx]); size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
return 1.0f - (static_cast<float>(nmiss)) / num_buffered_row_; return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
} }
inline void InitColAccess(size_t max_nrow = ULONG_MAX) { inline void InitColAccess(float pkeep = 1.0f) {
if (this->HaveColAccess()) return; if (this->HaveColAccess()) return;
this->InitColData(max_nrow); this->InitColData(pkeep);
} }
/*! /*!
* \brief get the row iterator associated with FMatrix * \brief get the row iterator associated with FMatrix
@ -248,8 +248,8 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
* \param fo output stream to save to * \param fo output stream to save to
*/ */
inline void SaveColAccess(utils::IStream &fo) const { inline void SaveColAccess(utils::IStream &fo) const {
fo.Write(&num_buffered_row_, sizeof(num_buffered_row_)); fo.Write(buffered_rowset_);
if (num_buffered_row_ != 0) { if (buffered_rowset_.size() != 0) {
SaveBinary(fo, col_ptr_, col_data_); SaveBinary(fo, col_ptr_, col_data_);
} }
} }
@ -258,9 +258,8 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
* \param fo output stream to load from * \param fo output stream to load from
*/ */
inline void LoadColAccess(utils::IStream &fi) { inline void LoadColAccess(utils::IStream &fi) {
utils::Check(fi.Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0, utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
"invalid input file format"); if (buffered_rowset_.size() != 0) {
if (num_buffered_row_ != 0) {
LoadBinary(fi, &col_ptr_, &col_data_); LoadBinary(fi, &col_ptr_, &col_data_);
} }
} }
@ -304,39 +303,43 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
protected: protected:
/*! /*!
* \brief initialize column data * \brief initialize column data
* \param max_nrow maximum number of rows supported * \param pkeep probability to keep a row
*/ */
inline void InitColData(size_t max_nrow) { inline void InitColData(float pkeep) {
buffered_rowset_.clear();
// note: this part of code is serial, todo, parallelize this transformer // note: this part of code is serial, todo, parallelize this transformer
utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_); utils::SparseCSRMBuilder<SparseBatch::Entry> builder(col_ptr_, col_data_);
builder.InitBudget(0); builder.InitBudget(0);
// start working // start working
iter_->BeforeFirst(); iter_->BeforeFirst();
num_buffered_row_ = 0;
while (iter_->Next()) { while (iter_->Next()) {
const SparseBatch &batch = iter_->Value(); const SparseBatch &batch = iter_->Value();
if (batch.base_rowid >= max_nrow) break; for (size_t i = 0; i < batch.size; ++i) {
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid); if (pkeep==1.0f || random::SampleBinary(pkeep)) {
for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) { buffered_rowset_.push_back(batch.base_rowid+i);
SparseBatch::Inst inst = batch[i]; SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) { for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].findex); builder.AddBudget(inst[j].findex);
}
} }
} }
} }
builder.InitStorage(); builder.InitStorage();
iter_->BeforeFirst(); iter_->BeforeFirst();
size_t ktop = 0;
while (iter_->Next()) { while (iter_->Next()) {
const SparseBatch &batch = iter_->Value(); const SparseBatch &batch = iter_->Value();
if (batch.base_rowid >= max_nrow) break; for (size_t i = 0; i < batch.size; ++i) {
const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid); if (ktop < buffered_rowset_.size() &&
for (size_t i = 0; i < nbatch; ++i) { buffered_rowset_[ktop] == batch.base_rowid+i) {
SparseBatch::Inst inst = batch[i]; ++ ktop;
for (bst_uint j = 0; j < inst.length; ++j) { SparseBatch::Inst inst = batch[i];
builder.PushElem(inst[j].findex, for (bst_uint j = 0; j < inst.length; ++j) {
Entry((bst_uint)(batch.base_rowid+i), builder.PushElem(inst[j].findex,
inst[j].fvalue)); Entry((bst_uint)(batch.base_rowid+i),
inst[j].fvalue));
}
} }
} }
} }
@ -353,8 +356,8 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
private: private:
// --- data structure used to support InitColAccess -- // --- data structure used to support InitColAccess --
utils::IIterator<SparseBatch> *iter_; utils::IIterator<SparseBatch> *iter_;
/*! \brief number */ /*! \brief list of row index that are buffered */
size_t num_buffered_row_; std::vector<bst_uint> buffered_rowset_;
/*! \brief column pointer of CSC format */ /*! \brief column pointer of CSC format */
std::vector<size_t> col_ptr_; std::vector<size_t> col_ptr_;
/*! \brief column data in CSC format */

View File

@ -30,7 +30,7 @@ class BoostLearner {
name_obj_ = "reg:linear"; name_obj_ = "reg:linear";
name_gbm_ = "gbtree"; name_gbm_ = "gbtree";
silent= 0; silent= 0;
max_buffer_row = std::numeric_limits<size_t>::max(); prob_buffer_row = 1.0f;
} }
~BoostLearner(void) { ~BoostLearner(void) {
if (obj_ != NULL) delete obj_; if (obj_ != NULL) delete obj_;
@ -80,7 +80,7 @@ class BoostLearner {
*/ */
inline void SetParam(const char *name, const char *val) { inline void SetParam(const char *name, const char *val) {
if (!strcmp(name, "silent")) silent = atoi(val); if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "max_buffer_row")) sscanf(val, "%lu", &max_buffer_row); if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val); if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
if (!strcmp("seed", name)) random::Seed(atoi(val)); if (!strcmp("seed", name)) random::Seed(atoi(val));
if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val); if (!strcmp(name, "num_class")) this->SetParam("num_output_group", val);
@ -151,7 +151,7 @@ class BoostLearner {
* \param p_train pointer to the matrix used by training * \param p_train pointer to the matrix used by training
*/ */
inline void CheckInit(DMatrix<FMatrix> *p_train) { inline void CheckInit(DMatrix<FMatrix> *p_train) {
p_train->fmat.InitColAccess(max_buffer_row); p_train->fmat.InitColAccess(prob_buffer_row);
} }
/*! /*!
* \brief update the model for one iteration * \brief update the model for one iteration
@ -293,7 +293,7 @@ class BoostLearner {
// silent during training // silent during training
int silent; int silent;
// maximum buffered row value // maximum buffered row value
size_t max_buffer_row; float prob_buffer_row;
// evaluation set // evaluation set
EvalSet evaluator_; EvalSet evaluator_;
// model parameter // model parameter

View File

@ -80,13 +80,13 @@ class ColMaker: public IUpdater<FMatrix> {
const std::vector<unsigned> &root_index, const std::vector<unsigned> &root_index,
RegTree *p_tree) { RegTree *p_tree) {
this->InitData(gpair, fmat, root_index, *p_tree); this->InitData(gpair, fmat, root_index, *p_tree);
this->InitNewNode(qexpand, gpair, *p_tree); this->InitNewNode(qexpand, gpair, fmat, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) { for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree); this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree); this->ResetPosition(this->qexpand, fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand); this->UpdateQueueExpand(*p_tree, &this->qexpand);
this->InitNewNode(qexpand, gpair, *p_tree); this->InitNewNode(qexpand, gpair, fmat, *p_tree);
// if nothing left to be expand, break // if nothing left to be expand, break
if (qexpand.size() == 0) break; if (qexpand.size() == 0) break;
} }
@ -109,25 +109,31 @@ class ColMaker: public IUpdater<FMatrix> {
const FMatrix &fmat, const FMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) { const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree"); utils::Assert(tree.param.num_nodes == tree.param.num_roots, "ColMaker: can only grow new tree");
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
{// setup position {// setup position
position.resize(fmat.NumBufferedRow()); position.resize(gpair.size());
if (root_index.size() == 0) { if (root_index.size() == 0) {
std::fill(position.begin(), position.end(), 0); for (size_t i = 0; i < rowset.size(); ++i) {
position[rowset[i]] = 0;
}
} else { } else {
for (size_t i = 0; i < position.size(); ++i) { for (size_t i = 0; i < rowset.size(); ++i) {
position[i] = root_index[i]; const bst_uint ridx = rowset[i];
utils::Assert(root_index[i] < (unsigned)tree.param.num_roots, "root index exceed setting"); position[ridx] = root_index[ridx];
utils::Assert(root_index[ridx] < (unsigned)tree.param.num_roots, "root index exceed setting");
} }
} }
// mark delete for the deleted data // mark delete for the deleted data
for (size_t i = 0; i < position.size(); ++i) { for (size_t i = 0; i < rowset.size(); ++i) {
if (gpair[i].hess < 0.0f) position[i] = -1; const bst_uint ridx = rowset[i];
if (gpair[ridx].hess < 0.0f) position[ridx] = -1;
} }
// mark subsample // mark subsample
if (param.subsample < 1.0f) { if (param.subsample < 1.0f) {
for (size_t i = 0; i < position.size(); ++i) { for (size_t i = 0; i < rowset.size(); ++i) {
if (gpair[i].hess < 0.0f) continue; const bst_uint ridx = rowset[i];
if (random::SampleBinary(param.subsample) == 0) position[i] = -1; if (gpair[ridx].hess < 0.0f) continue;
if (random::SampleBinary(param.subsample) == 0) position[ridx] = -1;
} }
} }
} }
@ -168,6 +174,7 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */ /*! \brief initialize the base_weight, root_gain, and NodeEntry for all the new nodes in qexpand */
inline void InitNewNode(const std::vector<int> &qexpand, inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
const RegTree &tree) { const RegTree &tree) {
{// setup statistics space for each tree node {// setup statistics space for each tree node
for (size_t i = 0; i < stemp.size(); ++i) { for (size_t i = 0; i < stemp.size(); ++i) {
@ -175,13 +182,15 @@ class ColMaker: public IUpdater<FMatrix> {
} }
snode.resize(tree.param.num_nodes, NodeEntry()); snode.resize(tree.param.num_nodes, NodeEntry());
} }
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// setup position // setup position
const unsigned ndata = static_cast<unsigned>(position.size()); const unsigned ndata = static_cast<unsigned>(rowset.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) { for (unsigned i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
if (position[i] < 0) continue; if (position[ridx] < 0) continue;
stemp[tid][position[i]].stats.Add(gpair[i]); stemp[tid][position[ridx]].stats.Add(gpair[ridx]);
} }
// sum the per thread statistics together // sum the per thread statistics together
for (size_t j = 0; j < qexpand.size(); ++j) { for (size_t j = 0; j < qexpand.size(); ++j) {
@ -303,17 +312,19 @@ class ColMaker: public IUpdater<FMatrix> {
} }
// reset position of each data points after split is created in the tree // reset position of each data points after split is created in the tree
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) { inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// step 1, set default direct nodes to default, and leaf nodes to -1 // step 1, set default direct nodes to default, and leaf nodes to -1
const unsigned ndata = static_cast<unsigned>(position.size()); const unsigned ndata = static_cast<unsigned>(rowset.size());
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) { for (unsigned i = 0; i < ndata; ++i) {
const int nid = position[i]; const bst_uint ridx = rowset[i];
const int nid = position[ridx];
if (nid >= 0) { if (nid >= 0) {
if (tree[nid].is_leaf()) { if (tree[nid].is_leaf()) {
position[i] = -1; position[ridx] = -1;
} else { } else {
// push to default branch, correct later // push to default branch, correct later
position[i] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright(); position[ridx] = tree[nid].default_left() ? tree[nid].cleft(): tree[nid].cright();
} }
} }
} }