This commit is contained in:
tqchen
2014-11-06 15:37:23 -08:00
parent ca96468745
commit 539fce2856
7 changed files with 257 additions and 16 deletions

View File

@@ -15,7 +15,7 @@ IUpdater* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner();
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
if (!strcmp(name, "grow_histmaker")) return new HistMaker<GradStats>();
if (!strcmp(name, "grow_histmaker")) return new QuantileHistMaker<GradStats>();
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();

View File

@@ -8,6 +8,7 @@
#include <vector>
#include <algorithm>
#include "../sync/sync.h"
#include "../utils/quantile.h"
namespace xgboost {
namespace tree {
@@ -140,7 +141,13 @@ class HistMaker: public IUpdater {
}
return n.cdefault();
}
// Hook called to prepare the next round of histogram construction.
// An implementation is expected to do two jobs:
// (1) reset each entry of the array position to the latest leaf id of that row
// (2) propose a set of candidate cuts, filling wspace.rptr and wspace.cut accordingly
// Pure virtual: every concrete hist maker has to supply its own proposal scheme.
virtual void ResetPosAndPropose(IFMatrix *p_fmat,
const BoosterInfo &info,
const RegTree &tree) = 0;
private:
virtual void Update(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
@@ -160,7 +167,8 @@ class HistMaker: public IUpdater {
inline void InitData(const std::vector<bst_gpair> &gpair,
const IFMatrix &fmat,
const std::vector<unsigned> &root_index, const RegTree &tree) {
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "HistMaker: can only grow new tree");
utils::Assert(tree.param.num_nodes == tree.param.num_roots,
"HistMaker: can only grow new tree");
{// setup position
position.resize(gpair.size());
if (root_index.size() == 0) {
@@ -212,15 +220,6 @@ class HistMaker: public IUpdater {
node2workindex[qexpand[i]] = static_cast<int>(i);
}
}
// Hook called to prepare the next round of histogram construction.
// An implementation is expected to do two jobs:
// (1) reset each entry of the array position to the latest leaf id of that row
// (2) propose a set of candidate cuts, filling wspace.rptr and wspace.cut accordingly
// This version has an empty body: it changes neither position nor wspace.
virtual void ResetPosAndPropose(IFMatrix *p_fmat,
const BoosterInfo &info,
const RegTree &tree) {
}
// create histogram for a setup histset
inline void CreateHist(const std::vector<bst_gpair> &gpair,
IFMatrix *p_fmat,
const BoosterInfo &info,
@@ -250,7 +249,7 @@ class HistMaker: public IUpdater {
const int nid = position[ridx];
if (nid >= 0) {
utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
const int wid = node2workindex[nid];
const int wid = node2workindex[nid];
for (bst_uint i = 0; i < inst.length; ++i) {
utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
// feature histogram
@@ -312,7 +311,8 @@ class HistMaker: public IUpdater {
#pragma omp parallel for schedule(dynamic, 1)
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
const int nid = qexpand[wid];
utils::Assert(node2workindex[nid] == static_cast<int>(wid), "node2workindex inconsistent");
utils::Assert(node2workindex[nid] == static_cast<int>(wid),
"node2workindex inconsistent");
SplitEntry &best = sol[wid];
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
for (bst_uint fid = 0; fid < num_feature; ++ fid) {
@@ -345,6 +345,36 @@ class HistMaker: public IUpdater {
}
};
// histogram maker that is meant to propose its cuts with a quantile sketch;
// at this point only the position update is implemented (see the todo below)
template<typename TStats>
class QuantileHistMaker: public HistMaker<TStats> {
 protected:
  // Walk over all row batches of p_fmat and refresh this->position:
  //  - rows with a negative position are left untouched
  //  - rows whose node is a leaf of tree get the bitwise complement of the node id
  //  - all other rows are moved to the node returned by NextLevel
  // The argument info is not used by this implementation.
  virtual void ResetPosAndPropose(IFMatrix *p_fmat,
                                  const BoosterInfo &info,
                                  const RegTree &tree) {
    utils::IIterator<RowBatch> *row_iter = p_fmat->RowIterator();
    for (row_iter->BeforeFirst(); row_iter->Next();) {
      const RowBatch &rows = row_iter->Value();
      const bst_omp_uint nrows = static_cast<bst_omp_uint>(rows.size);
      // every iteration writes only position[ridx] of its own row
      #pragma omp parallel for schedule(static)
      for (bst_omp_uint k = 0; k < nrows; ++k) {
        RowBatch::Inst inst = rows[k];
        const bst_uint ridx = static_cast<bst_uint>(rows.base_rowid + k);
        int nid = this->position[ridx];
        if (nid < 0) continue;
        if (!tree[nid].is_leaf()) {
          // descend one level in the tree
          nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
          this->position[ridx] = nid;
          // todo add the cut point setup
        } else {
          // row sits in a leaf: store the complemented id
          this->position[ridx] = ~nid;
        }
      }
    }
  }
};
} // namespace tree
} // namespace xgboost

111
src/utils/group_data.h Normal file
View File

@@ -0,0 +1,111 @@
#ifndef XGBOOST_UTILS_GROUP_DATA_H_
#define XGBOOST_UTILS_GROUP_DATA_H_
/*!
* \file group_data.h
* \brief this file defines utils to group data by integer keys
* Input: given input sequence (key,value), (k1,v1), (k2,v2)
* Output: an array of values data = [v1,v2,v3 .. vn]
* and a group pointer ptr,
* data[ptr[k]:ptr[k+1]] contains values that corresponds to key k
*
* This can be used to construct CSR/CSC matrix from un-ordered input
* The algorithm is a two-pass linear scan over the data: the first pass counts the entries of each key, the second pass writes every value into its slot
* \author Tianqi Chen
*/
#include <algorithm>
#include <cstddef>
#include <vector>
namespace xgboost {
namespace utils {
/*!
 * \brief multi-thread version of group builder
 *
 *  The builder is driven through a fixed four step protocol:
 *    (1) InitBudget  (2) AddBudget, once per entry
 *    (3) InitStorage (4) Push, once per entry, mirroring the AddBudget calls
 *  AddBudget only modifies the counters that belong to the given threadid;
 *  Push writes into a slot of data that InitStorage reserved for that
 *  (threadid, key) pair.
 *
 *  NOTE: the builder only stores references to the vectors handed to the
 *  constructor. With the two-argument constructor thread_rptr refers to the
 *  member tmp_thread_rptr, so a copy of the builder would still refer to the
 *  temporary storage of the object it was copied from; do not copy a builder.
 *
 * \tparam ValueType type of entries in the sparse matrix
 * \tparam SizeType type of the index range holder
 */
template<typename ValueType, typename SizeType = size_t>
struct ParallelGroupBuilder {
 public:
  /*!
   * \brief construct a builder that keeps the per-thread counters internally
   * \param p_rptr output: group pointer, filled by InitStorage
   * \param p_data output: grouped values, sized by InitStorage and filled by Push
   */
  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
                       std::vector<ValueType> *p_data)
      : rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
  }
  /*!
   * \brief construct a builder that uses caller provided per-thread counters
   * \param p_rptr output: group pointer, filled by InitStorage
   * \param p_data output: grouped values, sized by InitStorage and filled by Push
   * \param p_thread_rptr storage for the per-thread counters, overwritten by the builder
   */
  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
                       std::vector<ValueType> *p_data,
                       std::vector< std::vector<SizeType> > *p_thread_rptr)
      : rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
  }

 public:
  /*!
   * \brief step 1: initialize the helper, with hint of number keys
   *        and thread used in the construction
   * \param nkeys number of keys in the matrix, can be smaller than expected
   * \param nthread number of thread that will be used in construction
   */
  inline void InitBudget(size_t nkeys = 0, int nthread = 1) {
    thread_rptr.resize(nthread);
    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
      // exactly nkeys counters per thread, all starting from zero
      thread_rptr[tid].assign(nkeys, 0);
    }
  }
  /*!
   * \brief step 2: add budget to each key
   * \param key the key
   * \param threadid the id of thread that calls this function
   * \param nelem number of element budget add to this row
   */
  inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) {
    std::vector<SizeType> &trptr = thread_rptr[threadid];
    // nkeys was only a hint: grow the counters when a larger key shows up
    if (trptr.size() < key + 1) {
      trptr.resize(key + 1, 0);
    }
    trptr[key] += nelem;
  }
  /*!
   * \brief step 3: initialize the necessary storage
   *  turns the per-thread counts into write offsets, fills rptr with the
   *  segment boundaries and resizes data to the total number of entries
   */
  inline void InitStorage(void) {
    // rptr needs one boundary per key plus the final end; it is never shrunk
    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
      if (rptr.size() <= thread_rptr[tid].size()) {
        rptr.resize(thread_rptr[tid].size() + 1);
      }
    }
    // offsets are counted from zero, so the first segment has to begin at zero;
    // set it explicitly because rptr may arrive with content of an earlier use
    if (!rptr.empty()) {
      rptr[0] = 0;
    }
    // within one key the slots are handed out thread by thread
    size_t start = 0;
    for (size_t key = 0; key + 1 < rptr.size(); ++key) {
      for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
        std::vector<SizeType> &trptr = thread_rptr[tid];
        if (key < trptr.size()) {
          const size_t ncnt = trptr[key];
          // the counter is replaced by the first free slot of (tid, key)
          trptr[key] = static_cast<SizeType>(start);
          start += ncnt;
        }
      }
      rptr[key + 1] = static_cast<SizeType>(start);
    }
    data.resize(start);
  }
  /*!
   * \brief step 4: add data to the allocated space,
   *   the calls to this function should be exactly match previous call to AddBudget
   *
   * \param key the key of the value
   * \param value the value to be stored under the key
   * \param threadid the id of thread that calls this function
   */
  inline void Push(size_t key, ValueType value, int threadid = 0) {
    // rp is the next free slot of (threadid, key); advance it after the write
    SizeType &rp = thread_rptr[threadid][key];
    data[rp++] = value;
  }

 private:
  /*! \brief pointer to the beginning and end of each continuous key */
  std::vector<SizeType> &rptr;
  /*! \brief index of nonzero entries in each row */
  std::vector<ValueType> &data;
  /*! \brief thread local data structure */
  std::vector< std::vector<SizeType> > &thread_rptr;
  /*! \brief local temp thread ptr, use this if not specified by the constructor */
  std::vector< std::vector<SizeType> > tmp_thread_rptr;
};
} // namespace utils
} // namespace xgboost
#endif

View File

@@ -256,7 +256,6 @@ struct SparseCSRFileBuilder {
/*! \brief saved top space of each item */
std::vector<IndexType> buffer_data;
};
} // namespace utils
} // namespace xgboost
#endif