ok
This commit is contained in:
parent
ca96468745
commit
539fce2856
2
Makefile
2
Makefile
@ -74,4 +74,4 @@ Rpack:
|
|||||||
R CMD check --as-cran xgboost*.tar.gz
|
R CMD check --as-cran xgboost*.tar.gz
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) $(OBJ) $(BIN) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
|
$(RM) $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o */*.o */*/*.o *~ */*~ */*/*~
|
||||||
|
|||||||
@ -15,7 +15,7 @@ IUpdater* CreateUpdater(const char *name) {
|
|||||||
if (!strcmp(name, "prune")) return new TreePruner();
|
if (!strcmp(name, "prune")) return new TreePruner();
|
||||||
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
|
if (!strcmp(name, "refresh")) return new TreeRefresher<GradStats>();
|
||||||
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
|
if (!strcmp(name, "grow_colmaker")) return new ColMaker<GradStats>();
|
||||||
if (!strcmp(name, "grow_histmaker")) return new HistMaker<GradStats>();
|
if (!strcmp(name, "grow_histmaker")) return new QuantileHistMaker<GradStats>();
|
||||||
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
|
if (!strcmp(name, "distcol")) return new DistColMaker<GradStats>();
|
||||||
if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
|
if (!strcmp(name, "grow_colmaker5")) return new ColMaker< CVGradStats<5> >();
|
||||||
if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
|
if (!strcmp(name, "grow_colmaker3")) return new ColMaker< CVGradStats<3> >();
|
||||||
|
|||||||
@ -8,6 +8,7 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include "../sync/sync.h"
|
#include "../sync/sync.h"
|
||||||
|
#include "../utils/quantile.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
@ -140,7 +141,13 @@ class HistMaker: public IUpdater {
|
|||||||
}
|
}
|
||||||
return n.cdefault();
|
return n.cdefault();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// this function does two jobs
|
||||||
|
// (1) reset the position in array position, to be the latest leaf id
|
||||||
|
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
|
||||||
|
virtual void ResetPosAndPropose(IFMatrix *p_fmat,
|
||||||
|
const BoosterInfo &info,
|
||||||
|
const RegTree &tree) = 0;
|
||||||
private:
|
private:
|
||||||
virtual void Update(const std::vector<bst_gpair> &gpair,
|
virtual void Update(const std::vector<bst_gpair> &gpair,
|
||||||
IFMatrix *p_fmat,
|
IFMatrix *p_fmat,
|
||||||
@ -160,7 +167,8 @@ class HistMaker: public IUpdater {
|
|||||||
inline void InitData(const std::vector<bst_gpair> &gpair,
|
inline void InitData(const std::vector<bst_gpair> &gpair,
|
||||||
const IFMatrix &fmat,
|
const IFMatrix &fmat,
|
||||||
const std::vector<unsigned> &root_index, const RegTree &tree) {
|
const std::vector<unsigned> &root_index, const RegTree &tree) {
|
||||||
utils::Assert(tree.param.num_nodes == tree.param.num_roots, "HistMaker: can only grow new tree");
|
utils::Assert(tree.param.num_nodes == tree.param.num_roots,
|
||||||
|
"HistMaker: can only grow new tree");
|
||||||
{// setup position
|
{// setup position
|
||||||
position.resize(gpair.size());
|
position.resize(gpair.size());
|
||||||
if (root_index.size() == 0) {
|
if (root_index.size() == 0) {
|
||||||
@ -212,15 +220,6 @@ class HistMaker: public IUpdater {
|
|||||||
node2workindex[qexpand[i]] = static_cast<int>(i);
|
node2workindex[qexpand[i]] = static_cast<int>(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// this function does two jobs
|
|
||||||
// (1) reset the position in array position, to be the latest leaf id
|
|
||||||
// (2) propose a set of candidate cuts and set wspace.rptr wspace.cut correctly
|
|
||||||
virtual void ResetPosAndPropose(IFMatrix *p_fmat,
|
|
||||||
const BoosterInfo &info,
|
|
||||||
const RegTree &tree) {
|
|
||||||
|
|
||||||
}
|
|
||||||
// create histogram for a setup histset
|
|
||||||
inline void CreateHist(const std::vector<bst_gpair> &gpair,
|
inline void CreateHist(const std::vector<bst_gpair> &gpair,
|
||||||
IFMatrix *p_fmat,
|
IFMatrix *p_fmat,
|
||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
@ -250,7 +249,7 @@ class HistMaker: public IUpdater {
|
|||||||
const int nid = position[ridx];
|
const int nid = position[ridx];
|
||||||
if (nid >= 0) {
|
if (nid >= 0) {
|
||||||
utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
|
utils::Assert(tree[nid].is_leaf(), "CreateHist happens in leaf");
|
||||||
const int wid = node2workindex[nid];
|
const int wid = node2workindex[nid];
|
||||||
for (bst_uint i = 0; i < inst.length; ++i) {
|
for (bst_uint i = 0; i < inst.length; ++i) {
|
||||||
utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
|
utils::Assert(inst[i].index < num_feature, "feature index exceed bound");
|
||||||
// feature histogram
|
// feature histogram
|
||||||
@ -312,7 +311,8 @@ class HistMaker: public IUpdater {
|
|||||||
#pragma omp parallel for schedule(dynamic, 1)
|
#pragma omp parallel for schedule(dynamic, 1)
|
||||||
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
|
for (bst_omp_uint wid = 0; wid < nexpand; ++ wid) {
|
||||||
const int nid = qexpand[wid];
|
const int nid = qexpand[wid];
|
||||||
utils::Assert(node2workindex[nid] == static_cast<int>(wid), "node2workindex inconsistent");
|
utils::Assert(node2workindex[nid] == static_cast<int>(wid),
|
||||||
|
"node2workindex inconsistent");
|
||||||
SplitEntry &best = sol[wid];
|
SplitEntry &best = sol[wid];
|
||||||
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
TStats &node_sum = wspace.hset[0][num_feature + wid * (num_feature + 1)].data[0];
|
||||||
for (bst_uint fid = 0; fid < num_feature; ++ fid) {
|
for (bst_uint fid = 0; fid < num_feature; ++ fid) {
|
||||||
@ -345,6 +345,36 @@ class HistMaker: public IUpdater {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// hist maker that propose using quantile sketch
|
||||||
|
template<typename TStats>
|
||||||
|
class QuantileHistMaker: public HistMaker<TStats> {
|
||||||
|
protected:
|
||||||
|
virtual void ResetPosAndPropose(IFMatrix *p_fmat,
|
||||||
|
const BoosterInfo &info,
|
||||||
|
const RegTree &tree) {
|
||||||
|
// start accumulating statistics
|
||||||
|
utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
|
||||||
|
iter->BeforeFirst();
|
||||||
|
while (iter->Next()) {
|
||||||
|
const RowBatch &batch = iter->Value();
|
||||||
|
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint i = 0; i < nbatch; ++i) {
|
||||||
|
RowBatch::Inst inst = batch[i];
|
||||||
|
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
|
||||||
|
int nid = this->position[ridx];
|
||||||
|
if (nid >= 0) {
|
||||||
|
if (tree[nid].is_leaf()) {
|
||||||
|
this->position[ridx] = ~nid;
|
||||||
|
} else {
|
||||||
|
this->position[ridx] = nid = HistMaker<TStats>::NextLevel(inst, tree, nid);
|
||||||
|
// todo add the cut point setup
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace tree
|
} // namespace tree
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
111
src/utils/group_data.h
Normal file
111
src/utils/group_data.h
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
#ifndef XGBOOST_UTILS_GROUP_DATA_H_
|
||||||
|
#define XGBOOST_UTILS_GROUP_DATA_H_
|
||||||
|
/*!
|
||||||
|
* \file group_data.h
|
||||||
|
* \brief this file defines utils to group data by integer keys
|
||||||
|
* Input: given input sequence (key,value), (k1,v1), (k2,v2)
|
||||||
|
* Output: an array of values data = [v1,v2,v3 .. vn]
|
||||||
|
* and a group pointer ptr,
|
||||||
|
* data[ptr[k]:ptr[k+1]] contains values that corresponds to key k
|
||||||
|
*
|
||||||
|
* This can be used to construct CSR/CSC matrix from un-ordered input
|
||||||
|
* The algorithm is a two-pass linear scan over the data: the first pass counts the entries of each key, the second pass places each value
|
||||||
|
* \author Tianqi Chen
|
||||||
|
*/
|
||||||
|
namespace xgboost {
|
||||||
|
namespace utils {
|
||||||
|
/*!
 * \brief multi-thread version of group builder
 *
 *  Usage is a fixed sequence of four steps:
 *   (1) InitBudget, (2) AddBudget for every (key, value) pair,
 *   (3) InitStorage, (4) Push for every pair, with the same key and
 *   thread id that were used in the matching AddBudget call.
 *  After step 4, data[rptr[k] .. rptr[k+1]) holds the values of key k.
 *
 *  The builder stores references to the vectors passed to the constructor,
 *  so they must outlive it. When the two-argument constructor is used the
 *  per-thread counters live inside the builder itself, so a copy of the
 *  builder would still refer to the counters of the original object.
 *
 * \tparam ValueType type of entries in the sparse matrix
 * \tparam SizeType type of the index range holder
 */
template<typename ValueType, typename SizeType = size_t>
struct ParallelGroupBuilder {
 public:
  /*!
   * \brief build into the given vectors, keeping per-thread counters internally
   * \param p_rptr output group pointer
   * \param p_data output value array
   */
  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
                       std::vector<ValueType> *p_data)
      : rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
  }
  /*!
   * \brief build into the given vectors, using caller-provided per-thread counters
   * \param p_rptr output group pointer
   * \param p_data output value array
   * \param p_thread_rptr storage used for the per-thread counters
   */
  ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
                       std::vector<ValueType> *p_data,
                       std::vector< std::vector<SizeType> > *p_thread_rptr)
      : rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
  }

  /*!
   * \brief step 1: initialize the helper, with hint of number keys
   *  and thread used in the construction
   * \param nkeys number of keys in the matrix, can be smaller than expected
   * \param nthread number of thread that will be used in construction
   */
  inline void InitBudget(size_t nkeys = 0, int nthread = 1) {
    thread_rptr.resize(nthread);
    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
      // assign both resizes and zero-fills the counters of this thread
      thread_rptr[tid].assign(nkeys, static_cast<SizeType>(0));
    }
  }
  /*!
   * \brief step 2: add budget to each key
   * \param key the key
   * \param threadid the id of thread that calls this function,
   *  must be smaller than the nthread passed to InitBudget
   * \param nelem number of element budget add to this row
   */
  inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) {
    std::vector<SizeType> &trptr = thread_rptr[threadid];
    // grow on demand, the nkeys hint of InitBudget may be too small
    if (trptr.size() < key + 1) {
      trptr.resize(key + 1, 0);
    }
    trptr[key] += nelem;
  }
  /*!
   * \brief step 3: initialize the necessary storage
   *
   *  Turns the per-thread counts into per-thread write offsets, fills rptr
   *  with the group boundaries and resizes data to the total budget.
   *  rptr always ends up with at least one entry and rptr[0] == 0.
   */
  inline void InitStorage(void) {
    // set rptr to correct size
    for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
      if (rptr.size() <= thread_rptr[tid].size()) {
        rptr.resize(thread_rptr[tid].size() + 1);
      }
    }
    // data is laid out from offset 0, so the first boundary must be 0 even
    // when the caller hands in an empty or previously used rptr
    if (rptr.empty()) {
      rptr.resize(1);
    }
    rptr[0] = static_cast<SizeType>(0);
    // initialize rptr to be beginning of each segment;
    // inside one key the threads get consecutive sub-ranges in thread order
    size_t start = 0;
    for (size_t i = 0; i + 1 < rptr.size(); ++i) {
      for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
        std::vector<SizeType> &trptr = thread_rptr[tid];
        if (i < trptr.size()) {
          const size_t ncnt = static_cast<size_t>(trptr[i]);
          trptr[i] = static_cast<SizeType>(start);
          start += ncnt;
        }
      }
      rptr[i + 1] = static_cast<SizeType>(start);
    }
    data.resize(start);
  }
  /*!
   * \brief step 4: add data to the allocated space,
   *   the calls to this function should be exactly match previous call to AddBudget
   *
   * \param key the key of the value
   * \param value the value to store
   * \param threadid the id of thread that calls this function
   */
  inline void Push(size_t key, ValueType value, int threadid = 0) {
    // rp is the next free slot of this (thread, key) pair; no bound check is
    // done here, correctness relies on the matching AddBudget call
    SizeType &rp = thread_rptr[threadid][key];
    data[rp++] = value;
  }

 private:
  /*! \brief pointer to the beginning and end of each continuous key */
  std::vector<SizeType> &rptr;
  /*! \brief values of all keys, grouped by key */
  std::vector<ValueType> &data;
  /*! \brief thread local data structure */
  std::vector< std::vector<SizeType> > &thread_rptr;
  /*! \brief local temp thread ptr, use this if not specified by the constructor */
  std::vector< std::vector<SizeType> > tmp_thread_rptr;
};
|
||||||
|
} // namespace utils
|
||||||
|
} // namespace xgboost
|
||||||
|
#endif
|
||||||
|
|
||||||
@ -256,7 +256,6 @@ struct SparseCSRFileBuilder {
|
|||||||
/*! \brief saved top space of each item */
|
/*! \brief saved top space of each item */
|
||||||
std::vector<IndexType> buffer_data;
|
std::vector<IndexType> buffer_data;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace utils
|
} // namespace utils
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
29
test/Makefile
Normal file
29
test/Makefile
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
export CC = gcc
|
||||||
|
export CXX = g++
|
||||||
|
export MPICXX = mpicxx
|
||||||
|
export LDFLAGS= -pthread -lm
|
||||||
|
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../src
|
||||||
|
|
||||||
|
ifeq ($(no_omp),1)
|
||||||
|
CFLAGS += -DDISABLE_OPENMP
|
||||||
|
else
|
||||||
|
CFLAGS += -fopenmp
|
||||||
|
endif
|
||||||
|
|
||||||
|
# test binaries to build
|
||||||
|
BIN = test_group_data
|
||||||
|
|
||||||
|
.PHONY: clean all
|
||||||
|
|
||||||
|
all: $(BIN) $(MPIBIN)
|
||||||
|
|
||||||
|
test_group_data: test_group_data.cpp
|
||||||
|
|
||||||
|
$(BIN) :
|
||||||
|
$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||||
|
|
||||||
|
$(MPIBIN) :
|
||||||
|
$(MPICXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
$(RM) $(BIN) $(MPIBIN) *~
|
||||||
72
test/test_group_data.cpp
Normal file
72
test/test_group_data.cpp
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <vector>
|
||||||
|
#include <utility>
|
||||||
|
#include <ctime>
|
||||||
|
#include <utils/group_data.h>
|
||||||
|
#include <utils/random.h>
|
||||||
|
#include <utils/omp.h>
|
||||||
|
#include <utils/utils.h>
|
||||||
|
|
||||||
|
using namespace xgboost::utils;
|
||||||
|
using namespace xgboost;
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
if (argc < 3) {
|
||||||
|
printf("Usage: <nkey> <ndata> pnthread]\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (argc > 3) {
|
||||||
|
omp_set_num_threads(atoi(argv[3]));
|
||||||
|
}
|
||||||
|
random::Seed(0);
|
||||||
|
unsigned nkey = static_cast<unsigned>(atoi(argv[1]));
|
||||||
|
size_t ndata = static_cast<size_t>(atol(argv[2]));
|
||||||
|
|
||||||
|
std::vector<unsigned> keys;
|
||||||
|
std::vector< std::pair<unsigned, unsigned> > raw;
|
||||||
|
raw.reserve(ndata); keys.reserve(ndata);
|
||||||
|
for (size_t i = 0; i < ndata; ++i) {
|
||||||
|
unsigned key = random::NextUInt32(nkey);
|
||||||
|
utils::Check(key < nkey, "key exceed bound\n");
|
||||||
|
raw.push_back(std::make_pair(key, i));
|
||||||
|
keys.push_back(key);
|
||||||
|
}
|
||||||
|
printf("loading finish, start working\n");
|
||||||
|
time_t start_t = time(NULL);
|
||||||
|
int nthread;
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
nthread = omp_get_num_threads();
|
||||||
|
}
|
||||||
|
std::vector<size_t> rptr;
|
||||||
|
std::vector<unsigned> data;
|
||||||
|
ParallelGroupBuilder<unsigned> builder(&rptr, &data);
|
||||||
|
builder.InitBudget(0, nthread);
|
||||||
|
|
||||||
|
bst_omp_uint nlen = raw.size();
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint i = 0; i < nlen; ++i) {
|
||||||
|
builder.AddBudget(raw[i].first, omp_get_thread_num());
|
||||||
|
}
|
||||||
|
double first_cost = time(NULL) - start_t;
|
||||||
|
builder.InitStorage();
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (bst_omp_uint i = 0; i < nlen; ++i) {
|
||||||
|
builder.Push(raw[i].first, raw[i].second, omp_get_thread_num());
|
||||||
|
}
|
||||||
|
double second_cost = time(NULL) - start_t;
|
||||||
|
printf("all finish, phase1=%g sec, phase2=%g sec\n", first_cost, second_cost);
|
||||||
|
Check(rptr.size() <= nkey+1, "nkey exceed bound");
|
||||||
|
Check(rptr.back() == ndata, "data shape inconsistent");
|
||||||
|
for (size_t i = 0; i < rptr.size()-1; ++ i) {
|
||||||
|
Check(rptr[i] <= rptr[i+1], "rptr error");
|
||||||
|
for (size_t j = rptr[i]; j < rptr[i+1]; ++j) {
|
||||||
|
unsigned pos = data[j];
|
||||||
|
Check(pos < keys.size(), "invalid pos");
|
||||||
|
Check(keys[pos] == i, "invalid key entry");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("all check pass\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user