change makefile to lazy checkpt, fix col split code
This commit is contained in:
parent
962c2432a0
commit
b762231b02
8
Makefile
8
Makefile
@ -18,9 +18,9 @@ endif
|
|||||||
|
|
||||||
# specify tensor path
|
# specify tensor path
|
||||||
BIN = xgboost
|
BIN = xgboost
|
||||||
MOCKBIN = xgboost-mock
|
MOCKBIN = xgboost.mock
|
||||||
OBJ = updater.o gbm.o io.o main.o
|
OBJ = updater.o gbm.o io.o main.o
|
||||||
MPIBIN = xgboost-mpi
|
MPIBIN = xgboost.mpi
|
||||||
SLIB = wrapper/libxgboostwrapper.so
|
SLIB = wrapper/libxgboostwrapper.so
|
||||||
|
|
||||||
.PHONY: clean all mpi python Rpack librabit librabit_mpi
|
.PHONY: clean all mpi python Rpack librabit librabit_mpi
|
||||||
@ -42,8 +42,8 @@ updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h src/utils/*
|
|||||||
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
|
gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
|
||||||
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
|
io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
|
||||||
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
|
main.o: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h
|
||||||
xgboost-mpi: updater.o gbm.o io.o main.o librabit_mpi
|
xgboost.mpi: updater.o gbm.o io.o main.o librabit_mpi
|
||||||
xgboost-mock: updater.o gbm.o io.o main.o librabit
|
xgboost.mock: updater.o gbm.o io.o main.o librabit
|
||||||
xgboost: updater.o gbm.o io.o main.o librabit
|
xgboost: updater.o gbm.o io.o main.o librabit
|
||||||
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit
|
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@ k=$1
|
|||||||
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
|
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
|
||||||
|
|
||||||
# run xgboost mpi
|
# run xgboost mpi
|
||||||
../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost-mock mushroom-col.conf dsplit=col mock=0,1,0,0 mock=1,1,0,0
|
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock mushroom-col.conf dsplit=col mock=0,2,0,0 mock=1,2,0,0 mock=2,2,8,0 mock=2,3,0,0
|
||||||
|
|
||||||
# the model can be loaded directly by the single-machine xgboost solver, as usual
|
# the model can be loaded directly by the single-machine xgboost solver, as usual
|
||||||
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
|
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
|
||||||
|
|||||||
@ -16,13 +16,13 @@ k=$1
|
|||||||
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
|
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
|
||||||
|
|
||||||
# run xgboost mpi
|
# run xgboost mpi
|
||||||
../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col
|
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col
|
||||||
|
|
||||||
# the model can be loaded directly by the single-machine xgboost solver, as usual
|
# the model can be loaded directly by the single-machine xgboost solver, as usual
|
||||||
../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
|
../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
|
||||||
|
|
||||||
# run for one round, and continue training
|
# run for one round, and continue training
|
||||||
../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf dsplit=col num_round=1
|
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf dsplit=col num_round=1
|
||||||
../../rabit/tracker/rabit_mpi.py $k local ../../xgboost mushroom-col.conf mushroom-col.conf dsplit=col model_in=0001.model
|
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost mushroom-col.conf mushroom-col.conf dsplit=col model_in=0001.model
|
||||||
|
|
||||||
cat dump.nice.$k.txt
|
cat dump.nice.$k.txt
|
||||||
|
|||||||
@ -17,4 +17,4 @@ cd -
|
|||||||
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
|
python splitrows.py ../../demo/regression/machine.txt.train train-machine $k
|
||||||
|
|
||||||
# run xgboost mpi
|
# run xgboost mpi
|
||||||
../../rabit/tracker/rabit_mpi.py -n $k ../../rabit/test/keepalive.sh ../../xgboost-mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0
|
../../rabit/tracker/rabit_demo.py -n $k ../../xgboost.mock machine-row.conf dsplit=row num_round=3 mock=1,1,1,0 mock=0,0,3,0 mock=2,2,3,0
|
||||||
|
|||||||
@ -46,6 +46,14 @@ class IGradBooster {
|
|||||||
* and recalculate from scratch
|
* and recalculate from scratch
|
||||||
*/
|
*/
|
||||||
virtual void ResetPredBuffer(size_t num_pbuffer) {}
|
virtual void ResetPredBuffer(size_t num_pbuffer) {}
|
||||||
|
/*!
|
||||||
|
* \brief whether the model allows lazy checkpoint
|
||||||
|
* return true if model is only updated in DoBoost
|
||||||
|
* after all Allreduce calls
|
||||||
|
*/
|
||||||
|
virtual bool AllowLazyCheckPoint(void) const {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief perform update to the model (boosting)
|
* \brief perform update to the model (boosting)
|
||||||
* \param p_fmat feature matrix that provide access to features
|
* \param p_fmat feature matrix that provide access to features
|
||||||
@ -76,7 +84,6 @@ class IGradBooster {
|
|||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
std::vector<float> *out_preds,
|
std::vector<float> *out_preds,
|
||||||
unsigned ntree_limit = 0) = 0;
|
unsigned ntree_limit = 0) = 0;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief predict the leaf index of each tree, the output will be nsample * ntree vector
|
* \brief predict the leaf index of each tree, the output will be nsample * ntree vector
|
||||||
* this is only valid in gbtree predictor
|
* this is only valid in gbtree predictor
|
||||||
|
|||||||
@ -90,13 +90,17 @@ class GBTree : public IGradBooster {
|
|||||||
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
|
pred_buffer.resize(mparam.PredBufferSize(), 0.0f);
|
||||||
pred_counter.resize(mparam.PredBufferSize(), 0);
|
pred_counter.resize(mparam.PredBufferSize(), 0);
|
||||||
}
|
}
|
||||||
|
virtual bool AllowLazyCheckPoint(void) const {
|
||||||
|
return !(tparam.distcol_mode != 0 && mparam.num_output_group != 1);
|
||||||
|
}
|
||||||
virtual void DoBoost(IFMatrix *p_fmat,
|
virtual void DoBoost(IFMatrix *p_fmat,
|
||||||
int64_t buffer_offset,
|
int64_t buffer_offset,
|
||||||
const BoosterInfo &info,
|
const BoosterInfo &info,
|
||||||
std::vector<bst_gpair> *in_gpair) {
|
std::vector<bst_gpair> *in_gpair) {
|
||||||
const std::vector<bst_gpair> &gpair = *in_gpair;
|
const std::vector<bst_gpair> &gpair = *in_gpair;
|
||||||
if (mparam.num_output_group == 1) {
|
std::vector<std::vector<tree::RegTree*> > new_trees;
|
||||||
this->BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0);
|
if (mparam.num_output_group == 1) {
|
||||||
|
new_trees.push_back(BoostNewTrees(gpair, p_fmat, buffer_offset, info, 0));
|
||||||
} else {
|
} else {
|
||||||
const int ngroup = mparam.num_output_group;
|
const int ngroup = mparam.num_output_group;
|
||||||
utils::Check(gpair.size() % ngroup == 0,
|
utils::Check(gpair.size() % ngroup == 0,
|
||||||
@ -108,9 +112,12 @@ class GBTree : public IGradBooster {
|
|||||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||||
tmp[i] = gpair[i * ngroup + gid];
|
tmp[i] = gpair[i * ngroup + gid];
|
||||||
}
|
}
|
||||||
this->BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid);
|
new_trees.push_back(BoostNewTrees(tmp, p_fmat, buffer_offset, info, gid));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
|
||||||
|
this->CommitModel(new_trees[gid], gid);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
virtual void Predict(IFMatrix *p_fmat,
|
virtual void Predict(IFMatrix *p_fmat,
|
||||||
int64_t buffer_offset,
|
int64_t buffer_offset,
|
||||||
@ -208,14 +215,15 @@ class GBTree : public IGradBooster {
|
|||||||
tparam.updater_initialized = 1;
|
tparam.updater_initialized = 1;
|
||||||
}
|
}
|
||||||
// boost new trees for a specific group
|
// boost new trees for a specific group
|
||||||
inline void BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
inline std::vector<tree::RegTree*>
|
||||||
IFMatrix *p_fmat,
|
BoostNewTrees(const std::vector<bst_gpair> &gpair,
|
||||||
int64_t buffer_offset,
|
IFMatrix *p_fmat,
|
||||||
const BoosterInfo &info,
|
int64_t buffer_offset,
|
||||||
int bst_group) {
|
const BoosterInfo &info,
|
||||||
|
int bst_group) {
|
||||||
|
std::vector<tree::RegTree *> new_trees;
|
||||||
this->InitUpdater();
|
this->InitUpdater();
|
||||||
// create the trees
|
// create the trees
|
||||||
std::vector<tree::RegTree *> new_trees;
|
|
||||||
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
for (int i = 0; i < tparam.num_parallel_tree; ++i) {
|
||||||
new_trees.push_back(new tree::RegTree());
|
new_trees.push_back(new tree::RegTree());
|
||||||
for (size_t j = 0; j < cfg.size(); ++j) {
|
for (size_t j = 0; j < cfg.size(); ++j) {
|
||||||
@ -226,9 +234,12 @@ class GBTree : public IGradBooster {
|
|||||||
// update the trees
|
// update the trees
|
||||||
for (size_t i = 0; i < updaters.size(); ++i) {
|
for (size_t i = 0; i < updaters.size(); ++i) {
|
||||||
updaters[i]->Update(gpair, p_fmat, info, new_trees);
|
updaters[i]->Update(gpair, p_fmat, info, new_trees);
|
||||||
}
|
}
|
||||||
// optimization, update buffer, if possible
|
// optimization, update buffer, if possible
|
||||||
if (buffer_offset >= 0 &&
|
// this is only under distributed column mode
|
||||||
|
// for safety check of lazy checkpoint
|
||||||
|
if (
|
||||||
|
buffer_offset >= 0 &&
|
||||||
new_trees.size() == 1 && updaters.size() > 0 &&
|
new_trees.size() == 1 && updaters.size() > 0 &&
|
||||||
updaters.back()->GetLeafPosition() != NULL) {
|
updaters.back()->GetLeafPosition() != NULL) {
|
||||||
utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
|
utils::Check(info.num_row == p_fmat->buffered_rowset().size(),
|
||||||
@ -238,12 +249,15 @@ class GBTree : public IGradBooster {
|
|||||||
*new_trees[0],
|
*new_trees[0],
|
||||||
updaters.back()->GetLeafPosition());
|
updaters.back()->GetLeafPosition());
|
||||||
}
|
}
|
||||||
// push back to model
|
return new_trees;
|
||||||
|
}
|
||||||
|
// commit new trees all at once
|
||||||
|
inline void CommitModel(const std::vector<tree::RegTree*> &new_trees, int bst_group) {
|
||||||
for (size_t i = 0; i < new_trees.size(); ++i) {
|
for (size_t i = 0; i < new_trees.size(); ++i) {
|
||||||
trees.push_back(new_trees[i]);
|
trees.push_back(new_trees[i]);
|
||||||
tree_info.push_back(bst_group);
|
tree_info.push_back(bst_group);
|
||||||
}
|
}
|
||||||
mparam.num_trees += tparam.num_parallel_tree;
|
mparam.num_trees += static_cast<int>(new_trees.size());
|
||||||
}
|
}
|
||||||
// update buffer by pre-cached position
|
// update buffer by pre-cached position
|
||||||
inline void UpdateBufferByPosition(IFMatrix *p_fmat,
|
inline void UpdateBufferByPosition(IFMatrix *p_fmat,
|
||||||
@ -264,7 +278,7 @@ class GBTree : public IGradBooster {
|
|||||||
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
|
for (int i = 0; i < mparam.size_leaf_vector; ++i) {
|
||||||
pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
|
pred_buffer[bid + i + 1] += new_tree.leafvec(tid)[i];
|
||||||
}
|
}
|
||||||
pred_counter[bid] += 1;
|
pred_counter[bid] += tparam.num_parallel_tree;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// make a prediction for a single instance
|
// make a prediction for a single instance
|
||||||
@ -362,6 +376,8 @@ class GBTree : public IGradBooster {
|
|||||||
int num_parallel_tree;
|
int num_parallel_tree;
|
||||||
/*! \brief whether updater is already initialized */
|
/*! \brief whether updater is already initialized */
|
||||||
int updater_initialized;
|
int updater_initialized;
|
||||||
|
/*! \brief distributed column mode */
|
||||||
|
int distcol_mode;
|
||||||
/*! \brief tree updater sequence */
|
/*! \brief tree updater sequence */
|
||||||
std::string updater_seq;
|
std::string updater_seq;
|
||||||
// construction
|
// construction
|
||||||
@ -370,6 +386,7 @@ class GBTree : public IGradBooster {
|
|||||||
updater_seq = "grow_colmaker,prune";
|
updater_seq = "grow_colmaker,prune";
|
||||||
num_parallel_tree = 1;
|
num_parallel_tree = 1;
|
||||||
updater_initialized = 0;
|
updater_initialized = 0;
|
||||||
|
distcol_mode = 0;
|
||||||
}
|
}
|
||||||
inline void SetParam(const char *name, const char *val){
|
inline void SetParam(const char *name, const char *val){
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@ -378,6 +395,9 @@ class GBTree : public IGradBooster {
|
|||||||
updater_seq = val;
|
updater_seq = val;
|
||||||
updater_initialized = 0;
|
updater_initialized = 0;
|
||||||
}
|
}
|
||||||
|
if (!strcmp(name, "dsplit") && !strcmp(val, "col")) {
|
||||||
|
distcol_mode = 1;
|
||||||
|
}
|
||||||
if (!strcmp(name, "nthread")) {
|
if (!strcmp(name, "nthread")) {
|
||||||
omp_set_num_threads(nthread = atoi(val));
|
omp_set_num_threads(nthread = atoi(val));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -270,6 +270,12 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
||||||
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
|
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
|
||||||
}
|
}
|
||||||
|
/*!
|
||||||
|
* \brief whether the model allows lazy checkpoint
|
||||||
|
*/
|
||||||
|
inline bool AllowLazyCheckPoint(void) const {
|
||||||
|
return gbm_->AllowLazyCheckPoint();
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief evaluate the model for specific iteration
|
* \brief evaluate the model for specific iteration
|
||||||
* \param iter iteration number
|
* \param iter iteration number
|
||||||
|
|||||||
@ -48,7 +48,7 @@ class BoostLearnTask {
|
|||||||
std::string pname = rabit::GetProcessorName();
|
std::string pname = rabit::GetProcessorName();
|
||||||
fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
|
fprintf(stderr, "start %s:%d\n", pname.c_str(), rabit::GetRank());
|
||||||
}
|
}
|
||||||
if (rabit::IsDistributed()) {
|
if (rabit::IsDistributed() && data_split == "NONE") {
|
||||||
this->SetParam("dsplit", "row");
|
this->SetParam("dsplit", "row");
|
||||||
}
|
}
|
||||||
if (rabit::GetRank() != 0) {
|
if (rabit::GetRank() != 0) {
|
||||||
@ -89,6 +89,7 @@ class BoostLearnTask {
|
|||||||
if (!strcmp("fmap", name)) name_fmap = val;
|
if (!strcmp("fmap", name)) name_fmap = val;
|
||||||
if (!strcmp("name_dump", name)) name_dump = val;
|
if (!strcmp("name_dump", name)) name_dump = val;
|
||||||
if (!strcmp("name_pred", name)) name_pred = val;
|
if (!strcmp("name_pred", name)) name_pred = val;
|
||||||
|
if (!strcmp("dsplit", name)) data_split = val;
|
||||||
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
|
if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val);
|
||||||
if (!strncmp("eval[", name, 5)) {
|
if (!strncmp("eval[", name, 5)) {
|
||||||
char evname[256];
|
char evname[256];
|
||||||
@ -116,6 +117,7 @@ class BoostLearnTask {
|
|||||||
name_pred = "pred.txt";
|
name_pred = "pred.txt";
|
||||||
name_dump = "dump.txt";
|
name_dump = "dump.txt";
|
||||||
model_dir_path = "./";
|
model_dir_path = "./";
|
||||||
|
data_split = "NONE";
|
||||||
load_part = 0;
|
load_part = 0;
|
||||||
data = NULL;
|
data = NULL;
|
||||||
}
|
}
|
||||||
@ -172,14 +174,24 @@ class BoostLearnTask {
|
|||||||
inline void TaskTrain(void) {
|
inline void TaskTrain(void) {
|
||||||
int version = rabit::LoadCheckPoint(&learner);
|
int version = rabit::LoadCheckPoint(&learner);
|
||||||
if (version == 0) this->InitLearner();
|
if (version == 0) this->InitLearner();
|
||||||
|
|
||||||
const time_t start = time(NULL);
|
const time_t start = time(NULL);
|
||||||
unsigned long elapsed = 0;
|
unsigned long elapsed = 0;
|
||||||
learner.CheckInit(data);
|
learner.CheckInit(data);
|
||||||
for (int i = version; i < num_round; ++i) {
|
|
||||||
|
bool allow_lazy = learner.AllowLazyCheckPoint();
|
||||||
|
for (int i = version / 2; i < num_round; ++i) {
|
||||||
elapsed = (unsigned long)(time(NULL) - start);
|
elapsed = (unsigned long)(time(NULL) - start);
|
||||||
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
if (version % 2 == 0) {
|
||||||
learner.UpdateOneIter(i, *data);
|
if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed);
|
||||||
|
learner.UpdateOneIter(i, *data);
|
||||||
|
if (allow_lazy) {
|
||||||
|
rabit::LazyCheckPoint(&learner);
|
||||||
|
} else {
|
||||||
|
rabit::CheckPoint(&learner);
|
||||||
|
}
|
||||||
|
version += 1;
|
||||||
|
}
|
||||||
|
utils::Assert(version == rabit::VersionNumber(), "consistent check");
|
||||||
std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
|
std::string res = learner.EvalOneIter(i, devalall, eval_data_names);
|
||||||
if (rabit::IsDistributed()){
|
if (rabit::IsDistributed()){
|
||||||
if (rabit::GetRank() == 0) {
|
if (rabit::GetRank() == 0) {
|
||||||
@ -193,9 +205,13 @@ class BoostLearnTask {
|
|||||||
if (save_period != 0 && (i + 1) % save_period == 0) {
|
if (save_period != 0 && (i + 1) % save_period == 0) {
|
||||||
this->SaveModel(i);
|
this->SaveModel(i);
|
||||||
}
|
}
|
||||||
utils::Assert(rabit::VersionNumber() == i, "incorrect version number");
|
if (allow_lazy) {
|
||||||
// checkpoint the model
|
rabit::LazyCheckPoint(&learner);
|
||||||
rabit::CheckPoint(&learner);
|
} else {
|
||||||
|
rabit::CheckPoint(&learner);
|
||||||
|
}
|
||||||
|
version += 1;
|
||||||
|
utils::Assert(version == rabit::VersionNumber(), "consistent check");
|
||||||
elapsed = (unsigned long)(time(NULL) - start);
|
elapsed = (unsigned long)(time(NULL) - start);
|
||||||
}
|
}
|
||||||
// always save final round
|
// always save final round
|
||||||
@ -272,6 +288,8 @@ class BoostLearnTask {
|
|||||||
std::string task;
|
std::string task;
|
||||||
/*! \brief name of predict file */
|
/*! \brief name of predict file */
|
||||||
std::string name_pred;
|
std::string name_pred;
|
||||||
|
/*! \brief data split mode */
|
||||||
|
std::string data_split;
|
||||||
/*!\brief limit number of trees in prediction */
|
/*!\brief limit number of trees in prediction */
|
||||||
int ntree_limit;
|
int ntree_limit;
|
||||||
/*!\brief whether to directly output margin value */
|
/*!\brief whether to directly output margin value */
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user