pas mock, need to fix rabit lib for not initialization
This commit is contained in:
parent
b078663982
commit
31eedfea59
1
.gitignore
vendored
1
.gitignore
vendored
@ -49,3 +49,4 @@ Debug
|
|||||||
xgboost
|
xgboost
|
||||||
xgboost-mpi
|
xgboost-mpi
|
||||||
train*
|
train*
|
||||||
|
rabit
|
||||||
|
|||||||
2
Makefile
2
Makefile
@ -46,7 +46,7 @@ xgboost: updater.o gbm.o io.o main.o librabit
|
|||||||
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit
|
wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h updater.o gbm.o io.o librabit
|
||||||
|
|
||||||
$(BIN) :
|
$(BIN) :
|
||||||
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit
|
$(CXX) $(CFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit_mock
|
||||||
|
|
||||||
$(SLIB) :
|
$(SLIB) :
|
||||||
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit
|
$(CXX) $(CFLAGS) -fPIC -shared -o $@ $(filter %.cpp %.o %.c, $^) $(LDFLAGS) -lrabit
|
||||||
|
|||||||
@ -1,7 +1,9 @@
|
|||||||
Distributed XGBoost: Column Split Version
|
Distributed XGBoost: Column Split Version
|
||||||
====
|
====
|
||||||
* run ```bash mushroom-col-rabit.sh <n-process>```
|
* run ```bash mushroom-col-rabit.sh <n-process>```
|
||||||
- mushroom-col-tcp.sh starts xgboost job using rabit's allreduce
|
- mushroom-col-rabit.sh starts xgboost job using rabit's allreduce
|
||||||
|
* run ```bash mushroom-col-rabit-mock.sh <n-process>```
|
||||||
|
- mushroom-col-rabit-mock.sh starts xgboost job using rabit's allreduce, inserts suicide signal at certain point and test recovery
|
||||||
* run ```bash mushroom-col-mpi.sh <n-mpi-process>```
|
* run ```bash mushroom-col-mpi.sh <n-mpi-process>```
|
||||||
- mushroom-col.sh starts xgboost-mpi job
|
- mushroom-col.sh starts xgboost-mpi job
|
||||||
|
|
||||||
|
|||||||
25
multi-node/col-split/mushroom-col-rabit-mock.sh
Executable file
25
multi-node/col-split/mushroom-col-rabit-mock.sh
Executable file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
if [[ $# -ne 1 ]]
|
||||||
|
then
|
||||||
|
echo "Usage: nprocess"
|
||||||
|
exit -1
|
||||||
|
fi
|
||||||
|
|
||||||
|
#
|
||||||
|
# This script is same as mushroom-col except that we will be using xgboost instead of xgboost-mpi
|
||||||
|
# xgboost used built in tcp-based allreduce module, and can be run on more enviroment, so long as we know how to start job by modifying ../submit_job_tcp.py
|
||||||
|
#
|
||||||
|
rm -rf train.col* *.model
|
||||||
|
k=$1
|
||||||
|
|
||||||
|
# split the lib svm file into k subfiles
|
||||||
|
python splitsvm.py ../../demo/data/agaricus.txt.train train $k
|
||||||
|
|
||||||
|
# run xgboost mpi
|
||||||
|
../../rabit/tracker/rabit_mpi.py $k local ../../rabit/test/keepalive.sh ../../xgboost mushroom-col.conf dsplit=col mock=0,0,1,0 mock=1,1,0,0
|
||||||
|
|
||||||
|
# the model can be directly loaded by single machine xgboost solver, as usuall
|
||||||
|
#../../xgboost mushroom-col.conf task=dump model_in=0002.model fmap=../../demo/data/featmap.txt name_dump=dump.nice.$k.txt
|
||||||
|
|
||||||
|
|
||||||
|
#cat dump.nice.$k.txt
|
||||||
@ -53,19 +53,13 @@ class GBTree : public IGradBooster {
|
|||||||
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
|
utils::Check(fi.Read(&tree_info[0], sizeof(int) * mparam.num_trees) != 0,
|
||||||
"GBTree: invalid model file");
|
"GBTree: invalid model file");
|
||||||
}
|
}
|
||||||
if (mparam.num_pbuffer != 0) {
|
if (mparam.num_pbuffer != 0 && with_pbuffer) {
|
||||||
pred_buffer.resize(mparam.PredBufferSize());
|
pred_buffer.resize(mparam.PredBufferSize());
|
||||||
pred_counter.resize(mparam.PredBufferSize());
|
pred_counter.resize(mparam.PredBufferSize());
|
||||||
if (with_pbuffer) {
|
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
|
||||||
utils::Check(fi.Read(&pred_buffer[0], pred_buffer.size() * sizeof(float)) != 0,
|
"GBTree: invalid model file");
|
||||||
"GBTree: invalid model file");
|
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
|
||||||
utils::Check(fi.Read(&pred_counter[0], pred_counter.size() * sizeof(unsigned)) != 0,
|
"GBTree: invalid model file");
|
||||||
"GBTree: invalid model file");
|
|
||||||
} else {
|
|
||||||
// reset predict buffer if the input do not have them
|
|
||||||
std::fill(pred_buffer.begin(), pred_buffer.end(), 0.0f);
|
|
||||||
std::fill(pred_counter.begin(), pred_counter.end(), 0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
|
virtual void SaveModel(utils::IStream &fo, bool with_pbuffer) const {
|
||||||
@ -77,7 +71,7 @@ class GBTree : public IGradBooster {
|
|||||||
if (tree_info.size() != 0) {
|
if (tree_info.size() != 0) {
|
||||||
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
|
fo.Write(&tree_info[0], sizeof(int) * tree_info.size());
|
||||||
}
|
}
|
||||||
if (mparam.num_pbuffer != 0 && with_pbuffer) {
|
if (mparam.num_pbuffer != 0 && with_pbuffer) {
|
||||||
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
|
fo.Write(&pred_buffer[0], pred_buffer.size() * sizeof(float));
|
||||||
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
|
fo.Write(&pred_counter[0], pred_counter.size() * sizeof(unsigned));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -48,11 +48,9 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
* \param mats array of pointers to matrix whose prediction result need to be cached
|
* \param mats array of pointers to matrix whose prediction result need to be cached
|
||||||
*/
|
*/
|
||||||
inline void SetCacheData(const std::vector<DMatrix*>& mats) {
|
inline void SetCacheData(const std::vector<DMatrix*>& mats) {
|
||||||
// estimate feature bound
|
utils::Assert(cache_.size() == 0, "can only call cache data once");
|
||||||
unsigned num_feature = 0;
|
|
||||||
// assign buffer index
|
// assign buffer index
|
||||||
size_t buffer_size = 0;
|
size_t buffer_size = 0;
|
||||||
utils::Assert(cache_.size() == 0, "can only call cache data once");
|
|
||||||
for (size_t i = 0; i < mats.size(); ++i) {
|
for (size_t i = 0; i < mats.size(); ++i) {
|
||||||
bool dupilicate = false;
|
bool dupilicate = false;
|
||||||
for (size_t j = 0; j < i; ++j) {
|
for (size_t j = 0; j < i; ++j) {
|
||||||
@ -63,16 +61,10 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
mats[i]->cache_learner_ptr_ = this;
|
mats[i]->cache_learner_ptr_ = this;
|
||||||
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
|
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
|
||||||
buffer_size += mats[i]->info.num_row();
|
buffer_size += mats[i]->info.num_row();
|
||||||
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
|
|
||||||
}
|
}
|
||||||
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
|
|
||||||
char str_temp[25];
|
char str_temp[25];
|
||||||
if (num_feature > mparam.num_feature) {
|
utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
|
||||||
utils::SPrintf(str_temp, sizeof(str_temp), "%u", num_feature);
|
static_cast<unsigned long>(buffer_size));
|
||||||
this->SetParam("bst:num_feature", str_temp);
|
|
||||||
}
|
|
||||||
utils::SPrintf(str_temp, sizeof(str_temp), "%lu",
|
|
||||||
static_cast<unsigned long>(buffer_size));
|
|
||||||
this->SetParam("num_pbuffer", str_temp);
|
this->SetParam("num_pbuffer", str_temp);
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
|
utils::Printf("buffer_size=%ld\n", static_cast<long>(buffer_size));
|
||||||
@ -126,10 +118,29 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
|
cfg_.push_back(std::make_pair(std::string(name), std::string(val)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// this is an internal function
|
||||||
|
// initialize the trainer, called at InitModel and LoadModel
|
||||||
|
inline void InitTrainer(bool calc_num_feature = true) {
|
||||||
|
if (calc_num_feature) {
|
||||||
|
// estimate feature bound
|
||||||
|
unsigned num_feature = 0;
|
||||||
|
for (size_t i = 0; i < cache_.size(); ++i) {
|
||||||
|
num_feature = std::max(num_feature,
|
||||||
|
static_cast<unsigned>(cache_[i].mat_->info.num_col()));
|
||||||
|
}
|
||||||
|
// run allreduce on num_feature to find the maximum value
|
||||||
|
rabit::Allreduce<rabit::op::Max>(&num_feature, 1);
|
||||||
|
if (num_feature > mparam.num_feature) mparam.num_feature = num_feature;
|
||||||
|
}
|
||||||
|
char str_temp[25];
|
||||||
|
utils::SPrintf(str_temp, sizeof(str_temp), "%d", mparam.num_feature);
|
||||||
|
this->SetParam("bst:num_feature", str_temp);
|
||||||
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief initialize the model
|
* \brief initialize the model
|
||||||
*/
|
*/
|
||||||
inline void InitModel(void) {
|
inline void InitModel(void) {
|
||||||
|
this->InitTrainer();
|
||||||
// initialize model
|
// initialize model
|
||||||
this->InitObjGBM();
|
this->InitObjGBM();
|
||||||
// reset the base score
|
// reset the base score
|
||||||
@ -141,8 +152,9 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
* \brief load model from stream
|
* \brief load model from stream
|
||||||
* \param fi input stream
|
* \param fi input stream
|
||||||
* \param with_pbuffer whether to load with predict buffer
|
* \param with_pbuffer whether to load with predict buffer
|
||||||
|
* \param calc_num_feature whether call InitTrainer with calc_num_feature
|
||||||
*/
|
*/
|
||||||
inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true) {
|
inline void LoadModel(utils::IStream &fi, bool with_pbuffer = true, bool calc_num_feature = true) {
|
||||||
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
utils::Check(fi.Read(&mparam, sizeof(ModelParam)) != 0,
|
||||||
"BoostLearner: wrong model format");
|
"BoostLearner: wrong model format");
|
||||||
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
|
utils::Check(fi.Read(&name_obj_), "BoostLearner: wrong model format");
|
||||||
@ -150,9 +162,10 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
// delete existing gbm if any
|
// delete existing gbm if any
|
||||||
if (obj_ != NULL) delete obj_;
|
if (obj_ != NULL) delete obj_;
|
||||||
if (gbm_ != NULL) delete gbm_;
|
if (gbm_ != NULL) delete gbm_;
|
||||||
|
this->InitTrainer(calc_num_feature);
|
||||||
this->InitObjGBM();
|
this->InitObjGBM();
|
||||||
gbm_->LoadModel(fi, with_pbuffer);
|
gbm_->LoadModel(fi, with_pbuffer);
|
||||||
if (with_pbuffer && distributed_mode == 2 && rabit::GetRank() != 0) {
|
if (!with_pbuffer || distributed_mode == 2) {
|
||||||
gbm_->ResetPredBuffer(pred_buffer_size);
|
gbm_->ResetPredBuffer(pred_buffer_size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -160,7 +173,7 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
virtual void Load(rabit::IStream &fi) {
|
virtual void Load(rabit::IStream &fi) {
|
||||||
RabitStreamAdapter fs(fi);
|
RabitStreamAdapter fs(fi);
|
||||||
// for row split, we should not keep pbuffer
|
// for row split, we should not keep pbuffer
|
||||||
this->LoadModel(fs, distributed_mode != 2);
|
this->LoadModel(fs, distributed_mode != 2, false);
|
||||||
}
|
}
|
||||||
// rabit save model to rabit checkpoint
|
// rabit save model to rabit checkpoint
|
||||||
virtual void Save(rabit::IStream &fo) const {
|
virtual void Save(rabit::IStream &fo) const {
|
||||||
@ -209,9 +222,12 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
* \param p_train pointer to the data matrix
|
* \param p_train pointer to the data matrix
|
||||||
*/
|
*/
|
||||||
inline void UpdateOneIter(int iter, const DMatrix &train) {
|
inline void UpdateOneIter(int iter, const DMatrix &train) {
|
||||||
|
printf("!!UpdateOneIter\n");
|
||||||
this->PredictRaw(train, &preds_);
|
this->PredictRaw(train, &preds_);
|
||||||
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
obj_->GetGradient(preds_, train.info, iter, &gpair_);
|
||||||
|
printf("!!UpdateOneDoboost\n");
|
||||||
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
|
gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_);
|
||||||
|
printf("!!UpdateOneIter finish\n");
|
||||||
}
|
}
|
||||||
/*!
|
/*!
|
||||||
* \brief evaluate the model for specific iteration
|
* \brief evaluate the model for specific iteration
|
||||||
@ -335,7 +351,7 @@ class BoostLearner : public rabit::ISerializable {
|
|||||||
/* \brief number of class, if it is multi-class classification */
|
/* \brief number of class, if it is multi-class classification */
|
||||||
int num_class;
|
int num_class;
|
||||||
/*! \brief reserved field */
|
/*! \brief reserved field */
|
||||||
int reserved[32];
|
int reserved[31];
|
||||||
/*! \brief constructor */
|
/*! \brief constructor */
|
||||||
ModelParam(void) {
|
ModelParam(void) {
|
||||||
base_score = 0.5f;
|
base_score = 0.5f;
|
||||||
|
|||||||
@ -48,8 +48,7 @@ class BoostLearnTask {
|
|||||||
}
|
}
|
||||||
if (rabit::GetRank() != 0) {
|
if (rabit::GetRank() != 0) {
|
||||||
this->SetParam("silent", "2");
|
this->SetParam("silent", "2");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (task == "train") {
|
if (task == "train") {
|
||||||
// if task is training, will try recover from checkpoint
|
// if task is training, will try recover from checkpoint
|
||||||
this->TaskTrain();
|
this->TaskTrain();
|
||||||
@ -151,7 +150,7 @@ class BoostLearnTask {
|
|||||||
learner.SetCacheData(dcache);
|
learner.SetCacheData(dcache);
|
||||||
|
|
||||||
// add training set to evaluation set if needed
|
// add training set to evaluation set if needed
|
||||||
if( eval_train != 0 ) {
|
if (eval_train != 0) {
|
||||||
devalall.push_back(data);
|
devalall.push_back(data);
|
||||||
eval_data_names.push_back(std::string("train"));
|
eval_data_names.push_back(std::string("train"));
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user