diff --git a/Makefile b/Makefile index a8eb89c12..0a5fd7047 100644 --- a/Makefile +++ b/Makefile @@ -3,15 +3,15 @@ export CXX = clang++ export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas # specify tensor path -BIN = xgunity.exe +BIN = xgboost OBJ = io.o .PHONY: clean all all: $(BIN) $(OBJ) export LDFLAGS= -pthread -lm -xgunity.exe: src/xgunity.cpp -io.o: src/io/io.cpp +xgboost: src/xgboost_main.cpp io.o src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp +io.o: src/io/io.cpp src/data.h src/utils/*.h $(BIN) : $(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^) @@ -24,4 +24,3 @@ install: clean: $(RM) $(OBJ) $(BIN) *~ */*~ */*/*~ - diff --git a/src/data.h b/src/data.h index e37565a20..fe81b4dad 100644 --- a/src/data.h +++ b/src/data.h @@ -310,12 +310,11 @@ class FMatrixS : public FMatrixInterface{ const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid); for (size_t i = 0; i < nbatch; ++i, ++num_buffered_row_) { SparseBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < batch.size; ++j) { + for (bst_uint j = 0; j < inst.length; ++j) { builder.AddBudget(inst[j].findex); } } } - builder.InitStorage(); iter_->BeforeFirst(); @@ -325,9 +324,9 @@ class FMatrixS : public FMatrixInterface{ const size_t nbatch = std::min(batch.size, max_nrow - batch.base_rowid); for (size_t i = 0; i < nbatch; ++i) { SparseBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < batch.size; ++j) { + for (bst_uint j = 0; j < inst.length; ++j) { builder.PushElem(inst[j].findex, - Entry((bst_uint)(batch.base_rowid+j), + Entry((bst_uint)(batch.base_rowid+i), inst[j].fvalue)); } } diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index 5a9a3af98..dcc204868 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -7,6 +7,7 @@ */ #include #include "../data.h" +#include "../utils/fmap.h" namespace xgboost { /*! \brief namespace for gradient booster */ @@ -63,6 +64,13 @@ class IGradBooster { int64_t buffer_offset, const std::vector &root_index, std::vector *out_preds) = 0; + /*! + * \brief dump the model in text format + * \param fmap feature map that may help give interpretations of feature + * \param option extra option of the dumo model + * \return a vector of dump for boosters + */ + virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) = 0; // destrcutor virtual ~IGradBooster(void){} }; diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 5ccbcd1f1..1fc90e40c 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -141,6 +141,13 @@ class GBTree : public IGradBooster { } } } + virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) { + std::vector dump; + for (size_t i = 0; i < trees.size(); i++) { + dump.push_back(trees[i]->DumpModel(fmap, option&1)); + } + return dump; + } protected: // clear the model diff --git a/src/io/io.cpp b/src/io/io.cpp index 93d91a61c..2cf42aadf 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -7,9 +7,9 @@ namespace xgboost { namespace io { -DataMatrix* LoadDataMatrix(const char *fname) { +DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) { DMatrixSimple *dmat = new DMatrixSimple(); - dmat->CacheLoad(fname); + dmat->CacheLoad(fname, silent, savebuffer); return dmat; } } // namespace io diff --git a/src/io/io.h b/src/io/io.h index 81f89de89..d6d280d5e 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -17,9 +17,11 @@ typedef learner::DMatrix DataMatrix; /*! * \brief load DataMatrix from stream * \param fname file name to be loaded + * \param silent whether print message during loading + * \param savebuffer whether temporal buffer the file if the file is in text format * \return a loaded DMatrix */ -DataMatrix* LoadDataMatrix(const char *fname); +DataMatrix* LoadDataMatrix(const char *fname, bool silent = false, bool savebuffer = true); /*! * \brief save DataMatrix into stream, * note: the saved dmatrix format may not be in exactly same as input diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index a4ac1e462..184197d45 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "./evaluation.h" #include "./helper_utils.h" diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index e26f6a52d..3c04837c3 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -120,8 +120,8 @@ class BoostLearner { } inline void SaveModel(utils::IStream &fo) const { fo.Write(&mparam, sizeof(ModelParam)); - fo.Write(&name_obj_); - fo.Write(&name_gbm_); + fo.Write(name_obj_); + fo.Write(name_gbm_); gbm_->SaveModel(fo); } /*! @@ -139,7 +139,7 @@ class BoostLearner { * \param p_train pointer to the data matrix */ inline void UpdateOneIter(int iter, DMatrix *p_train) { - this->PredictRaw(preds_, *p_train); + this->PredictRaw(*p_train, &preds_); obj_->GetGradient(preds_, p_train->info, iter, &gpair_); gbm_->DoBoost(gpair_, p_train->fmat, p_train->info.root_index); } @@ -189,7 +189,11 @@ class BoostLearner { this->PredictRaw(data, out_preds); obj_->PredTransform(out_preds); } - + /*! \brief dump model out */ + inline std::vector DumpModel(const utils::FeatMap& fmap, int option) { + return gbm_->DumpModel(fmap, option); + } + protected: /*! * \brief initialize the objective function and GBM, @@ -212,9 +216,9 @@ class BoostLearner { * \param out_preds output vector that stores the prediction */ inline void PredictRaw(const DMatrix &data, - std::vector *out_preds) { + std::vector *out_preds) const { gbm_->Predict(data.fmat, this->FindBufferOffset(data), - data.info, out_preds); + data.info.root_index, out_preds); } /*! \brief training parameter for regression */ @@ -280,7 +284,7 @@ class BoostLearner { inline int64_t FindBufferOffset(const DMatrix &mat) const { for (size_t i = 0; i < cache_.size(); ++i) { if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) { - if (cache_[i].num_row_ == mat.num_row) { + if (cache_[i].num_row_ == mat.info.num_row) { return cache_[i].buffer_offset_; } } diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index 7aa11d338..d5cc97fcf 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -6,6 +6,7 @@ * \author Tianqi Chen, Kailong Chen */ #include +#include #include "./objective.h" namespace xgboost { diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index 1868f8f41..f0624bdeb 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -27,7 +27,6 @@ class ColMaker: public IUpdater { const FMatrix &fmat, const std::vector &root_index, const std::vector &trees) { - for (size_t i = 0; i < trees.size(); ++i) { Builder builder(param); builder.Update(gpair, fmat, root_index, trees[i]); @@ -132,7 +131,9 @@ class ColMaker: public IUpdater { // initialize feature index unsigned ncol = static_cast(fmat.NumCol()); for (unsigned i = 0; i < ncol; ++i) { - if (fmat.GetColSize(i) != 0) feat_index.push_back(i); + if (fmat.GetColSize(i) != 0) { + feat_index.push_back(i); + } } unsigned n = static_cast(param.colsample_bytree * feat_index.size()); random::Shuffle(feat_index); diff --git a/src/xgboost_main.cpp b/src/xgboost_main.cpp new file mode 100644 index 000000000..16139f0d8 --- /dev/null +++ b/src/xgboost_main.cpp @@ -0,0 +1,244 @@ +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE + +#include +#include +#include +#include "io/io.h" +#include "utils/utils.h" +#include "utils/config.h" +#include "learner/learner-inl.hpp" + +namespace xgboost { +/*! + * \brief wrapping the training process + */ +class BoostLearnTask{ + public: + inline int Run(int argc, char *argv[]) { + if (argc < 2) { + printf("Usage: \n"); + return 0; + } + utils::ConfigIterator itr(argv[1]); + while (itr.Next()) { + this->SetParam(itr.name(), itr.val()); + } + for (int i = 2; i < argc; ++i) { + char name[256], val[256]; + if (sscanf(argv[i], "%[^=]=%s", name, val) == 2) { + this->SetParam(name, val); + } + } + this->InitData(); + this->InitLearner(); + if (task == "dump") { + this->TaskDump(); return 0; + } + if (task == "eval") { + this->TaskEval(); return 0; + } + if (task == "pred") { + this->TaskPred(); + } else { + this->TaskTrain(); + } + return 0; + } + inline void SetParam(const char *name, const char *val) { + if (!strcmp("silent", name)) silent = atoi(val); + if (!strcmp("use_buffer", name)) use_buffer = atoi(val); + if (!strcmp("seed", name)) random::Seed(atoi(val)); + if (!strcmp("num_round", name)) num_round = atoi(val); + if (!strcmp("save_period", name)) save_period = atoi(val); + if (!strcmp("eval_train", name)) eval_train = atoi(val); + if (!strcmp("task", name)) task = val; + if (!strcmp("data", name)) train_path = val; + if (!strcmp("test:data", name)) test_path = val; + if (!strcmp("model_in", name)) model_in = val; + if (!strcmp("model_out", name)) model_out = val; + if (!strcmp("model_dir", name)) model_dir_path = val; + if (!strcmp("fmap", name)) name_fmap = val; + if (!strcmp("name_dump", name)) name_dump = val; + if (!strcmp("name_pred", name)) name_pred = val; + if (!strcmp("dump_stats", name)) dump_model_stats = atoi(val); + if (!strncmp("eval[", name, 5)) { + char evname[256]; + utils::Assert(sscanf(name, "eval[%[^]]", evname) == 1, "must specify evaluation name for display"); + eval_data_names.push_back(std::string(evname)); + eval_data_paths.push_back(std::string(val)); + } + learner.SetParam(name, val); + } + public: + BoostLearnTask(void) { + // default parameters + silent = 0; + use_buffer = 1; + num_round = 10; + save_period = 0; + eval_train = 0; + dump_model_stats = 0; + task = "train"; + model_in = "NULL"; + model_out = "NULL"; + name_fmap = "NULL"; + name_pred = "pred.txt"; + name_dump = "dump.txt"; + model_dir_path = "./"; + data = NULL; + } + ~BoostLearnTask(void){ + for (size_t i = 0; i < deval.size(); i++){ + delete deval[i]; + } + if (data != NULL) delete data; + } + private: + inline void InitData(void) { + if (name_fmap != "NULL") fmap.LoadText(name_fmap.c_str()); + if (task == "dump") return; + if (task == "pred") { + data = io::LoadDataMatrix(test_path.c_str(), silent != 0, use_buffer != 0); + } else { + // training + data = io::LoadDataMatrix(train_path.c_str(), silent != 0, use_buffer != 0); + {// intialize column access + data->fmat.InitColAccess(); + } + utils::Assert(eval_data_names.size() == eval_data_paths.size(), "BUG"); + for (size_t i = 0; i < eval_data_names.size(); ++i) { + deval.push_back(io::LoadDataMatrix(eval_data_paths[i].c_str(), silent != 0, use_buffer != 0)); + devalall.push_back(deval.back()); + } + + std::vector dcache(1, data); + for (size_t i = 0; i < deval.size(); ++ i) { + dcache.push_back(deval[i]); + } + // set cache data to be all training and evaluation data + learner.SetCacheData(dcache); + + // add training set to evaluation set if needed + if( eval_train != 0 ) { + devalall.push_back(data); + eval_data_names.push_back(std::string("train")); + } + } + } + inline void InitLearner(void) { + if (model_in != "NULL"){ + utils::FileStream fi(utils::FopenCheck(model_in.c_str(), "rb")); + learner.LoadModel(fi); + fi.Close(); + } else { + utils::Assert(task == "train", "model_in not specified"); + learner.InitModel(); + } + } + inline void TaskTrain(void) { + const time_t start = time(NULL); + unsigned long elapsed = 0; + for (int i = 0; i < num_round; ++i) { + elapsed = (unsigned long)(time(NULL) - start); + if (!silent) printf("boosting round %d, %lu sec elapsed\n", i, elapsed); + learner.UpdateOneIter(i,data); + std::string res = learner.EvalOneIter(i, devalall, eval_data_names); + fprintf(stderr, "%s\n", res.c_str()); + if (save_period != 0 && (i + 1) % save_period == 0) { + this->SaveModel(i); + } + elapsed = (unsigned long)(time(NULL) - start); + } + // always save final round + if ((save_period == 0 || num_round % save_period != 0) && model_out != "NONE") { + if (model_out == "NULL"){ + this->SaveModel(num_round - 1); + } else { + this->SaveModel(model_out.c_str()); + } + } + if (!silent){ + printf("\nupdating end, %lu sec in all\n", elapsed); + } + } + inline void TaskEval(void) { + learner.EvalOneIter(0, devalall, eval_data_names); + } + inline void TaskDump(void){ + FILE *fo = utils::FopenCheck(name_dump.c_str(), "w"); + std::vector dump = learner.DumpModel(fmap, dump_model_stats != 0); + for (size_t i = 0; i < dump.size(); ++ i) { + fprintf(fo,"booster[%lu]:\n", i); + fprintf(fo,"%s", dump[i].c_str()); + } + fclose(fo); + } + inline void SaveModel(const char *fname) const { + utils::FileStream fo(utils::FopenCheck(fname, "wb")); + learner.SaveModel(fo); + fo.Close(); + } + inline void SaveModel(int i) const { + char fname[256]; + sprintf(fname, "%s/%04d.model", model_dir_path.c_str(), i + 1); + this->SaveModel(fname); + } + inline void TaskPred(void) { + std::vector preds; + if (!silent) printf("start prediction...\n"); + learner.Predict(*data, &preds); + if (!silent) printf("writing prediction to %s\n", name_pred.c_str()); + FILE *fo = utils::FopenCheck(name_pred.c_str(), "w"); + for (size_t i = 0; i < preds.size(); i++) { + fprintf(fo, "%f\n", preds[i]); + } + fclose(fo); + } + private: + /* \brief whether silent */ + int silent; + /* \brief whether use auto binary buffer */ + int use_buffer; + /* \brief whether evaluate training statistics */ + int eval_train; + /* \brief number of boosting iterations */ + int num_round; + /* \brief the period to save the model, 0 means only save the final round model */ + int save_period; + /* \brief the path of training/test data set */ + std::string train_path, test_path; + /* \brief the path of test model file, or file to restart training */ + std::string model_in; + /* \brief the path of final model file, to be saved */ + std::string model_out; + /* \brief the path of directory containing the saved models */ + std::string model_dir_path; + /* \brief task to perform */ + std::string task; + /* \brief name of predict file */ + std::string name_pred; + /* \brief whether dump statistics along with model */ + int dump_model_stats; + /* \brief name of feature map */ + std::string name_fmap; + /* \brief name of dump file */ + std::string name_dump; + /* \brief the paths of validation data sets */ + std::vector eval_data_paths; + /* \brief the names of the evaluation data used in output log */ + std::vector eval_data_names; + private: + io::DataMatrix* data; + std::vector deval; + std::vector devalall; + utils::FeatMap fmap; + learner::BoostLearner learner; +}; +} + +int main(int argc, char *argv[]){ + xgboost::random::Seed(0); + xgboost::BoostLearnTask tsk; + return tsk.Run(argc, argv); +}