From d75e3ed05d9f2c88eb1bc79393e8790b561c030b Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 5 Jan 2016 21:49:48 -0800 Subject: [PATCH] [LIBXGBOOST] pass demo running. --- .gitignore | 2 + Makefile | 23 +- amalgamation/xgboost-all0.cc | 51 ++ demo/guide-python/runall.sh | 3 +- dmlc-core | 2 +- include/xgboost/base.h | 11 + include/xgboost/c_api.h | 7 - include/xgboost/data.h | 19 +- include/xgboost/gbm.h | 2 +- include/xgboost/learner.h | 7 + include/xgboost/logging.h | 50 ++ include/xgboost/metric.h | 4 +- include/xgboost/objective.h | 2 +- include/xgboost/tree_model.h | 2 +- include/xgboost/tree_updater.h | 2 +- old_src/io/dmlc_simple.cpp | 229 ------- old_src/io/libsvm_parser.h | 212 ------- old_src/io/simple_fmatrix-inl.hpp | 374 ------------ old_src/learner/dmatrix.h | 176 ------ python-package/xgboost/libpath.py | 8 +- rabit | 2 +- src/c_api/c_api.cc | 528 ++++++++++++++++ src/c_api/c_api_error.cc | 21 + src/c_api/c_api_error.h | 39 ++ src/cli_main.cc | 34 +- src/common/base64.h | 2 +- src/common/common.cc | 15 + src/common/quantile.h | 2 +- src/common/thread_local.h | 77 +++ src/data/data.cc | 84 +++ src/data/simple_csr_source.cc | 4 +- src/data/simple_dmatrix.cc | 265 ++++++++ src/data/simple_dmatrix.h | 119 ++++ {old_src/io => src/data}/sparse_batch_page.h | 132 ++-- src/gbm/gblinear.cc | 9 +- src/gbm/gbm.cc | 29 + src/gbm/gbtree.cc | 5 +- src/global.cc | 72 --- src/learner.cc | 35 +- src/logging.cc | 20 + src/metric/elementwise_metric.cc | 5 +- src/metric/metric.cc | 42 ++ src/metric/multiclass_metric.cc | 4 +- src/metric/rank_metric.cc | 4 + src/objective/multiclass_obj.cc | 4 +- src/objective/objective.cc | 34 ++ src/objective/rank_obj.cc | 5 +- src/objective/regression_obj.cc | 11 +- src/tree/param.h | 2 +- src/tree/tree_updater.cc | 35 ++ src/tree/updater_colmaker.cc | 4 +- src/tree/updater_histmaker.cc | 3 + src/tree/updater_prune.cc | 3 + src/tree/updater_refresh.cc | 3 + src/tree/updater_skmaker.cc | 3 +- src/tree/updater_sync.cc | 3 + 
wrapper/.gitignore | 3 - wrapper/README.md | 9 - wrapper/xgboost_wrapper.cpp | 599 ------------------- 59 files changed, 1611 insertions(+), 1845 deletions(-) create mode 100644 amalgamation/xgboost-all0.cc create mode 100644 include/xgboost/logging.h delete mode 100644 old_src/io/dmlc_simple.cpp delete mode 100644 old_src/io/libsvm_parser.h delete mode 100644 old_src/io/simple_fmatrix-inl.hpp delete mode 100644 old_src/learner/dmatrix.h create mode 100644 src/c_api/c_api.cc create mode 100644 src/c_api/c_api_error.cc create mode 100644 src/c_api/c_api_error.h create mode 100644 src/common/common.cc create mode 100644 src/common/thread_local.h create mode 100644 src/data/simple_dmatrix.cc create mode 100644 src/data/simple_dmatrix.h rename {old_src/io => src/data}/sparse_batch_page.h (62%) create mode 100644 src/gbm/gbm.cc delete mode 100644 src/global.cc create mode 100644 src/logging.cc create mode 100644 src/metric/metric.cc create mode 100644 src/objective/objective.cc create mode 100644 src/tree/tree_updater.cc delete mode 100644 wrapper/.gitignore delete mode 100644 wrapper/README.md delete mode 100644 wrapper/xgboost_wrapper.cpp diff --git a/.gitignore b/.gitignore index 3d427e30d..97982775a 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,5 @@ nb-configuration* .settings/ build config.mk +xgboost +*.data diff --git a/Makefile b/Makefile index 50087eb2c..61964aacf 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ ifeq ($(OS), Windows_NT) endif export LDFLAGS= -pthread -lm $(ADD_LDFLAGS) $(DMLC_LDFLAGS) -export CFLAGS= -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) +export CFLAGS= -std=c++0x -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops -fPIC -Iinclude $(ADD_CFLAGS) CFLAGS += -I$(DMLC_CORE)/include -I$(RABIT)/include ifndef LINT_LANG @@ -65,16 +65,27 @@ $(DMLC_CORE)/libdmlc.a: $(RABIT)/lib/$(LIB_RABIT): + cd $(RABIT); make lib/$(LIB_RABIT); cd $(ROOTDIR) + SRC = $(wildcard src/*.cc src/*/*.cc) ALL_OBJ = 
$(patsubst src/%.cc, build/%.o, $(SRC)) +AMALGA_OBJ = amalgamation/xgboost-all0.o LIB_DEP = $(DMLC_CORE)/libdmlc.a $(RABIT)/lib/$(LIB_RABIT) ALL_DEP = $(filter-out build/cli_main.o, $(ALL_OBJ)) $(LIB_DEP) CLI_OBJ = build/cli_main.o build/%.o: src/%.cc @mkdir -p $(@D) - $(CXX) -std=c++0x $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d - $(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@ + $(CXX) $(CFLAGS) -MM -MT build/$*.o $< >build/$*.d + $(CXX) -c $(CFLAGS) -c $< -o $@ + +# The should be equivalent to $(ALL_OBJ) except for build/cli_main.o +amalgamation/xgboost-all0.o: amalgamation/xgboost-all0.cc + $(CXX) -c $(CFLAGS) -c $< -o $@ + +# Equivalent to lib/libxgboost_all.so +lib/libxgboost_all.so: $(AMALGA_OBJ) $(LIB_DEP) + @mkdir -p $(@D) + $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) lib/libxgboost.a: $(ALL_DEP) @mkdir -p $(@D) @@ -84,14 +95,14 @@ lib/libxgboost.so: $(ALL_DEP) @mkdir -p $(@D) $(CXX) $(CFLAGS) -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS) -xgboost: lib/libxgboost.a $(CLI_OBJ) $(LIB_DEP) - $(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS) +xgboost: $(CLI_OBJ) lib/libxgboost.a $(LIB_DEP) + $(CXX) $(CFLAGS) -o $@ $(filter %.o %.a, $^) $(LDFLAGS) lint: python2 dmlc-core/scripts/lint.py xgboost ${LINT_LANG} include src clean: - $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ + $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ $(AMALGA_OBJ) clean_all: clean cd $(DMLC_CORE); make clean; cd - diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc new file mode 100644 index 000000000..9e3dba90c --- /dev/null +++ b/amalgamation/xgboost-all0.cc @@ -0,0 +1,51 @@ +/*! + * Copyright 2015 by Contributors. + * \brief XGBoost Amalgamation. + * This offers an alternative way to compile the entire library from this single file. + * + * Example usage command. + * - $(CXX) -std=c++0x -fopenmp -o -shared libxgboost.so xgboost-all0.cc -ldmlc -lrabit + * + * \author Tianqi Chen. 
+ */ + +// metrics +#include "../src/metric/metric.cc" +#include "../src/metric/elementwise_metric.cc" +#include "../src/metric/multiclass_metric.cc" +#include "../src/metric/rank_metric.cc" + +// objectives +#include "../src/objective/objective.cc" +#include "../src/objective/regression_obj.cc" +#include "../src/objective/multiclass_obj.cc" +#include "../src/objective/rank_obj.cc" + +// gbms +#include "../src/gbm/gbm.cc" +#include "../src/gbm/gbtree.cc" +#include "../src/gbm/gblinear.cc" + +// data +#include "../src/data/data.cc" +#include "../src/data/simple_csr_source.cc" +#include "../src/data/simple_dmatrix.cc" + +// tress +#include "../src/tree/tree_model.cc" +#include "../src/tree/tree_updater.cc" +#include "../src/tree/updater_colmaker.cc" +#include "../src/tree/updater_prune.cc" +#include "../src/tree/updater_refresh.cc" +#include "../src/tree/updater_sync.cc" +#include "../src/tree/updater_histmaker.cc" +#include "../src/tree/updater_skmaker.cc" + +// global +#include "../src/learner.cc" +#include "../src/logging.cc" +#include "../src/common/common.cc" + +// c_api +#include "../src/c_api/c_api.cc" +#include "../src/c_api/c_api_error.cc" diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh index 5c8ddf93c..21fa59de2 100755 --- a/demo/guide-python/runall.sh +++ b/demo/guide-python/runall.sh @@ -1,4 +1,5 @@ #!/bin/bash +export PYTHONPATH=PYTHONPATH:../../python-package python basic_walkthrough.py python custom_objective.py python boost_from_prediction.py @@ -9,4 +10,4 @@ python predict_leaf_indices.py python sklearn_examples.py python sklearn_parallel.py python external_memory.py -rm -rf *~ *.model *.buffer +rm -rf *~ *.model *.buffer diff --git a/dmlc-core b/dmlc-core index ec4542185..c0325077a 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit ec454218564fee8e531aee02b8943a4634330ce1 +Subproject commit c0325077a3ceda08fe04b2aa115e004a3520630a diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 
f07f4b9ee..479cfc7ca 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -16,6 +16,15 @@ #define XGBOOST_STRICT_R_MODE 0 #endif +/*! + * \brief Whether always log console message with time. + * It will display like, with timestamp appended to head of the message. + * "[21:47:50] 6513x126 matrix with 143286 entries loaded from ../data/agaricus.txt.train" + */ +#ifndef XGBOOST_LOG_WITH_TIME +#define XGBOOST_LOG_WITH_TIME 0 +#endif + /*! \brief namespace of xgboo st*/ namespace xgboost { /*! @@ -23,6 +32,8 @@ namespace xgboost { * used for feature index and row index. */ typedef uint32_t bst_uint; +/*! \brief long integers */ +typedef unsigned long bst_ulong; // NOLINT(*) /*! \brief float type, used for storing statistics */ typedef float bst_float; diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index e30ec4c9d..22bc7aa51 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -36,13 +36,6 @@ typedef void *BoosterHandle; */ XGB_DLL const char *XGBGetLastError(); -/*! - * \brief Entry point of CLI program. - * \param argc The number of arguments. - * \param argv The command line arguments. - */ -XGB_DLL int XGBoostCLIMain(int argc, char* argv[]) - /*! * \brief load a data matrix * \param fname the name of the file diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 816ecdaa0..39d488696 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -59,7 +59,7 @@ struct MetaInfo { /*! \brief version flag, used to check version of this info */ static const int kVersion = 1; /*! \brief default constructor */ - MetaInfo() : num_row(0), num_col(0) {} + MetaInfo() : num_row(0), num_col(0), num_nonzero(0) {} /*! * \brief Get weight of each instances. * \param i Instance index. @@ -96,14 +96,6 @@ struct MetaInfo { * \param num Number of elements in the source array. */ void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num); - /*! - * \brief Get information from meta info. 
- * \param key The key of the information. - * \param dptr The output data pointer of the source array. - * \param dtype The output data type of the information array. - * \param num Number of elements in the array. - */ - void GetInfo(const char* key, const void** dptr, DataType* dtype, size_t* num) const; }; /*! \brief read-only sparse instance batch in CSR format */ @@ -259,11 +251,14 @@ class DMatrix { * \param uri The URI of input. * \param silent Whether print information during loading. * \param load_row_split Flag to read in part of rows, divided among the workers in distributed mode. + * \param file_format The format type of the file, used for dmlc::Parser::Create. + * By default "auto" will be able to load in both local binary file. * \return The created DMatrix. */ static DMatrix* Load(const std::string& uri, bool silent, - bool load_row_split); + bool load_row_split, + const std::string& file_format = "auto"); /*! * \brief create a new DMatrix, by wrapping a row_iterator, and meta info. * \param source The source iterator of the data, the create function takes ownership of the source. @@ -273,7 +268,7 @@ class DMatrix { * \return a Created DMatrix. */ static DMatrix* Create(std::unique_ptr&& source, - const char* cache_prefix = nullptr); + const std::string& cache_prefix = ""); /*! * \brief Create a DMatrix by loaidng data from parser. * Parser can later be deleted after the DMatrix i created. @@ -287,7 +282,7 @@ class DMatrix { * \return A created DMatrix. */ static DMatrix* Create(dmlc::Parser* parser, - const char* cache_prefix = nullptr); + const std::string& cache_prefix = ""); private: // allow learner class to access this field. 
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 65a780eb9..15ccd30ef 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -163,7 +163,7 @@ struct GradientBoosterReg */ #define XGBOOST_REGISTER_GBM(UniqueId, Name) \ static ::xgboost::GradientBoosterReg & __make_ ## GradientBoosterReg ## _ ## UniqueId ## __ = \ - ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->__REGISTER__(#Name) + ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->__REGISTER__(Name) } // namespace xgboost #endif // XGBOOST_GBM_H_ diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 166db5779..2d5c5702e 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -36,6 +36,8 @@ namespace xgboost { */ class Learner : public rabit::Serializable { public: + /*! \brief virtual destructor */ + virtual ~Learner() {} /*! * \brief set configuration from pair iterators. * \param begin The beginning iterator. @@ -51,6 +53,11 @@ class Learner : public rabit::Serializable { * \param cfg configurations on both training and model parameters. */ virtual void Configure(const std::vector >& cfg) = 0; + /*! + * \brief Initialize the model using the specified configurations via Configure. + * An model have to be either Loaded or initialized before Update/Predict/Save can be called. + */ + virtual void InitModel() = 0; /*! * \brief load model from stream * \param fi input stream. diff --git a/include/xgboost/logging.h b/include/xgboost/logging.h new file mode 100644 index 000000000..03887fb61 --- /dev/null +++ b/include/xgboost/logging.h @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file logging.h + * \brief defines console logging options for xgboost. + * Use to enforce unified print behavior. + * For debug loggers, use LOG(INFO) and LOG(ERROR). 
+ */ +#ifndef XGBOOST_LOGGING_H_ +#define XGBOOST_LOGGING_H_ + +#include +#include +#include "./base.h" + +namespace xgboost { + +class BaseLogger { + public: + BaseLogger() { +#if XGBOOST_LOG_WITH_TIME + log_stream_ << "[" << dmlc::DateLogger().HumanDate() << "] "; +#endif + } + std::ostream& stream() { return log_stream_; } + + protected: + std::ostringstream log_stream_; +}; + +class ConsoleLogger : public BaseLogger { + public: + ~ConsoleLogger(); +}; + +class TrackerLogger : public BaseLogger { + public: + ~TrackerLogger(); +}; + +// redefines the logging macro if not existed +#ifndef LOG +#define LOG(severity) LOG_##severity.stream() +#endif + +// Enable LOG(CONSOLE) for print messages to console. +#define LOG_CONSOLE ::xgboost::ConsoleLogger() +// Enable LOG(TRACKER) for print messages to tracker +#define LOG_TRACKER ::xgboost::TrackerLogger() +} // namespace xgboost. +#endif // XGBOOST_LOGGING_H_ diff --git a/include/xgboost/metric.h b/include/xgboost/metric.h index 30415756a..02580cbf1 100644 --- a/include/xgboost/metric.h +++ b/include/xgboost/metric.h @@ -70,7 +70,7 @@ struct MetricReg * \endcode */ #define XGBOOST_REGISTER_METRIC(UniqueId, Name) \ - static ::xgboost::MetricReg & __make_ ## MetricReg ## _ ## UniqueId ## __ = \ - ::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(#Name) + ::xgboost::MetricReg& __make_ ## MetricReg ## _ ## UniqueId ## __ = \ + ::dmlc::Registry< ::xgboost::MetricReg>::Get()->__REGISTER__(Name) } // namespace xgboost #endif // XGBOOST_METRIC_H_ diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h index 64439587b..732644dd5 100644 --- a/include/xgboost/objective.h +++ b/include/xgboost/objective.h @@ -106,6 +106,6 @@ struct ObjFunctionReg */ #define XGBOOST_REGISTER_OBJECTIVE(UniqueId, Name) \ static ::xgboost::ObjFunctionReg & __make_ ## ObjFunctionReg ## _ ## UniqueId ## __ = \ - ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->__REGISTER__(#Name) + ::dmlc::Registry< 
::xgboost::ObjFunctionReg>::Get()->__REGISTER__(Name) } // namespace xgboost #endif // XGBOOST_OBJECTIVE_H_ diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 923cfaa7e..4ffce8785 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -8,7 +8,6 @@ #define XGBOOST_TREE_MODEL_H_ #include -#include #include #include #include @@ -17,6 +16,7 @@ #include #include "./base.h" #include "./data.h" +#include "./logging.h" #include "./feature_map.h" namespace xgboost { diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 6d2195be5..c4c9275c5 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -79,7 +79,7 @@ struct TreeUpdaterReg */ #define XGBOOST_REGISTER_TREE_UPDATER(UniqueId, Name) \ static ::xgboost::TreeUpdaterReg& __make_ ## TreeUpdaterReg ## _ ## UniqueId ## __ = \ - ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->__REGISTER__(#Name) + ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->__REGISTER__(Name) } // namespace xgboost #endif // XGBOOST_TREE_UPDATER_H_ diff --git a/old_src/io/dmlc_simple.cpp b/old_src/io/dmlc_simple.cpp deleted file mode 100644 index 0448bd578..000000000 --- a/old_src/io/dmlc_simple.cpp +++ /dev/null @@ -1,229 +0,0 @@ -// Copyright by Contributors -#define _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_DEPRECATE -#define NOMINMAX -#include -#include "../utils/io.h" - -// implements a single no split version of DMLC -// in case we want to avoid dependency on dmlc-core - -namespace xgboost { -namespace utils { -/*! 
- * \brief line split implementation from single FILE - * simply returns lines of files, used for stdin - */ -class SingleFileSplit : public dmlc::InputSplit { - public: - explicit SingleFileSplit(const char *fname) - : use_stdin_(false), - chunk_begin_(NULL), chunk_end_(NULL) { - if (!std::strcmp(fname, "stdin")) { -#ifndef XGBOOST_STRICT_CXX98_ - use_stdin_ = true; fp_ = stdin; -#endif - } - if (!use_stdin_) { - fp_ = utils::FopenCheck(fname, "rb"); - } - buffer_.resize(kBufferSize); - } - virtual ~SingleFileSplit(void) { - if (!use_stdin_) std::fclose(fp_); - } - virtual size_t Read(void *ptr, size_t size) { - return std::fread(ptr, 1, size, fp_); - } - virtual void Write(const void *ptr, size_t size) { - utils::Error("cannot do write in inputsplit"); - } - virtual void BeforeFirst(void) { - std::fseek(fp_, 0, SEEK_SET); - } - virtual bool NextRecord(Blob *out_rec) { - if (chunk_begin_ == chunk_end_) { - if (!LoadChunk()) return false; - } - char *next = FindNextRecord(chunk_begin_, - chunk_end_); - out_rec->dptr = chunk_begin_; - out_rec->size = next - chunk_begin_; - chunk_begin_ = next; - return true; - } - virtual bool NextChunk(Blob *out_chunk) { - if (chunk_begin_ == chunk_end_) { - if (!LoadChunk()) return false; - } - out_chunk->dptr = chunk_begin_; - out_chunk->size = chunk_end_ - chunk_begin_; - chunk_begin_ = chunk_end_; - return true; - } - inline bool ReadChunk(void *buf, size_t *size) { - size_t max_size = *size; - if (max_size <= overflow_.length()) { - *size = 0; return true; - } - if (overflow_.length() != 0) { - std::memcpy(buf, BeginPtr(overflow_), overflow_.length()); - } - size_t olen = overflow_.length(); - overflow_.resize(0); - size_t nread = this->Read(reinterpret_cast(buf) + olen, - max_size - olen); - nread += olen; - if (nread == 0) return false; - if (nread != max_size) { - *size = nread; - return true; - } else { - const char *bptr = reinterpret_cast(buf); - // return the last position where a record starts - const char *bend = 
this->FindLastRecordBegin(bptr, bptr + max_size); - *size = bend - bptr; - overflow_.resize(max_size - *size); - if (overflow_.length() != 0) { - std::memcpy(BeginPtr(overflow_), bend, overflow_.length()); - } - return true; - } - } - - protected: - inline const char* FindLastRecordBegin(const char *begin, - const char *end) { - if (begin == end) return begin; - for (const char *p = end - 1; p != begin; --p) { - if (*p == '\n' || *p == '\r') return p + 1; - } - return begin; - } - inline char* FindNextRecord(char *begin, char *end) { - char *p; - for (p = begin; p != end; ++p) { - if (*p == '\n' || *p == '\r') break; - } - for (; p != end; ++p) { - if (*p != '\n' && *p != '\r') return p; - } - return end; - } - inline bool LoadChunk(void) { - while (true) { - size_t size = buffer_.length(); - if (!ReadChunk(BeginPtr(buffer_), &size)) return false; - if (size == 0) { - buffer_.resize(buffer_.length() * 2); - } else { - chunk_begin_ = reinterpret_cast(BeginPtr(buffer_)); - chunk_end_ = chunk_begin_ + size; - break; - } - } - return true; - } - - private: - // buffer size - static const size_t kBufferSize = 1 << 18UL; - // file - std::FILE *fp_; - bool use_stdin_; - // internal overflow - std::string overflow_; - // internal buffer - std::string buffer_; - // beginning of chunk - char *chunk_begin_; - // end of chunk - char *chunk_end_; -}; - -class StdFile : public dmlc::Stream { - public: - explicit StdFile(std::FILE *fp, bool use_stdio) - : fp(fp), use_stdio(use_stdio) { - } - virtual ~StdFile(void) { - this->Close(); - } - virtual size_t Read(void *ptr, size_t size) { - return std::fread(ptr, 1, size, fp); - } - virtual void Write(const void *ptr, size_t size) { - Check(std::fwrite(ptr, size, 1, fp) == 1, "StdFile::Write: fwrite error!"); - } - virtual void Seek(size_t pos) { - std::fseek(fp, static_cast(pos), SEEK_SET); // NOLINT(*) - } - virtual size_t Tell(void) { - return std::ftell(fp); - } - virtual bool AtEnd(void) const { - return std::feof(fp) != 0; - } - 
inline void Close(void) { - if (fp != NULL && !use_stdio) { - std::fclose(fp); fp = NULL; - } - } - - private: - std::FILE *fp; - bool use_stdio; -}; -} // namespace utils -} // namespace xgboost - -namespace dmlc { -InputSplit* InputSplit::Create(const char *uri, - unsigned part, - unsigned nsplit, - const char *type) { - using namespace std; - using namespace xgboost; - const char *msg = "xgboost is compiled in local mode\n"\ - "to use hdfs, s3 or distributed version, compile with make dmlc=1"; - utils::Check(strncmp(uri, "s3://", 5) != 0, msg); - utils::Check(strncmp(uri, "hdfs://", 7) != 0, msg); - utils::Check(nsplit == 1, msg); - return new utils::SingleFileSplit(uri); -} - -Stream *Stream::Create(const char *fname, const char * const mode, bool allow_null) { - using namespace std; - using namespace xgboost; - const char *msg = "xgboost is compiled in local mode\n"\ - "to use hdfs, s3 or distributed version, compile with make dmlc=1"; - utils::Check(strncmp(fname, "s3://", 5) != 0, msg); - utils::Check(strncmp(fname, "hdfs://", 7) != 0, msg); - - std::FILE *fp = NULL; - bool use_stdio = false; - using namespace std; -#ifndef XGBOOST_STRICT_CXX98_ - if (!strcmp(fname, "stdin")) { - use_stdio = true; fp = stdin; - } - if (!strcmp(fname, "stdout")) { - use_stdio = true; fp = stdout; - } -#endif - if (!strncmp(fname, "file://", 7)) fname += 7; - if (!use_stdio) { - std::string flag = mode; - if (flag == "w") flag = "wb"; - if (flag == "r") flag = "rb"; - fp = fopen64(fname, flag.c_str()); - } - if (fp != NULL) { - return new utils::StdFile(fp, use_stdio); - } else { - utils::Check(allow_null, "fail to open file %s", fname); - return NULL; - } -} -} // namespace dmlc - diff --git a/old_src/io/libsvm_parser.h b/old_src/io/libsvm_parser.h deleted file mode 100644 index 43b8d6b90..000000000 --- a/old_src/io/libsvm_parser.h +++ /dev/null @@ -1,212 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file libsvm_parser.h - * \brief iterator parser to parse libsvm format - * \author Tianqi Chen - */ -#ifndef XGBOOST_IO_LIBSVM_PARSER_H_ -#define XGBOOST_IO_LIBSVM_PARSER_H_ -#define NOMINMAX -#include -#include -#include -#include -#include "../utils/omp.h" -#include "../utils/utils.h" -#include "../sync/sync.h" -#include "../utils/thread_buffer.h" -#include "./sparse_batch_page.h" - -namespace xgboost { -namespace io { -/*! \brief page returned by libsvm parser */ -struct LibSVMPage : public SparsePage { - std::vector label; - // overload clear - inline void Clear() { - SparsePage::Clear(); - label.clear(); - } -}; -/*! - * \brief libsvm parser that parses the input lines - * and returns rows in input data - * factory that was used by threadbuffer template - */ -class LibSVMPageFactory { - public: - LibSVMPageFactory() - : bytes_read_(0), at_head_(true) { - } - inline bool Init(void) { - return true; - } - inline void Setup(dmlc::InputSplit *source, - int nthread) { - source_ = source; - int maxthread; - #pragma omp parallel - { - maxthread = omp_get_num_procs(); - } - maxthread = std::max(maxthread / 2, 1); - nthread_ = std::min(maxthread, nthread); - } - inline void SetParam(const char *name, const char *val) {} - inline bool LoadNext(std::vector *data) { - return FillData(data); - } - inline void FreeSpace(std::vector *a) { - delete a; - } - inline std::vector *Create(void) { - return new std::vector(); - } - inline void BeforeFirst(void) { - utils::Assert(at_head_, "cannot call beforefirst"); - } - inline void Destroy(void) { - delete source_; - } - inline size_t bytes_read(void) const { - return bytes_read_; - } - - protected: - inline bool FillData(std::vector *data) { - dmlc::InputSplit::Blob chunk; - if (!source_->NextChunk(&chunk)) return false; - int nthread; - #pragma omp parallel num_threads(nthread_) - { - nthread = omp_get_num_threads(); - } - // reserve space for data - data->resize(nthread); - 
bytes_read_ += chunk.size; - utils::Assert(chunk.size != 0, "LibSVMParser.FileData"); - char *head = reinterpret_cast(chunk.dptr); - #pragma omp parallel num_threads(nthread_) - { - // threadid - int tid = omp_get_thread_num(); - size_t nstep = (chunk.size + nthread - 1) / nthread; - size_t sbegin = std::min(tid * nstep, chunk.size); - size_t send = std::min((tid + 1) * nstep, chunk.size); - char *pbegin = BackFindEndLine(head + sbegin, head); - char *pend; - if (tid + 1 == nthread) { - pend = head + send; - } else { - pend = BackFindEndLine(head + send, head); - } - ParseBlock(pbegin, pend, &(*data)[tid]); - } - return true; - } - /*! - * \brief parse data into out - * \param begin beginning of buffer - * \param end end of buffer - */ - inline void ParseBlock(char *begin, - char *end, - LibSVMPage *out) { - using namespace std; - out->Clear(); - char *p = begin; - while (p != end) { - while (isspace(*p) && p != end) ++p; - if (p == end) break; - char *head = p; - while (isdigit(*p) && p != end) ++p; - if (*p == ':') { - out->data.push_back(SparseBatch::Entry(atol(head), - static_cast(atof(p + 1)))); - } else { - if (out->label.size() != 0) { - out->offset.push_back(out->data.size()); - } - out->label.push_back(static_cast(atof(head))); - } - while (!isspace(*p) && p != end) ++p; - } - if (out->label.size() != 0) { - out->offset.push_back(out->data.size()); - } - utils::Check(out->label.size() + 1 == out->offset.size(), - "LibSVMParser inconsistent"); - } - /*! 
- * \brief start from bptr, go backward and find first endof line - * \param bptr end position to go backward - * \param begin the beginning position of buffer - * \return position of first endof line going backward - */ - inline char* BackFindEndLine(char *bptr, - char *begin) { - for (; bptr != begin; --bptr) { - if (*bptr == '\n' || *bptr == '\r') return bptr; - } - return begin; - } - - private: - // nthread - int nthread_; - // number of bytes readed - size_t bytes_read_; - // at beginning, at end of stream - bool at_head_; - // source split that provides the data - dmlc::InputSplit *source_; -}; - -class LibSVMParser : public utils::IIterator { - public: - explicit LibSVMParser(dmlc::InputSplit *source, - int nthread) - : at_end_(false), data_ptr_(0), data_(NULL) { - itr.SetParam("buffer_size", "2"); - itr.get_factory().Setup(source, nthread); - itr.Init(); - } - virtual void BeforeFirst(void) { - itr.BeforeFirst(); - } - virtual bool Next(void) { - if (at_end_) return false; - while (true) { - if (data_ == NULL || data_ptr_ >= data_->size()) { - if (!itr.Next(data_)) { - at_end_ = true; return false; - } else { - data_ptr_ = 0; - } - } - while (data_ptr_ < data_->size()) { - data_ptr_ += 1; - if ((*data_)[data_ptr_ - 1].Size() != 0) { - return true; - } - } - } - return true; - } - virtual const LibSVMPage &Value(void) const { - return (*data_)[data_ptr_ - 1]; - } - inline size_t bytes_read(void) const { - return itr.get_factory().bytes_read(); - } - - private: - bool at_end_; - size_t data_ptr_; - std::vector *data_; - utils::ThreadBuffer*, LibSVMPageFactory> itr; -}; - -} // namespace io -} // namespace xgboost -#endif // XGBOOST_IO_LIBSVM_PARSER_H_ diff --git a/old_src/io/simple_fmatrix-inl.hpp b/old_src/io/simple_fmatrix-inl.hpp deleted file mode 100644 index e467263fa..000000000 --- a/old_src/io/simple_fmatrix-inl.hpp +++ /dev/null @@ -1,374 +0,0 @@ -/*! 
- * Copyright 2014 by Contributors - * \file simple_fmatrix-inl.hpp - * \brief the input data structure for gradient boosting - * \author Tianqi Chen - */ -#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ -#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP_ - -#include -#include -#include -#include "../data.h" -#include "../utils/utils.h" -#include "../utils/random.h" -#include "../utils/omp.h" -#include "../learner/dmatrix.h" -#include "../utils/group_data.h" -#include "./sparse_batch_page.h" - -namespace xgboost { -namespace io { -/*! - * \brief sparse matrix that support column access, CSC - */ -class FMatrixS : public IFMatrix { - public: - typedef SparseBatch::Entry Entry; - /*! \brief constructor */ - FMatrixS(utils::IIterator *iter, - const learner::MetaInfo &info) - : info_(info) { - this->iter_ = iter; - } - // destructor - virtual ~FMatrixS(void) { - if (iter_ != NULL) delete iter_; - } - /*! \return whether column access is enabled */ - virtual bool HaveColAccess(void) const { - return col_size_.size() != 0; - } - /*! \brief get number of columns */ - virtual size_t NumCol(void) const { - utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_size_.size(); - } - /*! \brief get number of buffered rows */ - virtual const std::vector &buffered_rowset(void) const { - return buffered_rowset_; - } - /*! \brief get column size */ - virtual size_t GetColSize(size_t cidx) const { - return col_size_[cidx]; - } - /*! \brief get column density */ - virtual float GetColDensity(size_t cidx) const { - size_t nmiss = buffered_rowset_.size() - col_size_[cidx]; - return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); - } - virtual void InitColAccess(const std::vector &enabled, - float pkeep, size_t max_row_perbatch) { - if (this->HaveColAccess()) return; - this->InitColData(enabled, pkeep, max_row_perbatch); - } - /*! 
- * \brief get the row iterator associated with FMatrix - */ - virtual utils::IIterator* RowIterator(void) { - iter_->BeforeFirst(); - return iter_; - } - /*! - * \brief get the column based iterator - */ - virtual utils::IIterator* ColIterator(void) { - size_t ncol = this->NumCol(); - col_iter_.col_index_.resize(ncol); - for (size_t i = 0; i < ncol; ++i) { - col_iter_.col_index_[i] = static_cast(i); - } - col_iter_.BeforeFirst(); - return &col_iter_; - } - /*! - * \brief column based iterator - */ - virtual utils::IIterator *ColIterator(const std::vector &fset) { - size_t ncol = this->NumCol(); - col_iter_.col_index_.resize(0); - for (size_t i = 0; i < fset.size(); ++i) { - if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); - } - col_iter_.BeforeFirst(); - return &col_iter_; - } - /*! - * \brief save column access data into stream - * \param fo output stream to save to - */ - inline void SaveColAccess(utils::IStream &fo) const { // NOLINT(*) - size_t n = 0; - fo.Write(&n, sizeof(n)); - } - /*! - * \brief load column access data from stream - * \param fo output stream to load from - */ - inline void LoadColAccess(utils::IStream &fi) { // NOLINT(*) - // do nothing in load col access - } - - protected: - /*! 
- * \brief initialize column data - * \param enabled the list of enabled columns - * \param pkeep probability to keep a row - * \param max_row_perbatch maximum row per batch - */ - inline void InitColData(const std::vector &enabled, - float pkeep, size_t max_row_perbatch) { - col_iter_.Clear(); - if (info_.num_row() < max_row_perbatch) { - SparsePage *page = new SparsePage(); - this->MakeOneBatch(enabled, pkeep, page); - col_iter_.cpages_.push_back(page); - } else { - this->MakeManyBatch(enabled, pkeep, max_row_perbatch); - } - // setup col-size - col_size_.resize(info_.num_col()); - std::fill(col_size_.begin(), col_size_.end(), 0); - for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) { - SparsePage *pcol = col_iter_.cpages_[i]; - for (size_t j = 0; j < pcol->Size(); ++j) { - col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; - } - } - } - /*! - * \brief make column page from iterator - * \param pkeep probability to keep a row - * \param pcol the target column - */ - inline void MakeOneBatch(const std::vector &enabled, - float pkeep, - SparsePage *pcol) { - // clear rowset - buffered_rowset_.clear(); - // bit map - int nthread; - std::vector bmap; - #pragma omp parallel - { - nthread = omp_get_num_threads(); - } - pcol->Clear(); - utils::ParallelGroupBuilder - builder(&pcol->offset, &pcol->data); - builder.InitBudget(info_.num_col(), nthread); - // start working - iter_->BeforeFirst(); - while (iter_->Next()) { - const RowBatch &batch = iter_->Value(); - bmap.resize(bmap.size() + batch.size, true); - long batch_size = static_cast(batch.size); // NOLINT(*) - for (long i = 0; i < batch_size; ++i) { // NOLINT(*) - bst_uint ridx = static_cast(batch.base_rowid + i); - if (pkeep == 1.0f || random::SampleBinary(pkeep)) { - buffered_rowset_.push_back(ridx); - } else { - bmap[i] = false; - } - } - #pragma omp parallel for schedule(static) - for (long i = 0; i < batch_size; ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - bst_uint ridx = 
static_cast(batch.base_rowid + i); - if (bmap[ridx]) { - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - if (enabled[inst[j].index]) { - builder.AddBudget(inst[j].index, tid); - } - } - } - } - } - builder.InitStorage(); - - iter_->BeforeFirst(); - while (iter_->Next()) { - const RowBatch &batch = iter_->Value(); - #pragma omp parallel for schedule(static) - for (long i = 0; i < static_cast(batch.size); ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - bst_uint ridx = static_cast(batch.base_rowid + i); - if (bmap[ridx]) { - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - if (enabled[inst[j].index]) { - builder.Push(inst[j].index, - Entry((bst_uint)(batch.base_rowid+i), - inst[j].fvalue), tid); - } - } - } - } - } - - utils::Assert(pcol->Size() == info_.num_col(), - "inconsistent col data"); - // sort columns - bst_omp_uint ncol = static_cast(pcol->Size()); - #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) - for (bst_omp_uint i = 0; i < ncol; ++i) { - if (pcol->offset[i] < pcol->offset[i + 1]) { - std::sort(BeginPtr(pcol->data) + pcol->offset[i], - BeginPtr(pcol->data) + pcol->offset[i + 1], - SparseBatch::Entry::CmpValue); - } - } - } - - inline void MakeManyBatch(const std::vector &enabled, - float pkeep, size_t max_row_perbatch) { - size_t btop = 0; - buffered_rowset_.clear(); - // internal temp cache - SparsePage tmp; tmp.Clear(); - iter_->BeforeFirst(); - while (iter_->Next()) { - const RowBatch &batch = iter_->Value(); - for (size_t i = 0; i < batch.size; ++i) { - bst_uint ridx = static_cast(batch.base_rowid + i); - if (pkeep == 1.0f || random::SampleBinary(pkeep)) { - buffered_rowset_.push_back(ridx); - tmp.Push(batch[i]); - } - if (tmp.Size() >= max_row_perbatch) { - SparsePage *page = new SparsePage(); - this->MakeColPage(tmp.GetRowBatch(0), - BeginPtr(buffered_rowset_) + btop, - enabled, page); - col_iter_.cpages_.push_back(page); - btop = buffered_rowset_.size(); - 
tmp.Clear(); - } - } - } - if (tmp.Size() != 0) { - SparsePage *page = new SparsePage(); - this->MakeColPage(tmp.GetRowBatch(0), - BeginPtr(buffered_rowset_) + btop, - enabled, page); - col_iter_.cpages_.push_back(page); - } - } - // make column page from subset of rowbatchs - inline void MakeColPage(const RowBatch &batch, - const bst_uint *ridx, - const std::vector &enabled, - SparsePage *pcol) { - int nthread; - #pragma omp parallel - { - nthread = omp_get_num_threads(); - int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); - if (nthread > max_nthread) { - nthread = max_nthread; - } - } - pcol->Clear(); - utils::ParallelGroupBuilder - builder(&pcol->offset, &pcol->data); - builder.InitBudget(info_.num_col(), nthread); - bst_omp_uint ndata = static_cast(batch.size); - #pragma omp parallel for schedule(static) num_threads(nthread) - for (bst_omp_uint i = 0; i < ndata; ++i) { - int tid = omp_get_thread_num(); - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - const SparseBatch::Entry &e = inst[j]; - if (enabled[e.index]) { - builder.AddBudget(e.index, tid); - } - } - } - builder.InitStorage(); - #pragma omp parallel for schedule(static) num_threads(nthread) - for (bst_omp_uint i = 0; i < ndata; ++i) { - int tid = omp_get_thread_num(); - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - const SparseBatch::Entry &e = inst[j]; - builder.Push(e.index, - SparseBatch::Entry(ridx[i], e.fvalue), - tid); - } - } - utils::Assert(pcol->Size() == info_.num_col(), "inconsistent col data"); - // sort columns - bst_omp_uint ncol = static_cast(pcol->Size()); - #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) - for (bst_omp_uint i = 0; i < ncol; ++i) { - if (pcol->offset[i] < pcol->offset[i + 1]) { - std::sort(BeginPtr(pcol->data) + pcol->offset[i], - BeginPtr(pcol->data) + pcol->offset[i + 1], - SparseBatch::Entry::CmpValue); - } - } - } - - private: - // one batch iterator that return content 
in the matrix - struct ColBatchIter: utils::IIterator { - ColBatchIter(void) : data_ptr_(0) {} - virtual ~ColBatchIter(void) { - this->Clear(); - } - virtual void BeforeFirst(void) { - data_ptr_ = 0; - } - virtual bool Next(void) { - if (data_ptr_ >= cpages_.size()) return false; - data_ptr_ += 1; - SparsePage *pcol = cpages_[data_ptr_ - 1]; - batch_.size = col_index_.size(); - col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0)); - for (size_t i = 0; i < col_data_.size(); ++i) { - const bst_uint ridx = col_index_[i]; - col_data_[i] = SparseBatch::Inst - (BeginPtr(pcol->data) + pcol->offset[ridx], - static_cast(pcol->offset[ridx + 1] - pcol->offset[ridx])); - } - batch_.col_index = BeginPtr(col_index_); - batch_.col_data = BeginPtr(col_data_); - return true; - } - virtual const ColBatch &Value(void) const { - return batch_; - } - inline void Clear(void) { - for (size_t i = 0; i < cpages_.size(); ++i) { - delete cpages_[i]; - } - cpages_.clear(); - } - // data content - std::vector col_index_; - // column content - std::vector col_data_; - // column sparse pages - std::vector cpages_; - // data pointer - size_t data_ptr_; - // temporal space for batch - ColBatch batch_; - }; - // --- data structure used to support InitColAccess -- - // column iterator - ColBatchIter col_iter_; - // shared meta info with DMatrix - const learner::MetaInfo &info_; - // row iterator - utils::IIterator *iter_; - /*! \brief list of row index that are buffered */ - std::vector buffered_rowset_; - // count for column data - std::vector col_size_; -}; -} // namespace io -} // namespace xgboost -#endif // XGBOOST_IO_SLICE_FMATRIX_INL_HPP_ diff --git a/old_src/learner/dmatrix.h b/old_src/learner/dmatrix.h deleted file mode 100644 index 52828c3be..000000000 --- a/old_src/learner/dmatrix.h +++ /dev/null @@ -1,176 +0,0 @@ -/*! 
- * Copyright 2014 by Contributors - * \file dmatrix.h - * \brief meta data and template data structure - * used for regression/classification/ranking - * \author Tianqi Chen - */ -#ifndef XGBOOST_LEARNER_DMATRIX_H_ -#define XGBOOST_LEARNER_DMATRIX_H_ - -#include -#include -#include "../data.h" -#include "../utils/io.h" -namespace xgboost { -namespace learner { -/*! - * \brief meta information needed in training, including label, weight - */ -struct MetaInfo { - /*! - * \brief information needed by booster - * BoosterInfo does not implement save and load, - * all serialization is done in MetaInfo - */ - BoosterInfo info; - /*! \brief label of each instance */ - std::vector labels; - /*! - * \brief the index of begin and end of a group - * needed when the learning task is ranking - */ - std::vector group_ptr; - /*! \brief weights of each instance, optional */ - std::vector weights; - /*! - * \brief initialized margins, - * if specified, xgboost will start from this initial margin - * can be used to specify initial prediction to boost from - */ - std::vector base_margin; - /*! \brief version flag, used to check version of this info */ - static const int kVersion = 0; - // constructor - MetaInfo(void) {} - /*! \return number of rows in dataset */ - inline size_t num_row(void) const { - return info.num_row; - } - /*! \return number of columns in dataset */ - inline size_t num_col(void) const { - return info.num_col; - } - /*! \brief clear all the information */ - inline void Clear(void) { - labels.clear(); - group_ptr.clear(); - weights.clear(); - info.root_index.clear(); - base_margin.clear(); - info.num_row = info.num_col = 0; - } - /*! 
\brief get weight of each instances */ - inline float GetWeight(size_t i) const { - if (weights.size() != 0) { - return weights[i]; - } else { - return 1.0f; - } - } - inline void SaveBinary(utils::IStream &fo) const { // NOLINT(*) - int version = kVersion; - fo.Write(&version, sizeof(version)); - fo.Write(&info.num_row, sizeof(info.num_row)); - fo.Write(&info.num_col, sizeof(info.num_col)); - fo.Write(labels); - fo.Write(group_ptr); - fo.Write(weights); - fo.Write(info.root_index); - fo.Write(base_margin); - } - inline void LoadBinary(utils::IStream &fi) { // NOLINT(*) - int version; - utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format"); - utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format"); - utils::Check(fi.Read(&info.num_col, sizeof(info.num_col)) != 0, "MetaInfo: invalid format"); - utils::Check(fi.Read(&labels), "MetaInfo: invalid format"); - utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format"); - utils::Check(fi.Read(&weights), "MetaInfo: invalid format"); - utils::Check(fi.Read(&info.root_index), "MetaInfo: invalid format"); - utils::Check(fi.Read(&base_margin), "MetaInfo: invalid format"); - } - // try to load group information from file, if exists - inline bool TryLoadGroup(const char* fname, bool silent = false) { - using namespace std; - FILE *fi = fopen64(fname, "r"); - if (fi == NULL) return false; - group_ptr.push_back(0); - unsigned nline; - while (fscanf(fi, "%u", &nline) == 1) { - group_ptr.push_back(group_ptr.back()+nline); - } - if (!silent) { - utils::Printf("%u groups are loaded from %s\n", - static_cast(group_ptr.size()-1), fname); - } - fclose(fi); - return true; - } - inline std::vector& GetFloatInfo(const char *field) { - using namespace std; - if (!strcmp(field, "label")) return labels; - if (!strcmp(field, "weight")) return weights; - if (!strcmp(field, "base_margin")) return base_margin; - utils::Error("unknown field %s", field); - return labels; - } - inline 
const std::vector& GetFloatInfo(const char *field) const { - return ((MetaInfo*)this)->GetFloatInfo(field); // NOLINT(*) - } - inline std::vector &GetUIntInfo(const char *field) { - using namespace std; - if (!strcmp(field, "root_index")) return info.root_index; - if (!strcmp(field, "fold_index")) return info.fold_index; - utils::Error("unknown field %s", field); - return info.root_index; - } - inline const std::vector &GetUIntInfo(const char *field) const { - return ((MetaInfo*)this)->GetUIntInfo(field); // NOLINT(*) - } - // try to load weight information from file, if exists - inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) { - using namespace std; - std::vector &data = this->GetFloatInfo(field); - FILE *fi = fopen64(fname, "r"); - if (fi == NULL) return false; - float wt; - while (fscanf(fi, "%f", &wt) == 1) { - data.push_back(wt); - } - if (!silent) { - utils::Printf("loading %s from %s\n", field, fname); - } - fclose(fi); - return true; - } -}; - -/*! - * \brief data object used for learning, - * \tparam FMatrix type of feature data source - */ -struct DMatrix { - /*! - * \brief magic number associated with this object - * used to check if it is specific instance - */ - const int magic; - /*! \brief meta information about the dataset */ - MetaInfo info; - /*! - * \brief cache pointer to verify if the data structure is cached in some learner - * used to verify if DMatrix is cached - */ - void *cache_learner_ptr_; - /*! \brief default constructor */ - explicit DMatrix(int magic) : magic(magic), cache_learner_ptr_(NULL) {} - /*! 
\brief get feature matrix about data content */ - virtual IFMatrix *fmat(void) const = 0; - // virtual destructor - virtual ~DMatrix(void){} -}; - -} // namespace learner -} // namespace xgboost -#endif // XGBOOST_LEARNER_DMATRIX_H_ diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index 5df72dd3d..a703dcd7b 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -20,8 +20,8 @@ def find_lib_path(): """ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) # make pythonpack hack: copy this directory one level upper for setup.py - dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/'), - os.path.join(curr_path, './wrapper/')] + dll_path = [curr_path, os.path.join(curr_path, '../../lib/'), + os.path.join(curr_path, './lib/')] if os.name == 'nt': if platform.architecture()[0] == '64bit': dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/')) @@ -32,9 +32,9 @@ def find_lib_path(): # hack for pip installation when copy all parent source directory here dll_path.append(os.path.join(curr_path, './windows/Release/')) if os.name == 'nt': - dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path] + dll_path = [os.path.join(p, 'libxgboost.dll') for p in dll_path] else: - dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] + dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] #From github issues, most of installation errors come from machines w/o compilers if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False): diff --git a/rabit b/rabit index bed63208a..05b958c17 160000 --- a/rabit +++ b/rabit @@ -1 +1 @@ -Subproject commit bed63208af736c4aa289b629fbe5396bd9f513d9 +Subproject commit 05b958c178b16d707ff16b4b05506be124087e13 diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc new file mode 100644 index 000000000..de7e0b425 --- 
/dev/null +++ b/src/c_api/c_api.cc @@ -0,0 +1,528 @@ +// Copyright (c) 2014 by Contributors + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "./c_api_error.h" +#include "../data/simple_csr_source.h" +#include "../common/thread_local.h" +#include "../common/math.h" +#include "../common/io.h" +#include "../common/group_data.h" + +namespace xgboost { + +// booster wrapper for backward compatible reason. +class Booster { + public: + explicit Booster(const std::vector& cache_mats) + : configured_(false), + initialized_(false), + learner_(Learner::Create(cache_mats)) {} + + inline Learner* learner() { + return learner_.get(); + } + + inline void SetParam(const std::string& name, const std::string& val) { + cfg_.push_back(std::make_pair(name, val)); + if (configured_) { + learner_->Configure(cfg_); + } + } + + inline void LazyInit() { + if (!configured_) { + learner_->Configure(cfg_); + configured_ = true; + } + if (!initialized_) { + learner_->InitModel(); + initialized_ = true; + } + } + + inline void LoadModel(dmlc::Stream* fi) { + learner_->Load(fi); + initialized_ = true; + } + + public: + bool configured_; + bool initialized_; + std::unique_ptr learner_; + std::vector > cfg_; +}; +} // namespace xgboost + +using namespace xgboost; // NOLINT(*); + +/*! \brief entry to to easily hold returning information */ +struct XGBAPIThreadLocalEntry { + /*! \brief result holder for returning string */ + std::string ret_str; + /*! \brief result holder for returning strings */ + std::vector ret_vec_str; + /*! \brief result holder for returning string pointers */ + std::vector ret_vec_charp; + /*! \brief returning float vector. */ + std::vector ret_vec_float; + /*! \brief temp variable of gradient pairs. */ + std::vector tmp_gpair; +}; + +// define the threadlocal store. 
+typedef xgboost::common::ThreadLocalStore XGBAPIThreadLocalStore; + +int XGDMatrixCreateFromFile(const char *fname, + int silent, + DMatrixHandle *out) { + API_BEGIN(); + *out = DMatrix::Load( + fname, silent != 0, false); + API_END(); +} + +int XGDMatrixCreateFromCSR(const bst_ulong* indptr, + const unsigned *indices, + const float* data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle* out) { + std::unique_ptr source(new data::SimpleCSRSource()); + + API_BEGIN(); + data::SimpleCSRSource& mat = *source; + mat.row_ptr_.resize(nindptr); + for (bst_ulong i = 0; i < nindptr; ++i) { + mat.row_ptr_[i] = static_cast(indptr[i]); + } + mat.row_data_.resize(nelem); + for (bst_ulong i = 0; i < nelem; ++i) { + mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); + mat.info.num_col = std::max(mat.info.num_col, + static_cast(indices[i] + 1)); + } + mat.info.num_row = nindptr - 1; + mat.info.num_nonzero = static_cast(nelem); + *out = DMatrix::Create(std::move(source)); + API_END(); +} + +int XGDMatrixCreateFromCSC(const bst_ulong* col_ptr, + const unsigned* indices, + const float* data, + bst_ulong nindptr, + bst_ulong nelem, + DMatrixHandle* out) { + std::unique_ptr source(new data::SimpleCSRSource()); + + API_BEGIN(); + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + data::SimpleCSRSource& mat = *source; + common::ParallelGroupBuilder builder(&mat.row_ptr_, &mat.row_data_); + builder.InitBudget(0, nthread); + long ncol = static_cast(nindptr - 1); // NOLINT(*) + #pragma omp parallel for schedule(static) + for (long i = 0; i < ncol; ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.AddBudget(indices[j], tid); + } + } + builder.InitStorage(); + #pragma omp parallel for schedule(static) + for (long i = 0; i < ncol; ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.Push(indices[j], + 
RowBatch::Entry(static_cast(i), data[j]), + tid); + } +} + mat.info.num_row = mat.row_ptr_.size() - 1; + mat.info.num_col = static_cast(ncol); + mat.info.num_nonzero = nelem; + *out = DMatrix::Create(std::move(source)); + API_END(); +} + +int XGDMatrixCreateFromMat(const float* data, + bst_ulong nrow, + bst_ulong ncol, + float missing, + DMatrixHandle* out) { + std::unique_ptr source(new data::SimpleCSRSource()); + + API_BEGIN(); + data::SimpleCSRSource& mat = *source; + bool nan_missing = common::CheckNAN(missing); + mat.info.num_row = nrow; + mat.info.num_col = ncol; + for (bst_ulong i = 0; i < nrow; ++i, data += ncol) { + bst_ulong nelem = 0; + for (bst_ulong j = 0; j < ncol; ++j) { + if (common::CheckNAN(data[j])) { + CHECK(nan_missing) + << "There are NAN in the matrix, however, you did not set missing=NAN"; + } else { + if (nan_missing || data[j] != missing) { + mat.row_data_.push_back(RowBatch::Entry(j, data[j])); + ++nelem; + } + } + } + mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem); + } + mat.info.num_nonzero = mat.row_data_.size(); + *out = DMatrix::Create(std::move(source)); + API_END(); +} + +int XGDMatrixSliceDMatrix(DMatrixHandle handle, + const int* idxset, + bst_ulong len, + DMatrixHandle* out) { + std::unique_ptr source(new data::SimpleCSRSource()); + + API_BEGIN(); + data::SimpleCSRSource src; + src.CopyFrom(static_cast(handle)); + data::SimpleCSRSource& ret = *source; + + CHECK_EQ(src.info.group_ptr.size(), 0) + << "slice does not support group structure"; + + ret.Clear(); + ret.info.num_row = len; + ret.info.num_col = src.info.num_col; + + dmlc::DataIter* iter = &src; + iter->BeforeFirst(); + CHECK(iter->Next()); + + const RowBatch& batch = iter->Value(); + for (bst_ulong i = 0; i < len; ++i) { + const int ridx = idxset[i]; + RowBatch::Inst inst = batch[ridx]; + CHECK_LT(static_cast(ridx), batch.size); + ret.row_data_.resize(ret.row_data_.size() + inst.length); + std::memcpy(dmlc::BeginPtr(ret.row_data_) + ret.row_ptr_.back(), inst.data, + 
sizeof(RowBatch::Entry) * inst.length); + ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); + ret.info.num_nonzero += inst.length; + + if (src.info.labels.size() != 0) { + ret.info.labels.push_back(src.info.labels[ridx]); + } + if (src.info.weights.size() != 0) { + ret.info.weights.push_back(src.info.weights[ridx]); + } + if (src.info.root_index.size() != 0) { + ret.info.root_index.push_back(src.info.root_index[ridx]); + } + } + *out = DMatrix::Create(std::move(source)); + API_END(); +} + +int XGDMatrixFree(DMatrixHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + +int XGDMatrixSaveBinary(DMatrixHandle handle, + const char* fname, + int silent) { + API_BEGIN(); + static_cast(handle)->SaveToLocalFile(fname); + API_END(); +} + +int XGDMatrixSetFloatInfo(DMatrixHandle handle, + const char* field, + const float* info, + bst_ulong len) { + API_BEGIN(); + static_cast(handle)->info().SetInfo(field, info, kFloat32, len); + API_END(); +} + +int XGDMatrixSetUIntInfo(DMatrixHandle handle, + const char* field, + const unsigned* info, + bst_ulong len) { + API_BEGIN(); + static_cast(handle)->info().SetInfo(field, info, kUInt32, len); + API_END(); +} + +int XGDMatrixSetGroup(DMatrixHandle handle, + const unsigned* group, + bst_ulong len) { + API_BEGIN(); + DMatrix *pmat = static_cast(handle); + MetaInfo& info = pmat->info(); + info.group_ptr.resize(len + 1); + info.group_ptr[0] = 0; + for (uint64_t i = 0; i < len; ++i) { + info.group_ptr[i + 1] = info.group_ptr[i] + group[i]; + } + API_END(); +} + +int XGDMatrixGetFloatInfo(const DMatrixHandle handle, + const char* field, + bst_ulong* out_len, + const float** out_dptr) { + API_BEGIN(); + const MetaInfo& info = static_cast(handle)->info(); + const std::vector* vec = nullptr; + if (!std::strcmp(field, "label")) { + vec = &info.labels; + } else if (!std::strcmp(field, "weight")) { + vec = &info.weights; + } else if (!std::strcmp(field, "base_margin")) { + vec = &info.base_margin; + } else { + 
LOG(FATAL) << "Unknown float field name " << field; + } + *out_len = static_cast(vec->size()); + *out_dptr = dmlc::BeginPtr(*vec); + API_END(); +} + +int XGDMatrixGetUIntInfo(const DMatrixHandle handle, + const char *field, + bst_ulong *out_len, + const unsigned **out_dptr) { + API_BEGIN(); + const MetaInfo& info = static_cast(handle)->info(); + const std::vector* vec = nullptr; + if (!std::strcmp(field, "root_index")) { + vec = &info.root_index; + } else { + LOG(FATAL) << "Unknown uint field name " << field; + } + *out_len = static_cast(vec->size()); + *out_dptr = dmlc::BeginPtr(*vec); + API_END(); +} + +int XGDMatrixNumRow(const DMatrixHandle handle, + bst_ulong *out) { + API_BEGIN(); + *out = static_cast(static_cast(handle)->info().num_row); + API_END(); +} + +int XGDMatrixNumCol(const DMatrixHandle handle, + bst_ulong *out) { + API_BEGIN(); + *out = static_cast(static_cast(handle)->info().num_col); + API_END(); +} + +// xgboost implementation +int XGBoosterCreate(DMatrixHandle dmats[], + bst_ulong len, + BoosterHandle *out) { + API_BEGIN(); + std::vector mats; + for (bst_ulong i = 0; i < len; ++i) { + mats.push_back(static_cast(dmats[i])); + } + *out = new Booster(mats); + API_END(); +} + +int XGBoosterFree(BoosterHandle handle) { + API_BEGIN(); + delete static_cast(handle); + API_END(); +} + +int XGBoosterSetParam(BoosterHandle handle, + const char *name, + const char *value) { + API_BEGIN(); + static_cast(handle)->SetParam(name, value); + API_END(); +} + +int XGBoosterUpdateOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dtrain) { + API_BEGIN(); + Booster* bst = static_cast(handle); + DMatrix *dtr = static_cast(dtrain); + + bst->LazyInit(); + bst->learner()->UpdateOneIter(iter, dtr); + API_END(); +} + +int XGBoosterBoostOneIter(BoosterHandle handle, + DMatrixHandle dtrain, + float *grad, + float *hess, + bst_ulong len) { + std::vector& tmp_gpair = XGBAPIThreadLocalStore::Get()->tmp_gpair; + API_BEGIN(); + Booster* bst = static_cast(handle); + 
DMatrix* dtr = static_cast(dtrain); + tmp_gpair.resize(len); + for (bst_ulong i = 0; i < len; ++i) { + tmp_gpair[i] = bst_gpair(grad[i], hess[i]); + } + + bst->LazyInit(); + bst->learner()->BoostOneIter(0, dtr, &tmp_gpair); + API_END(); +} + +int XGBoosterEvalOneIter(BoosterHandle handle, + int iter, + DMatrixHandle dmats[], + const char* evnames[], + bst_ulong len, + const char** out_str) { + std::string& eval_str = XGBAPIThreadLocalStore::Get()->ret_str; + API_BEGIN(); + Booster* bst = static_cast(handle); + std::vector data_sets; + std::vector data_names; + + for (bst_ulong i = 0; i < len; ++i) { + data_sets.push_back(static_cast(dmats[i])); + data_names.push_back(std::string(evnames[i])); + } + + bst->LazyInit(); + eval_str = bst->learner()->EvalOneIter(iter, data_sets, data_names); + *out_str = eval_str.c_str(); + API_END(); +} + +int XGBoosterPredict(BoosterHandle handle, + DMatrixHandle dmat, + int option_mask, + unsigned ntree_limit, + bst_ulong *len, + const float **out_result) { + std::vector& preds = XGBAPIThreadLocalStore::Get()->ret_vec_float; + API_BEGIN(); + Booster *bst = static_cast(handle); + bst->LazyInit(); + bst->learner()->Predict( + static_cast(dmat), + (option_mask & 1) != 0, + &preds, ntree_limit, + (option_mask & 2) != 0); + *out_result = dmlc::BeginPtr(preds); + *len = static_cast(preds.size()); + API_END(); +} + +int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { + API_BEGIN(); + std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); + static_cast(handle)->LoadModel(fi.get()); + API_END(); +} + +int XGBoosterSaveModel(BoosterHandle handle, const char* fname) { + API_BEGIN(); + std::unique_ptr fo(dmlc::Stream::Create(fname, "w")); + Booster *bst = static_cast(handle); + bst->LazyInit(); + bst->learner()->Save(fo.get()); + API_END(); +} + +int XGBoosterLoadModelFromBuffer(BoosterHandle handle, + const void* buf, + bst_ulong len) { + API_BEGIN(); + common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*) + 
static_cast(handle)->LoadModel(&fs); + API_END(); +} + +int XGBoosterGetModelRaw(BoosterHandle handle, + bst_ulong* out_len, + const char** out_dptr) { + std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str; + raw_str.resize(0); + + API_BEGIN(); + common::MemoryBufferStream fo(&raw_str); + Booster *bst = static_cast(handle); + bst->LazyInit(); + bst->learner()->Save(&fo); + *out_dptr = dmlc::BeginPtr(raw_str); + *out_len = static_cast(raw_str.length()); + API_END(); +} + +inline void XGBoostDumpModelImpl( + BoosterHandle handle, + const FeatureMap& fmap, + int with_stats, + bst_ulong* len, + const char*** out_models) { + std::vector& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str; + std::vector& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp; + Booster *bst = static_cast(handle); + bst->LazyInit(); + str_vecs = bst->learner()->Dump2Text(fmap, with_stats != 0); + charp_vecs.resize(str_vecs.size()); + for (size_t i = 0; i < str_vecs.size(); ++i) { + charp_vecs[i] = str_vecs[i].c_str(); + } + *out_models = dmlc::BeginPtr(charp_vecs); + *len = static_cast(charp_vecs.size()); +} +int XGBoosterDumpModel(BoosterHandle handle, + const char* fmap, + int with_stats, + bst_ulong* len, + const char*** out_models) { + API_BEGIN(); + FeatureMap featmap; + if (strlen(fmap) != 0) { + std::unique_ptr fs( + dmlc::Stream::Create(fmap, "r")); + dmlc::istream is(fs.get()); + featmap.LoadText(is); + } + XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models); + API_END(); +} + +int XGBoosterDumpModelWithFeatures(BoosterHandle handle, + int fnum, + const char** fname, + const char** ftype, + int with_stats, + bst_ulong* len, + const char*** out_models) { + API_BEGIN(); + FeatureMap featmap; + for (int i = 0; i < fnum; ++i) { + featmap.PushBack(i, fname[i], ftype[i]); + } + XGBoostDumpModelImpl(handle, featmap, with_stats, len, out_models); + API_END(); +} diff --git a/src/c_api/c_api_error.cc b/src/c_api/c_api_error.cc new file mode 100644 index 
000000000..e1949e560 --- /dev/null +++ b/src/c_api/c_api_error.cc @@ -0,0 +1,21 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_api_error.cc + * \brief C error handling + */ +#include "./c_api_error.h" +#include "../common/thread_local.h" + +struct XGBAPIErrorEntry { + std::string last_error; +}; + +typedef xgboost::common::ThreadLocalStore XGBAPIErrorStore; + +const char *XGBGetLastError() { + return XGBAPIErrorStore::Get()->last_error.c_str(); +} + +void XGBAPISetLastError(const char* msg) { + XGBAPIErrorStore::Get()->last_error = msg; +} diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h new file mode 100644 index 000000000..4bb631ecd --- /dev/null +++ b/src/c_api/c_api_error.h @@ -0,0 +1,39 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file c_api_error.h + * \brief Error handling for C API. + */ +#ifndef XGBOOST_C_API_C_API_ERROR_H_ +#define XGBOOST_C_API_C_API_ERROR_H_ + +#include +#include +#include + +/*! \brief macro to guard beginning and end section of all functions */ +#define API_BEGIN() try { +/*! \brief every function starts with API_BEGIN(); + and finishes with API_END() or API_END_HANDLE_ERROR */ +#define API_END() } catch(dmlc::Error &_except_) { return XGBAPIHandleException(_except_); } return 0; // NOLINT(*) +/*! + * \brief every function starts with API_BEGIN(); + * and finishes with API_END() or API_END_HANDLE_ERROR + * The finally clause contains procedure to cleanup states when an error happens. + */ +#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; return XGBAPIHandleException(_except_); } return 0; // NOLINT(*) + +/*! + * \brief Set the last error message needed by C API + * \param msg The error message to set. + */ +void XGBAPISetLastError(const char* msg); +/*! 
+ * \brief handle exception throwed out + * \param e the exception + * \return the return value of API after exception is handled + */ +inline int XGBAPIHandleException(const dmlc::Error &e) { + XGBAPISetLastError(e.what()); + return -1; +} +#endif // XGBOOST_C_API_C_API_ERROR_H_ diff --git a/src/cli_main.cc b/src/cli_main.cc index 7ae514580..a08e3fd6d 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -11,8 +11,9 @@ #include #include -#include +#include #include +#include #include #include #include @@ -107,6 +108,8 @@ struct CLIParam : public dmlc::Parameter { .describe("Data split mode."); DMLC_DECLARE_FIELD(ntree_limit).set_default(0).set_lower_bound(0) .describe("Number of trees used for prediction, 0 means use all trees."); + DMLC_DECLARE_FIELD(pred_margin).set_default(false) + .describe("Whether to predict margin value instead of probability."); DMLC_DECLARE_FIELD(dump_stats).set_default(false) .describe("Whether dump the model statistics."); DMLC_DECLARE_FIELD(name_fmap).set_default("NULL") @@ -115,7 +118,8 @@ struct CLIParam : public dmlc::Parameter { .describe("Name of the output dump text file."); // alias DMLC_DECLARE_ALIAS(train_path, data); - DMLC_DECLARE_ALIAS(test_path, "test:data"); + DMLC_DECLARE_ALIAS(test_path, test:data); + DMLC_DECLARE_ALIAS(name_fmap, fmap); } // customized configure function of CLIParam inline void Configure(const std::vector >& cfg) { @@ -149,7 +153,7 @@ DMLC_REGISTER_PARAMETER(CLIParam); void CLITrain(const CLIParam& param) { if (rabit::IsDistributed()) { std::string pname = rabit::GetProcessorName(); - LOG(INFO) << "start " << pname << ":" << rabit::GetRank(); + LOG(CONSOLE) << "start " << pname << ":" << rabit::GetRank(); } // load in data. std::unique_ptr dtrain( @@ -178,6 +182,8 @@ void CLITrain(const CLIParam& param) { std::unique_ptr fi( dmlc::Stream::Create(param.model_in.c_str(), "r")); learner->Load(fi.get()); + } else { + learner->InitModel(); } } // start training. 
@@ -186,7 +192,7 @@ void CLITrain(const CLIParam& param) { double elapsed = dmlc::GetTime() - start; if (version % 2 == 0) { if (param.silent == 0) { - LOG(INFO) << "boosting round " << i << ", " << elapsed << " sec elapsed"; + LOG(CONSOLE) << "boosting round " << i << ", " << elapsed << " sec elapsed"; } learner->UpdateOneIter(i, dtrain.get()); if (learner->AllowLazyCheckPoint()) { @@ -200,16 +206,18 @@ void CLITrain(const CLIParam& param) { std::string res = learner->EvalOneIter(i, eval_datasets, eval_data_names); if (rabit::IsDistributed()) { if (rabit::GetRank() == 0) { - rabit::TrackerPrint(res + "\n"); + LOG(TRACKER) << res; } } else { if (param.silent < 2) { - LOG(INFO) << res; + LOG(CONSOLE) << res; } } if (param.save_period != 0 && (i + 1) % param.save_period == 0) { std::ostringstream os; - os << param.model_dir << '/' << i + 1 << ".model"; + os << param.model_dir << '/' + << std::setfill('0') << std::setw(4) + << i + 1 << ".model"; std::unique_ptr fo( dmlc::Stream::Create(os.str().c_str(), "w")); learner->Save(fo.get()); @@ -228,7 +236,9 @@ void CLITrain(const CLIParam& param) { param.model_out != "NONE") { std::ostringstream os; if (param.model_out == "NULL") { - os << param.model_dir << '/' << param.num_round << ".model"; + os << param.model_dir << '/' + << std::setfill('0') << std::setw(4) + << param.num_round << ".model"; } else { os << param.model_out; } @@ -239,7 +249,7 @@ void CLITrain(const CLIParam& param) { if (param.silent == 0) { double elapsed = dmlc::GetTime() - start; - LOG(INFO) << "update end, " << elapsed << " sec in all"; + LOG(CONSOLE) << "update end, " << elapsed << " sec in all"; } } @@ -272,6 +282,8 @@ void CLIDump2Text(const CLIParam& param) { } void CLIPredict(const CLIParam& param) { + CHECK_NE(param.test_path, "NULL") + << "Test dataset parameter test:data must be specified."; // load data std::unique_ptr dtest( DMatrix::Load(param.test_path, param.silent != 0, param.dsplit == 2)); @@ -284,12 +296,12 @@ void CLIPredict(const 
CLIParam& param) { learner->Load(fi.get()); if (param.silent == 0) { - LOG(INFO) << "start prediction..."; + LOG(CONSOLE) << "start prediction..."; } std::vector preds; learner->Predict(dtest.get(), param.pred_margin, &preds, param.ntree_limit); if (param.silent == 0) { - LOG(INFO) << "writing prediction to " << param.name_pred; + LOG(CONSOLE) << "writing prediction to " << param.name_pred; } std::unique_ptr fo( dmlc::Stream::Create(param.name_pred.c_str(), "w")); diff --git a/src/common/base64.h b/src/common/base64.h index 0eb992ffe..4c876b5f8 100644 --- a/src/common/base64.h +++ b/src/common/base64.h @@ -8,7 +8,7 @@ #ifndef XGBOOST_COMMON_BASE64_H_ #define XGBOOST_COMMON_BASE64_H_ -#include +#include #include #include #include diff --git a/src/common/common.cc b/src/common/common.cc new file mode 100644 index 000000000..398f12186 --- /dev/null +++ b/src/common/common.cc @@ -0,0 +1,15 @@ +/*! + * Copyright 2015 by Contributors + * \file common.cc + * \brief Enable all kinds of global variables in common. + */ +#include "./random.h" + +namespace xgboost { +namespace common { +RandomEngine& GlobalRandom() { + static RandomEngine inst; + return inst; +} +} +} // namespace xgboost diff --git a/src/common/quantile.h b/src/common/quantile.h index 1b853207b..824689e3a 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -8,7 +8,7 @@ #define XGBOOST_COMMON_QUANTILE_H_ #include -#include +#include #include #include #include diff --git a/src/common/thread_local.h b/src/common/thread_local.h new file mode 100644 index 000000000..35301b66e --- /dev/null +++ b/src/common/thread_local.h @@ -0,0 +1,77 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file thread_local.h + * \brief Common utility for thread local storage. 
+ */ +#ifndef XGBOOST_COMMON_THREAD_LOCAL_H_ +#define XGBOOST_COMMON_THREAD_LOCAL_H_ + +#include +#include +#include + +namespace xgboost { +namespace common { + +// macro hanlding for threadlocal variables +#ifdef __GNUC__ + #define MX_TREAD_LOCAL __thread +#elif __STDC_VERSION__ >= 201112L + #define MX_TREAD_LOCAL _Thread_local +#elif defined(_MSC_VER) + #define MX_TREAD_LOCAL __declspec(thread) +#endif + +#ifndef MX_TREAD_LOCAL +#message("Warning: Threadlocal is not enabled"); +#endif + +/*! + * \brief A threadlocal store to store threadlocal variables. + * Will return a thread local singleton of type T + * \tparam T the type we like to store + */ +template +class ThreadLocalStore { + public: + /*! \return get a thread local singleton */ + static T* Get() { + static MX_TREAD_LOCAL T* ptr = nullptr; + if (ptr == nullptr) { + ptr = new T(); + Singleton()->RegisterDelete(ptr); + } + return ptr; + } + + private: + /*! \brief constructor */ + ThreadLocalStore() {} + /*! \brief destructor */ + ~ThreadLocalStore() { + for (size_t i = 0; i < data_.size(); ++i) { + delete data_[i]; + } + } + /*! \return singleton of the store */ + static ThreadLocalStore *Singleton() { + static ThreadLocalStore inst; + return &inst; + } + /*! + * \brief register str for internal deletion + * \param str the string pointer + */ + void RegisterDelete(T *str) { + std::unique_lock lock(mutex_); + data_.push_back(str); + lock.unlock(); + } + /*! 
\brief internal mutex */ + std::mutex mutex_; + /*!\brief internal data */ + std::vector data_; +}; +} // namespace common +} // namespace xgboost +#endif // XGBOOST_COMMON_THREAD_LOCAL_H_ diff --git a/src/data/data.cc b/src/data/data.cc index 11bef8a9b..4f6f2a878 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -3,7 +3,12 @@ * \file data.cc */ #include +#include #include +#include "./sparse_batch_page.h" +#include "./simple_dmatrix.h" +#include "./simple_csr_source.h" +#include "../common/io.h" namespace xgboost { // implementation of inline functions @@ -83,4 +88,83 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t } } + +DMatrix* DMatrix::Load(const std::string& uri, + bool silent, + bool load_row_split, + const std::string& file_format) { + std::string fname, cache_file; + size_t dlm_pos = uri.find('#'); + if (dlm_pos != std::string::npos) { + cache_file = uri.substr(dlm_pos + 1, uri.length()); + fname = uri.substr(0, dlm_pos); + CHECK_EQ(cache_file.find('#'), std::string::npos) + << "Only one `#` is allowed in file path for cache file specification."; + if (load_row_split) { + std::ostringstream os; + os << cache_file << ".r" << rabit::GetRank(); + cache_file = os.str(); + } + } else { + fname = uri; + } + int partid = 0, npart = 1; + if (load_row_split) { + partid = rabit::GetRank(); + npart = rabit::GetWorldSize(); + } + + // legacy handling of binary data loading + if (file_format == "auto" && !load_row_split) { + int magic; + std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); + common::PeekableInStream is(fi.get()); + if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) && + magic == data::SimpleCSRSource::kMagic) { + std::unique_ptr source(new data::SimpleCSRSource()); + source->LoadBinary(&is); + DMatrix* dmat = DMatrix::Create(std::move(source), cache_file); + if (!silent) { + LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with " + << dmat->info().num_nonzero << " 
entries loaded from " << uri; + } + return dmat; + } + } + + std::string ftype = file_format; + if (file_format == "auto") ftype = "libsvm"; + std::unique_ptr > parser( + dmlc::Parser::Create(fname.c_str(), partid, npart, ftype.c_str())); + DMatrix* dmat = DMatrix::Create(parser.get(), cache_file); + if (!silent) { + LOG(CONSOLE) << dmat->info().num_row << 'x' << dmat->info().num_col << " matrix with " + << dmat->info().num_nonzero << " entries loaded from " << uri; + } + return dmat; +} + +DMatrix* DMatrix::Create(dmlc::Parser* parser, + const std::string& cache_prefix) { + if (cache_prefix.length() == 0) { + std::unique_ptr source(new data::SimpleCSRSource()); + source->CopyFrom(parser); + return DMatrix::Create(std::move(source), cache_prefix); + } else { + LOG(FATAL) << "external memory not yet implemented"; + return nullptr; + } +} + +void DMatrix::SaveToLocalFile(const std::string& fname) { + data::SimpleCSRSource source; + source.CopyFrom(this); + std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); + source.SaveBinary(fo.get()); +} + +DMatrix* DMatrix::Create(std::unique_ptr&& source, + const std::string& cache_prefix) { + return new data::SimpleDMatrix(std::move(source)); +} } // namespace xgboost diff --git a/src/data/simple_csr_source.cc b/src/data/simple_csr_source.cc index daad2c21d..0654853da 100644 --- a/src/data/simple_csr_source.cc +++ b/src/data/simple_csr_source.cc @@ -3,7 +3,7 @@ * \file simple_csr_source.cc */ #include -#include +#include #include "./simple_csr_source.h" namespace xgboost { @@ -80,7 +80,7 @@ void SimpleCSRSource::SaveBinary(dmlc::Stream* fo) const { } void SimpleCSRSource::BeforeFirst() { - at_first_ = false; + at_first_ = true; } bool SimpleCSRSource::Next() { diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc new file mode 100644 index 000000000..69700f45b --- /dev/null +++ b/src/data/simple_dmatrix.cc @@ -0,0 +1,265 @@ +/*! 
+ * Copyright 2014 by Contributors + * \file simple_dmatrix.cc + * \brief the input data structure for gradient boosting + * \author Tianqi Chen + */ +#include +#include +#include +#include +#include "./simple_dmatrix.h" +#include "../common/random.h" +#include "../common/group_data.h" + +namespace xgboost { +namespace data { + +bool SimpleDMatrix::ColBatchIter::Next() { + if (data_ptr_ >= cpages_.size()) return false; + data_ptr_ += 1; + SparsePage* pcol = cpages_[data_ptr_ - 1].get(); + batch_.size = col_index_.size(); + col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0)); + for (size_t i = 0; i < col_data_.size(); ++i) { + const bst_uint ridx = col_index_[i]; + col_data_[i] = SparseBatch::Inst + (dmlc::BeginPtr(pcol->data) + pcol->offset[ridx], + static_cast(pcol->offset[ridx + 1] - pcol->offset[ridx])); + } + batch_.col_index = dmlc::BeginPtr(col_index_); + batch_.col_data = dmlc::BeginPtr(col_data_); + return true; +} + +dmlc::DataIter* SimpleDMatrix::ColIterator() { + size_t ncol = this->info().num_col; + col_iter_.col_index_.resize(ncol); + for (size_t i = 0; i < ncol; ++i) { + col_iter_.col_index_[i] = static_cast(i); + } + col_iter_.BeforeFirst(); + return &col_iter_; +} + +dmlc::DataIter* SimpleDMatrix::ColIterator(const std::vector&fset) { + size_t ncol = this->info().num_col; + col_iter_.col_index_.resize(0); + for (size_t i = 0; i < fset.size(); ++i) { + if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); + } + col_iter_.BeforeFirst(); + return &col_iter_; +} + +void SimpleDMatrix::InitColAccess(const std::vector &enabled, + float pkeep, + size_t max_row_perbatch) { + if (this->HaveColAccess()) return; + + col_iter_.cpages_.clear(); + if (info().num_row < max_row_perbatch) { + std::unique_ptr page(new SparsePage()); + this->MakeOneBatch(enabled, pkeep, page.get()); + col_iter_.cpages_.push_back(std::move(page)); + } else { + this->MakeManyBatch(enabled, pkeep, max_row_perbatch); + } + // setup col-size + 
col_size_.resize(info().num_col); + std::fill(col_size_.begin(), col_size_.end(), 0); + for (size_t i = 0; i < col_iter_.cpages_.size(); ++i) { + SparsePage *pcol = col_iter_.cpages_[i].get(); + for (size_t j = 0; j < pcol->Size(); ++j) { + col_size_[j] += pcol->offset[j + 1] - pcol->offset[j]; + } + } +} + +// internal function to make one batch from row iter. +void SimpleDMatrix::MakeOneBatch(const std::vector& enabled, + float pkeep, + SparsePage *pcol) { + // clear rowset + buffered_rowset_.clear(); + // bit map + int nthread; + std::vector bmap; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + + pcol->Clear(); + common::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); + builder.InitBudget(info().num_col, nthread); + // start working + dmlc::DataIter* iter = this->RowIterator(); + iter->BeforeFirst(); + while (iter->Next()) { + const RowBatch& batch = iter->Value(); + bmap.resize(bmap.size() + batch.size, true); + std::bernoulli_distribution coin_flip(pkeep); + auto& rnd = common::GlobalRandom(); + + long batch_size = static_cast(batch.size); // NOLINT(*) + for (long i = 0; i < batch_size; ++i) { // NOLINT(*) + bst_uint ridx = static_cast(batch.base_rowid + i); + if (pkeep == 1.0f || coin_flip(rnd)) { + buffered_rowset_.push_back(ridx); + } else { + bmap[i] = false; + } + } + #pragma omp parallel for schedule(static) + for (long i = 0; i < batch_size; ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + bst_uint ridx = static_cast(batch.base_rowid + i); + if (bmap[ridx]) { + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + if (enabled[inst[j].index]) { + builder.AddBudget(inst[j].index, tid); + } + } + } + } + } + builder.InitStorage(); + + iter->BeforeFirst(); + while (iter->Next()) { + const RowBatch& batch = iter->Value(); + #pragma omp parallel for schedule(static) + for (long i = 0; i < static_cast(batch.size); ++i) { // NOLINT(*) + int tid = omp_get_thread_num(); + bst_uint ridx = 
static_cast(batch.base_rowid + i); + if (bmap[ridx]) { + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + if (enabled[inst[j].index]) { + builder.Push(inst[j].index, + SparseBatch::Entry((bst_uint)(batch.base_rowid+i), + inst[j].fvalue), tid); + } + } + } + } + } + + CHECK_EQ(pcol->Size(), info().num_col); + // sort columns + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) + for (bst_omp_uint i = 0; i < ncol; ++i) { + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i], + dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); + } + } +} + +void SimpleDMatrix::MakeManyBatch(const std::vector& enabled, + float pkeep, + size_t max_row_perbatch) { + size_t btop = 0; + std::bernoulli_distribution coin_flip(pkeep); + auto& rnd = common::GlobalRandom(); + buffered_rowset_.clear(); + // internal temp cache + SparsePage tmp; tmp.Clear(); + // start working + dmlc::DataIter* iter = this->RowIterator(); + iter->BeforeFirst(); + + while (iter->Next()) { + const RowBatch &batch = iter->Value(); + for (size_t i = 0; i < batch.size; ++i) { + bst_uint ridx = static_cast(batch.base_rowid + i); + if (pkeep == 1.0f || coin_flip(rnd)) { + buffered_rowset_.push_back(ridx); + tmp.Push(batch[i]); + } + if (tmp.Size() >= max_row_perbatch) { + std::unique_ptr page(new SparsePage()); + this->MakeColPage(tmp.GetRowBatch(0), + dmlc::BeginPtr(buffered_rowset_) + btop, + enabled, page.get()); + col_iter_.cpages_.push_back(std::move(page)); + btop = buffered_rowset_.size(); + tmp.Clear(); + } + } + } + + if (tmp.Size() != 0) { + std::unique_ptr page(new SparsePage()); + this->MakeColPage(tmp.GetRowBatch(0), + dmlc::BeginPtr(buffered_rowset_) + btop, + enabled, page.get()); + col_iter_.cpages_.push_back(std::move(page)); + } +} + +// make column page from subset of rowbatchs +void SimpleDMatrix::MakeColPage(const 
RowBatch& batch, + const bst_uint* ridx, + const std::vector& enabled, + SparsePage* pcol) { + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + int max_nthread = std::max(omp_get_num_procs() / 2 - 2, 1); + if (nthread > max_nthread) { + nthread = max_nthread; + } + } + pcol->Clear(); + common::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); + builder.InitBudget(info().num_col, nthread); + bst_omp_uint ndata = static_cast(batch.size); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + if (enabled[e.index]) { + builder.AddBudget(e.index, tid); + } + } + } + builder.InitStorage(); + #pragma omp parallel for schedule(static) num_threads(nthread) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + RowBatch::Inst inst = batch[i]; + for (bst_uint j = 0; j < inst.length; ++j) { + const SparseBatch::Entry &e = inst[j]; + builder.Push(e.index, + SparseBatch::Entry(ridx[i], e.fvalue), + tid); + } + } + CHECK_EQ(pcol->Size(), info().num_col); + // sort columns + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) num_threads(nthread) + for (bst_omp_uint i = 0; i < ncol; ++i) { + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(dmlc::BeginPtr(pcol->data) + pcol->offset[i], + dmlc::BeginPtr(pcol->data) + pcol->offset[i + 1], + SparseBatch::Entry::CmpValue); + } + } +} + +bool SimpleDMatrix::SingleColBlock() const { + return col_iter_.cpages_.size() <= 1; +} +} // namespace data +} // namespace xgboost diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h new file mode 100644 index 000000000..3b63e1e97 --- /dev/null +++ b/src/data/simple_dmatrix.h @@ -0,0 +1,119 @@ +/*! 
+ * Copyright 2015 by Contributors + * \file simple_dmatrix.h + * \brief In-memory version of DMatrix. + * \author Tianqi Chen + */ +#ifndef XGBOOST_DATA_SIMPLE_DMATRIX_H_ +#define XGBOOST_DATA_SIMPLE_DMATRIX_H_ + +#include +#include +#include +#include +#include +#include "./sparse_batch_page.h" + +namespace xgboost { +namespace data { + +class SimpleDMatrix : public DMatrix { + public: + explicit SimpleDMatrix(std::unique_ptr&& source) + : source_(std::move(source)) {} + + MetaInfo& info() override { + return source_->info; + } + + const MetaInfo& info() const override { + return source_->info; + } + + dmlc::DataIter* RowIterator() override { + dmlc::DataIter* iter = source_.get(); + iter->BeforeFirst(); + return iter; + } + + bool HaveColAccess() const override { + return col_size_.size() != 0; + } + + const std::vector& buffered_rowset() const override { + return buffered_rowset_; + } + + size_t GetColSize(size_t cidx) const { + return col_size_[cidx]; + } + + float GetColDensity(size_t cidx) const override { + size_t nmiss = buffered_rowset_.size() - col_size_[cidx]; + return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); + } + + dmlc::DataIter* ColIterator() override; + + dmlc::DataIter* ColIterator(const std::vector& fset) override; + + void InitColAccess(const std::vector& enabled, + float subsample, + size_t max_row_perbatch) override; + + bool SingleColBlock() const override; + + private: + // in-memory column batch iterator. + struct ColBatchIter: dmlc::DataIter { + public: + ColBatchIter() : data_ptr_(0) {} + void BeforeFirst() override { + data_ptr_ = 0; + } + const ColBatch &Value() const override { + return batch_; + } + bool Next() override; + + private: + // allow SimpleDMatrix to access it. 
+ friend class SimpleDMatrix; + // data content + std::vector col_index_; + // column content + std::vector col_data_; + // column sparse pages + std::vector > cpages_; + // data pointer + size_t data_ptr_; + // temporal space for batch + ColBatch batch_; + }; + + // source data pointer. + std::unique_ptr source_; + // column iterator + ColBatchIter col_iter_; + // list of row index that are buffered. + std::vector buffered_rowset_; + /*! \brief sizeof column data */ + std::vector col_size_; + + // internal function to make one batch from row iter. + void MakeOneBatch(const std::vector& enabled, + float pkeep, + SparsePage *pcol); + + void MakeManyBatch(const std::vector& enabled, + float pkeep, + size_t max_row_perbatch); + + void MakeColPage(const RowBatch& batch, + const bst_uint* ridx, + const std::vector& enabled, + SparsePage* pcol); +}; +} // namespace data +} // namespace xgboost +#endif // XGBOOST_DATA_SIMPLE_DMATRIX_H_ diff --git a/old_src/io/sparse_batch_page.h b/src/data/sparse_batch_page.h similarity index 62% rename from old_src/io/sparse_batch_page.h rename to src/data/sparse_batch_page.h index 96810c0fb..b19504bda 100644 --- a/old_src/io/sparse_batch_page.h +++ b/src/data/sparse_batch_page.h @@ -6,17 +6,18 @@ * use in external memory computation * \author Tianqi Chen */ -#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_ -#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_ +#ifndef XGBOOST_DATA_SPARSE_BATCH_PAGE_H_ +#define XGBOOST_DATA_SPARSE_BATCH_PAGE_H_ +#include +#include #include #include -#include "../data.h" namespace xgboost { -namespace io { +namespace data { /*! - * \brief storage unit of sparse batch + * \brief in-memory storage unit of sparse batch */ class SparsePage { public: @@ -24,6 +25,7 @@ class SparsePage { std::vector offset; /*! \brief the data of the segments */ std::vector data; + /*! 
\brief constructor */ SparsePage() { this->Clear(); @@ -38,14 +40,14 @@ class SparsePage { * \param sorted_index_set sorted index of segments we are interested in * \return true of the loading as successful, false if end of file was reached */ - inline bool Load(utils::ISeekStream *fi, + inline bool Load(dmlc::SeekStream *fi, const std::vector &sorted_index_set) { if (!fi->Read(&disk_offset_)) return false; // setup the offset offset.clear(); offset.push_back(0); for (size_t i = 0; i < sorted_index_set.size(); ++i) { bst_uint fid = sorted_index_set[i]; - utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format"); + CHECK_LT(fid + 1, disk_offset_.size()); size_t size = disk_offset_[fid + 1] - disk_offset_[fid]; offset.push_back(offset.back() + size); } @@ -56,7 +58,7 @@ class SparsePage { for (size_t i = 0; i < sorted_index_set.size();) { bst_uint fid = sorted_index_set[i]; if (disk_offset_[fid] != curr_offset) { - utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted"); + CHECK_GT(disk_offset_[fid], curr_offset); fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry)); curr_offset = disk_offset_[fid]; } @@ -68,10 +70,12 @@ class SparsePage { break; } } + if (size_to_read != 0) { - utils::Check(fi->Read(BeginPtr(data) + offset[i], - size_to_read * sizeof(SparseBatch::Entry)) != 0, - "Invalid SparsePage file"); + CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + offset[i], + size_to_read * sizeof(SparseBatch::Entry)), + size_to_read * sizeof(SparseBatch::Entry)) + << "Invalid SparsePage file"; curr_offset += size_to_read; } i = j; @@ -87,13 +91,14 @@ class SparsePage { * \param fi the input stream of the file * \return true of the loading as successful, false if end of file was reached */ - inline bool Load(utils::IStream *fi) { + inline bool Load(dmlc::Stream *fi) { if (!fi->Read(&offset)) return false; - utils::Check(offset.size() != 0, "Invalid SparsePage file"); + CHECK_NE(offset.size(), 0) << "Invalid SparsePage file"; 
data.resize(offset.back()); if (data.size() != 0) { - utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0, - "Invalid SparsePage file"); + CHECK_EQ(fi->Read(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)), + data.size() * sizeof(SparseBatch::Entry)) + << "Invalid SparsePage file"; } return true; } @@ -102,12 +107,12 @@ class SparsePage { * to disk it must contain all the elements in the * \param fo output stream */ - inline void Save(utils::IStream *fo) const { - utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset"); - utils::Assert(offset.back() == data.size(), "in consistent SparsePage"); + inline void Save(dmlc::Stream *fo) const { + CHECK(offset.size() != 0 && offset[0] == 0); + CHECK_EQ(offset.back(), data.size()); fo->Write(offset); if (data.size() != 0) { - fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)); + fo->Write(dmlc::BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)); } } /*! \return estimation of memory cost of this page */ @@ -125,13 +130,14 @@ class SparsePage { * \param fi the input stream of the file * \return true of the loading as successful, false if end of file was reached */ - inline bool PushLoad(utils::IStream *fi) { + inline bool PushLoad(dmlc::Stream *fi) { if (!fi->Read(&disk_offset_)) return false; data.resize(offset.back() + disk_offset_.back()); if (disk_offset_.back() != 0) { - utils::Check(fi->Read(BeginPtr(data) + offset.back(), - disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0, - "Invalid SparsePage file"); + CHECK_EQ(fi->Read(dmlc::BeginPtr(data) + offset.back(), + disk_offset_.back() * sizeof(SparseBatch::Entry)), + disk_offset_.back() * sizeof(SparseBatch::Entry)) + << "Invalid SparsePage file"; } size_t top = offset.back(); size_t begin = offset.size(); @@ -147,7 +153,7 @@ class SparsePage { */ inline void Push(const RowBatch &batch) { data.resize(offset.back() + batch.ind_ptr[batch.size]); - std::memcpy(BeginPtr(data) + 
offset.back(), + std::memcpy(dmlc::BeginPtr(data) + offset.back(), batch.data_ptr + batch.ind_ptr[0], sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]); size_t top = offset.back(); @@ -164,8 +170,8 @@ class SparsePage { inline void Push(const SparsePage &batch) { size_t top = offset.back(); data.resize(top + batch.data.size()); - std::memcpy(BeginPtr(data) + top, - BeginPtr(batch.data), + std::memcpy(dmlc::BeginPtr(data) + top, + dmlc::BeginPtr(batch.data), sizeof(SparseBatch::Entry) * batch.data.size()); size_t begin = offset.size(); offset.resize(begin + batch.Size()); @@ -182,7 +188,7 @@ class SparsePage { size_t begin = data.size(); data.resize(begin + inst.length); if (inst.length != 0) { - std::memcpy(BeginPtr(data) + begin, inst.data, + std::memcpy(dmlc::BeginPtr(data) + begin, inst.data, sizeof(SparseBatch::Entry) * inst.length); } } @@ -193,8 +199,8 @@ class SparsePage { inline RowBatch GetRowBatch(size_t base_rowid) const { RowBatch out; out.base_rowid = base_rowid; - out.ind_ptr = BeginPtr(offset); - out.data_ptr = BeginPtr(data); + out.ind_ptr = dmlc::BeginPtr(offset); + out.data_ptr = dmlc::BeginPtr(data); out.size = offset.size() - 1; return out; } @@ -203,70 +209,6 @@ class SparsePage { /*! \brief external memory column offset */ std::vector disk_offset_; }; -/*! 
- * \brief factory class for SparsePage, - * used in threadbuffer template - */ -class SparsePageFactory { - public: - SparsePageFactory(void) - : action_load_all_(true), set_load_all_(true) {} - inline void SetFile(const utils::FileStream &fi, - size_t file_begin = 0) { - fi_ = fi; - file_begin_ = file_begin; - } - inline const std::vector &index_set(void) const { - return action_index_set_; - } - // set index set, will be used after next before first - inline void SetIndexSet(const std::vector &index_set, - bool load_all) { - set_load_all_ = load_all; - if (!set_load_all_) { - set_index_set_ = index_set; - std::sort(set_index_set_.begin(), set_index_set_.end()); - } - } - inline bool Init(void) { - return true; - } - inline void SetParam(const char *name, const char *val) {} - inline bool LoadNext(SparsePage *val) { - if (!action_load_all_) { - if (action_index_set_.size() == 0) { - return false; - } else { - return val->Load(&fi_, action_index_set_); - } - } else { - return val->Load(&fi_); - } - } - inline SparsePage *Create(void) { - return new SparsePage(); - } - inline void FreeSpace(SparsePage *a) { - delete a; - } - inline void Destroy(void) { - fi_.Close(); - } - inline void BeforeFirst(void) { - fi_.Seek(file_begin_); - action_load_all_ = set_load_all_; - if (!set_load_all_) { - action_index_set_ = set_index_set_; - } - } - - private: - bool action_load_all_, set_load_all_; - size_t file_begin_; - utils::FileStream fi_; - std::vector action_index_set_; - std::vector set_index_set_; -}; -} // namespace io +} // namespace data } // namespace xgboost -#endif // XGBOOST_IO_SPARSE_BATCH_PAGE_H_ +#endif // XGBOOST_DATA_SPARSE_BATCH_PAGE_H_ diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 010655047..f4d235e1b 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -5,10 +5,10 @@ * the update rule is parallel coordinate descent (shotgun) * \author Tianqi Chen */ -#include #include #include #include +#include #include #include #include @@ -17,6 
+17,9 @@ namespace xgboost { namespace gbm { + +DMLC_REGISTRY_FILE_TAG(gblinear); + // model parameter struct GBLinearModelParam :public dmlc::Parameter { // number of feature dimension @@ -168,6 +171,9 @@ class GBLinear : public GradientBooster { int64_t buffer_offset, std::vector *out_preds, unsigned ntree_limit) override { + if (model.weight.size() == 0) { + model.InitModel(); + } CHECK_EQ(ntree_limit, 0) << "GBLinear::Predict ntrees is only valid for gbtree predictor"; std::vector &preds = *out_preds; @@ -293,4 +299,3 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear") }); } // namespace gbm } // namespace xgboost - diff --git a/src/gbm/gbm.cc b/src/gbm/gbm.cc new file mode 100644 index 000000000..ae5185867 --- /dev/null +++ b/src/gbm/gbm.cc @@ -0,0 +1,29 @@ +/*! + * Copyright 2015 by Contributors + * \file gbm.cc + * \brief Registry of gradient boosters. + */ +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg); +} // namespace dmlc + +namespace xgboost { +GradientBooster* GradientBooster::Create(const std::string& name) { + auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name); + if (e == nullptr) { + LOG(FATAL) << "Unknown gbm type " << name; + } + return (e->body)(); +} +} // namespace xgboost + +namespace xgboost { +namespace gbm { +// List of files that will be force linked in static links. +DMLC_REGISTRY_LINK_TAG(gblinear); +DMLC_REGISTRY_LINK_TAG(gbtree); +} // namespace gbm +} // namespace xgboost diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index deb814ded..6618cd503 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -4,9 +4,9 @@ * \brief gradient boosted tree implementation. * \author Tianqi Chen */ -#include #include #include +#include #include #include @@ -19,6 +19,8 @@ namespace xgboost { namespace gbm { +DMLC_REGISTRY_FILE_TAG(gbtree); + /*! \brief training parameters */ struct GBTreeTrainParam : public dmlc::Parameter { /*! 
\brief number of threads */ @@ -482,4 +484,3 @@ XGBOOST_REGISTER_GBM(GBTree, "gbtree") }); } // namespace gbm } // namespace xgboost - diff --git a/src/global.cc b/src/global.cc deleted file mode 100644 index 62cf95f1f..000000000 --- a/src/global.cc +++ /dev/null @@ -1,72 +0,0 @@ -/*! - * Copyright 2015 by Contributors - * \file global.cc - * \brief Enable all kinds of global static registry and variables. - */ -#include -#include -#include -#include -#include "./common/random.h" -#include "./common/base64.h" - -namespace dmlc { -DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg); -DMLC_REGISTRY_ENABLE(::xgboost::MetricReg); -DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg); -DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg); -} // namespace dmlc - -namespace xgboost { -// implement factory functions -ObjFunction* ObjFunction::Create(const std::string& name) { - auto *e = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name); - if (e == nullptr) { - LOG(FATAL) << "Unknown objective function " << name; - } - return (e->body)(); -} - -Metric* Metric::Create(const std::string& name) { - std::string buf = name; - std::string prefix = name; - auto pos = buf.find('@'); - if (pos == std::string::npos) { - auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(name); - if (e == nullptr) { - LOG(FATAL) << "Unknown objective function " << name; - } - return (e->body)(nullptr); - } else { - std::string prefix = buf.substr(0, pos); - auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str()); - if (e == nullptr) { - LOG(FATAL) << "Unknown objective function " << name; - } - return (e->body)(buf.substr(pos + 1, buf.length()).c_str()); - } -} - -TreeUpdater* TreeUpdater::Create(const std::string& name) { - auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name); - if (e == nullptr) { - LOG(FATAL) << "Unknown tree updater " << name; - } - return (e->body)(); -} - -GradientBooster* GradientBooster::Create(const std::string& 
name) { - auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name); - if (e == nullptr) { - LOG(FATAL) << "Unknown gbm type " << name; - } - return (e->body)(); -} - -namespace common { -RandomEngine& GlobalRandom() { - static RandomEngine inst; - return inst; -} -} -} // namespace xgboost diff --git a/src/learner.cc b/src/learner.cc index 0a01791c7..57daa2361 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "./common/io.h" #include "./common/random.h" @@ -94,6 +95,9 @@ struct LearnerTrainParam } }; +DMLC_REGISTER_PARAMETER(LearnerModelParam); +DMLC_REGISTER_PARAMETER(LearnerTrainParam); + /*! * \brief learner that performs gradient boosting for a specific objective function. * It does training and prediction. @@ -144,6 +148,9 @@ class LearnerImpl : public Learner { if (cfg_.count("num_class") != 0) { cfg_["num_output_group"] = cfg_["num_class"]; + if (atoi(cfg_["num_class"].c_str()) > 1 && cfg_.count("objective") == 0) { + cfg_["objective"] = "multi:softmax"; + } } if (cfg_.count("max_delta_step") == 0 && @@ -187,6 +194,10 @@ class LearnerImpl : public Learner { } } + void InitModel() override { + this->LazyInitModel(); + } + void Load(dmlc::Stream* fi) override { // TODO(tqchen) mark deprecation of old format. common::PeekableInStream fp(fi); @@ -202,7 +213,6 @@ class LearnerImpl : public Learner { } // use the peekable reader. 
fi = &fp; - std::string name_gbm, name_obj; // read parameter CHECK_EQ(fi->Read(&mparam, sizeof(mparam)), sizeof(mparam)) << "BoostLearner: wrong model format"; @@ -218,7 +228,7 @@ class LearnerImpl : public Learner { len = len >> static_cast(32UL); } if (len != 0) { - name_obj.resize(len); + name_obj_.resize(len); CHECK_EQ(fi->Read(&name_obj_[0], len), len) <<"BoostLearner: wrong model format"; } @@ -226,8 +236,10 @@ class LearnerImpl : public Learner { CHECK(fi->Read(&name_gbm_)) << "BoostLearner: wrong model format"; // duplicated code with LazyInitModel - obj_.reset(ObjFunction::Create(cfg_.at(name_obj_))); - gbm_.reset(GradientBooster::Create(cfg_.at(name_gbm_))); + obj_.reset(ObjFunction::Create(name_obj_)); + gbm_.reset(GradientBooster::Create(name_gbm_)); + gbm_->Load(fi); + if (metrics_.size() == 0) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric())); } @@ -246,11 +258,12 @@ class LearnerImpl : public Learner { } void UpdateOneIter(int iter, DMatrix* train) override { + CHECK(ModelInitialized()) + << "Always call InitModel or LoadModel before update"; if (tparam.seed_per_iteration || rabit::IsDistributed()) { common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter); } this->LazyInitDMatrix(train); - this->LazyInitModel(); this->PredictRaw(train, &preds_); obj_->GetGradient(preds_, train->info(), iter, &gpair_); gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_); @@ -262,6 +275,7 @@ class LearnerImpl : public Learner { if (tparam.seed_per_iteration || rabit::IsDistributed()) { common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter); } + this->LazyInitDMatrix(train); gbm_->DoBoost(train, this->FindBufferOffset(train), in_gpair); } @@ -269,7 +283,8 @@ class LearnerImpl : public Learner { const std::vector& data_sets, const std::vector& data_names) override { std::ostringstream os; - os << '[' << iter << ']'; + os << '[' << iter << ']' + << std::setiosflags(std::ios::fixed); for (size_t i = 0; i < data_sets.size(); 
++i) { this->PredictRaw(data_sets[i], &preds_); obj_->EvalTransform(&preds_); @@ -347,8 +362,6 @@ class LearnerImpl : public Learner { if (num_feature > mparam.num_feature) { mparam.num_feature = num_feature; } - // reset the base score - mparam.base_score = obj_->ProbToMargin(mparam.base_score); // setup cfg_["num_feature"] = ToString(mparam.num_feature); @@ -357,9 +370,13 @@ class LearnerImpl : public Learner { gbm_.reset(GradientBooster::Create(name_gbm_)); gbm_->Configure(cfg_.begin(), cfg_.end()); obj_->Configure(cfg_.begin(), cfg_.end()); + + // reset the base score + mparam.base_score = obj_->ProbToMargin(mparam.base_score); if (metrics_.size() == 0) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric())); } + this->base_score_ = mparam.base_score; gbm_->ResetPredBuffer(pred_buffer_size_); } @@ -373,6 +390,8 @@ class LearnerImpl : public Learner { inline void PredictRaw(DMatrix* data, std::vector* out_preds, unsigned ntree_limit = 0) const { + CHECK(gbm_.get() != nullptr) + << "Predict must happen after Load or InitModel"; gbm_->Predict(data, this->FindBufferOffset(data), out_preds, diff --git a/src/logging.cc b/src/logging.cc new file mode 100644 index 000000000..ffb249bd2 --- /dev/null +++ b/src/logging.cc @@ -0,0 +1,20 @@ +/*! + * Copyright 2015 by Contributors + * \file logging.cc + * \brief Implementation of loggers. 
+ * \author Tianqi Chen + */ +#include +#include +#include "./common/sync.h" + +namespace xgboost { +ConsoleLogger::~ConsoleLogger() { + std::cout << log_stream_.str() << std::endl; +} + +TrackerLogger::~TrackerLogger() { + log_stream_ << '\n'; + rabit::TrackerPrint(log_stream_.str()); +} +} // namespace xgboost diff --git a/src/metric/elementwise_metric.cc b/src/metric/elementwise_metric.cc index bccee0ddf..ac9ef387f 100644 --- a/src/metric/elementwise_metric.cc +++ b/src/metric/elementwise_metric.cc @@ -5,12 +5,16 @@ * \author Kailong Chen, Tianqi Chen */ #include +#include #include #include "../common/math.h" #include "../common/sync.h" namespace xgboost { namespace metric { +// tag this file, used by force static link later. +DMLC_REGISTRY_FILE_TAG(elementwise_metric); + /*! * \brief base class of element-wise evaluation * \tparam Derived the name of subclass @@ -124,4 +128,3 @@ XGBOOST_REGISTER_METRIC(PossionNegLoglik, "poisson-nloglik") } // namespace metric } // namespace xgboost - diff --git a/src/metric/metric.cc b/src/metric/metric.cc new file mode 100644 index 000000000..7986dec6b --- /dev/null +++ b/src/metric/metric.cc @@ -0,0 +1,42 @@ +/*! + * Copyright 2015 by Contributors + * \file metric_registry.cc + * \brief Registry of objective functions. 
+ */ +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::xgboost::MetricReg); +} + +namespace xgboost { +Metric* Metric::Create(const std::string& name) { + std::string buf = name; + std::string prefix = name; + auto pos = buf.find('@'); + if (pos == std::string::npos) { + auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(name); + if (e == nullptr) { + LOG(FATAL) << "Unknown metric function " << name; + } + return (e->body)(nullptr); + } else { + std::string prefix = buf.substr(0, pos); + auto *e = ::dmlc::Registry< ::xgboost::MetricReg>::Get()->Find(prefix.c_str()); + if (e == nullptr) { + LOG(FATAL) << "Unknown metric function " << name; + } + return (e->body)(buf.substr(pos + 1, buf.length()).c_str()); + } +} +} // namespace xgboost + +namespace xgboost { +namespace metric { +// List of files that will be force linked in static links. +DMLC_REGISTRY_LINK_TAG(elementwise_metric); +DMLC_REGISTRY_LINK_TAG(multiclass_metric); +DMLC_REGISTRY_LINK_TAG(rank_metric); +} // namespace metric +} // namespace xgboost diff --git a/src/metric/multiclass_metric.cc b/src/metric/multiclass_metric.cc index cd10168f9..d51379c64 100644 --- a/src/metric/multiclass_metric.cc +++ b/src/metric/multiclass_metric.cc @@ -11,6 +11,9 @@ namespace xgboost { namespace metric { +// tag this file, used by force static link later. + +/*! 
 * \brief base class of multi-class evaluation * \tparam Derived the name of subclass @@ -114,4 +117,3 @@ XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss") .set_body([](const char* param) { return new EvalMultiLogLoss(); }); } // namespace metric } // namespace xgboost - diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index ee2a0c948..feb0f37ff 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -5,12 +5,16 @@ * \author Kailong Chen, Tianqi Chen */ #include +#include #include #include "../common/sync.h" #include "../common/math.h" namespace xgboost { namespace metric { +// tag this file, used by force static link later. +DMLC_REGISTRY_FILE_TAG(rank_metric); + /*! \brief AMS: also records best threshold */ struct EvalAMS : public Metric { public: diff --git a/src/objective/multiclass_obj.cc b/src/objective/multiclass_obj.cc index cf381f5ef..42b9fa255 100644 --- a/src/objective/multiclass_obj.cc +++ b/src/objective/multiclass_obj.cc @@ -4,9 +4,9 @@ * \brief Definition of multi-class classification objectives. * \author Tianqi Chen */ -#include #include #include +#include #include #include #include @@ -16,6 +16,8 @@ namespace xgboost { namespace obj { +DMLC_REGISTRY_FILE_TAG(multiclass_obj); + struct SoftmaxMultiClassParam : public dmlc::Parameter { int num_class; // declare parameters diff --git a/src/objective/objective.cc b/src/objective/objective.cc new file mode 100644 index 000000000..413494d3d --- /dev/null +++ b/src/objective/objective.cc @@ -0,0 +1,34 @@ +/*! + * Copyright 2015 by Contributors + * \file objective.cc + * \brief Registry of all objective functions. 
+ */ +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg); +} // namespace dmlc + +namespace xgboost { +// implement factory functions +ObjFunction* ObjFunction::Create(const std::string& name) { + auto *e = ::dmlc::Registry< ::xgboost::ObjFunctionReg>::Get()->Find(name); + if (e == nullptr) { + for (const auto& entry : ::dmlc::Registry< ::xgboost::ObjFunctionReg>::List()) { + LOG(INFO) << "Objective candidate: " << entry->name; + } + LOG(FATAL) << "Unknown objective function " << name; + } + return (e->body)(); +} +} // namespace xgboost + +namespace xgboost { +namespace obj { +// List of files that will be force linked in static links. +DMLC_REGISTRY_LINK_TAG(regression_obj); +DMLC_REGISTRY_LINK_TAG(multiclass_obj); +DMLC_REGISTRY_LINK_TAG(rank_obj); +} // namespace obj +} // namespace xgboost diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc index 8cb2676a2..faa17c322 100644 --- a/src/objective/rank_obj.cc +++ b/src/objective/rank_obj.cc @@ -4,8 +4,8 @@ * \brief Definition of rank loss. * \author Tianqi Chen, Kailong Chen */ -#include #include +#include #include #include #include @@ -16,6 +16,8 @@ namespace xgboost { namespace obj { +DMLC_REGISTRY_FILE_TAG(rank_obj); + struct LambdaRankParam : public dmlc::Parameter { int num_pairsample; float fix_list_weight; @@ -324,4 +326,3 @@ XGBOOST_REGISTER_OBJECTIVE(LambdaRankObjMAP, "rank:map") } // namespace obj } // namespace xgboost - diff --git a/src/objective/regression_obj.cc b/src/objective/regression_obj.cc index aa8a0237f..bf2a08e1a 100644 --- a/src/objective/regression_obj.cc +++ b/src/objective/regression_obj.cc @@ -4,8 +4,8 @@ * \brief Definition of single-value regression and classification objectives. 
* \author Tianqi Chen, Kailong Chen */ -#include #include +#include #include #include #include @@ -14,6 +14,9 @@ namespace xgboost { namespace obj { + +DMLC_REGISTRY_FILE_TAG(regression_obj); + // common regressions // linear regression struct LinearSquareLoss { @@ -84,7 +87,9 @@ class RegLossObj : public ObjFunction { int iter, std::vector *out_gpair) override { CHECK_NE(info.labels.size(), 0) << "label set cannot be empty"; - CHECK_EQ(preds.size(), info.labels.size()) << "labels are not correctly provided"; + CHECK_EQ(preds.size(), info.labels.size()) + << "labels are not correctly provided" + << "preds.size=" << preds.size() << ", label.size=" << info.labels.size(); out_gpair->resize(preds.size()); // check if label in range bool label_correct = true; @@ -95,7 +100,7 @@ class RegLossObj : public ObjFunction { float p = Loss::PredTransform(preds[i]); float w = info.GetWeight(i); if (info.labels[i] == 1.0f) w *= param_.scale_pos_weight; - if (Loss::CheckLabel(info.labels[i])) label_correct = false; + if (!Loss::CheckLabel(info.labels[i])) label_correct = false; out_gpair->at(i) = bst_gpair(Loss::FirstOrderGradient(p, info.labels[i]) * w, Loss::SecondOrderGradient(p, info.labels[i]) * w); } diff --git a/src/tree/param.h b/src/tree/param.h index cd1bfa999..b6ac89aef 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -71,7 +71,7 @@ struct TrainParam : public dmlc::Parameter { .describe("L2 regularization on leaf weight"); DMLC_DECLARE_FIELD(reg_alpha).set_lower_bound(0.0f).set_default(0.0f) .describe("L1 regularization on leaf weight"); - DMLC_DECLARE_FIELD(default_direction) + DMLC_DECLARE_FIELD(default_direction).set_default(0) .add_enum("learn", 0) .add_enum("left", 1) .add_enum("right", 2) diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc new file mode 100644 index 000000000..ca04a2c84 --- /dev/null +++ b/src/tree/tree_updater.cc @@ -0,0 +1,35 @@ +/*! 
+ * Copyright 2015 by Contributors + * \file tree_updater.cc + * \brief Registry of tree updaters. + */ +#include +#include + +namespace dmlc { +DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg); +} // namespace dmlc + +namespace xgboost { + +TreeUpdater* TreeUpdater::Create(const std::string& name) { + auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name); + if (e == nullptr) { + LOG(FATAL) << "Unknown tree updater " << name; + } + return (e->body)(); +} + +} // namespace xgboost + +namespace xgboost { +namespace tree { +// List of files that will be force linked in static links. +DMLC_REGISTRY_LINK_TAG(updater_colmaker); +DMLC_REGISTRY_LINK_TAG(updater_skmaker); +DMLC_REGISTRY_LINK_TAG(updater_refresh); +DMLC_REGISTRY_LINK_TAG(updater_prune); +DMLC_REGISTRY_LINK_TAG(updater_histmaker); +DMLC_REGISTRY_LINK_TAG(updater_sync); +} // namespace tree +} // namespace xgboost diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 2556ac74a..26efb33bc 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -15,6 +15,9 @@ namespace xgboost { namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_colmaker); + /*! 
\brief column-wise update to construct a tree */ template class ColMaker: public TreeUpdater { @@ -891,4 +894,3 @@ XGBOOST_REGISTER_TREE_UPDATER(DistColMaker, "distcol") }); } // namespace tree } // namespace xgboost - diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index e79676bc2..c6d53b270 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -15,6 +15,9 @@ namespace xgboost { namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_histmaker); + template class HistMaker: public BaseMaker { public: diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index fcc816d44..af52f73f4 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -14,6 +14,9 @@ namespace xgboost { namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_prune); + /*! \brief pruner that prunes a tree after growing finishes */ class TreePruner: public TreeUpdater { public: diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index 25fd57a89..3fef13ef6 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -14,6 +14,9 @@ namespace xgboost { namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_refresh); + /*! 
\brief pruner that prunes a tree after growing finishs */ template class TreeRefresher: public TreeUpdater { diff --git a/src/tree/updater_skmaker.cc b/src/tree/updater_skmaker.cc index 6125bf066..c0d62ce5e 100644 --- a/src/tree/updater_skmaker.cc +++ b/src/tree/updater_skmaker.cc @@ -18,6 +18,8 @@ namespace xgboost { namespace tree { +DMLC_REGISTRY_FILE_TAG(updater_skmaker); + class SketchMaker: public BaseMaker { public: void Update(const std::vector &gpair, @@ -399,4 +401,3 @@ XGBOOST_REGISTER_TREE_UPDATER(SketchMaker, "grow_skmaker") }); } // namespace tree } // namespace xgboost - diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 0869a232f..bd17968cd 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -12,6 +12,9 @@ namespace xgboost { namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_sync); + /*! * \brief syncher that synchronize the tree in all distributed nodes * can implement various strategies, so far it is always set to node 0's tree diff --git a/wrapper/.gitignore b/wrapper/.gitignore deleted file mode 100644 index 2ebc5b00b..000000000 --- a/wrapper/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -build -dist -*.egg* diff --git a/wrapper/README.md b/wrapper/README.md deleted file mode 100644 index 77316e15c..000000000 --- a/wrapper/README.md +++ /dev/null @@ -1,9 +0,0 @@ -XGBoost Wrappers -================ -This folder provides wrapper to create xgboost packages to other languages. 
- -***Supported Language Packages*** -* [Python package](../python-package) -* [R-package](../R-package) -* [Java Package](../java) -* [Julia Package](https://github.com/antinucleon/XGBoost.jl) diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp deleted file mode 100644 index 6d547fe18..000000000 --- a/wrapper/xgboost_wrapper.cpp +++ /dev/null @@ -1,599 +0,0 @@ -// Copyright (c) 2014 by Contributors -// implementations in ctypes -#define _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_DEPRECATE -#include -#include -#include -#include -#include -#include -#include -// include all std functions -using namespace std; -#include "./xgboost_wrapper.h" -#include "../src/data.h" -#include "../src/learner/learner-inl.hpp" -#include "../src/io/io.h" -#include "../src/utils/utils.h" -#include "../src/utils/math.h" -#include "../src/utils/group_data.h" -#include "../src/io/simple_dmatrix-inl.hpp" - -using namespace xgboost; -using namespace xgboost::io; - -namespace xgboost { -namespace wrapper { -// booster wrapper class -class Booster: public learner::BoostLearner { - public: - explicit Booster(const std::vector& mats) { - this->silent = 1; - this->init_model = false; - this->SetCacheData(mats); - } - inline const float *Pred(const DataMatrix &dmat, int option_mask, - unsigned ntree_limit, bst_ulong *len) { - this->CheckInitModel(); - this->Predict(dmat, (option_mask&1) != 0, &this->preds_, - ntree_limit, (option_mask&2) != 0); - *len = static_cast(this->preds_.size()); - return BeginPtr(this->preds_); - } - inline void BoostOneIter(const DataMatrix &train, - float *grad, float *hess, bst_ulong len) { - this->gpair_.resize(len); - const bst_omp_uint ndata = static_cast(len); - #pragma omp parallel for schedule(static) - for (bst_omp_uint j = 0; j < ndata; ++j) { - gpair_[j] = bst_gpair(grad[j], hess[j]); - } - gbm_->DoBoost(train.fmat(), this->FindBufferOffset(train), train.info.info, &gpair_); - } - inline void CheckInitModel(void) { - if (!init_model) { 
- this->InitModel(); init_model = true; - } - } - inline void LoadModel(const char *fname) { - learner::BoostLearner::LoadModel(fname); - this->init_model = true; - } - inline void LoadModelFromBuffer(const void *buf, size_t size) { - utils::MemoryFixSizeBuffer fs((void*)buf, size); // NOLINT(*) - learner::BoostLearner::LoadModel(fs, true); - this->init_model = true; - } - inline const char *GetModelRaw(bst_ulong *out_len) { - this->CheckInitModel(); - model_str.resize(0); - utils::MemoryBufferStream fs(&model_str); - learner::BoostLearner::SaveModel(fs, false); - *out_len = static_cast(model_str.length()); - if (*out_len == 0) { - return NULL; - } else { - return &model_str[0]; - } - } - inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) { - model_dump = this->DumpModel(fmap, with_stats); - model_dump_cptr.resize(model_dump.size()); - for (size_t i = 0; i < model_dump.size(); ++i) { - model_dump_cptr[i] = model_dump[i].c_str(); - } - *len = static_cast(model_dump.size()); - return BeginPtr(model_dump_cptr); - } - // temporal fields - // temporal data to save evaluation dump - std::string eval_str; - // temporal data to save model dump - std::string model_str; - // temporal space to save model dump - std::vector model_dump; - std::vector model_dump_cptr; - - private: - bool init_model; -}; -} // namespace wrapper -} // namespace xgboost - -using namespace xgboost::wrapper; - -#ifndef XGBOOST_STRICT_CXX98_ -namespace xgboost { -namespace wrapper { -// helper to support threadlocal -struct ThreadLocalStore { - std::vector data; - // allocate a string - inline std::string *Alloc() { - mutex.Lock(); - data.push_back(new std::string()); - std::string *ret = data.back(); - mutex.Unlock(); - return ret; - } - ThreadLocalStore() { - mutex.Init(); - } - ~ThreadLocalStore() { - for (size_t i = 0; i < data.size(); ++i) { - delete data[i]; - } - mutex.Destroy(); - } - utils::Mutex mutex; -}; - -static ThreadLocalStore 
thread_local_store; -} // namespace wrapper -} // namespace xgboost - -/*! \brief macro to guard beginning and end section of all functions */ -#define API_BEGIN() try { -/*! - * \brief every function starts with API_BEGIN(); and finishes with API_END(); - * \param Finalize optionally put in a finalizer - */ -#define API_END_FINALIZE(Finalize) } catch(std::exception &e) { \ - Finalize; return XGBHandleException(e); \ - } return 0; -/*! \brief API End with no finalization */ -#define API_END() API_END_FINALIZE(;) - -// do not use threadlocal on OSX since it is not always available -#ifndef DISABLE_THREAD_LOCAL -#ifdef __GNUC__ - #define XGB_TREAD_LOCAL __thread -#elif __STDC_VERSION__ >= 201112L - #define XGB_TREAD_LOCAL _Thread_local -#elif defined(_MSC_VER) - #define XGB_TREAD_LOCAL __declspec(thread) -#endif -#endif - -#ifndef XGB_TREAD_LOCAL -#pragma message("Warning: Threadlocal not enabled, used single thread error handling") -#define XGB_TREAD_LOCAL -#endif - -/*! - * \brief a helper function for error handling - * will set the last error to be str_set when it is not NULL - * \param str_set the error to set - * \return a pointer message to last error - */ -const char *XGBSetGetLastError_(const char *str_set) { - // use last_error to record last error - static XGB_TREAD_LOCAL std::string *last_error = NULL; - if (last_error == NULL) { - last_error = thread_local_store.Alloc(); - } - if (str_set != NULL) { - *last_error = str_set; - } - return last_error->c_str(); -} -#else -// crippled implementation for solaris case -// exception handling is not needed for R, so it is OK. -#define API_BEGIN() -#define API_END_FINALIZE(Finalize) return 0 -#define API_END() return 0 - -const char *XGBSetGetLastError_(const char *str_set) { - return NULL; -} -#endif // XGBOOST_STRICT_CXX98_ - -/*! \brief return str message of the last error */ -const char *XGBGetLastError() { - return XGBSetGetLastError_(NULL); -} - -/*! 
- * \brief handle exception throwed out - * \param e the exception - * \return the return value of API after exception is handled - */ -int XGBHandleException(const std::exception &e) { - XGBSetGetLastError_(e.what()); - return -1; -} - -int XGDMatrixCreateFromFile(const char *fname, - int silent, - DMatrixHandle *out) { - API_BEGIN(); - *out = LoadDataMatrix(fname, silent != 0, false, false); - API_END(); -} - -int XGDMatrixCreateFromCSR(const bst_ulong *indptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem, - DMatrixHandle *out) { - DMatrixSimple *p_mat = NULL; - API_BEGIN(); - p_mat = new DMatrixSimple(); - DMatrixSimple &mat = *p_mat; - mat.row_ptr_.resize(nindptr); - for (bst_ulong i = 0; i < nindptr; ++i) { - mat.row_ptr_[i] = static_cast(indptr[i]); - } - mat.row_data_.resize(nelem); - for (bst_ulong i = 0; i < nelem; ++i) { - mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); - mat.info.info.num_col = std::max(mat.info.info.num_col, - static_cast(indices[i]+1)); - } - mat.info.info.num_row = nindptr - 1; - *out = p_mat; - API_END_FINALIZE(delete p_mat); -} - -int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr, - const unsigned *indices, - const float *data, - bst_ulong nindptr, - bst_ulong nelem, - DMatrixHandle *out) { - DMatrixSimple *p_mat = NULL; - API_BEGIN(); - int nthread; - #pragma omp parallel - { - nthread = omp_get_num_threads(); - } - p_mat = new DMatrixSimple(); - DMatrixSimple &mat = *p_mat; - utils::ParallelGroupBuilder builder(&mat.row_ptr_, &mat.row_data_); - builder.InitBudget(0, nthread); - long ncol = static_cast(nindptr - 1); // NOLINT(*) - #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.AddBudget(indices[j], tid); - } - } - builder.InitStorage(); - #pragma omp parallel for schedule(static) - for (long i = 0; i < ncol; ++i) { // NOLINT(*) - int tid 
= omp_get_thread_num(); - for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.Push(indices[j], - RowBatch::Entry(static_cast(i), data[j]), - tid); - } - } - mat.info.info.num_row = mat.row_ptr_.size() - 1; - mat.info.info.num_col = static_cast(ncol); - *out = p_mat; - API_END_FINALIZE(delete p_mat); -} - -int XGDMatrixCreateFromMat(const float *data, - bst_ulong nrow, - bst_ulong ncol, - float missing, - DMatrixHandle *out) { - DMatrixSimple *p_mat = NULL; - API_BEGIN(); - p_mat = new DMatrixSimple(); - bool nan_missing = utils::CheckNAN(missing); - DMatrixSimple &mat = *p_mat; - mat.info.info.num_row = nrow; - mat.info.info.num_col = ncol; - for (bst_ulong i = 0; i < nrow; ++i, data += ncol) { - bst_ulong nelem = 0; - for (bst_ulong j = 0; j < ncol; ++j) { - if (utils::CheckNAN(data[j])) { - utils::Check(nan_missing, - "There are NAN in the matrix, however, you did not set missing=NAN"); - } else { - if (nan_missing || data[j] != missing) { - mat.row_data_.push_back(RowBatch::Entry(j, data[j])); - ++nelem; - } - } - } - mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem); - } - *out = p_mat; - API_END_FINALIZE(delete p_mat); -} - -int XGDMatrixSliceDMatrix(DMatrixHandle handle, - const int *idxset, - bst_ulong len, - DMatrixHandle *out) { - DMatrixSimple *p_ret = NULL; - API_BEGIN(); - DMatrixSimple tmp; - DataMatrix &dsrc = *static_cast(handle); - if (dsrc.magic != DMatrixSimple::kMagic) { - tmp.CopyFrom(dsrc); - } - DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ? 
- *static_cast(handle): tmp); - p_ret = new DMatrixSimple(); - DMatrixSimple &ret = *p_ret; - - utils::Check(src.info.group_ptr.size() == 0, - "slice does not support group structure"); - ret.Clear(); - ret.info.info.num_row = len; - ret.info.info.num_col = src.info.num_col(); - - utils::IIterator *iter = src.fmat()->RowIterator(); - iter->BeforeFirst(); - utils::Assert(iter->Next(), "slice"); - const RowBatch &batch = iter->Value(); - for (bst_ulong i = 0; i < len; ++i) { - const int ridx = idxset[i]; - RowBatch::Inst inst = batch[ridx]; - utils::Check(static_cast(ridx) < batch.size, "slice index exceed number of rows"); - ret.row_data_.resize(ret.row_data_.size() + inst.length); - memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data, - sizeof(RowBatch::Entry) * inst.length); - ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length); - if (src.info.labels.size() != 0) { - ret.info.labels.push_back(src.info.labels[ridx]); - } - if (src.info.weights.size() != 0) { - ret.info.weights.push_back(src.info.weights[ridx]); - } - if (src.info.info.root_index.size() != 0) { - ret.info.info.root_index.push_back(src.info.info.root_index[ridx]); - } - if (src.info.info.fold_index.size() != 0) { - ret.info.info.fold_index.push_back(src.info.info.fold_index[ridx]); - } - } - *out = p_ret; - API_END_FINALIZE(delete p_ret); -} - -int XGDMatrixFree(DMatrixHandle handle) { - API_BEGIN(); - delete static_cast(handle); - API_END(); -} - -int XGDMatrixSaveBinary(DMatrixHandle handle, - const char *fname, - int silent) { - API_BEGIN(); - SaveDataMatrix(*static_cast(handle), fname, silent != 0); - API_END(); -} - -int XGDMatrixSetFloatInfo(DMatrixHandle handle, - const char *field, - const float *info, - bst_ulong len) { - API_BEGIN(); - std::vector &vec = - static_cast(handle)->info.GetFloatInfo(field); - vec.resize(len); - memcpy(BeginPtr(vec), info, sizeof(float) * len); - API_END(); -} - -int XGDMatrixSetUIntInfo(DMatrixHandle handle, - const char *field, - const unsigned *info, - 
bst_ulong len) { - API_BEGIN(); - std::vector &vec = - static_cast(handle)->info.GetUIntInfo(field); - vec.resize(len); - memcpy(BeginPtr(vec), info, sizeof(unsigned) * len); - API_END(); -} - -int XGDMatrixSetGroup(DMatrixHandle handle, - const unsigned *group, - bst_ulong len) { - API_BEGIN(); - DataMatrix *pmat = static_cast(handle); - pmat->info.group_ptr.resize(len + 1); - pmat->info.group_ptr[0] = 0; - for (uint64_t i = 0; i < len; ++i) { - pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i] + group[i]; - } - API_END(); -} - -int XGDMatrixGetFloatInfo(const DMatrixHandle handle, - const char *field, - bst_ulong *out_len, - const float **out_dptr) { - API_BEGIN(); - const std::vector &vec = - static_cast(handle)->info.GetFloatInfo(field); - *out_len = static_cast(vec.size()); - *out_dptr = BeginPtr(vec); - API_END(); -} - -int XGDMatrixGetUIntInfo(const DMatrixHandle handle, - const char *field, - bst_ulong *out_len, - const unsigned **out_dptr) { - API_BEGIN(); - const std::vector &vec = - static_cast(handle)->info.GetUIntInfo(field); - *out_len = static_cast(vec.size()); - *out_dptr = BeginPtr(vec); - API_END(); -} - -int XGDMatrixNumRow(const DMatrixHandle handle, - bst_ulong *out) { - API_BEGIN(); - *out = static_cast(static_cast(handle)->info.num_row()); - API_END(); -} - -int XGDMatrixNumCol(const DMatrixHandle handle, - bst_ulong *out) { - API_BEGIN(); - *out = static_cast(static_cast(handle)->info.num_col()); - API_END(); -} - -// xgboost implementation -int XGBoosterCreate(DMatrixHandle dmats[], - bst_ulong len, - BoosterHandle *out) { - API_BEGIN(); - std::vector mats; - for (bst_ulong i = 0; i < len; ++i) { - DataMatrix *dtr = static_cast(dmats[i]); - mats.push_back(dtr); - } - *out = new Booster(mats); - API_END(); -} - -int XGBoosterFree(BoosterHandle handle) { - API_BEGIN(); - delete static_cast(handle); - API_END(); -} - -int XGBoosterSetParam(BoosterHandle handle, - const char *name, const char *value) { - API_BEGIN(); - 
static_cast(handle)->SetParam(name, value); - API_END(); -} - -int XGBoosterUpdateOneIter(BoosterHandle handle, - int iter, - DMatrixHandle dtrain) { - API_BEGIN(); - Booster *bst = static_cast(handle); - DataMatrix *dtr = static_cast(dtrain); - bst->CheckInitModel(); - bst->CheckInit(dtr); - bst->UpdateOneIter(iter, *dtr); - API_END(); -} - -int XGBoosterBoostOneIter(BoosterHandle handle, - DMatrixHandle dtrain, - float *grad, - float *hess, - bst_ulong len) { - API_BEGIN(); - Booster *bst = static_cast(handle); - DataMatrix *dtr = static_cast(dtrain); - bst->CheckInitModel(); - bst->CheckInit(dtr); - bst->BoostOneIter(*dtr, grad, hess, len); - API_END(); -} - -int XGBoosterEvalOneIter(BoosterHandle handle, - int iter, - DMatrixHandle dmats[], - const char *evnames[], - bst_ulong len, - const char **out_str) { - API_BEGIN(); - Booster *bst = static_cast(handle); - std::vector names; - std::vector mats; - for (bst_ulong i = 0; i < len; ++i) { - mats.push_back(static_cast(dmats[i])); - names.push_back(std::string(evnames[i])); - } - bst->CheckInitModel(); - bst->eval_str = bst->EvalOneIter(iter, mats, names); - *out_str = bst->eval_str.c_str(); - API_END(); -} - -int XGBoosterPredict(BoosterHandle handle, - DMatrixHandle dmat, - int option_mask, - unsigned ntree_limit, - bst_ulong *len, - const float **out_result) { - API_BEGIN(); - *out_result = static_cast(handle)-> - Pred(*static_cast(dmat), - option_mask, ntree_limit, len); - API_END(); -} - -int XGBoosterLoadModel(BoosterHandle handle, const char *fname) { - API_BEGIN(); - static_cast(handle)->LoadModel(fname); - API_END(); -} - -int XGBoosterSaveModel(BoosterHandle handle, const char *fname) { - API_BEGIN(); - Booster *bst = static_cast(handle); - bst->CheckInitModel(); - bst->SaveModel(fname, false); - API_END(); -} - -int XGBoosterLoadModelFromBuffer(BoosterHandle handle, - const void *buf, - bst_ulong len) { - API_BEGIN(); - static_cast(handle)->LoadModelFromBuffer(buf, len); - API_END(); -} - -int 
XGBoosterGetModelRaw(BoosterHandle handle, - bst_ulong *out_len, - const char **out_dptr) { - API_BEGIN(); - *out_dptr = static_cast(handle)->GetModelRaw(out_len); - API_END(); -} - -int XGBoosterDumpModel(BoosterHandle handle, - const char *fmap, - int with_stats, - bst_ulong *len, - const char ***out_models) { - API_BEGIN(); - utils::FeatMap featmap; - if (strlen(fmap) != 0) { - featmap.LoadText(fmap); - } - *out_models = static_cast(handle)->GetModelDump( - featmap, with_stats != 0, len); - API_END(); -} - -int XGBoosterDumpModelWithFeatures(BoosterHandle handle, - int fnum, - const char **fname, - const char **ftype, - int with_stats, - bst_ulong *len, - const char ***out_models) { - API_BEGIN(); - utils::FeatMap featmap; - for (int i = 0; i < fnum; ++i) { - featmap.PushBack(i, fname[i], ftype[i]); - } - *out_models = static_cast(handle)->GetModelDump( - featmap, with_stats != 0, len); - API_END(); -}