diff --git a/Makefile b/Makefile
index 5a4f70fef..e92f627ce 100644
--- a/Makefile
+++ b/Makefile
@@ -14,19 +14,19 @@ endif
 BIN = xgboost
 OBJ = updater.o gbm.o io.o
 SLIB = wrapper/libxgboostwrapper.so
-#RLIB = wrapper/libxgboostR.so
-.PHONY: clean all R python
+
+.PHONY: clean all python
 all: $(BIN) $(OBJ) $(SLIB)
-#python: wrapper/libxgboostwrapper.so
-#xgboost: src/xgboost_main.cpp src/io/io.cpp src/data.h src/tree/*.h src/tree/*.hpp src/gbm/*.h src/gbm/*.hpp src/utils/*.h src/learner/*.h src/learner/*.hpp
+python: wrapper/libxgboostwrapper.so
 # now the wrapper takes in two files. io and wrapper part
 wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp $(OBJ)
-updater.o: src/tree/updater.cpp
-gbm.o: src/gbm/gbm.cpp
-io.o: src/io/io.cpp
-xgboost: src/xgboost_main.cpp $(OBJ)
+updater.o: src/tree/updater.cpp src/tree/*.hpp src/*.h src/tree/*.h
+gbm.o: src/gbm/gbm.cpp src/gbm/*.hpp src/gbm/*.h
+io.o: src/io/io.cpp src/io/*.hpp src/utils/*.h src/learner/dmatrix.h src/*.h
+xgboost: src/xgboost_main.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
+wrapper/libxgboostwrapper.so: wrapper/xgboost_wrapper.cpp src/utils/*.h src/*.h src/learner/*.hpp src/learner/*.h $(OBJ)
 $(BIN) :
 	$(CXX) $(CFLAGS) $(LDFLAGS) -o $@ $(filter %.cpp %.o %.c, $^)
diff --git a/src/gbm/gbm.cpp b/src/gbm/gbm.cpp
new file mode 100644
index 000000000..ae0e4af94
--- /dev/null
+++ b/src/gbm/gbm.cpp
@@ -0,0 +1,18 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#include <cstring>
+#include "./gbm.h"
+#include "./gbtree-inl.hpp"
+#include "./gblinear-inl.hpp"
+
+namespace xgboost {
+namespace gbm {
+IGradBooster* CreateGradBooster(const char *name) {
+  if (!strcmp("gbtree", name)) return new GBTree();
+  if (!strcmp("gblinear", name)) return new GBLinear();
+  utils::Error("unknown booster type: %s", name);
+  return NULL;
+}
+} // namespace gbm
+} // namespace xgboost
+
diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp
new file mode 100644
index 
000000000..5bbde0740
--- /dev/null
+++ b/src/io/simple_fmatrix-inl.hpp
@@ -0,0 +1,242 @@
+#ifndef XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
+#define XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
+/*!
+ * \file simple_fmatrix-inl.hpp
+ * \brief the input data structure for gradient boosting
+ * \author Tianqi Chen
+ */
+#include "../data.h"
+#include "../utils/utils.h"
+#include "../utils/random.h"
+#include "../utils/omp.h"
+#include "../utils/matrix_csr.h"
+namespace xgboost {
+namespace io {
+/*!
+ * \brief sparse matrix that supports column access, CSC
+ */
+class FMatrixS : public IFMatrix{
+ public:
+  typedef SparseBatch::Entry Entry;
+  /*! \brief constructor, stores the row iterator; it is deleted in the destructor */
+  FMatrixS(utils::IIterator<RowBatch> *iter) {
+    this->iter_ = iter;
+  }
+  // destructor, deletes the row iterator handed to the constructor
+  virtual ~FMatrixS(void) {
+    if (iter_ != NULL) delete iter_;
+  }
+  /*! \return whether column access is enabled, i.e. the column pointer array is non-empty */
+  virtual bool HaveColAccess(void) const {
+    return col_ptr_.size() != 0;
+  }
+  /*! \brief get number of columns, requires column access to be initialized */
+  virtual size_t NumCol(void) const {
+    utils::Check(this->HaveColAccess(), "NumCol:need column access");
+    return col_ptr_.size() - 1;
+  }
+  /*! \brief get the list of row indices that are buffered in the column data */
+  virtual const std::vector<bst_uint> &buffered_rowset(void) const {
+    return buffered_rowset_;
+  }
+  /*! \brief get column size, the number of entries stored for column cidx */
+  virtual size_t GetColSize(size_t cidx) const {
+    return col_ptr_[cidx+1] - col_ptr_[cidx];
+  }
+  /*! \brief get column density, the fraction of buffered rows that have an entry in column cidx */
+  virtual float GetColDensity(size_t cidx) const {
+    size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]);
+    return 1.0f - (static_cast<float>(nmiss)) / buffered_rowset_.size();
+  }
+  virtual void InitColAccess(float pkeep = 1.0f) {
+    if (this->HaveColAccess()) return;  // column data is built only once
+    this->InitColData(pkeep);
+  }
+  /*!
+   * \brief get the row iterator associated with FMatrix, rewound to before the first batch
+   */
+  virtual utils::IIterator<RowBatch>* RowIterator(void) {
+    iter_->BeforeFirst();
+    return iter_;
+  }
+  /*!
+   * \brief get the column based iterator over all columns
+   */
+  virtual utils::IIterator<ColBatch>* ColIterator(void) {
+    size_t ncol = this->NumCol();
+    col_iter_.col_index_.resize(ncol);
+    for (size_t i = 0; i < ncol; ++i) {
+      col_iter_.col_index_[i] = static_cast<bst_uint>(i);
+    }
+    col_iter_.SetBatch(col_ptr_, col_data_);
+    return &col_iter_;
+  }
+  /*!
+   * \brief column based iterator over the columns listed in fset
+   */
+  virtual utils::IIterator<ColBatch> *ColIterator(const std::vector<bst_uint> &fset) {
+    col_iter_.col_index_ = fset;
+    col_iter_.SetBatch(col_ptr_, col_data_);
+    return &col_iter_;
+  }
+  /*!
+   * \brief save column access data into stream
+   * \param fo output stream to save to
+   */
+  inline void SaveColAccess(utils::IStream &fo) const {
+    fo.Write(buffered_rowset_);
+    if (buffered_rowset_.size() != 0) {
+      SaveBinary(fo, col_ptr_, col_data_);
+    }
+  }
+  /*!
+   * \brief load column access data from stream
+   * \param fi input stream to load from
+   */
+  inline void LoadColAccess(utils::IStream &fi) {
+    utils::Check(fi.Read(&buffered_rowset_), "invalid input file format");
+    if (buffered_rowset_.size() != 0) {
+      LoadBinary(fi, &col_ptr_, &col_data_);
+    }
+  }
+  /*!
+   * \brief save data to binary stream
+   * \param fo output stream
+   * \param ptr pointer data
+   * \param data data content
+   */
+  inline static void SaveBinary(utils::IStream &fo,
+                                const std::vector<size_t> &ptr,
+                                const std::vector<RowBatch::Entry> &data) {
+    size_t nrow = ptr.size() - 1;
+    fo.Write(&nrow, sizeof(size_t));
+    fo.Write(&ptr[0], ptr.size() * sizeof(size_t));
+    if (data.size() != 0) {
+      fo.Write(&data[0], data.size() * sizeof(RowBatch::Entry));
+    }
+  }
+  /*!
+   * \brief load data from binary stream, in the layout written by SaveBinary
+   * \param fi input stream
+   * \param out_ptr pointer data
+   * \param out_data data content
+   */
+  inline static void LoadBinary(utils::IStream &fi,
+                                std::vector<size_t> *out_ptr,
+                                std::vector<RowBatch::Entry> *out_data) {
+    size_t nrow;
+    utils::Check(fi.Read(&nrow, sizeof(size_t)) != 0, "invalid input file format");
+    out_ptr->resize(nrow + 1);
+    utils::Check(fi.Read(&(*out_ptr)[0], out_ptr->size() * sizeof(size_t)) != 0,
+                 "invalid input file format");
+    out_data->resize(out_ptr->back());
+    if (out_data->size() != 0) {
+      utils::Check(fi.Read(&(*out_data)[0], out_data->size() * sizeof(RowBatch::Entry)) != 0,
+                   "invalid input file format");
+    }
+  }
+
+ protected:
+  /*!
+   * \brief initialize column data
+   * \param pkeep probability to keep a row
+   */
+  inline void InitColData(float pkeep) {
+    buffered_rowset_.clear();
+    // note: this part of code is serial, todo, parallelize this transformer
+    utils::SparseCSRMBuilder<RowBatch::Entry> builder(col_ptr_, col_data_);
+    builder.InitBudget(0);
+    // first pass: choose the rows to keep and count the entries of each column
+    iter_->BeforeFirst();
+    while (iter_->Next()) {
+      const RowBatch &batch = iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
+          buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.AddBudget(inst[j].index);
+          }
+        }
+      }
+    }
+    builder.InitStorage();
+
+    iter_->BeforeFirst();
+    size_t ktop = 0;  // position in buffered_rowset_ of the next kept row
+    while (iter_->Next()) {
+      const RowBatch &batch = iter_->Value();
+      for (size_t i = 0; i < batch.size; ++i) {
+        if (ktop < buffered_rowset_.size() &&
+            buffered_rowset_[ktop] == batch.base_rowid+i) {
+          ++ktop;
+          RowBatch::Inst inst = batch[i];
+          for (bst_uint j = 0; j < inst.length; ++j) {
+            builder.PushElem(inst[j].index,
+                             Entry((bst_uint)(batch.base_rowid+i),
+                                   inst[j].fvalue));
+          }
+        }
+      }
+    }
+    // sort the entries of every column with Entry::CmpValue
+    bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
+    #pragma omp parallel for schedule(static)
+    for (bst_omp_uint i = 0; i < ncol; ++i) {
+      std::sort(&col_data_[0] + col_ptr_[i],
+                &col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
+    }
+  }
+
+ private:
+  // one batch iterator that returns the content in the matrix as a single batch
+  struct OneBatchIter: utils::IIterator<ColBatch> {
+    OneBatchIter(void) : at_first_(true){}
+    virtual ~OneBatchIter(void) {}
+    virtual void BeforeFirst(void) {
+      at_first_ = true;
+    }
+    virtual bool Next(void) {
+      if (!at_first_) return false;
+      at_first_ = false;
+      return true;
+    }
+    virtual const ColBatch &Value(void) const {
+      return batch_;
+    }
+    inline void SetBatch(const std::vector<size_t> &ptr,
+                         const std::vector<SparseBatch::Entry> &data) {
+      batch_.size = col_index_.size();
+      col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0));
+      for (size_t i = 0; i < col_data_.size(); ++i) {
+        const bst_uint ridx = col_index_[i];  // index of the selected column
+        col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx],
+                                         ptr[ridx+1] - ptr[ridx]);
+      }
+      batch_.col_index = &col_index_[0];
+      batch_.col_data = &col_data_[0];
+      this->BeforeFirst();
+    }
+    // indices of the columns exposed by the batch
+    std::vector<bst_uint> col_index_;
+    std::vector<SparseBatch::Inst> col_data_;
+    // whether is at first
+    bool at_first_;
+    // temporary space for batch
+    ColBatch batch_;
+  };
+  // --- data structure used to support InitColAccess --
+  // column iterator
+  OneBatchIter col_iter_;
+  // row iterator
+  utils::IIterator<RowBatch> *iter_;
+  /*! \brief list of row index that are buffered */
+  std::vector<bst_uint> buffered_rowset_;
+  /*! \brief column pointer of CSC format */
+  std::vector<size_t> col_ptr_;
+  /*! \brief column data in CSC format */
+  std::vector<SparseBatch::Entry> col_data_;
+};
+} // namespace io
+} // namespace xgboost
+#endif // XGBOOST_IO_SIMPLE_FMATRIX_INL_HPP
diff --git a/src/tree/updater.cpp b/src/tree/updater.cpp
new file mode 100644
index 000000000..25bee7922
--- /dev/null
+++ b/src/tree/updater.cpp
@@ -0,0 +1,20 @@
+#define _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_DEPRECATE
+#include <cstring>
+#include "./updater.h"
+#include "./updater_prune-inl.hpp"
+#include "./updater_refresh-inl.hpp"
+#include "./updater_colmaker-inl.hpp"
+
+namespace xgboost {
+namespace tree {
+IUpdater* CreateUpdater(const char *name) {
+  if (!strcmp(name, "prune")) return new TreePruner();
+  if (!strcmp(name, "refresh")) return new TreeRefresher();
+  if (!strcmp(name, "grow_colmaker")) return new ColMaker();
+  utils::Error("unknown updater:%s", name);
+  return NULL;
+}
+
+} // namespace tree
+} // namespace xgboost