From 3d8431fc5cfde177ceb08d21ae5d61bf72b47ad1 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 15 Apr 2015 13:42:03 -0700 Subject: [PATCH 01/14] simplify and parallelize data builder --- src/io/io.cpp | 45 ------ src/io/simple_fmatrix-inl.hpp | 48 +++++-- src/utils/group_data.h | 8 +- src/utils/matrix_csr.h | 260 ---------------------------------- wrapper/xgboost_wrapper.cpp | 22 ++- 5 files changed, 55 insertions(+), 328 deletions(-) delete mode 100644 src/utils/matrix_csr.h diff --git a/src/io/io.cpp b/src/io/io.cpp index 1500ce658..1535a9e42 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -6,10 +6,6 @@ #include "../utils/io.h" #include "../utils/utils.h" #include "simple_dmatrix-inl.hpp" -#ifndef XGBOOST_STRICT_CXX98_ -#include "page_dmatrix-inl.hpp" -#include "page_fmatrix-inl.hpp" -#endif // implements data loads using dmatrix simple for now namespace xgboost { @@ -28,43 +24,12 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, utils::FileStream fs(utils::FopenCheck(fname, "rb")); utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); fs.Seek(0); - if (magic == DMatrixSimple::kMagic) { DMatrixSimple *dmat = new DMatrixSimple(); dmat->LoadBinary(fs, silent, fname); fs.Close(); return dmat; } -#ifndef XGBOOST_STRICT_CXX98_ - std::string tmp_fname; - const char *fname_ext = NULL; - if (strchr(fname, ';') != NULL) { - tmp_fname = fname; - char *ptr = strchr(&tmp_fname[0], ';'); - ptr[0] = '\0'; fname = &tmp_fname[0]; - fname_ext = ptr + 1; - } - if (magic == DMatrixPage::kMagic) { - if (fname_ext == NULL) { - DMatrixPage *dmat = new DMatrixPage(); - dmat->Load(fs, silent, fname); - return dmat; - } else { - DMatrixColPage *dmat = new DMatrixColPage(fname_ext); - dmat->Load(fs, silent, fname, true); - return dmat; - } - } - if (magic == DMatrixColPage::kMagic) { - std::string sfname = fname; - if (fname_ext == NULL) { - sfname += ".col"; fname_ext = sfname.c_str(); - } - DMatrixColPage *dmat = new DMatrixColPage(fname_ext); - 
dmat->Load(fs, silent, fname); - return dmat; - } - #endif fs.Close(); DMatrixSimple *dmat = new DMatrixSimple(); dmat->CacheLoad(fname, silent, savebuffer); @@ -72,16 +37,6 @@ DataMatrix* LoadDataMatrix(const char *fname, bool silent, } void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { -#ifndef XGBOOST_STRICT_CXX98_ - if (!strcmp(fname + strlen(fname) - 5, ".page")) { - DMatrixPage::Save(fname, dmat, silent); - return; - } - if (!strcmp(fname + strlen(fname) - 6, ".cpage")) { - DMatrixColPage::Save(fname, dmat, silent); - return; - } -#endif if (dmat.magic == DMatrixSimple::kMagic) { const DMatrixSimple *p_dmat = static_cast(&dmat); p_dmat->SaveBinary(fname, silent); diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index 21279900f..9f204536f 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -9,7 +9,8 @@ #include "../utils/utils.h" #include "../utils/random.h" #include "../utils/omp.h" -#include "../utils/matrix_csr.h" +#include "../utils/group_data.h" + namespace xgboost { namespace io { /*! 
@@ -147,21 +148,40 @@ class FMatrixS : public IFMatrix{ * \param pkeep probability to keep a row */ inline void InitColData(float pkeep, const std::vector &enabled) { + // clear rowset buffered_rowset_.clear(); - // note: this part of code is serial, todo, parallelize this transformer - utils::SparseCSRMBuilder builder(col_ptr_, col_data_); - builder.InitBudget(0); + // bit map + int nthread; + std::vector bmap; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + // build the column matrix in parallel + utils::ParallelGroupBuilder builder(&col_ptr_, &col_data_); + builder.InitBudget(0, nthread); // start working iter_->BeforeFirst(); while (iter_->Next()) { const RowBatch &batch = iter_->Value(); - for (size_t i = 0; i < batch.size; ++i) { + bmap.resize(bmap.size() + batch.size, true); + for (size_t i = 0; i < batch.size; ++i) { + bst_uint ridx = static_cast(batch.base_rowid + i); if (pkeep == 1.0f || random::SampleBinary(pkeep)) { - buffered_rowset_.push_back(static_cast(batch.base_rowid+i)); + buffered_rowset_.push_back(ridx); + } else { + bmap[i] = false; + } + } + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < batch.size; ++i) { + int tid = omp_get_thread_num(); + bst_uint ridx = static_cast(batch.base_rowid + i); + if (bmap[ridx]) { RowBatch::Inst inst = batch[i]; for (bst_uint j = 0; j < inst.length; ++j) { if (enabled[inst[j].index]){ - builder.AddBudget(inst[j].index); + builder.AddBudget(inst[j].index, tid); } } } @@ -170,19 +190,19 @@ class FMatrixS : public IFMatrix{ builder.InitStorage(); iter_->BeforeFirst(); - size_t ktop = 0; while (iter_->Next()) { const RowBatch &batch = iter_->Value(); + #pragma omp parallel for schedule(static) for (size_t i = 0; i < batch.size; ++i) { - if (ktop < buffered_rowset_.size() && - buffered_rowset_[ktop] == batch.base_rowid+i) { - ++ktop; + int tid = omp_get_thread_num(); + bst_uint ridx = static_cast(batch.base_rowid + i); + if (bmap[ridx]) { RowBatch::Inst inst = batch[i]; for 
(bst_uint j = 0; j < inst.length; ++j) { if (enabled[inst[j].index]) { - builder.PushElem(inst[j].index, - Entry((bst_uint)(batch.base_rowid+i), - inst[j].fvalue)); + builder.Push(inst[j].index, + Entry((bst_uint)(batch.base_rowid+i), + inst[j].fvalue), tid); } } } diff --git a/src/utils/group_data.h b/src/utils/group_data.h index a25eb1edd..6e12a39ff 100644 --- a/src/utils/group_data.h +++ b/src/utils/group_data.h @@ -40,7 +40,7 @@ struct ParallelGroupBuilder { * \param nkeys number of keys in the matrix, can be smaller than expected * \param nthread number of thread that will be used in construction */ - inline void InitBudget(size_t nkeys = 0, int nthread = 1) { + inline void InitBudget(size_t nkeys, int nthread) { thread_rptr.resize(nthread); for (size_t i = 0; i < thread_rptr.size(); ++i) { thread_rptr[i].resize(nkeys); @@ -53,7 +53,7 @@ struct ParallelGroupBuilder { * \param threadid the id of thread that calls this function * \param nelem number of element budget add to this row */ - inline void AddBudget(size_t key, int threadid = 0, SizeType nelem = 1) { + inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) { std::vector &trptr = thread_rptr[threadid]; if (trptr.size() < key + 1) { trptr.resize(key + 1, 0); @@ -65,7 +65,7 @@ struct ParallelGroupBuilder { // set rptr to correct size for (size_t tid = 0; tid < thread_rptr.size(); ++tid) { if (rptr.size() <= thread_rptr[tid].size()) { - rptr.resize(thread_rptr[tid].size()+1); + rptr.resize(thread_rptr[tid].size() + 1); } } // initialize rptr to be beginning of each segment @@ -90,7 +90,7 @@ struct ParallelGroupBuilder { * \param key the key of * \param threadid the id of thread that calls this function */ - inline void Push(size_t key, ValueType value, int threadid = 0) { + inline void Push(size_t key, ValueType value, int threadid) { SizeType &rp = thread_rptr[threadid][key]; data[rp++] = value; } diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h deleted file mode 100644 index 
14e0667ee..000000000 --- a/src/utils/matrix_csr.h +++ /dev/null @@ -1,260 +0,0 @@ -#ifndef XGBOOST_UTILS_MATRIX_CSR_H_ -#define XGBOOST_UTILS_MATRIX_CSR_H_ -/*! - * \file matrix_csr.h - * \brief this file defines some easy to use STL based class for in memory sparse CSR matrix - * \author Tianqi Chen - */ -#include -#include -#include -#include "./io.h" -#include "./utils.h" -#include "./omp.h" - -namespace xgboost { -namespace utils { -/*! - * \brief a class used to help construct CSR format matrix, - * can be used to convert row major CSR to column major CSR - * \tparam IndexType type of index used to store the index position, usually unsigned or size_t - * \tparam whether enabling the usage of aclist, this option must be enabled manually - */ -template -struct SparseCSRMBuilder { - private: - /*! \brief dummy variable used in the indicator matrix construction */ - std::vector dummy_aclist; - /*! \brief pointer to each of the row */ - std::vector &rptr; - /*! \brief index of nonzero entries in each row */ - std::vector &findex; - /*! \brief a list of active rows, used when many rows are empty */ - std::vector &aclist; - - public: - SparseCSRMBuilder(std::vector &p_rptr, - std::vector &p_findex) - :rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) { - Assert(!UseAcList, "enabling bug"); - } - /*! \brief use with caution! rptr must be cleaned before use */ - SparseCSRMBuilder(std::vector &p_rptr, - std::vector &p_findex, - std::vector &p_aclist) - :rptr(p_rptr), findex(p_findex), aclist(p_aclist) { - Assert(UseAcList, "must manually enable the option use aclist"); - } - - public: - /*! - * \brief step 1: initialize the number of rows in the data, not necessary exact - * \nrows number of rows in the matrix, can be smaller than expected - */ - inline void InitBudget(size_t nrows = 0) { - if (!UseAcList) { - rptr.clear(); - rptr.resize(nrows + 1, 0); - } else { - Assert(nrows + 1 == rptr.size(), "rptr must be initialized already"); - this->Cleanup(); - } - } - /*! 
- * \brief step 2: add budget to each rows, this function is called when aclist is used - * \param row_id the id of the row - * \param nelem number of element budget add to this row - */ - inline void AddBudget(size_t row_id, SizeType nelem = 1) { - if (rptr.size() < row_id + 2) { - rptr.resize(row_id + 2, 0); - } - if (UseAcList) { - if (rptr[row_id + 1] == 0) aclist.push_back(row_id); - } - rptr[row_id + 1] += nelem; - } - /*! \brief step 3: initialize the necessary storage */ - inline void InitStorage(void) { - // initialize rptr to be beginning of each segment - size_t start = 0; - if (!UseAcList) { - for (size_t i = 1; i < rptr.size(); i++) { - size_t rlen = rptr[i]; - rptr[i] = start; - start += rlen; - } - } else { - // case with active list - std::sort(aclist.begin(), aclist.end()); - for (size_t i = 0; i < aclist.size(); i++) { - size_t ridx = aclist[i]; - size_t rlen = rptr[ridx + 1]; - rptr[ridx + 1] = start; - // set previous rptr to right position if previous feature is not active - if (i == 0 || ridx != aclist[i - 1] + 1) rptr[ridx] = start; - start += rlen; - } - } - findex.resize(start); - } - /*! - * \brief step 4: - * used in indicator matrix construction, add new - * element to each row, the number of calls shall be exactly same as add_budget - */ - inline void PushElem(size_t row_id, IndexType col_id) { - SizeType &rp = rptr[row_id + 1]; - findex[rp++] = col_id; - } - /*! - * \brief step 5: only needed when aclist is used - * clean up the rptr for next usage - */ - inline void Cleanup(void) { - Assert(UseAcList, "this function can only be called use AcList"); - for (size_t i = 0; i < aclist.size(); i++) { - const size_t ridx = aclist[i]; - rptr[ridx] = 0; rptr[ridx + 1] = 0; - } - aclist.clear(); - } -}; - -/*! 
- * \brief a class used to help construct CSR format matrix file - * \tparam IndexType type of index used to store the index position - * \tparam SizeType type of size used in row pointer - */ -template -struct SparseCSRFileBuilder { - public: - explicit SparseCSRFileBuilder(utils::ISeekStream *fo, size_t buffer_size) - : fo(fo), buffer_size(buffer_size) { - } - /*! - * \brief step 1: initialize the number of rows in the data, not necessary exact - * \nrows number of rows in the matrix, can be smaller than expected - */ - inline void InitBudget(size_t nrows = 0) { - rptr.clear(); - rptr.resize(nrows + 1, 0); - } - /*! - * \brief step 2: add budget to each rows - * \param row_id the id of the row - * \param nelem number of element budget add to this row - */ - inline void AddBudget(size_t row_id, SizeType nelem = 1) { - if (rptr.size() < row_id + 2) { - rptr.resize(row_id + 2, 0); - } - rptr[row_id + 1] += nelem; - } - /*! \brief step 3: initialize the necessary storage */ - inline void InitStorage(void) { - SizeType nelem = 0; - for (size_t i = 1; i < rptr.size(); i++) { - nelem += rptr[i]; - rptr[i] = nelem; - } - begin_data = static_cast(fo->Tell()) + sizeof(SizeType); - SizeType begin_meta = begin_data + nelem * sizeof(IndexType); - fo->Write(&begin_meta, sizeof(begin_meta)); - fo->Seek(begin_meta); - fo->Write(rptr); - // setup buffer space - buffer_rptr.resize(rptr.size()); - buffer_temp.reserve(buffer_size); - buffer_data.resize(buffer_size); - saved_offset = rptr; - saved_offset.resize(rptr.size() - 1); - this->ClearBuffer(); - } - /*! \brief step 4: push element into buffer */ - inline void PushElem(SizeType row_id, IndexType col_id) { - if (buffer_temp.size() == buffer_size) { - this->WriteBuffer(); - this->ClearBuffer(); - } - buffer_rptr[row_id + 1] += 1; - buffer_temp.push_back(std::make_pair(row_id, col_id)); - } - /*! 
\brief finalize the construction */ - inline void Finalize(void) { - this->WriteBuffer(); - for (size_t i = 0; i < saved_offset.size(); ++i) { - utils::Assert(saved_offset[i] == rptr[i+1], "some block not write out"); - } - } - /*! \brief content must be in wb+ */ - template - inline void SortRows(Comparator comp, size_t step) { - for (size_t i = 0; i < rptr.size() - 1; i += step) { - bst_omp_uint begin = static_cast(i); - bst_omp_uint end = static_cast(std::min(rptr.size() - 1, i + step)); - if (rptr[end] != rptr[begin]) { - fo->Seek(begin_data + rptr[begin] * sizeof(IndexType)); - buffer_data.resize(rptr[end] - rptr[begin]); - fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); - // do parallel sorting - #pragma omp parallel for schedule(static) - for (bst_omp_uint j = begin; j < end; ++j) { - std::sort(&buffer_data[0] + rptr[j] - rptr[begin], - &buffer_data[0] + rptr[j+1] - rptr[begin], - comp); - } - fo->Seek(begin_data + rptr[begin] * sizeof(IndexType)); - fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); - } - } - } - protected: - inline void WriteBuffer(void) { - SizeType start = 0; - for (size_t i = 1; i < buffer_rptr.size(); ++i) { - size_t rlen = buffer_rptr[i]; - buffer_rptr[i] = start; - start += rlen; - } - for (size_t i = 0; i < buffer_temp.size(); ++i) { - SizeType &rp = buffer_rptr[buffer_temp[i].first + 1]; - buffer_data[rp++] = buffer_temp[i].second; - } - // write out - for (size_t i = 0; i < buffer_rptr.size() - 1; ++i) { - size_t nelem = buffer_rptr[i+1] - buffer_rptr[i]; - if (nelem != 0) { - utils::Assert(saved_offset[i] + nelem <= rptr[i+1], "data exceed bound"); - fo->Seek(saved_offset[i] * sizeof(IndexType) + begin_data); - fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType)); - saved_offset[i] += nelem; - } - } - } - inline void ClearBuffer(void) { - buffer_temp.clear(); - std::fill(buffer_rptr.begin(), buffer_rptr.end(), 0); - } - private: - /*! 
\brief output file pointer the data */ - utils::ISeekStream *fo; - /*! \brief pointer to each of the row */ - std::vector rptr; - /*! \brief saved top space of each item */ - std::vector saved_offset; - /*! \brief beginning position of data */ - size_t begin_data; - // ----- the following are buffer space - /*! \brief maximum size of content buffer*/ - size_t buffer_size; - /*! \brief store the data content */ - std::vector< std::pair > buffer_temp; - /*! \brief saved top space of each item */ - std::vector buffer_rptr; - /*! \brief saved top space of each item */ - std::vector buffer_data; -}; -} // namespace utils -} // namespace xgboost -#endif diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index b70939951..dec266ff6 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -19,7 +19,7 @@ using namespace std; #include "../src/learner/learner-inl.hpp" #include "../src/io/io.h" #include "../src/utils/utils.h" -#include "../src/utils/matrix_csr.h" +#include "../src/utils/group_data.h" #include "../src/io/simple_dmatrix-inl.hpp" using namespace xgboost; @@ -139,20 +139,32 @@ extern "C"{ const float *data, bst_ulong nindptr, bst_ulong nelem) { + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + DMatrixSimple *p_mat = new DMatrixSimple(); DMatrixSimple &mat = *p_mat; - utils::SparseCSRMBuilder builder(mat.row_ptr_, mat.row_data_); - builder.InitBudget(); + utils::ParallelGroupBuilder builder(&mat.row_ptr_, &mat.row_data_); + builder.InitBudget(0, nthread); bst_ulong ncol = nindptr - 1; + #pragma omp parallel for schedule(static) for (bst_ulong i = 0; i < ncol; ++i) { + int tid = omp_get_thread_num(); for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.AddBudget(indices[j]); + builder.AddBudget(indices[j], tid); } } builder.InitStorage(); + #pragma omp parallel for schedule(static) for (bst_ulong i = 0; i < ncol; ++i) { + int tid = omp_get_thread_num(); for (unsigned j = col_ptr[i]; 
j < col_ptr[i+1]; ++j) { - builder.PushElem(indices[j], RowBatch::Entry(static_cast(i), data[j])); + builder.Push(indices[j], + RowBatch::Entry(static_cast(i), data[j]), + tid); } } mat.info.info.num_row = mat.row_ptr_.size() - 1; From e8f6f3b541c2947698db78849f5be39ebd2e2052 Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 15 Apr 2015 15:15:23 -0700 Subject: [PATCH 02/14] some initial try of cachefiles --- .gitignore | 2 +- R-package/src/Makevars | 4 +- R-package/src/Makevars.win | 4 +- R-package/src/xgboost_R.cpp | 11 ++ R-package/src/xgboost_R.h | 9 + src/io/io.cpp | 62 +++--- src/io/io.h | 6 +- src/io/page_dmatrix-inl.hpp | 98 +++++++-- src/io/page_fmatrix-inl.hpp | 382 ------------------------------------ src/utils/thread_buffer.h | 2 +- wrapper/xgboost.py | 18 +- wrapper/xgboost_wrapper.cpp | 5 + wrapper/xgboost_wrapper.h | 13 ++ 13 files changed, 185 insertions(+), 431 deletions(-) delete mode 100644 src/io/page_fmatrix-inl.hpp diff --git a/.gitignore b/.gitignore index 8b2c65f62..35061b857 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,7 @@ train* rabit .Rbuildignore R-package.Rproj - +*.cache* R-package/inst R-package/src diff --git a/R-package/src/Makevars b/R-package/src/Makevars index 406f59517..d0eb23b25 100644 --- a/R-package/src/Makevars +++ b/R-package/src/Makevars @@ -2,7 +2,7 @@ PKGROOT=../../ # _*_ mode: Makefile; _*_ PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) +PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o diff --git a/R-package/src/Makevars.win 
b/R-package/src/Makevars.win index e3c98d2b7..56b550e7f 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -13,7 +13,7 @@ xgblib: cp -r ../../subtree . PKG_CPPFLAGS= -DXGBOOST_CUSTOMIZE_MSG_ -DXGBOOST_CUSTOMIZE_PRNG_ -DXGBOOST_STRICT_CXX98_ -DRABIT_CUSTOMIZE_MSG_ -DRABIT_STRICT_CXX98_ -I$(PKGROOT) -I../.. -PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) +PKG_CXXFLAGS= $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) $(SHLIB_PTHREAD_FLAGS) OBJECTS= xgboost_R.o xgboost_assert.o $(PKGROOT)/wrapper/xgboost_wrapper.o $(PKGROOT)/src/io/io.o $(PKGROOT)/src/gbm/gbm.o $(PKGROOT)/src/tree/updater.o $(PKGROOT)/subtree/rabit/src/engine_empty.o $(PKGROOT)/src/io/dmlc_simple.o $(OBJECTS) : xgblib diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index a2ca9536f..f67462564 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -76,6 +76,17 @@ extern "C" { _WrapperEnd(); return ret; } + SEXP XGDMatrixCreateCache_R(SEXP fname, SEXP cache_file, SEXP silent) { + _WrapperBegin(); + void *handle = XGDMatrixCreateCache(CHAR(asChar(fname)), + CHAR(asChar(cache_file)), + asInteger(silent)); + SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); + UNPROTECT(1); + _WrapperEnd(); + return ret; + } SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) { _WrapperBegin(); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 61b84a80e..1314cef15 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -24,6 +24,15 @@ extern "C" { * \return a loaded data matrix */ SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent); + /*! 
+ * \brief load a cached DMatrix, this is backed by several cache_files + * and usually cost less memory + * \param fname the name of the file, can be a cached buffer or text + * \param cache_file the name of cached file + * \param silent whether print messages during loading + * \return a loaded data matrix + */ + SEXP XGDMatrixCreateCache_R(SEXP fname, SEXP cache_file, SEXP silent); /*! * \brief create matrix content from dense matrix * This assumes the matrix is stored in column major format diff --git a/src/io/io.cpp b/src/io/io.cpp index 1535a9e42..8d6856b92 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -6,37 +6,53 @@ #include "../utils/io.h" #include "../utils/utils.h" #include "simple_dmatrix-inl.hpp" -// implements data loads using dmatrix simple for now +#include "page_dmatrix-inl.hpp" namespace xgboost { namespace io { -DataMatrix* LoadDataMatrix(const char *fname, bool silent, - bool savebuffer, bool loadsplit) { - if (!std::strcmp(fname, "stdin") || - !std::strncmp(fname, "s3://", 5) || - !std::strncmp(fname, "hdfs://", 7) || - loadsplit) { - DMatrixSimple *dmat = new DMatrixSimple(); - dmat->LoadText(fname, silent, loadsplit); - return dmat; - } - int magic; - utils::FileStream fs(utils::FopenCheck(fname, "rb")); - utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); - fs.Seek(0); - if (magic == DMatrixSimple::kMagic) { - DMatrixSimple *dmat = new DMatrixSimple(); - dmat->LoadBinary(fs, silent, fname); +DataMatrix* LoadDataMatrix(const char *fname, + bool silent, + bool savebuffer, + bool loadsplit, + const char *cache_file) { + if (cache_file == NULL) { + if (!std::strcmp(fname, "stdin") || + !std::strncmp(fname, "s3://", 5) || + !std::strncmp(fname, "hdfs://", 7) || + loadsplit) { + DMatrixSimple *dmat = new DMatrixSimple(); + dmat->LoadText(fname, silent, loadsplit); + return dmat; + } + int magic; + utils::FileStream fs(utils::FopenCheck(fname, "rb")); + utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input 
file format"); + fs.Seek(0); + if (magic == DMatrixSimple::kMagic) { + DMatrixSimple *dmat = new DMatrixSimple(); + dmat->LoadBinary(fs, silent, fname); + fs.Close(); + return dmat; + } fs.Close(); + DMatrixSimple *dmat = new DMatrixSimple(); + dmat->CacheLoad(fname, silent, savebuffer); + return dmat; + } else { + if (!strcmp(fname, cache_file)) { + DMatrixPage *dmat = new DMatrixPage(); + utils::FileStream fs(utils::FopenCheck(fname, "rb")); + dmat->LoadBinary(fs, silent, fname); + fs.Close(); + return dmat; + } + DMatrixPage *dmat = new DMatrixPage(); + dmat->LoadText(fname, cache_file, false, loadsplit); return dmat; } - fs.Close(); - DMatrixSimple *dmat = new DMatrixSimple(); - dmat->CacheLoad(fname, silent, savebuffer); - return dmat; } -void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { +void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent) { if (dmat.magic == DMatrixSimple::kMagic) { const DMatrixSimple *p_dmat = static_cast(&dmat); p_dmat->SaveBinary(fname, silent); diff --git a/src/io/io.h b/src/io/io.h index 2de76dd38..ed075977c 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -21,12 +21,16 @@ typedef learner::DMatrix DataMatrix; * \param savebuffer whether temporal buffer the file if the file is in text format * \param loadsplit whether we only load a split of input files * such that each worker node get a split of the data + * \param cache_file name of cache_file, used by external memory version + * can be NULL, if cache_file is specified, this will be the temporal + * space that can be re-used to store intermediate data * \return a loaded DMatrix */ DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer, - bool loadsplit); + bool loadsplit, + const char *cache_file = NULL); /*! 
* \brief save DataMatrix into stream, * note: the saved dmatrix format may not be in exactly same as input diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 4f70ff2e9..73a3087fb 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -2,7 +2,7 @@ #define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_ /*! * \file page_row_iter-inl.hpp - * row iterator based on sparse page + * row iterator based on sparse page * \author Tianqi Chen */ #include @@ -212,23 +212,24 @@ class DMatrixPageBase : public DataMatrix { // to be cleaned up in a more clear way } /*! \brief load and initialize the iterator with fi */ - inline void Load(utils::FileStream &fi, - bool silent = false, - const char *fname = NULL, - bool skip_magic_check = false) { + inline void LoadBinary(utils::FileStream &fi, + bool silent, + const char *fname_) { + std::string fname = fname_; int tmagic; utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); - if (!skip_magic_check) { - utils::Check(tmagic == magic, "invalid format,magic number mismatch"); - } + utils::Check(tmagic == magic, "invalid format,magic number mismatch"); this->info.LoadBinary(fi); - iter_->Load(fi); + // load in the row data file + fname += ".row.blob"; + utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb")); + iter_->Load(fs); if (!silent) { utils::Printf("DMatrixPage: %lux%lu matrix is loaded", static_cast(info.num_row()), static_cast(info.num_col())); - if (fname != NULL) { - utils::Printf(" from %s\n", fname); + if (fname_ != NULL) { + utils::Printf(" from %s\n", fname_); } else { utils::Printf("\n"); } @@ -237,18 +238,83 @@ class DMatrixPageBase : public DataMatrix { } } } - /*! \brief save a DataMatrix as DMatrixPage*/ - inline static void Save(const char* fname, const DataMatrix &mat, bool silent) { - utils::FileStream fs(utils::FopenCheck(fname, "wb")); + /*! 
\brief save a DataMatrix as DMatrixPage */ + inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) { + std::string fname = fname_; + utils::FileStream fs(utils::FopenCheck(fname.c_str(), "wb")); int magic = kMagic; fs.Write(&magic, sizeof(magic)); mat.info.SaveBinary(fs); - ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fs); fs.Close(); + fname += ".row.blob"; + utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb")); + ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fbin); + fbin.Close(); if (!silent) { utils::Printf("DMatrixPage: %lux%lu is saved to %s\n", static_cast(mat.info.num_row()), - static_cast(mat.info.num_col()), fname); + static_cast(mat.info.num_col()), fname_); + } + } + /*! \brief save a LibSVM format file as DMatrixPage */ + inline void LoadText(const char *uri, + const char* cache_file, + bool silent, + bool loadsplit) { + int rank = 0, npart = 1; + if (loadsplit) { + rank = rabit::GetRank(); + npart = rabit::GetWorldSize(); + } + std::string fname_row = std::string(cache_file) + ".row.blob"; + utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); + RowBatchPage page(ThreadRowPageIterator::kPageSize); + dmlc::InputSplit *in = + dmlc::InputSplit::Create(uri, rank, npart); + std::string line; + info.Clear(); + while (in->ReadRecord(&line)) { + float label; + std::istringstream ss(line); + std::vector feats; + ss >> label; + while (!ss.eof()) { + RowBatch::Entry e; + if (!(ss >> e.index)) break; + ss.ignore(32, ':'); + if (!(ss >> e.fvalue)) break; + feats.push_back(e); + } + RowBatch::Inst row(BeginPtr(feats), feats.size()); + if (!page.PushRow(row)) { + page.Save(fo); + page.Clear(); + utils::Check(page.PushRow(row), "row is too big"); + } + for (size_t i = 0; i < feats.size(); ++i) { + info.info.num_col = std::max(info.info.num_col, + static_cast(feats[i].index+1)); + } + this->info.labels.push_back(label); + info.info.num_row += 1; + } + if (page.Size() != 0) { + page.Save(fo); + } + 
delete in; + fo.Close(); + iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb"))); + // save data matrix + utils::FileStream fs(utils::FopenCheck(cache_file, "wb")); + int magic = kMagic; + fs.Write(&magic, sizeof(magic)); + this->info.SaveBinary(fs); + fs.Close(); + if (!silent) { + utils::Printf("DMatrixPage: %lux%lu is parsed from %s\n", + static_cast(info.num_row()), + static_cast(info.num_col()), + uri); } } /*! \brief magic number used to identify DMatrix */ diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp deleted file mode 100644 index 44cb9abdc..000000000 --- a/src/io/page_fmatrix-inl.hpp +++ /dev/null @@ -1,382 +0,0 @@ -#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ -#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ -/*! - * \file page_fmatrix-inl.hpp - * sparse page manager for fmatrix - * \author Tianqi Chen - */ -#include -#include -#include -#include "../data.h" -#include "../utils/iterator.h" -#include "../utils/io.h" -#include "../utils/matrix_csr.h" -#include "../utils/thread_buffer.h" -namespace xgboost { -namespace io { -class CSCMatrixManager { - public: - /*! \brief in memory page */ - struct Page { - public: - /*! \brief initialize the page */ - explicit Page(size_t size) { - buffer.resize(size); - col_index.reserve(10); - col_data.reserve(10); - } - /*! \brief clear the page */ - inline void Clear(void) { - num_entry = 0; - col_index.clear(); - col_data.clear(); - } - /*! \brief number of used entries */ - size_t num_entry; - /*! \brief column index */ - std::vector col_index; - /*! \brief column data */ - std::vector col_data; - /*! \brief number of free entries */ - inline size_t NumFreeEntry(void) const { - return buffer.size() - num_entry; - } - inline ColBatch::Entry* AllocEntry(size_t len) { - ColBatch::Entry *p_data = &buffer[0] + num_entry; - num_entry += len; - return p_data; - } - /*! 
\brief get underlying batch */ - inline ColBatch GetBatch(void) const { - ColBatch batch; - batch.size = col_index.size(); - batch.col_index = BeginPtr(col_index); - batch.col_data = BeginPtr(col_data); - return batch; - } - - private: - /*! \brief buffer space, not to be changed since ready */ - std::vector buffer; - }; - /*! \brief define type of page pointer */ - typedef Page *PagePtr; - // constructor - CSCMatrixManager(void) { - fi_ = NULL; - } - /*! \brief get column pointer */ - inline const std::vector &col_ptr(void) const { - return col_ptr_; - } - inline void SetParam(const char *name, const char *val) { - } - inline PagePtr Create(void) { - return new Page(page_size_); - } - inline void FreeSpace(PagePtr &a) { - delete a; - } - inline void Destroy(void) { - } - inline void BeforeFirst(void) { - col_index_ = col_todo_; - read_top_ = 0; - } - inline bool LoadNext(PagePtr &val) { - val->Clear(); - if (read_top_ >= col_index_.size()) return false; - while (read_top_ < col_index_.size()) { - if (!this->TryFill(col_index_[read_top_], val)) { - return true; - } - ++read_top_; - } - return true; - } - inline bool Init(void) { - this->BeforeFirst(); - return true; - } - inline void Setup(utils::ISeekStream *fi, double page_ratio) { - fi_ = fi; - fi_->Read(&begin_meta_ , sizeof(begin_meta_)); - begin_data_ = static_cast(fi->Tell()); - fi_->Seek(begin_meta_); - fi_->Read(&col_ptr_); - size_t psmax = 0; - for (size_t i = 0; i < col_ptr_.size() - 1; ++i) { - psmax = std::max(psmax, col_ptr_[i+1] - col_ptr_[i]); - } - utils::Check(page_ratio >= 1.0f, "col_page_ratio must be at least 1"); - page_size_ = std::max(static_cast(psmax * page_ratio), psmax); - } - inline void SetColSet(const std::vector &cset, bool setall) { - if (!setall) { - col_todo_.resize(0); - for (size_t i = 0; i < cset.size(); ++i) { - if (col_todo_[i] < static_cast(col_ptr_.size() - 1)) { - col_todo_.push_back(cset[i]); - } - } - std::sort(col_todo_.begin(), col_todo_.end()); - } else { - 
col_todo_.resize(col_ptr_.size()-1); - for (size_t i = 0; i < col_todo_.size(); ++i) { - col_todo_[i] = static_cast(i); - } - } - } - - private: - /*! \brief fill a page with */ - inline bool TryFill(size_t cidx, Page *p_page) { - size_t len = col_ptr_[cidx+1] - col_ptr_[cidx]; - if (p_page->NumFreeEntry() < len) return false; - ColBatch::Entry *p_data = p_page->AllocEntry(len); - fi_->Seek(col_ptr_[cidx] * sizeof(ColBatch::Entry) + begin_data_); - utils::Check(fi_->Read(p_data, sizeof(ColBatch::Entry) * len) != 0, - "invalid column buffer format"); - p_page->col_data.push_back(ColBatch::Inst(p_data, static_cast(len))); - p_page->col_index.push_back(static_cast(cidx)); - return true; - } - // the following are in memory auxiliary data structure - /*! \brief top of reader position */ - size_t read_top_; - /*! \brief size of page */ - size_t page_size_; - /*! \brief column index to be loaded */ - std::vector col_index_; - /*! \brief column index to be after calling before first */ - std::vector col_todo_; - // the following are input content - /*! \brief beginning position of data content */ - size_t begin_data_; - /*! \brief size of data content */ - size_t begin_meta_; - /*! \brief input stream */ - utils::ISeekStream *fi_; - /*! 
\brief column pointer of CSC format */ - std::vector col_ptr_; -}; - -class ThreadColPageIterator : public utils::IIterator { - public: - explicit ThreadColPageIterator(utils::ISeekStream *fi, - float page_ratio, bool silent) { - itr_.SetParam("buffer_size", "2"); - itr_.get_factory().Setup(fi, page_ratio); - itr_.Init(); - if (!silent) { - utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n", - static_cast(col_ptr().size() - 1)); - } - } - virtual ~ThreadColPageIterator(void) { - } - virtual void BeforeFirst(void) { - itr_.BeforeFirst(); - } - virtual bool Next(void) { - // page to be loaded - CSCMatrixManager::PagePtr page; - if (!itr_.Next(page)) return false; - out_ = page->GetBatch(); - return true; - } - virtual const ColBatch &Value(void) const { - return out_; - } - inline const std::vector &col_ptr(void) const { - return itr_.get_factory().col_ptr(); - } - inline void SetColSet(const std::vector &cset, - bool setall = false) { - itr_.get_factory().SetColSet(cset, setall); - } - - private: - // output data - ColBatch out_; - // internal iterator - utils::ThreadBuffer itr_; -}; -/*! - * \brief sparse matrix that support column access - */ -class FMatrixPage : public IFMatrix { - public: - /*! \brief constructor */ - FMatrixPage(utils::IIterator *iter, std::string fname_buffer) - : fname_cbuffer_(fname_buffer) { - this->row_iter_ = iter; - this->col_iter_ = NULL; - this->fi_ = NULL; - } - // destructor - virtual ~FMatrixPage(void) { - if (row_iter_ != NULL) delete row_iter_; - if (col_iter_ != NULL) delete col_iter_; - if (fi_ != NULL) { - fi_->Close(); delete fi_; - } - } - /*! \return whether column access is enabled */ - virtual bool HaveColAccess(void) const { - return col_iter_ != NULL; - } - /*! \brief get number of colmuns */ - virtual size_t NumCol(void) const { - utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_iter_->col_ptr().size() - 1; - } - /*! 
\brief get number of buffered rows */ - virtual const std::vector &buffered_rowset(void) const { - return buffered_rowset_; - } - /*! \brief get column size */ - virtual size_t GetColSize(size_t cidx) const { - const std::vector &col_ptr = col_iter_->col_ptr(); - return col_ptr[cidx+1] - col_ptr[cidx]; - } - /*! \brief get column density */ - virtual float GetColDensity(size_t cidx) const { - const std::vector &col_ptr = col_iter_->col_ptr(); - size_t nmiss = buffered_rowset_.size() - (col_ptr[cidx+1] - col_ptr[cidx]); - return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); - } - virtual void InitColAccess(const std::vector &enabled, float pkeep = 1.0f) { - if (this->HaveColAccess()) return; - utils::Printf("start to initialize page col access\n"); - if (this->LoadColData()) { - utils::Printf("loading previously saved col data\n"); - return; - } - this->InitColData(pkeep, fname_cbuffer_.c_str(), - 1 << 30, 5); - utils::Check(this->LoadColData(), "fail to read in column data"); - utils::Printf("finish initialize page col access\n"); - } - /*! - * \brief get the row iterator associated with FMatrix - */ - virtual utils::IIterator* RowIterator(void) { - row_iter_->BeforeFirst(); - return row_iter_; - } - /*! - * \brief get the column based iterator - */ - virtual utils::IIterator* ColIterator(void) { - std::vector cset; - col_iter_->SetColSet(cset, true); - col_iter_->BeforeFirst(); - return col_iter_; - } - /*! - * \brief colmun based iterator - */ - virtual utils::IIterator *ColIterator(const std::vector &fset) { - col_iter_->SetColSet(fset, false); - col_iter_->BeforeFirst(); - return col_iter_; - } - - protected: - /*! - * \brief try load column data from file - */ - inline bool LoadColData(void) { - FILE *fp = fopen64(fname_cbuffer_.c_str(), "rb"); - if (fp == NULL) return false; - fi_ = new utils::FileStream(fp); - static_cast(fi_)->Read(&buffered_rowset_); - col_iter_ = new ThreadColPageIterator(fi_, 2.0f, false); - return true; - } - /*! 
- * \brief intialize column data - * \param pkeep probability to keep a row - */ - inline void InitColData(float pkeep, const char *fname, - size_t buffer_size, size_t col_step) { - buffered_rowset_.clear(); - utils::FileStream fo(utils::FopenCheck(fname, "wb+")); - // use 64M buffer - utils::SparseCSRFileBuilder builder(&fo, buffer_size); - // start working - row_iter_->BeforeFirst(); - while (row_iter_->Next()) { - const RowBatch &batch = row_iter_->Value(); - for (size_t i = 0; i < batch.size; ++i) { - if (pkeep == 1.0f || random::SampleBinary(pkeep)) { - buffered_rowset_.push_back(static_cast(batch.base_rowid+i)); - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - builder.AddBudget(inst[j].index); - } - } - } - } - // write buffered rowset - static_cast(&fo)->Write(buffered_rowset_); - builder.InitStorage(); - row_iter_->BeforeFirst(); - size_t ktop = 0; - while (row_iter_->Next()) { - const RowBatch &batch = row_iter_->Value(); - for (size_t i = 0; i < batch.size; ++i) { - if (ktop < buffered_rowset_.size() && - buffered_rowset_[ktop] == batch.base_rowid + i) { - ++ktop; - RowBatch::Inst inst = batch[i]; - for (bst_uint j = 0; j < inst.length; ++j) { - builder.PushElem(inst[j].index, - ColBatch::Entry((bst_uint)(batch.base_rowid+i), - inst[j].fvalue)); - } - if (ktop % 100000 == 0) { - utils::Printf("\r \r"); - utils::Printf("InitCol: %lu rows ", static_cast(ktop)); - } - } - } - } - builder.Finalize(); - builder.SortRows(ColBatch::Entry::CmpValue, col_step); - fo.Close(); - } - - private: - // row iterator - utils::IIterator *row_iter_; - // column iterator - ThreadColPageIterator *col_iter_; - // file pointer to data - utils::FileStream *fi_; - // file name of column buffer - std::string fname_cbuffer_; - /*! 
\brief list of row index that are buffered */ - std::vector buffered_rowset_; -}; - -class DMatrixColPage : public DMatrixPageBase<0xffffab03> { - public: - explicit DMatrixColPage(const char *fname) { - fmat_ = new FMatrixPage(iter_, fname); - } - virtual ~DMatrixColPage(void) { - delete fmat_; - } - virtual IFMatrix *fmat(void) const { - return fmat_; - } - /*! \brief the real fmatrix */ - IFMatrix *fmat_; -}; - -} // namespace io -} // namespace xgboost -#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h index ed36e1b43..d4ca1d111 100644 --- a/src/utils/thread_buffer.h +++ b/src/utils/thread_buffer.h @@ -31,7 +31,7 @@ class ThreadBuffer { } /*!\brief set parameter, will also pass the parameter to factory */ inline void SetParam(const char *name, const char *val) { - if (!strcmp( name, "buffer_size")) buf_size = atoi(val); + if (!std::strcmp( name, "buffer_size")) buf_size = atoi(val); factory.SetParam(name, val); } /*! diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index 1906ce09e..bfab05deb 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -87,27 +87,39 @@ def c_array(ctype, values): class DMatrix(object): - def __init__(self, data, label=None, missing=0.0, weight=None): + def __init__(self, data, label=None, missing=0.0, weight=None, cache_file=None): """ Data matrix used in XGBoost. Parameters ---------- data : string/numpy array/scipy.sparse - Data source, string type is the path of svmlight format txt file or xgb buffer. + Data source, string type is the path of svmlight format txt file, + xgb buffer or path to cache_file label : list or numpy 1-D array (optional) Label of the training data. missing : float Value in the data which needs to be present as a missing value. weight : list or numpy 1-D array (optional) Weight for each instance. 
+ cache_file: string + Path to the binary cache of input data, when this is enabled, + several binary cache files with the prefix cache_file will be created, + xgboost will try to use external memory as much as possible, + thus save memory during computation in general """ # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None return - if isinstance(data, string_types): + if cache_file is not None: + if not isinstance(data, string_types): + raise Exception('cache_file must be used together with input file name') + if not isinstance(cache_file, string_types): + raise Exception('cache_file must be string') + self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateCache(c_str(data), c_str(cache_file), 0)) + elif isinstance(data, string_types): self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index dec266ff6..45fc05082 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -114,6 +114,11 @@ extern "C"{ void* XGDMatrixCreateFromFile(const char *fname, int silent) { return LoadDataMatrix(fname, silent != 0, false, false); } + void* XGDMatrixCreateCache(const char *fname, + const char *cache_file, + int silent) { + return LoadDataMatrix(fname, silent != 0, false, false, cache_file); + } void* XGDMatrixCreateFromCSR(const bst_ulong *indptr, const unsigned *indices, const float *data, diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 78df68c71..66d1dfbc0 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -19,9 +19,22 @@ extern "C" { #endif /*! * \brief load a data matrix + * \param fname the name of the file + * \param silent whether print messages during loading * \return a loaded data matrix */ XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent); + /*! 
+ * \brief load a cached DMatrix, this is backed by several cache_files + * and usually cost less memory + * \param fname the name of the file, can be a cached buffer or text + * \param cache_file the name of cached file + * \param silent whether print messages during loading + * \return a loaded data matrix + */ + XGB_DLL void* XGDMatrixCreateCache(const char *fname, + const char *cache_file, + int silent); /*! * \brief create a matrix content from csr format * \param indptr pointer to row headers From a514340c96b880783de67db43fab010c8be908da Mon Sep 17 00:00:00 2001 From: tqchen Date: Wed, 15 Apr 2015 22:28:43 -0700 Subject: [PATCH 03/14] current progress --- .gitignore | 1 - src/io/page_dmatrix-inl.hpp | 195 ++++++------------------------ src/io/page_fmatrix-inl.hpp | 130 ++++++++++++++++++++ src/io/simple_fmatrix-inl.hpp | 4 +- src/io/sparse_batch_page.h | 216 ++++++++++++++++++++++++++++++++++ 5 files changed, 381 insertions(+), 165 deletions(-) create mode 100644 src/io/page_fmatrix-inl.hpp create mode 100644 src/io/sparse_batch_page.h diff --git a/.gitignore b/.gitignore index 35061b857..48e3a9400 100644 --- a/.gitignore +++ b/.gitignore @@ -57,4 +57,3 @@ R-package.Rproj *.cache* R-package/inst R-package/src - diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 73a3087fb..eb56bb80d 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -1,7 +1,7 @@ -#ifndef XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_ -#define XGBOOST_IO_PAGE_ROW_ITER_INL_HPP_ +#ifndef XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ +#define XGBOOST_IO_PAGE_DMATRIX_INL_HPP_ /*! - * \file page_row_iter-inl.hpp + * \file page_dmatrix-inl.hpp * row iterator based on sparse page * \author Tianqi Chen */ @@ -10,97 +10,11 @@ #include "../utils/iterator.h" #include "../utils/thread_buffer.h" #include "./simple_fmatrix-inl.hpp" +#include "./sparse_batch_page.h" +#include "./page_fmatrix-inl.hpp" namespace xgboost { namespace io { -/*! 
\brief page structure that can be used to store a rowbatch */ -struct RowBatchPage { - public: - explicit RowBatchPage(size_t page_size) : kPageSize(page_size) { - data_ = new int[kPageSize]; - utils::Assert(data_ != NULL, "fail to allocate row batch page"); - this->Clear(); - } - ~RowBatchPage(void) { - if (data_ != NULL) delete [] data_; - } - /*! - * \brief Push one row into page - * \param row an instance row - * \return false or true to push into - */ - inline bool PushRow(const RowBatch::Inst &row) { - const size_t dsize = row.length * sizeof(RowBatch::Entry); - if (FreeBytes() < dsize+ sizeof(int)) return false; - row_ptr(Size() + 1) = row_ptr(Size()) + row.length; - memcpy(data_ptr(row_ptr(Size())) , row.data, dsize); - ++data_[0]; - return true; - } - /*! - * \brief get a row batch representation from the page - * \param p_rptr a temporal space that can be used to provide - * ind_ptr storage for RowBatch - * \return a new RowBatch object - */ - inline RowBatch GetRowBatch(std::vector *p_rptr, size_t base_rowid) { - RowBatch batch; - batch.base_rowid = base_rowid; - batch.data_ptr = this->data_ptr(0); - batch.size = static_cast(this->Size()); - std::vector &rptr = *p_rptr; - rptr.resize(this->Size() + 1); - for (size_t i = 0; i < rptr.size(); ++i) { - rptr[i] = static_cast(this->row_ptr(static_cast(i))); - } - batch.ind_ptr = &rptr[0]; - return batch; - } - /*! \brief get i-th row from the batch */ - inline RowBatch::Inst operator[](int i) { - return RowBatch::Inst(data_ptr(0) + row_ptr(i), - static_cast(row_ptr(i+1) - row_ptr(i))); - } - /*! - * \brief clear the page, cleanup the content - */ - inline void Clear(void) { - memset(&data_[0], 0, sizeof(int) * kPageSize); - } - /*! - * \brief load one page form instream - * \return true if loading is successful - */ - inline bool Load(utils::IStream &fi) { - return fi.Read(&data_[0], sizeof(int) * kPageSize) != 0; - } - /*! 
\brief save one page into outstream */ - inline void Save(utils::IStream &fo) { - fo.Write(&data_[0], sizeof(int) * kPageSize); - } - /*! \return number of elements */ - inline int Size(void) const { - return data_[0]; - } - - protected: - /*! \return number of elements */ - inline size_t FreeBytes(void) { - return (kPageSize - (Size() + 2)) * sizeof(int) - - row_ptr(Size()) * sizeof(RowBatch::Entry); - } - /*! \brief equivalent row pointer at i */ - inline int& row_ptr(int i) { - return data_[kPageSize - i - 1]; - } - inline RowBatch::Entry* data_ptr(int i) { - return (RowBatch::Entry*)(&data_[1]) + i; - } - // content of data - int *data_; - // page size - const size_t kPageSize; -}; /*! \brief thread buffer iterator */ class ThreadRowPageIterator: public utils::IIterator { public: @@ -118,7 +32,10 @@ class ThreadRowPageIterator: public utils::IIterator { } virtual bool Next(void) { if (!itr.Next(page_)) return false; - out_ = page_->GetRowBatch(&tmp_ptr_, base_rowid_); + out_.base_rowid = base_rowid_; + out_.ind_ptr = BeginPtr(page_->offset); + out_.data_ptr = BeginPtr(page_->data); + out_.size = page_->offset.size() - 1; base_rowid_ += out_.size; return true; } @@ -127,76 +44,18 @@ class ThreadRowPageIterator: public utils::IIterator { } /*! \brief load and initialize the iterator with fi */ inline void Load(const utils::FileStream &fi) { - itr.get_factory().SetFile(fi); + itr.get_factory().SetFile(fi, 0); itr.Init(); this->BeforeFirst(); } - /*! - * \brief save a row iterator to output stream, in row iterator format - */ - inline static void Save(utils::IIterator *iter, - utils::IStream &fo) { - RowBatchPage page(kPageSize); - iter->BeforeFirst(); - while (iter->Next()) { - const RowBatch &batch = iter->Value(); - for (size_t i = 0; i < batch.size; ++i) { - if (!page.PushRow(batch[i])) { - page.Save(fo); - page.Clear(); - utils::Check(page.PushRow(batch[i]), "row is too big"); - } - } - } - if (page.Size() != 0) page.Save(fo); - } - /*! 
\brief page size 64 MB */ - static const size_t kPageSize = 64 << 18; private: // base row id size_t base_rowid_; - // temporal ptr - std::vector tmp_ptr_; // output data RowBatch out_; - // page pointer type - typedef RowBatchPage* PagePtr; - // loader factory for page - struct Factory { - public: - size_t file_begin_; - utils::FileStream fi; - Factory(void) {} - inline void SetFile(const utils::FileStream &fi) { - this->fi = fi; - file_begin_ = this->fi.Tell(); - } - inline bool Init(void) { - return true; - } - inline void SetParam(const char *name, const char *val) {} - inline bool LoadNext(PagePtr &val) { - return val->Load(fi); - } - inline PagePtr Create(void) { - PagePtr a = new RowBatchPage(kPageSize); - return a; - } - inline void FreeSpace(PagePtr &a) { - delete a; - } - inline void Destroy(void) { - fi.Close(); - } - inline void BeforeFirst(void) { - fi.Seek(file_begin_); - } - }; - - protected: - PagePtr page_; - utils::ThreadBuffer itr; + SparsePage *page_; + utils::ThreadBuffer itr; }; /*! 
\brief data matrix using page */ @@ -247,8 +106,20 @@ class DMatrixPageBase : public DataMatrix { mat.info.SaveBinary(fs); fs.Close(); fname += ".row.blob"; + utils::IIterator *iter = mat.fmat()->RowIterator(); utils::FileStream fbin(utils::FopenCheck(fname.c_str(), "wb")); - ThreadRowPageIterator::Save(mat.fmat()->RowIterator(), fbin); + SparsePage page; + iter->BeforeFirst(); + while (iter->Next()) { + const RowBatch &batch = iter->Value(); + for (size_t i = 0; i < batch.size; ++i) { + page.Push(batch[i]); + if (page.MemCostBytes() >= kPageSize) { + page.Save(&fbin); page.Clear(); + } + } + } + if (page.data.size() != 0) page.Save(&fbin); fbin.Close(); if (!silent) { utils::Printf("DMatrixPage: %lux%lu is saved to %s\n", @@ -268,7 +139,7 @@ class DMatrixPageBase : public DataMatrix { } std::string fname_row = std::string(cache_file) + ".row.blob"; utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); - RowBatchPage page(ThreadRowPageIterator::kPageSize); + SparsePage page; dmlc::InputSplit *in = dmlc::InputSplit::Create(uri, rank, npart); std::string line; @@ -286,10 +157,9 @@ class DMatrixPageBase : public DataMatrix { feats.push_back(e); } RowBatch::Inst row(BeginPtr(feats), feats.size()); - if (!page.PushRow(row)) { - page.Save(fo); - page.Clear(); - utils::Check(page.PushRow(row), "row is too big"); + page.Push(row); + if (page.MemCostBytes() >= kPageSize) { + page.Save(&fo); page.Clear(); } for (size_t i = 0; i < feats.size(); ++i) { info.info.num_col = std::max(info.info.num_col, @@ -298,8 +168,8 @@ class DMatrixPageBase : public DataMatrix { this->info.labels.push_back(label); info.info.num_row += 1; } - if (page.Size() != 0) { - page.Save(fo); + if (page.data.size() != 0) { + page.Save(&fo); } delete in; fo.Close(); @@ -319,7 +189,8 @@ class DMatrixPageBase : public DataMatrix { } /*! \brief magic number used to identify DMatrix */ static const int kMagic = TKMagic; - + /*! 
\brief page size 64 MB */ + static const size_t kPageSize = 64 << 18; protected: /*! \brief row iterator */ ThreadRowPageIterator *iter_; diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp new file mode 100644 index 000000000..44643c329 --- /dev/null +++ b/src/io/page_fmatrix-inl.hpp @@ -0,0 +1,130 @@ +#ifndef XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ +#define XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ +/*! + * \file page_fmatrix-inl.hpp + * col iterator based on sparse page + * \author Tianqi Chen + */ +namespace xgboost { +namespace io { +/*! \brief thread buffer iterator */ +class ThreadColPageIterator: public utils::IIterator { + public: + ThreadColPageIterator(void) { + itr.SetParam("buffer_size", "2"); + page_ = NULL; + } + virtual ~ThreadColPageIterator(void) {} + virtual void Init(void) {} + virtual void BeforeFirst(void) { + itr.BeforeFirst(); + } + virtual bool Next(void) { + if (!itr.Next(page_)) return false; + out_.col_index = BeginPtr(itr.get_factory().index_set()); + col_data_.resize(page_->offset.size() - 1, SparseBatch::Inst(NULL, 0)); + for (size_t i = 0; i < col_data_.size(); ++i) { + col_data_[i] = SparseBatch::Inst + (BeginPtr(page_->data) + page_->offset[i], + page_->offset[i + 1] - page_->offset[i]); + } + out_.col_data = BeginPtr(col_data_); + out_.size = col_data_.size(); + return true; + } + virtual const ColBatch &Value(void) const { + return out_; + } + /*! \brief load and initialize the iterator with fi */ + inline void SetFile(const utils::FileStream &fi) { + itr.get_factory().SetFile(fi, 0); + itr.Init(); + } + // set index set + inline void SetIndexSet(const std::vector &fset) { + itr.get_factory().SetIndexSet(fset); + } + + private: + // output data + ColBatch out_; + SparsePage *page_; + std::vector col_data_; + utils::ThreadBuffer itr; +}; +/*! + * \brief sparse matrix that support column access, CSC + */ +class FMatrixS : public IFMatrix { + public: + typedef SparseBatch::Entry Entry; + /*! 
\brief constructor */ + FMatrixS(utils::IIterator *iter) { + this->iter_ = iter; + } + // destructor + virtual ~FMatrixS(void) { + if (iter_ != NULL) delete iter_; + } + /*! \return whether column access is enabled */ + virtual bool HaveColAccess(void) const { + return col_ptr_.size() != 0; + } + /*! \brief get number of colmuns */ + virtual size_t NumCol(void) const { + utils::Check(this->HaveColAccess(), "NumCol:need column access"); + return col_ptr_.size() - 1; + } + /*! \brief get number of buffered rows */ + virtual const std::vector &buffered_rowset(void) const { + return buffered_rowset_; + } + /*! \brief get column size */ + virtual size_t GetColSize(size_t cidx) const { + return col_ptr_[cidx+1] - col_ptr_[cidx]; + } + /*! \brief get column density */ + virtual float GetColDensity(size_t cidx) const { + size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); + return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); + } + virtual void InitColAccess(const std::vector &enabled, + float pkeep = 1.0f) { + if (this->HaveColAccess()) return; + this->InitColData(pkeep, enabled); + } + /*! + * \brief get the row iterator associated with FMatrix + */ + virtual utils::IIterator* RowIterator(void) { + iter_->BeforeFirst(); + return iter_; + } + /*! + * \brief get the column based iterator + */ + virtual utils::IIterator* ColIterator(void) { + size_t ncol = this->NumCol(); + col_iter_.col_index_.resize(ncol); + for (size_t i = 0; i < ncol; ++i) { + col_iter_.col_index_[i] = static_cast(i); + } + col_iter_.SetBatch(col_ptr_, col_data_); + return &col_iter_; + } + /*! 
+ * \brief colmun based iterator + */ + virtual utils::IIterator *ColIterator(const std::vector &fset) { + size_t ncol = this->NumCol(); + col_iter_.col_index_.resize(0); + for (size_t i = 0; i < fset.size(); ++i) { + if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); + } + col_iter_.SetBatch(col_ptr_, col_data_); + return &col_iter_; + } +}; +} // namespace io +} // namespace xgboost +#endif // XGBOOST_IO_PAGE_FMATRIX_INL_HPP_ diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index 9f204536f..3ba6f2801 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -16,7 +16,7 @@ namespace io { /*! * \brief sparse matrix that support column access, CSC */ -class FMatrixS : public IFMatrix{ +class FMatrixS : public IFMatrix { public: typedef SparseBatch::Entry Entry; /*! \brief constructor */ @@ -238,7 +238,7 @@ class FMatrixS : public IFMatrix{ inline void SetBatch(const std::vector &ptr, const std::vector &data) { batch_.size = col_index_.size(); - col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL,0)); + col_data_.resize(col_index_.size(), SparseBatch::Inst(NULL, 0)); for (size_t i = 0; i < col_data_.size(); ++i) { const bst_uint ridx = col_index_[i]; col_data_[i] = SparseBatch::Inst(&data[0] + ptr[ridx], diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h new file mode 100644 index 000000000..f7bbb08de --- /dev/null +++ b/src/io/sparse_batch_page.h @@ -0,0 +1,216 @@ +#ifndef XGBOOST_IO_SPARSE_BATCH_PAGE_H_ +#define XGBOOST_IO_SPARSE_BATCH_PAGE_H_ +/*! + * \file sparse_batch_page.h + * content holder of sparse batch that can be saved to disk + * the representation can be effectively + * use in external memory computation + * \author Tianqi Chen + */ +#include "../data.h" + +namespace xgboost { +namespace io { +/*! + * \brief storage unit of sparse batch + */ +class SparsePage { + public: + /*! \brief offset of the segments */ + std::vector offset; + /*! 
\brief the data of the segments */ + std::vector data; + /*! \brief constructor */ + SparsePage() { + this->Clear(); + } + /*! + * \brief load the by providing a list of interested segments + * only the interested segments are loaded + * \param fi the input stream of the file + * \param sorted_index_set sorted index of segments we are interested in + * \return true of the loading as successful, false if end of file was reached + */ + inline bool Load(utils::ISeekStream *fi, + const std::vector &sorted_index_set) { + if (!fi->Read(&disk_offset_)) return false; + // setup the offset + offset.clear(); offset.push_back(0); + for (size_t i = 0; i < sorted_index_set.size(); ++i) { + bst_uint fid = sorted_index_set[i]; + size_t size = disk_offset_[fid + 1] - disk_offset_[fid]; + offset.push_back(offset.back() + size); + } + data.resize(offset.back()); + // read in the data + size_t begin = fi->Tell(); + size_t curr_offset = 0; + for (size_t i = 0; i < sorted_index_set.size();) { + bst_uint fid = sorted_index_set[i]; + if (disk_offset_[fid] != curr_offset) { + utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted"); + fi->Seek(begin + disk_offset_[fid]); + curr_offset = disk_offset_[fid]; + } + size_t j, size_to_read = 0; + for (j = i; j < sorted_index_set.size(); ++j) { + if (disk_offset_[sorted_index_set[j]] == disk_offset_[fid] + size_to_read) { + size_to_read += offset[j + 1] - offset[j]; + } else { + break; + } + } + if (size_to_read != 0) { + utils::Check(fi->Read(BeginPtr(data) + offset[i], + size_to_read * sizeof(SparseBatch::Entry)) != 0, + "Invalid SparsePage file"); + curr_offset += size_to_read; + } + i = j; + } + return true; + } + /*! 
+ * \brief load all the segments + * \param fi the input stream of the file + * \return true of the loading as successful, false if end of file was reached + */ + inline bool Load(utils::IStream *fi) { + if (!fi->Read(&offset)) return false; + utils::Check(offset.size() != 0, "Invalid SparsePage file"); + data.resize(offset.back()); + if (data.size() != 0) { + utils::Check(fi->Read(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)) != 0, + "Invalid SparsePage file"); + } + return true; + } + /*! + * \brief save the data to fo, when a page was written + * to disk it must contain all the elements in the + * \param fo output stream + */ + inline void Save(utils::IStream *fo) const { + utils::Assert(offset.size() != 0 && offset[0] == 0, "bad offset"); + utils::Assert(offset.back() == data.size(), "in consistent SparsePage"); + fo->Write(offset); + if (data.size() != 0) { + fo->Write(BeginPtr(data), data.size() * sizeof(SparseBatch::Entry)); + } + } + /*! \return estimation of memory cost of this page */ + inline size_t MemCostBytes(void) const { + return offset.size() * sizeof(size_t) + data.size() * sizeof(SparseBatch::Entry); + } + /*! \brief clear the page */ + inline void Clear(void) { + offset.clear(); + offset.push_back(0); + data.clear(); + } + /*! 
+ * \brief load all the segments and add it to existing batch + * \param fi the input stream of the file + * \return true of the loading as successful, false if end of file was reached + */ + inline bool PushLoad(utils::IStream *fi) { + if (!fi->Read(&disk_offset_)) return false; + data.resize(offset.back() + disk_offset_.back()); + if (disk_offset_.back() != 0) { + utils::Check(fi->Read(BeginPtr(data) + offset.back(), + disk_offset_.back() * sizeof(SparseBatch::Entry)) != 0, + "Invalid SparsePage file"); + } + size_t top = offset.back(); + size_t begin = offset.size(); + offset.resize(offset.size() + disk_offset_.size()); + for (size_t i = 0; i < disk_offset_.size(); ++i) { + offset[i + begin] = top + disk_offset_[i]; + } + } + /*! + * \brief Push row batch into the page + * \param batch the row batch + */ + inline void Push(const RowBatch &batch) { + data.resize(offset.back() + batch.size); + std::memcpy(BeginPtr(data) + offset.back(), + batch.data_ptr + batch.ind_ptr[0], + sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]); + size_t top = offset.back(); + size_t begin = offset.size(); + offset.resize(offset.size() + batch.size); + for (size_t i = 0; i < batch.size; ++i) { + offset[i + begin] = top + batch.ind_ptr[i] - batch.ind_ptr[0]; + } + } + /*! + * \brief Push one instance into page + * \param row an instance row + */ + inline void Push(const SparseBatch::Inst &inst) { + offset.push_back(offset.back() + inst.length); + size_t begin = data.size(); + data.resize(begin + inst.length); + std::memcpy(BeginPtr(data) + begin, inst.data, + sizeof(SparseBatch::Entry) * inst.length); + } + + private: + /*! \brief external memory column offset */ + std::vector disk_offset_; +}; +/*! 
+ * \brief factory class for SparsePage, + * used in threadbuffer template + */ +class SparsePageFactory { + public: + SparsePageFactory(void) {} + inline void SetFile(const utils::FileStream &fi, + size_t file_begin = 0) { + fi_ = fi; + file_begin_ = file_begin; + } + inline const std::vector &index_set(void) const { + return action_index_set_; + } + // set index set, will be used after next before first + inline void SetIndexSet(const std::vector &index_set) { + set_index_set_ = index_set; + std::sort(set_index_set_.begin(), set_index_set_.end()); + } + inline bool Init(void) { + return true; + } + inline void SetParam(const char *name, const char *val) {} + inline bool LoadNext(SparsePage *val) { + if (action_index_set_.size() != 0) { + return val->Load(&fi_, action_index_set_); + } else { + return val->Load(&fi_); + } + } + inline SparsePage *Create(void) { + return new SparsePage(); + } + inline void FreeSpace(SparsePage *a) { + delete a; + } + inline void Destroy(void) { + fi_.Close(); + } + inline void BeforeFirst(void) { + fi_.Seek(file_begin_); + action_index_set_ = set_index_set_; + } + + private: + size_t file_begin_; + utils::FileStream fi_; + std::vector action_index_set_; + std::vector set_index_set_; +}; +} // namespace io +} // namespace xgboost +#endif // XGBOOST_IO_SPARSE_BATCH_PAGE_H_ From 22abf4e29538cde0d9d61f79b6a5b3a19344c7a9 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 16 Apr 2015 12:34:39 -0700 Subject: [PATCH 04/14] need more check --- src/data.h | 2 +- src/io/io.cpp | 26 +++- src/io/page_dmatrix-inl.hpp | 70 ++++++----- src/io/page_fmatrix-inl.hpp | 189 ++++++++++++++++++++++++++--- src/io/sparse_batch_page.h | 42 +++++-- src/learner/learner-inl.hpp | 9 +- src/tree/updater_basemaker-inl.hpp | 2 +- src/tree/updater_histmaker-inl.hpp | 4 +- 8 files changed, 271 insertions(+), 73 deletions(-) diff --git a/src/data.h b/src/data.h index 162a31bfe..e7a5f65d0 100644 --- a/src/data.h +++ b/src/data.h @@ -149,7 +149,7 @@ class IFMatrix { 
virtual size_t NumCol(void) const = 0; /*! \brief get number of non-missing entries in column */ virtual size_t GetColSize(size_t cidx) const = 0; - /*! \brief get column density */ + /*! \brief get column density */ virtual float GetColDensity(size_t cidx) const = 0; /*! \brief reference of buffered rowset */ virtual const std::vector &buffered_rowset(void) const = 0; diff --git a/src/io/io.cpp b/src/io/io.cpp index 8d6856b92..81ccf9489 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -15,6 +15,18 @@ DataMatrix* LoadDataMatrix(const char *fname, bool savebuffer, bool loadsplit, const char *cache_file) { + std::string fname_ = fname; + const char *dlm = strchr(fname, '#'); + if (dlm != NULL) { + utils::Check(strchr(dlm + 1, '#') == NULL, + "only one `#` is allowed in file path for cachefile specification"); + utils::Check(cache_file == NULL, + "can only specify the cachefile with `#` or argument, not both"); + fname_ = std::string(fname, dlm - fname); + fname = fname_.c_str(); + cache_file = dlm +1; + } + if (cache_file == NULL) { if (!std::strcmp(fname, "stdin") || !std::strncmp(fname, "s3://", 5) || @@ -39,16 +51,18 @@ DataMatrix* LoadDataMatrix(const char *fname, dmat->CacheLoad(fname, silent, savebuffer); return dmat; } else { - if (!strcmp(fname, cache_file)) { + FILE *fi = fopen64(cache_file, "rb"); + if (fi != NULL) { DMatrixPage *dmat = new DMatrixPage(); - utils::FileStream fs(utils::FopenCheck(fname, "rb")); - dmat->LoadBinary(fs, silent, fname); + utils::FileStream fs(fi); + dmat->LoadBinary(fs, silent, cache_file); fs.Close(); return dmat; + } else { + DMatrixPage *dmat = new DMatrixPage(); + dmat->LoadText(fname, cache_file, false, loadsplit); + return dmat; } - DMatrixPage *dmat = new DMatrixPage(); - dmat->LoadText(fname, cache_file, false, loadsplit); - return dmat; } } diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index eb56bb80d..03f0d5ca8 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ 
-70,33 +70,6 @@ class DMatrixPageBase : public DataMatrix { // do not delete row iterator, since it is owned by fmat // to be cleaned up in a more clear way } - /*! \brief load and initialize the iterator with fi */ - inline void LoadBinary(utils::FileStream &fi, - bool silent, - const char *fname_) { - std::string fname = fname_; - int tmagic; - utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); - utils::Check(tmagic == magic, "invalid format,magic number mismatch"); - this->info.LoadBinary(fi); - // load in the row data file - fname += ".row.blob"; - utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb")); - iter_->Load(fs); - if (!silent) { - utils::Printf("DMatrixPage: %lux%lu matrix is loaded", - static_cast(info.num_row()), - static_cast(info.num_col())); - if (fname_ != NULL) { - utils::Printf(" from %s\n", fname_); - } else { - utils::Printf("\n"); - } - if (info.group_ptr.size() != 0) { - utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1); - } - } - } /*! \brief save a DataMatrix as DMatrixPage */ inline static void Save(const char *fname_, const DataMatrix &mat, bool silent) { std::string fname = fname_; @@ -127,18 +100,48 @@ class DMatrixPageBase : public DataMatrix { static_cast(mat.info.num_col()), fname_); } } + /*! 
\brief load and initialize the iterator with fi */ + inline void LoadBinary(utils::FileStream &fi, + bool silent, + const char *fname_) { + this->set_cache_file(fname_); + std::string fname = fname_; + int tmagic; + utils::Check(fi.Read(&tmagic, sizeof(tmagic)) != 0, "invalid input file format"); + utils::Check(tmagic == magic, "invalid format,magic number mismatch"); + this->info.LoadBinary(fi); + // load in the row data file + fname += ".row.blob"; + utils::FileStream fs(utils::FopenCheck(fname.c_str(), "rb")); + iter_->Load(fs); + if (!silent) { + utils::Printf("DMatrixPage: %lux%lu matrix is loaded", + static_cast(info.num_row()), + static_cast(info.num_col())); + if (fname_ != NULL) { + utils::Printf(" from %s\n", fname_); + } else { + utils::Printf("\n"); + } + if (info.group_ptr.size() != 0) { + utils::Printf("data contains %u groups\n", (unsigned)info.group_ptr.size() - 1); + } + } + } /*! \brief save a LibSVM format file as DMatrixPage */ inline void LoadText(const char *uri, const char* cache_file, bool silent, bool loadsplit) { + int rank = 0, npart = 1; if (loadsplit) { rank = rabit::GetRank(); npart = rabit::GetWorldSize(); } + this->set_cache_file(cache_file); std::string fname_row = std::string(cache_file) + ".row.blob"; - utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); + utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); SparsePage page; dmlc::InputSplit *in = dmlc::InputSplit::Create(uri, rank, npart); @@ -190,8 +193,10 @@ class DMatrixPageBase : public DataMatrix { /*! \brief magic number used to identify DMatrix */ static const int kMagic = TKMagic; /*! \brief page size 64 MB */ - static const size_t kPageSize = 64 << 18; + static const size_t kPageSize = 64UL << 20UL; + protected: + virtual void set_cache_file(const std::string &cache_file) = 0; /*! 
\brief row iterator */ ThreadRowPageIterator *iter_; }; @@ -199,7 +204,7 @@ class DMatrixPageBase : public DataMatrix { class DMatrixPage : public DMatrixPageBase<0xffffab02> { public: DMatrixPage(void) { - fmat_ = new FMatrixS(iter_); + fmat_ = new FMatrixPage(iter_, this->info); } virtual ~DMatrixPage(void) { delete fmat_; @@ -207,8 +212,11 @@ class DMatrixPage : public DMatrixPageBase<0xffffab02> { virtual IFMatrix *fmat(void) const { return fmat_; } + virtual void set_cache_file(const std::string &cache_file) { + fmat_->set_cache_file(cache_file); + } /*! \brief the real fmatrix */ - IFMatrix *fmat_; + FMatrixPage *fmat_; }; } // namespace io } // namespace xgboost diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 44643c329..ea2e94733 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -37,12 +37,12 @@ class ThreadColPageIterator: public utils::IIterator { } /*! \brief load and initialize the iterator with fi */ inline void SetFile(const utils::FileStream &fi) { - itr.get_factory().SetFile(fi, 0); + itr.get_factory().SetFile(fi); itr.Init(); } // set index set - inline void SetIndexSet(const std::vector &fset) { - itr.get_factory().SetIndexSet(fset); + inline void SetIndexSet(const std::vector &fset, bool load_all) { + itr.get_factory().SetIndexSet(fset, load_all); } private: @@ -55,25 +55,26 @@ class ThreadColPageIterator: public utils::IIterator { /*! * \brief sparse matrix that support column access, CSC */ -class FMatrixS : public IFMatrix { +class FMatrixPage : public IFMatrix { public: typedef SparseBatch::Entry Entry; /*! \brief constructor */ - FMatrixS(utils::IIterator *iter) { + FMatrixPage(utils::IIterator *iter, + const learner::MetaInfo &info) : info(info) { this->iter_ = iter; } // destructor - virtual ~FMatrixS(void) { + virtual ~FMatrixPage(void) { if (iter_ != NULL) delete iter_; } /*! 
\return whether column access is enabled */ - virtual bool HaveColAccess(void) const { - return col_ptr_.size() != 0; + virtual bool HaveColAccess(void) const { + return col_size_.size() != 0; } /*! \brief get number of colmuns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); - return col_ptr_.size() - 1; + return col_size_.size(); } /*! \brief get number of buffered rows */ virtual const std::vector &buffered_rowset(void) const { @@ -81,17 +82,19 @@ class FMatrixS : public IFMatrix { } /*! \brief get column size */ virtual size_t GetColSize(size_t cidx) const { - return col_ptr_[cidx+1] - col_ptr_[cidx]; + return col_size_[cidx]; } /*! \brief get column density */ virtual float GetColDensity(size_t cidx) const { - size_t nmiss = buffered_rowset_.size() - (col_ptr_[cidx+1] - col_ptr_[cidx]); - return 1.0f - (static_cast(nmiss)) / buffered_rowset_.size(); + size_t nmiss = num_buffered_row_ - (col_size_[cidx]); + return 1.0f - (static_cast(nmiss)) / num_buffered_row_; } virtual void InitColAccess(const std::vector &enabled, float pkeep = 1.0f) { if (this->HaveColAccess()) return; - this->InitColData(pkeep, enabled); + if (TryLoadColData()) return; + this->InitColData(enabled, pkeep); + utils::Check(TryLoadColData(), "failed on creating col.blob"); } /*! * \brief get the row iterator associated with FMatrix @@ -105,25 +108,171 @@ class FMatrixS : public IFMatrix { */ virtual utils::IIterator* ColIterator(void) { size_t ncol = this->NumCol(); - col_iter_.col_index_.resize(ncol); + col_index_.resize(0); for (size_t i = 0; i < ncol; ++i) { - col_iter_.col_index_[i] = static_cast(i); + col_index_.push_back(i); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.SetIndexSet(col_index_, false); + col_iter_.BeforeFirst(); return &col_iter_; } /*! 
* \brief colmun based iterator */ - virtual utils::IIterator *ColIterator(const std::vector &fset) { + virtual utils::IIterator *ColIterator(const std::vector &fset) { size_t ncol = this->NumCol(); - col_iter_.col_index_.resize(0); + col_index_.resize(0); for (size_t i = 0; i < fset.size(); ++i) { - if (fset[i] < ncol) col_iter_.col_index_.push_back(fset[i]); + if (fset[i] < ncol) col_index_.push_back(fset[i]); } - col_iter_.SetBatch(col_ptr_, col_data_); + col_iter_.SetIndexSet(col_index_, false); + col_iter_.BeforeFirst(); return &col_iter_; } + // set the cache file name + inline void set_cache_file(const std::string &cache_file) { + col_data_name_ = std::string(cache_file) + ".col.blob"; + col_meta_name_ = std::string(cache_file) + ".col.meta"; + } + + protected: + inline bool TryLoadColData(void) { + FILE *fi = fopen64(col_meta_name_.c_str(), "rb"); + if (fi == NULL) return false; + utils::FileStream fs(fi); + LoadMeta(&fs); + fs.Close(); + fi = utils::FopenCheck(col_data_name_.c_str(), "rb"); + if (fi == NULL) return false; + col_iter_.SetFile(utils::FileStream(fi)); + return true; + } + inline void LoadMeta(utils::IStream *fi) { + utils::Check(fi->Read(&num_buffered_row_, sizeof(num_buffered_row_)) != 0, + "invalid col.blob file"); + utils::Check(fi->Read(&buffered_rowset_), + "invalid col.blob file"); + utils::Check(fi->Read(&col_size_), + "invalid col.blob file"); + } + inline void SaveMeta(utils::IStream *fo) { + fo->Write(&num_buffered_row_, sizeof(num_buffered_row_)); + fo->Write(buffered_rowset_); + fo->Write(col_size_); + } + /*! 
+ * \brief intialize column data + * \param pkeep probability to keep a row + */ + inline void InitColData(const std::vector &enabled, float pkeep) { + SparsePage prow, pcol; + size_t btop = 0; + // clear rowset + buffered_rowset_.clear(); + col_size_.resize(info.num_col()); + std::fill(col_size_.begin(), col_size_.end(), 0); + utils::FileStream fo; + fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb")); + // start working + iter_->BeforeFirst(); + while (iter_->Next()) { + const RowBatch &batch = iter_->Value(); + for (size_t i = 0; i < batch.size; ++i) { + bst_uint ridx = static_cast(batch.base_rowid + i); + if (pkeep == 1.0f || random::SampleBinary(pkeep)) { + buffered_rowset_.push_back(ridx); + prow.Push(batch[i]); + if (prow.MemCostBytes() >= kPageSize) { + this->PushColPage(prow, BeginPtr(buffered_rowset_) + btop, + enabled, &pcol, &fo); + btop += prow.Size(); + prow.Clear(); + } + } + } + } + if (prow.Size() != 0) { + this->PushColPage(prow, BeginPtr(buffered_rowset_) + btop, + enabled, &pcol, &fo); + } + fo.Close(); + num_buffered_row_ = buffered_rowset_.size(); + fo = utils::FileStream(utils::FopenCheck(col_meta_name_.c_str(), "wb")); + this->SaveMeta(&fo); + fo.Close(); + } + inline void PushColPage(const SparsePage &prow, + const bst_uint *ridx, + const std::vector &enabled, + SparsePage *pcol, + utils::IStream *fo) { + pcol->Clear(); + int nthread; + #pragma omp parallel + { + nthread = omp_get_num_threads(); + } + pcol->Clear(); + utils::ParallelGroupBuilder + builder(&pcol->offset, &pcol->data); + builder.InitBudget(info.num_col(), nthread); + bst_omp_uint ndata = static_cast(prow.Size()); + #pragma omp parallel for schedule(static) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + for (bst_uint j = prow.offset[i]; j < prow.offset[i+1]; ++j) { + const SparseBatch::Entry &e = prow.data[j]; + if (enabled[e.index]) { + builder.AddBudget(e.index, tid); + } + } + } + builder.InitStorage(); + #pragma omp 
parallel for schedule(static) + for (bst_omp_uint i = 0; i < ndata; ++i) { + int tid = omp_get_thread_num(); + for (bst_uint j = prow.offset[i]; j < prow.offset[i+1]; ++j) { + const SparseBatch::Entry &e = prow.data[j]; + builder.Push(e.index, + SparseBatch::Entry(ridx[i], e.fvalue), + tid); + } + } + utils::Assert(pcol->Size() == info.num_col(), "inconsistent col data"); + // sort columns + bst_omp_uint ncol = static_cast(pcol->Size()); + #pragma omp parallel for schedule(dynamic, 1) + for (bst_omp_uint i = 0; i < ncol; ++i) { + if (pcol->offset[i] < pcol->offset[i + 1]) { + std::sort(BeginPtr(pcol->data) + pcol->offset[i], + BeginPtr(pcol->data) + pcol->offset[i + 1], Entry::CmpValue); + } + col_size_[i] += pcol->offset[i + 1] - pcol->offset[i]; + } + pcol->Save(fo); + } + + private: + /*! \brief page size 256 M */ + static const size_t kPageSize = 256 << 20UL; + // shared meta info with DMatrix + const learner::MetaInfo &info; + // row iterator + utils::IIterator *iter_; + /*! \brief column based data file name */ + std::string col_data_name_; + /*! \brief column based data file name */ + std::string col_meta_name_; + /*! \brief list of row index that are buffered */ + std::vector buffered_rowset_; + // number of buffered rows + size_t num_buffered_row_; + // count for column data + std::vector col_size_; + // internal column index for output + std::vector col_index_; + // internal thread backed col iterator + ThreadColPageIterator col_iter_; }; } // namespace io } // namespace xgboost diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index f7bbb08de..756bd1034 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -24,6 +24,10 @@ class SparsePage { SparsePage() { this->Clear(); } + /*! \return number of instance in the page */ + inline size_t Size() const { + return offset.size() - 1; + } /*! 
* \brief load the by providing a list of interested segments * only the interested segments are loaded @@ -38,6 +42,7 @@ class SparsePage { offset.clear(); offset.push_back(0); for (size_t i = 0; i < sorted_index_set.size(); ++i) { bst_uint fid = sorted_index_set[i]; + utils::Check(fid + 1 < disk_offset_.size(), "bad col.blob format"); size_t size = disk_offset_[fid + 1] - disk_offset_[fid]; offset.push_back(offset.back() + size); } @@ -49,7 +54,7 @@ class SparsePage { bst_uint fid = sorted_index_set[i]; if (disk_offset_[fid] != curr_offset) { utils::Assert(disk_offset_[fid] > curr_offset, "fset index was not sorted"); - fi->Seek(begin + disk_offset_[fid]); + fi->Seek(begin + disk_offset_[fid] * sizeof(SparseBatch::Entry)); curr_offset = disk_offset_[fid]; } size_t j, size_to_read = 0; @@ -68,6 +73,10 @@ class SparsePage { } i = j; } + // seek to end of record + if (curr_offset != disk_offset_.back()) { + fi->Seek(begin + disk_offset_.back() * sizeof(SparseBatch::Entry)); + } return true; } /*! 
@@ -166,7 +175,8 @@ class SparsePage { */ class SparsePageFactory { public: - SparsePageFactory(void) {} + SparsePageFactory(void) + : action_load_all_(true), set_load_all_(true) {} inline void SetFile(const utils::FileStream &fi, size_t file_begin = 0) { fi_ = fi; @@ -176,19 +186,27 @@ class SparsePageFactory { return action_index_set_; } // set index set, will be used after next before first - inline void SetIndexSet(const std::vector &index_set) { - set_index_set_ = index_set; - std::sort(set_index_set_.begin(), set_index_set_.end()); + inline void SetIndexSet(const std::vector &index_set, + bool load_all) { + set_load_all_ = load_all; + if (!set_load_all_) { + set_index_set_ = index_set; + std::sort(set_index_set_.begin(), set_index_set_.end()); + } } inline bool Init(void) { return true; } inline void SetParam(const char *name, const char *val) {} - inline bool LoadNext(SparsePage *val) { - if (action_index_set_.size() != 0) { - return val->Load(&fi_, action_index_set_); + inline bool LoadNext(SparsePage *val) { + if (!action_load_all_) { + if (action_index_set_.size() == 0) { + return false; + } else { + return val->Load(&fi_, action_index_set_); + } } else { - return val->Load(&fi_); + return val->Load(&fi_); } } inline SparsePage *Create(void) { @@ -202,10 +220,14 @@ class SparsePageFactory { } inline void BeforeFirst(void) { fi_.Seek(file_begin_); - action_index_set_ = set_index_set_; + action_load_all_ = set_load_all_; + if (!set_load_all_) { + action_index_set_ = set_index_set_; + } } private: + bool action_load_all_, set_load_all_; size_t file_begin_; utils::FileStream fi_; std::vector action_index_set_; diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index eafd400ca..9415279ee 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -69,7 +69,7 @@ class BoostLearner : public rabit::Serializable { utils::SPrintf(str_temp, sizeof(str_temp), "%lu", static_cast(buffer_size)); this->SetParam("num_pbuffer", 
str_temp); - this->pred_buffer_size = buffer_size; + this->pred_buffer_size = buffer_size; } /*! * \brief set parameters from outside @@ -259,7 +259,12 @@ class BoostLearner : public rabit::Serializable { int ncol = static_cast(p_train->info.info.num_col); std::vector enabled(ncol, true); // initialize column access - p_train->fmat()->InitColAccess(enabled, prob_buffer_row); + p_train->fmat()->InitColAccess(enabled, prob_buffer_row); + const int kMagicSimple = 0xffffab01; + // check, if it is not DMatrix simple, then use hist maker + if (p_train->magic != kMagicSimple) { + this->SetParam("updater", "grow_histmaker,prune"); + } } /*! * \brief update the model for one iteration diff --git a/src/tree/updater_basemaker-inl.hpp b/src/tree/updater_basemaker-inl.hpp index f8816dd6e..75683a68d 100644 --- a/src/tree/updater_basemaker-inl.hpp +++ b/src/tree/updater_basemaker-inl.hpp @@ -50,7 +50,7 @@ class BaseMaker: public IUpdater { fminmax[fid * 2 + 1] = std::max(c[c.length - 1].fvalue, fminmax[fid * 2 + 1]); } } - } + } rabit::Allreduce(BeginPtr(fminmax), fminmax.size()); } // get feature type, 0:empty 1:binary 2:real diff --git a/src/tree/updater_histmaker-inl.hpp b/src/tree/updater_histmaker-inl.hpp index 8c617450b..f8c194c62 100644 --- a/src/tree/updater_histmaker-inl.hpp +++ b/src/tree/updater_histmaker-inl.hpp @@ -366,7 +366,7 @@ class CQHistMaker: public HistMaker { } else { feat2workindex[fset[i]] = -2; } - } + } this->GetNodeStats(gpair, *p_fmat, tree, info, &thread_stats, &node_stats); sketchs.resize(this->qexpand.size() * freal_set.size()); @@ -578,7 +578,7 @@ class QuantileHistMaker: public HistMaker { IFMatrix *p_fmat, const BoosterInfo &info, const std::vector &fset, - const RegTree &tree) { + const RegTree &tree) { // initialize the data structure int nthread = BaseMaker::get_nthread(); sketchs.resize(this->qexpand.size() * tree.param.num_feature); From ddb7e538dfa2edef6ea24637f9028dc2e64822f3 Mon Sep 17 00:00:00 2001 From: tqchen Date: Thu, 16 Apr 2015 
17:03:18 -0700 Subject: [PATCH 05/14] OK --- R-package/src/xgboost_R.cpp | 11 ----------- R-package/src/xgboost_R.h | 9 --------- src/io/io.cpp | 10 +++++++++- wrapper/xgboost.py | 15 ++------------- wrapper/xgboost_wrapper.cpp | 5 ----- wrapper/xgboost_wrapper.h | 11 ----------- 6 files changed, 11 insertions(+), 50 deletions(-) diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp index f67462564..a2ca9536f 100644 --- a/R-package/src/xgboost_R.cpp +++ b/R-package/src/xgboost_R.cpp @@ -76,17 +76,6 @@ extern "C" { _WrapperEnd(); return ret; } - SEXP XGDMatrixCreateCache_R(SEXP fname, SEXP cache_file, SEXP silent) { - _WrapperBegin(); - void *handle = XGDMatrixCreateCache(CHAR(asChar(fname)), - CHAR(asChar(cache_file)), - asInteger(silent)); - SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); - R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); - UNPROTECT(1); - _WrapperEnd(); - return ret; - } SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) { _WrapperBegin(); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 1314cef15..61b84a80e 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -24,15 +24,6 @@ extern "C" { * \return a loaded data matrix */ SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent); - /*! - * \brief load a cached DMatrix, this is backed by several cache_files - * and usually cost less memory - * \param fname the name of the file, can be a cached buffer or text - * \param cache_file the name of cached file - * \param silent whether print messages during loading - * \return a loaded data matrix - */ - SEXP XGDMatrixCreateCache_R(SEXP fname, SEXP cache_file, SEXP silent); /*! 
* \brief create matrix content from dense matrix * This assumes the matrix is stored in column major format diff --git a/src/io/io.cpp b/src/io/io.cpp index 81ccf9489..967abf15b 100644 --- a/src/io/io.cpp +++ b/src/io/io.cpp @@ -16,6 +16,7 @@ DataMatrix* LoadDataMatrix(const char *fname, bool loadsplit, const char *cache_file) { std::string fname_ = fname; + const char *dlm = strchr(fname, '#'); if (dlm != NULL) { utils::Check(strchr(dlm + 1, '#') == NULL, @@ -26,7 +27,7 @@ DataMatrix* LoadDataMatrix(const char *fname, fname = fname_.c_str(); cache_file = dlm +1; } - + if (cache_file == NULL) { if (!std::strcmp(fname, "stdin") || !std::strncmp(fname, "s3://", 5) || @@ -51,6 +52,13 @@ DataMatrix* LoadDataMatrix(const char *fname, dmat->CacheLoad(fname, silent, savebuffer); return dmat; } else { + std::string cache_fname = cache_file; + if (loadsplit) { + std::ostringstream os; + os << cache_file << ".r" << rabit::GetRank(); + cache_fname = os.str(); + cache_file = cache_fname.c_str(); + } FILE *fi = fopen64(cache_file, "rb"); if (fi != NULL) { DMatrixPage *dmat = new DMatrixPage(); diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index bfab05deb..5bb6377c5 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -87,7 +87,7 @@ def c_array(ctype, values): class DMatrix(object): - def __init__(self, data, label=None, missing=0.0, weight=None, cache_file=None): + def __init__(self, data, label=None, missing=0.0, weight=None): """ Data matrix used in XGBoost. @@ -102,24 +102,13 @@ class DMatrix(object): Value in the data which needs to be present as a missing value. weight : list or numpy 1-D array (optional) Weight for each instance. 
- cache_file: string - Path to the binary cache of input data, when this is enabled, - several binary cache files with the prefix cache_file will be created, - xgboost will try to use external memory as much as possible, - thus save memory during computation in general """ # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None return - if cache_file is not None: - if not isinstance(data, string_types): - raise Exception('cache_file must be used together with input file name') - if not isinstance(cache_file, string_types): - raise Exception('cache_file must be string') - self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateCache(c_str(data), c_str(cache_file), 0)) - elif isinstance(data, string_types): + if isinstance(data, string_types): self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromFile(c_str(data), 0)) elif isinstance(data, scipy.sparse.csr_matrix): self._init_from_csr(data) diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 45fc05082..dec266ff6 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -114,11 +114,6 @@ extern "C"{ void* XGDMatrixCreateFromFile(const char *fname, int silent) { return LoadDataMatrix(fname, silent != 0, false, false); } - void* XGDMatrixCreateCache(const char *fname, - const char *cache_file, - int silent) { - return LoadDataMatrix(fname, silent != 0, false, false, cache_file); - } void* XGDMatrixCreateFromCSR(const bst_ulong *indptr, const unsigned *indices, const float *data, diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 66d1dfbc0..d51eb284f 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -24,17 +24,6 @@ extern "C" { * \return a loaded data matrix */ XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent); - /*! 
- * \brief load a cached DMatrix, this is backed by several cache_files - * and usually cost less memory - * \param fname the name of the file, can be a cached buffer or text - * \param cache_file the name of cached file - * \param silent whether print messages during loading - * \return a loaded data matrix - */ - XGB_DLL void* XGDMatrixCreateCache(const char *fname, - const char *cache_file, - int silent); /*! * \brief create a matrix content from csr format * \param indptr pointer to row headers From 6bc5d6f0b44b957cc9f0d0b1fe5d420b0b59b8e2 Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 17 Apr 2015 21:07:33 -0700 Subject: [PATCH 06/14] Squashed 'subtree/rabit/' changes from 3bf8661..7568f75 7568f75 new io interface git-subtree-dir: subtree/rabit git-subtree-split: 7568f75f450232b81536914cb4f083c92b5752ec --- include/dmlc/io.h | 85 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 67 insertions(+), 18 deletions(-) diff --git a/include/dmlc/io.h b/include/dmlc/io.h index bd9e127a0..4feeb30d2 100644 --- a/include/dmlc/io.h +++ b/include/dmlc/io.h @@ -11,7 +11,6 @@ #include #include #include -#include /*! \brief namespace for dmlc */ namespace dmlc { @@ -100,32 +99,71 @@ class Serializable { }; /*! - * \brief input split header, used to create input split on input dataset - * this class can be used to obtain filesystem invariant splits from input files + * \brief input split creates that allows reading + * of records from split of data, + * independent part that covers all the dataset + * + * see InputSplit::Create for definition of record */ class InputSplit { public: + /*! \brief a blob of memory region */ + struct Blob { + /*! \brief points to start of the memory region */ + void *dptr; + /*! \brief size of the memory region */ + size_t size; + }; /*! 
- * \brief read next record, store into out_data - * the data in outcomming record depends on the input data format - * if input is text data, each line is returned as a record (\n not included) - * if input is recordio, each record is returned - * \param out_data the string that stores the line data, \n is not included - * \return true of next line was found, false if we read all the lines + * \brief get the next record, the returning value + * is valid until next call to NextRecord or NextChunk + * caller can modify the memory content of out_rec + * \param out_rec used to store the result + * \return true if we can successfully get next record + * false if we reached end of split + * \sa InputSplit::Create for definition of record */ - virtual bool ReadRecord(std::string *out_data) = 0; + virtual bool NextRecord(Blob *out_rec) = 0; + /*! + * \brief get a chunk of memory that can contain multiple records, + * the caller needs to parse the content of the resulting chunk, + * for text file, out_chunk can contain data of multiple lines + * for recordio, out_chunk can contain data of multiple records + * + * This function ensures there won't be partial record in the chunk + * caller can modify the memory content of out_chunk, + * the memory is valid until next call to NextRecord or NextChunk + * + * Usually NextRecord is sufficient, NextChunk can be used by some + * multi-threaded parsers to parse the input content + * + * \param out_chunk used to store the result + * \return true if we can successfully get next record + * false if we reached end of split + * \sa InputSplit::Create for definition of record + */ + virtual bool NextChunk(Blob *out_chunk) = 0; /*! \brief destructor*/ - virtual ~InputSplit(void) {} + virtual ~InputSplit(void) {} /*! 
* \brief factory function: * create input split given a uri * \param uri the uri of the input, can contain hdfs prefix * \param part_index the part id of current input * \param num_parts total number of splits + * \param type type of record + * List of possible types: "text", "recordio" + * - "text": + * text file, each line is treated as a record + * input split will split on \n or \r + * - "recordio": + * binary recordio file, see recordio.h + * \sa InputSplit::Type */ static InputSplit* Create(const char *uri, unsigned part_index, - unsigned num_parts); + unsigned num_parts, + const char *type); }; /*! @@ -172,7 +210,7 @@ class ostream : public std::basic_ostream { public: explicit OutBuf(size_t buffer_size) : stream_(NULL), buffer_(buffer_size) { - assert(buffer_.size() > 0); + if (buffer_size == 0) buffer_.resize(2); } // set stream to the buffer inline void set_stream(Stream *stream); @@ -225,22 +263,32 @@ class istream : public std::basic_istream { buf_.set_stream(stream); this->rdbuf(&buf_); } - + /*! \return how many bytes we read so far */ + inline size_t bytes_read(void) const { + return buf_.bytes_read(); + } + private: // internal streambuf class InBuf : public std::streambuf { public: explicit InBuf(size_t buffer_size) - : stream_(NULL), buffer_(buffer_size) { - assert(buffer_.size() > 0); + : stream_(NULL), bytes_read_(0), + buffer_(buffer_size) { + if (buffer_size == 0) buffer_.resize(2); } // set stream to the buffer inline void set_stream(Stream *stream); - + // return how many bytes read so far + inline size_t bytes_read(void) const { + return bytes_read_; + } private: /*! \brief internal stream by StreamBuf */ Stream *stream_; - /*! \brief internal buffer */ + /*! \brief how many bytes we read so far */ + size_t bytes_read_; + /*! 
\brief internal buffer */ std::vector buffer_; // override underflow inline int_type underflow(); @@ -322,6 +370,7 @@ inline int istream::InBuf::underflow() { if (this->gptr() == this->egptr()) { size_t sz = stream_->Read(bhead, buffer_.size()); this->setg(bhead, bhead, bhead + sz); + bytes_read_ += sz; } if (this->gptr() == this->egptr()) { return traits_type::eof(); From 788785f164a3e465a44e5d33021031d2773cec7c Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 17 Apr 2015 22:07:59 -0700 Subject: [PATCH 07/14] faster libsvm parser --- src/io/dmlc_simple.cpp | 128 ++++++++++++++++++++++---- src/io/libsvm_parser.h | 168 ++++++++++++++++++++++++++++++++++ src/io/page_dmatrix-inl.hpp | 41 ++++----- src/io/simple_dmatrix-inl.hpp | 47 ++++++---- src/io/sparse_batch_page.h | 18 +++- src/utils/utils.h | 8 ++ 6 files changed, 348 insertions(+), 62 deletions(-) create mode 100644 src/io/libsvm_parser.h diff --git a/src/io/dmlc_simple.cpp b/src/io/dmlc_simple.cpp index 4296d5caa..3138dae94 100644 --- a/src/io/dmlc_simple.cpp +++ b/src/io/dmlc_simple.cpp @@ -8,45 +8,132 @@ namespace xgboost { namespace utils { +/*! 
+ * \brief line split implementation from single FILE + * simply returns lines of files, used for stdin + */ class SingleFileSplit : public dmlc::InputSplit { public: - explicit SingleFileSplit(const char *fname) - : use_stdin_(false) { + explicit SingleFileSplit(const char *fname) + : use_stdin_(false), + chunk_begin_(NULL), chunk_end_(NULL) { if (!std::strcmp(fname, "stdin")) { #ifndef XGBOOST_STRICT_CXX98_ use_stdin_ = true; fp_ = stdin; #endif } if (!use_stdin_) { - fp_ = utils::FopenCheck(fname, "r"); + fp_ = utils::FopenCheck(fname, "rb"); } - end_of_file_ = false; + buffer_.resize(kBufferSize); } virtual ~SingleFileSplit(void) { if (!use_stdin_) std::fclose(fp_); } - virtual bool ReadRecord(std::string *out_data) { - if (end_of_file_) return false; - out_data->clear(); - while (true) { - char c = std::fgetc(fp_); - if (c == EOF) { - end_of_file_ = true; + virtual size_t Read(void *ptr, size_t size) { + return std::fread(ptr, 1, size, fp_); + } + virtual void Write(const void *ptr, size_t size) { + utils::Error("cannot do write in inputsplit"); + } + virtual bool NextRecord(Blob *out_rec) { + if (chunk_begin_ == chunk_end_) { + if (!LoadChunk()) return false; + } + char *next = FindNextRecord(chunk_begin_, + chunk_end_); + out_rec->dptr = chunk_begin_; + out_rec->size = next - chunk_begin_; + chunk_begin_ = next; + return true; + } + virtual bool NextChunk(Blob *out_chunk) { + if (chunk_begin_ == chunk_end_) { + if (!LoadChunk()) return false; + } + out_chunk->dptr = chunk_begin_; + out_chunk->size = chunk_end_ - chunk_begin_; + chunk_begin_ = chunk_end_; + return true; + } + inline bool ReadChunk(void *buf, size_t *size) { + size_t max_size = *size; + if (max_size <= overflow_.length()) { + *size = 0; return true; + } + if (overflow_.length() != 0) { + std::memcpy(buf, BeginPtr(overflow_), overflow_.length()); + } + size_t olen = overflow_.length(); + overflow_.resize(0); + size_t nread = this->Read(reinterpret_cast(buf) + olen, + max_size - olen); + nread 
+= olen; + if (nread == 0) return false; + if (nread != max_size) { + *size = nread; + return true; + } else { + const char *bptr = reinterpret_cast(buf); + // return the last position where a record starts + const char *bend = this->FindLastRecordBegin(bptr, bptr + max_size); + *size = bend - bptr; + overflow_.resize(max_size - *size); + if (overflow_.length() != 0) { + std::memcpy(BeginPtr(overflow_), bend, overflow_.length()); } - if (c != '\r' && c != '\n' && c != EOF) { - *out_data += c; + return true; + } + } + + protected: + inline const char* FindLastRecordBegin(const char *begin, + const char *end) { + if (begin == end) return begin; + for (const char *p = end - 1; p != begin; --p) { + if (*p == '\n' || *p == '\r') return p + 1; + } + return begin; + } + inline char* FindNextRecord(char *begin, char *end) { + char *p; + for (p = begin; p != end; ++p) { + if (*p == '\n' || *p == '\r') break; + } + for (; p != end; ++p) { + if (*p != '\n' && *p != '\r') return p; + } + return end; + } + inline bool LoadChunk(void) { + while (true) { + size_t size = buffer_.length(); + if (!ReadChunk(BeginPtr(buffer_), &size)) return false; + if (size == 0) { + buffer_.resize(buffer_.length() * 2); } else { - if (out_data->length() != 0) return true; - if (end_of_file_) return false; + chunk_begin_ = reinterpret_cast(BeginPtr(buffer_)); + chunk_end_ = chunk_begin_ + size; + break; } } - return false; - } - + return true; + } + private: + // buffer size + static const size_t kBufferSize = 1 << 18UL; + // file std::FILE *fp_; bool use_stdin_; - bool end_of_file_; + // internal overflow + std::string overflow_; + // internal buffer + std::string buffer_; + // beginning of chunk + char *chunk_begin_; + // end of chunk + char *chunk_end_; }; class StdFile : public dmlc::Stream { @@ -105,7 +192,8 @@ class StdFile : public dmlc::Stream { namespace dmlc { InputSplit* InputSplit::Create(const char *uri, unsigned part, - unsigned nsplit) { + unsigned nsplit, + const char *type) { using 
namespace xgboost; const char *msg = "xgboost is compiled in local mode\n"\ "to use hdfs, s3 or distributed version, compile with make dmlc=1"; diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h new file mode 100644 index 000000000..6767d61be --- /dev/null +++ b/src/io/libsvm_parser.h @@ -0,0 +1,168 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file libsvm_parser.h + * \brief iterator parser to parse libsvm format + * \author Tianqi Chen + */ +#ifndef XGBOOST_IO_LIBSVM_PARSER_H_ +#define XGBOOST_IO_LIBSVM_PARSER_H_ + +#include +#include +#include +#include "../utils/omp.h" +#include "../utils/utils.h" +#include "../sync/sync.h" +#include "./sparse_batch_page.h" + +namespace xgboost { +namespace io { +/*! \brief page returned by libsvm parser */ +struct LibSVMPage : public SparsePage { + std::vector label; + // overload clear + inline void Clear() { + SparsePage::Clear(); + label.clear(); + } +}; +/*! + * \brief libsvm parser that parses the input lines + * and returns rows in input data + */ +class LibSVMParser : public utils::IIterator { + public: + explicit LibSVMParser(dmlc::InputSplit *source, + int nthread) + : bytes_read_(0), at_head_(true), + data_ptr_(0), data_end_(0), source_(source) { + int maxthread; + #pragma omp parallel + { + maxthread = omp_get_num_threads(); + } + maxthread = std::max(maxthread / 2, 1); + nthread_ = std::min(maxthread, nthread); + } + virtual ~LibSVMParser() { + delete source_; + } + virtual void BeforeFirst(void) { + utils::Assert(at_head_, "cannot call BeforeFirst"); + } + virtual const LibSVMPage &Value(void) const { + return data_[data_ptr_ - 1]; + } + virtual bool Next(void) { + while (true) { + while (data_ptr_ < data_end_) { + data_ptr_ += 1; + if (data_[data_ptr_ - 1].Size() != 0) { + return true; + } + } + if (!FillData()) break; + data_ptr_ = 0; data_end_ = data_.size(); + } + return false; + } + inline size_t bytes_read(void) const { + return bytes_read_; + } + + protected: + inline bool FillData() { + 
dmlc::InputSplit::Blob chunk; + if (!source_->NextChunk(&chunk)) return false; + int nthread; + #pragma omp parallel num_threads(nthread_) + { + nthread = omp_get_num_threads(); + } + // reserve space for data + data_.resize(nthread); + bytes_read_ += chunk.size; + utils::Assert(chunk.size != 0, "LibSVMParser.FileData"); + char *head = reinterpret_cast(chunk.dptr); + #pragma omp parallel num_threads(nthread_) + { + // threadid + int tid = omp_get_thread_num(); + size_t nstep = (chunk.size + nthread - 1) / nthread; + size_t sbegin = std::min(tid * nstep, chunk.size); + size_t send = std::min((tid + 1) * nstep, chunk.size); + char *pbegin = BackFindEndLine(head + sbegin, head); + char *pend; + if (tid + 1 == nthread) { + pend = head + send; + } else { + pend = BackFindEndLine(head + send, head); + } + ParseBlock(pbegin, pend, &data_[tid]); + } + data_ptr_ = 0; + return true; + } + /*! + * \brief parse data into out + * \param begin beginning of buffer + * \param end end of buffer + */ + inline void ParseBlock(char *begin, + char *end, + LibSVMPage *out) { + out->Clear(); + char *p = begin; + while (p != end) { + while (isspace(*p) && p != end) ++p; + if (p == end) break; + char *head = p; + while (isdigit(*p) && p != end) ++p; + if (*p == ':') { + out->data.push_back(SparseBatch::Entry(atol(head), + atof(p + 1))); + } else { + if (out->label.size() != 0) { + out->offset.push_back(out->data.size()); + } + out->label.push_back(atof(head)); + } + while (!isspace(*p) && p != end) ++p; + } + if (out->label.size() != 0) { + out->offset.push_back(out->data.size()); + } + utils::Check(out->label.size() + 1 == out->offset.size(), + "LibSVMParser inconsistent"); + } + /*! 
+ * \brief start from bptr, go backward and find first endof line + * \param bptr end position to go backward + * \param begin the beginning position of buffer + * \return position of first endof line going backward + */ + inline char* BackFindEndLine(char *bptr, + char *begin) { + for (; bptr != begin; --bptr) { + if (*bptr == '\n' || *bptr == '\r') return bptr; + } + return begin; + } + + private: + // nthread + int nthread_; + // number of bytes readed + size_t bytes_read_; + // at beginning, at end of stream + bool at_head_; + // pointer to begin and end of data + size_t data_ptr_, data_end_; + // source split that provides the data + dmlc::InputSplit *source_; + // internal data + std::vector data_; +}; +} // namespace io +} // namespace xgboost +#endif // XGBOOST_IO_LIBSVM_PARSER_H_ diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 03f0d5ca8..047d589a1 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -12,6 +12,7 @@ #include "./simple_fmatrix-inl.hpp" #include "./sparse_batch_page.h" #include "./page_fmatrix-inl.hpp" +#include "./libsvm_parser.h" namespace xgboost { namespace io { @@ -143,38 +144,32 @@ class DMatrixPageBase : public DataMatrix { std::string fname_row = std::string(cache_file) + ".row.blob"; utils::FileStream fo(utils::FopenCheck(fname_row.c_str(), "wb")); SparsePage page; - dmlc::InputSplit *in = - dmlc::InputSplit::Create(uri, rank, npart); - std::string line; + + LibSVMParser parser( + dmlc::InputSplit::Create(uri, rank, npart, "text"), 4); info.Clear(); - while (in->ReadRecord(&line)) { - float label; - std::istringstream ss(line); - std::vector feats; - ss >> label; - while (!ss.eof()) { - RowBatch::Entry e; - if (!(ss >> e.index)) break; - ss.ignore(32, ':'); - if (!(ss >> e.fvalue)) break; - feats.push_back(e); + while (parser.Next()) { + const LibSVMPage &batch = parser.Value(); + size_t nlabel = info.labels.size(); + info.labels.resize(nlabel + batch.label.size()); + if 
(batch.label.size() != 0) { + std::memcpy(BeginPtr(info.labels) + nlabel, + BeginPtr(batch.label), + batch.label.size() * sizeof(float)); + } + page.Push(batch); + for (size_t i = 0; i < batch.data.size(); ++i) { + info.info.num_col = std::max(info.info.num_col, + static_cast(batch.data[i].index+1)); } - RowBatch::Inst row(BeginPtr(feats), feats.size()); - page.Push(row); if (page.MemCostBytes() >= kPageSize) { page.Save(&fo); page.Clear(); } - for (size_t i = 0; i < feats.size(); ++i) { - info.info.num_col = std::max(info.info.num_col, - static_cast(feats[i].index+1)); - } - this->info.labels.push_back(label); - info.info.num_row += 1; + info.info.num_row += batch.label.size(); } if (page.data.size() != 0) { page.Save(&fo); } - delete in; fo.Close(); iter_->Load(utils::FileStream(utils::FopenCheck(fname_row.c_str(), "rb"))); // save data matrix diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index e0ea7899e..010bf9893 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -19,6 +19,7 @@ #include "./io.h" #include "./simple_fmatrix-inl.hpp" #include "../sync/sync.h" +#include "./libsvm_parser.h" namespace xgboost { namespace io { @@ -72,7 +73,8 @@ class DMatrixSimple : public DataMatrix { inline size_t AddRow(const std::vector &feats) { for (size_t i = 0; i < feats.size(); ++i) { row_data_.push_back(feats[i]); - info.info.num_col = std::max(info.info.num_col, static_cast(feats[i].index+1)); + info.info.num_col = std::max(info.info.num_col, + static_cast(feats[i].index+1)); } row_ptr_.push_back(row_ptr_.back() + feats.size()); info.info.num_row += 1; @@ -90,26 +92,35 @@ class DMatrixSimple : public DataMatrix { rank = rabit::GetRank(); npart = rabit::GetWorldSize(); } - dmlc::InputSplit *in = - dmlc::InputSplit::Create(uri, rank, npart); + LibSVMParser parser( + dmlc::InputSplit::Create(uri, rank, npart, "text"), 4); this->Clear(); - std::string line; - while (in->ReadRecord(&line)) { - float label; - 
std::istringstream ss(line); - std::vector feats; - ss >> label; - while (!ss.eof()) { - RowBatch::Entry e; - if (!(ss >> e.index)) break; - ss.ignore(32, ':'); - if (!(ss >> e.fvalue)) break; - feats.push_back(e); + while (parser.Next()) { + const LibSVMPage &batch = parser.Value(); + size_t nlabel = info.labels.size(); + info.labels.resize(nlabel + batch.label.size()); + if (batch.label.size() != 0) { + std::memcpy(BeginPtr(info.labels) + nlabel, + BeginPtr(batch.label), + batch.label.size() * sizeof(float)); } - info.labels.push_back(label); - this->AddRow(feats); + size_t ndata = row_data_.size(); + row_data_.resize(ndata + batch.data.size()); + if (batch.data.size() != 0) { + std::memcpy(BeginPtr(row_data_) + ndata, + BeginPtr(batch.data), + batch.data.size() * sizeof(RowBatch::Entry)); + } + row_ptr_.resize(row_ptr_.size() + batch.label.size()); + for (size_t i = 0; i < batch.label.size(); ++i) { + row_ptr_[nlabel + i + 1] = row_ptr_[nlabel] + batch.offset[i + 1]; + } + info.info.num_row += batch.Size(); + for (size_t i = 0; i < batch.data.size(); ++i) { + info.info.num_col = std::max(info.info.num_col, + static_cast(batch.data[i].index+1)); + } } - delete in; if (!silent) { utils::Printf("%lux%lu matrix with %lu entries is loaded from %s\n", static_cast(info.num_row()), diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index 756bd1034..61155f380 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -150,7 +150,23 @@ class SparsePage { size_t begin = offset.size(); offset.resize(offset.size() + batch.size); for (size_t i = 0; i < batch.size; ++i) { - offset[i + begin] = top + batch.ind_ptr[i] - batch.ind_ptr[0]; + offset[i + begin] = top + batch.ind_ptr[i + 1] - batch.ind_ptr[0]; + } + } + /*! 
+ * \brief Push a sparse page + * \param batch the row page + */ + inline void Push(const SparsePage &batch) { + data.resize(offset.back() + batch.Size()); + std::memcpy(BeginPtr(data) + offset.back(), + BeginPtr(batch.data), + sizeof(SparseBatch::Entry) * batch.data.size()); + size_t top = offset.back(); + size_t begin = offset.size(); + offset.resize(offset.size() + batch.Size()); + for (size_t i = 0; i < batch.Size(); ++i) { + offset[i + begin] = top + batch.offset[i + 1]; } } /*! diff --git a/src/utils/utils.h b/src/utils/utils.h index afe17f64c..7aee5a29a 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -174,5 +174,13 @@ inline const T *BeginPtr(const std::vector &vec) { return &vec[0]; } } +inline char* BeginPtr(std::string &str) { + if (str.length() == 0) return NULL; + return &str[0]; +} +inline const char* BeginPtr(const std::string &str) { + if (str.length() == 0) return NULL; + return &str[0]; +} } // namespace xgboost #endif // XGBOOST_UTILS_UTILS_H_ From 0a7d233c5de92e45b3dfb557f04fece0b60a35ac Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 17 Apr 2015 22:09:26 -0700 Subject: [PATCH 08/14] add --- src/io/page_dmatrix-inl.hpp | 17 +++++++++++++++-- src/sync/sync.h | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index 03f0d5ca8..ed47a265c 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -133,7 +133,9 @@ class DMatrixPageBase : public DataMatrix { const char* cache_file, bool silent, bool loadsplit) { - + if (!silent) { + utils::Printf("start generate text file from %s\n", uri); + } int rank = 0, npart = 1; if (loadsplit) { rank = rabit::GetRank(); @@ -146,6 +148,8 @@ class DMatrixPageBase : public DataMatrix { dmlc::InputSplit *in = dmlc::InputSplit::Create(uri, rank, npart); std::string line; + size_t bytes_write = 0; + double tstart = rabit::utils::GetTime(); info.Clear(); while (in->ReadRecord(&line)) { float label; @@ -162,8 +166,17 
@@ class DMatrixPageBase : public DataMatrix { RowBatch::Inst row(BeginPtr(feats), feats.size()); page.Push(row); if (page.MemCostBytes() >= kPageSize) { - page.Save(&fo); page.Clear(); + bytes_write += page.MemCostBytes(); + page.Save(&fo); + page.Clear(); + double tdiff = rabit::utils::GetTime() - tstart; + if (!silent) { + utils::Printf("Writting to %s in %g MB/s, %g MB written\n", + cache_file, (bytes_write >> 20UL) / tdiff, + (bytes_write >> 20UL)); + } } + for (size_t i = 0; i < feats.size(); ++i) { info.info.num_col = std::max(info.info.num_col, static_cast(feats[i].index+1)); diff --git a/src/sync/sync.h b/src/sync/sync.h index aec5e2abd..3a371b03c 100644 --- a/src/sync/sync.h +++ b/src/sync/sync.h @@ -7,6 +7,7 @@ * \author Tianqi Chen */ #include "../../subtree/rabit/include/rabit.h" +#include "../../subtree/rabit/include/rabit/timer.h" #endif // XGBOOST_SYNC_H_ From 5dfab4ba70148d228c32a83c1beafe0bc1291d6e Mon Sep 17 00:00:00 2001 From: tqchen Date: Fri, 17 Apr 2015 23:02:30 -0700 Subject: [PATCH 09/14] fast loader --- src/io/libsvm_parser.h | 107 +++++++++++++++++++++++----------- src/io/page_dmatrix-inl.hpp | 6 +- src/io/page_fmatrix-inl.hpp | 9 +++ src/io/simple_dmatrix-inl.hpp | 2 +- src/io/sparse_batch_page.h | 10 ++-- 5 files changed, 92 insertions(+), 42 deletions(-) diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h index 6767d61be..8041892d7 100644 --- a/src/io/libsvm_parser.h +++ b/src/io/libsvm_parser.h @@ -13,6 +13,7 @@ #include "../utils/omp.h" #include "../utils/utils.h" #include "../sync/sync.h" +#include "../utils/thread_buffer.h" #include "./sparse_batch_page.h" namespace xgboost { @@ -29,13 +30,19 @@ struct LibSVMPage : public SparsePage { /*! 
* \brief libsvm parser that parses the input lines * and returns rows in input data + * factry that was used by threadbuffer template */ -class LibSVMParser : public utils::IIterator { +class LibSVMPageFactory { public: - explicit LibSVMParser(dmlc::InputSplit *source, - int nthread) - : bytes_read_(0), at_head_(true), - data_ptr_(0), data_end_(0), source_(source) { + explicit LibSVMPageFactory() + : bytes_read_(0), at_head_(true) { + } + inline bool Init(void) { + return true; + } + inline void Setup(dmlc::InputSplit *source, + int nthread) { + source_ = source; int maxthread; #pragma omp parallel { @@ -44,34 +51,28 @@ class LibSVMParser : public utils::IIterator { maxthread = std::max(maxthread / 2, 1); nthread_ = std::min(maxthread, nthread); } - virtual ~LibSVMParser() { + inline void SetParam(const char *name, const char *val) {} + inline bool LoadNext(std::vector *data) { + return FillData(data); + } + inline void FreeSpace(std::vector *a) { + delete a; + } + inline std::vector *Create(void) { + return new std::vector(); + } + inline void BeforeFirst(void) { + utils::Assert(at_head_, "cannot call beforefirst"); + } + inline void Destroy(void) { delete source_; } - virtual void BeforeFirst(void) { - utils::Assert(at_head_, "cannot call BeforeFirst"); - } - virtual const LibSVMPage &Value(void) const { - return data_[data_ptr_ - 1]; - } - virtual bool Next(void) { - while (true) { - while (data_ptr_ < data_end_) { - data_ptr_ += 1; - if (data_[data_ptr_ - 1].Size() != 0) { - return true; - } - } - if (!FillData()) break; - data_ptr_ = 0; data_end_ = data_.size(); - } - return false; - } inline size_t bytes_read(void) const { return bytes_read_; } protected: - inline bool FillData() { + inline bool FillData(std::vector *data) { dmlc::InputSplit::Blob chunk; if (!source_->NextChunk(&chunk)) return false; int nthread; @@ -80,7 +81,7 @@ class LibSVMParser : public utils::IIterator { nthread = omp_get_num_threads(); } // reserve space for data - 
data_.resize(nthread); + data->resize(nthread); bytes_read_ += chunk.size; utils::Assert(chunk.size != 0, "LibSVMParser.FileData"); char *head = reinterpret_cast(chunk.dptr); @@ -98,9 +99,8 @@ class LibSVMParser : public utils::IIterator { } else { pend = BackFindEndLine(head + send, head); } - ParseBlock(pbegin, pend, &data_[tid]); + ParseBlock(pbegin, pend, &(*data)[tid]); } - data_ptr_ = 0; return true; } /*! @@ -156,13 +156,54 @@ class LibSVMParser : public utils::IIterator { size_t bytes_read_; // at beginning, at end of stream bool at_head_; - // pointer to begin and end of data - size_t data_ptr_, data_end_; // source split that provides the data dmlc::InputSplit *source_; - // internal data - std::vector data_; }; + +class LibSVMParser : public utils::IIterator { + public: + explicit LibSVMParser(dmlc::InputSplit *source, + int nthread) + : at_end_(false), data_ptr_(0), data_(NULL) { + itr.SetParam("buffer_size", "2"); + itr.get_factory().Setup(source, nthread); + itr.Init(); + } + virtual void BeforeFirst(void) { + itr.BeforeFirst(); + } + virtual bool Next(void) { + if (at_end_) return false; + while (true) { + if (data_ == NULL || data_ptr_ >= data_->size()) { + if (!itr.Next(data_)) { + at_end_ = true; return false; + } else { + data_ptr_ = 0; + } + } + while (data_ptr_ < data_->size()) { + data_ptr_ += 1; + if ((*data_)[data_ptr_ - 1].Size() != 0) { + return true; + } + } + } + return true; + } + virtual const LibSVMPage &Value(void) const { + return (*data_)[data_ptr_ - 1]; + } + inline size_t bytes_read(void) const { + return itr.get_factory().bytes_read(); + } + private: + bool at_end_; + size_t data_ptr_; + std::vector *data_; + utils::ThreadBuffer*, LibSVMPageFactory> itr; +}; + } // namespace io } // namespace xgboost #endif // XGBOOST_IO_LIBSVM_PARSER_H_ diff --git a/src/io/page_dmatrix-inl.hpp b/src/io/page_dmatrix-inl.hpp index ba4bf2d3a..dd7a1b496 100644 --- a/src/io/page_dmatrix-inl.hpp +++ b/src/io/page_dmatrix-inl.hpp @@ -149,7 +149,7 @@ 
class DMatrixPageBase : public DataMatrix { size_t bytes_write = 0; double tstart = rabit::utils::GetTime(); LibSVMParser parser( - dmlc::InputSplit::Create(uri, rank, npart, "text"), 4); + dmlc::InputSplit::Create(uri, rank, npart, "text"), 16); info.Clear(); while (parser.Next()) { const LibSVMPage &batch = parser.Value(); @@ -159,7 +159,7 @@ class DMatrixPageBase : public DataMatrix { std::memcpy(BeginPtr(info.labels) + nlabel, BeginPtr(batch.label), batch.label.size() * sizeof(float)); - } + } page.Push(batch); for (size_t i = 0; i < batch.data.size(); ++i) { info.info.num_col = std::max(info.info.num_col, @@ -171,7 +171,7 @@ class DMatrixPageBase : public DataMatrix { page.Clear(); double tdiff = rabit::utils::GetTime() - tstart; if (!silent) { - utils::Printf("Writting to %s in %g MB/s, %g MB written\n", + utils::Printf("Writting to %s in %g MB/s, %lu MB written\n", cache_file, (bytes_write >> 20UL) / tdiff, (bytes_write >> 20UL)); } diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index ea2e94733..8bcea2bd4 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -173,6 +173,8 @@ class FMatrixPage : public IFMatrix { std::fill(col_size_.begin(), col_size_.end(), 0); utils::FileStream fo; fo = utils::FileStream(utils::FopenCheck(col_data_name_.c_str(), "wb")); + size_t bytes_write = 0; + double tstart = rabit::utils::GetTime(); // start working iter_->BeforeFirst(); while (iter_->Next()) { @@ -183,10 +185,17 @@ class FMatrixPage : public IFMatrix { buffered_rowset_.push_back(ridx); prow.Push(batch[i]); if (prow.MemCostBytes() >= kPageSize) { + bytes_write += prow.MemCostBytes(); this->PushColPage(prow, BeginPtr(buffered_rowset_) + btop, enabled, &pcol, &fo); btop += prow.Size(); prow.Clear(); + + double tdiff = rabit::utils::GetTime() - tstart; + utils::Printf("Writting to %s in %g MB/s, %lu MB written\n", + col_data_name_.c_str(), + (bytes_write >> 20UL) / tdiff, + (bytes_write >> 20UL)); } } } diff --git 
a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 010bf9893..9b0addc1c 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -93,7 +93,7 @@ class DMatrixSimple : public DataMatrix { npart = rabit::GetWorldSize(); } LibSVMParser parser( - dmlc::InputSplit::Create(uri, rank, npart, "text"), 4); + dmlc::InputSplit::Create(uri, rank, npart, "text"), 16); this->Clear(); while (parser.Next()) { const LibSVMPage &batch = parser.Value(); diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index 61155f380..e03cca026 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -142,7 +142,7 @@ class SparsePage { * \param batch the row batch */ inline void Push(const RowBatch &batch) { - data.resize(offset.back() + batch.size); + data.resize(offset.back() + batch.ind_ptr[batch.size]); std::memcpy(BeginPtr(data) + offset.back(), batch.data_ptr + batch.ind_ptr[0], sizeof(SparseBatch::Entry) * batch.ind_ptr[batch.size]); @@ -158,13 +158,13 @@ class SparsePage { * \param batch the row page */ inline void Push(const SparsePage &batch) { - data.resize(offset.back() + batch.Size()); - std::memcpy(BeginPtr(data) + offset.back(), + size_t top = offset.back(); + data.resize(top + batch.data.size()); + std::memcpy(BeginPtr(data) + top, BeginPtr(batch.data), sizeof(SparseBatch::Entry) * batch.data.size()); - size_t top = offset.back(); size_t begin = offset.size(); - offset.resize(offset.size() + batch.Size()); + offset.resize(begin + batch.Size()); for (size_t i = 0; i < batch.Size(); ++i) { offset[i + begin] = top + batch.offset[i + 1]; } From 20da8bbe504c0b81f6f3aff5b23f5bc3ee97d3f4 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Apr 2015 00:05:15 -0700 Subject: [PATCH 10/14] Squashed 'subtree/rabit/' changes from 7568f75..f52daf9 f52daf9 make timer cross platform git-subtree-dir: subtree/rabit git-subtree-split: f52daf9be158c036c6d59058335f417cb91c82cd --- include/rabit/timer.h | 8 +++++++- 1 file 
changed, 7 insertions(+), 1 deletion(-) diff --git a/include/rabit/timer.h b/include/rabit/timer.h index eaada7ef4..46b7affc4 100644 --- a/include/rabit/timer.h +++ b/include/rabit/timer.h @@ -18,7 +18,8 @@ namespace utils { * \brief return time in seconds, not cross platform, avoid to use this in most places */ inline double GetTime(void) { - #ifdef __MACH__ + // TODO: use c++11 chrono when c++11 was available + #ifdef __MACH__ clock_serv_t cclock; mach_timespec_t mts; host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); @@ -26,9 +27,14 @@ inline double GetTime(void) { mach_port_deallocate(mach_task_self(), cclock); return static_cast(mts.tv_sec) + static_cast(mts.tv_nsec) * 1e-9; #else + #if defined(__unix__) || defined(__linux__) timespec ts; utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time"); return static_cast(ts.tv_sec) + static_cast(ts.tv_nsec) * 1e-9; + #else + // TODO: add MSVC macro, and MSVC timer + return static_cast(time(NULL)); + #endif #endif } } // namespace utils From 9527b55f35126f5a27e2371047088cc230921839 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Apr 2015 00:05:56 -0700 Subject: [PATCH 11/14] fix makefile --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 2af8932ca..a265abb9c 100644 --- a/Makefile +++ b/Makefile @@ -99,10 +99,8 @@ Rpack: cp -r src xgboost/src/src mkdir xgboost/src/subtree mkdir xgboost/src/subtree/rabit - mkdir xgboost/src/subtree/rabit/rabit-learn cp -r subtree/rabit/include xgboost/src/subtree/rabit/include cp -r subtree/rabit/src xgboost/src/subtree/rabit/src - cp -r subtree/rabit/rabit-learn/io xgboost/src/subtree/rabit/rabit-learn/io rm -rf xgboost/src/subtree/rabit/src/*.o mkdir xgboost/src/wrapper cp wrapper/xgboost_wrapper.h xgboost/src/wrapper From 18277086d9feeb477d66d68f664785289b1f7a8b Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 19 Apr 2015 00:20:52 -0700 Subject: [PATCH 12/14] fix windows warnings --- 
src/io/libsvm_parser.h | 7 ++++--- src/io/page_fmatrix-inl.hpp | 8 ++++---- src/io/simple_fmatrix-inl.hpp | 7 ++++--- src/learner/evaluation-inl.hpp | 2 +- wrapper/xgboost_wrapper.cpp | 6 +++--- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h index 8041892d7..6579d80e3 100644 --- a/src/io/libsvm_parser.h +++ b/src/io/libsvm_parser.h @@ -6,10 +6,11 @@ */ #ifndef XGBOOST_IO_LIBSVM_PARSER_H_ #define XGBOOST_IO_LIBSVM_PARSER_H_ - +#define NOMINMAX #include #include #include +#include #include "../utils/omp.h" #include "../utils/utils.h" #include "../sync/sync.h" @@ -120,12 +121,12 @@ class LibSVMPageFactory { while (isdigit(*p) && p != end) ++p; if (*p == ':') { out->data.push_back(SparseBatch::Entry(atol(head), - atof(p + 1))); + static_cast(atof(p + 1)))); } else { if (out->label.size() != 0) { out->offset.push_back(out->data.size()); } - out->label.push_back(atof(head)); + out->label.push_back(static_cast(atof(head))); } while (!isspace(*p) && p != end) ++p; } diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 8bcea2bd4..0883998f4 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -26,7 +26,7 @@ class ThreadColPageIterator: public utils::IIterator { for (size_t i = 0; i < col_data_.size(); ++i) { col_data_[i] = SparseBatch::Inst (BeginPtr(page_->data) + page_->offset[i], - page_->offset[i + 1] - page_->offset[i]); + static_cast(page_->offset[i + 1] - page_->offset[i])); } out_.col_data = BeginPtr(col_data_); out_.size = col_data_.size(); @@ -110,7 +110,7 @@ class FMatrixPage : public IFMatrix { size_t ncol = this->NumCol(); col_index_.resize(0); for (size_t i = 0; i < ncol; ++i) { - col_index_.push_back(i); + col_index_.push_back(static_cast(i)); } col_iter_.SetIndexSet(col_index_, false); col_iter_.BeforeFirst(); @@ -229,7 +229,7 @@ class FMatrixPage : public IFMatrix { #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < ndata; 
++i) { int tid = omp_get_thread_num(); - for (bst_uint j = prow.offset[i]; j < prow.offset[i+1]; ++j) { + for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) { const SparseBatch::Entry &e = prow.data[j]; if (enabled[e.index]) { builder.AddBudget(e.index, tid); @@ -240,7 +240,7 @@ class FMatrixPage : public IFMatrix { #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < ndata; ++i) { int tid = omp_get_thread_num(); - for (bst_uint j = prow.offset[i]; j < prow.offset[i+1]; ++j) { + for (size_t j = prow.offset[i]; j < prow.offset[i+1]; ++j) { const SparseBatch::Entry &e = prow.data[j]; builder.Push(e.index, SparseBatch::Entry(ridx[i], e.fvalue), diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index 3ba6f2801..acf85297f 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -165,7 +165,8 @@ class FMatrixS : public IFMatrix { while (iter_->Next()) { const RowBatch &batch = iter_->Value(); bmap.resize(bmap.size() + batch.size, true); - for (size_t i = 0; i < batch.size; ++i) { + long batch_size = static_cast(batch.size); + for (long i = 0; i < batch_size; ++i) { bst_uint ridx = static_cast(batch.base_rowid + i); if (pkeep == 1.0f || random::SampleBinary(pkeep)) { buffered_rowset_.push_back(ridx); @@ -174,7 +175,7 @@ class FMatrixS : public IFMatrix { } } #pragma omp parallel for schedule(static) - for (size_t i = 0; i < batch.size; ++i) { + for (long i = 0; i < batch_size; ++i) { int tid = omp_get_thread_num(); bst_uint ridx = static_cast(batch.base_rowid + i); if (bmap[ridx]) { @@ -193,7 +194,7 @@ class FMatrixS : public IFMatrix { while (iter_->Next()) { const RowBatch &batch = iter_->Value(); #pragma omp parallel for schedule(static) - for (size_t i = 0; i < batch.size; ++i) { + for (long i = 0; i < static_cast(batch.size); ++i) { int tid = omp_get_thread_num(); bst_uint ridx = static_cast(batch.base_rowid + i); if (bmap[ridx]) { diff --git a/src/learner/evaluation-inl.hpp 
b/src/learner/evaluation-inl.hpp index 02d86f606..8798ff99b 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -130,7 +130,7 @@ struct EvalMClassBase : public IEvaluator { const float wt = info.GetWeight(i); int label = static_cast(info.labels[i]); if (label >= 0 && label < static_cast(nclass)) { - sum += Derived::EvalRow(info.labels[i], + sum += Derived::EvalRow(label, BeginPtr(preds) + i * nclass, nclass) * wt; wsum += wt; diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index dec266ff6..2b28469c7 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -149,9 +149,9 @@ extern "C"{ DMatrixSimple &mat = *p_mat; utils::ParallelGroupBuilder builder(&mat.row_ptr_, &mat.row_data_); builder.InitBudget(0, nthread); - bst_ulong ncol = nindptr - 1; + long ncol = static_cast(nindptr - 1); #pragma omp parallel for schedule(static) - for (bst_ulong i = 0; i < ncol; ++i) { + for (long i = 0; i < ncol; ++i) { int tid = omp_get_thread_num(); for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { builder.AddBudget(indices[j], tid); @@ -159,7 +159,7 @@ extern "C"{ } builder.InitStorage(); #pragma omp parallel for schedule(static) - for (bst_ulong i = 0; i < ncol; ++i) { + for (long i = 0; i < ncol; ++i) { int tid = omp_get_thread_num(); for (unsigned j = col_ptr[i]; j < col_ptr[i+1]; ++j) { builder.Push(indices[j], From 44fd329b021bfd46a6b033a64467cda7d40310db Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Apr 2015 00:23:02 -0700 Subject: [PATCH 13/14] Squashed 'subtree/rabit/' changes from f52daf9..c679671 c679671 fix io style git-subtree-dir: subtree/rabit git-subtree-split: c67967161e9863a7da5554ab695c5f798d3ebc67 --- include/dmlc/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dmlc/io.h b/include/dmlc/io.h index 4feeb30d2..f808094c1 100644 --- a/include/dmlc/io.h +++ b/include/dmlc/io.h @@ -345,13 +345,13 @@ inline int ostream::OutBuf::sync(void) { if (stream_ 
== NULL) return -1; std::ptrdiff_t n = pptr() - pbase(); stream_->Write(pbase(), n); - this->pbump(-n); + this->pbump(-static_cast(n)); return 0; } inline int ostream::OutBuf::overflow(int c) { *(this->pptr()) = c; std::ptrdiff_t n = pptr() - pbase(); - this->pbump(-n); + this->pbump(-static_cast(n)); if (c == EOF) { stream_->Write(pbase(), n); } else { From 5123b07d73068b946d4f398914eb37d145281484 Mon Sep 17 00:00:00 2001 From: tqchen Date: Sun, 19 Apr 2015 00:55:11 -0700 Subject: [PATCH 14/14] add more docs --- demo/guide-python/README.md | 2 + demo/guide-python/external_memory.py | 25 ++++++ doc/README.md | 19 ++++ doc/external_memory.md | 32 +++++++ doc/parameter.md | 111 +++++++++++++++++++++++ doc/python.md | 126 +++++++++++++++++++++++++++ 6 files changed, 315 insertions(+) create mode 100755 demo/guide-python/external_memory.py create mode 100644 doc/README.md create mode 100644 doc/external_memory.md create mode 100644 doc/parameter.md create mode 100644 doc/python.md diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md index bc1c219d0..32d0290ab 100644 --- a/demo/guide-python/README.md +++ b/demo/guide-python/README.md @@ -7,3 +7,5 @@ XGBoost Python Feature Walkthrough * [Generalized Linear Model](generalized_linear_model.py) * [Cross validation](cross_validation.py) * [Predicting leaf indices](predict_leaf_indices.py) +* [Sklearn Wrapper](sklearn_example.py) +* [External Memory](external_memory.py) diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py new file mode 100755 index 000000000..eb579c935 --- /dev/null +++ b/demo/guide-python/external_memory.py @@ -0,0 +1,25 @@ +#!/usr/bin/python +import numpy as np +import scipy.sparse +import xgboost as xgb + +### simple example for using external memory version + +# this is the only difference, add a # followed by a cache prefix name +# several cache file with the prefix will be generated +# currently only support convert from libsvm file +dtrain = 
xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache') +dtest = xgb.DMatrix('../data/agaricus.txt.test#dtest.cache') + +# specify validations set to watch performance +param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + +# performance notice: set nthread to be the number of your real cpu +# some cpu offer two threads per core, for example, a 4 core cpu with 8 threads, in such case set nthread=4 +#param['nthread']=num_real_cpu + +watchlist = [(dtest,'eval'), (dtrain,'train')] +num_round = 2 +bst = xgb.train(param, dtrain, num_round, watchlist) + + diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000..801e7e65a --- /dev/null +++ b/doc/README.md @@ -0,0 +1,19 @@ +XGBoost Documentation +==== +This is an ongoing effort to move the [wiki document](https://github.com/dmlc/xgboost/wiki) here. + +List of Documentations +==== +* [Parameters](parameter.md) +* [Using XGBoost in Python](python.md) +* [External Memory Version](external_memory.md) + +Highlights Links +==== +This section is about blog posts, presentations and videos discussing how to use xgboost to solve interesting problems. If you think something belongs here, send a pull request. +* Blogpost by phunther: [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) +* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) + +Contribution +==== +Contributions of documentation use cases are welcome! diff --git a/doc/external_memory.md b/doc/external_memory.md new file mode 100644 index 000000000..e98133467 --- /dev/null +++ b/doc/external_memory.md @@ -0,0 +1,32 @@ +Using XGBoost External Memory Version +==== +There is no big difference between using the external memory version and the in-memory version. +The only difference is the filename format.
+ +The external memory version takes in the following filename format +``` +filename#cacheprefix +``` + +The ```filename``` is the normal path to the libsvm file you want to load in, ```cacheprefix``` is a +path to a cache file that xgboost will use for external memory cache. + +The following code was extracted from [../demo/guide-python/external_memory.py](../demo/guide-python/external_memory.py) +```python +dtrain = xgb.DMatrix('../data/agaricus.txt.train#dtrain.cache') +``` +You can find that there is an additional ```#dtrain.cache``` following the libsvm file, this is the name of the cache file. +For the CLI version, simply use ```"../data/agaricus.txt.train#dtrain.cache"``` in filename. + +Performance Note +==== +* the parameter ```nthread``` should be set to the number of ***real*** cores + - Most modern CPUs offer hyperthreading, which means you can have a 4 core cpu with 8 threads + - Set nthread to be 4 for maximum performance in such case + +Usage Note: +==== +* This is an experimental version + - If you would like to try and test it, report results to https://github.com/dmlc/xgboost/issues/244 +* Currently only importing from libsvm format is supported + - Contributions of ingestion from other common external memory data sources are welcome diff --git a/doc/parameter.md b/doc/parameter.md new file mode 100644 index 000000000..2ced29935 --- /dev/null +++ b/doc/parameter.md @@ -0,0 +1,111 @@ +XGBoost Parameters +==== +Before running XGBoost, we must set three types of parameters, general parameters, booster parameters and task parameters: +- General parameters relate to which booster we are using to do boosting, commonly tree or linear model +- Booster parameters depend on which booster you have chosen +- Task parameters decide on the learning scenario, for example, regression tasks may use different parameters from ranking tasks. +- In addition to these parameters, there can be console parameters that relate to the behavior of the console version of xgboost (e.g. 
when to save model) + +### Parameters in R Package +In the R package, you can use . (dot) to replace the underscore in the parameters, for example, you can use max.depth as max_depth. The underscore parameters are also valid in R. + +### General Parameters +* booster [default=gbtree] + - which booster to use, can be gbtree or gblinear. The details about different boosters are described [here](https://github.com/dmlc/xgboost/wiki/Boosters). +* silent [default=0] + - 0 means printing running messages, 1 means silent mode. +* nthread [default to maximum number of threads available if not set] + - number of parallel threads used to run xgboost +* num_pbuffer [set automatically by xgboost, no need to be set by user] + - size of prediction buffer, normally set to number of training instances. The buffers are used to save the prediction results of the last boosting step. +* num_feature [set automatically by xgboost, no need to be set by user] + - feature dimension used in boosting, set to maximum dimension of the feature + +### Booster Parameters +From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parameters. Parameters with or without the bst: prefix are equivalent (i.e. both bst:eta and eta are valid parameter settings). + +#### Parameter for Tree Booster +* eta [default=0.3] + - step size shrinkage used in update to prevent overfitting. After each boosting step, we can directly get the weights of new features, and eta actually shrinks the feature weights to make the boosting process more conservative. +* gamma + - minimum loss reduction required to make a further partition on a leaf node of the tree. The larger, the more conservative the algorithm will be. +* max_depth [default=6] + - maximum depth of a tree +* min_child_weight [default=1] + - minimum sum of instance weight(hessian) needed in a child. 
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. +* max_delta_step [default=0] + - Maximum delta step we allow each tree's weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value, it can help making the update step more conservative. Usually this parameter is not needed, but it might help in logistic regression when class is extremely imbalanced. Set it to value of 1-10 might help control the update +* subsample [default=1] + - subsample ratio of the training instance. Setting it to 0.5 means that XGBoost randomly collected half of the data instances to grow trees and this will prevent overfitting. +* colsample_bytree [default=1] + - subsample ratio of columns when constructing each tree. 
+ +#### Parameter for Linear Booster +* lambda [default=0] + - L2 regularization term on weights +* alpha [default=0] + - L1 regularization term on weights +* lambda_bias + - L2 regularization term on bias, default 0(no L1 reg on bias because it is not important) + +### Task Parameters +* objective [ default=reg:linear ] + - specify the learning task and the corresponding learning objective, and the objective options are below: + - "reg:linear" --linear regression + - "reg:logistic" --logistic regression + - "binary:logistic" --logistic regression for binary classification, output probability + - "binary:logitraw" --logistic regression for binary classification, output score before logistic transformation + - "multi:softmax" --set XGBoost to do multiclass classification using the softmax objective, you also need to set num_class(number of classes) + - "multi:softprob" --same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probability of each data point belonging to each class. + - "rank:pairwise" --set XGBoost to do ranking task by minimizing the pairwise loss +* base_score [ default=0.5 ] + - the initial prediction score of all instances, global bias +* eval_metric [ default according to objective ] + - evaluation metrics for validation data, a default metric will be assigned according to objective( rmse for regression, and error for classification, mean average precision for ranking ) + - User can add multiple evaluation metrics, for python user, remember to pass the metrics in as list of parameters pairs instead of map, so that latter 'eval_metric' won't override previous one + - The choices are listed below: + - "rmse": [root mean square error](http://en.wikipedia.org/wiki/Root_mean_square_error) + - "logloss": negative [log-likelihood](http://en.wikipedia.org/wiki/Log-likelihood) + - "error": Binary classification error rate. It is calculated as #(wrong cases)/#(all cases). 
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. + - "merror": Multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases). + - "mlogloss": Multiclass logloss + - "auc": [Area under the curve](http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve) for ranking evaluation. + - "ndcg":[Normalized Discounted Cumulative Gain](http://en.wikipedia.org/wiki/NDCG) + - "map":[Mean average precision](http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision) + - "ndcg@n","map@n": n can be assigned as an integer to cut off the top positions in the lists for evaluation. + - "ndcg-","map-","ndcg@n-","map@n-": In XGBoost, NDCG and MAP will evaluate the score of a list without any positive samples as 1. By adding "-" in the evaluation metric XGBoost will evaluate these score as 0 to be consistent under some conditions. +training repeatively +* seed [ default=0 ] + - random number seed. + +### Console Parameters +The following parameters are only used in the console version of xgboost +* use_buffer [ default=1 ] + - whether create binary buffer for text input, this normally will speedup loading when do +* num_round + - the number of round for boosting. +* data + - The path of training data +* test:data + - The path of test data to do prediction +* save_period [default=0] + - the period to save the model, setting save_period=10 means that for every 10 rounds XGBoost will save the model, setting it to 0 means not save any model during training. 
+* task [default=train] options: train, pred, eval, dump + - train: training using data + - pred: making prediction for test:data + - eval: for evaluating statistics specified by eval[name]=filename + - dump: for dumping the learned model into text format (preliminary) +* model_in [default=NULL] + - path to input model, needed for test, eval, dump, if it is specified in training, xgboost will continue training from the input model +* model_out [default=NULL] + - path to output model after training finishes, if not specified, will output like 0003.model where 0003 is the number of rounds to do boosting. +* model_dir [default=models] + - The output directory of the saved models during training +* fmap + - feature map, used for dump model +* name_dump [default=dump.txt] + - name of model dump file +* name_pred [default=pred.txt] + - name of prediction file, used in pred mode +* pred_margin [default=0] + - predict margin instead of transformed probability diff --git a/doc/python.md b/doc/python.md new file mode 100644 index 000000000..233a6f797 --- /dev/null +++ b/doc/python.md @@ -0,0 +1,126 @@ +XGBoost Python Module +==== + +This page will introduce the XGBoost Python module, including: +* [Install](#install) +* [Data Interface](#data-interface) +* [Setting Parameters](#setting-parameters) +* [Train Model](#training-model) +* [Early Stopping](#early-stopping) +* [Prediction](#prediction) + +A [walk through python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for the UCI Mushroom dataset is provided. + += +#### Install + +To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrapper` directory run + +```shell +python setup.py install +``` +Then import the module in Python as usual +```python +import xgboost as xgb +``` + += +#### Data Interface +XGBoost python module is able to load data from a libsvm text format file, a NumPy 2D array, and an xgboost binary buffer file. 
The data will be stored in a ```DMatrix``` object. + +* To load a libsvm text format file or an XGBoost binary file into ```DMatrix```, the usage is like +```python +dtrain = xgb.DMatrix('train.svm.txt') +dtest = xgb.DMatrix('test.svm.buffer') +``` +* To load a numpy array into ```DMatrix```, the usage is like +```python +data = np.random.rand(5,10) # 5 entities, each contains 10 features +label = np.random.randint(2, size=5) # binary target +dtrain = xgb.DMatrix( data, label=label) +``` +* Build ```DMatrix``` from ```scipy.sparse``` +```python +csr = scipy.sparse.csr_matrix( (dat, (row,col)) ) +dtrain = xgb.DMatrix( csr ) +``` +* Saving ```DMatrix``` into an XGBoost binary file will make loading faster next time. The usage is like: +```python +dtrain = xgb.DMatrix('train.svm.txt') +dtrain.save_binary("train.buffer") +``` +* To handle missing values in ```DMatrix```, you can initialize the ```DMatrix``` like: +```python +dtrain = xgb.DMatrix( data, label=label, missing = -999.0) +``` +* Weight can be set when needed, like +```python +w = np.random.rand(5,1) +dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=w) +``` + + += +#### Setting Parameters +XGBoost uses a list of pairs to save [parameters](https://github.com/tqchen/xgboost/wiki/Parameters). E.g. +* Booster parameters +```python +param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' } +param['nthread'] = 4 +plst = param.items() +plst += [('eval_metric', 'auc')] # Multiple evals can be handled in this way +plst += [('eval_metric', 'ams@0')] +``` +* Specify validation sets to watch performance +```python +evallist = [(dtest,'eval'), (dtrain,'train')] +``` + += +#### Training Model +With the parameter list and data, you are able to train a model. +* Training +```python +num_round = 10 +bst = xgb.train( plst, dtrain, num_round, evallist ) +``` +* Saving model +After training, you can save the model and dump it out. 
+```python +bst.save_model('0001.model') +``` +* Dump Model and Feature Map +You can dump the model to txt and review the meaning of the model +```python +# dump model +bst.dump_model('dump.raw.txt') +# dump model with feature map +bst.dump_model('dump.raw.txt','featmap.txt') +``` +* Loading model +After you save your model, you can load the model file at any time by using +```python +bst = xgb.Booster({'nthread':4}) #init model +bst.load_model("model.bin") # load model +``` += +#### Early stopping + +If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in `evals`. If there's more than one, it will use the last. + +`train(..., evals=evals, early_stopping_rounds=10)` + +The model will train until the validation score stops improving. Validation error needs to decrease at least every `early_stopping_rounds` to continue training. + +If early stopping occurs, the model will have two additional fields: `bst.best_score` and `bst.best_iteration`. Note that `train()` will return a model from the last iteration, not the best one. + +This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). + += +#### Prediction +After training or loading a model and preparing the data, you can start to do prediction. +```python +data = np.random.rand(7,10) # 7 entities, each contains 10 features +dtest = xgb.DMatrix( data, missing = -999.0 ) +ypred = bst.predict( dtest ) +```