From 226d26d40c7a7c44e607dab2a7ae476b3e15fd58 Mon Sep 17 00:00:00 2001 From: tqchen Date: Tue, 2 Sep 2014 17:18:17 -0700 Subject: [PATCH] still buggy --- src/io/page_fmatrix-inl.hpp | 9 +++++++-- src/utils/matrix_csr.h | 21 ++++++++++++--------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 4189c0c85..9e586e1c4 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -132,6 +132,7 @@ class CSCMatrixManager { "invalid column buffer format"); p_page->col_data.push_back(ColBatch::Inst(p_data, len)); p_page->col_index.push_back(cidx); + return true; } // the following are in memory auxiliary data structure /*! \brief top of reader position */ @@ -159,6 +160,7 @@ class ThreadColPageIterator : public utils::IIterator { float page_ratio, bool silent) { itr_.SetParam("buffer_size", "2"); itr_.get_factory().Setup(fi, page_ratio); + itr_.Init(); if (!silent) { utils::Printf("ThreadColPageIterator: finish initialzing, %u columns\n", static_cast(col_ptr().size() - 1)); @@ -239,8 +241,11 @@ class FMatrixPage : public IFMatrix { } virtual void InitColAccess(float pkeep = 1.0f) { if (this->HaveColAccess()) return; - this->InitColData(pkeep, fname_cbuffer_.c_str(), - 64 << 20, 5); + if (!this->LoadColData()) { + this->InitColData(pkeep, fname_cbuffer_.c_str(), + 64 << 20, 5); + utils::Check(this->LoadColData(), "fail to read in column data"); + } } /*! * \brief get the row iterator associated with FMatrix diff --git a/src/utils/matrix_csr.h b/src/utils/matrix_csr.h index e4c410511..ea5bc8b2d 100644 --- a/src/utils/matrix_csr.h +++ b/src/utils/matrix_csr.h @@ -6,6 +6,7 @@ * \author Tianqi Chen */ #include +#include #include #include "./io.h" #include "./utils.h" @@ -156,7 +157,7 @@ struct SparseCSRFileBuilder { for (size_t i = 1; i < rptr.size(); i++) { nelem += rptr[i]; rptr[i] = nelem; - } + } begin_data = static_cast(fo->Tell()) + sizeof(SizeType); SizeType begin_meta = begin_data + nelem * sizeof(IndexType); fo->Write(&begin_meta, sizeof(begin_meta)); @@ -166,8 +167,8 @@ struct SparseCSRFileBuilder { buffer_rptr.resize(rptr.size()); buffer_temp.reserve(buffer_size); buffer_data.resize(buffer_size); - saved_offset.clear(); - saved_offset.resize(rptr.size() - 1, 0); + saved_offset = rptr; + saved_offset.resize(rptr.size() - 1); this->ClearBuffer(); } /*! \brief step 4: push element into buffer */ @@ -176,7 +177,8 @@ struct SparseCSRFileBuilder { this->WriteBuffer(); this->ClearBuffer(); } - buffer_temp.push_back(std::make_pair(row_id, col_id)); + buffer_rptr[row_id + 1] += 1; + buffer_temp.push_back(std::make_pair(row_id, col_id)); } /*! \brief finalize the construction */ inline void Finalize(void) { @@ -190,14 +192,14 @@ struct SparseCSRFileBuilder { inline void SortRows(Comparator comp, size_t step) { for (size_t i = 0; i < rptr.size() - 1; i += step) { bst_omp_uint begin = static_cast(i); - bst_omp_uint end = static_cast(std::min(rptr.size(), i + step)); + bst_omp_uint end = static_cast(std::min(rptr.size() - 1, i + step)); if (rptr[end] != rptr[begin]) { fo->Seek(begin_data + rptr[begin] * sizeof(IndexType)); buffer_data.resize(rptr[end] - rptr[begin]); fo->Read(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); // do parallel sorting #pragma omp parallel for schedule(static) - for (bst_omp_uint j = begin; j < end; ++j){ + for (bst_omp_uint j = begin; j < end; ++j) { std::sort(&buffer_data[0] + rptr[j] - rptr[begin], &buffer_data[0] + rptr[j+1] - rptr[begin], comp); @@ -206,6 +208,7 @@ struct SparseCSRFileBuilder { fo->Write(BeginPtr(buffer_data), (rptr[end] - rptr[begin]) * sizeof(IndexType)); } } + printf("CSV::begin_dat=%lu\n", begin_data); } protected: inline void WriteBuffer(void) { @@ -220,11 +223,11 @@ struct SparseCSRFileBuilder { buffer_data[rp++] = buffer_temp[i].second; } // write out - for (size_t i = 0; i < buffer_rptr.size(); ++i) { + for (size_t i = 0; i < buffer_rptr.size() - 1; ++i) { size_t nelem = buffer_rptr[i+1] - buffer_rptr[i]; if (nelem != 0) { - utils::Assert(saved_offset[i] < rptr[i+1], "data exceed bound"); - fo->Seek((rptr[i] + saved_offset[i]) * sizeof(IndexType) + begin_data); + utils::Assert(saved_offset[i] + nelem <= rptr[i+1], "data exceed bound"); + fo->Seek(saved_offset[i] * sizeof(IndexType) + begin_data); fo->Write(&buffer_data[0] + buffer_rptr[i], nelem * sizeof(IndexType)); saved_offset[i] += nelem; }