diff --git a/NEWS.md b/NEWS.md index 84eea6cfe..da512f4b1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ This file records the changes in xgboost library in reverse chronological order. ## in progress version * Refactored gbm to allow more friendly cache strategy - Specialized some prediction routine +* Automatically remove NaN values from the input data when it is sparse. + - This can solve some of the user-reported problems with istart != hist.size ## v0.6 (2016.07.29) * Version 0.5 is skipped due to major improvements in the core diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 0f535b250..f1c02188f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -238,22 +238,31 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, API_BEGIN(); data::SimpleCSRSource& mat = *source; - mat.row_ptr_.resize(nindptr); - for (size_t i = 0; i < nindptr; ++i) { - mat.row_ptr_[i] = indptr[i]; - } - mat.row_data_.resize(nelem); - for (size_t i = 0; i < nelem; ++i) { - mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]); - mat.info.num_col = std::max(mat.info.num_col, - static_cast(indices[i] + 1)); + mat.row_ptr_.reserve(nindptr); + mat.row_data_.reserve(nelem); + mat.row_ptr_.resize(1); + mat.row_ptr_[0] = 0; + size_t num_column = 0; + for (size_t i = 1; i < nindptr; ++i) { + for (size_t j = indptr[i - 1]; j < indptr[i]; ++j) { + if (!common::CheckNAN(data[j])) { + // automatically skip nan. 
+ mat.row_data_.emplace_back(RowBatch::Entry(indices[j], data[j])); + num_column = std::max(num_column, static_cast(indices[j] + 1)); + } + } + mat.row_ptr_.push_back(mat.row_data_.size()); } + + mat.info.num_col = num_column; if (num_col > 0) { - CHECK_LE(mat.info.num_col, num_col); + CHECK_LE(mat.info.num_col, num_col) + << "num_col=" << num_col << " vs " << mat.info.num_col; mat.info.num_col = num_col; } mat.info.num_row = nindptr - 1; - mat.info.num_nonzero = nelem; + LOG(INFO) << "num_row=" << mat.info.num_row; + mat.info.num_nonzero = mat.row_data_.size(); *out = new std::shared_ptr(DMatrix::Create(std::move(source))); API_END(); } @@ -291,7 +300,9 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, for (omp_ulong i = 0; i < static_cast(ncol); ++i) { // NOLINT(*) int tid = omp_get_thread_num(); for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.AddBudget(indices[j], tid); + if (!common::CheckNAN(data[j])) { + builder.AddBudget(indices[j], tid); + } } } builder.InitStorage(); @@ -299,9 +310,11 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, for (omp_ulong i = 0; i < static_cast(ncol); ++i) { // NOLINT(*) int tid = omp_get_thread_num(); for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) { - builder.Push(indices[j], - RowBatch::Entry(static_cast(i), data[j]), - tid); + if (!common::CheckNAN(data[j])) { + builder.Push(indices[j], + RowBatch::Entry(static_cast(i), data[j]), + tid); + } } } mat.info.num_row = mat.row_ptr_.size() - 1; diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 5c4126702..60a8eb75b 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -670,9 +670,8 @@ class ColMaker: public TreeUpdater { #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < ndata; ++i) { const bst_uint ridx = rowset[i]; - if (ridx >= position.size()) { - LOG(INFO) << "ridx exceed bound\n"; - } + CHECK_LT(ridx, position.size()) + << "ridx exceed bound " << "ridx="<< 
ridx << " pos=" << position.size(); const int nid = this->DecodePosition(ridx); if (tree[nid].is_leaf()) { // mark finish when it is not a fresh leaf diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index c8fe5c7a3..87c73e9f8 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -298,8 +298,15 @@ class CQHistMaker: public HistMaker { hist.data[istart].Add(gstats); } else { while (istart < hist.size && !(fv < hist.cut[istart])) ++istart; - CHECK_NE(istart, hist.size); - hist.data[istart].Add(gstats); + if (istart != hist.size) { + hist.data[istart].Add(gstats); + } else { + LOG(INFO) << "fv=" << fv << ", hist.size=" << hist.size; + for (size_t i = 0; i < hist.size; ++i) { + LOG(INFO) << "hist[" << i << "]=" << hist.cut[i]; + } + LOG(FATAL) << "fv=" << fv << ", hist.last=" << hist.cut[hist.size - 1]; + } } } };