Automatically remove NaN from input data when it is sparse. (#2062)
* [DATALoad] Automatically remove NaN when loading from a sparse matrix * Add log
This commit is contained in:
parent
5d093a7f4c
commit
fd19b7a188
2
NEWS.md
2
NEWS.md
@ -6,6 +6,8 @@ This file records the changes in xgboost library in reverse chronological order.
|
|||||||
## in progress version
|
## in progress version
|
||||||
* Refactored gbm to allow more friendly cache strategy
|
* Refactored gbm to allow more friendly cache strategy
|
||||||
- Specialized some prediction routine
|
- Specialized some prediction routine
|
||||||
|
* Automatically remove NaN from input data when it is sparse.
|
||||||
|
- This can solve some of the user-reported problems with the `istart != hist.size` check failing
|
||||||
|
|
||||||
## v0.6 (2016.07.29)
|
## v0.6 (2016.07.29)
|
||||||
* Version 0.5 is skipped due to major improvements in the core
|
* Version 0.5 is skipped due to major improvements in the core
|
||||||
|
|||||||
@ -238,22 +238,31 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
|||||||
|
|
||||||
API_BEGIN();
|
API_BEGIN();
|
||||||
data::SimpleCSRSource& mat = *source;
|
data::SimpleCSRSource& mat = *source;
|
||||||
mat.row_ptr_.resize(nindptr);
|
mat.row_ptr_.reserve(nindptr);
|
||||||
for (size_t i = 0; i < nindptr; ++i) {
|
mat.row_data_.reserve(nelem);
|
||||||
mat.row_ptr_[i] = indptr[i];
|
mat.row_ptr_.resize(1);
|
||||||
}
|
mat.row_ptr_[0] = 0;
|
||||||
mat.row_data_.resize(nelem);
|
size_t num_column = 0;
|
||||||
for (size_t i = 0; i < nelem; ++i) {
|
for (size_t i = 1; i < nindptr; ++i) {
|
||||||
mat.row_data_[i] = RowBatch::Entry(indices[i], data[i]);
|
for (size_t j = indptr[i - 1]; j < indptr[i]; ++j) {
|
||||||
mat.info.num_col = std::max(mat.info.num_col,
|
if (!common::CheckNAN(data[j])) {
|
||||||
static_cast<uint64_t>(indices[i] + 1));
|
// automatically skip nan.
|
||||||
|
mat.row_data_.emplace_back(RowBatch::Entry(indices[j], data[j]));
|
||||||
|
num_column = std::max(num_column, static_cast<size_t>(indices[j] + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mat.row_ptr_.push_back(mat.row_data_.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mat.info.num_col = num_column;
|
||||||
if (num_col > 0) {
|
if (num_col > 0) {
|
||||||
CHECK_LE(mat.info.num_col, num_col);
|
CHECK_LE(mat.info.num_col, num_col)
|
||||||
|
<< "num_col=" << num_col << " vs " << mat.info.num_col;
|
||||||
mat.info.num_col = num_col;
|
mat.info.num_col = num_col;
|
||||||
}
|
}
|
||||||
mat.info.num_row = nindptr - 1;
|
mat.info.num_row = nindptr - 1;
|
||||||
mat.info.num_nonzero = nelem;
|
LOG(INFO) << "num_row=" << mat.info.num_row;
|
||||||
|
mat.info.num_nonzero = mat.row_data_.size();
|
||||||
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
|
||||||
API_END();
|
API_END();
|
||||||
}
|
}
|
||||||
@ -291,7 +300,9 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
|||||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||||
builder.AddBudget(indices[j], tid);
|
if (!common::CheckNAN(data[j])) {
|
||||||
|
builder.AddBudget(indices[j], tid);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
builder.InitStorage();
|
builder.InitStorage();
|
||||||
@ -299,9 +310,11 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
|
|||||||
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
for (omp_ulong i = 0; i < static_cast<omp_ulong>(ncol); ++i) { // NOLINT(*)
|
||||||
int tid = omp_get_thread_num();
|
int tid = omp_get_thread_num();
|
||||||
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
for (size_t j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||||
builder.Push(indices[j],
|
if (!common::CheckNAN(data[j])) {
|
||||||
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
|
builder.Push(indices[j],
|
||||||
tid);
|
RowBatch::Entry(static_cast<bst_uint>(i), data[j]),
|
||||||
|
tid);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mat.info.num_row = mat.row_ptr_.size() - 1;
|
mat.info.num_row = mat.row_ptr_.size() - 1;
|
||||||
|
|||||||
@ -670,9 +670,8 @@ class ColMaker: public TreeUpdater {
|
|||||||
#pragma omp parallel for schedule(static)
|
#pragma omp parallel for schedule(static)
|
||||||
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
for (bst_omp_uint i = 0; i < ndata; ++i) {
|
||||||
const bst_uint ridx = rowset[i];
|
const bst_uint ridx = rowset[i];
|
||||||
if (ridx >= position.size()) {
|
CHECK_LT(ridx, position.size())
|
||||||
LOG(INFO) << "ridx exceed bound\n";
|
<< "ridx exceed bound " << "ridx="<< ridx << " pos=" << position.size();
|
||||||
}
|
|
||||||
const int nid = this->DecodePosition(ridx);
|
const int nid = this->DecodePosition(ridx);
|
||||||
if (tree[nid].is_leaf()) {
|
if (tree[nid].is_leaf()) {
|
||||||
// mark finish when it is not a fresh leaf
|
// mark finish when it is not a fresh leaf
|
||||||
|
|||||||
@ -298,8 +298,15 @@ class CQHistMaker: public HistMaker<TStats> {
|
|||||||
hist.data[istart].Add(gstats);
|
hist.data[istart].Add(gstats);
|
||||||
} else {
|
} else {
|
||||||
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
while (istart < hist.size && !(fv < hist.cut[istart])) ++istart;
|
||||||
CHECK_NE(istart, hist.size);
|
if (istart != hist.size) {
|
||||||
hist.data[istart].Add(gstats);
|
hist.data[istart].Add(gstats);
|
||||||
|
} else {
|
||||||
|
LOG(INFO) << "fv=" << fv << ", hist.size=" << hist.size;
|
||||||
|
for (size_t i = 0; i < hist.size; ++i) {
|
||||||
|
LOG(INFO) << "hist[" << i << "]=" << hist.cut[i];
|
||||||
|
}
|
||||||
|
LOG(FATAL) << "fv=" << fv << ", hist.last=" << hist.cut[hist.size - 1];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user