Fix CPU hist init for sparse dataset. (#4625)

* Fix CPU hist init for sparse dataset.

* Implement sparse histogram cut.
* Allow empty features.

* Fix windows build, don't use sparse in distributed environment.

* Comments.

* Smaller threshold.

* Fix windows omp.

* Fix msvc lambda capture.

* Fix MSVC macro.

* Fix MSVC initialization list.

* Fix MSVC initialization list x2.

* Preserve categorical feature behavior.

* Rename matrix to sparse cuts.
* Reuse UseGroup.
* Check for categorical data when adding cut.

Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu>

* Sanity check.

* Fix comments.

* Fix comment.
This commit is contained in:
Jiaming Yuan
2019-07-04 19:27:03 -04:00
committed by Philip Hyunsu Cho
parent b7a1f22d24
commit d9a47794a5
33 changed files with 681 additions and 299 deletions

View File

@@ -75,7 +75,7 @@ class ColumnMatrix {
// construct column matrix from GHistIndexMatrix
inline void Init(const GHistIndexMatrix& gmat,
double sparse_threshold) {
const int32_t nfeature = static_cast<int32_t>(gmat.cut.row_ptr.size() - 1);
const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
const size_t nrow = gmat.row_ptr.size() - 1;
// identify type of each column
@@ -85,7 +85,7 @@ class ColumnMatrix {
uint32_t max_val = std::numeric_limits<uint32_t>::max();
for (int32_t fid = 0; fid < nfeature; ++fid) {
CHECK_LE(gmat.cut.row_ptr[fid + 1] - gmat.cut.row_ptr[fid], max_val);
CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
}
gmat.GetFeatureCounts(&feature_counts_[0]);
@@ -123,7 +123,7 @@ class ColumnMatrix {
// store least bin id for each feature
index_base_.resize(nfeature);
for (int32_t fid = 0; fid < nfeature; ++fid) {
index_base_[fid] = gmat.cut.row_ptr[fid];
index_base_[fid] = gmat.cut.Ptrs()[fid];
}
// pre-fill index_ for dense columns
@@ -150,9 +150,9 @@ class ColumnMatrix {
size_t fid = 0;
for (size_t i = ibegin; i < iend; ++i) {
const uint32_t bin_id = gmat.index[i];
while (bin_id >= gmat.cut.row_ptr[fid + 1]) {
++fid;
}
auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
gmat.cut.Ptrs().cend(), bin_id);
fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
if (type_[fid] == kDenseColumn) {
uint32_t* begin = &index_[boundary_[fid].index_begin];
begin[rid] = bin_id - index_base_[fid];