Fix CPU hist init for sparse dataset. (#4625)
* Fix CPU hist init for sparse dataset.
* Implement sparse histogram cut.
* Allow empty features.
* Fix Windows build; don't use sparse in distributed environment.
* Comments.
* Smaller threshold.
* Fix Windows OpenMP.
* Fix MSVC lambda capture.
* Fix MSVC macro.
* Fix MSVC initialization list.
* Fix MSVC initialization list x2.
* Preserve categorical feature behavior.
* Rename matrix to sparse cuts.
* Reuse UseGroup.
* Check for categorical data when adding cut.
  Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
* Sanity check.
* Fix comments.
* Fix comment.
This commit is contained in:
committed by
Philip Hyunsu Cho
parent
b7a1f22d24
commit
d9a47794a5
@@ -75,7 +75,7 @@ class ColumnMatrix {
|
||||
// construct column matrix from GHistIndexMatrix
|
||||
inline void Init(const GHistIndexMatrix& gmat,
|
||||
double sparse_threshold) {
|
||||
const int32_t nfeature = static_cast<int32_t>(gmat.cut.row_ptr.size() - 1);
|
||||
const int32_t nfeature = static_cast<int32_t>(gmat.cut.Ptrs().size() - 1);
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
|
||||
// identify type of each column
|
||||
@@ -85,7 +85,7 @@ class ColumnMatrix {
|
||||
|
||||
uint32_t max_val = std::numeric_limits<uint32_t>::max();
|
||||
for (int32_t fid = 0; fid < nfeature; ++fid) {
|
||||
CHECK_LE(gmat.cut.row_ptr[fid + 1] - gmat.cut.row_ptr[fid], max_val);
|
||||
CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
|
||||
}
|
||||
|
||||
gmat.GetFeatureCounts(&feature_counts_[0]);
|
||||
@@ -123,7 +123,7 @@ class ColumnMatrix {
|
||||
// store least bin id for each feature
|
||||
index_base_.resize(nfeature);
|
||||
for (int32_t fid = 0; fid < nfeature; ++fid) {
|
||||
index_base_[fid] = gmat.cut.row_ptr[fid];
|
||||
index_base_[fid] = gmat.cut.Ptrs()[fid];
|
||||
}
|
||||
|
||||
// pre-fill index_ for dense columns
|
||||
@@ -150,9 +150,9 @@ class ColumnMatrix {
|
||||
size_t fid = 0;
|
||||
for (size_t i = ibegin; i < iend; ++i) {
|
||||
const uint32_t bin_id = gmat.index[i];
|
||||
while (bin_id >= gmat.cut.row_ptr[fid + 1]) {
|
||||
++fid;
|
||||
}
|
||||
auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
|
||||
gmat.cut.Ptrs().cend(), bin_id);
|
||||
fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
|
||||
if (type_[fid] == kDenseColumn) {
|
||||
uint32_t* begin = &index_[boundary_[fid].index_begin];
|
||||
begin[rid] = bin_id - index_base_[fid];
|
||||
|
||||
Reference in New Issue
Block a user