Fix CPU hist init for sparse dataset. (#4625)
* Fix CPU hist init for sparse dataset. * Implement sparse histogram cut. * Allow empty features. * Fix windows build, don't use sparse in distributed environment. * Comments. * Smaller threshold. * Fix windows omp. * Fix msvc lambda capture. * Fix MSVC macro. * Fix MSVC initialization list. * Fix MSVC initialization list x2. * Preserve categorical feature behavior. * Rename matrix to sparse cuts. * Reuse UseGroup. * Check for categorical data when adding cut. Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu> * Sanity check. * Fix comments. * Fix comment.
This commit is contained in:
committed by
Philip Hyunsu Cho
parent
b7a1f22d24
commit
d9a47794a5
@@ -170,7 +170,7 @@ void FeatureInteractionConstraint::ClearBuffers() {
|
||||
CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
|
||||
int constexpr kBlockThreads = 256;
|
||||
const int n_grids = static_cast<int>(
|
||||
dh::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
|
||||
common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
|
||||
ClearBuffersKernel<<<n_grids, kBlockThreads>>>(
|
||||
output_buffer_bits_, input_buffer_bits_);
|
||||
}
|
||||
@@ -227,7 +227,7 @@ common::Span<int32_t> FeatureInteractionConstraint::Query(
|
||||
|
||||
int constexpr kBlockThreads = 256;
|
||||
const int n_grids = static_cast<int>(
|
||||
dh::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
|
||||
common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
|
||||
SetInputBufferKernel<<<n_grids, kBlockThreads>>>(feature_list, input_buffer_bits_);
|
||||
|
||||
QueryFeatureListKernel<<<n_grids, kBlockThreads>>>(
|
||||
@@ -328,8 +328,8 @@ void FeatureInteractionConstraint::Split(
|
||||
BitField right = s_node_constraints_[right_id];
|
||||
|
||||
dim3 const block3(16, 64, 1);
|
||||
dim3 const grid3(dh::DivRoundUp(n_sets_, 16),
|
||||
dh::DivRoundUp(s_fconstraints_.size(), 64));
|
||||
dim3 const grid3(common::DivRoundUp(n_sets_, 16),
|
||||
common::DivRoundUp(s_fconstraints_.size(), 64));
|
||||
RestoreFeatureListFromSetsKernel<<<grid3, block3>>>
|
||||
(feature_buffer_,
|
||||
feature_id,
|
||||
@@ -339,7 +339,7 @@ void FeatureInteractionConstraint::Split(
|
||||
s_sets_ptr_);
|
||||
|
||||
int constexpr kBlockThreads = 256;
|
||||
const int n_grids = static_cast<int>(dh::DivRoundUp(node.Size(), kBlockThreads));
|
||||
const int n_grids = static_cast<int>(common::DivRoundUp(node.Size(), kBlockThreads));
|
||||
InteractionConstraintSplitKernel<<<n_grids, kBlockThreads>>>
|
||||
(feature_buffer_,
|
||||
feature_id,
|
||||
|
||||
@@ -76,7 +76,7 @@ static const int kNoneKey = -100;
|
||||
*/
|
||||
template <int BLKDIM_L1L3 = 256>
|
||||
int ScanTempBufferSize(int size) {
|
||||
int num_blocks = dh::DivRoundUp(size, BLKDIM_L1L3);
|
||||
int num_blocks = common::DivRoundUp(size, BLKDIM_L1L3);
|
||||
return num_blocks;
|
||||
}
|
||||
|
||||
@@ -250,7 +250,7 @@ void ReduceScanByKey(common::Span<GradientPair> sums,
|
||||
common::Span<GradientPair> tmpScans,
|
||||
common::Span<int> tmpKeys,
|
||||
common::Span<const int> colIds, NodeIdT nodeStart) {
|
||||
int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
|
||||
int nBlks = common::DivRoundUp(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
|
||||
CubScanByKeyL1<BLKDIM_L1L3>
|
||||
<<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
|
||||
@@ -448,7 +448,7 @@ void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
|
||||
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
|
||||
*(devices.begin()), nodeSplits.data(), nUniqKeys,
|
||||
ExactSplitCandidate());
|
||||
int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
int nBlks = common::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
switch (algo) {
|
||||
case kAbkGmem:
|
||||
AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
|
||||
@@ -793,11 +793,11 @@ class GPUMaker : public TreeUpdater {
|
||||
const int BlkDim = 256;
|
||||
const int ItemsPerThread = 4;
|
||||
// assign default node ids first
|
||||
int nBlks = dh::DivRoundUp(n_rows_, BlkDim);
|
||||
int nBlks = common::DivRoundUp(n_rows_, BlkDim);
|
||||
FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.data(),
|
||||
nodes_.data(), n_rows_);
|
||||
// evaluate the correct child indices of non-missing values next
|
||||
nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
|
||||
nBlks = common::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
|
||||
AssignNodeIds<<<nBlks, BlkDim>>>(
|
||||
node_assigns_per_inst_.data(), nodeLocations_.Current(),
|
||||
nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
|
||||
@@ -823,7 +823,7 @@ class GPUMaker : public TreeUpdater {
|
||||
|
||||
void MarkLeaves() {
|
||||
const int BlkDim = 128;
|
||||
int nBlks = dh::DivRoundUp(maxNodes_, BlkDim);
|
||||
int nBlks = common::DivRoundUp(maxNodes_, BlkDim);
|
||||
MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.data(), maxNodes_);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -480,8 +480,8 @@ __global__ void CompressBinEllpackKernel(
|
||||
common::CompressedByteT* __restrict__ buffer, // gidx_buffer
|
||||
const size_t* __restrict__ row_ptrs, // row offset of input data
|
||||
const Entry* __restrict__ entries, // One batch of input data
|
||||
const float* __restrict__ cuts, // HistCutMatrix::cut
|
||||
const uint32_t* __restrict__ cut_rows, // HistCutMatrix::row_ptrs
|
||||
const float* __restrict__ cuts, // HistogramCuts::cut
|
||||
const uint32_t* __restrict__ cut_rows, // HistogramCuts::row_ptrs
|
||||
size_t base_row, // batch_row_begin
|
||||
size_t n_rows,
|
||||
size_t row_stride,
|
||||
@@ -593,7 +593,7 @@ struct DeviceShard {
|
||||
std::unique_ptr<RowPartitioner> row_partitioner;
|
||||
DeviceHistogram<GradientSumT> hist;
|
||||
|
||||
/*! \brief row_ptr from HistCutMatrix. */
|
||||
/*! \brief row_ptr from HistogramCuts. */
|
||||
common::Span<uint32_t> feature_segments;
|
||||
/*! \brief minimum value for each feature. */
|
||||
common::Span<bst_float> min_fvalue;
|
||||
@@ -654,10 +654,10 @@ struct DeviceShard {
|
||||
}
|
||||
|
||||
void InitCompressedData(
|
||||
const common::HistCutMatrix& hmat, size_t row_stride, bool is_dense);
|
||||
const common::HistogramCuts& hmat, size_t row_stride, bool is_dense);
|
||||
|
||||
void CreateHistIndices(
|
||||
const SparsePage &row_batch, const common::HistCutMatrix &hmat,
|
||||
const SparsePage &row_batch, const common::HistogramCuts &hmat,
|
||||
const RowStateOnDevice &device_row_state, int rows_per_batch);
|
||||
|
||||
~DeviceShard() {
|
||||
@@ -718,7 +718,7 @@ struct DeviceShard {
|
||||
// Work out cub temporary memory requirement
|
||||
GPUTrainingParam gpu_param(param);
|
||||
DeviceSplitCandidateReduceOp op(gpu_param);
|
||||
size_t temp_storage_bytes;
|
||||
size_t temp_storage_bytes = 0;
|
||||
DeviceSplitCandidate*dummy = nullptr;
|
||||
cub::DeviceReduce::Reduce(
|
||||
nullptr, temp_storage_bytes, dummy,
|
||||
@@ -806,7 +806,7 @@ struct DeviceShard {
|
||||
const int items_per_thread = 8;
|
||||
const int block_threads = 256;
|
||||
const int grid_size = static_cast<int>(
|
||||
dh::DivRoundUp(n_elements, items_per_thread * block_threads));
|
||||
common::DivRoundUp(n_elements, items_per_thread * block_threads));
|
||||
if (grid_size <= 0) {
|
||||
return;
|
||||
}
|
||||
@@ -1106,9 +1106,9 @@ struct DeviceShard {
|
||||
|
||||
template <typename GradientSumT>
|
||||
inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
const common::HistCutMatrix &hmat, size_t row_stride, bool is_dense) {
|
||||
n_bins = hmat.row_ptr.back();
|
||||
int null_gidx_value = hmat.row_ptr.back();
|
||||
const common::HistogramCuts &hmat, size_t row_stride, bool is_dense) {
|
||||
n_bins = hmat.Ptrs().back();
|
||||
int null_gidx_value = hmat.Ptrs().back();
|
||||
|
||||
CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
|
||||
<< "Max leaves and max depth cannot both be unconstrained for "
|
||||
@@ -1121,14 +1121,14 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
&gpair, n_rows,
|
||||
&prediction_cache, n_rows,
|
||||
&node_sum_gradients_d, max_nodes,
|
||||
&feature_segments, hmat.row_ptr.size(),
|
||||
&gidx_fvalue_map, hmat.cut.size(),
|
||||
&min_fvalue, hmat.min_val.size(),
|
||||
&feature_segments, hmat.Ptrs().size(),
|
||||
&gidx_fvalue_map, hmat.Values().size(),
|
||||
&min_fvalue, hmat.MinValues().size(),
|
||||
&monotone_constraints, param.monotone_constraints.size());
|
||||
|
||||
dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.cut);
|
||||
dh::CopyVectorToDeviceSpan(min_fvalue, hmat.min_val);
|
||||
dh::CopyVectorToDeviceSpan(feature_segments, hmat.row_ptr);
|
||||
dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
|
||||
dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
|
||||
dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
|
||||
dh::CopyVectorToDeviceSpan(monotone_constraints, param.monotone_constraints);
|
||||
|
||||
node_sum_gradients.resize(max_nodes);
|
||||
@@ -1153,26 +1153,26 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
// check if we can use shared memory for building histograms
|
||||
// (assuming we need at least 2 CTAs per SM to maintain decent latency
|
||||
// hiding)
|
||||
auto histogram_size = sizeof(GradientSumT) * hmat.row_ptr.back();
|
||||
auto histogram_size = sizeof(GradientSumT) * hmat.Ptrs().back();
|
||||
auto max_smem = dh::MaxSharedMemory(device_id);
|
||||
if (histogram_size <= max_smem) {
|
||||
use_shared_memory_histograms = true;
|
||||
}
|
||||
|
||||
// Init histogram
|
||||
hist.Init(device_id, hmat.NumBins());
|
||||
hist.Init(device_id, hmat.Ptrs().back());
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
inline void DeviceShard<GradientSumT>::CreateHistIndices(
|
||||
const SparsePage &row_batch,
|
||||
const common::HistCutMatrix &hmat,
|
||||
const common::HistogramCuts &hmat,
|
||||
const RowStateOnDevice &device_row_state,
|
||||
int rows_per_batch) {
|
||||
// Has any been allocated for me in this batch?
|
||||
if (!device_row_state.rows_to_process_from_batch) return;
|
||||
|
||||
unsigned int null_gidx_value = hmat.row_ptr.back();
|
||||
unsigned int null_gidx_value = hmat.Ptrs().back();
|
||||
size_t row_stride = this->ellpack_matrix.row_stride;
|
||||
|
||||
const auto &offset_vec = row_batch.offset.ConstHostVector();
|
||||
@@ -1184,8 +1184,8 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
|
||||
static_cast<size_t>(device_row_state.rows_to_process_from_batch));
|
||||
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
|
||||
|
||||
size_t gpu_nbatches = dh::DivRoundUp(device_row_state.rows_to_process_from_batch,
|
||||
gpu_batch_nrows);
|
||||
size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
|
||||
gpu_batch_nrows);
|
||||
|
||||
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
|
||||
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
|
||||
@@ -1216,8 +1216,8 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
|
||||
(entries_d.data().get(), data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
const dim3 block3(32, 8, 1); // 256 threads
|
||||
const dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
|
||||
dh::DivRoundUp(row_stride, block3.y), 1);
|
||||
const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
|
||||
common::DivRoundUp(row_stride, block3.y), 1);
|
||||
CompressBinEllpackKernel<<<grid3, block3>>>
|
||||
(common::CompressedBufferWriter(num_symbols),
|
||||
gidx_buffer.data(),
|
||||
@@ -1361,13 +1361,13 @@ class GPUHistMakerSpecialised {
|
||||
});
|
||||
|
||||
monitor_.StartCuda("Quantiles");
|
||||
// Create the quantile sketches for the dmatrix and initialize HistCutMatrix
|
||||
// Create the quantile sketches for the dmatrix and initialize HistogramCuts
|
||||
size_t row_stride = common::DeviceSketch(param_, *learner_param_,
|
||||
hist_maker_param_.gpu_batch_nrows,
|
||||
dmat, &hmat_);
|
||||
monitor_.StopCuda("Quantiles");
|
||||
|
||||
n_bins_ = hmat_.row_ptr.back();
|
||||
n_bins_ = hmat_.Ptrs().back();
|
||||
|
||||
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
|
||||
|
||||
@@ -1475,9 +1475,9 @@ class GPUHistMakerSpecialised {
|
||||
return true;
|
||||
}
|
||||
|
||||
TrainParam param_; // NOLINT
|
||||
common::HistCutMatrix hmat_; // NOLINT
|
||||
MetaInfo* info_; // NOLINT
|
||||
TrainParam param_; // NOLINT
|
||||
common::HistogramCuts hmat_; // NOLINT
|
||||
MetaInfo* info_; // NOLINT
|
||||
|
||||
std::vector<std::unique_ptr<DeviceShard<GradientSumT>>> shards_; // NOLINT
|
||||
|
||||
|
||||
@@ -247,15 +247,15 @@ int32_t QuantileHistMaker::Builder::FindSplitCond(int32_t nid,
|
||||
// Categorize member rows
|
||||
const bst_uint fid = node.SplitIndex();
|
||||
const bst_float split_pt = node.SplitCond();
|
||||
const uint32_t lower_bound = gmat.cut.row_ptr[fid];
|
||||
const uint32_t upper_bound = gmat.cut.row_ptr[fid + 1];
|
||||
const uint32_t lower_bound = gmat.cut.Ptrs()[fid];
|
||||
const uint32_t upper_bound = gmat.cut.Ptrs()[fid + 1];
|
||||
int32_t split_cond = -1;
|
||||
// convert floating-point split_pt into corresponding bin_id
|
||||
// split_cond = -1 indicates that split_pt is less than all known cut points
|
||||
CHECK_LT(upper_bound,
|
||||
static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
|
||||
for (uint32_t i = lower_bound; i < upper_bound; ++i) {
|
||||
if (split_pt == gmat.cut.cut[i]) {
|
||||
if (split_pt == gmat.cut.Values()[i]) {
|
||||
split_cond = static_cast<int32_t>(i);
|
||||
}
|
||||
}
|
||||
@@ -533,7 +533,7 @@ void QuantileHistMaker::Builder::BuildHistsBatch(const std::vector<ExpandEntry>&
|
||||
perf_monitor.TickStart();
|
||||
const size_t block_size_rows = 256;
|
||||
const size_t nthread = static_cast<size_t>(this->nthread_);
|
||||
const size_t nbins = gmat.cut.row_ptr.back();
|
||||
const size_t nbins = gmat.cut.Ptrs().back();
|
||||
const size_t hist_size = 2 * nbins;
|
||||
|
||||
hist_buffers->resize(nodes.size());
|
||||
@@ -856,8 +856,8 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache(
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(guided)
|
||||
for (int32_t k = 0; k < tasks_elem.size(); ++k) {
|
||||
#pragma omp parallel for schedule(guided)
|
||||
for (omp_ulong k = 0; k < tasks_elem.size(); ++k) {
|
||||
const RowSetCollection::Elem rowset = tasks_elem[k];
|
||||
if (rowset.begin != nullptr && rowset.end != nullptr && rowset.node_id != -1) {
|
||||
const size_t nrows = rowset.Size();
|
||||
@@ -909,7 +909,7 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
|
||||
// clear local prediction cache
|
||||
leaf_value_cache_.clear();
|
||||
// initialize histogram collection
|
||||
uint32_t nbins = gmat.cut.row_ptr.back();
|
||||
uint32_t nbins = gmat.cut.Ptrs().back();
|
||||
hist_.Init(nbins);
|
||||
hist_buff_.Init(nbins);
|
||||
|
||||
@@ -999,7 +999,7 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
|
||||
const size_t ncol = info.num_col_;
|
||||
const size_t nnz = info.num_nonzero_;
|
||||
// number of discrete bins for feature 0
|
||||
const uint32_t nbins_f0 = gmat.cut.row_ptr[1] - gmat.cut.row_ptr[0];
|
||||
const uint32_t nbins_f0 = gmat.cut.Ptrs()[1] - gmat.cut.Ptrs()[0];
|
||||
if (nrow * ncol == nnz) {
|
||||
// dense data with zero-based indexing
|
||||
data_layout_ = kDenseDataZeroBased;
|
||||
@@ -1029,7 +1029,7 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& gmat,
|
||||
choose the column that has the smallest positive number of discrete bins.
|
||||
For dense data (with no missing value),
|
||||
the sum of gradient histogram is equal to snode[nid] */
|
||||
const std::vector<uint32_t>& row_ptr = gmat.cut.row_ptr;
|
||||
const std::vector<uint32_t>& row_ptr = gmat.cut.Ptrs();
|
||||
const auto nfeature = static_cast<bst_uint>(row_ptr.size() - 1);
|
||||
uint32_t min_nbins_per_feature = 0;
|
||||
for (bst_uint i = 0; i < nfeature; ++i) {
|
||||
@@ -1079,8 +1079,8 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch(
|
||||
// partial results
|
||||
std::vector<std::pair<SplitEntry, SplitEntry>> splits(tasks.size());
|
||||
// parallel enumeration
|
||||
#pragma omp parallel for schedule(guided)
|
||||
for (int32_t i = 0; i < tasks.size(); ++i) {
|
||||
#pragma omp parallel for schedule(guided)
|
||||
for (omp_ulong i = 0; i < tasks.size(); ++i) {
|
||||
// node_idx : offset within `nodes` list
|
||||
const int32_t node_idx = tasks[i].first;
|
||||
const size_t fid = tasks[i].second;
|
||||
@@ -1098,7 +1098,7 @@ void QuantileHistMaker::Builder::EvaluateSplitsBatch(
|
||||
|
||||
// reduce needed part of a hist here to have it in cache before enumeration
|
||||
if (!rabit::IsDistributed()) {
|
||||
const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
|
||||
const std::vector<uint32_t>& cut_ptr = gmat.cut.Ptrs();
|
||||
const size_t ibegin = 2 * cut_ptr[fid];
|
||||
const size_t iend = 2 * cut_ptr[fid + 1];
|
||||
ReduceHistograms(hist_data, sibling_hist_data, parent_hist_data, ibegin, iend, node_idx,
|
||||
@@ -1179,8 +1179,8 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step,
|
||||
CHECK(d_step == +1 || d_step == -1);
|
||||
|
||||
// aliases
|
||||
const std::vector<uint32_t>& cut_ptr = gmat.cut.row_ptr;
|
||||
const std::vector<bst_float>& cut_val = gmat.cut.cut;
|
||||
const std::vector<uint32_t>& cut_ptr = gmat.cut.Ptrs();
|
||||
const std::vector<bst_float>& cut_val = gmat.cut.Values();
|
||||
|
||||
// statistics on both sides of split
|
||||
GradStats c;
|
||||
@@ -1239,7 +1239,7 @@ bool QuantileHistMaker::Builder::EnumerateSplit(int d_step,
|
||||
|
||||
if (i == imin) {
|
||||
// for leftmost bin, left bound is the smallest feature value
|
||||
split_pt = gmat.cut.min_val[fid];
|
||||
split_pt = gmat.cut.MinValues()[fid];
|
||||
} else {
|
||||
split_pt = cut_val[i - 1];
|
||||
}
|
||||
|
||||
@@ -33,7 +33,6 @@ namespace common {
|
||||
}
|
||||
namespace tree {
|
||||
|
||||
using xgboost::common::HistCutMatrix;
|
||||
using xgboost::common::GHistIndexMatrix;
|
||||
using xgboost::common::GHistIndexBlockMatrix;
|
||||
using xgboost::common::GHistIndexRow;
|
||||
|
||||
Reference in New Issue
Block a user