Optimizations for quantisation on device (#4572)

* Do not create device vectors for the entire sparse page while computing histograms.
  - While creating the compressed histogram indices, the row-pointer vector was created for the
    entire sparse page batch. This is needless, since we only process chunks at a time, sized by a
    slice of the total GPU memory.
  - This PR allocates only as much as is required to store the appropriate row pointers and
    entries for the chunk being processed (a sketch of this pattern follows the commit metadata
    below).
* Do not dereference row_ptrs once the device_vector has been created, to elide host copies of
  those counts.
  - Instead, grab the entry counts directly from the SparsePage offsets.
parent ba1d848767
commit 6757654337
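For illustration, here is a minimal host-side sketch (not part of the diff) of the per-chunk copy pattern described above. The names offset_vec, data_vec, batch_row_begin, and batch_row_end mirror the diff below, but CopyChunkToDevice itself and the local Entry struct are hypothetical stand-ins, not XGBoost code:

// Hypothetical sketch of the per-chunk allocation strategy (not XGBoost code).
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <cstddef>
#include <vector>

struct Entry { int index; float fvalue; };  // stand-in for xgboost::Entry

void CopyChunkToDevice(const std::vector<size_t>& offset_vec,  // CSR row offsets (host)
                       const std::vector<Entry>& data_vec,     // CSR entries (host)
                       size_t batch_row_begin, size_t batch_row_end) {
  size_t batch_nrows = batch_row_end - batch_row_begin;

  // Row pointers for this chunk only: batch_nrows + 1 values,
  // not one per row of the whole sparse page batch.
  thrust::device_vector<size_t> row_ptrs(batch_nrows + 1);
  thrust::copy(offset_vec.begin() + batch_row_begin,
               offset_vec.begin() + batch_row_end + 1,
               row_ptrs.begin());

  // Entry counts come straight from the host offsets; no device reads needed.
  size_t ent_cnt_begin = offset_vec[batch_row_begin];
  size_t ent_cnt_end   = offset_vec[batch_row_end];
  size_t n_entries = ent_cnt_end - ent_cnt_begin;

  // Copy only the entries that belong to this chunk.
  thrust::device_vector<Entry> entries_d(n_entries);
  thrust::copy(data_vec.begin() + ent_cnt_begin,
               data_vec.begin() + ent_cnt_end,
               entries_d.begin());

  // ... launch the compression kernel on (row_ptrs, entries_d) ...
}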
@@ -483,8 +483,6 @@ __global__ void CompressBinEllpackKernel(
     const uint32_t* __restrict__ cut_rows,  // HistCutMatrix::row_ptrs
     size_t base_row,                        // batch_row_begin
     size_t n_rows,
-    // row_ptr_begin: row_offset[base_row], the start position of base_row
-    size_t row_ptr_begin,
     size_t row_stride,
     unsigned int null_gidx_value) {
   size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
@@ -495,7 +493,7 @@ __global__ void CompressBinEllpackKernel(
   int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
   unsigned int bin = null_gidx_value;
   if (ifeature < row_length) {
-    Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature];
+    Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
     int feature = entry.index;
     float fvalue = entry.fvalue;
     // {feature_cuts, ncuts} forms the array of cuts of `feature'.
@@ -697,8 +695,6 @@ struct DeviceShard {
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPair> node_sum_gradients;
   common::Span<GradientPair> node_sum_gradients_d;
-  /*! \brief On-device feature set, only actually used on one of the devices */
-  dh::device_vector<int> feature_set_d;
   dh::device_vector<int64_t>
       left_counts;  // Useful to keep a bunch of zeroed memory for sort position
   /*! The row offset for this shard. */
@@ -1311,14 +1307,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
   size_t row_stride = this->ellpack_matrix.row_stride;

   const auto &offset_vec = row_batch.offset.ConstHostVector();
-  /*! \brief row offset in SparsePage (the input data). */
-  CHECK_LE(device_row_state.rows_to_process_from_batch, offset_vec.size());
-  dh::device_vector<size_t> row_ptrs(device_row_state.rows_to_process_from_batch+1);
-  thrust::copy(
-      offset_vec.data() + device_row_state.row_offset_in_current_batch,
-      offset_vec.data() + device_row_state.row_offset_in_current_batch +
-          device_row_state.rows_to_process_from_batch + 1,
-      row_ptrs.begin());

   int num_symbols = n_bins + 1;
   // bin and compress entries in batches of rows
@@ -1327,7 +1315,6 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
       static_cast<size_t>(device_row_state.rows_to_process_from_batch));
   const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();

-  dh::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
   size_t gpu_nbatches = dh::DivRoundUp(device_row_state.rows_to_process_from_batch,
                                        gpu_batch_nrows);

@@ -1339,35 +1326,41 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
     }
     size_t batch_nrows = batch_row_end - batch_row_begin;

+    const auto ent_cnt_begin =
+        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
+    const auto ent_cnt_end =
+        offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
+
+    /*! \brief row offset in SparsePage (the input data). */
+    dh::device_vector<size_t> row_ptrs(batch_nrows+1);
+    thrust::copy(
+        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
+        offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
+        row_ptrs.begin());
+
     // number of entries in this batch.
-    size_t n_entries = row_ptrs[batch_row_end] - row_ptrs[batch_row_begin];
+    size_t n_entries = ent_cnt_end - ent_cnt_begin;
+    dh::device_vector<Entry> entries_d(n_entries);
     // copy data entries to device.
     dh::safe_cuda
         (cudaMemcpy
-         (entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin],
+         (entries_d.data().get(), data_vec.data() + ent_cnt_begin,
           n_entries * sizeof(Entry), cudaMemcpyDefault));
     const dim3 block3(32, 8, 1);  // 256 threads
-    const dim3 grid3(dh::DivRoundUp(device_row_state.rows_to_process_from_batch, block3.x),
+    const dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
                      dh::DivRoundUp(row_stride, block3.y), 1);
     CompressBinEllpackKernel<<<grid3, block3>>>
         (common::CompressedBufferWriter(num_symbols),
          gidx_buffer.data(),
-         row_ptrs.data().get() + batch_row_begin,
+         row_ptrs.data().get(),
          entries_d.data().get(),
          gidx_fvalue_map.data(),
          feature_segments.data(),
          device_row_state.total_rows_processed + batch_row_begin,
          batch_nrows,
-         row_ptrs[batch_row_begin],
          row_stride,
          null_gidx_value);
   }
-
-  // free the memory that is no longer needed
-  row_ptrs.resize(0);
-  row_ptrs.shrink_to_fit();
-  entries_d.resize(0);
-  entries_d.shrink_to_fit();
 }

 // An instance of this type is created which keeps track of total number of rows to process,
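A side note on the second optimization (context only, not part of the diff): indexing a thrust::device_vector from host code goes through a device_reference, and converting that reference to a value issues a blocking single-element device-to-host copy. Reading the counts from the host-resident SparsePage offsets avoids those copies. A hedged sketch of the two access patterns, using hypothetical helper names:

#include <thrust/device_vector.h>
#include <cstddef>
#include <vector>

// Before: each host-side operator[] on a thrust::device_vector triggers its
// own implicit cudaMemcpy of a single element back to the host.
size_t CountFromDevice(const thrust::device_vector<size_t>& row_ptrs,
                       size_t begin, size_t end) {
  size_t e = row_ptrs[end];    // implicit device-to-host copy
  size_t b = row_ptrs[begin];  // another implicit device-to-host copy
  return e - b;
}

// After: the offsets are already resident on the host inside the SparsePage,
// so the same count is obtained with plain host memory reads.
size_t CountFromHost(const std::vector<size_t>& offset_vec,
                     size_t begin, size_t end) {
  return offset_vec[end] - offset_vec[begin];
}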
@@ -477,7 +477,7 @@ TEST(GpuHist, SortPosition) {
   TestSortPosition({1, 2, 1, 2, 3}, 1, 2);
 }

-TEST(GpuHist, TestHistogramIndex) {
+void TestHistogramIndexImpl(int n_gpus) {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory

@@ -491,23 +491,26 @@ TEST(GpuHist, TestHistogramIndex) {
       CreateSparsePageDMatrixWithRC(kNRows, kNCols, 128UL, true));

   std::vector<std::pair<std::string, std::string>> training_params = {
-    {"max_depth", "1"},
+    {"max_depth", "10"},
     {"max_leaves", "0"}
   };

-  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, 1));
+  LearnerTrainParam learner_param(CreateEmptyGenericParam(0, n_gpus));
   hist_maker.Init(training_params, &learner_param);
   hist_maker.InitDataOnce(hist_maker_dmat.get());
   hist_maker_ext.Init(training_params, &learner_param);
   hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());

+  ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size());
+
   // Extract the device shards from the histogram makers and from that its compressed
   // histogram index
-  const auto &dev_shard = hist_maker.shards_[0];
+  for (size_t i = 0; i < hist_maker.shards_.size(); ++i) {
+    const auto &dev_shard = hist_maker.shards_[i];
   std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
   dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);

-  const auto &dev_shard_ext = hist_maker_ext.shards_[0];
+    const auto &dev_shard_ext = hist_maker_ext.shards_[i];
   std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
   dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);

@@ -516,6 +519,19 @@ TEST(GpuHist, TestHistogramIndex) {

     ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
+  }
 }

+TEST(GpuHist, TestHistogramIndex) {
+  TestHistogramIndexImpl(1);
+}
+
+#if defined(XGBOOST_USE_NCCL)
+TEST(GpuHist, MGPU_TestHistogramIndex) {
+  auto devices = GPUSet::AllVisible();
+  CHECK_GT(devices.Size(), 1);
+  TestHistogramIndexImpl(-1);
+}
+#endif
+
 }  // namespace tree
 }  // namespace xgboost