Move ellpack page construction into DMatrix (#4833)
This commit is contained in:
@@ -21,6 +21,7 @@
|
||||
#include "../common/host_device_vector.h"
|
||||
#include "../common/timer.h"
|
||||
#include "../common/span.h"
|
||||
#include "../data/ellpack_page.cuh"
|
||||
#include "param.h"
|
||||
#include "updater_gpu_common.cuh"
|
||||
#include "constraints.cuh"
|
||||
@@ -108,83 +109,6 @@ inline static bool LossGuide(const ExpandEntry& lhs, const ExpandEntry& rhs) {
|
||||
}
|
||||
}
|
||||
|
||||
// Find a gidx value for a given feature otherwise return -1 if not found
|
||||
__forceinline__ __device__ int BinarySearchRow(
|
||||
bst_uint begin, bst_uint end,
|
||||
common::CompressedIterator<uint32_t> data,
|
||||
int const fidx_begin, int const fidx_end) {
|
||||
bst_uint previous_middle = UINT32_MAX;
|
||||
while (end != begin) {
|
||||
auto middle = begin + (end - begin) / 2;
|
||||
if (middle == previous_middle) {
|
||||
break;
|
||||
}
|
||||
previous_middle = middle;
|
||||
|
||||
auto gidx = data[middle];
|
||||
|
||||
if (gidx >= fidx_begin && gidx < fidx_end) {
|
||||
return gidx;
|
||||
} else if (gidx < fidx_begin) {
|
||||
begin = middle;
|
||||
} else {
|
||||
end = middle;
|
||||
}
|
||||
}
|
||||
// Value is missing
|
||||
return -1;
|
||||
}
|
||||
|
||||
/** \brief Struct for accessing and manipulating an ellpack matrix on the
|
||||
* device. Does not own underlying memory and may be trivially copied into
|
||||
* kernels.*/
|
||||
struct ELLPackMatrix {
|
||||
common::Span<uint32_t> feature_segments;
|
||||
/*! \brief minimum value for each feature. */
|
||||
common::Span<bst_float> min_fvalue;
|
||||
/*! \brief Cut. */
|
||||
common::Span<bst_float> gidx_fvalue_map;
|
||||
/*! \brief row length for ELLPack. */
|
||||
size_t row_stride{0};
|
||||
common::CompressedIterator<uint32_t> gidx_iter;
|
||||
bool is_dense;
|
||||
int null_gidx_value;
|
||||
|
||||
XGBOOST_DEVICE size_t BinCount() const { return gidx_fvalue_map.size(); }
|
||||
|
||||
// Get a matrix element, uses binary search for look up Return NaN if missing
|
||||
// Given a row index and a feature index, returns the corresponding cut value
|
||||
__device__ bst_float GetElement(size_t ridx, size_t fidx) const {
|
||||
auto row_begin = row_stride * ridx;
|
||||
auto row_end = row_begin + row_stride;
|
||||
auto gidx = -1;
|
||||
if (is_dense) {
|
||||
gidx = gidx_iter[row_begin + fidx];
|
||||
} else {
|
||||
gidx =
|
||||
BinarySearchRow(row_begin, row_end, gidx_iter, feature_segments[fidx],
|
||||
feature_segments[fidx + 1]);
|
||||
}
|
||||
if (gidx == -1) {
|
||||
return nan("");
|
||||
}
|
||||
return gidx_fvalue_map[gidx];
|
||||
}
|
||||
void Init(common::Span<uint32_t> feature_segments,
|
||||
common::Span<bst_float> min_fvalue,
|
||||
common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
|
||||
common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
|
||||
int null_gidx_value) {
|
||||
this->feature_segments = feature_segments;
|
||||
this->min_fvalue = min_fvalue;
|
||||
this->gidx_fvalue_map = gidx_fvalue_map;
|
||||
this->row_stride = row_stride;
|
||||
this->gidx_iter = gidx_iter;
|
||||
this->is_dense = is_dense;
|
||||
this->null_gidx_value = null_gidx_value;
|
||||
}
|
||||
};
|
||||
|
||||
// With constraints
|
||||
template <typename GradientPairT>
|
||||
XGBOOST_DEVICE float inline LossChangeMissing(
|
||||
@@ -247,7 +171,7 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
|
||||
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
|
||||
__device__ void EvaluateFeature(
|
||||
int fidx, common::Span<const GradientSumT> node_histogram,
|
||||
const ELLPackMatrix& matrix,
|
||||
const xgboost::ELLPackMatrix& matrix,
|
||||
DeviceSplitCandidate* best_split, // shared memory storing best split
|
||||
const DeviceNodeStats& node, const GPUTrainingParam& param,
|
||||
TempStorageT* temp_storage, // temp memory for cub operations
|
||||
@@ -322,7 +246,7 @@ __global__ void EvaluateSplitKernel(
|
||||
common::Span<const GradientSumT> node_histogram, // histogram for gradients
|
||||
common::Span<const int> feature_set, // Selected features
|
||||
DeviceNodeStats node,
|
||||
ELLPackMatrix matrix,
|
||||
xgboost::ELLPackMatrix matrix,
|
||||
GPUTrainingParam gpu_param,
|
||||
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
|
||||
ValueConstraint value_constraint,
|
||||
@@ -473,48 +397,8 @@ struct CalcWeightTrainParam {
|
||||
learning_rate(p.learning_rate) {}
|
||||
};
|
||||
|
||||
// Bin each input data entry, store the bin indices in compressed form.
|
||||
template<typename std::enable_if<true, int>::type = 0>
|
||||
__global__ void CompressBinEllpackKernel(
|
||||
common::CompressedBufferWriter wr,
|
||||
common::CompressedByteT* __restrict__ buffer, // gidx_buffer
|
||||
const size_t* __restrict__ row_ptrs, // row offset of input data
|
||||
const Entry* __restrict__ entries, // One batch of input data
|
||||
const float* __restrict__ cuts, // HistogramCuts::cut
|
||||
const uint32_t* __restrict__ cut_rows, // HistogramCuts::row_ptrs
|
||||
size_t base_row, // batch_row_begin
|
||||
size_t n_rows,
|
||||
size_t row_stride,
|
||||
unsigned int null_gidx_value) {
|
||||
size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int ifeature = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
if (irow >= n_rows || ifeature >= row_stride) {
|
||||
return;
|
||||
}
|
||||
int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
|
||||
unsigned int bin = null_gidx_value;
|
||||
if (ifeature < row_length) {
|
||||
Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature];
|
||||
int feature = entry.index;
|
||||
float fvalue = entry.fvalue;
|
||||
// {feature_cuts, ncuts} forms the array of cuts of `feature'.
|
||||
const float *feature_cuts = &cuts[cut_rows[feature]];
|
||||
int ncuts = cut_rows[feature + 1] - cut_rows[feature];
|
||||
// Assigning the bin in current entry.
|
||||
// S.t.: fvalue < feature_cuts[bin]
|
||||
bin = dh::UpperBound(feature_cuts, ncuts, fvalue);
|
||||
if (bin >= ncuts) {
|
||||
bin = ncuts - 1;
|
||||
}
|
||||
// Add the number of bins in previous features.
|
||||
bin += cut_rows[feature];
|
||||
}
|
||||
// Write to gidx buffer.
|
||||
wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature);
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
__global__ void SharedMemHistKernel(ELLPackMatrix matrix,
|
||||
__global__ void SharedMemHistKernel(xgboost::ELLPackMatrix matrix,
|
||||
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
||||
GradientSumT* d_node_hist,
|
||||
const GradientPair* d_gpair, size_t n_elements,
|
||||
@@ -548,59 +432,17 @@ __global__ void SharedMemHistKernel(ELLPackMatrix matrix,
|
||||
}
|
||||
}
|
||||
|
||||
// Instances of this type are created while creating the histogram bins for the
|
||||
// entire dataset across multiple sparse page batches. This keeps track of the number
|
||||
// of rows to process from a batch and the position from which to process on each device.
|
||||
struct RowStateOnDevice {
|
||||
// Number of rows assigned to this device
|
||||
size_t total_rows_assigned_to_device;
|
||||
// Number of rows processed thus far
|
||||
size_t total_rows_processed;
|
||||
// Number of rows to process from the current sparse page batch
|
||||
size_t rows_to_process_from_batch;
|
||||
// Offset from the current sparse page batch to begin processing
|
||||
size_t row_offset_in_current_batch;
|
||||
|
||||
explicit RowStateOnDevice(size_t total_rows)
|
||||
: total_rows_assigned_to_device(total_rows), total_rows_processed(0),
|
||||
rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
|
||||
}
|
||||
|
||||
explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
|
||||
: total_rows_assigned_to_device(total_rows), total_rows_processed(0),
|
||||
rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
|
||||
}
|
||||
|
||||
// Advance the row state by the number of rows processed
|
||||
void Advance() {
|
||||
total_rows_processed += rows_to_process_from_batch;
|
||||
CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
|
||||
rows_to_process_from_batch = row_offset_in_current_batch = 0;
|
||||
}
|
||||
};
|
||||
|
||||
// Manage memory for a single GPU
|
||||
template <typename GradientSumT>
|
||||
struct DeviceShard {
|
||||
int n_bins;
|
||||
int device_id;
|
||||
EllpackPageImpl* page;
|
||||
|
||||
dh::BulkAllocator ba;
|
||||
|
||||
ELLPackMatrix ellpack_matrix;
|
||||
|
||||
std::unique_ptr<RowPartitioner> row_partitioner;
|
||||
DeviceHistogram<GradientSumT> hist{};
|
||||
|
||||
/*! \brief row_ptr form HistogramCuts. */
|
||||
common::Span<uint32_t> feature_segments;
|
||||
/*! \brief minimum value for each feature. */
|
||||
common::Span<bst_float> min_fvalue;
|
||||
/*! \brief Cut. */
|
||||
common::Span<bst_float> gidx_fvalue_map;
|
||||
/*! \brief global index of histogram, which is stored in ELLPack format. */
|
||||
common::Span<common::CompressedByteT> gidx_buffer;
|
||||
|
||||
/*! \brief Gradient pair for each row. */
|
||||
common::Span<GradientPair> gpair;
|
||||
|
||||
@@ -631,11 +473,15 @@ struct DeviceShard {
|
||||
std::function<bool(ExpandEntry, ExpandEntry)>>;
|
||||
std::unique_ptr<ExpandQueue> qexpand;
|
||||
|
||||
DeviceShard(int _device_id, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed,
|
||||
DeviceShard(int _device_id,
|
||||
EllpackPageImpl* _page,
|
||||
bst_uint _n_rows,
|
||||
TrainParam _param,
|
||||
uint32_t column_sampler_seed,
|
||||
uint32_t n_features)
|
||||
: device_id(_device_id),
|
||||
page(_page),
|
||||
n_rows(_n_rows),
|
||||
n_bins(0),
|
||||
param(std::move(_param)),
|
||||
prediction_cache_initialised(false),
|
||||
column_sampler(column_sampler_seed),
|
||||
@@ -643,12 +489,7 @@ struct DeviceShard {
|
||||
monitor.Init(std::string("DeviceShard") + std::to_string(device_id));
|
||||
}
|
||||
|
||||
void InitCompressedData(
|
||||
const common::HistogramCuts& hmat, size_t row_stride, bool is_dense);
|
||||
|
||||
void CreateHistIndices(
|
||||
const SparsePage &row_batch, const common::HistogramCuts &hmat,
|
||||
const RowStateOnDevice &device_row_state, int rows_per_batch);
|
||||
void InitHistogram();
|
||||
|
||||
~DeviceShard() { // NOLINT
|
||||
dh::safe_cuda(cudaSetDevice(device_id));
|
||||
@@ -762,7 +603,7 @@ struct DeviceShard {
|
||||
int constexpr kBlockThreads = 256;
|
||||
EvaluateSplitKernel<kBlockThreads, GradientSumT>
|
||||
<<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
|
||||
hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
|
||||
hist.GetNodeHistogram(nidx), d_feature_set, node, page->ellpack_matrix,
|
||||
gpu_param, d_split_candidates, node_value_constraints[nidx],
|
||||
monotone_constraints);
|
||||
|
||||
@@ -788,11 +629,11 @@ struct DeviceShard {
|
||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||
auto d_gpair = gpair.data();
|
||||
|
||||
auto n_elements = d_ridx.size() * ellpack_matrix.row_stride;
|
||||
auto n_elements = d_ridx.size() * page->ellpack_matrix.row_stride;
|
||||
|
||||
const size_t smem_size =
|
||||
use_shared_memory_histograms
|
||||
? sizeof(GradientSumT) * ellpack_matrix.BinCount()
|
||||
? sizeof(GradientSumT) * page->ellpack_matrix.BinCount()
|
||||
: 0;
|
||||
const int items_per_thread = 8;
|
||||
const int block_threads = 256;
|
||||
@@ -802,7 +643,7 @@ struct DeviceShard {
|
||||
return;
|
||||
}
|
||||
SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>(
|
||||
ellpack_matrix, d_ridx, d_node_hist.data(), d_gpair, n_elements,
|
||||
page->ellpack_matrix, d_ridx, d_node_hist.data(), d_gpair, n_elements,
|
||||
use_shared_memory_histograms);
|
||||
}
|
||||
|
||||
@@ -812,7 +653,7 @@ struct DeviceShard {
|
||||
auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
|
||||
auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
|
||||
|
||||
dh::LaunchN(device_id, n_bins, [=] __device__(size_t idx) {
|
||||
dh::LaunchN(device_id, page->n_bins, [=] __device__(size_t idx) {
|
||||
d_node_hist_subtraction[idx] =
|
||||
d_node_hist_parent[idx] - d_node_hist_histogram[idx];
|
||||
});
|
||||
@@ -827,7 +668,7 @@ struct DeviceShard {
|
||||
}
|
||||
|
||||
void UpdatePosition(int nidx, RegTree::Node split_node) {
|
||||
auto d_matrix = ellpack_matrix;
|
||||
auto d_matrix = page->ellpack_matrix;
|
||||
|
||||
row_partitioner->UpdatePosition(
|
||||
nidx, split_node.LeftChild(), split_node.RightChild(),
|
||||
@@ -859,7 +700,7 @@ struct DeviceShard {
|
||||
dh::safe_cuda(cudaMemcpy(d_nodes.data(), p_tree->GetNodes().data(),
|
||||
d_nodes.size() * sizeof(RegTree::Node),
|
||||
cudaMemcpyHostToDevice));
|
||||
auto d_matrix = ellpack_matrix;
|
||||
auto d_matrix = page->ellpack_matrix;
|
||||
row_partitioner->FinalisePosition(
|
||||
[=] __device__(bst_uint ridx, int position) {
|
||||
auto node = d_nodes[position];
|
||||
@@ -922,7 +763,7 @@ struct DeviceShard {
|
||||
reducer->AllReduceSum(
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
|
||||
ellpack_matrix.BinCount() *
|
||||
page->ellpack_matrix.BinCount() *
|
||||
(sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
|
||||
reducer->Synchronize();
|
||||
|
||||
@@ -1097,11 +938,7 @@ struct DeviceShard {
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
const common::HistogramCuts &hmat, size_t row_stride, bool is_dense) {
|
||||
n_bins = hmat.Ptrs().back();
|
||||
int null_gidx_value = hmat.Ptrs().back();
|
||||
|
||||
inline void DeviceShard<GradientSumT>::InitHistogram() {
|
||||
CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
|
||||
<< "Max leaves and max depth cannot both be unconstrained for "
|
||||
"gpu_hist.";
|
||||
@@ -1113,163 +950,25 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
&gpair, n_rows,
|
||||
&prediction_cache, n_rows,
|
||||
&node_sum_gradients_d, max_nodes,
|
||||
&feature_segments, hmat.Ptrs().size(),
|
||||
&gidx_fvalue_map, hmat.Values().size(),
|
||||
&min_fvalue, hmat.MinValues().size(),
|
||||
&monotone_constraints, param.monotone_constraints.size());
|
||||
|
||||
dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
|
||||
dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
|
||||
dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
|
||||
dh::CopyVectorToDeviceSpan(monotone_constraints, param.monotone_constraints);
|
||||
|
||||
node_sum_gradients.resize(max_nodes);
|
||||
|
||||
// allocate compressed bin data
|
||||
int num_symbols = n_bins + 1;
|
||||
// Required buffer size for storing data matrix in ELLPack format.
|
||||
size_t compressed_size_bytes =
|
||||
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
|
||||
num_symbols);
|
||||
|
||||
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
|
||||
thrust::fill(
|
||||
thrust::device_pointer_cast(gidx_buffer.data()),
|
||||
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
|
||||
|
||||
ellpack_matrix.Init(
|
||||
feature_segments, min_fvalue,
|
||||
gidx_fvalue_map, row_stride,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols),
|
||||
is_dense, null_gidx_value);
|
||||
// check if we can use shared memory for building histograms
|
||||
// (assuming atleast we need 2 CTAs per SM to maintain decent latency
|
||||
// hiding)
|
||||
auto histogram_size = sizeof(GradientSumT) * hmat.Ptrs().back();
|
||||
auto histogram_size = sizeof(GradientSumT) * page->n_bins;
|
||||
auto max_smem = dh::MaxSharedMemory(device_id);
|
||||
if (histogram_size <= max_smem) {
|
||||
use_shared_memory_histograms = true;
|
||||
}
|
||||
|
||||
// Init histogram
|
||||
hist.Init(device_id, hmat.Ptrs().back());
|
||||
hist.Init(device_id, page->n_bins);
|
||||
}
|
||||
|
||||
template <typename GradientSumT>
|
||||
inline void DeviceShard<GradientSumT>::CreateHistIndices(
|
||||
const SparsePage &row_batch,
|
||||
const common::HistogramCuts &hmat,
|
||||
const RowStateOnDevice &device_row_state,
|
||||
int rows_per_batch) {
|
||||
// Has any been allocated for me in this batch?
|
||||
if (!device_row_state.rows_to_process_from_batch) return;
|
||||
|
||||
unsigned int null_gidx_value = hmat.Ptrs().back();
|
||||
size_t row_stride = this->ellpack_matrix.row_stride;
|
||||
|
||||
const auto &offset_vec = row_batch.offset.ConstHostVector();
|
||||
|
||||
int num_symbols = n_bins + 1;
|
||||
// bin and compress entries in batches of rows
|
||||
size_t gpu_batch_nrows = std::min(
|
||||
dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)),
|
||||
static_cast<size_t>(device_row_state.rows_to_process_from_batch));
|
||||
const std::vector<Entry>& data_vec = row_batch.data.ConstHostVector();
|
||||
|
||||
size_t gpu_nbatches = common::DivRoundUp(device_row_state.rows_to_process_from_batch,
|
||||
gpu_batch_nrows);
|
||||
|
||||
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
|
||||
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
|
||||
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
|
||||
if (batch_row_end > device_row_state.rows_to_process_from_batch) {
|
||||
batch_row_end = device_row_state.rows_to_process_from_batch;
|
||||
}
|
||||
size_t batch_nrows = batch_row_end - batch_row_begin;
|
||||
|
||||
const auto ent_cnt_begin =
|
||||
offset_vec[device_row_state.row_offset_in_current_batch + batch_row_begin];
|
||||
const auto ent_cnt_end =
|
||||
offset_vec[device_row_state.row_offset_in_current_batch + batch_row_end];
|
||||
|
||||
/*! \brief row offset in SparsePage (the input data). */
|
||||
dh::device_vector<size_t> row_ptrs(batch_nrows+1);
|
||||
thrust::copy(
|
||||
offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_begin,
|
||||
offset_vec.data() + device_row_state.row_offset_in_current_batch + batch_row_end + 1,
|
||||
row_ptrs.begin());
|
||||
|
||||
// number of entries in this batch.
|
||||
size_t n_entries = ent_cnt_end - ent_cnt_begin;
|
||||
dh::device_vector<Entry> entries_d(n_entries);
|
||||
// copy data entries to device.
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy
|
||||
(entries_d.data().get(), data_vec.data() + ent_cnt_begin,
|
||||
n_entries * sizeof(Entry), cudaMemcpyDefault));
|
||||
const dim3 block3(32, 8, 1); // 256 threads
|
||||
const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
|
||||
common::DivRoundUp(row_stride, block3.y), 1);
|
||||
CompressBinEllpackKernel<<<grid3, block3>>>
|
||||
(common::CompressedBufferWriter(num_symbols),
|
||||
gidx_buffer.data(),
|
||||
row_ptrs.data().get(),
|
||||
entries_d.data().get(),
|
||||
gidx_fvalue_map.data(),
|
||||
feature_segments.data(),
|
||||
device_row_state.total_rows_processed + batch_row_begin,
|
||||
batch_nrows,
|
||||
row_stride,
|
||||
null_gidx_value);
|
||||
}
|
||||
}
|
||||
|
||||
// An instance of this type is created which keeps track of total number of rows to process,
|
||||
// rows processed thus far, rows to process and the offset from the current sparse page batch
|
||||
// to begin processing on each device
|
||||
class DeviceHistogramBuilderState {
|
||||
public:
|
||||
template <typename GradientSumT>
|
||||
explicit DeviceHistogramBuilderState(const std::unique_ptr<DeviceShard<GradientSumT>>& shard)
|
||||
: device_row_state_(shard->n_rows) {}
|
||||
|
||||
const RowStateOnDevice& GetRowStateOnDevice() const {
|
||||
return device_row_state_;
|
||||
}
|
||||
|
||||
// This method is invoked at the beginning of each sparse page batch. This distributes
|
||||
// the rows in the sparse page to the device.
|
||||
// TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
|
||||
void BeginBatch(const SparsePage &batch) {
|
||||
size_t rem_rows = batch.Size();
|
||||
size_t row_offset_in_current_batch = 0;
|
||||
|
||||
// Do we have anymore left to process from this batch on this device?
|
||||
if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
|
||||
// There are still some rows that needs to be assigned to this device
|
||||
device_row_state_.rows_to_process_from_batch =
|
||||
std::min(
|
||||
device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
|
||||
rem_rows);
|
||||
} else {
|
||||
// All rows have been assigned to this device
|
||||
device_row_state_.rows_to_process_from_batch = 0;
|
||||
}
|
||||
|
||||
device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
|
||||
row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
|
||||
rem_rows -= device_row_state_.rows_to_process_from_batch;
|
||||
}
|
||||
|
||||
// This method is invoked after completion of each sparse page batch
|
||||
void EndBatch() {
|
||||
device_row_state_.Advance();
|
||||
}
|
||||
|
||||
private:
|
||||
RowStateOnDevice device_row_state_{0};
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
class GPUHistMakerSpecialised {
|
||||
public:
|
||||
@@ -1319,47 +1018,33 @@ class GPUHistMakerSpecialised {
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
|
||||
// TODO(rongou): support multiple Ellpack pages.
|
||||
EllpackPageImpl* page{};
|
||||
for (auto& batch : dmat->GetBatches<EllpackPage>()) {
|
||||
page = batch.Impl();
|
||||
page->Init(device_, param_.max_bin, hist_maker_param_.gpu_batch_nrows);
|
||||
}
|
||||
|
||||
// Create device shard
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
shard_.reset(new DeviceShard<GradientSumT>(device_,
|
||||
page,
|
||||
info_->num_row_,
|
||||
param_,
|
||||
column_sampling_seed,
|
||||
info_->num_col_));
|
||||
|
||||
monitor_.StartCuda("Quantiles");
|
||||
// Create the quantile sketches for the dmatrix and initialize HistogramCuts
|
||||
size_t row_stride = common::DeviceSketch(param_, *generic_param_,
|
||||
hist_maker_param_.gpu_batch_nrows,
|
||||
dmat, &hmat_);
|
||||
monitor_.StopCuda("Quantiles");
|
||||
|
||||
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
|
||||
|
||||
// Init global data for each shard
|
||||
monitor_.StartCuda("InitCompressedData");
|
||||
dh::safe_cuda(cudaSetDevice(shard_->device_id));
|
||||
shard_->InitCompressedData(hmat_, row_stride, is_dense);
|
||||
monitor_.StopCuda("InitCompressedData");
|
||||
|
||||
monitor_.StartCuda("BinningCompression");
|
||||
DeviceHistogramBuilderState hist_builder_row_state(shard_);
|
||||
for (const auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
hist_builder_row_state.BeginBatch(batch);
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(shard_->device_id));
|
||||
shard_->CreateHistIndices(batch, hmat_, hist_builder_row_state.GetRowStateOnDevice(),
|
||||
hist_maker_param_.gpu_batch_nrows);
|
||||
|
||||
hist_builder_row_state.EndBatch();
|
||||
}
|
||||
monitor_.StopCuda("BinningCompression");
|
||||
monitor_.StartCuda("InitHistogram");
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
shard_->InitHistogram();
|
||||
monitor_.StopCuda("InitHistogram");
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised_ = true;
|
||||
}
|
||||
|
||||
void InitData(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat) {
|
||||
void InitData(DMatrix* dmat) {
|
||||
if (!initialised_) {
|
||||
monitor_.StartCuda("InitDataOnce");
|
||||
this->InitDataOnce(dmat);
|
||||
@@ -1387,7 +1072,7 @@ class GPUHistMakerSpecialised {
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
|
||||
RegTree* p_tree) {
|
||||
monitor_.StartCuda("InitData");
|
||||
this->InitData(gpair, p_fmat);
|
||||
this->InitData(p_fmat);
|
||||
monitor_.StopCuda("InitData");
|
||||
|
||||
gpair->SetDevice(device_);
|
||||
@@ -1408,7 +1093,6 @@ class GPUHistMakerSpecialised {
|
||||
}
|
||||
|
||||
TrainParam param_; // NOLINT
|
||||
common::HistogramCuts hmat_; // NOLINT
|
||||
MetaInfo* info_{}; // NOLINT
|
||||
|
||||
std::unique_ptr<DeviceShard<GradientSumT>> shard_; // NOLINT
|
||||
|
||||
Reference in New Issue
Block a user