/**
 * Copyright 2020-2023 by XGBoost contributors
 *
 * \brief Front end and utilities for GPU-based sketching. Works on a sliding window
 *        instead of a stream.
 */
#ifndef COMMON_HIST_UTIL_CUH_
#define COMMON_HIST_UTIL_CUH_

#include <thrust/host_vector.h>

#include <algorithm>  // for min
#include <cstddef>    // for size_t
#include <limits>     // for numeric_limits

#include "../data/adapter.h"  // for IsValidFunctor
#include "device_helpers.cuh"
#include "hist_util.h"
#include "quantile.cuh"
#include "xgboost/span.h"  // for IterSpan

#if defined(XGBOOST_USE_HIP)
namespace cub = hipcub;
#endif

namespace xgboost::common {
namespace cuda {
/**
 * Copy and paste of the host version; we can't make it a __host__ __device__ function
 * because fn might be a host-only or device-only callable object, which is not allowed
 * by nvcc.
 */
template <typename Fn>
auto __device__ DispatchBinType(BinTypeSize type, Fn&& fn) {
  switch (type) {
    case kUint8BinsTypeSize: {
      return fn(uint8_t{});
    }
    case kUint16BinsTypeSize: {
      return fn(uint16_t{});
    }
    case kUint32BinsTypeSize: {
      return fn(uint32_t{});
    }
  }
  SPAN_CHECK(false);
  return fn(uint32_t{});
}
}  // namespace cuda
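
// Usage sketch for cuda::DispatchBinType (illustrative only; `bin_type_size` and
// `compressed` are assumed to be provided by the surrounding device code):
//
//   cuda::DispatchBinType(bin_type_size, [&](auto t) {
//     using T = decltype(t);
//     auto const* bins = reinterpret_cast<T const*>(compressed);
//     // ... read bin indices through `bins` at the dispatched width ...
//   });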
namespace detail {
struct EntryCompareOp {
  __device__ bool operator()(const Entry& a, const Entry& b) {
    if (a.index == b.index) {
      return a.fvalue < b.fvalue;
    }
    return a.index < b.index;
  }
};

// Get the size of each column from the adapter batch, used for the output cuts.
template <std::uint32_t kBlockThreads, typename CounterT, typename BatchIt>
__global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
                                             data::IsValidFunctor is_valid,
                                             Span<std::size_t> out_column_size) {
  extern __shared__ char smem[];

  auto smem_cs_ptr = reinterpret_cast<CounterT*>(smem);

  dh::BlockFill(smem_cs_ptr, out_column_size.size(), 0);

  cub::CTA_SYNC();

  auto n = batch_iter.size();

  for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n)) {
    auto e = batch_iter[idx];
    if (is_valid(e)) {
      atomicAdd(&smem_cs_ptr[e.column_idx], static_cast<CounterT>(1));
    }
  }

  cub::CTA_SYNC();

  auto out_global_ptr = out_column_size;
  for (auto i : dh::BlockStrideRange(static_cast<std::size_t>(0), out_column_size.size())) {
    atomicAdd(&out_global_ptr[i], static_cast<std::size_t>(smem_cs_ptr[i]));
  }
}
template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
  int n_mps = 0;
  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
  int n_blocks_per_mp = 0;
  dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                              kBlockThreads, shared_mem));
  std::uint32_t grid_size = n_blocks_per_mp * n_mps;
  return grid_size;
}

/**
 * \brief Get the size of each column. This is a histogram with additional handling of
 *        invalid values.
 *
 * \tparam BatchIt                 Type of input adapter batch.
 * \tparam force_use_global_memory Used for testing. Force global atomic add.
 * \tparam force_use_u64           Used for testing. Force u64 as the counter type in
 *                                 shared memory.
 *
 * \param device          CUDA device ordinal.
 * \param batch_iter      Iterator for input data from the adapter batch.
 * \param is_valid        Functor deciding whether an element is valid (not missing).
 * \param out_column_size Output buffer for the size of each column.
 */
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
                               data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
  thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);

  std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
  // Not strictly correct, as we should use the number of samples to determine the
  // counter type. However, the sample size is not known here due to the sliding window
  // over the number of elements.
  std::size_t n = batch_iter.size();

  std::size_t required_shared_memory = 0;
  bool use_u32{false};
  if (!force_use_u64 && n < static_cast<std::size_t>(std::numeric_limits<std::uint32_t>::max())) {
    required_shared_memory = out_column_size.size() * sizeof(std::uint32_t);
    use_u32 = true;
  } else {
    required_shared_memory = out_column_size.size() * sizeof(std::size_t);
    use_u32 = false;
  }
  bool use_shared = required_shared_memory <= max_shared_memory && required_shared_memory != 0;

  if (!force_use_global_memory && use_shared) {
    CHECK_NE(required_shared_memory, 0);
    std::uint32_t constexpr kBlockThreads = 512;
    if (use_u32) {
      CHECK(!force_use_u64);
      auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::uint32_t, BatchIt>;
      auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
      dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
          kernel, batch_iter, is_valid, out_column_size);
    } else {
      auto kernel = GetColumnSizeSharedMemKernel<kBlockThreads, std::size_t, BatchIt>;
      auto grid_size = EstimateGridSize<kBlockThreads>(device, kernel, required_shared_memory);
      dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}(
          kernel, batch_iter, is_valid, out_column_size);
    }
  } else {
    auto d_out_column_size = out_column_size;
    dh::LaunchN(batch_iter.size(), [=] __device__(size_t idx) {
      auto e = batch_iter[idx];
      if (is_valid(e)) {
        atomicAdd(&d_out_column_size[e.column_idx], static_cast<size_t>(1));
      }
    });
  }
}
template <typename BatchIt>
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
                        IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
                        HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                        dh::caching_device_vector<size_t>* column_sizes_scan) {
  column_sizes_scan->resize(num_columns + 1);
  cuts_ptr->SetDevice(device);
  cuts_ptr->Resize(num_columns + 1, 0);

  dh::XGBCachingDeviceAllocator<char> alloc;
  auto d_column_sizes_scan = dh::ToSpan(*column_sizes_scan);
  LaunchGetColumnSizeKernel(device, batch_iter, is_valid, d_column_sizes_scan);
  // Calculate cuts CSC pointer
  auto cut_ptr_it = dh::MakeTransformIterator<size_t>(
      column_sizes_scan->begin(), [=] __device__(size_t column_size) {
        return thrust::min(num_cuts_per_feature, column_size);
      });

#if defined(XGBOOST_USE_CUDA)
  thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
  thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
                         column_sizes_scan->end(), column_sizes_scan->begin());
#elif defined(XGBOOST_USE_HIP)
  thrust::exclusive_scan(thrust::hip::par(alloc), cut_ptr_it,
                         cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
  thrust::exclusive_scan(thrust::hip::par(alloc), column_sizes_scan->begin(),
                         column_sizes_scan->end(), column_sizes_scan->begin());
#endif
}
inline size_t constexpr BytesPerElement(bool has_weight) {
  // Double the memory usage for sorting. We need to assign a weight to each element, so
  // sizeof(float) is added for every element when weights are used.
  return (has_weight ? sizeof(Entry) + sizeof(float) : sizeof(Entry)) * 2;
}
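// Worked example (a rough sketch of the estimate, assuming an 8-byte Entry): with
// weights each element needs sizeof(Entry) + sizeof(float) = 12 bytes, doubled for the
// sort buffers, i.e. 24 bytes per element; without weights it is 8 * 2 = 16 bytes.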
/* \brief Calculate the length of the sliding window. Returns `sketch_batch_num_elements`
 * directly if it's not 0.
 */
size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
                              bst_row_t num_rows, bst_feature_t columns,
                              size_t nnz, int device,
                              size_t num_cuts, bool has_weight);

// Compute the number of sample cuts needed on the local node to maintain accuracy.
// We take more cuts than needed and then reduce them later.
size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows);

/* \brief Estimate the required memory for each sliding window.
 *
 * It's not precise: to obtain the exact memory usage for a sparse dataset we would need
 * to walk through the whole dataset first. Also, if the data comes from a host DMatrix,
 * we copy the weight, group and offset on the first batch, which is not accounted for by
 * this function.
 *
 * \param num_rows     Number of rows in this worker.
 * \param num_columns  Number of columns for this dataset.
 * \param nnz          Number of non-zero elements. Pass something greater than rows *
 *                     cols if nnz is unknown.
 * \param num_bins     Number of histogram bins.
 * \param with_weights Whether weight is used; works the same for ranking and other models.
 *
 * \return The estimated number of bytes.
 */
size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
                      size_t num_bins, bool with_weights);
// Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
                            float missing, size_t columns, size_t cuts_per_feature,
                            DeviceOrd device,
                            HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                            dh::caching_device_vector<size_t>* column_sizes_scan,
                            dh::device_vector<Entry>* sorted_entries) {
  auto entry_iter = dh::MakeTransformIterator<Entry>(
      thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) {
        return Entry(batch.GetElement(idx).column_idx, batch.GetElement(idx).value);
      });
  auto n = range.end() - range.begin();
  auto span = IterSpan{batch_iter + range.begin(), n};
  data::IsValidFunctor is_valid(missing);
  // Work out how many valid entries we have in each column
  GetColumnSizesScan(device, columns, cuts_per_feature, span, is_valid, cut_sizes_scan,
                     column_sizes_scan);
  size_t num_valid = column_sizes_scan->back();
  // Copy the current subset of valid elements into temporary storage; sorting happens in
  // the caller.
  sorted_entries->resize(num_valid);
  dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(), sorted_entries->begin(),
             is_valid);
}

void SortByWeight(dh::device_vector<float>* weights,
                  dh::device_vector<Entry>* sorted_entries);

void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                dh::device_vector<Entry>* p_sorted_entries,
                                dh::device_vector<float>* p_sorted_weights,
                                dh::caching_device_vector<size_t>* p_column_sizes_scan);
}  // namespace detail
/**
 * @brief Compute the sketch on a DMatrix with the GPU, using the Hessian as weight.
 *
 * @param ctx     Runtime context
 * @param p_fmat  Training feature matrix
 * @param max_bin Maximum number of bins for each feature
 * @param hessian Hessian vector.
 * @param sketch_batch_num_elements 0 means auto-detect. Only modify this for testing.
 *
 * @return Quantile cuts
 */
HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
                                      Span<float const> hessian,
                                      std::size_t sketch_batch_num_elements = 0);

/**
 * @brief Compute the sketch on a DMatrix with the GPU.
 *
 * @param ctx     Runtime context
 * @param p_fmat  Training feature matrix
 * @param max_bin Maximum number of bins for each feature
 * @param sketch_batch_num_elements 0 means auto-detect. Only modify this for testing.
 *
 * @return Quantile cuts
 */
inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin,
                                  std::size_t sketch_batch_num_elements = 0) {
  return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements);
}
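
// Minimal usage sketch (hypothetical caller code; only DeviceSketch itself is declared
// in this header, the surrounding setup is an assumption):
//
//   Context ctx;  // a context configured for a CUDA device
//   HistogramCuts cuts = DeviceSketch(&ctx, p_fmat, /*max_bin=*/256);
//   // `cuts` now holds the per-feature quantile cut points used to build histograms.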
template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                          DeviceOrd device, size_t columns, size_t begin, size_t end,
                          float missing, SketchContainer *sketch_container,
                          int num_cuts) {
  // Copy current subset of valid elements into temporary storage and sort
  dh::device_vector<Entry> sorted_entries;
  dh::caching_device_vector<size_t> column_sizes_scan;
  auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
      thrust::make_counting_iterator(0llu),
      [=] __device__(size_t idx) { return batch.GetElement(idx); });
  HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
  cuts_ptr.SetDevice(device);
  detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
                                 columns, num_cuts, device,
                                 &cuts_ptr,
                                 &column_sizes_scan,
                                 &sorted_entries);
  dh::XGBDeviceAllocator<char> alloc;

#if defined(XGBOOST_USE_CUDA)
  thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
               sorted_entries.end(), detail::EntryCompareOp());
#elif defined(XGBOOST_USE_HIP)
  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
               sorted_entries.end(), detail::EntryCompareOp());
#endif

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr,
                                       &column_sizes_scan);
  }

  auto d_cuts_ptr = cuts_ptr.DeviceSpan();
  auto const &h_cuts_ptr = cuts_ptr.HostVector();
  // Extract the cuts from all columns concurrently
  sketch_container->Push(dh::ToSpan(sorted_entries),
                         dh::ToSpan(column_sizes_scan), d_cuts_ptr,
                         h_cuts_ptr.back());
  sorted_entries.clear();
  sorted_entries.shrink_to_fit();
}
template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                  int num_cuts_per_feature,
                                  bool is_ranking, float missing, DeviceOrd device,
                                  size_t columns, size_t begin, size_t end,
                                  SketchContainer *sketch_container) {
  dh::XGBCachingDeviceAllocator<char> alloc;
  dh::safe_cuda(cudaSetDevice(device.ordinal));
  info.weights_.SetDevice(device);
  auto weights = info.weights_.ConstDeviceSpan();

  auto batch_iter = dh::MakeTransformIterator<data::COOTuple>(
      thrust::make_counting_iterator(0llu),
      [=] __device__(size_t idx) { return batch.GetElement(idx); });
  dh::device_vector<Entry> sorted_entries;
  dh::caching_device_vector<size_t> column_sizes_scan;
  HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
  detail::MakeEntriesFromAdapter(batch, batch_iter,
                                 {begin, end}, missing,
                                 columns, num_cuts_per_feature, device,
                                 &cuts_ptr,
                                 &column_sizes_scan,
                                 &sorted_entries);
  data::IsValidFunctor is_valid(missing);

  dh::device_vector<float> temp_weights(sorted_entries.size());
  auto d_temp_weights = dh::ToSpan(temp_weights);

  if (is_ranking) {
    if (!weights.empty()) {
      CHECK_EQ(weights.size(), info.group_ptr_.size() - 1);
    }
    dh::caching_device_vector<bst_group_t> group_ptr(info.group_ptr_);
    auto d_group_ptr = dh::ToSpan(group_ptr);
    auto const weight_iter = dh::MakeTransformIterator<float>(
        thrust::make_counting_iterator(0lu), [=] __device__(size_t idx) -> float {
          auto ridx = batch.GetElement(idx).row_idx;
          bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx);
          return weights[group_idx];
        });

#if defined(XGBOOST_USE_CUDA)
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
#elif defined(XGBOOST_USE_HIP)
    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
#endif

    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  } else {
    CHECK_EQ(batch.NumRows(), weights.size());
    auto const weight_iter = dh::MakeTransformIterator<float>(
        thrust::make_counting_iterator(0lu),
        [=] __device__(size_t idx) -> float {
          return weights[batch.GetElement(idx).row_idx];
        });

#if defined(XGBOOST_USE_CUDA)
    auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
#elif defined(XGBOOST_USE_HIP)
    auto retit = thrust::copy_if(thrust::hip::par(alloc),
                                 weight_iter + begin, weight_iter + end,
                                 batch_iter + begin,
                                 d_temp_weights.data(),  // output
                                 is_valid);
#endif

    CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
  }

  detail::SortByWeight(&temp_weights, &sorted_entries);

  if (sketch_container->HasCategorical()) {
    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights,
                                       &column_sizes_scan);
  }

  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
  auto d_cuts_ptr = cuts_ptr.DeviceSpan();

  // Extract cuts
  sketch_container->Push(dh::ToSpan(sorted_entries),
                         dh::ToSpan(column_sizes_scan), d_cuts_ptr,
                         h_cuts_ptr.back(), dh::ToSpan(temp_weights));
  sorted_entries.clear();
  sorted_entries.shrink_to_fit();
}
/*
 * \brief Perform sketching on the GPU.
 *
 * \param batch            A batch from the adapter.
 * \param num_bins         Bins per column.
 * \param info             MetaInfo used for sketching.
 * \param missing          Floating point value that represents a missing value.
 * \param sketch_container Container for the output sketch.
 * \param sketch_batch_num_elements Number of elements per sliding window; use it only
 *                                  for testing.
 */
template <typename Batch>
void AdapterDeviceSketch(Batch batch, int num_bins,
                         MetaInfo const& info,
                         float missing, SketchContainer* sketch_container,
                         size_t sketch_batch_num_elements = 0) {
  size_t num_rows = batch.NumRows();
  size_t num_cols = batch.NumCols();
  size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
  auto device = sketch_container->DeviceIdx();
  bool weighted = !info.weights_.Empty();

  if (weighted) {
    sketch_batch_num_elements = detail::SketchBatchNumElements(
        sketch_batch_num_elements,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
        device.ordinal, num_cuts_per_feature, true);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
      size_t end =
          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
      ProcessWeightedSlidingWindow(batch, info,
                                   num_cuts_per_feature,
                                   HostSketchContainer::UseGroup(info), missing, device,
                                   num_cols, begin, end, sketch_container);
    }
  } else {
    sketch_batch_num_elements = detail::SketchBatchNumElements(
        sketch_batch_num_elements,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
        device.ordinal, num_cuts_per_feature, false);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
      size_t end =
          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
      ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
                           sketch_container, num_cuts_per_feature);
    }
  }
}
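
// Rough usage sketch for adapter-based data (caller-side pseudocode; the adapter and
// SketchContainer setup are assumptions, only AdapterDeviceSketch is declared here):
//
//   SketchContainer sketch_container{/* feature types, num_bins, columns, rows, device */};
//   AdapterDeviceSketch(adapter.Value(), num_bins, info,
//                       std::numeric_limits<float>::quiet_NaN(), &sketch_container);
//   // The container accumulates per-feature quantile summaries over the sliding
//   // windows; the final HistogramCuts are then produced from it (see quantile.cuh).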
}  // namespace xgboost::common

#endif  // COMMON_HIST_UTIL_CUH_