Deterministic GPU histogram. (#5361)
* Use pre-rounding based method to obtain reproducible floating point summation. * GPU Hist for regression and classification are bit-by-bit reproducible. * Add doc. * Switch to thrust reduce for `node_sum_gradient`.
This commit is contained in:
@@ -16,12 +16,12 @@
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
|
||||
// Output sinks used by the training observer.
//  - When XGBOOST_STRICT_R_MODE is defined and equal to 1, output goes through
//    LOG(INFO) and the line terminators expand to empty strings.
//  - Otherwise output goes to std::cout with the stream precision set to 17.
// Exactly one #if/#else/#endif group and one definition per macro: stacking both
// condition variants leaves the conditional unterminated, and defining
// OBSERVER_PRINT twice with different bodies is a redefinition.
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
#define OBSERVER_PRINT LOG(INFO)
#define OBSERVER_ENDL ""
#define OBSERVER_NEWLINE ""
#else
#define OBSERVER_PRINT std::cout << std::setprecision(17)
#define OBSERVER_ENDL std::endl
#define OBSERVER_NEWLINE "\n"
#endif  // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
|
||||
|
||||
@@ -29,14 +29,14 @@ bool EllpackPageSource::Next() {
|
||||
// Stub used when XGBoost is built without CUDA: reports a fatal error.
// The trailing return only exists to satisfy the signature; `page` is declared
// exactly once and initialised to nullptr so no indeterminate pointer is read.
EllpackPage& EllpackPageSource::Value() {
  LOG(FATAL) << "Internal Error: "
                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
  EllpackPage* page { nullptr };
  return *page;
}
|
||||
|
||||
// Const overload of the non-CUDA stub: reports a fatal error.
// The trailing return only exists to satisfy the signature; `page` is declared
// exactly once and initialised to nullptr so no indeterminate pointer is read.
const EllpackPage& EllpackPageSource::Value() const {
  LOG(FATAL) << "Internal Error: "
                "XGBoost is not compiled with CUDA but EllpackPageSource is required";
  EllpackPage* page { nullptr };
  return *page;
}
|
||||
|
||||
|
||||
@@ -734,6 +734,7 @@ class LearnerImpl : public Learner {
|
||||
|
||||
monitor_.Start("PredictRaw");
|
||||
this->PredictRaw(train.get(), &predt, true);
|
||||
TrainingObserver::Instance().Observe(predt.predictions, "Predictions");
|
||||
monitor_.Stop("PredictRaw");
|
||||
|
||||
monitor_.Start("GetGradient");
|
||||
|
||||
@@ -25,6 +25,7 @@ class SamplingStrategy {
|
||||
public:
|
||||
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
|
||||
virtual GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) = 0;
|
||||
virtual ~SamplingStrategy() = default;
|
||||
};
|
||||
|
||||
/*! \brief No sampling in in-memory mode. */
|
||||
|
||||
184
src/tree/gpu_hist/histogram.cu
Normal file
184
src/tree/gpu_hist/histogram.cu
Normal file
@@ -0,0 +1,184 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/reduce.h>
|
||||
#include <thrust/iterator/transform_iterator.h>
|
||||
#include <algorithm>
|
||||
#include <ctgmath>
|
||||
#include <limits>
|
||||
|
||||
#include "xgboost/base.h"
|
||||
#include "row_partitioner.cuh"
|
||||
|
||||
#include "histogram.cuh"
|
||||
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
#include "../../common/device_helpers.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
// The following 2 functions are slightly modified versions of those in fbcuda.
|
||||
|
||||
/* \brief Constructs a rounding factor used to truncate elements in a sum such that the
|
||||
sum of the truncated elements is the same no matter what the order of the sum is.
|
||||
|
||||
* Algorithm 5: Reproducible Sequential Sum in 'Fast Reproducible Floating-Point
|
||||
* Summation' by Demmel and Nguyen
|
||||
|
||||
* In algorithm 5 the bound is calculated as $max(|v_i|) * n$. Here we use the bound
|
||||
*
|
||||
* \begin{equation}
|
||||
* max( fl(\sum^{V}_{v_i>0}{v_i}), fl(\sum^{V}_{v_i<0}|v_i|) )
|
||||
* \end{equation}
|
||||
*
|
||||
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
|
||||
*/
|
||||
/* \brief Compute a power-of-two rounding factor from a bound on the sum.
 *
 * \param max_abs Bound on the magnitude of the sum being protected.
 * \param n       Number of summands.
 *
 * \return 2^exp, where exp is the exponent std::frexp reports for
 *         delta = max_abs / (1 - 2 * n * epsilon).
 */
template <typename T>
DEV_INLINE __host__ T CreateRoundingFactor(T max_abs, int n) {
  // Evaluate 2 * n in T rather than in int: the integer product overflows once
  // n exceeds INT_MAX / 2.  Scaling by two is exact in binary floating point,
  // so the value is unchanged for every n that did not overflow.
  T const error_bound =
      static_cast<T>(2.0) * static_cast<T>(n) * std::numeric_limits<T>::epsilon();
  // NOTE(review): error_bound reaches 1 for large n (around 2^22 summands when
  // T is float), making the denominator non-positive -- confirm callers never
  // get there, or add a guard.
  T const delta = max_abs / (static_cast<T>(1.0) - error_bound);

  // frexp() returns `x` and stores `exp` such that delta = x * 2^exp with
  // |x| in [0.5, 1).  Hence 2^exp is the smallest power of two strictly
  // greater than |delta| (exp is 0 when delta is 0).
  int exp;
  std::frexp(delta, &exp);

  // return M = 2 ^ exp
  return std::ldexp(static_cast<T>(1.0), exp);
}
|
||||
|
||||
namespace {
// Two gradient pairs carried through a reduction as a single value.
struct Pair {
  GradientPair first;
  GradientPair second;
};

// Member-wise sum of two Pair values.
DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
  Pair sum;
  sum.first = lhs.first + rhs.first;
  sum.second = lhs.second + rhs.second;
  return sum;
}
}  // anonymous namespace
|
||||
|
||||
/* \brief Unary functor splitting a gradient pair into its positive and negative parts.
 *
 * For an input (g, h) the result holds
 *   first  = ( max(g, 0),  max(h, 0) )
 *   second = ( max(-g, 0), max(-h, 0) )
 * so both halves are non-negative.
 */
struct Clip : public thrust::unary_function<GradientPair, Pair> {
  // Positive part of v: v when v > 0, otherwise 0.
  static DEV_INLINE float Pclip(float v) {
    return v > 0.0f ? v : 0.0f;
  }
  // Magnitude of the negative part of v: -v when v < 0, otherwise 0.
  static DEV_INLINE float Nclip(float v) {
    // v is strictly negative on this branch, so -v is exactly |v|.  Negating
    // avoids an unqualified abs() call, which depends on overload resolution
    // picking a floating-point overload rather than the integer one.
    return v < 0.0f ? -v : 0.0f;
  }

  DEV_INLINE Pair operator()(GradientPair x) const {
    auto pg = Pclip(x.GetGrad());
    auto ph = Pclip(x.GetHess());

    auto ng = Nclip(x.GetGrad());
    auto nh = Nclip(x.GetHess());

    return { GradientPair{ pg, ph }, GradientPair{ ng, nh } };
  }
};
|
||||
|
||||
/* \brief Build the per-component (gradient, hessian) rounding factor for `gpair`.
 *
 * The positive parts and the magnitudes of the negative parts of all gradient
 * pairs are summed in one thrust reduction.  For each component the larger of
 * the two sums, together with the number of pairs, is passed to the scalar
 * CreateRoundingFactor overload.
 *
 * \param gpair Gradient pairs; the pointer is wrapped in thrust::device_ptr.
 * \return Rounding factors for the gradient and hessian components.
 */
template <typename GradientSumT>
GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair) {
  using T = typename GradientSumT::ValueT;
  dh::XGBCachingDeviceAllocator<char> alloc;

  thrust::device_ptr<GradientPair const> first {gpair.data()};
  thrust::device_ptr<GradientPair const> last {gpair.data() + gpair.size()};

  // Reduce the clipped values: `sums.first` collects the positive parts,
  // `sums.second` the magnitudes of the negative parts.
  Pair sums = thrust::reduce(thrust::cuda::par(alloc),
                             thrust::make_transform_iterator(first, Clip()),
                             thrust::make_transform_iterator(last, Clip()),
                             Pair{});
  GradientPair positive_sum {sums.first};
  GradientPair negative_sum {sums.second};

  T grad_rounding = CreateRoundingFactor<T>(
      std::max(positive_sum.GetGrad(), negative_sum.GetGrad()), gpair.size());
  T hess_rounding = CreateRoundingFactor<T>(
      std::max(positive_sum.GetHess(), negative_sum.GetHess()), gpair.size());

  return GradientSumT {grad_rounding, hess_rounding};
}

template GradientPairPrecise CreateRoundingFactor(common::Span<GradientPair const> gpair);
template GradientPair CreateRoundingFactor(common::Span<GradientPair const> gpair);
|
||||
|
||||
/* \brief Accumulate pre-rounded gradient pairs into a node histogram.
 *
 * Every element index in [0, n_elements) maps to a (row, column) slot of the
 * ELLPACK matrix.  The row's gradient pair is truncated with `rounding` and
 * atomically added to the bin stored in that slot; slots whose bin index
 * equals `matrix.info.n_bins` are skipped.
 *
 * When `use_shared_memory_histograms` is true the block accumulates into
 * dynamic shared memory, which must therefore hold `matrix.info.n_bins`
 * GradientSumT entries, and then adds its partial histogram to `d_node_hist`.
 * Otherwise every add goes directly to `d_node_hist`.
 *
 * The flag is a kernel argument, so it is the same for all threads and the
 * __syncthreads() calls guarded by it are reached by the whole block.
 */
template <typename GradientSumT>
__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
                                    common::Span<const RowPartitioner::RowIndexT> d_ridx,
                                    GradientSumT* __restrict__ d_node_hist,
                                    const GradientPair* __restrict__ d_gpair,
                                    size_t n_elements,
                                    GradientSumT const rounding,
                                    bool use_shared_memory_histograms) {
  using T = typename GradientSumT::ValueT;
  extern __shared__ char smem[];
  GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem);  // NOLINT

  if (use_shared_memory_histograms) {
    // Shared memory starts uninitialised: zero the block-local histogram.
    dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
    __syncthreads();
  }

  // Destination of the per-element adds: the block-local histogram when shared
  // memory is in use, otherwise the global histogram directly.
  GradientSumT* const accumulator =
      use_shared_memory_histograms ? smem_arr : d_node_hist;

  for (auto element : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
    int ridx = d_ridx[element / matrix.info.row_stride];
    int gidx =
        matrix.gidx_iter[ridx * matrix.info.row_stride + element % matrix.info.row_stride];
    if (gidx == matrix.info.n_bins) {
      continue;  // slot does not refer to a histogram bin
    }
    GradientPair const row_gpair = d_gpair[ridx];
    GradientSumT truncated {
      TruncateWithRoundingFactor<T>(rounding.GetGrad(), row_gpair.GetGrad()),
      TruncateWithRoundingFactor<T>(rounding.GetHess(), row_gpair.GetHess()),
    };
    dh::AtomicAddGpair(accumulator + gidx, truncated);
  }

  if (use_shared_memory_histograms) {
    // Write shared memory back to global memory once the whole block has
    // finished accumulating.
    __syncthreads();
    for (auto bin : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
      GradientSumT const partial = smem_arr[bin];
      GradientSumT truncated {
        TruncateWithRoundingFactor<T>(rounding.GetGrad(), partial.GetGrad()),
        TruncateWithRoundingFactor<T>(rounding.GetHess(), partial.GetHess()),
      };
      dh::AtomicAddGpair(d_node_hist + bin, truncated);
    }
  }
}
|
||||
|
||||
/* \brief Launch SharedMemHistKernel to accumulate the histogram of one set of rows.
 *
 * \param matrix    ELLPACK matrix holding the bin indices.
 * \param gpair     Gradient pairs, indexed by row.
 * \param d_ridx    Row indices to accumulate.
 * \param histogram Output histogram that the kernel adds into.
 * \param rounding  Rounding factor applied to each value before accumulation.
 * \param shared    Whether the kernel accumulates in shared memory first.
 *
 * One element is processed per (row, row_stride slot).  The grid is sized so
 * that each thread of a 256-thread block covers about 8 elements.
 */
template <typename GradientSumT>
void BuildGradientHistogram(EllpackMatrix const& matrix,
                            common::Span<GradientPair const> gpair,
                            common::Span<const uint32_t> d_ridx,
                            common::Span<GradientSumT> histogram,
                            GradientSumT rounding, bool shared) {
  uint32_t items_per_thread = 8;
  uint32_t block_threads = 256;

  auto n_elements = d_ridx.size() * matrix.info.row_stride;
  auto grid_size = static_cast<uint32_t>(
      common::DivRoundUp(n_elements, items_per_thread * block_threads));

  // Dynamic shared memory is only requested when the kernel will use it.
  const size_t smem_size =
      shared ? sizeof(GradientSumT) * matrix.info.n_bins : 0;

  dh::LaunchKernel {grid_size, block_threads, smem_size} (
      SharedMemHistKernel<GradientSumT>,
      matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
      rounding, shared);
}

template void BuildGradientHistogram<GradientPair>(
    EllpackMatrix const& matrix,
    common::Span<GradientPair const> gpair,
    common::Span<const uint32_t> ridx,
    common::Span<GradientPair> histogram,
    GradientPair rounding, bool shared);

template void BuildGradientHistogram<GradientPairPrecise>(
    EllpackMatrix const& matrix,
    common::Span<GradientPair const> gpair,
    common::Span<const uint32_t> ridx,
    common::Span<GradientPairPrecise> histogram,
    GradientPairPrecise rounding, bool shared);
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
29
src/tree/gpu_hist/histogram.cuh
Normal file
29
src/tree/gpu_hist/histogram.cuh
Normal file
@@ -0,0 +1,29 @@
|
||||
/*!
|
||||
* Copyright 2020 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef HISTOGRAM_CUH_
|
||||
#define HISTOGRAM_CUH_
|
||||
#include <thrust/transform.h>
|
||||
#include "../../data/ellpack_page.cuh"
|
||||
|
||||
namespace xgboost {
namespace tree {

/* \brief Compute the (gradient, hessian) rounding factors for a set of gradient pairs.
 *
 * Declared here; the definition is not part of this header.
 */
template <typename GradientSumT>
GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair);

/* \brief Truncate `x` by adding and then subtracting `rounding_factor`.
 *
 * The addition is rounded to the precision of T before the subtraction, so
 * low-order bits of `x` that cannot be represented alongside `rounding_factor`
 * are dropped.  With a rounding factor of 0 the result is `x` converted to T.
 */
template <typename T>
DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x) {
  T const shifted = rounding_factor + static_cast<T>(x);
  return shifted - rounding_factor;
}

/* \brief Accumulate the gradient histogram for the rows in `ridx`.
 *
 * \param matrix    ELLPACK matrix.
 * \param gpair     Gradient pairs.
 * \param ridx      Row indices to accumulate.
 * \param histogram Output histogram.
 * \param rounding  Rounding factor forwarded to the histogram kernel.
 * \param shared    Whether shared memory is used for accumulation.
 */
template <typename GradientSumT>
void BuildGradientHistogram(EllpackMatrix const& matrix,
                            common::Span<GradientPair const> gpair,
                            common::Span<const uint32_t> ridx,
                            common::Span<GradientSumT> histogram,
                            GradientSumT rounding, bool shared);
}  // namespace tree
}  // namespace xgboost
|
||||
|
||||
#endif // HISTOGRAM_CUH_
|
||||
@@ -91,6 +91,16 @@ struct DeviceSplitCandidate {
|
||||
}
|
||||
}
|
||||
// A split candidate is valid only when its loss change is strictly positive.
XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; }
|
||||
|
||||
// Stream a human-readable dump of the candidate: loss change, direction,
// feature index, split value and the left/right sums, ending with std::endl.
friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) {
  os << "loss_chg:" << c.loss_chg << ", ";
  os << "dir: " << c.dir << ", ";
  os << "findex: " << c.findex << ", ";
  os << "fvalue: " << c.fvalue << ", ";
  os << "left sum: " << c.left_sum << ", ";
  os << "right sum: " << c.right_sum << std::endl;
  return os;
}
|
||||
};
|
||||
|
||||
struct DeviceSplitCandidateReduceOp {
|
||||
@@ -186,6 +196,5 @@ struct SumCallbackOp {
|
||||
// Returns 2^(depth + 1) - 1, i.e. the number of nodes of a complete binary
// tree with levels 0..depth.
XGBOOST_DEVICE inline int MaxNodesDepth(int depth) {
  int const n_levels = depth + 1;
  return (1 << n_levels) - 1;
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright 2017-2019 XGBoost contributors
|
||||
* Copyright 2017-2020 XGBoost contributors
|
||||
*/
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/functional.h>
|
||||
@@ -31,10 +31,10 @@
|
||||
#include "constraints.cuh"
|
||||
#include "gpu_hist/gradient_based_sampler.cuh"
|
||||
#include "gpu_hist/row_partitioner.cuh"
|
||||
#include "gpu_hist/histogram.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
#if !defined(GTEST_TEST)
|
||||
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
#endif // !defined(GTEST_TEST)
|
||||
@@ -43,6 +43,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
|
||||
struct GPUHistMakerTrainParam
|
||||
: public XGBoostParameter<GPUHistMakerTrainParam> {
|
||||
bool single_precision_histogram;
|
||||
bool deterministic_histogram;
|
||||
// number of rows in a single GPU batch
|
||||
int gpu_batch_nrows;
|
||||
bool debug_synchronize;
|
||||
@@ -50,6 +51,8 @@ struct GPUHistMakerTrainParam
|
||||
DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
|
||||
DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
|
||||
"Use single precision to build histograms.");
|
||||
DMLC_DECLARE_FIELD(deterministic_histogram).set_default(true).describe(
|
||||
"Pre-round the gradient for obtaining deterministic gradient histogram.");
|
||||
DMLC_DECLARE_FIELD(gpu_batch_nrows)
|
||||
.set_lower_bound(-1)
|
||||
.set_default(0)
|
||||
@@ -336,6 +339,9 @@ class DeviceHistogram {
|
||||
// True when `nidx` has an entry in nidx_map_.
bool HistogramExists(int nidx) const {
  auto const found = nidx_map_.find(nidx);
  return found != nidx_map_.cend();
}
|
||||
// Number of bins per histogram, as stored in n_bins_.
int Bins() const { return n_bins_; }
|
||||
// Size of one histogram: n_bins_ entries of kNumItemsInGradientSum items each.
size_t HistogramSize() const {
  size_t const n_items = n_bins_ * kNumItemsInGradientSum;
  return n_items;
}
|
||||
@@ -402,40 +408,6 @@ struct CalcWeightTrainParam {
|
||||
learning_rate(p.learning_rate) {}
|
||||
};
|
||||
|
||||
template <typename GradientSumT>
|
||||
__global__ void SharedMemHistKernel(xgboost::EllpackMatrix matrix,
|
||||
common::Span<const RowPartitioner::RowIndexT> d_ridx,
|
||||
GradientSumT* d_node_hist,
|
||||
const GradientPair* d_gpair, size_t n_elements,
|
||||
bool use_shared_memory_histograms) {
|
||||
extern __shared__ char smem[];
|
||||
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
|
||||
if (use_shared_memory_histograms) {
|
||||
dh::BlockFill(smem_arr, matrix.info.n_bins, GradientSumT());
|
||||
__syncthreads();
|
||||
}
|
||||
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
|
||||
int ridx = d_ridx[idx / matrix.info.row_stride];
|
||||
int gidx =
|
||||
matrix.gidx_iter[ridx * matrix.info.row_stride + idx % matrix.info.row_stride];
|
||||
if (gidx != matrix.info.n_bins) {
|
||||
// If we are not using shared memory, accumulate the values directly into
|
||||
// global memory
|
||||
GradientSumT* atomic_add_ptr =
|
||||
use_shared_memory_histograms ? smem_arr : d_node_hist;
|
||||
dh::AtomicAddGpair(atomic_add_ptr + gidx, d_gpair[ridx]);
|
||||
}
|
||||
}
|
||||
|
||||
if (use_shared_memory_histograms) {
|
||||
// Write shared memory back to global memory
|
||||
__syncthreads();
|
||||
for (auto i : dh::BlockStrideRange(static_cast<size_t>(0), matrix.info.n_bins)) {
|
||||
dh::AtomicAddGpair(d_node_hist + i, smem_arr[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Manage memory for a single GPU
|
||||
template <typename GradientSumT>
|
||||
struct GPUHistMakerDevice {
|
||||
@@ -460,9 +432,12 @@ struct GPUHistMakerDevice {
|
||||
bst_uint n_rows;
|
||||
|
||||
TrainParam param;
|
||||
bool deterministic_histogram;
|
||||
bool prediction_cache_initialised;
|
||||
bool use_shared_memory_histograms {false};
|
||||
|
||||
GradientSumT histogram_rounding;
|
||||
|
||||
dh::CubMemory temp_memory;
|
||||
dh::PinnedMemory pinned_memory;
|
||||
|
||||
@@ -486,6 +461,7 @@ struct GPUHistMakerDevice {
|
||||
TrainParam _param,
|
||||
uint32_t column_sampler_seed,
|
||||
uint32_t n_features,
|
||||
bool deterministic_histogram,
|
||||
BatchParam _batch_param)
|
||||
: device_id(_device_id),
|
||||
page(_page),
|
||||
@@ -494,6 +470,7 @@ struct GPUHistMakerDevice {
|
||||
prediction_cache_initialised(false),
|
||||
column_sampler(column_sampler_seed),
|
||||
interaction_constraints(param, n_features),
|
||||
deterministic_histogram{deterministic_histogram},
|
||||
batch_param(_batch_param) {
|
||||
sampler.reset(new GradientBasedSampler(page,
|
||||
n_rows,
|
||||
@@ -551,6 +528,12 @@ struct GPUHistMakerDevice {
|
||||
page = sample.page;
|
||||
gpair = sample.gpair;
|
||||
|
||||
if (deterministic_histogram) {
|
||||
histogram_rounding = CreateRoundingFactor<GradientSumT>(this->gpair);
|
||||
} else {
|
||||
histogram_rounding = GradientSumT{0.0, 0.0};
|
||||
}
|
||||
|
||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||
row_partitioner.reset(new RowPartitioner(device_id, n_rows));
|
||||
hist.Reset();
|
||||
@@ -644,20 +627,8 @@ struct GPUHistMakerDevice {
|
||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||
auto d_gpair = gpair.data();
|
||||
|
||||
auto n_elements = d_ridx.size() * page->matrix.info.row_stride;
|
||||
|
||||
const size_t smem_size =
|
||||
use_shared_memory_histograms
|
||||
? sizeof(GradientSumT) * page->matrix.info.n_bins
|
||||
: 0;
|
||||
uint32_t items_per_thread = 8;
|
||||
uint32_t block_threads = 256;
|
||||
auto grid_size = static_cast<uint32_t>(
|
||||
common::DivRoundUp(n_elements, items_per_thread * block_threads));
|
||||
dh::LaunchKernel {grid_size, block_threads, smem_size} (
|
||||
SharedMemHistKernel<GradientSumT>,
|
||||
page->matrix, d_ridx, d_node_hist.data(), d_gpair, n_elements,
|
||||
use_shared_memory_histograms);
|
||||
BuildGradientHistogram(page->matrix, gpair, d_ridx, d_node_hist,
|
||||
histogram_rounding, use_shared_memory_histograms);
|
||||
}
|
||||
|
||||
void SubtractionTrick(int nidx_parent, int nidx_histogram,
|
||||
@@ -707,7 +678,7 @@ struct GPUHistMakerDevice {
|
||||
// After tree update is finished, update the position of all training
|
||||
// instances to their final leaf. This information is used later to update the
|
||||
// prediction cache
|
||||
void FinalisePosition(RegTree* p_tree, DMatrix* p_fmat) {
|
||||
void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat) {
|
||||
const auto d_nodes =
|
||||
temp_memory.GetSpan<RegTree::Node>(p_tree->GetNodes().size());
|
||||
dh::safe_cuda(cudaMemcpy(d_nodes.data(), p_tree->GetNodes().data(),
|
||||
@@ -870,16 +841,21 @@ struct GPUHistMakerDevice {
|
||||
}
|
||||
|
||||
void InitRoot(RegTree* p_tree, dh::AllReducer* reducer, int64_t num_columns) {
|
||||
constexpr int kRootNIdx = 0;
|
||||
|
||||
dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, gpair.size());
|
||||
constexpr bst_node_t kRootNIdx = 0;
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
GradientPair root_sum = thrust::reduce(
|
||||
thrust::cuda::par(alloc),
|
||||
thrust::device_ptr<GradientPair const>(gpair.data()),
|
||||
thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()));
|
||||
dh::safe_cuda(cudaMemcpyAsync(node_sum_gradients_d.data(), &root_sum, sizeof(root_sum),
|
||||
cudaMemcpyHostToDevice));
|
||||
reducer->AllReduceSum(
|
||||
reinterpret_cast<float*>(node_sum_gradients_d.data()),
|
||||
reinterpret_cast<float*>(node_sum_gradients_d.data()), 2);
|
||||
reducer->Synchronize();
|
||||
dh::safe_cuda(cudaMemcpy(node_sum_gradients.data(),
|
||||
node_sum_gradients_d.data(), sizeof(GradientPair),
|
||||
cudaMemcpyDeviceToHost));
|
||||
dh::safe_cuda(cudaMemcpyAsync(node_sum_gradients.data(),
|
||||
node_sum_gradients_d.data(), sizeof(GradientPair),
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
this->BuildHist(kRootNIdx);
|
||||
this->AllReduceHist(kRootNIdx, reducer);
|
||||
@@ -1055,6 +1031,7 @@ class GPUHistMakerSpecialised {
|
||||
param_,
|
||||
column_sampling_seed,
|
||||
info_->num_col_,
|
||||
hist_maker_param_.deterministic_histogram,
|
||||
batch_param));
|
||||
|
||||
monitor_.StartCuda("InitHistogram");
|
||||
|
||||
Reference in New Issue
Block a user