Combine thread launches into single launch per tree for gpu_hist (#4343)

* Combine thread launches into a single launch per tree for the gpu_hist algorithm.
* Address deprecation warning
* Add manual column sampler constructor
* Turn off omp dynamic to get a guaranteed number of threads
* Enable OpenMP in CUDA code

This commit is contained in:
parent 146e83f3b3
commit 5e582b0fa7
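The central change: the updater used to return to the master thread after every per-node operation, dispatching a fresh OpenMP region over the device shards for each BuildHist, UpdatePosition and EvaluateSplits call; after this commit there is a single thread launch per tree, and each thread drives its DeviceShard through the whole tree build, meeting the other threads only at the histogram all-reduce points. A minimal sketch of the pattern (hypothetical Shard type, not the actual XGBoost classes):

    #include <omp.h>

    #include <vector>

    struct Shard {
      // Builds the complete tree for this shard's rows, calling into an
      // all-reducer at each histogram synchronisation point.
      void UpdateTree() { /* Reset -> InitRoot -> expand loop -> FinalisePosition */ }
    };

    // One launch per tree: each shard gets one long-lived thread instead of
    // being re-dispatched from the master thread for every tree node.
    void BuildOneTree(std::vector<Shard>* shards) {
      const long n = static_cast<long>(shards->size());
    #pragma omp parallel for schedule(static, 1) if (n > 1)
      for (long i = 0; i < n; ++i) {
        shards->at(i).UpdateTree();
      }
    }

    int main() {
      std::vector<Shard> shards(2);
      BuildOneTree(&shards);
      return 0;
    }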
@@ -62,6 +62,13 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
     DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0)
         .describe("Size of leaf vector, reserved for vector tree");
   }
+
+  bool operator==(const TreeParam& b) const {
+    return num_roots == b.num_roots && num_nodes == b.num_nodes &&
+           num_deleted == b.num_deleted && max_depth == b.max_depth &&
+           num_feature == b.num_feature &&
+           size_leaf_vector == b.size_leaf_vector;
+  }
 };

 /*! \brief node statistics used in regression tree */
@@ -74,6 +81,10 @@ struct RTreeNodeStat {
   bst_float base_weight;
   /*! \brief number of child that is leaf node known up to now */
   int leaf_child_cnt;
+  bool operator==(const RTreeNodeStat& b) const {
+    return loss_chg == b.loss_chg && sum_hess == b.sum_hess &&
+           base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt;
+  }
 };

 /*!
@@ -188,6 +199,11 @@ class RegTree {
       if (is_left_child) pidx |= (1U << 31);
       this->parent_ = pidx;
     }
+    bool operator==(const Node& b) const {
+      return parent_ == b.parent_ && cleft_ == b.cleft_ &&
+             cright_ == b.cright_ && sindex_ == b.sindex_ &&
+             info_.leaf_value == b.info_.leaf_value;
+    }

    private:
     /*!
@@ -304,6 +320,11 @@ class RegTree {
     fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size());
   }

+  bool operator==(const RegTree& b) const {
+    return nodes_ == b.nodes_ && stats_ == b.stats_ &&
+           deleted_nodes_ == b.deleted_nodes_ && param == b.param;
+  }
+
   /**
    * \brief Expands a leaf node into two additional leaf nodes.
    *
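The operator== overloads added above give TreeParam, RTreeNodeStat, RegTree::Node and ultimately RegTree memberwise equality; they exist so the debug_synchronize check added later in this commit can assert CHECK(tree == reference_tree) across workers. Exact floating-point comparison is deliberate: redundantly built trees are expected to match bit for bit. A compilable sketch of the idiom (hypothetical NodeStat type; std::tie is shown as a compact alternative to the hand-written && chains, not what the commit itself uses):

    #include <cassert>
    #include <tuple>

    struct NodeStat {
      float loss_chg, sum_hess, base_weight;
      int leaf_child_cnt;
      bool operator==(const NodeStat& b) const {
        // Memberwise comparison via tuples of references.
        return std::tie(loss_chg, sum_hess, base_weight, leaf_child_cnt) ==
               std::tie(b.loss_chg, b.sum_hess, b.base_weight, b.leaf_child_cnt);
      }
    };

    int main() {
      NodeStat a{0.5f, 2.0f, 0.1f, 1};
      NodeStat b = a;
      assert(a == b);  // bitwise-identical copies compare equal
      return 0;
    }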
@@ -57,6 +57,12 @@ if (USE_CUDA)
     target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
   endif (USE_NVTX)

+  # OpenMP is mandatory for cuda version
+  find_package(OpenMP REQUIRED)
+  target_compile_options(objxgboost PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
+  )
+
   set_target_properties(objxgboost PROPERTIES
     CUDA_SEPARABLE_COMPILATION OFF)
 else (USE_CUDA)
@@ -12,6 +12,7 @@
 #include "span.h"

 #include <algorithm>
+#include <omp.h>
 #include <chrono>
 #include <ctime>
 #include <cub/cub.cuh>
@@ -752,6 +753,29 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
   });
 }

+class SaveCudaContext {
+ private:
+  int saved_device_;
+
+ public:
+  template <typename Functor>
+  explicit SaveCudaContext (Functor func) : saved_device_{-1} {
+    // When compiled with CUDA but running on CPU only device,
+    // cudaGetDevice will fail.
+    try {
+      safe_cuda(cudaGetDevice(&saved_device_));
+    } catch (const dmlc::Error &except) {
+      saved_device_ = -1;
+    }
+    func();
+  }
+  ~SaveCudaContext() {
+    if (saved_device_ != -1) {
+      safe_cuda(cudaSetDevice(saved_device_));
+    }
+  }
+};
+
 /**
  * \class AllReducer
 *
@@ -777,8 +801,18 @@ class AllReducer {
         allreduce_calls_(0) {}

   /**
-   * \fn void Init(const std::vector<int> &device_ordinals)
-   *
+   * \brief If we are using a single GPU only
+   */
+  bool IsSingleGPU() {
+#ifdef XGBOOST_USE_NCCL
+    CHECK(device_counts.size() > 0) << "AllReducer not initialised.";
+    return device_counts.size() <= 1 && device_counts.at(0) == 1;
+#else
+    return true;
+#endif
+  }
+
+  /**
    * \brief Initialise with the desired device ordinals for this communication
    * group.
    *
@@ -956,6 +990,22 @@ class AllReducer {
 #endif
   };

+  /**
+   * \brief Synchronizes the device
+   *
+   * \param device_id Identifier for the device.
+   */
+  void Synchronize(int device_id) {
+#ifdef XGBOOST_USE_NCCL
+    SaveCudaContext([&]() {
+      dh::safe_cuda(cudaSetDevice(device_id));
+      int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin();
+      CHECK(idx < device_ordinals.size());
+      dh::safe_cuda(cudaStreamSynchronize(streams[idx]));
+    });
+#endif
+  };
+
 #ifdef XGBOOST_USE_NCCL
   /**
    * \fn ncclUniqueId GetUniqueId()
@@ -980,29 +1030,6 @@ class AllReducer {
 #endif
   };

-class SaveCudaContext {
- private:
-  int saved_device_;
-
- public:
-  template <typename Functor>
-  explicit SaveCudaContext (Functor func) : saved_device_{-1} {
-    // When compiled with CUDA but running on CPU only device,
-    // cudaGetDevice will fail.
-    try {
-      safe_cuda(cudaGetDevice(&saved_device_));
-    } catch (const dmlc::Error &except) {
-      saved_device_ = -1;
-    }
-    func();
-  }
-  ~SaveCudaContext() {
-    if (saved_device_ != -1) {
-      safe_cuda(cudaSetDevice(saved_device_));
-    }
-  }
-};
-
 /**
  * \brief Executes some operation on each element of the input vector, using a
  * single controlling thread for each element. In addition, passes the shard index
@@ -1017,11 +1044,15 @@ class SaveCudaContext {
 template <typename T, typename FunctionT>
 void ExecuteIndexShards(std::vector<T> *shards, FunctionT f) {
   SaveCudaContext{[&]() {
+    // Temporarily turn off dynamic so we have a guaranteed number of threads
+    bool dynamic = omp_get_dynamic();
+    omp_set_dynamic(false);
    const long shards_size = static_cast<long>(shards->size());
 #pragma omp parallel for schedule(static, 1) if (shards_size > 1)
    for (long shard = 0; shard < shards_size; ++shard) {
      f(shard, shards->at(shard));
    }
+    omp_set_dynamic(dynamic);
  }};
 }

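Why the omp_set_dynamic dance matters: with dynamic adjustment enabled, the OpenMP runtime may give a parallel region fewer threads than requested, but the cooperative build relies on every shard getting its own thread for the whole tree (each one must reach the histogram all-reduce). A standalone illustration of the save/disable/restore pattern, using only plain OpenMP:

    #include <omp.h>

    #include <cstdio>

    int main() {
      bool dynamic = omp_get_dynamic();  // remember the current setting
      omp_set_dynamic(false);            // request an exact thread count
    #pragma omp parallel num_threads(4)
      {
        // With dynamic adjustment off, exactly 4 threads execute this region
        // (subject to OMP_THREAD_LIMIT and nesting restrictions).
    #pragma omp single
        std::printf("threads: %d\n", omp_get_num_threads());
      }
      omp_set_dynamic(dynamic);          // restore the previous setting
      return 0;
    }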
@@ -113,6 +113,14 @@ class ColumnSampler {
   }

  public:
+  /**
+   * \brief Column sampler constructor.
+   * \note This constructor manually sets the rng seed
+   */
+  explicit ColumnSampler(uint32_t seed) {
+    rng_.seed(seed);
+  }
+
   /**
    * \brief Column sampler constructor.
    * \note This constructor synchronizes the RNG seed across processes.
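The seeded constructor exists because each DeviceShard now owns its own ColumnSampler, and all shards must draw identical feature subsets without communicating; they are therefore all constructed with one shared seed (broadcast from rank 0 in the InitDataOnce hunk further down). A toy demonstration of the invariant, with a plain std::mt19937 standing in for ColumnSampler's rng:

    #include <cassert>
    #include <cstdint>
    #include <random>

    int main() {
      const uint32_t shared_seed = 42;  // in the commit this value comes from
                                        // common::GlobalRandom()() + rabit::Broadcast
      std::mt19937 shard_a(shared_seed);
      std::mt19937 shard_b(shared_seed);
      // Identically seeded generators produce identical streams, so two shards
      // sampling columns independently still agree on every feature subset.
      for (int draw = 0; draw < 100; ++draw) {
        assert(shard_a() == shard_b());
      }
      return 0;
    }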
@@ -342,7 +342,8 @@ class GPUPredictor : public xgboost::Predictor {
   }

  public:
-  GPUPredictor() : cpu_predictor_(Predictor::Create("cpu_predictor")) {}
+  GPUPredictor()                                               // NOLINT
+      : cpu_predictor_(Predictor::Create("cpu_predictor")) {}  // NOLINT

   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
@@ -38,6 +38,7 @@ struct GPUHistMakerTrainParam
   bool single_precision_histogram;
   // number of rows in a single GPU batch
   int gpu_batch_nrows;
+  bool debug_synchronize;
   // declare parameters
   DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) {
     DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe(
@@ -47,6 +48,8 @@ struct GPUHistMakerTrainParam
         .set_default(0)
         .describe("Number of rows in a GPU batch, used for finding quantiles on GPU; "
                   "-1 to use all rows assignted to a GPU, and 0 to auto-deduce");
+    DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe(
+        "Check if all distributed tree are identical after tree construction.");
   }
 };
 #if !defined(GTEST_TEST)
@@ -598,12 +601,23 @@ inline void SortPosition(dh::CubMemory* temp_memory, common::Span<int> position,
 }

 /*! \brief Count how many rows are assigned to left node. */
-__forceinline__ __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
+__forceinline__ __device__ void CountLeft(int64_t* d_count, int val,
+                                          int left_nidx) {
+#if __CUDACC_VER_MAJOR__ > 8
+  int mask = __activemask();
+  unsigned ballot = __ballot_sync(mask, val == left_nidx);
+  int leader = __ffs(mask) - 1;
+  if (threadIdx.x % 32 == leader) {
+    atomicAdd(reinterpret_cast<unsigned long long*>(d_count),    // NOLINT
+              static_cast<unsigned long long>(__popc(ballot)));  // NOLINT
+  }
+#else
   unsigned ballot = __ballot(val == left_nidx);
   if (threadIdx.x % 32 == 0) {
     atomicAdd(reinterpret_cast<unsigned long long*>(d_count),    // NOLINT
               static_cast<unsigned long long>(__popc(ballot)));  // NOLINT
   }
+#endif
 }

 template <typename GradientSumT>
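This is the deprecation fix from the commit message: since CUDA 9 the unqualified warp vote intrinsics such as __ballot are deprecated in favour of the *_sync variants that take an explicit participation mask, and under Volta's independent thread scheduling lane 0 can no longer be assumed active, hence the __ffs(mask) leader election above. A self-contained kernel using the same warp-aggregated atomic pattern (hypothetical predicate, not the XGBoost kernel):

    // count_positive.cu -- one atomic per warp instead of one per thread
    __global__ void CountPositive(const int* vals, int n,
                                  unsigned long long* count) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= n) return;              // exited lanes drop out of __activemask()
      unsigned mask = __activemask();  // lanes still executing here
      unsigned ballot = __ballot_sync(mask, vals[i] > 0);
      int leader = __ffs(mask) - 1;    // lowest active lane votes for the warp
      if (threadIdx.x % 32 == leader) {
        atomicAdd(count, static_cast<unsigned long long>(__popc(ballot)));
      }
    }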
@@ -621,6 +635,7 @@ template <typename GradientSumT>
 struct DeviceShard {
   int n_bins;
   int device_id;
+  int shard_idx;  // Position in the local array of shards

   dh::BulkAllocator ba;

@@ -670,18 +685,31 @@ struct DeviceShard {

   std::vector<cudaStream_t> streams;

+  common::Monitor monitor;
+  std::vector<ValueConstraint> node_value_constraints;
+  common::ColumnSampler column_sampler;
+
   std::unique_ptr<GPUHistBuilderBase<GradientSumT>> hist_builder;

+  using ExpandQueue =
+      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
+                          std::function<bool(ExpandEntry, ExpandEntry)>>;
+  std::unique_ptr<ExpandQueue> qexpand;
+
   // TODO(canonizer): do add support multi-batch DMatrix here
-  DeviceShard(int _device_id, bst_uint row_begin, bst_uint row_end,
-              TrainParam _param)
+  DeviceShard(int _device_id, int shard_idx, bst_uint row_begin,
+              bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed)
       : device_id(_device_id),
+        shard_idx(shard_idx),
         row_begin_idx(row_begin),
         row_end_idx(row_end),
         n_rows(row_end - row_begin),
         n_bins(0),
         param(std::move(_param)),
-        prediction_cache_initialised(false) {}
+        prediction_cache_initialised(false),
+        column_sampler(column_sampler_seed) {
+    monitor.Init(std::string("DeviceShard") + std::to_string(device_id));
+  }

   /* Init row_ptrs and row_stride */
   size_t InitRowPtrs(const SparsePage& row_batch) {
@@ -736,7 +764,16 @@ struct DeviceShard {
   }

   // Reset values for each update iteration
-  void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
+  // Note that the column sampler must be passed by value because it is not
+  // thread safe
+  void Reset(HostDeviceVector<GradientPair>* dh_gpair, int64_t num_columns) {
+    if (param.grow_policy == TrainParam::kLossGuide) {
+      qexpand.reset(new ExpandQueue(LossGuide));
+    } else {
+      qexpand.reset(new ExpandQueue(DepthWise));
+    }
+    this->column_sampler.Init(num_columns, param.colsample_bynode,
+                              param.colsample_bylevel, param.colsample_bytree);
     dh::safe_cuda(cudaSetDevice(device_id));
     thrust::fill(
         thrust::device_pointer_cast(position.Current()),
@@ -764,8 +801,6 @@ struct DeviceShard {

   std::vector<DeviceSplitCandidate> EvaluateSplits(
       std::vector<int> nidxs, const RegTree& tree,
-      common::ColumnSampler* column_sampler,
-      const std::vector<ValueConstraint>& value_constraints,
       size_t num_columns) {
     dh::safe_cuda(cudaSetDevice(device_id));
     auto result = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
@@ -800,7 +835,7 @@ struct DeviceShard {
     auto& streams = this->GetStreams(nidxs.size());
     for (auto i = 0ull; i < nidxs.size(); i++) {
       auto nidx = nidxs[i];
-      auto p_feature_set = column_sampler->GetFeatureSet(tree.GetDepth(nidx));
+      auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
       p_feature_set->Reshard(GPUSet(device_id, 1));
       auto d_feature_set = p_feature_set->DeviceSpan(device_id);
       auto d_split_candidates =
@@ -812,7 +847,7 @@ struct DeviceShard {
       EvaluateSplitKernel<kBlockThreads, GradientSumT>
           <<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
               hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
-              gpu_param, d_split_candidates, value_constraints[nidx],
+              gpu_param, d_split_candidates, node_value_constraints[nidx],
               monotone_constraints);

     // Reduce over features to find best feature
@@ -997,6 +1032,179 @@ struct DeviceShard {
                              out_preds_d, prediction_cache.data(),
                              prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
   }

+  void AllReduceHist(int nidx, dh::AllReducer* reducer) {
+    monitor.StartCuda("AllReduce");
+    auto d_node_hist = hist.GetNodeHistogram(nidx).data();
+    reducer->AllReduceSum(
+        shard_idx,
+        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+        reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
+        ellpack_matrix.BinCount() *
+            (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
+    reducer->Synchronize(device_id);
+
+    monitor.StopCuda("AllReduce");
+  }
+
+  /**
+   * \brief Build GPU local histograms for the left and right child of some parent node
+   */
+  void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) {
+    auto build_hist_nidx = nidx_left;
+    auto subtraction_trick_nidx = nidx_right;
+
+    // If we are using a single GPU, build the histogram for the node with the
+    // fewest training instances
+    // If we are distributed, don't bother
+    if (reducer->IsSingleGPU()) {
+      bool fewer_right =
+          ridx_segments[nidx_right].Size() < ridx_segments[nidx_left].Size();
+      if (fewer_right) {
+        std::swap(build_hist_nidx, subtraction_trick_nidx);
+      }
+    }
+
+    this->BuildHist(build_hist_nidx);
+    this->AllReduceHist(build_hist_nidx, reducer);
+
+    // Check whether we can use the subtraction trick to calculate the other
+    bool do_subtraction_trick = this->CanDoSubtractionTrick(
+        nidx_parent, build_hist_nidx, subtraction_trick_nidx);
+
+    if (do_subtraction_trick) {
+      // Calculate other histogram using subtraction trick
+      this->SubtractionTrick(nidx_parent, build_hist_nidx,
+                             subtraction_trick_nidx);
+    } else {
+      // Calculate other histogram manually
+      this->BuildHist(subtraction_trick_nidx);
+      this->AllReduceHist(subtraction_trick_nidx, reducer);
+    }
+  }
+
+  void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
+    RegTree& tree = *p_tree;
+
+    GradStats left_stats;
+    left_stats.Add(candidate.split.left_sum);
+    GradStats right_stats;
+    right_stats.Add(candidate.split.right_sum);
+    GradStats parent_sum;
+    parent_sum.Add(left_stats);
+    parent_sum.Add(right_stats);
+    node_value_constraints.resize(tree.GetNodes().size());
+    auto base_weight = node_value_constraints[candidate.nid].CalcWeight(param, parent_sum);
+    auto left_weight =
+        node_value_constraints[candidate.nid].CalcWeight(param, left_stats)*param.learning_rate;
+    auto right_weight =
+        node_value_constraints[candidate.nid].CalcWeight(param, right_stats)*param.learning_rate;
+    tree.ExpandNode(candidate.nid, candidate.split.findex,
+                    candidate.split.fvalue, candidate.split.dir == kLeftDir,
+                    base_weight, left_weight, right_weight,
+                    candidate.split.loss_chg, parent_sum.sum_hess);
+    // Set up child constraints
+    node_value_constraints.resize(tree.GetNodes().size());
+    node_value_constraints[candidate.nid].SetChild(
+        param, tree[candidate.nid].SplitIndex(), left_stats, right_stats,
+        &node_value_constraints[tree[candidate.nid].LeftChild()],
+        &node_value_constraints[tree[candidate.nid].RightChild()]);
+    node_sum_gradients[tree[candidate.nid].LeftChild()] =
+        candidate.split.left_sum;
+    node_sum_gradients[tree[candidate.nid].RightChild()] =
+        candidate.split.right_sum;
+  }
+
+  void InitRoot(RegTree* p_tree, HostDeviceVector<GradientPair>* gpair_all,
+                dh::AllReducer* reducer, int64_t num_columns) {
+    constexpr int kRootNIdx = 0;
+
+    const auto &gpair = gpair_all->DeviceSpan(device_id);
+
+    dh::SumReduction(temp_memory, gpair, node_sum_gradients_d,
+                     gpair.size());
+    reducer->AllReduceSum(
+        shard_idx, reinterpret_cast<float*>(node_sum_gradients_d.data()),
+        reinterpret_cast<float*>(node_sum_gradients_d.data()), 2);
+    reducer->Synchronize(device_id);
+    dh::safe_cuda(cudaMemcpy(node_sum_gradients.data(),
+                             node_sum_gradients_d.data(), sizeof(GradientPair),
+                             cudaMemcpyDeviceToHost));
+
+    this->BuildHist(kRootNIdx);
+    this->AllReduceHist(kRootNIdx, reducer);
+
+    // Remember root stats
+    p_tree->Stat(kRootNIdx).sum_hess = node_sum_gradients[kRootNIdx].GetHess();
+    auto weight = CalcWeight(param, node_sum_gradients[kRootNIdx]);
+    p_tree->Stat(kRootNIdx).base_weight = weight;
+    (*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight);
+
+    // Initialise root constraint
+    node_value_constraints.resize(p_tree->GetNodes().size());
+
+    // Generate first split
+    auto split = this->EvaluateSplits({kRootNIdx}, *p_tree, num_columns);
+    qexpand->push(
+        ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0));
+  }
+
+  void UpdateTree(HostDeviceVector<GradientPair>* gpair_all, DMatrix* p_fmat,
+                  RegTree* p_tree, dh::AllReducer* reducer) {
+    auto& tree = *p_tree;
+    monitor.StartCuda("Reset");
+    this->Reset(gpair_all, p_fmat->Info().num_col_);
+    monitor.StopCuda("Reset");
+
+    monitor.StartCuda("InitRoot");
+    this->InitRoot(p_tree, gpair_all, reducer, p_fmat->Info().num_col_);
+    monitor.StopCuda("InitRoot");
+
+    auto timestamp = qexpand->size();
+    auto num_leaves = 1;
+
+    while (!qexpand->empty()) {
+      ExpandEntry candidate = qexpand->top();
+      qexpand->pop();
+      if (!candidate.IsValid(param, num_leaves)) {
+        continue;
+      }
+
+      this->ApplySplit(candidate, p_tree);
+
+      num_leaves++;
+
+      int left_child_nidx = tree[candidate.nid].LeftChild();
+      int right_child_nidx = tree[candidate.nid].RightChild();
+      // Only create child entries if needed
+      if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
+                                    num_leaves)) {
+        monitor.StartCuda("UpdatePosition");
+        this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
+        monitor.StopCuda("UpdatePosition");
+
+        monitor.StartCuda("BuildHist");
+        this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer);
+        monitor.StopCuda("BuildHist");
+
+        monitor.StartCuda("EvaluateSplits");
+        auto splits = this->EvaluateSplits({left_child_nidx, right_child_nidx},
+                                           *p_tree, p_fmat->Info().num_col_);
+        monitor.StopCuda("EvaluateSplits");
+
+        qexpand->push(ExpandEntry(left_child_nidx,
+                                  tree.GetDepth(left_child_nidx), splits.at(0),
+                                  timestamp++));
+        qexpand->push(ExpandEntry(right_child_nidx,
+                                  tree.GetDepth(right_child_nidx),
+                                  splits.at(1), timestamp++));
+      }
+    }
+
+    monitor.StartCuda("FinalisePosition");
+    this->FinalisePosition(p_tree);
+    monitor.StopCuda("FinalisePosition");
+  }
 };

 template <typename GradientSumT>
@@ -1179,12 +1387,6 @@ class GPUHistMakerSpecialised {

     dh::CheckComputeCapability();

-    if (param_.grow_policy == TrainParam::kLossGuide) {
-      qexpand_.reset(new ExpandQueue(LossGuide));
-    } else {
-      qexpand_.reset(new ExpandQueue(DepthWise));
-    }
-
     monitor_.Init("updater_gpu_hist");
   }

@@ -1223,17 +1425,23 @@ class GPUHistMakerSpecialised {

     auto batch_iter = dmat->GetRowBatches().begin();
     const SparsePage& batch = *batch_iter;
+
+    // Synchronise the column sampling seed
+    uint32_t column_sampling_seed = common::GlobalRandom()();
+    rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
+
     // Create device shards
     shards_.resize(n_devices);
     dh::ExecuteIndexShards(
         &shards_,
-        [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i)));
-          size_t start = dist_.ShardStart(info_->num_row_, i);
-          size_t size = dist_.ShardSize(info_->num_row_, i);
+        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+          dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx)));
+          size_t start = dist_.ShardStart(info_->num_row_, idx);
+          size_t size = dist_.ShardSize(info_->num_row_, idx);
           shard = std::unique_ptr<DeviceShard<GradientSumT>>(
-              new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(i), start,
-                                            start + size, param_));
+              new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(idx), idx,
+                                            start, start + size, param_,
+                                            column_sampling_seed));
         });

     // Find the cuts.
@@ -1264,277 +1472,61 @@ class GPUHistMakerSpecialised {
       this->InitDataOnce(dmat);
       monitor_.StopCuda("InitDataOnce");
     }

-    column_sampler_.Init(info_->num_col_, param_.colsample_bynode,
-                         param_.colsample_bylevel, param_.colsample_bytree);
-
-    // Copy gpair & reset memory
-    monitor_.StartCuda("InitDataReset");
-
-    gpair->Reshard(dist_);
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->Reset(gpair);
-        });
-    monitor_.StopCuda("InitDataReset");
   }

-  void AllReduceHist(int nidx) {
-    if (shards_.size() == 1 && !rabit::IsDistributed()) {
-      return;
-    }
-    monitor_.StartCuda("AllReduce");
-
-    reducer_.GroupStart();
-    for (auto& shard : shards_) {
-      auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
-      reducer_.AllReduceSum(
-          dist_.Devices().Index(shard->device_id),
-          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
-          n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
-    }
-    reducer_.GroupEnd();
-    reducer_.Synchronize();
-
-    monitor_.StopCuda("AllReduce");
-  }
-
-  /**
-   * \brief Build GPU local histograms for the left and right child of some parent node
-   */
-  void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right) {
-    size_t left_node_max_elements = 0;
-    size_t right_node_max_elements = 0;
-    for (auto& shard : shards_) {
-      left_node_max_elements = (std::max)(
-          left_node_max_elements, shard->ridx_segments[nidx_left].Size());
-      right_node_max_elements = (std::max)(
-          right_node_max_elements, shard->ridx_segments[nidx_right].Size());
-    }
-
-    rabit::Allreduce<rabit::op::Max, size_t>(&left_node_max_elements, 1);
-    rabit::Allreduce<rabit::op::Max, size_t>(&right_node_max_elements, 1);
-
-    auto build_hist_nidx = nidx_left;
-    auto subtraction_trick_nidx = nidx_right;
-
-    if (right_node_max_elements < left_node_max_elements) {
-      build_hist_nidx = nidx_right;
-      subtraction_trick_nidx = nidx_left;
-    }
-
-    // Build histogram for node with the smallest number of training examples
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->BuildHist(build_hist_nidx);
-        });
-
-    this->AllReduceHist(build_hist_nidx);
-
-    // Check whether we can use the subtraction trick to calculate the other
-    bool do_subtraction_trick = true;
-    for (auto& shard : shards_) {
-      do_subtraction_trick &= shard->CanDoSubtractionTrick(
-          nidx_parent, build_hist_nidx, subtraction_trick_nidx);
-    }
-
-    if (do_subtraction_trick) {
-      // Calculate other histogram using subtraction trick
-      dh::ExecuteIndexShards(
-          &shards_,
-          [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-            dh::safe_cuda(cudaSetDevice(shard->device_id));
-            shard->SubtractionTrick(nidx_parent, build_hist_nidx,
-                                    subtraction_trick_nidx);
-          });
-    } else {
-      // Calculate other histogram manually
-      dh::ExecuteIndexShards(
-          &shards_,
-          [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-            dh::safe_cuda(cudaSetDevice(shard->device_id));
-            shard->BuildHist(subtraction_trick_nidx);
-          });
-
-      this->AllReduceHist(subtraction_trick_nidx);
-    }
-  }
-
-  std::vector<DeviceSplitCandidate> EvaluateSplits(std::vector<int> nidx,
-                                                   RegTree* p_tree) {
-    dh::safe_cuda(cudaSetDevice(shards_.front()->device_id));
-    return shards_.front()->EvaluateSplits(nidx, *p_tree, &column_sampler_,
-                                           node_value_constraints_,
-                                           info_->num_col_);
-  }
-
-  void InitRoot(RegTree* p_tree) {
-    constexpr int kRootNIdx = 0;
-    // Sum gradients
-    std::vector<GradientPair> tmp_sums(shards_.size());
-
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id));
-          tmp_sums[i] = dh::SumReduction(
-              shard->temp_memory, shard->gpair.data(), shard->gpair.size());
-        });
-
-    GradientPair sum_gradient =
-        std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());
-
-    rabit::Allreduce<rabit::op::Sum>(
-        reinterpret_cast<GradientPair::ValueT*>(&sum_gradient), 2);
-
-    // Generate root histogram
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->BuildHist(kRootNIdx);
-        });
-
-    this->AllReduceHist(kRootNIdx);
-
-    // Remember root stats
-    p_tree->Stat(kRootNIdx).sum_hess = sum_gradient.GetHess();
-    auto weight = CalcWeight(param_, sum_gradient);
-    p_tree->Stat(kRootNIdx).base_weight = weight;
-    (*p_tree)[kRootNIdx].SetLeaf(param_.learning_rate * weight);
-
-    // Store sum gradients
-    for (auto& shard : shards_) {
-      shard->node_sum_gradients[kRootNIdx] = sum_gradient;
-    }
-
-    // Initialise root constraint
-    node_value_constraints_.resize(p_tree->GetNodes().size());
-
-    // Generate first split
-    auto split = this->EvaluateSplits({ kRootNIdx }, p_tree);
-    qexpand_->push(
-        ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0));
-  }
-
-  void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) {
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->UpdatePosition(candidate.nid,
-                                p_tree->GetNodes()[candidate.nid]);
-        });
-  }
-
-  void FinalisePosition(RegTree* p_tree) {
-    dh::ExecuteIndexShards(
-        &shards_,
-        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          shard->FinalisePosition(p_tree);
-        });
-  }
-
-  void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) {
-    RegTree& tree = *p_tree;
-
-    GradStats left_stats;
-    left_stats.Add(candidate.split.left_sum);
-    GradStats right_stats;
-    right_stats.Add(candidate.split.right_sum);
-    GradStats parent_sum;
-    parent_sum.Add(left_stats);
-    parent_sum.Add(right_stats);
-    node_value_constraints_.resize(tree.GetNodes().size());
-    auto base_weight = node_value_constraints_[candidate.nid].CalcWeight(param_, parent_sum);
-    auto left_weight =
-        node_value_constraints_[candidate.nid].CalcWeight(param_, left_stats)*param_.learning_rate;
-    auto right_weight =
-        node_value_constraints_[candidate.nid].CalcWeight(param_, right_stats)*param_.learning_rate;
-    tree.ExpandNode(candidate.nid, candidate.split.findex,
-                    candidate.split.fvalue, candidate.split.dir == kLeftDir,
-                    base_weight, left_weight, right_weight,
-                    candidate.split.loss_chg, parent_sum.sum_hess);
-    // Set up child constraints
-    node_value_constraints_.resize(tree.GetNodes().size());
-    node_value_constraints_[candidate.nid].SetChild(
-        param_, tree[candidate.nid].SplitIndex(), left_stats, right_stats,
-        &node_value_constraints_[tree[candidate.nid].LeftChild()],
-        &node_value_constraints_[tree[candidate.nid].RightChild()]);
-
-    // Store sum gradients
-    for (auto& shard : shards_) {
-      shard->node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum;
-      shard->node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum;
-    }
-  }
+  // Only call this method for testing
+  void CheckTreesSynchronized(const std::vector<RegTree>& local_trees) const {
+    std::string s_model;
+    common::MemoryBufferStream fs(&s_model);
+    int rank = rabit::GetRank();
+    if (rank == 0) {
+      local_trees.front().Save(&fs);
+    }
+    fs.Seek(0);
+    rabit::Broadcast(&s_model, 0);
+    RegTree reference_tree;
+    reference_tree.Load(&fs);
+    for (const auto& tree : local_trees) {
+      CHECK(tree == reference_tree);
+    }
+  }

   void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
                   RegTree* p_tree) {
-    auto& tree = *p_tree;
-
     monitor_.StartCuda("InitData");
     this->InitData(gpair, p_fmat);
     monitor_.StopCuda("InitData");
-    monitor_.StartCuda("InitRoot");
-    this->InitRoot(p_tree);
-    monitor_.StopCuda("InitRoot");
-
-    auto timestamp = qexpand_->size();
-    auto num_leaves = 1;
-
-    while (!qexpand_->empty()) {
-      ExpandEntry candidate = qexpand_->top();
-      qexpand_->pop();
-      if (!candidate.IsValid(param_, num_leaves)) {
-        continue;
-      }
-
-      this->ApplySplit(candidate, p_tree);
-      num_leaves++;
-
-      int left_child_nidx = tree[candidate.nid].LeftChild();
-      int right_child_nidx = tree[candidate.nid].RightChild();
-      // Only create child entries if needed
-      if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx),
-                                    num_leaves)) {
-        monitor_.StartCuda("UpdatePosition");
-        this->UpdatePosition(candidate, p_tree);
-        monitor_.StopCuda("UpdatePosition");
-
-        monitor_.StartCuda("BuildHist");
-        this->BuildHistLeftRight(candidate.nid, left_child_nidx,
-                                 right_child_nidx);
-        monitor_.StopCuda("BuildHist");
-
-        monitor_.StartCuda("EvaluateSplits");
-        auto splits =
-            this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree);
-        qexpand_->push(ExpandEntry(left_child_nidx,
-                                   tree.GetDepth(left_child_nidx), splits.at(0),
-                                   timestamp++));
-        qexpand_->push(ExpandEntry(right_child_nidx,
-                                   tree.GetDepth(right_child_nidx),
-                                   splits.at(1), timestamp++));
-        monitor_.StopCuda("EvaluateSplits");
-      }
-    }
-
-    monitor_.StartCuda("FinalisePosition");
-    this->FinalisePosition(p_tree);
-    monitor_.StopCuda("FinalisePosition");
+
+    std::vector<RegTree> trees(shards_.size());
+    for (auto& tree : trees) {
+      tree = *p_tree;
+    }
+    gpair->Reshard(dist_);
+
+    // Launch one thread for each device "shard" containing a subset of rows.
+    // Threads will cooperatively build the tree, synchronising over histograms.
+    // Each thread will redundantly build its own copy of the tree
+    dh::ExecuteIndexShards(
+        &shards_,
+        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+          shard->UpdateTree(gpair, p_fmat, &trees.at(idx), &reducer_);
+        });
+
+    // All trees are expected to be identical
+    if (hist_maker_param_.debug_synchronize) {
+      this->CheckTreesSynchronized(trees);
+    }
+
+    // Write the output tree
+    *p_tree = trees.front();
   }

   bool UpdatePredictionCache(
       const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) {
-    monitor_.StartCuda("UpdatePredictionCache");
     if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
       return false;
     }
+    monitor_.StartCuda("UpdatePredictionCache");
     p_out_preds->Reshard(dist_.Devices());
     dh::ExecuteIndexShards(
         &shards_,
@@ -1552,9 +1544,6 @@ class GPUHistMakerSpecialised {
   MetaInfo* info_;  // NOLINT

   std::vector<std::unique_ptr<DeviceShard<GradientSumT>>> shards_;  // NOLINT
-  common::ColumnSampler column_sampler_;  // NOLINT
-
-  std::vector<ValueConstraint> node_value_constraints_;  // NOLINT

  private:
   bool initialised_;
@@ -1565,10 +1554,6 @@ class GPUHistMakerSpecialised {
   GPUHistMakerTrainParam hist_maker_param_;
   common::GHistIndexMatrix gmat_;

-  using ExpandQueue =
-      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
-                          std::function<bool(ExpandEntry, ExpandEntry)>>;
-  std::unique_ptr<ExpandQueue> qexpand_;
   dh::AllReducer reducer_;

   DMatrix* p_last_fmat_;
@@ -1,3 +1,4 @@
+#include <valarray>
 #include "../../../src/common/random.h"
 #include "../helpers.h"
 #include "gtest/gtest.h"
@@ -33,7 +34,8 @@ TEST(ColumnSampler, Test) {

   // No level or node sampling, should be the same at different depth
   cs.Init(n, 1.0f, 1.0f, 0.5f);
-  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector());
+  ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
+            cs.GetFeatureSet(1)->HostVector());

   cs.Init(n, 1.0f, 1.0f, 1.0f);
   auto set5 = *cs.GetFeatureSet(0);
@@ -45,7 +47,34 @@ TEST(ColumnSampler, Test) {
   // Should always be a minimum of one feature
   cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
   ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
+}
+
+// Test if different threads using the same seed produce the same result
+TEST(ColumnSampler, ThreadSynchronisation) {
+  const int64_t num_threads = 100;
+  int n = 128;
+  int iterations = 10;
+  int levels = 5;
+  std::vector<int> reference_result;
+  bool success =
+      true;  // Cannot use google test asserts in multithreaded region
+#pragma omp parallel num_threads(num_threads)
+  {
+    for (auto j = 0ull; j < iterations; j++) {
+      ColumnSampler cs(j);
+      cs.Init(n, 0.5f, 0.5f, 0.5f);
+      for (auto level = 0ull; level < levels; level++) {
+        auto result = cs.GetFeatureSet(level)->ConstHostVector();
+#pragma omp single
+        { reference_result = result; }
+        if (result != reference_result) {
+          success = false;
+        }
+#pragma omp barrier
+      }
+    }
+  }
+  ASSERT_TRUE(success);
 }
 }  // namespace common
 }  // namespace xgboost
@@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) {
   param.n_gpus = 1;
   param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
   BuildGidx(&shard, kNRows, kNCols);

   std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) {
   param.n_gpus = 1;
   param.max_leaves = 0;

-  DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
+  DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols);
   BuildGidx(&shard, kNRows, kNCols, 0.9f);

   std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
@@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
   param.n_gpus = 1;
   param.max_leaves = 0;

-  DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
+  DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols);

   BuildGidx(&shard, kNRows, kNCols);

@@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) {
   int max_bins = 4;

   // Initialize DeviceShard
-  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
-      new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
+  std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols)};
   // Initialize DeviceShard::node_sum_gradients
   shard->node_sum_gradients = {{6.4f, 12.8f}};

@@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) {
   thrust::copy(hist.begin(), hist.end(),
                shard->hist.Data().begin());

-  // Initialize GPUHistMaker
-  GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
-      GPUHistMakerSpecialised<GradientPairPrecise>();
-  hist_maker.param_ = param;
-  hist_maker.shards_.push_back(std::move(shard));
-  hist_maker.column_sampler_.Init(kNCols,
+  shard->column_sampler.Init(kNCols,
                              param.colsample_bynode,
                              param.colsample_bylevel,
                              param.colsample_bytree,
@@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) {
   info.num_row_ = kNRows;
   info.num_col_ = kNCols;

-  hist_maker.info_ = &info;
-  hist_maker.node_value_constraints_.resize(1);
-  hist_maker.node_value_constraints_[0].lower_bound = -1.0;
-  hist_maker.node_value_constraints_[0].upper_bound = 1.0;
+  shard->node_value_constraints.resize(1);
+  shard->node_value_constraints[0].lower_bound = -1.0;
+  shard->node_value_constraints[0].upper_bound = 1.0;

   std::vector<DeviceSplitCandidate> res =
-      hist_maker.EvaluateSplits({ 0,0 }, &tree);
+      shard->EvaluateSplits({ 0,0 }, tree, kNCols);

   ASSERT_EQ(res[0].findex, 7);
   ASSERT_EQ(res[1].findex, 7);
@@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) {
   }

   hist_maker.shards_.resize(1);
-  hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
+  hist_maker.shards_[0].reset(
+      new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols));

   auto& shard = hist_maker.shards_.at(0);
   shard->ridx_segments.resize(3);  // 3 nodes.
@@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) {
   shard->gidx_buffer.data(), num_symbols);

   hist_maker.info_ = &info;
-  hist_maker.ApplySplit(candidate_entry, &tree);
-  hist_maker.UpdatePosition(candidate_entry, &tree);
+  shard->ApplySplit(candidate_entry, &tree);
+  shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]);

   ASSERT_FALSE(tree[kNId].IsLeaf());

@@ -54,7 +54,8 @@ base_params = {
     'max_depth': 2,
     'eta': 1,
     'verbosity': 0,
-    'objective': 'binary:logistic'
+    'objective': 'binary:logistic',
+    'debug_synchronize': True
 }

@@ -51,7 +51,7 @@ class TestGPU(unittest.TestCase):
         variable_param = {'n_gpus': [-1], 'max_depth': [2, 10],
                           'max_leaves': [255, 4],
                           'max_bin': [2, 256],
-                          'grow_policy': ['lossguide']}
+                          'grow_policy': ['lossguide'], 'debug_synchronize': [True]}
         for param in parameter_combinations(variable_param):
             param['tree_method'] = 'gpu_hist'
             gpu_results = run_suite(param, select_datasets=datasets)