diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 69515e943..37f61dfae 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -62,6 +62,13 @@ struct TreeParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0) .describe("Size of leaf vector, reserved for vector tree"); } + + bool operator==(const TreeParam& b) const { + return num_roots == b.num_roots && num_nodes == b.num_nodes && + num_deleted == b.num_deleted && max_depth == b.max_depth && + num_feature == b.num_feature && + size_leaf_vector == b.size_leaf_vector; + } }; /*! \brief node statistics used in regression tree */ @@ -74,6 +81,10 @@ struct RTreeNodeStat { bst_float base_weight; /*! \brief number of child that is leaf node known up to now */ int leaf_child_cnt; + bool operator==(const RTreeNodeStat& b) const { + return loss_chg == b.loss_chg && sum_hess == b.sum_hess && + base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt; + } }; /*! @@ -188,6 +199,11 @@ class RegTree { if (is_left_child) pidx |= (1U << 31); this->parent_ = pidx; } + bool operator==(const Node& b) const { + return parent_ == b.parent_ && cleft_ == b.cleft_ && + cright_ == b.cright_ && sindex_ == b.sindex_ && + info_.leaf_value == b.info_.leaf_value; + } private: /*! @@ -304,6 +320,11 @@ class RegTree { fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); } + bool operator==(const RegTree& b) const { + return nodes_ == b.nodes_ && stats_ == b.stats_ && + deleted_nodes_ == b.deleted_nodes_ && param == b.param; + } + /** * \brief Expands a leaf node into two additional leaf nodes. * diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 89c2d2f57..cacf3582f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,12 @@ if (USE_CUDA) target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1) endif (USE_NVTX) + # OpenMP is mandatory for cuda version + find_package(OpenMP REQUIRED) + target_compile_options(objxgboost PRIVATE + $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> + ) + set_target_properties(objxgboost PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) else (USE_CUDA) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index fdcc7b41b..33d74b5b1 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -12,6 +12,7 @@ #include "span.h" #include +#include #include #include #include @@ -752,6 +753,29 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) { }); } +class SaveCudaContext { + private: + int saved_device_; + + public: + template + explicit SaveCudaContext (Functor func) : saved_device_{-1} { + // When compiled with CUDA but running on CPU only device, + // cudaGetDevice will fail. + try { + safe_cuda(cudaGetDevice(&saved_device_)); + } catch (const dmlc::Error &except) { + saved_device_ = -1; + } + func(); + } + ~SaveCudaContext() { + if (saved_device_ != -1) { + safe_cuda(cudaSetDevice(saved_device_)); + } + } +}; + /** * \class AllReducer * @@ -777,8 +801,18 @@ class AllReducer { allreduce_calls_(0) {} /** - * \fn void Init(const std::vector &device_ordinals) - * + * \brief If we are using a single GPU only + */ + bool IsSingleGPU() { +#ifdef XGBOOST_USE_NCCL + CHECK(device_counts.size() > 0) << "AllReducer not initialised."; + return device_counts.size() <= 1 && device_counts.at(0) == 1; +#else + return true; +#endif + } + + /** * \brief Initialise with the desired device ordinals for this communication * group. 
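The new SaveCudaContext helper above is an RAII guard: its constructor records the caller's current device (falling back to -1 on a CPU-only machine, where cudaGetDevice throws), immediately runs the supplied functor, and its destructor restores the saved device. A minimal usage sketch, not part of the patch; the shards container and DoWork() call are hypothetical and only illustrate that code inside the functor may switch devices freely:

    dh::SaveCudaContext{[&]() {
      for (auto& shard : shards) {
        dh::safe_cuda(cudaSetDevice(shard->device_id));  // switch per shard
        shard->DoWork();                                 // hypothetical per-device work
      }
    }};
    // The device that was active before the statement is selected again here,
    // restored by the guard's destructor.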
* @@ -956,6 +990,22 @@ class AllReducer { #endif }; + /** + * \brief Synchronizes the device + * + * \param device_id Identifier for the device. + */ + void Synchronize(int device_id) { +#ifdef XGBOOST_USE_NCCL + SaveCudaContext([&]() { + dh::safe_cuda(cudaSetDevice(device_id)); + int idx = std::find(device_ordinals.begin(), device_ordinals.end(), device_id) - device_ordinals.begin(); + CHECK(idx < device_ordinals.size()); + dh::safe_cuda(cudaStreamSynchronize(streams[idx])); + }); +#endif + }; + #ifdef XGBOOST_USE_NCCL /** * \fn ncclUniqueId GetUniqueId() @@ -980,29 +1030,6 @@ class AllReducer { #endif }; -class SaveCudaContext { - private: - int saved_device_; - - public: - template - explicit SaveCudaContext (Functor func) : saved_device_{-1} { - // When compiled with CUDA but running on CPU only device, - // cudaGetDevice will fail. - try { - safe_cuda(cudaGetDevice(&saved_device_)); - } catch (const dmlc::Error &except) { - saved_device_ = -1; - } - func(); - } - ~SaveCudaContext() { - if (saved_device_ != -1) { - safe_cuda(cudaSetDevice(saved_device_)); - } - } -}; - /** * \brief Executes some operation on each element of the input vector, using a * single controlling thread for each element. In addition, passes the shard index @@ -1017,11 +1044,15 @@ class SaveCudaContext { template void ExecuteIndexShards(std::vector *shards, FunctionT f) { SaveCudaContext{[&]() { + // Temporarily turn off dynamic so we have a guaranteed number of threads + bool dynamic = omp_get_dynamic(); + omp_set_dynamic(false); const long shards_size = static_cast(shards->size()); #pragma omp parallel for schedule(static, 1) if (shards_size > 1) for (long shard = 0; shard < shards_size; ++shard) { f(shard, shards->at(shard)); } + omp_set_dynamic(dynamic); }}; } diff --git a/src/common/random.h b/src/common/random.h index 5e28e8878..d81d76a82 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -115,8 +115,16 @@ class ColumnSampler { public: /** * \brief Column sampler constructor. - * \note This constructor synchronizes the RNG seed across processes. + * \note This constructor manually sets the rng seed */ + explicit ColumnSampler(uint32_t seed) { + rng_.seed(seed); + } + + /** + * \brief Column sampler constructor. + * \note This constructor synchronizes the RNG seed across processes. 
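The explicit-seed ColumnSampler constructor exists so that samplers living on different threads or workers can be made to produce identical feature subsets. A short fragment mirroring the wiring this patch adds later in updater_gpu_hist.cu (num_columns and param are stand-ins taken from that context):

    uint32_t column_sampling_seed = common::GlobalRandom()();
    rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
    // Every worker/shard constructs its own sampler from the same broadcast
    // seed, so colsample_by* draws agree everywhere without sharing a single
    // sampler object between threads.
    common::ColumnSampler column_sampler(column_sampling_seed);
    column_sampler.Init(num_columns, param.colsample_bynode,
                        param.colsample_bylevel, param.colsample_bytree);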
+ */ ColumnSampler() { uint32_t seed = common::GlobalRandom()(); rabit::Broadcast(&seed, sizeof(seed), 0); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 39b5c87cf..579bf5844 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -342,7 +342,8 @@ class GPUPredictor : public xgboost::Predictor { } public: - GPUPredictor() : cpu_predictor_(Predictor::Create("cpu_predictor")) {} + GPUPredictor() // NOLINT + : cpu_predictor_(Predictor::Create("cpu_predictor")) {} // NOLINT void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, const gbm::GBTreeModel& model, int tree_begin, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 4a27058fc..9d74abde2 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,6 +38,7 @@ struct GPUHistMakerTrainParam bool single_precision_histogram; // number of rows in a single GPU batch int gpu_batch_nrows; + bool debug_synchronize; // declare parameters DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe( @@ -47,6 +48,8 @@ struct GPUHistMakerTrainParam .set_default(0) .describe("Number of rows in a GPU batch, used for finding quantiles on GPU; " "-1 to use all rows assignted to a GPU, and 0 to auto-deduce"); + DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe( + "Check if all distributed tree are identical after tree construction."); } }; #if !defined(GTEST_TEST) @@ -598,12 +601,23 @@ inline void SortPosition(dh::CubMemory* temp_memory, common::Span position, } /*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) { +__forceinline__ __device__ void CountLeft(int64_t* d_count, int val, + int left_nidx) { +#if __CUDACC_VER_MAJOR__ > 8 + int mask = __activemask(); + unsigned ballot = __ballot_sync(mask, val == left_nidx); + int leader = __ffs(mask) - 1; + if (threadIdx.x % 32 == leader) { + atomicAdd(reinterpret_cast(d_count), // NOLINT + static_cast(__popc(ballot))); // NOLINT + } +#else unsigned ballot = __ballot(val == left_nidx); if (threadIdx.x % 32 == 0) { atomicAdd(reinterpret_cast(d_count), // NOLINT static_cast(__popc(ballot))); // NOLINT } +#endif } template @@ -621,6 +635,7 @@ template struct DeviceShard { int n_bins; int device_id; + int shard_idx; // Position in the local array of shards dh::BulkAllocator ba; @@ -670,18 +685,31 @@ struct DeviceShard { std::vector streams; + common::Monitor monitor; + std::vector node_value_constraints; + common::ColumnSampler column_sampler; + std::unique_ptr> hist_builder; + using ExpandQueue = + std::priority_queue, + std::function>; + std::unique_ptr qexpand; + // TODO(canonizer): do add support multi-batch DMatrix here - DeviceShard(int _device_id, bst_uint row_begin, bst_uint row_end, - TrainParam _param) + DeviceShard(int _device_id, int shard_idx, bst_uint row_begin, + bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed) : device_id(_device_id), + shard_idx(shard_idx), row_begin_idx(row_begin), row_end_idx(row_end), n_rows(row_end - row_begin), n_bins(0), param(std::move(_param)), - prediction_cache_initialised(false) {} + prediction_cache_initialised(false), + column_sampler(column_sampler_seed) { + monitor.Init(std::string("DeviceShard") + std::to_string(device_id)); + } /* Init row_ptrs and row_stride */ size_t InitRowPtrs(const SparsePage& row_batch) { @@ -736,7 +764,16 @@ struct DeviceShard { } // 
Reset values for each update iteration - void Reset(HostDeviceVector* dh_gpair) { + // Note that the column sampler must be passed by value because it is not + // thread safe + void Reset(HostDeviceVector* dh_gpair, int64_t num_columns) { + if (param.grow_policy == TrainParam::kLossGuide) { + qexpand.reset(new ExpandQueue(LossGuide)); + } else { + qexpand.reset(new ExpandQueue(DepthWise)); + } + this->column_sampler.Init(num_columns, param.colsample_bynode, + param.colsample_bylevel, param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id)); thrust::fill( thrust::device_pointer_cast(position.Current()), @@ -764,8 +801,6 @@ struct DeviceShard { std::vector EvaluateSplits( std::vector nidxs, const RegTree& tree, - common::ColumnSampler* column_sampler, - const std::vector& value_constraints, size_t num_columns) { dh::safe_cuda(cudaSetDevice(device_id)); auto result = pinned_memory.GetSpan(nidxs.size()); @@ -800,7 +835,7 @@ struct DeviceShard { auto& streams = this->GetStreams(nidxs.size()); for (auto i = 0ull; i < nidxs.size(); i++) { auto nidx = nidxs[i]; - auto p_feature_set = column_sampler->GetFeatureSet(tree.GetDepth(nidx)); + auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx)); p_feature_set->Reshard(GPUSet(device_id, 1)); auto d_feature_set = p_feature_set->DeviceSpan(device_id); auto d_split_candidates = @@ -812,7 +847,7 @@ struct DeviceShard { EvaluateSplitKernel <<>>( hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix, - gpu_param, d_split_candidates, value_constraints[nidx], + gpu_param, d_split_candidates, node_value_constraints[nidx], monotone_constraints); // Reduce over features to find best feature @@ -997,6 +1032,179 @@ struct DeviceShard { out_preds_d, prediction_cache.data(), prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault)); } + + void AllReduceHist(int nidx, dh::AllReducer* reducer) { + monitor.StartCuda("AllReduce"); + auto d_node_hist = hist.GetNodeHistogram(nidx).data(); + reducer->AllReduceSum( + shard_idx, + reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + ellpack_matrix.BinCount() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->Synchronize(device_id); + + monitor.StopCuda("AllReduce"); + } + + /** + * \brief Build GPU local histograms for the left and right child of some parent node + */ + void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right, dh::AllReducer* reducer) { + auto build_hist_nidx = nidx_left; + auto subtraction_trick_nidx = nidx_right; + + // If we are using a single GPU, build the histogram for the node with the + // fewest training instances + // If we are distributed, don't bother + if (reducer->IsSingleGPU()) { + bool fewer_right = + ridx_segments[nidx_right].Size() < ridx_segments[nidx_left].Size(); + if (fewer_right) { + std::swap(build_hist_nidx, subtraction_trick_nidx); + } + } + + this->BuildHist(build_hist_nidx); + this->AllReduceHist(build_hist_nidx, reducer); + + // Check whether we can use the subtraction trick to calculate the other + bool do_subtraction_trick = this->CanDoSubtractionTrick( + nidx_parent, build_hist_nidx, subtraction_trick_nidx); + + if (do_subtraction_trick) { + // Calculate other histogram using subtraction trick + this->SubtractionTrick(nidx_parent, build_hist_nidx, + subtraction_trick_nidx); + } else { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer); + } + } + + void ApplySplit(const ExpandEntry& candidate, 
RegTree* p_tree) { + RegTree& tree = *p_tree; + + GradStats left_stats; + left_stats.Add(candidate.split.left_sum); + GradStats right_stats; + right_stats.Add(candidate.split.right_sum); + GradStats parent_sum; + parent_sum.Add(left_stats); + parent_sum.Add(right_stats); + node_value_constraints.resize(tree.GetNodes().size()); + auto base_weight = node_value_constraints[candidate.nid].CalcWeight(param, parent_sum); + auto left_weight = + node_value_constraints[candidate.nid].CalcWeight(param, left_stats)*param.learning_rate; + auto right_weight = + node_value_constraints[candidate.nid].CalcWeight(param, right_stats)*param.learning_rate; + tree.ExpandNode(candidate.nid, candidate.split.findex, + candidate.split.fvalue, candidate.split.dir == kLeftDir, + base_weight, left_weight, right_weight, + candidate.split.loss_chg, parent_sum.sum_hess); + // Set up child constraints + node_value_constraints.resize(tree.GetNodes().size()); + node_value_constraints[candidate.nid].SetChild( + param, tree[candidate.nid].SplitIndex(), left_stats, right_stats, + &node_value_constraints[tree[candidate.nid].LeftChild()], + &node_value_constraints[tree[candidate.nid].RightChild()]); + node_sum_gradients[tree[candidate.nid].LeftChild()] = + candidate.split.left_sum; + node_sum_gradients[tree[candidate.nid].RightChild()] = + candidate.split.right_sum; + } + + void InitRoot(RegTree* p_tree, HostDeviceVector* gpair_all, + dh::AllReducer* reducer, int64_t num_columns) { + constexpr int kRootNIdx = 0; + + const auto &gpair = gpair_all->DeviceSpan(device_id); + + dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, + gpair.size()); + reducer->AllReduceSum( + shard_idx, reinterpret_cast(node_sum_gradients_d.data()), + reinterpret_cast(node_sum_gradients_d.data()), 2); + reducer->Synchronize(device_id); + dh::safe_cuda(cudaMemcpy(node_sum_gradients.data(), + node_sum_gradients_d.data(), sizeof(GradientPair), + cudaMemcpyDeviceToHost)); + + this->BuildHist(kRootNIdx); + this->AllReduceHist(kRootNIdx, reducer); + + // Remember root stats + p_tree->Stat(kRootNIdx).sum_hess = node_sum_gradients[kRootNIdx].GetHess(); + auto weight = CalcWeight(param, node_sum_gradients[kRootNIdx]); + p_tree->Stat(kRootNIdx).base_weight = weight; + (*p_tree)[kRootNIdx].SetLeaf(param.learning_rate * weight); + + // Initialise root constraint + node_value_constraints.resize(p_tree->GetNodes().size()); + + // Generate first split + auto split = this->EvaluateSplits({kRootNIdx}, *p_tree, num_columns); + qexpand->push( + ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0)); + } + + void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, + RegTree* p_tree, dh::AllReducer* reducer) { + auto& tree = *p_tree; + monitor.StartCuda("Reset"); + this->Reset(gpair_all, p_fmat->Info().num_col_); + monitor.StopCuda("Reset"); + + monitor.StartCuda("InitRoot"); + this->InitRoot(p_tree, gpair_all, reducer, p_fmat->Info().num_col_); + monitor.StopCuda("InitRoot"); + + auto timestamp = qexpand->size(); + auto num_leaves = 1; + + while (!qexpand->empty()) { + ExpandEntry candidate = qexpand->top(); + qexpand->pop(); + if (!candidate.IsValid(param, num_leaves)) { + continue; + } + + this->ApplySplit(candidate, p_tree); + + num_leaves++; + + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); + // Only create child entries if needed + if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), + num_leaves)) { + monitor.StartCuda("UpdatePosition"); + 
this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]); + monitor.StopCuda("UpdatePosition"); + + monitor.StartCuda("BuildHist"); + this->BuildHistLeftRight(candidate.nid, left_child_nidx, right_child_nidx, reducer); + monitor.StopCuda("BuildHist"); + + monitor.StartCuda("EvaluateSplits"); + auto splits = this->EvaluateSplits({left_child_nidx, right_child_nidx}, + *p_tree, p_fmat->Info().num_col_); + monitor.StopCuda("EvaluateSplits"); + + qexpand->push(ExpandEntry(left_child_nidx, + tree.GetDepth(left_child_nidx), splits.at(0), + timestamp++)); + qexpand->push(ExpandEntry(right_child_nidx, + tree.GetDepth(right_child_nidx), + splits.at(1), timestamp++)); + } + } + + monitor.StartCuda("FinalisePosition"); + this->FinalisePosition(p_tree); + monitor.StopCuda("FinalisePosition"); + } }; template @@ -1179,12 +1387,6 @@ class GPUHistMakerSpecialised{ dh::CheckComputeCapability(); - if (param_.grow_policy == TrainParam::kLossGuide) { - qexpand_.reset(new ExpandQueue(LossGuide)); - } else { - qexpand_.reset(new ExpandQueue(DepthWise)); - } - monitor_.Init("updater_gpu_hist"); } @@ -1223,17 +1425,23 @@ class GPUHistMakerSpecialised{ auto batch_iter = dmat->GetRowBatches().begin(); const SparsePage& batch = *batch_iter; + + // Synchronise the column sampling seed + uint32_t column_sampling_seed = common::GlobalRandom()(); + rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); + // Create device shards shards_.resize(n_devices); dh::ExecuteIndexShards( &shards_, - [&](int i, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(i))); - size_t start = dist_.ShardStart(info_->num_row_, i); - size_t size = dist_.ShardSize(info_->num_row_, i); + [&](int idx, std::unique_ptr>& shard) { + dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx))); + size_t start = dist_.ShardStart(info_->num_row_, idx); + size_t size = dist_.ShardSize(info_->num_row_, idx); shard = std::unique_ptr>( - new DeviceShard(dist_.Devices().DeviceId(i), start, - start + size, param_)); + new DeviceShard(dist_.Devices().DeviceId(idx), idx, + start, start + size, param_, + column_sampling_seed)); }); // Find the cuts. 
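Each device shard is created inside dh::ExecuteIndexShards, which (with the omp_set_dynamic change above) pins exactly one OpenMP thread to each shard via schedule(static, 1). That guarantee matters later, when the per-shard UpdateTree bodies enter collective AllReduceSum calls and every shard must participate. A condensed sketch of the pattern, with Shard and device_ordinals as hypothetical stand-ins:

    std::vector<std::unique_ptr<Shard>> shards(device_ordinals.size());
    dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<Shard>& shard) {
      dh::safe_cuda(cudaSetDevice(device_ordinals[idx]));  // one thread bound per shard
      shard.reset(new Shard(device_ordinals[idx]));
    });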
@@ -1264,277 +1472,61 @@ class GPUHistMakerSpecialised{ this->InitDataOnce(dmat); monitor_.StopCuda("InitDataOnce"); } - - column_sampler_.Init(info_->num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); - - // Copy gpair & reset memory - monitor_.StartCuda("InitDataReset"); - - gpair->Reshard(dist_); - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->Reset(gpair); - }); - monitor_.StopCuda("InitDataReset"); } - void AllReduceHist(int nidx) { - if (shards_.size() == 1 && !rabit::IsDistributed()) { - return; + // Only call this method for testing + void CheckTreesSynchronized(const std::vector& local_trees) const { + std::string s_model; + common::MemoryBufferStream fs(&s_model); + int rank = rabit::GetRank(); + if (rank == 0) { + local_trees.front().Save(&fs); } - monitor_.StartCuda("AllReduce"); - - reducer_.GroupStart(); - for (auto& shard : shards_) { - auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); - reducer_.AllReduceSum( - dist_.Devices().Index(shard->device_id), - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); - } - reducer_.GroupEnd(); - reducer_.Synchronize(); - - monitor_.StopCuda("AllReduce"); - } - - /** - * \brief Build GPU local histograms for the left and right child of some parent node - */ - void BuildHistLeftRight(int nidx_parent, int nidx_left, int nidx_right) { - size_t left_node_max_elements = 0; - size_t right_node_max_elements = 0; - for (auto& shard : shards_) { - left_node_max_elements = (std::max)( - left_node_max_elements, shard->ridx_segments[nidx_left].Size()); - right_node_max_elements = (std::max)( - right_node_max_elements, shard->ridx_segments[nidx_right].Size()); - } - - rabit::Allreduce(&left_node_max_elements, 1); - rabit::Allreduce(&right_node_max_elements, 1); - - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - if (right_node_max_elements < left_node_max_elements) { - build_hist_nidx = nidx_right; - subtraction_trick_nidx = nidx_left; - } - - // Build histogram for node with the smallest number of training examples - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(build_hist_nidx); - }); - - this->AllReduceHist(build_hist_nidx); - - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = true; - for (auto& shard : shards_) { - do_subtraction_trick &= shard->CanDoSubtractionTrick( - nidx_parent, build_hist_nidx, subtraction_trick_nidx); - } - - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->SubtractionTrick(nidx_parent, build_hist_nidx, - subtraction_trick_nidx); - }); - } else { - // Calculate other histogram manually - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(subtraction_trick_nidx); - }); - - this->AllReduceHist(subtraction_trick_nidx); - } - } - - std::vector EvaluateSplits(std::vector nidx, - RegTree* p_tree) { - dh::safe_cuda(cudaSetDevice(shards_.front()->device_id)); - return shards_.front()->EvaluateSplits(nidx, *p_tree, &column_sampler_, - 
node_value_constraints_, - info_->num_col_); - } - - void InitRoot(RegTree* p_tree) { - constexpr int kRootNIdx = 0; - // Sum gradients - std::vector tmp_sums(shards_.size()); - - dh::ExecuteIndexShards( - &shards_, - [&](int i, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - tmp_sums[i] = dh::SumReduction( - shard->temp_memory, shard->gpair.data(), shard->gpair.size()); - }); - - GradientPair sum_gradient = - std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair()); - - rabit::Allreduce( - reinterpret_cast(&sum_gradient), 2); - - // Generate root histogram - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->BuildHist(kRootNIdx); - }); - - this->AllReduceHist(kRootNIdx); - - // Remember root stats - p_tree->Stat(kRootNIdx).sum_hess = sum_gradient.GetHess(); - auto weight = CalcWeight(param_, sum_gradient); - p_tree->Stat(kRootNIdx).base_weight = weight; - (*p_tree)[kRootNIdx].SetLeaf(param_.learning_rate * weight); - - // Store sum gradients - for (auto& shard : shards_) { - shard->node_sum_gradients[kRootNIdx] = sum_gradient; - } - - // Initialise root constraint - node_value_constraints_.resize(p_tree->GetNodes().size()); - - // Generate first split - auto split = this->EvaluateSplits({ kRootNIdx }, p_tree); - qexpand_->push( - ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split.at(0), 0)); - } - - void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) { - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id)); - shard->UpdatePosition(candidate.nid, - p_tree->GetNodes()[candidate.nid]); - }); - } - void FinalisePosition(RegTree* p_tree) { - dh::ExecuteIndexShards( - &shards_, - [&](int idx, std::unique_ptr>& shard) { - shard->FinalisePosition(p_tree); - }); - } - - void ApplySplit(const ExpandEntry& candidate, RegTree* p_tree) { - RegTree& tree = *p_tree; - - GradStats left_stats; - left_stats.Add(candidate.split.left_sum); - GradStats right_stats; - right_stats.Add(candidate.split.right_sum); - GradStats parent_sum; - parent_sum.Add(left_stats); - parent_sum.Add(right_stats); - node_value_constraints_.resize(tree.GetNodes().size()); - auto base_weight = node_value_constraints_[candidate.nid].CalcWeight(param_, parent_sum); - auto left_weight = - node_value_constraints_[candidate.nid].CalcWeight(param_, left_stats)*param_.learning_rate; - auto right_weight = - node_value_constraints_[candidate.nid].CalcWeight(param_, right_stats)*param_.learning_rate; - tree.ExpandNode(candidate.nid, candidate.split.findex, - candidate.split.fvalue, candidate.split.dir == kLeftDir, - base_weight, left_weight, right_weight, - candidate.split.loss_chg, parent_sum.sum_hess); - // Set up child constraints - node_value_constraints_.resize(tree.GetNodes().size()); - node_value_constraints_[candidate.nid].SetChild( - param_, tree[candidate.nid].SplitIndex(), left_stats, right_stats, - &node_value_constraints_[tree[candidate.nid].LeftChild()], - &node_value_constraints_[tree[candidate.nid].RightChild()]); - - // Store sum gradients - for (auto& shard : shards_) { - shard->node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum; - shard->node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum; + fs.Seek(0); + rabit::Broadcast(&s_model, 0); + RegTree reference_tree; + reference_tree.Load(&fs); + for (const auto& tree : local_trees) { + CHECK(tree == 
reference_tree); } } void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree) { - auto& tree = *p_tree; - monitor_.StartCuda("InitData"); this->InitData(gpair, p_fmat); monitor_.StopCuda("InitData"); - monitor_.StartCuda("InitRoot"); - this->InitRoot(p_tree); - monitor_.StopCuda("InitRoot"); - auto timestamp = qexpand_->size(); - auto num_leaves = 1; + std::vector trees(shards_.size()); + for (auto& tree : trees) { + tree = *p_tree; + } + gpair->Reshard(dist_); - while (!qexpand_->empty()) { - ExpandEntry candidate = qexpand_->top(); - qexpand_->pop(); - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } + // Launch one thread for each device "shard" containing a subset of rows. + // Threads will cooperatively build the tree, synchronising over histograms. + // Each thread will redundantly build its own copy of the tree + dh::ExecuteIndexShards( + &shards_, + [&](int idx, std::unique_ptr>& shard) { + shard->UpdateTree(gpair, p_fmat, &trees.at(idx), &reducer_); + }); - this->ApplySplit(candidate, p_tree); - num_leaves++; - - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed - if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx), - num_leaves)) { - monitor_.StartCuda("UpdatePosition"); - this->UpdatePosition(candidate, p_tree); - monitor_.StopCuda("UpdatePosition"); - - monitor_.StartCuda("BuildHist"); - this->BuildHistLeftRight(candidate.nid, left_child_nidx, - right_child_nidx); - monitor_.StopCuda("BuildHist"); - - monitor_.StartCuda("EvaluateSplits"); - auto splits = - this->EvaluateSplits({left_child_nidx, right_child_nidx}, p_tree); - qexpand_->push(ExpandEntry(left_child_nidx, - tree.GetDepth(left_child_nidx), splits.at(0), - timestamp++)); - qexpand_->push(ExpandEntry(right_child_nidx, - tree.GetDepth(right_child_nidx), - splits.at(1), timestamp++)); - monitor_.StopCuda("EvaluateSplits"); - } + // All trees are expected to be identical + if (hist_maker_param_.debug_synchronize) { + this->CheckTreesSynchronized(trees); } - monitor_.StartCuda("FinalisePosition"); - this->FinalisePosition(p_tree); - monitor_.StopCuda("FinalisePosition"); + // Write the output tree + *p_tree = trees.front(); } bool UpdatePredictionCache( const DMatrix* data, HostDeviceVector* p_out_preds) { - monitor_.StartCuda("UpdatePredictionCache"); if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } + monitor_.StartCuda("UpdatePredictionCache"); p_out_preds->Reshard(dist_.Devices()); dh::ExecuteIndexShards( &shards_, @@ -1552,9 +1544,6 @@ class GPUHistMakerSpecialised{ MetaInfo* info_; // NOLINT std::vector>> shards_; // NOLINT - common::ColumnSampler column_sampler_; // NOLINT - - std::vector node_value_constraints_; // NOLINT private: bool initialised_; @@ -1565,10 +1554,6 @@ class GPUHistMakerSpecialised{ GPUHistMakerTrainParam hist_maker_param_; common::GHistIndexMatrix gmat_; - using ExpandQueue = - std::priority_queue, - std::function>; - std::unique_ptr qexpand_; dh::AllReducer reducer_; DMatrix* p_last_fmat_; diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index 702f29907..e408f11bc 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -1,3 +1,4 @@ +#include #include "../../../src/common/random.h" #include "../helpers.h" #include "gtest/gtest.h" @@ -33,7 +34,8 @@ TEST(ColumnSampler, Test) { // No level or node sampling, should be the same at different depth 
cs.Init(n, 1.0f, 1.0f, 0.5f); - ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); + ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), + cs.GetFeatureSet(1)->HostVector()); cs.Init(n, 1.0f, 1.0f, 1.0f); auto set5 = *cs.GetFeatureSet(0); @@ -45,7 +47,34 @@ TEST(ColumnSampler, Test) { // Should always be a minimum of one feature cs.Init(n, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); +} +// Test if different threads using the same seed produce the same result +TEST(ColumnSampler, ThreadSynchronisation) { + const int64_t num_threads = 100; + int n = 128; + int iterations = 10; + int levels = 5; + std::vector reference_result; + bool success = + true; // Cannot use google test asserts in multithreaded region +#pragma omp parallel num_threads(num_threads) + { + for (auto j = 0ull; j < iterations; j++) { + ColumnSampler cs(j); + cs.Init(n, 0.5f, 0.5f, 0.5f); + for (auto level = 0ull; level < levels; level++) { + auto result = cs.GetFeatureSet(level)->ConstHostVector(); +#pragma omp single + { reference_result = result; } + if (result != reference_result) { + success = false; + } +#pragma omp barrier + } + } + } + ASSERT_TRUE(success); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 37cce8b57..897b9d771 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -89,7 +89,7 @@ TEST(GpuHist, BuildGidxDense) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -128,7 +128,7 @@ TEST(GpuHist, BuildGidxSparse) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer(shard.gidx_buffer.size()); @@ -172,7 +172,7 @@ void TestBuildHist(GPUHistBuilderBase& builder) { param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, kNRows, param); + DeviceShard shard(0, 0, 0, kNRows, param, kNCols); BuildGidx(&shard, kNRows, kNCols); @@ -282,8 +282,8 @@ TEST(GpuHist, EvaluateSplits) { int max_bins = 4; // Initialize DeviceShard - std::unique_ptr> shard { - new DeviceShard(0, 0, kNRows, param)}; + std::unique_ptr> shard{ + new DeviceShard(0, 0, 0, kNRows, param, kNCols)}; // Initialize DeviceShard::node_sum_gradients shard->node_sum_gradients = {{6.4f, 12.8f}}; @@ -321,12 +321,7 @@ TEST(GpuHist, EvaluateSplits) { thrust::copy(hist.begin(), hist.end(), shard->hist.Data().begin()); - // Initialize GPUHistMaker - GPUHistMakerSpecialised hist_maker = - GPUHistMakerSpecialised(); - hist_maker.param_ = param; - hist_maker.shards_.push_back(std::move(shard)); - hist_maker.column_sampler_.Init(kNCols, + shard->column_sampler.Init(kNCols, param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree, @@ -337,13 +332,12 @@ TEST(GpuHist, EvaluateSplits) { info.num_row_ = kNRows; info.num_col_ = kNCols; - hist_maker.info_ = &info; - hist_maker.node_value_constraints_.resize(1); - hist_maker.node_value_constraints_[0].lower_bound = -1.0; - hist_maker.node_value_constraints_[0].upper_bound = 1.0; + shard->node_value_constraints.resize(1); + shard->node_value_constraints[0].lower_bound = -1.0; + shard->node_value_constraints[0].upper_bound = 1.0; std::vector res = - hist_maker.EvaluateSplits({ 0,0 }, &tree); + shard->EvaluateSplits({ 0,0 }, 
tree, kNCols); ASSERT_EQ(res[0].findex, 7); ASSERT_EQ(res[1].findex, 7); @@ -368,7 +362,8 @@ TEST(GpuHist, ApplySplit) { } hist_maker.shards_.resize(1); - hist_maker.shards_[0].reset(new DeviceShard(0, 0, kNRows, param)); + hist_maker.shards_[0].reset( + new DeviceShard(0, 0, 0, kNRows, param, kNCols)); auto& shard = hist_maker.shards_.at(0); shard->ridx_segments.resize(3); // 3 nodes. @@ -435,8 +430,8 @@ TEST(GpuHist, ApplySplit) { shard->gidx_buffer.data(), num_symbols); hist_maker.info_ = &info; - hist_maker.ApplySplit(candidate_entry, &tree); - hist_maker.UpdatePosition(candidate_entry, &tree); + shard->ApplySplit(candidate_entry, &tree); + shard->UpdatePosition(candidate_entry.nid, tree[candidate_entry.nid]); ASSERT_FALSE(tree[kNId].IsLeaf()); diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py index 172b0443d..009905110 100644 --- a/tests/distributed/distributed_gpu.py +++ b/tests/distributed/distributed_gpu.py @@ -54,7 +54,8 @@ base_params = { 'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic' + 'objective': 'binary:logistic', + 'debug_synchronize': True } diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 4cb683a88..5c26ae11c 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -51,7 +51,7 @@ class TestGPU(unittest.TestCase): variable_param = {'n_gpus': [-1], 'max_depth': [2, 10], 'max_leaves': [255, 4], 'max_bin': [2, 256], - 'grow_policy': ['lossguide']} + 'grow_policy': ['lossguide'], 'debug_synchronize': [True]} for param in parameter_combinations(variable_param): param['tree_method'] = 'gpu_hist' gpu_results = run_suite(param, select_datasets=datasets)
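For reference, the debug_synchronize flag enabled in these tests makes the updater run CheckTreesSynchronized after every tree: rank 0 serialises its tree, the buffer is broadcast via rabit, and each worker compares its local copy using the RegTree::operator== added in tree_model.h. A condensed restatement of that check (local_tree stands in for any worker's locally built tree):

    std::string s_model;
    common::MemoryBufferStream fs(&s_model);
    if (rabit::GetRank() == 0) {
      local_tree.Save(&fs);          // only rank 0 writes the reference tree
    }
    fs.Seek(0);
    rabit::Broadcast(&s_model, 0);   // everyone receives rank 0's serialisation
    RegTree reference_tree;
    reference_tree.Load(&fs);
    CHECK(local_tree == reference_tree);

Any divergence between workers (for example from unsynchronised column sampling) fails this check immediately instead of silently producing different models.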