diff --git a/.clang-tidy b/.clang-tidy index 1708dcf52..ecc265f8d 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -6,8 +6,8 @@ CheckOptions: - { key: readability-identifier-naming.TypedefCase, value: CamelCase } - { key: readability-identifier-naming.TypeTemplateParameterCase, value: CamelCase } - { key: readability-identifier-naming.MemberCase, value: lower_case } - - { key: readability-identifier-naming.PrivateMemberSuffix, value: '_' } - - { key: readability-identifier-naming.ProtectedMemberSuffix, value: '_' } + - { key: readability-identifier-naming.PrivateMemberSuffix, value: '_' } + - { key: readability-identifier-naming.ProtectedMemberSuffix, value: '_' } - { key: readability-identifier-naming.EnumCase, value: CamelCase } - { key: readability-identifier-naming.EnumConstant, value: CamelCase } - { key: readability-identifier-naming.EnumConstantPrefix, value: k } diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 45749a7db..cb7240448 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -87,7 +87,10 @@ class Booster { initialized_ = true; } - public: + bool IsInitialized() const { return initialized_; } + void Initialize() { initialized_ = true; } + + private: bool configured_; bool initialized_; std::unique_ptr learner_; @@ -1153,7 +1156,7 @@ XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, auto* bst = static_cast(handle); *version = rabit::LoadCheckPoint(bst->learner()); if (*version != 0) { - bst->initialized_ = true; + bst->Initialize(); } API_END(); } diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index 4a1ab918d..18fcdb337 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -42,7 +42,9 @@ class Column { uint32_t GetBaseIdx() const { return index_base_; } ColumnType GetType() const { return type_; } size_t GetRowIdx(size_t idx) const { - return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx]; + // clang-tidy worries that row_ind_ might be a nullptr, which is possible, + // but this low-level structure is not meant to be safe anyway. + return type_ == ColumnType::kDenseColumn ?
idx : row_ind_[idx]; // NOLINT } bool IsMissing(size_t idx) const { return index_[idx] == std::numeric_limits::max(); @@ -68,7 +70,7 @@ class ColumnMatrix { // construct column matrix from GHistIndexMatrix inline void Init(const GHistIndexMatrix& gmat, - double sparse_threshold) { + double sparse_threshold) { const auto nfeature = static_cast(gmat.cut.row_ptr.size() - 1); const size_t nrow = gmat.row_ptr.size() - 1; diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 923ee03cd..5e2461ef7 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -772,7 +772,7 @@ template typename std::iterator_traits::value_type SumReduction( dh::CubMemory &tmp_mem, T in, int nVals) { using ValueT = typename std::iterator_traits::value_type; - size_t tmpSize; + size_t tmpSize {0}; ValueT *dummy_out = nullptr; dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals)); // Allocate small extra memory for the return value diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 8e1340313..f1c3e412b 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -548,7 +548,7 @@ void GHistBuilder::BuildBlockHist(const std::vector& gpair, const size_t rest = nrows % kUnroll; #if defined(_OPENMP) - const auto nthread = static_cast(this->nthread_); + const auto nthread = static_cast(this->nthread_); // NOLINT #endif // defined(_OPENMP) tree::GradStats* p_hist = hist.data(); @@ -594,7 +594,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa const uint32_t rest = nbins % kUnroll; #if defined(_OPENMP) - const auto nthread = static_cast(this->nthread_); + const auto nthread = static_cast(this->nthread_); // NOLINT #endif // defined(_OPENMP) tree::GradStats* p_self = self.data(); tree::GradStats* p_sibling = sibling.data(); diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 0c41bcb67..b4cc7977e 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -24,13 +24,14 @@ namespace common { using WXQSketch = HistCutMatrix::WXQSketch; -__global__ void find_cuts_k +__global__ void FindCutsK (WXQSketch::Entry* __restrict__ cuts, const bst_float* __restrict__ data, const float* __restrict__ cum_weights, int nsamples, int ncuts) { // ncuts < nsamples int icut = threadIdx.x + blockIdx.x * blockDim.x; - if (icut >= ncuts) + if (icut >= ncuts) { return; + } WXQSketch::Entry v; int isample = 0; if (icut == 0) { @@ -55,7 +56,7 @@ struct IsNotNaN { __device__ bool operator()(float a) const { return !isnan(a); } }; -__global__ void unpack_features_k +__global__ void UnpackFeaturesK (float* __restrict__ fvalues, float* __restrict__ feature_weights, const size_t* __restrict__ row_ptrs, const float* __restrict__ weights, Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr, @@ -75,7 +76,7 @@ __global__ void unpack_features_k // if and only if it is also written to features if (!isnan(entry.fvalue) && (weights == nullptr || !isnan(weights[irow]))) { fvalues[ind] = entry.fvalue; - if (feature_weights != nullptr) { + if (feature_weights != nullptr && weights != nullptr) { feature_weights[ind] = weights[irow]; } } @@ -84,7 +85,7 @@ __global__ void unpack_features_k // finds quantiles on the GPU struct GPUSketcher { // manage memory for a single GPU - struct DeviceShard { + class DeviceShard { int device_; bst_uint row_begin_; // The row offset for this shard bst_uint row_end_; @@ -110,6 +111,7 @@ struct GPUSketcher { thrust::device_vector num_elements_; thrust::device_vector 
tmp_storage_; + public: DeviceShard(int device, bst_uint row_begin, bst_uint row_end, tree::TrainParam param) : device_(device), row_begin_(row_begin), row_end_(row_end), @@ -268,7 +270,7 @@ struct GPUSketcher { } else if (n_cuts_cur_[icol] > 0) { // if more elements than cuts: use binary search on cumulative weights int block = 256; - find_cuts_k<<>> + FindCutsK<<>> (cuts_d_.data().get() + icol * n_cuts_, fvalues_cur_.data().get(), weights2_.data().get(), n_unique, n_cuts_cur_[icol]); dh::safe_cuda(cudaGetLastError()); // NOLINT @@ -309,7 +311,7 @@ struct GPUSketcher { dim3 block3(64, 4, 1); dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x), dh::DivRoundUp(num_cols_, block3.y), 1); - unpack_features_k<<>> + UnpackFeaturesK<<>> (fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr, row_ptrs_.data().get() + batch_row_begin, has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(), @@ -340,6 +342,10 @@ struct GPUSketcher { SketchBatch(row_batch, info, gpu_batch); } } + + void GetSummary(WXQSketch::SummaryContainer *summary, size_t const icol) { + sketches_[icol].GetSummary(summary); + } }; void Sketch(const SparsePage& batch, const MetaInfo& info, @@ -368,8 +374,8 @@ struct GPUSketcher { WXQSketch::SummaryContainer summary; for (int icol = 0; icol < num_cols; ++icol) { sketches[icol].Init(batch.Size(), 1.0 / (8 * param_.max_bin)); - for (int shard = 0; shard < shards_.size(); ++shard) { - shards_[shard]->sketches_[icol].GetSummary(&summary); + for (auto &shard : shards_) { + shard->GetSummary(&summary, icol); sketches[icol].PushSummary(summary); } } @@ -381,6 +387,7 @@ struct GPUSketcher { dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus, n_rows)); } + private: std::vector> shards_; tree::TrainParam param_; GPUDistribution dist_; diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 6c3be21b0..3aa2f232d 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -38,6 +38,7 @@ struct HistCutMatrix { void Init(std::vector* sketchs, uint32_t max_num_bins); HistCutMatrix(); + size_t NumBins() const { return row_ptr.back(); } protected: virtual size_t SearchGroupIndFromBaseRow( diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index 98e760a50..bb8450526 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -18,6 +18,11 @@ struct HostDeviceVectorImpl { explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {} HostDeviceVectorImpl(std::initializer_list init) : data_h_(init), distribution_() {} explicit HostDeviceVectorImpl(std::vector init) : data_h_(std::move(init)), distribution_() {} + + std::vector& Vec() { return data_h_; } + GPUDistribution& Dist() { return distribution_; } + + private: std::vector data_h_; GPUDistribution distribution_; }; @@ -64,14 +69,14 @@ HostDeviceVector& HostDeviceVector::operator=(const HostDeviceVector& o } template -size_t HostDeviceVector::Size() const { return impl_->data_h_.size(); } +size_t HostDeviceVector::Size() const { return impl_->Vec().size(); } template GPUSet HostDeviceVector::Devices() const { return GPUSet::Empty(); } template const GPUDistribution& HostDeviceVector::Distribution() const { - return impl_->distribution_; + return impl_->Dist(); } template @@ -93,16 +98,16 @@ common::Span HostDeviceVector::ConstDeviceSpan(int device) const { } template -std::vector& HostDeviceVector::HostVector() { return impl_->data_h_; } +std::vector& HostDeviceVector::HostVector() { return 
impl_->Vec(); } template const std::vector& HostDeviceVector::ConstHostVector() const { - return impl_->data_h_; + return impl_->Vec(); } template void HostDeviceVector::Resize(size_t new_size, T v) { - impl_->data_h_.resize(new_size, v); + impl_->Vec().resize(new_size, v); } template diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index df8541f09..ef3c8eee8 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -23,10 +23,10 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) { // wrapper over access with useful methods class Permissions { GPUAccess access_; - explicit Permissions(GPUAccess access) : access_(access) {} + explicit Permissions(GPUAccess access) : access_{access} {} public: - Permissions() : access_(GPUAccess::kNone) {} + Permissions() : access_{GPUAccess::kNone} {} explicit Permissions(bool perm) : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {} @@ -46,8 +46,8 @@ template struct HostDeviceVectorImpl { struct DeviceShard { DeviceShard() - : proper_size_(0), device_(-1), start_(0), perm_d_(false), - cached_size_(~0), vec_(nullptr) {} + : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false}, + cached_size_{static_cast(~0)}, vec_{nullptr} {} void Init(HostDeviceVectorImpl* vec, int device) { if (vec_ == nullptr) { vec_ = vec; } @@ -154,6 +154,13 @@ struct HostDeviceVectorImpl { } } + T* Raw() { return data_.data().get(); } + size_t Start() const { return start_; } + size_t DataSize() const { return data_.size(); } + Permissions& Perm() { return perm_d_; } + Permissions const& Perm() const { return perm_d_; } + + private: int device_; thrust::device_vector data_; // cached vector size @@ -216,41 +223,42 @@ struct HostDeviceVectorImpl { T* DevicePointer(int device) { CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(device, GPUAccess::kWrite); - return shards_.at(distribution_.devices_.Index(device)).data_.data().get(); + return shards_.at(distribution_.devices_.Index(device)).Raw(); } const T* ConstDevicePointer(int device) { CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(device, GPUAccess::kRead); - return shards_.at(distribution_.devices_.Index(device)).data_.data().get(); + return shards_.at(distribution_.devices_.Index(device)).Raw(); } common::Span DeviceSpan(int device) { GPUSet devices = distribution_.devices_; CHECK(devices.Contains(device)); LazySyncDevice(device, GPUAccess::kWrite); - return {shards_.at(devices.Index(device)).data_.data().get(), - static_cast::index_type>(DeviceSize(device))}; + return {shards_.at(devices.Index(device)).Raw(), + static_cast::index_type>(DeviceSize(device))}; } common::Span ConstDeviceSpan(int device) { GPUSet devices = distribution_.devices_; CHECK(devices.Contains(device)); LazySyncDevice(device, GPUAccess::kRead); - return {shards_.at(devices.Index(device)).data_.data().get(), - static_cast::index_type>(DeviceSize(device))}; + using SpanInd = typename common::Span::index_type; + return {shards_.at(devices.Index(device)).Raw(), + static_cast(DeviceSize(device))}; } size_t DeviceSize(int device) { CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(device, GPUAccess::kRead); - return shards_.at(distribution_.devices_.Index(device)).data_.size(); + return shards_.at(distribution_.devices_.Index(device)).DataSize(); } size_t DeviceStart(int device) { CHECK(distribution_.devices_.Contains(device)); LazySyncDevice(device, GPUAccess::kRead); - return shards_.at(distribution_.devices_.Index(device)).start_; + return 
shards_.at(distribution_.devices_.Index(device)).Start(); } thrust::device_ptr tbegin(int device) { // NOLINT @@ -293,7 +301,7 @@ struct HostDeviceVectorImpl { } } - void Fill(T v) { + void Fill(T v) { // NOLINT if (perm_h_.CanWrite()) { std::fill(data_h_.begin(), data_h_.end(), v); } else { @@ -389,7 +397,7 @@ struct HostDeviceVectorImpl { if (perm_h_.CanRead()) { // data is present, just need to deny access to the device dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { - shard.perm_d_.DenyComplementary(access); + shard.Perm().DenyComplementary(access); }); perm_h_.Grant(access); return; @@ -412,9 +420,10 @@ struct HostDeviceVectorImpl { bool DeviceCanAccess(int device, GPUAccess access) { GPUSet devices = distribution_.Devices(); if (!devices.Contains(device)) { return false; } - return shards_.at(devices.Index(device)).perm_d_.CanAccess(access); + return shards_.at(devices.Index(device)).Perm().CanAccess(access); } + private: std::vector data_h_; Permissions perm_h_; // the total size of the data stored on the devices diff --git a/src/common/math.h b/src/common/math.h index 20879aaba..4e27a57f1 100644 --- a/src/common/math.h +++ b/src/common/math.h @@ -7,6 +7,8 @@ #ifndef XGBOOST_COMMON_MATH_H_ #define XGBOOST_COMMON_MATH_H_ +#include + #include #include #include diff --git a/src/common/span.h b/src/common/span.h index bcaf1b178..cd28ca27e 100644 --- a/src/common/span.h +++ b/src/common/span.h @@ -622,8 +622,8 @@ XGBOOST_DEVICE auto as_writable_bytes(Span s) __span_noexcept -> // NOLIN return {reinterpret_cast(s.data()), s.size_bytes()}; } -} // namespace common NOLINT -} // namespace xgboost NOLINT +} // namespace common +} // namespace xgboost #if defined(_MSC_VER) &&_MSC_VER < 1910 #undef constexpr diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc index 8eea3fce6..33b3e9181 100644 --- a/src/linear/updater_coordinate.cc +++ b/src/linear/updater_coordinate.cc @@ -30,8 +30,8 @@ class CoordinateUpdater : public LinearUpdater { tparam_.InitAllowUnknown(args) }; cparam_.InitAllowUnknown(rest); - selector.reset(FeatureSelector::Create(tparam_.feature_selector)); - monitor.Init("CoordinateUpdater"); + selector_.reset(FeatureSelector::Create(tparam_.feature_selector)); + monitor_.Init("CoordinateUpdater"); } void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model, double sum_instance_weight) override { @@ -48,20 +48,20 @@ class CoordinateUpdater : public LinearUpdater { dbias, &in_gpair->HostVector(), p_fmat); } // prepare for updating the weights - selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, + selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, cparam_.top_k); // update weights for (int group_idx = 0; group_idx < ngroup; ++group_idx) { for (unsigned i = 0U; i < model->param.num_feature; i++) { - int fidx = selector->NextFeature + int fidx = selector_->NextFeature (i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm); if (fidx < 0) break; this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model); } } - monitor.Stop("UpdateFeature"); + monitor_.Stop("UpdateFeature"); } inline void UpdateFeature(int fidx, int group_idx, std::vector *in_gpair, @@ -78,11 +78,12 @@ class CoordinateUpdater : public LinearUpdater { UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat); } + private: CoordinateParam cparam_; // training parameter LinearTrainParam tparam_; 
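// note: the renames below follow the .clang-tidy rule at the top of this diff — private members get a trailing '_' (readability-identifier-naming.PrivateMemberSuffix).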
- std::unique_ptr selector; - common::Monitor monitor; + std::unique_ptr selector_; + common::Monitor monitor_; }; XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent") diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index 0d4cbe824..663276675 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -62,7 +62,7 @@ class DeviceShard { auto column_end = std::lower_bound(col.cbegin(), col.cend(), xgboost::Entry(row_end, 0.0f), cmp); - column_segments.push_back( + column_segments.emplace_back( std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin())); row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin)); } @@ -154,13 +154,13 @@ class GPUCoordinateUpdater : public LinearUpdater { void Init( const std::vector> &args) override { tparam_.InitAllowUnknown(args); - selector.reset(FeatureSelector::Create(tparam_.feature_selector)); - monitor.Init("GPUCoordinateUpdater"); + selector_.reset(FeatureSelector::Create(tparam_.feature_selector)); + monitor_.Init("GPUCoordinateUpdater"); } void LazyInitShards(DMatrix *p_fmat, const gbm::GBLinearModelParam &model_param) { - if (!shards.empty()) return; + if (!shards_.empty()) return; dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus, p_fmat->Info().num_row_)); @@ -183,9 +183,9 @@ class GPUCoordinateUpdater : public LinearUpdater { CHECK(p_fmat->SingleColBlock()); SparsePage const& batch = *(p_fmat->GetColumnBatches().begin()); - shards.resize(n_devices); + shards_.resize(n_devices); // Create device shards - dh::ExecuteIndexShards(&shards, + dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr& shard) { shard = std::unique_ptr( new DeviceShard(devices.DeviceId(i), batch, row_segments[i], @@ -196,38 +196,38 @@ class GPUCoordinateUpdater : public LinearUpdater { void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model, double sum_instance_weight) override { tparam_.DenormalizePenalties(sum_instance_weight); - monitor.Start("LazyInitShards"); + monitor_.Start("LazyInitShards"); this->LazyInitShards(p_fmat, model->param); - monitor.Stop("LazyInitShards"); + monitor_.Stop("LazyInitShards"); - monitor.Start("UpdateGpair"); + monitor_.Start("UpdateGpair"); // Update gpair - dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr& shard) { + dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { if (!shard->IsEmpty()) { shard->UpdateGpair(in_gpair->ConstHostVector(), model->param); } }); - monitor.Stop("UpdateGpair"); + monitor_.Stop("UpdateGpair"); - monitor.Start("UpdateBias"); + monitor_.Start("UpdateBias"); this->UpdateBias(p_fmat, model); - monitor.Stop("UpdateBias"); + monitor_.Stop("UpdateBias"); // prepare for updating the weights - selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat, - tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, - coord_param_.top_k); - monitor.Start("UpdateFeature"); + selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat, + tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, + coord_param_.top_k); + monitor_.Start("UpdateFeature"); for (auto group_idx = 0; group_idx < model->param.num_output_group; ++group_idx) { for (auto i = 0U; i < model->param.num_feature; i++) { - auto fidx = selector->NextFeature( + auto fidx = selector_->NextFeature( i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm); if (fidx < 0) break; this->UpdateFeature(fidx, group_idx, 
&in_gpair->HostVector(), model); } } - monitor.Stop("UpdateFeature"); + monitor_.Stop("UpdateFeature"); } void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) { @@ -235,7 +235,7 @@ ++group_idx) { // Get gradient auto grad = dh::ReduceShards( - &shards, [&](std::unique_ptr &shard) { + &shards_, [&](std::unique_ptr &shard) { if (!shard->IsEmpty()) { GradientPair result = shard->GetBiasGradient(group_idx, @@ -251,7 +251,7 @@ model->bias()[group_idx] += dbias; // Update residual - dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr& shard) { + dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { if (!shard->IsEmpty()) { shard->UpdateBiasResidual(dbias, group_idx, model->param.num_output_group); @@ -266,7 +266,7 @@ bst_float &w = (*model)[fidx][group_idx]; // Get gradient auto grad = dh::ReduceShards( - &shards, [&](std::unique_ptr &shard) { + &shards_, [&](std::unique_ptr &shard) { if (!shard->IsEmpty()) { return shard->GetGradient(group_idx, model->param.num_output_group, fidx); @@ -280,7 +280,7 @@ tparam_.reg_lambda_denorm)); w += dw; - dh::ExecuteIndexShards(&shards, [&](int idx, + dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr &shard) { if (!shard->IsEmpty()) { shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx); @@ -288,14 +288,15 @@ }); } + private: // training parameter LinearTrainParam tparam_; CoordinateParam coord_param_; GPUDistribution dist_; - std::unique_ptr selector; - common::Monitor monitor; + std::unique_ptr selector_; + common::Monitor monitor_; - std::vector> shards; + std::vector> shards_; }; XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent") diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index cb7f9153d..b694feafd 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -27,23 +27,28 @@ namespace metric { // tag this file, used to force static linking later.
DMLC_REGISTRY_FILE_TAG(elementwise_metric); -struct PackedReduceResult { - double residue_sum_; - double weights_sum_; - - XGBOOST_DEVICE PackedReduceResult() : residue_sum_{0}, weights_sum_{0} {} - XGBOOST_DEVICE PackedReduceResult(double residue, double weight) : - residue_sum_{residue}, weights_sum_{weight} {} - - XGBOOST_DEVICE - PackedReduceResult operator+(PackedReduceResult const& other) const { - return PackedReduceResult { residue_sum_ + other.residue_sum_, - weights_sum_ + other.weights_sum_ }; - } -}; - template class MetricsReduction { + public: + class PackedReduceResult { + double residue_sum_; + double weights_sum_; + friend MetricsReduction; + + public: + XGBOOST_DEVICE PackedReduceResult() : residue_sum_{0}, weights_sum_{0} {} + XGBOOST_DEVICE PackedReduceResult(double residue, double weight) : + residue_sum_{residue}, weights_sum_{weight} {} + + XGBOOST_DEVICE + PackedReduceResult operator+(PackedReduceResult const& other) const { + return PackedReduceResult { residue_sum_ + other.residue_sum_, + weights_sum_ + other.weights_sum_ }; + } + double Residue() const { return residue_sum_; } + double Weights() const { return weights_sum_; } + }; + public: explicit MetricsReduction(EvalRow policy) : policy_(std::move(policy)) {} @@ -346,10 +351,10 @@ struct EvalEWiseBase : public Metric { // Dealing with ndata < n_gpus. GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata); - PackedReduceResult result = + auto result = reducer_.Reduce(devices, info.weights_, info.labels_, preds); - double dat[2] { result.residue_sum_, result.weights_sum_ }; + double dat[2] { result.Residue(), result.Weights() }; if (distributed) { rabit::Allreduce(dat, 2); } diff --git a/src/metric/multiclass_metric.cc b/src/metric/multiclass_metric.cc index 31a402755..a1db79901 100644 --- a/src/metric/multiclass_metric.cc +++ b/src/metric/multiclass_metric.cc @@ -79,6 +79,8 @@ struct EvalMClassBase : public Metric { inline static bst_float GetFinal(bst_float esum, bst_float wsum) { return esum / wsum; } + + private: // used to store error message const char *error_msg_; }; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 3d6a1df64..3b235c12c 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -48,7 +48,7 @@ void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) { */ struct DevicePredictionNode { XGBOOST_DEVICE DevicePredictionNode() - : fidx(-1), left_child_idx(-1), right_child_idx(-1) {} + : fidx{-1}, left_child_idx{-1}, right_child_idx{-1} {} union NodeValue { float leaf_weight; @@ -238,10 +238,10 @@ class GPUPredictor : public xgboost::Predictor { } struct DeviceShard { - DeviceShard() : device_(-1) {} + DeviceShard() : device_{-1} {} void Init(int device) { this->device_ = device; - max_shared_memory_bytes = dh::MaxSharedMemory(this->device_); + max_shared_memory_bytes_ = dh::MaxSharedMemory(this->device_); } void PredictInternal (const SparsePage& batch, const MetaInfo& info, @@ -251,20 +251,20 @@ class GPUPredictor : public xgboost::Predictor { const thrust::host_vector& h_nodes, size_t tree_begin, size_t tree_end) { dh::safe_cuda(cudaSetDevice(device_)); - nodes.resize(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(dh::Raw(nodes), h_nodes.data(), - sizeof(DevicePredictionNode) * h_nodes.size(), - cudaMemcpyHostToDevice)); - tree_segments.resize(h_tree_segments.size()); + nodes_.resize(h_nodes.size()); + dh::safe_cuda(cudaMemcpyAsync(dh::Raw(nodes_), h_nodes.data(), + sizeof(DevicePredictionNode) * h_nodes.size(), 
+ cudaMemcpyHostToDevice)); + tree_segments_.resize(h_tree_segments.size()); - dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_segments), h_tree_segments.data(), - sizeof(size_t) * h_tree_segments.size(), - cudaMemcpyHostToDevice)); - tree_group.resize(model.tree_info.size()); + dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_segments_), h_tree_segments.data(), + sizeof(size_t) * h_tree_segments.size(), + cudaMemcpyHostToDevice)); + tree_group_.resize(model.tree_info.size()); - dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_group), model.tree_info.data(), - sizeof(int) * model.tree_info.size(), - cudaMemcpyHostToDevice)); + dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_group_), model.tree_info.data(), + sizeof(int) * model.tree_info.size(), + cudaMemcpyHostToDevice)); const int BLOCK_THREADS = 128; size_t num_rows = batch.offset.DeviceSize(device_) - 1; @@ -275,7 +275,7 @@ class GPUPredictor : public xgboost::Predictor { int shared_memory_bytes = static_cast (sizeof(float) * info.num_col_ * BLOCK_THREADS); bool use_shared = true; - if (shared_memory_bytes > max_shared_memory_bytes) { + if (shared_memory_bytes > max_shared_memory_bytes_) { shared_memory_bytes = 0; use_shared = false; } @@ -284,17 +284,18 @@ class GPUPredictor : public xgboost::Predictor { data_distr.Devices().Index(device_)); PredictKernel<<>> - (dh::ToSpan(nodes), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments), - dh::ToSpan(tree_group), batch.offset.DeviceSpan(device_), + (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_), + dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_), batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_, num_rows, entry_start, use_shared, model.param.num_output_group); } + private: int device_; - thrust::device_vector nodes; - thrust::device_vector tree_segments; - thrust::device_vector tree_group; - size_t max_shared_memory_bytes; + thrust::device_vector nodes_; + thrust::device_vector tree_segments_; + thrust::device_vector tree_group_; + size_t max_shared_memory_bytes_; }; void DevicePredictInternal(DMatrix* dmat, @@ -325,13 +326,12 @@ class GPUPredictor : public xgboost::Predictor { for (const auto &batch : dmat->GetRowBatches()) { CHECK_EQ(i_batch, 0) << "External memory not supported"; - size_t n_rows = batch.offset.Size() - 1; // out_preds have been resharded and resized in InitOutPredictions() batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1)); std::vector device_offsets; DeviceOffsets(batch.offset, &device_offsets); batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets)); - dh::ExecuteIndexShards(&shards, [&](int idx, DeviceShard& shard) { + dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.PredictInternal(batch, dmat->Info(), out_preds, model, h_tree_segments, h_nodes, tree_begin, tree_end); }); @@ -340,13 +340,13 @@ class GPUPredictor : public xgboost::Predictor { } public: - GPUPredictor() : cpu_predictor(Predictor::Create("cpu_predictor")) {} + GPUPredictor() : cpu_predictor_(Predictor::Create("cpu_predictor")) {} void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, const gbm::GBTreeModel& model, int tree_begin, unsigned ntree_limit = 0) override { GPUSet devices = GPUSet::All( - param.gpu_id, param.n_gpus, dmat->Info().num_row_); + param_.gpu_id, param_.n_gpus, dmat->Info().num_row_); ConfigureShards(devices); if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) { @@ -427,12 +427,12 @@ class GPUPredictor : public xgboost::Predictor { std::vector* out_preds, const 
gbm::GBTreeModel& model, unsigned ntree_limit, unsigned root_index) override { - cpu_predictor->PredictInstance(inst, out_preds, model, root_index); + cpu_predictor_->PredictInstance(inst, out_preds, model, root_index); } void PredictLeaf(DMatrix* p_fmat, std::vector* out_preds, const gbm::GBTreeModel& model, unsigned ntree_limit) override { - cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit); + cpu_predictor_->PredictLeaf(p_fmat, out_preds, model, ntree_limit); } void PredictContribution(DMatrix* p_fmat, @@ -440,7 +440,7 @@ class GPUPredictor : public xgboost::Predictor { const gbm::GBTreeModel& model, unsigned ntree_limit, bool approximate, int condition, unsigned condition_feature) override { - cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, + cpu_predictor_->PredictContribution(p_fmat, out_contribs, model, ntree_limit, approximate, condition, condition_feature); } @@ -450,17 +450,17 @@ class GPUPredictor : public xgboost::Predictor { const gbm::GBTreeModel& model, unsigned ntree_limit, bool approximate) override { - cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, + cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, approximate); } void Init(const std::vector>& cfg, const std::vector>& cache) override { Predictor::Init(cfg, cache); - cpu_predictor->Init(cfg, cache); - param.InitAllowUnknown(cfg); + cpu_predictor_->Init(cfg, cache); + param_.InitAllowUnknown(cfg); - GPUSet devices = GPUSet::All(param.gpu_id, param.n_gpus); + GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus); ConfigureShards(devices); } @@ -470,16 +470,16 @@ class GPUPredictor : public xgboost::Predictor { if (devices_ == devices) return; devices_ = devices; - shards.clear(); - shards.resize(devices_.Size()); - dh::ExecuteIndexShards(&shards, [=](size_t i, DeviceShard& shard){ + shards_.clear(); + shards_.resize(devices_.Size()); + dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){ shard.Init(devices_.DeviceId(i)); }); } - GPUPredictionParam param; - std::unique_ptr cpu_predictor; - std::vector shards; + GPUPredictionParam param_; + std::unique_ptr cpu_predictor_; + std::vector shards_; GPUSet devices_; }; diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 97fdfa045..24b47ba65 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -77,7 +77,7 @@ class ColMaker: public TreeUpdater { /*! \brief current best solution */ SplitEntry best; // constructor - NodeEntry() : root_gain(0.0f), weight(0.0f) {} + NodeEntry() : root_gain{0.0f}, weight{0.0f} {} }; // actual builder that runs the algorithm class Builder { @@ -595,9 +595,10 @@ class ColMaker: public TreeUpdater { const MetaInfo& info = p_fmat->Info(); // start enumeration const auto num_features = static_cast(feat_set.size()); - #if defined(_OPENMP) - const int batch_size = std::max(static_cast(num_features / this->nthread_ / 32), 1); - #endif // defined(_OPENMP) +#if defined(_OPENMP) + const int batch_size = // NOLINT + std::max(static_cast(num_features / this->nthread_ / 32), 1); +#endif // defined(_OPENMP) int poption = param_.parallel_option; if (poption == 2) { poption = static_cast(num_features) * 2 < this->nthread_ ? 
1 : 0; diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu index 4464f63fa..e9745b78e 100644 --- a/src/tree/updater_gpu.cu +++ b/src/tree/updater_gpu.cu @@ -18,11 +18,11 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu); template XGBOOST_DEVICE float inline LossChangeMissing(const GradientPairT& scan, - const GradientPairT& missing, - const GradientPairT& parent_sum, - const float& parent_gain, - const GPUTrainingParam& param, - bool& missing_left_out) { // NOLINT + const GradientPairT& missing, + const GradientPairT& parent_sum, + const float& parent_gain, + const GPUTrainingParam& param, + bool& missing_left_out) { // NOLINT // Put gradients of missing values to left float missing_left_loss = DeviceCalcLossChange(param, scan + missing, parent_sum, parent_gain); @@ -102,7 +102,7 @@ struct AddByKey { * @param instIds instance index buffer * @return the expected gradient value */ -HOST_DEV_INLINE GradientPair get(int id, +HOST_DEV_INLINE GradientPair Get(int id, common::Span vals, common::Span instIds) { id = instIds[id]; @@ -123,13 +123,13 @@ __global__ void CubScanByKeyL1( Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)}; int myKey; GradientPair myValue; - typedef cub::BlockScan BlockScan; + using BlockScan = cub::BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; Pair threadData; int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x; if (tid < size) { myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys); - myValue = get(tid, vals, instIds); + myValue = Get(tid, vals, instIds); } else { myKey = kNoneKey; myValue = {}; @@ -164,7 +164,7 @@ __global__ void CubScanByKeyL1( template __global__ void CubScanByKeyL2(common::Span mScans, common::Span mKeys, int mLength) { - typedef cub::BlockScan BlockScan; + using BlockScan = cub::BlockScan; Pair threadData; __shared__ typename BlockScan::TempStorage temp_storage; for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) { @@ -205,19 +205,19 @@ __global__ void CubScanByKeyL3(common::Span sums, int previousKey = tid == 0 ? 
kNoneKey : Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys); - GradientPair myValue = scans[tid]; + GradientPair my_value = scans[tid]; __syncthreads(); if (blockIdx.x > 0 && s_mKeys == previousKey) { - myValue += s_mScans[0]; + my_value += s_mScans[0]; } if (tid == size - 1) { - sums[previousKey] = myValue + get(tid, vals, instIds); + sums[previousKey] = my_value + Get(tid, vals, instIds); } if ((previousKey != myKey) && (previousKey >= 0)) { - sums[previousKey] = myValue; - myValue = GradientPair(0.0f, 0.0f); + sums[previousKey] = my_value; + my_value = GradientPair(0.0f, 0.0f); } - scans[tid] = myValue; + scans[tid] = my_value; } /** @@ -271,14 +271,14 @@ struct ExactSplitCandidate { /** index where to split in the DMatrix */ int index; - HOST_DEV_INLINE ExactSplitCandidate() : score(-FLT_MAX), index(INT_MAX) {} + HOST_DEV_INLINE ExactSplitCandidate() : score{-FLT_MAX}, index{INT_MAX} {} /** * @brief Whether the split info is valid to be used to create a new child * @param minSplitLoss minimum score above which decision to split is made * @return true if splittable, else false */ - HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const { + HOST_DEV_INLINE bool IsSplittable(float minSplitLoss) const { return ((score >= minSplitLoss) && (index != INT_MAX)); } }; @@ -297,7 +297,7 @@ enum ArgMaxByKeyAlgo { /** max depth until which to use shared mem based atomics for argmax */ static const int kMaxAbkLevels = 3; -HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a, +HOST_DEV_INLINE ExactSplitCandidate MaxSplit(ExactSplitCandidate a, ExactSplitCandidate b) { ExactSplitCandidate out; if (a.score < b.score) { @@ -315,13 +315,13 @@ HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a, DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address, ExactSplitCandidate val) { - unsigned long long* intAddress = (unsigned long long*)address; // NOLINT + unsigned long long* intAddress = reinterpret_cast(address); // NOLINT unsigned long long old = *intAddress; // NOLINT - unsigned long long assumed; // NOLINT + unsigned long long assumed = old; // NOLINT do { assumed = old; ExactSplitCandidate res = - maxSplit(val, *reinterpret_cast(&assumed)); + MaxSplit(val, *reinterpret_cast(&assumed)); old = atomicCAS(intAddress, assumed, *reinterpret_cast(&res)); } while (assumed != old); } @@ -399,7 +399,7 @@ __global__ void AtomicArgMaxByKeySmem( nUniqKeys * sizeof(ExactSplitCandidate))); int tid = threadIdx.x; ExactSplitCandidate defVal; -#pragma unroll 1 + for (int i = tid; i < nUniqKeys; i += blockDim.x) { sNodeSplits[i] = defVal; } @@ -465,7 +465,7 @@ void ArgMaxByKey(common::Span nodeSplits, } } -__global__ void assignColIds(int* colIds, const int* colOffsets) { +__global__ void AssignColIds(int* colIds, const int* colOffsets) { int myId = blockIdx.x; int start = colOffsets[myId]; int end = colOffsets[myId + 1]; @@ -474,10 +474,10 @@ __global__ void assignColIds(int* colIds, const int* colOffsets) { } } -__global__ void fillDefaultNodeIds(NodeIdT* nodeIdsPerInst, - const DeviceNodeStats* nodes, int nRows) { +__global__ void FillDefaultNodeIds(NodeIdT* nodeIdsPerInst, + const DeviceNodeStats* nodes, int n_rows) { int id = threadIdx.x + (blockIdx.x * blockDim.x); - if (id >= nRows) { + if (id >= n_rows) { return; } // if this element belongs to none of the currently active node-id's @@ -497,7 +497,7 @@ __global__ void fillDefaultNodeIds(NodeIdT* nodeIdsPerInst, nodeIdsPerInst[id] = result; } -__global__ void assignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations, 
+__global__ void AssignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations, const NodeIdT* nodeIds, const int* instId, const DeviceNodeStats* nodes, const int* colOffsets, const float* vals, @@ -526,7 +526,7 @@ __global__ void assignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations, } } -__global__ void markLeavesKernel(DeviceNodeStats* nodes, int len) { +__global__ void MarkLeavesKernel(DeviceNodeStats* nodes, int len) { int id = (blockIdx.x * blockDim.x) + threadIdx.x; if ((id < len) && !nodes[id].IsUnused()) { int lid = (id << 1) + 1; @@ -541,118 +541,117 @@ class GPUMaker : public TreeUpdater { protected: - TrainParam param; + TrainParam param_; /** whether we have initialized memory already (so as not to repeat!) */ - bool allocated; + bool allocated_; /** feature values stored in column-major compressed format */ - dh::DVec2 vals; - dh::DVec vals_cached; + dh::DVec2 vals_; + dh::DVec vals_cached_; /** corresponding instance ids of these feature values */ - dh::DVec2 instIds; - dh::DVec instIds_cached; + dh::DVec2 instIds_; + dh::DVec inst_ids_cached_; /** column offsets for these feature values */ - dh::DVec colOffsets; - dh::DVec gradsInst; - dh::DVec2 nodeAssigns; - dh::DVec2 nodeLocations; - dh::DVec nodes; - dh::DVec nodeAssignsPerInst; - dh::DVec gradSums; - dh::DVec gradScans; - dh::DVec nodeSplits; - int nVals; - int nRows; - int nCols; - int maxNodes; - int maxLeaves; + dh::DVec colOffsets_; + dh::DVec gradsInst_; + dh::DVec2 nodeAssigns_; + dh::DVec2 nodeLocations_; + dh::DVec nodes_; + dh::DVec node_assigns_per_inst_; + dh::DVec gradsums_; + dh::DVec gradscans_; + dh::DVec nodeSplits_; + int n_vals_; + int n_rows_; + int n_cols_; + int maxNodes_; + int maxLeaves_; // devices are only used for resharding the HostDeviceVector passed as a parameter; // the algorithm works with a single GPU only GPUSet devices_; - dh::CubMemory tmp_mem; - dh::DVec tmpScanGradBuff; - dh::DVec tmpScanKeyBuff; - dh::DVec colIds; - dh::BulkAllocator ba; + dh::CubMemory tmp_mem_; + dh::DVec tmpScanGradBuff_; + dh::DVec tmp_scan_key_buff_; + dh::DVec colIds_; + dh::BulkAllocator ba_; public: - GPUMaker() : allocated(false) {} - ~GPUMaker() {} + GPUMaker() : allocated_{false} {} + ~GPUMaker() override = default; - void Init( - const std::vector>& args) { - param.InitAllowUnknown(args); - maxNodes = (1 << (param.max_depth + 1)) - 1; - maxLeaves = 1 << param.max_depth; + void Init(const std::vector> &args) override { + param_.InitAllowUnknown(args); + maxNodes_ = (1 << (param_.max_depth + 1)) - 1; + maxLeaves_ = 1 << param_.max_depth; - devices_ = GPUSet::All(param.gpu_id, param.n_gpus); + devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus); } void Update(HostDeviceVector* gpair, DMatrix* dmat, - const std::vector& trees) { + const std::vector& trees) override { // rescale learning rate according to size of trees - float lr = param.learning_rate; - param.learning_rate = lr / trees.size(); + float lr = param_.learning_rate; + param_.learning_rate = lr / trees.size(); gpair->Reshard(devices_); try { // build tree - for (size_t i = 0; i < trees.size(); ++i) { - UpdateTree(gpair, dmat, trees[i]); + for (auto tree : trees) { + UpdateTree(gpair, dmat, tree); } } catch (const std::exception& e) { LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl; } - param.learning_rate = lr; + param_.learning_rate = lr; } /// @note: Update should be only after Init!!
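/// (Init() only reads the training parameters; the first UpdateTree() call below triggers SetupOneTimeData(), which performs the actual allocation and sorting.)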
void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, RegTree* hTree) { - if (!allocated) { + if (!allocated_) { SetupOneTimeData(dmat); } - for (int i = 0; i < param.max_depth; ++i) { + for (int i = 0; i < param_.max_depth; ++i) { if (i == 0) { // make sure to start on a fresh tree with sorted values! - vals.CurrentDVec() = vals_cached; - instIds.CurrentDVec() = instIds_cached; - transferGrads(gpair); + vals_.CurrentDVec() = vals_cached_; + instIds_.CurrentDVec() = inst_ids_cached_; + TransferGrads(gpair); } int nNodes = 1 << i; NodeIdT nodeStart = nNodes - 1; - initNodeData(i, nodeStart, nNodes); - findSplit(i, nodeStart, nNodes); + InitNodeData(i, nodeStart, nNodes); + FindSplit(i, nodeStart, nNodes); } // mark all the used nodes with unused children as leaf nodes - markLeaves(); - Dense2SparseTree(hTree, nodes, param); + MarkLeaves(); + Dense2SparseTree(hTree, nodes_, param_); } - void split2node(int nNodes, NodeIdT nodeStart) { - auto d_nodes = nodes.GetSpan(); - auto d_gradScans = gradScans.GetSpan(); - auto d_gradSums = gradSums.GetSpan(); - auto d_nodeAssigns = nodeAssigns.CurrentSpan(); - auto d_colIds = colIds.GetSpan(); - auto d_vals = vals.Current(); - auto d_nodeSplits = nodeSplits.Data(); + void Split2Node(int nNodes, NodeIdT nodeStart) { + auto d_nodes = nodes_.GetSpan(); + auto d_gradScans = gradscans_.GetSpan(); + auto d_gradsums = gradsums_.GetSpan(); + auto d_nodeAssigns = nodeAssigns_.CurrentSpan(); + auto d_colIds = colIds_.GetSpan(); + auto d_vals = vals_.Current(); + auto d_nodeSplits = nodeSplits_.Data(); int nUniqKeys = nNodes; - float min_split_loss = param.min_split_loss; - auto gpu_param = GPUTrainingParam(param); + float min_split_loss = param_.min_split_loss; + auto gpu_param = GPUTrainingParam(param_); - dh::LaunchN(param.gpu_id, nNodes, [=] __device__(int uid) { + dh::LaunchN(param_.gpu_id, nNodes, [=] __device__(int uid) { int absNodeId = uid + nodeStart; ExactSplitCandidate s = d_nodeSplits[uid]; - if (s.isSplittable(min_split_loss)) { + if (s.IsSplittable(min_split_loss)) { int idx = s.index; int nodeInstId = Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys); bool missingLeft = true; const DeviceNodeStats& n = d_nodes[absNodeId]; GradientPair gradScan = d_gradScans[idx]; - GradientPair gradSum = d_gradSums[nodeInstId]; + GradientPair gradSum = d_gradsums[nodeInstId]; float thresh = d_vals[idx]; int colId = d_colIds[idx]; // get the default direction for the current node @@ -679,54 +678,53 @@ class GPUMaker : public TreeUpdater { }); } - void findSplit(int level, NodeIdT nodeStart, int nNodes) { - ReduceScanByKey(gradSums.GetSpan(), gradScans.GetSpan(), gradsInst.GetSpan(), - instIds.CurrentSpan(), nodeAssigns.CurrentSpan(), nVals, nNodes, - nCols, tmpScanGradBuff.GetSpan(), tmpScanKeyBuff.GetSpan(), - colIds.GetSpan(), nodeStart); - ArgMaxByKey(nodeSplits.GetSpan(), gradScans.GetSpan(), gradSums.GetSpan(), - vals.CurrentSpan(), colIds.GetSpan(), nodeAssigns.CurrentSpan(), - nodes.GetSpan(), nNodes, nodeStart, nVals, param, + void FindSplit(int level, NodeIdT nodeStart, int nNodes) { + ReduceScanByKey(gradsums_.GetSpan(), gradscans_.GetSpan(), gradsInst_.GetSpan(), + instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes, + n_cols_, tmpScanGradBuff_.GetSpan(), tmp_scan_key_buff_.GetSpan(), + colIds_.GetSpan(), nodeStart); + ArgMaxByKey(nodeSplits_.GetSpan(), gradscans_.GetSpan(), gradsums_.GetSpan(), + vals_.CurrentSpan(), colIds_.GetSpan(), nodeAssigns_.CurrentSpan(), + nodes_.GetSpan(), nNodes, nodeStart, n_vals_, param_, 
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem); - split2node(nNodes, nodeStart); + Split2Node(nNodes, nodeStart); } - void allocateAllData(int offsetSize) { - int tmpBuffSize = ScanTempBufferSize(nVals); - ba.Allocate(param.gpu_id, &vals, nVals, - &vals_cached, nVals, &instIds, nVals, &instIds_cached, nVals, - &colOffsets, offsetSize, &gradsInst, nRows, &nodeAssigns, nVals, - &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst, - nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals, - &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize, - &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals); + void AllocateAllData(int offsetSize) { + int tmpBuffSize = ScanTempBufferSize(n_vals_); + ba_.Allocate(param_.gpu_id, &vals_, n_vals_, + &vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_, + &colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_, + &nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_, + n_rows_, &gradsums_, maxLeaves_ * n_cols_, &gradscans_, n_vals_, + &nodeSplits_, maxLeaves_, &tmpScanGradBuff_, tmpBuffSize, + &tmp_scan_key_buff_, tmpBuffSize, &colIds_, n_vals_); } void SetupOneTimeData(DMatrix* dmat) { - size_t free_memory = dh::AvailableMemory(param.gpu_id); if (!dmat->SingleColBlock()) { LOG(FATAL) << "exact::GPUBuilder - must have 1 column block"; } std::vector fval; std::vector fId; std::vector offset; - convertToCsc(dmat, &fval, &fId, &offset); - allocateAllData(static_cast(offset.size())); - transferAndSortData(fval, fId, offset); - allocated = true; + ConvertToCsc(dmat, &fval, &fId, &offset); + AllocateAllData(static_cast(offset.size())); + TransferAndSortData(fval, fId, offset); + allocated_ = true; } - void convertToCsc(DMatrix* dmat, std::vector* fval, + void ConvertToCsc(DMatrix* dmat, std::vector* fval, std::vector* fId, std::vector* offset) { const MetaInfo& info = dmat->Info(); CHECK(info.num_col_ < std::numeric_limits::max()); CHECK(info.num_row_ < std::numeric_limits::max()); - nRows = static_cast(info.num_row_); - nCols = static_cast(info.num_col_); - offset->reserve(nCols + 1); + n_rows_ = static_cast(info.num_row_); + n_cols_ = static_cast(info.num_col_); + offset->reserve(n_cols_ + 1); offset->push_back(0); - fval->reserve(nCols * nRows); - fId->reserve(nCols * nRows); + fval->reserve(n_cols_ * n_rows_); + fId->reserve(n_cols_ * n_rows_); // in case you end up with a DMatrix having no column access // then make sure to enable that before copying the data! 
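// (a reading of the loop below: GetSortedColumnBatches() yields feature-major pages, so fval/fId accumulate each column's values and row ids while offset records where each column ends — the usual CSC layout.)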
for (const auto& batch : dmat->GetSortedColumnBatches()) { @@ -741,59 +739,59 @@ class GPUMaker : public TreeUpdater { } } CHECK(fval->size() < std::numeric_limits::max()); - nVals = static_cast(fval->size()); + n_vals_ = static_cast(fval->size()); } - void transferAndSortData(const std::vector& fval, + void TransferAndSortData(const std::vector& fval, const std::vector& fId, const std::vector& offset) { - vals.CurrentDVec() = fval; - instIds.CurrentDVec() = fId; - colOffsets = offset; - dh::SegmentedSort(&tmp_mem, &vals, &instIds, nVals, nCols, - colOffsets); - vals_cached = vals.CurrentDVec(); - instIds_cached = instIds.CurrentDVec(); - assignColIds<<>>(colIds.Data(), colOffsets.Data()); + vals_.CurrentDVec() = fval; + instIds_.CurrentDVec() = fId; + colOffsets_ = offset; + dh::SegmentedSort(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_, + colOffsets_); + vals_cached_ = vals_.CurrentDVec(); + inst_ids_cached_ = instIds_.CurrentDVec(); + AssignColIds<<>>(colIds_.Data(), colOffsets_.Data()); } - void transferGrads(HostDeviceVector* gpair) { - gpair->GatherTo(gradsInst.tbegin(), gradsInst.tend()); + void TransferGrads(HostDeviceVector* gpair) { + gpair->GatherTo(gradsInst_.tbegin(), gradsInst_.tend()); // evaluate the full-grad reduction for the root node - dh::SumReduction(tmp_mem, gradsInst, gradSums, nRows); + dh::SumReduction(tmp_mem_, gradsInst_, gradsums_, n_rows_); } - void initNodeData(int level, NodeIdT nodeStart, int nNodes) { + void InitNodeData(int level, NodeIdT nodeStart, int nNodes) { // all instances belong to root node at the beginning! if (level == 0) { - nodes.Fill(DeviceNodeStats()); - nodeAssigns.CurrentDVec().Fill(0); - nodeAssignsPerInst.Fill(0); + nodes_.Fill(DeviceNodeStats()); + nodeAssigns_.CurrentDVec().Fill(0); + node_assigns_per_inst_.Fill(0); // for root node, just update the gradient/score/weight/id info // before splitting it! 
Currently all data is on GPU, hence this // stupid little kernel - auto d_nodes = nodes.Data(); - auto d_sums = gradSums.Data(); - auto gpu_params = GPUTrainingParam(param); - dh::LaunchN(param.gpu_id, 1, [=] __device__(int idx) { + auto d_nodes = nodes_.Data(); + auto d_sums = gradsums_.Data(); + auto gpu_params = GPUTrainingParam(param_); + dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) { d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params); }); } else { const int BlkDim = 256; const int ItemsPerThread = 4; // assign default node ids first - int nBlks = dh::DivRoundUp(nRows, BlkDim); - fillDefaultNodeIds<<>>(nodeAssignsPerInst.Data(), - nodes.Data(), nRows); + int nBlks = dh::DivRoundUp(n_rows_, BlkDim); + FillDefaultNodeIds<<>>(node_assigns_per_inst_.Data(), + nodes_.Data(), n_rows_); // evaluate the correct child indices of non-missing values next - nBlks = dh::DivRoundUp(nVals, BlkDim * ItemsPerThread); - assignNodeIds<<>>( - nodeAssignsPerInst.Data(), nodeLocations.Current(), - nodeAssigns.Current(), instIds.Current(), nodes.Data(), - colOffsets.Data(), vals.Current(), nVals, nCols); + nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread); + AssignNodeIds<<>>( + node_assigns_per_inst_.Data(), nodeLocations_.Current(), + nodeAssigns_.Current(), instIds_.Current(), nodes_.Data(), + colOffsets_.Data(), vals_.Current(), n_vals_, n_cols_); // gather the node assignments across all other columns too - dh::Gather(param.gpu_id, nodeAssigns.Current(), - nodeAssignsPerInst.Data(), instIds.Current(), nVals); + dh::Gather(param_.gpu_id, nodeAssigns_.Current(), + node_assigns_per_inst_.Data(), instIds_.Current(), n_vals_); SortKeys(level); } } @@ -801,19 +799,19 @@ class GPUMaker : public TreeUpdater { void SortKeys(int level) { // segmented-sort the arrays based on node-id's // but we don't need more than level+1 bits for sorting! 
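// (at depth `level` the active node ids lie in [2^level - 1, 2^(level+1) - 1), so key bits [0, level + 1) are enough for the segmented radix sort below.)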
- SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols, - colOffsets, 0, level + 1); - dh::Gather(param.gpu_id, vals.other(), - vals.Current(), instIds.other(), instIds.Current(), - nodeLocations.Current(), nVals); - vals.buff().selector ^= 1; - instIds.buff().selector ^= 1; + SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_, + colOffsets_, 0, level + 1); + dh::Gather(param_.gpu_id, vals_.other(), + vals_.Current(), instIds_.other(), instIds_.Current(), + nodeLocations_.Current(), n_vals_); + vals_.buff().selector ^= 1; + instIds_.buff().selector ^= 1; } - void markLeaves() { + void MarkLeaves() { const int BlkDim = 128; - int nBlks = dh::DivRoundUp(maxNodes, BlkDim); - markLeavesKernel<<>>(nodes.Data(), maxNodes); + int nBlks = dh::DivRoundUp(maxNodes_, BlkDim); + MarkLeavesKernel<<>>(nodes_.Data(), maxNodes_); } }; diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 11b9b01d4..bdf309d47 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -14,7 +14,7 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 -#else +#else // In device code and CUDA < 600 XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; // NOLINT @@ -39,7 +39,7 @@ namespace tree { // Atomic add function for gradients template DEV_INLINE void AtomicAddGpair(OutputGradientT* dest, - const InputGradientT& gpair) { + const InputGradientT& gpair) { auto dst_ptr = reinterpret_cast(dest); atomicAdd(dst_ptr, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index c2511d378..11ba665ff 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -108,7 +108,7 @@ __device__ GradientSumT ReduceFeature(common::Span feature_h } /*! \brief Find the thread with best gain. */ -template __device__ void EvaluateFeature( int fidx, @@ -142,7 +142,7 @@ __device__ void EvaluateFeature( // Gradient value for current bin. GradientSumT bin = thread_active ? node_histogram[scan_begin + threadIdx.x] : GradientSumT(); - scan_t(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); + ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); // Whether the gradient of missing values is put to the left side. bool missing_left = true; @@ -198,12 +198,12 @@ __global__ void EvaluateSplitKernel( ValueConstraint value_constraint, common::Span d_monotonic_constraints) { // KeyValuePair here used as threadIdx.x -> gain_value - typedef cub::KeyValuePair ArgMaxT; - typedef cub::BlockScan< - GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT; - typedef cub::BlockReduce MaxReduceT; + using ArgMaxT = cub::KeyValuePair; + using BlockScanT = + cub::BlockScan; + using MaxReduceT = cub::BlockReduce; - typedef cub::BlockReduce SumReduceT; + using SumReduceT = cub::BlockReduce; union TempStorage { typename BlockScanT::TempStorage scan; @@ -274,51 +274,56 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data, * \date 28/07/2018 */ template -struct DeviceHistogram { +class DeviceHistogram { + private: /*! \brief Map nidx to starting index of its histogram. 
*/ - std::map nidx_map; - thrust::device_vector data; - const size_t kStopGrowingSize = 1 << 26; // Do not grow beyond this size - int n_bins; + std::map nidx_map_; + thrust::device_vector data_; + static constexpr size_t kStopGrowingSize = 1 << 26; // Do not grow beyond this size + int n_bins_; int device_id_; + public: void Init(int device_id, int n_bins) { - this->n_bins = n_bins; + this->n_bins_ = n_bins; this->device_id_ = device_id; } void Reset() { dh::safe_cuda(cudaSetDevice(device_id_)); dh::safe_cuda(cudaMemsetAsync( - data.data().get(), 0, - data.size() * sizeof(typename decltype(data)::value_type))); - nidx_map.clear(); + data_.data().get(), 0, + data_.size() * sizeof(typename decltype(data_)::value_type))); + nidx_map_.clear(); + } + bool HistogramExists(int nidx) { + return nidx_map_.find(nidx) != nidx_map_.end(); } - bool HistogramExists(int nidx) { - return nidx_map.find(nidx) != nidx_map.end(); + thrust::device_vector &Data() { + return data_; } void AllocateHistogram(int nidx) { if (HistogramExists(nidx)) return; size_t current_size = - nidx_map.size() * n_bins * 2; // Number of items currently used in data + nidx_map_.size() * n_bins_ * 2; // Number of items currently used in data dh::safe_cuda(cudaSetDevice(device_id_)); - if (data.size() >= kStopGrowingSize) { + if (data_.size() >= kStopGrowingSize) { // Recycle histogram memory - std::pair old_entry = *nidx_map.begin(); - nidx_map.erase(old_entry.first); - dh::safe_cuda(cudaMemsetAsync(data.data().get() + old_entry.second, 0, - n_bins * sizeof(GradientSumT))); - nidx_map[nidx] = old_entry.second; + std::pair old_entry = *nidx_map_.begin(); + nidx_map_.erase(old_entry.first); + dh::safe_cuda(cudaMemsetAsync(data_.data().get() + old_entry.second, 0, + n_bins_ * sizeof(GradientSumT))); + nidx_map_[nidx] = old_entry.second; } else { // Append new node histogram - nidx_map[nidx] = current_size; - if (data.size() < current_size + n_bins * 2) { + nidx_map_[nidx] = current_size; + if (data_.size() < current_size + n_bins_ * 2) { size_t new_size = current_size * 2; // Double in size - new_size = std::max(static_cast(n_bins * 2), + new_size = std::max(static_cast(n_bins_ * 2), new_size); // Have at least one histogram - data.resize(new_size); + data_.resize(new_size); } } } @@ -330,9 +335,9 @@ struct DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data.data().get() + nidx_map[nidx]; + auto ptr = data_.data().get() + nidx_map_[nidx]; return common::Span( - reinterpret_cast(ptr), n_bins); + reinterpret_cast(ptr), n_bins_); } }; @@ -351,7 +356,7 @@ struct CalcWeightTrainParam { }; // Bin each input data entry, store the bin indices in compressed form. -__global__ void compress_bin_ellpack_k( +__global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer const size_t* __restrict__ row_ptrs, // row offset of input data @@ -366,8 +371,9 @@ __global__ void compress_bin_ellpack_k( unsigned int null_gidx_value) { size_t irow = threadIdx.x + blockIdx.x * blockDim.x; int ifeature = threadIdx.y + blockIdx.y * blockDim.y; - if (irow >= n_rows || ifeature >= row_stride) + if (irow >= n_rows || ifeature >= row_stride) { return; + } int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); unsigned int bin = null_gidx_value; if (ifeature < row_length) { @@ -380,8 +386,9 @@ __global__ void compress_bin_ellpack_k( // Assigning the bin in current entry. 
// S.t.: fvalue < feature_cuts[bin] bin = dh::UpperBound(feature_cuts, ncuts, fvalue); - if (bin >= ncuts) + if (bin >= ncuts) { bin = ncuts - 1; + } // Add the number of bins in previous features. bin += cut_rows[feature]; } @@ -419,7 +426,7 @@ struct Segment { size_t begin; size_t end; - Segment() : begin(0), end(0) {} + Segment() : begin{0}, end{0} {} Segment(size_t begin, size_t end) : begin(begin), end(end) { CHECK_GE(end, begin); @@ -487,7 +494,9 @@ struct GPUHistBuilderBase { // Manage memory for a single GPU template struct DeviceShard { - int device_id_; + int n_bins; + int device_id; + dh::BulkAllocator ba; /*! \brief HistCutMatrix stored in device. */ @@ -498,14 +507,12 @@ struct DeviceShard { dh::DVec min_fvalue; /*! \brief Cut. */ dh::DVec gidx_fvalue_map; - } cut_; + } d_cut; /*! \brief Range of rows for each node. */ std::vector ridx_segments; DeviceHistogram hist; - /*! \brief global index of histogram, which is stored in ELLPack format. */ - dh::DVec gidx_buffer; /*! \brief row length for ELLPack. */ size_t row_stride; common::CompressedIterator gidx; @@ -526,6 +533,8 @@ struct DeviceShard { /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; dh::DVec node_sum_gradients_d; + /*! \brief global index of histogram, which is stored in ELLPack format. */ + dh::DVec gidx_buffer; /*! \brief row offset in SparsePage (the input data). */ thrust::device_vector row_ptrs; /*! \brief On-device feature set, only actually used on one of the devices */ @@ -534,7 +543,6 @@ struct DeviceShard { bst_uint row_begin_idx; bst_uint row_end_idx; bst_uint n_rows; - int n_bins; TrainParam param; bool prediction_cache_initialised; @@ -544,21 +552,21 @@ struct DeviceShard { std::unique_ptr> hist_builder; // TODO(canonizer): do add support multi-batch DMatrix here - DeviceShard(int device_id, bst_uint row_begin, bst_uint row_end, + DeviceShard(int _device_id, bst_uint row_begin, bst_uint row_end, TrainParam _param) - : device_id_(device_id), + : device_id(_device_id), row_begin_idx(row_begin), row_end_idx(row_end), row_stride(0), n_rows(row_end - row_begin), - n_bins(0), + n_bins{0}, null_gidx_value(0), - param(_param), + param(std::move(_param)), prediction_cache_initialised(false) {} /* Init row_ptrs and row_stride */ void InitRowPtrs(const SparsePage& row_batch) { - dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); const auto& offset_vec = row_batch.offset.HostVector(); row_ptrs.resize(n_rows + 1); thrust::copy(offset_vec.data() + row_begin_idx, @@ -589,12 +597,11 @@ struct DeviceShard { void CreateHistIndices(const SparsePage& row_batch); - ~DeviceShard() { - } + ~DeviceShard() = default; // Reset values for each update iteration void Reset(HostDeviceVector* dh_gpair) { - dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); position.CurrentDVec().Fill(0); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPair()); @@ -603,8 +610,8 @@ struct DeviceShard { std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0)); ridx_segments.front() = Segment(0, ridx.Size()); - this->gpair.copy(dh_gpair->tcbegin(device_id_), - dh_gpair->tcend(device_id_)); + this->gpair.copy(dh_gpair->tcbegin(device_id), + dh_gpair->tcend(device_id)); SubsampleGradientPair(&gpair, param.subsample, row_begin_idx); hist.Reset(); } @@ -612,7 +619,7 @@ struct DeviceShard { DeviceSplitCandidate EvaluateSplit(int nidx, const std::vector& feature_set, ValueConstraint value_constraint) { - 
dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); auto d_split_candidates = temp_memory.GetSpan(feature_set.size()); feature_set_d.resize(feature_set.size()); auto d_features = common::Span(feature_set_d.data().get(), @@ -622,14 +629,13 @@ struct DeviceShard { DeviceNodeStats node(node_sum_gradients[nidx], nidx, param); // One block for each feature - int constexpr BLOCK_THREADS = 256; - EvaluateSplitKernel - <<>>( - hist.GetNodeHistogram(nidx), d_features, node, - cut_.feature_segments.GetSpan(), cut_.min_fvalue.GetSpan(), - cut_.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param), - d_split_candidates, value_constraint, - monotone_constraints.GetSpan()); + int constexpr kBlockThreads = 256; + EvaluateSplitKernel + <<>> + (hist.GetNodeHistogram(nidx), d_features, node, + d_cut.feature_segments.GetSpan(), d_cut.min_fvalue.GetSpan(), + d_cut.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param), + d_split_candidates, value_constraint, monotone_constraints.GetSpan()); std::vector split_candidates(feature_set.size()); dh::safe_cuda(cudaMemcpy(split_candidates.data(), d_split_candidates.data(), @@ -655,7 +661,7 @@ struct DeviceShard { auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); - dh::LaunchN(device_id_, hist.n_bins, [=] __device__(size_t idx) { + dh::LaunchN(device_id, n_bins, [=] __device__(size_t idx) { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); @@ -673,7 +679,7 @@ struct DeviceShard { int64_t split_gidx, bool default_dir_left, bool is_dense, int fidx_begin, // cut.row_ptr[fidx] int fidx_end) { // cut.row_ptr[fidx + 1] - dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); Segment segment = ridx_segments[nidx]; bst_uint* d_ridx = ridx.Current(); int* d_position = position.Current(); @@ -681,7 +687,7 @@ struct DeviceShard { size_t row_stride = this->row_stride; // Launch 1 thread for each row dh::LaunchN<1, 128>( - device_id_, segment.Size(), [=] __device__(bst_uint idx) { + device_id, segment.Size(), [=] __device__(bst_uint idx) { idx += segment.begin; bst_uint ridx = d_ridx[idx]; auto row_begin = row_stride * ridx; @@ -724,7 +730,7 @@ struct DeviceShard { /*! \brief Sort row indices according to position. 
*/ void SortPositionAndCopy(const Segment& segment, int left_nidx, int right_nidx, - size_t left_count) { + size_t left_count) { SortPosition( &temp_memory, common::Span(position.Current() + segment.begin, segment.Size()), @@ -737,14 +743,14 @@ struct DeviceShard { const auto d_position_other = position.other() + segment.begin; const auto d_ridx_current = ridx.Current() + segment.begin; const auto d_ridx_other = ridx.other() + segment.begin; - dh::LaunchN(device_id_, segment.Size(), [=] __device__(size_t idx) { + dh::LaunchN(device_id, segment.Size(), [=] __device__(size_t idx) { d_position_current[idx] = d_position_other[idx]; d_ridx_current[idx] = d_ridx_other[idx]; }); } void UpdatePredictionCache(bst_float* out_preds_d) { - dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); if (!prediction_cache_initialised) { dh::safe_cuda(cudaMemcpyAsync( prediction_cache.Data(), out_preds_d, @@ -764,7 +770,7 @@ struct DeviceShard { auto d_prediction_cache = prediction_cache.Data(); dh::LaunchN( - device_id_, prediction_cache.Size(), [=] __device__(int local_idx) { + device_id, prediction_cache.Size(), [=] __device__(int local_idx) { int pos = d_position[local_idx]; bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]); d_prediction_cache[d_ridx[local_idx]] += @@ -799,7 +805,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase { if (grid_size <= 0) { return; } - dh::safe_cuda(cudaSetDevice(shard->device_id_)); + dh::safe_cuda(cudaSetDevice(shard->device_id)); SharedMemHistKernel<<>> (shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist.data(), d_gpair, segment_begin, n_elements); @@ -819,7 +825,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase { size_t const row_stride = shard->row_stride; int const null_gidx_value = shard->null_gidx_value; - dh::LaunchN(shard->device_id_, n_elements, [=] __device__(size_t idx) { + dh::LaunchN(shard->device_id, n_elements, [=] __device__(size_t idx) { int ridx = d_ridx[(idx / row_stride) + segment.begin]; // lookup the index (bin) of histogram. int gidx = d_gidx[ridx * row_stride + idx % row_stride]; @@ -834,31 +840,31 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase { template inline void DeviceShard::InitCompressedData( const common::HistCutMatrix& hmat, const SparsePage& row_batch) { - n_bins = hmat.row_ptr.back(); - null_gidx_value = hmat.row_ptr.back(); + n_bins = hmat.NumBins(); + null_gidx_value = hmat.NumBins(); int max_nodes = param.max_leaves > 0 ? 
param.max_leaves * 2 : MaxNodesDepth(param.max_depth); - ba.Allocate(device_id_, + ba.Allocate(device_id, &gpair, n_rows, &ridx, n_rows, &position, n_rows, &prediction_cache, n_rows, &node_sum_gradients_d, max_nodes, - &cut_.feature_segments, hmat.row_ptr.size(), - &cut_.gidx_fvalue_map, hmat.cut.size(), - &cut_.min_fvalue, hmat.min_val.size(), + &d_cut.feature_segments, hmat.row_ptr.size(), + &d_cut.gidx_fvalue_map, hmat.cut.size(), + &d_cut.min_fvalue, hmat.min_val.size(), &monotone_constraints, param.monotone_constraints.size()); - cut_.gidx_fvalue_map = hmat.cut; - cut_.min_fvalue = hmat.min_val; - cut_.feature_segments = hmat.row_ptr; + d_cut.gidx_fvalue_map = hmat.cut; + d_cut.min_fvalue = hmat.min_val; + d_cut.feature_segments = hmat.row_ptr; monotone_constraints = param.monotone_constraints; node_sum_gradients.resize(max_nodes); ridx_segments.resize(max_nodes); - dh::safe_cuda(cudaSetDevice(device_id_)); + dh::safe_cuda(cudaSetDevice(device_id)); // allocate compressed bin data int num_symbols = n_bins + 1; @@ -870,7 +876,7 @@ inline void DeviceShard::InitCompressedData( CHECK(!(param.max_leaves == 0 && param.max_depth == 0)) << "Max leaves and max depth cannot both be unconstrained for " "gpu_hist."; - ba.Allocate(device_id_, &gidx_buffer, compressed_size_bytes); + ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes); gidx_buffer.Fill(0); int nbits = common::detail::SymbolBits(num_symbols); @@ -882,7 +888,7 @@ inline void DeviceShard::InitCompressedData( // check if we can use shared memory for building histograms // (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding) auto histogram_size = sizeof(GradientSumT) * null_gidx_value; - auto max_smem = dh::MaxSharedMemory(device_id_); + auto max_smem = dh::MaxSharedMemory(device_id); if (histogram_size <= max_smem) { hist_builder.reset(new SharedMemHistBuilder); } else { @@ -890,7 +896,7 @@ inline void DeviceShard::InitCompressedData( } // Init histogram - hist.Init(device_id_, hmat.row_ptr.back()); + hist.Init(device_id, hmat.NumBins()); } @@ -900,7 +906,7 @@ inline void DeviceShard::CreateHistIndices(const SparsePage& row_b // bin and compress entries in batches of rows size_t gpu_batch_nrows = std::min - (dh::TotalMemory(device_id_) / (16 * row_stride * sizeof(Entry)), + (dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)), static_cast(n_rows)); const std::vector& data_vec = row_batch.data.HostVector(); @@ -924,12 +930,12 @@ inline void DeviceShard::CreateHistIndices(const SparsePage& row_b const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(dh::DivRoundUp(n_rows, block3.x), dh::DivRoundUp(row_stride, block3.y), 1); - compress_bin_ellpack_k<<>> + CompressBinEllpackKernel<<>> (common::CompressedBufferWriter(num_symbols), gidx_buffer.Data(), row_ptrs.data().get() + batch_row_begin, entries_d.data().get(), - cut_.gidx_fvalue_map.Data(), cut_.feature_segments.Data(), + d_cut.gidx_fvalue_map.Data(), d_cut.feature_segments.Data(), batch_row_begin, batch_nrows, row_ptrs[batch_row_begin], row_stride, null_gidx_value); @@ -948,7 +954,7 @@ class GPUHistMakerSpecialised{ public: struct ExpandEntry; - GPUHistMakerSpecialised() : initialised_(false), p_last_fmat_(nullptr) {} + GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {} void Init( const std::vector>& args) { param_.InitAllowUnknown(args); @@ -977,8 +983,8 @@ class GPUHistMakerSpecialised{ ValueConstraint::Init(¶m_, dmat->Info().num_col_); // build tree try { - for (size_t i = 0; i < trees.size(); ++i) { - 
this->UpdateTree(gpair, dmat, trees[i]); + for (xgboost::RegTree* tree : trees) { + this->UpdateTree(gpair, dmat, tree); } dh::safe_cuda(cudaGetLastError()); } catch (const std::exception& e) { @@ -1056,14 +1062,16 @@ class GPUHistMakerSpecialised{ } void AllReduceHist(int nidx) { - if (shards_.size() == 1 && !rabit::IsDistributed()) return; + if (shards_.size() == 1 && !rabit::IsDistributed()) { + return; + } monitor_.StartCuda("AllReduce"); reducer_.GroupStart(); for (auto& shard : shards_) { auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); reducer_.AllReduceSum( - dist_.Devices().Index(shard->device_id_), + dist_.Devices().Index(shard->device_id), reinterpret_cast(d_node_hist), reinterpret_cast(d_node_hist), n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); @@ -1141,14 +1149,14 @@ class GPUHistMakerSpecialised{ } void InitRoot(RegTree* p_tree) { - constexpr int root_nidx = 0; + constexpr int kRootNIdx = 0; // Sum gradients std::vector tmp_sums(shards_.size()); dh::ExecuteIndexShards( &shards_, [&](int i, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(shard->device_id_)); + dh::safe_cuda(cudaSetDevice(shard->device_id)); tmp_sums[i] = dh::SumReduction( shard->temp_memory, shard->gpair.Data(), shard->gpair.Size()); }); @@ -1156,35 +1164,36 @@ class GPUHistMakerSpecialised{ GradientPair sum_gradient = std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair()); - rabit::Allreduce((GradientPair::ValueT*)&sum_gradient, 2); + rabit::Allreduce( + reinterpret_cast(&sum_gradient), 2); // Generate root histogram dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { - shard->BuildHist(root_nidx); + shard->BuildHist(kRootNIdx); }); - this->AllReduceHist(root_nidx); + this->AllReduceHist(kRootNIdx); // Remember root stats - p_tree->Stat(root_nidx).sum_hess = sum_gradient.GetHess(); + p_tree->Stat(kRootNIdx).sum_hess = sum_gradient.GetHess(); auto weight = CalcWeight(param_, sum_gradient); - p_tree->Stat(root_nidx).base_weight = weight; - (*p_tree)[root_nidx].SetLeaf(param_.learning_rate * weight); + p_tree->Stat(kRootNIdx).base_weight = weight; + (*p_tree)[kRootNIdx].SetLeaf(param_.learning_rate * weight); // Store sum gradients for (auto& shard : shards_) { - shard->node_sum_gradients[root_nidx] = sum_gradient; + shard->node_sum_gradients[kRootNIdx] = sum_gradient; } // Initialise root constraint node_value_constraints_.resize(p_tree->GetNodes().size()); // Generate first split - auto split = this->EvaluateSplit(root_nidx, p_tree); + auto split = this->EvaluateSplit(kRootNIdx, p_tree); qexpand_->push( - ExpandEntry(root_nidx, p_tree->GetDepth(root_nidx), split, 0)); + ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split, 0)); } void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) { @@ -1302,15 +1311,16 @@ class GPUHistMakerSpecialised{ bool UpdatePredictionCache( const DMatrix* data, HostDeviceVector* p_out_preds) { - if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) - return false; monitor_.StartCuda("UpdatePredictionCache"); + if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) { + return false; + } p_out_preds->Reshard(dist_.Devices()); dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { shard->UpdatePredictionCache( - p_out_preds->DevicePointer(shard->device_id_)); + p_out_preds->DevicePointer(shard->device_id)); }); monitor_.StopCuda("UpdatePredictionCache"); return true; @@ -1321,15 +1331,23 @@ class GPUHistMakerSpecialised{ int depth; 
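To make the InitRoot flow above concrete: after the per-shard gradient sums are reduced (first across shards, then across workers via rabit::Allreduce), the root leaf value comes from the usual closed-form weight. A simplified sketch, assuming only L2 regularization (the real CalcWeight also handles reg_alpha, max_delta_step, and min_child_weight):

```cpp
// Simplified root initialization, mirroring InitRoot above.
// weight = -G / (H + lambda); leaf value = learning_rate * weight.
double CalcWeightSimplified(double sum_grad, double sum_hess,
                            double reg_lambda) {
  if (sum_hess <= 0.0) return 0.0;
  return -sum_grad / (sum_hess + reg_lambda);
}

double RootLeafValue(double sum_grad, double sum_hess, double reg_lambda,
                     double learning_rate) {
  return learning_rate * CalcWeightSimplified(sum_grad, sum_hess, reg_lambda);
}
```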
DeviceSplitCandidate split; uint64_t timestamp; - ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split, - uint64_t timestamp) - : nid(nid), depth(depth), split(split), timestamp(timestamp) {} + ExpandEntry(int _nid, int _depth, const DeviceSplitCandidate _split, + uint64_t _timestamp) : + nid{_nid}, depth{_depth}, split(std::move(_split)), + timestamp{_timestamp} {} bool IsValid(const TrainParam& param, int num_leaves) const { - if (split.loss_chg <= kRtEps) return false; - if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) + if (split.loss_chg <= kRtEps) { return false; - if (param.max_depth > 0 && depth == param.max_depth) return false; - if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false; + } + if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) { + return false; + } + if (param.max_depth > 0 && depth == param.max_depth) { + return false; + } + if (param.max_leaves > 0 && num_leaves == param.max_leaves) { + return false; + } return true; } @@ -1365,28 +1383,36 @@ class GPUHistMakerSpecialised{ return lhs.split.loss_chg < rhs.split.loss_chg; // favor large loss_chg } } - TrainParam param_; - GPUHistMakerTrainParam hist_maker_param_; - common::HistCutMatrix hmat_; - common::GHistIndexMatrix gmat_; - MetaInfo* info_; + + TrainParam param_; // NOLINT + common::HistCutMatrix hmat_; // NOLINT + MetaInfo* info_; // NOLINT + + std::vector>> shards_; // NOLINT + common::ColumnSampler column_sampler_; // NOLINT + + std::vector node_value_constraints_; // NOLINT + + private: bool initialised_; + int n_devices_; int n_bins_; - std::vector>> shards_; - common::ColumnSampler column_sampler_; + GPUHistMakerTrainParam hist_maker_param_; + common::GHistIndexMatrix gmat_; + using ExpandQueue = std::priority_queue, std::function>; std::unique_ptr qexpand_; - common::Monitor monitor_; dh::AllReducer reducer_; - std::vector node_value_constraints_; - /*! List storing device id. */ - std::vector device_list_; DMatrix* p_last_fmat_; GPUDistribution dist_; + + common::Monitor monitor_; + /*! List storing device id. */ + std::vector device_list_; }; class GPUHistMaker : public TreeUpdater { diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index f863e8552..807185aff 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -69,9 +69,9 @@ class HistMaker: public BaseMaker { std::vector data; /*! 
\brief */ inline HistUnit operator[](size_t fid) { - return HistUnit(cut + rptr[fid], - &data[0] + rptr[fid], - rptr[fid+1] - rptr[fid]); + return {cut + rptr[fid], + &data[0] + rptr[fid], + rptr[fid+1] - rptr[fid]}; } }; // thread workspace diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index b9395b517..49a28fb15 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -95,11 +95,10 @@ void QuantileHistMaker::Builder::SyncHistograms( perf_monitor.TickStart(); this->histred_.Allreduce(hist_[starting_index].data(), hist_builder_.GetNumBins() * sync_count); // use Subtraction Trick - for (auto local_it = nodes_for_subtraction_trick_.begin(); - local_it != nodes_for_subtraction_trick_.end(); local_it++) { - hist_.AddHistRow(local_it->first); - SubtractionTrick(hist_[local_it->first], hist_[local_it->second], - hist_[(*p_tree)[local_it->first].Parent()]); + for (auto const& node_pair : nodes_for_subtraction_trick_) { + hist_.AddHistRow(node_pair.first); + SubtractionTrick(hist_[node_pair.first], hist_[node_pair.second], + hist_[(*p_tree)[node_pair.first].Parent()]); } perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST); } @@ -112,8 +111,8 @@ void QuantileHistMaker::Builder::BuildLocalHistograms( RegTree *p_tree, const std::vector &gpair_h) { perf_monitor.TickStart(); - for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) { - int nid = qexpand_depth_wise_[k].nid; + for (auto const& entry : qexpand_depth_wise_) { + int nid = entry.nid; RegTree::Node &node = (*p_tree)[nid]; if (rabit::IsDistributed()) { if (node.IsRoot() || node.IsLeftChild()) { @@ -160,8 +159,8 @@ void QuantileHistMaker::Builder::BuildNodeStats( RegTree *p_tree, const std::vector &gpair_h) { perf_monitor.TickStart(); - for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) { - int nid = qexpand_depth_wise_[k].nid; + for (auto const& entry : qexpand_depth_wise_) { + int nid = entry.nid; this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree); // add constraints if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) { @@ -185,8 +184,8 @@ void QuantileHistMaker::Builder::EvaluateSplits( int depth, unsigned *timestamp, std::vector *temp_qexpand_depth) { - for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) { - int nid = qexpand_depth_wise_[k].nid; + for (auto const& entry : qexpand_depth_wise_) { + int nid = entry.nid; perf_monitor.TickStart(); this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree); perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT); @@ -221,7 +220,7 @@ void QuantileHistMaker::Builder::ExpandWithDepthWidth( int num_leaves = 0; // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway - qexpand_depth_wise_.push_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++)); + qexpand_depth_wise_.emplace_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++)); ++num_leaves; for (int depth = 0; depth < param_.max_depth + 1; depth++) { int starting_index = std::numeric_limits::max(); diff --git a/tests/ci_build/clang_tidy.sh b/tests/ci_build/clang_tidy.sh index 392d910e4..5c96a78ba 100755 --- a/tests/ci_build/clang_tidy.sh +++ b/tests/ci_build/clang_tidy.sh @@ -1,12 +1,18 @@ #!/bin/bash -rm -rf gtest googletest-release-1.7.0 -wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip -unzip -n release-1.7.0.zip -mv googletest-release-1.7.0 gtest && cd gtest -cmake . && make -mkdir lib && mv libgtest.a lib -cd .. 
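Stepping back to the SyncHistograms hunk above: the subtraction trick is the reason only one child per split needs its histogram built and allreduced; the sibling is then derived locally from the parent. A minimal sketch of the derivation step (GradStatsLite is a simplified stand-in for tree::GradStats):

```cpp
#include <cstddef>

struct GradStatsLite {  // simplified stand-in for tree::GradStats
  double sum_grad = 0.0;
  double sum_hess = 0.0;
};

// sibling = parent - built_child, bin by bin. Every row of the parent
// falls into exactly one child, so the sums are exactly complementary.
void SubtractionTrickLite(const GradStatsLite* parent,
                          const GradStatsLite* built_child,
                          GradStatsLite* sibling, std::size_t n_bins) {
  for (std::size_t i = 0; i < n_bins; ++i) {
    sibling[i].sum_grad = parent[i].sum_grad - built_child[i].sum_grad;
    sibling[i].sum_hess = parent[i].sum_hess - built_child[i].sum_hess;
  }
}
```

In the distributed path this halves both histogram construction work and Allreduce traffic per tree level.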
-rm -rf release-1.7.0.zip* +export GTEST_PKG_NAME=release-1.8.1 +export GTEST_DIR_NAME=googletest-${GTEST_PKG_NAME} # uncompressed directory +export GTEST_ZIP_FILE=${GTEST_PKG_NAME}.zip # downloaded zip ball name -python3 tests/ci_build/tidy.py --gtest-path=${PWD}/gtest +rm -rf gtest googletest-release* + +wget -nc https://github.com/google/googletest/archive/${GTEST_ZIP_FILE} +unzip -n ${GTEST_ZIP_FILE} +mv ${GTEST_DIR_NAME} gtest && cd gtest +cmake . -DCMAKE_INSTALL_PREFIX=./ins && make +make install + +cd .. +rm ${GTEST_ZIP_FILE} + +python3 tests/ci_build/tidy.py --gtest-path=${PWD}/gtest/ins diff --git a/tests/ci_build/test_tidy.cc b/tests/ci_build/test_tidy.cc new file mode 100644 index 000000000..7f59b4f3a --- /dev/null +++ b/tests/ci_build/test_tidy.cc @@ -0,0 +1,11 @@ +#include +#include + +struct Foo { + int bar_; +}; + +int main() { + std::vector values; + values.push_back(Foo()); +} diff --git a/tests/ci_build/tidy.py b/tests/ci_build/tidy.py index 76a044228..a9b175da0 100755 --- a/tests/ci_build/tidy.py +++ b/tests/ci_build/tidy.py @@ -5,23 +5,24 @@ import json from multiprocessing import Pool, cpu_count import shutil import os +import sys import re import argparse def call(args): '''Subprocess run wrapper.''' - completed = subprocess.run(args, stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) + completed = subprocess.run(args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) error_msg = completed.stdout.decode('utf-8') - matched = re.match('.*xgboost.*warning.*', error_msg, - re.MULTILINE | re.DOTALL) + matched = re.search('(src|tests)/.*warning:', error_msg, + re.MULTILINE) if matched is None: return_code = 0 else: - print(error_msg, '\n') return_code = 1 - return completed.returncode | return_code + return (completed.returncode, return_code, error_msg) class ClangTidy(object): @@ -69,8 +70,8 @@ class ClangTidy(object): def _configure_flags(self, path, command): common_args = ['clang-tidy', - # "-header-filter='(xgboost\\/src|xgboost\\/include)'", - '-config='+str(self.clang_tidy)] + "-header-filter='(xgboost\\/src|xgboost\\/include)'", + '-config='+self.clang_tidy] common_args.append(path) common_args.append('--') @@ -112,7 +113,10 @@ class ClangTidy(object): def should_lint(path): if not self.cpp_lint and path.endswith('.cc'): return False - return True + isxgb = path.find('rabit') == -1 + isxgb = isxgb and path.find('dmlc-core') == -1 + if isxgb: + return True cdb_file = os.path.join(self.cdb_path, 'compile_commands.json') with open(cdb_file, 'r') as fd: @@ -120,6 +124,7 @@ class ClangTidy(object): tidy_file = os.path.join(self.root_path, '.clang-tidy') with open(tidy_file) as fd: self.clang_tidy = yaml.load(fd) + self.clang_tidy = str(self.clang_tidy) all_files = [] for entry in self.compile_commands: path = entry['file'] @@ -132,15 +137,59 @@ class ClangTidy(object): def run(self): '''Run clang-tidy.''' all_files = self._configure() + passed = True + BAR = '-'*32 with Pool(cpu_count()) as pool: results = pool.map(call, all_files) - passed = True - if 1 in results: + for (process_status, tidy_status, msg) in results: + # Don't enforce clang-tidy to pass for now due to namespace + # for cub in thrust is not correct. + if tidy_status == 1: + passed = False + print(BAR, '\n' + 'Process return code:', process_status, ', ', + 'Tidy result code:', tidy_status, ', ', + 'Message:\n', msg, + BAR, '\n') + if not passed: print('Please correct clang-tidy warnings.') - passed = False return passed +def test_tidy(): + '''See if clang-tidy and our regex is working correctly. 
There are +many subtleties we need to be careful about. For instance: + + * Is the string redirected to the pipe encoded as UTF-8, or is it +bytes? + + * On Jenkins there's no 'xgboost' directory; are we catching the +right keywords? + + * Should we use re.DOTALL? + + * Should we use re.MULTILINE? + + Tests here are not thorough; at the very least we want to guarantee tidy is + not missing anything on Jenkins. + + ''' + root_path = os.path.abspath(os.path.curdir) + tidy_file = os.path.join(root_path, '.clang-tidy') + test_file_path = os.path.join(root_path, + 'tests', 'ci_build', 'test_tidy.cc') + + with open(tidy_file) as fd: + tidy_config = fd.read() + tidy_config = str(tidy_config) + tidy_config = '-config='+tidy_config + args = ['clang-tidy', tidy_config, test_file_path] + (proc_code, tidy_status, error_msg) = call(args) + assert proc_code == 0 + assert tidy_status == 1 + print('clang-tidy is working.') + + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Run clang-tidy.') parser.add_argument('--cpp', type=int, default=1) @@ -148,8 +197,10 @@ if __name__ == '__main__': parser.add_argument('--gtest-path', required=True, help='Full path of Google Test library directory') args = parser.parse_args() + + test_tidy() + with ClangTidy(args.gtest_path, args.cpp, args.cuda) as linter: passed = linter.run() - # Uncomment it once the code base is clang-tidy conformant. - # if not passed: - # sys.exit(1) + if not passed: + sys.exit(1) diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc index abf4e25c1..f29ce2af6 100644 --- a/tests/cpp/common/test_span.cc +++ b/tests/cpp/common/test_span.cc @@ -172,7 +172,7 @@ struct BaseClass { virtual void operator()() {} }; struct DerivedClass : public BaseClass { - virtual void operator()() {} + void operator()() override {} }; TEST(Span, FromOther) { diff --git a/tests/cpp/common/test_span.cu b/tests/cpp/common/test_span.cu index 9e58c3b43..04237497c 100644 --- a/tests/cpp/common/test_span.cu +++ b/tests/cpp/common/test_span.cu @@ -15,6 +15,7 @@ namespace xgboost { namespace common { struct TestStatus { + private: int *status_; public: @@ -28,32 +29,34 @@ struct TestStatus { dh::safe_cuda(cudaFree(status_)); } - int get() { + int Get() { int h_status; dh::safe_cuda(cudaMemcpy(&h_status, status_, sizeof(int), cudaMemcpyDeviceToHost)); return h_status; } - int* data() { + int* Data() { return status_; } }; -__global__ void test_from_other_kernel(Span span) { +__global__ void TestFromOtherKernel(Span span) { // don't get optimized out size_t idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= span.size()) + if (idx >= span.size()) { return; + } } // Test converting different T - __global__ void test_from_other_kernel_const(Span span) { + __global__ void TestFromOtherKernelConst(Span span) { // don't get optimized out size_t idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= span.size()) + if (idx >= span.size()) { return; + } } /*!
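On the regex itself: switching from re.match with DOTALL to re.search means the pattern may fire anywhere in the captured output, and anchoring on a src/ or tests/ path keeps the gate focused on first-party diagnostics. An equivalent predicate in C++ for illustration (std::regex_search is the analogue of re.search):

```cpp
#include <regex>
#include <string>

// Mirrors tidy.py: re.search('(src|tests)/.*warning:', msg, re.MULTILINE).
// The pattern has no ^/$ anchors, so MULTILINE changes nothing here, and
// '.' not matching newlines agrees with Python's default (no re.DOTALL).
bool HasFirstPartyWarning(const std::string& clang_tidy_output) {
  static const std::regex kPattern{"(src|tests)/.*warning:"};
  return std::regex_search(clang_tidy_output, kPattern);
}
```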
@@ -68,42 +71,44 @@ TEST(GPUSpan, FromOther) { // dynamic extent { Span span (d_vec.data().get(), d_vec.size()); - test_from_other_kernel<<<1, 16>>>(span); + TestFromOtherKernel<<<1, 16>>>(span); } { Span span (d_vec.data().get(), d_vec.size()); - test_from_other_kernel_const<<<1, 16>>>(span); + TestFromOtherKernelConst<<<1, 16>>>(span); } // static extent { Span span(d_vec.data().get(), d_vec.data().get() + 16); - test_from_other_kernel<<<1, 16>>>(span); + TestFromOtherKernel<<<1, 16>>>(span); } { Span span(d_vec.data().get(), d_vec.data().get() + 16); - test_from_other_kernel_const<<<1, 16>>>(span); + TestFromOtherKernelConst<<<1, 16>>>(span); } } TEST(GPUSpan, Assignment) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestAssignment{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestAssignment{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpan, TestStatus) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestTestStatus{status.data()}); - ASSERT_EQ(status.get(), -1); + dh::LaunchN(0, 16, TestTestStatus{status.Data()}); + ASSERT_EQ(status.Get(), -1); } template struct TestEqual { + private: T *lhs_, *rhs_; int *status_; + public: TestEqual(T* _lhs, T* _rhs, int * _status) : lhs_(_lhs), rhs_(_rhs), status_(_status) {} @@ -140,10 +145,10 @@ TEST(GPUSpan, WithTrust) { dh::LaunchN(0, 16, TestEqual{ thrust::raw_pointer_cast(d_vec1.data()), - s.data(), status.data()}); - ASSERT_EQ(status.get(), 1); + s.data(), status.Data()}); + ASSERT_EQ(status.Get(), 1); - // FIXME: memory error! + // FIXME(trivialfis): memory error! // bool res = thrust::equal(thrust::device, // d_vec.begin(), d_vec.end(), // s.begin()); @@ -153,23 +158,23 @@ TEST(GPUSpan, WithTrust) { TEST(GPUSpan, BeginEnd) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestBeginEnd{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestBeginEnd{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpan, RBeginREnd) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestRBeginREnd{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestRBeginREnd{status.Data()}); + ASSERT_EQ(status.Get(), 1); } -__global__ void test_modify_kernel(Span span) { +__global__ void TestModifyKernel(Span span) { size_t idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= span.size()) + if (idx >= span.size()) { return; - + } span[idx] = span.size() - idx; } @@ -182,7 +187,7 @@ TEST(GPUSpan, Modify) { Span span (d_vec.data().get(), d_vec.size()); - test_modify_kernel<<<1, 16>>>(span); + TestModifyKernel<<<1, 16>>>(span); for (size_t i = 0; i < d_vec.size(); ++i) { ASSERT_EQ(d_vec[i], d_vec.size() - i); @@ -192,21 +197,23 @@ TEST(GPUSpan, Modify) { TEST(GPUSpan, Observers) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestObservers{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestObservers{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpan, Compare) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestIterCompare{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestIterCompare{status.Data()}); + ASSERT_EQ(status.Get(), 1); } struct TestElementAccess { + private: Span span_; - XGBOOST_DEVICE TestElementAccess (Span _span) : span_(_span) {} + public: + XGBOOST_DEVICE explicit TestElementAccess (Span _span) : span_(_span) {} XGBOOST_DEVICE float operator()(size_t _idx) { float tmp = 
span_[_idx]; @@ -232,16 +239,16 @@ TEST(GPUSpan, ElementAccess) { std::string output = testing::internal::GetCapturedStdout(); } -__global__ void test_first_dynamic_kernel(Span _span) { +__global__ void TestFirstDynamicKernel(Span _span) { _span.first<-1>(); } -__global__ void test_first_static_kernel(Span _span) { +__global__ void TestFirstStaticKernel(Span _span) { _span.first(-1); } -__global__ void test_last_dynamic_kernel(Span _span) { +__global__ void TestLastDynamicKernel(Span _span) { _span.last<-1>(); } -__global__ void test_last_static_kernel(Span _span) { +__global__ void TestLastStaticKernel(Span _span) { _span.last(-1); } @@ -256,7 +263,7 @@ TEST(GPUSpan, FirstLast) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_first_dynamic_kernel<<<1, 1>>>(span); + TestFirstDynamicKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_first_dy(), ""); @@ -270,7 +277,7 @@ TEST(GPUSpan, FirstLast) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_first_static_kernel<<<1, 1>>>(span); + TestFirstStaticKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_first_static(), ""); @@ -284,7 +291,7 @@ TEST(GPUSpan, FirstLast) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_last_dynamic_kernel<<<1, 1>>>(span); + TestLastDynamicKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_last_dy(), ""); @@ -298,7 +305,7 @@ TEST(GPUSpan, FirstLast) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_last_static_kernel<<<1, 1>>>(span); + TestLastStaticKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_last_static(), ""); @@ -306,10 +313,10 @@ TEST(GPUSpan, FirstLast) { } -__global__ void test_subspan_dynamic_kernel(Span _span) { +__global__ void TestSubspanDynamicKernel(Span _span) { _span.subspan(16, 0); } -__global__ void test_subspan_static_kernel(Span _span) { +__global__ void TestSubspanStaticKernel(Span _span) { _span.subspan<16>(); } TEST(GPUSpan, Subspan) { @@ -321,7 +328,7 @@ TEST(GPUSpan, Subspan) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_subspan_dynamic_kernel<<<1, 1>>>(span); + TestSubspanDynamicKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_subspan_dynamic(), ""); @@ -335,7 +342,7 @@ TEST(GPUSpan, Subspan) { thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin()); Span span (d_vec.data().get(), d_vec.size()); - test_subspan_static_kernel<<<1, 1>>>(span); + TestSubspanStaticKernel<<<1, 1>>>(span); }; testing::internal::CaptureStdout(); EXPECT_DEATH(lambda_subspan_static(), ""); @@ -345,43 +352,43 @@ TEST(GPUSpan, Subspan) { TEST(GPUSpanIter, Construct) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestIterConstruct{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestIterConstruct{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpanIter, Ref) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestIterRef{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestIterRef{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpanIter, Calculate) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, 
TestIterCalculate{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestIterCalculate{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpanIter, Compare) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestIterCompare{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestIterCompare{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpan, AsBytes) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestAsBytes{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestAsBytes{status.Data()}); + ASSERT_EQ(status.Get(), 1); } TEST(GPUSpan, AsWritableBytes) { dh::safe_cuda(cudaSetDevice(0)); TestStatus status; - dh::LaunchN(0, 16, TestAsWritableBytes{status.data()}); - ASSERT_EQ(status.get(), 1); + dh::LaunchN(0, 16, TestAsWritableBytes{status.Data()}); + ASSERT_EQ(status.Get(), 1); } } // namespace common diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index 7baea3a39..e2cdebc0c 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -13,7 +13,7 @@ TEST(SparsePage, PushCSC) { offset = {0, 1, 4}; for (size_t i = 0; i < offset.back(); ++i) { - data.push_back(Entry(i, 0.1f)); + data.emplace_back(Entry(i, 0.1f)); } SparsePage other; @@ -52,4 +52,4 @@ TEST(SparsePage, PushCSC) { ASSERT_EQ(inst[i].index, indices_sol[i % 3]); } } -} +} // namespace xgboost diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index 50b865258..d9c648fa9 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -27,7 +27,7 @@ TEST(SimpleDMatrix, RowAccess) { xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false); // Loop over the batches and count the records - long row_count = 0; + int64_t row_count = 0; for (auto &batch : dmat->GetRowBatches()) { row_count += batch.Size(); } @@ -54,7 +54,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) { ASSERT_TRUE(dmat->SingleColBlock()); // Loop over the batches and assert the data is as expected - long num_col_batch = 0; + int64_t num_col_batch = 0; for (const auto &batch : dmat->GetSortedColumnBatches()) { num_col_batch += 1; EXPECT_EQ(batch.Size(), dmat->Info().num_col_) diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index c4ca6aa49..8d478e487 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -1,6 +1,8 @@ // Copyright by Contributors #include #include +#include + #include "../../../src/data/sparse_page_dmatrix.h" #include "../helpers.h" @@ -33,7 +35,7 @@ TEST(SparsePageDMatrix, RowAccess) { EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); // Loop over the batches and count the records - long row_count = 0; + int64_t row_count = 0; for (auto &batch : dmat->GetRowBatches()) { row_count += batch.Size(); } diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 2eb53ad26..57460e6e9 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -4,13 +4,14 @@ #include "./helpers.h" #include "xgboost/c_api.h" #include +#include bool FileExists(const std::string& filename) { struct stat st; return stat(filename.c_str(), &st) == 0; } -long GetFileSize(const std::string& filename) { +int64_t GetFileSize(const std::string& filename) { struct stat st; stat(filename.c_str(), &st); return st.st_size; @@ -30,13 +31,13 @@ void CreateBigTestData(const std::string& filename, size_t n_entries) { } } -void 
_CheckObjFunction(xgboost::ObjFunction * obj, - std::vector preds, - std::vector labels, - std::vector weights, - xgboost::MetaInfo info, - std::vector out_grad, - std::vector out_hess) { +void CheckObjFunctionImpl(xgboost::ObjFunction * obj, + std::vector preds, + std::vector labels, + std::vector weights, + xgboost::MetaInfo info, + std::vector out_grad, + std::vector out_hess) { xgboost::HostDeviceVector in_preds(preds); xgboost::HostDeviceVector out_gpair; obj->GetGradient(in_preds, info, 1, &out_gpair); @@ -64,7 +65,7 @@ void CheckObjFunction(xgboost::ObjFunction * obj, info.labels_.HostVector() = labels; info.weights_.HostVector() = weights; - _CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess); + CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); } void CheckRankingObjFunction(xgboost::ObjFunction * obj, @@ -80,7 +81,7 @@ void CheckRankingObjFunction(xgboost::ObjFunction * obj, info.weights_.HostVector() = weights; info.group_ptr_ = groups; - _CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess); + CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); } diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc index e073a76f1..8311663b0 100644 --- a/tests/cpp/metric/test_metric.cc +++ b/tests/cpp/metric/test_metric.cc @@ -4,11 +4,16 @@ #include "../helpers.h" TEST(Metric, UnknownMetric) { - xgboost::Metric * metric; + xgboost::Metric * metric = nullptr; EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name")); EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse")); - delete metric; + if (metric) { + delete metric; + } + metric = nullptr; EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name@1")); EXPECT_NO_THROW(metric = xgboost::Metric::Create("error@0.5f")); - delete metric; + if (metric) { + delete metric; + } } diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc index 213f7d47e..963b94b20 100644 --- a/tests/cpp/objective/test_objective.cc +++ b/tests/cpp/objective/test_objective.cc @@ -4,8 +4,10 @@ #include "../helpers.h" TEST(Objective, UnknownFunction) { - xgboost::ObjFunction* obj; + xgboost::ObjFunction* obj = nullptr; EXPECT_ANY_THROW(obj = xgboost::ObjFunction::Create("unknown_name")); EXPECT_NO_THROW(obj = xgboost::ObjFunction::Create("reg:linear")); - delete obj; + if (obj) { + delete obj; + } } diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index 7843cc12f..534c2e0b0 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -85,7 +85,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson"); std::vector > args; - args.push_back(std::make_pair("max_delta_step", "0.1f")); + args.emplace_back(std::make_pair("max_delta_step", "0.1f")); obj->Configure(args); CheckObjFunction(obj, { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, @@ -176,7 +176,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie"); std::vector > args; - args.push_back(std::make_pair("tweedie_variance_power", "1.1f")); + args.emplace_back(std::make_pair("tweedie_variance_power", "1.1f")); obj->Configure(args); CheckObjFunction(obj, { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1}, 
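One observation on the UnknownMetric/UnknownFunction hunks above: deleting a null pointer is already a no-op in standard C++, so the added if (metric) guards are harmless but redundant; the part that actually fixes the bug is initializing the pointer to nullptr, so a throwing Create() no longer leaves it indeterminate. A tighter sketch using RAII instead of guards (assuming the same Metric::Create factory and includes the test already uses):

```cpp
#include <gtest/gtest.h>
#include <memory>
#include <xgboost/metric.h>  // assumed, matching the original test's includes

TEST(Metric, UnknownMetricRaii) {
  std::unique_ptr<xgboost::Metric> metric;
  EXPECT_ANY_THROW(metric.reset(xgboost::Metric::Create("unknown_name")));
  EXPECT_NO_THROW(metric.reset(xgboost::Metric::Create("rmse")));
}  // unique_ptr frees the metric; resetting from null needs no guard
```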
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 752fdee92..6fc844113 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -41,21 +41,20 @@ TEST(cpu_predictor, Test) { // Test predict leaf std::vector leaf_out_predictions; cpu_predictor->PredictLeaf((*dmat).get(), &leaf_out_predictions, model); - for (int i = 0; i < leaf_out_predictions.size(); i++) { - ASSERT_EQ(leaf_out_predictions[i], 0); + for (auto v : leaf_out_predictions) { + ASSERT_EQ(v, 0); } // Test predict contribution std::vector out_contribution; cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model); - for (int i = 0; i < out_contribution.size(); i++) { - ASSERT_EQ(out_contribution[i], 1.5); + for (auto const& contri : out_contribution) { + ASSERT_EQ(contri, 1.5); } - // Test predict contribution (approximate method) cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model, true); - for (int i = 0; i < out_contribution.size(); i++) { - ASSERT_EQ(out_contribution[i], 1.5); + for (auto const& contri : out_contribution) { + ASSERT_EQ(contri, 1.5); } delete dmat; diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index b0181bd30..7d2eef49b 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -8,7 +8,7 @@ namespace xgboost { TEST(Learner, Basic) { - typedef std::pair Arg; + using Arg = std::pair; auto args = {Arg("tree_method", "exact")}; auto mat_ptr = CreateDMatrix(10, 10, 0); std::vector> mat = {*mat_ptr}; diff --git a/tests/cpp/tree/test_gpu_exact.cu b/tests/cpp/tree/test_gpu_exact.cu index 90d96f66b..00f162815 100644 --- a/tests/cpp/tree/test_gpu_exact.cu +++ b/tests/cpp/tree/test_gpu_exact.cu @@ -20,13 +20,13 @@ TEST(GPUExact, Update) { auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu"); p_gpuexact_maker->Init(args); - size_t constexpr n_rows = 4; - size_t constexpr n_cols = 8; - bst_float constexpr sparsity = 0.0f; + size_t constexpr kNRows = 4; + size_t constexpr kNCols = 8; + bst_float constexpr kSparsity = 0.0f; - auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3); - std::vector h_gpair(n_rows); - for (size_t i = 0; i < n_rows; ++i) { + auto dmat = CreateDMatrix(kNRows, kNCols, kSparsity, 3); + std::vector h_gpair(kNRows); + for (size_t i = 0; i < kNRows; ++i) { h_gpair[i] = GradientPair(i % 2, 1); } HostDeviceVector gpair (h_gpair); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index c40ab7bb3..21ad0efe8 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -46,20 +46,20 @@ void BuildGidx(DeviceShard* shard, int n_rows, int n_cols, } TEST(GpuHist, BuildGidxDense) { - int const n_rows = 16, n_cols = 8; + int constexpr kNRows = 16, kNCols = 8; TrainParam param; param.max_depth = 1; param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, n_rows, param); - BuildGidx(&shard, n_rows, n_cols); + DeviceShard shard(0, 0, kNRows, param); + BuildGidx(&shard, kNRows, kNCols); std::vector h_gidx_buffer; h_gidx_buffer = shard.gidx_buffer.AsVector(); common::CompressedIterator gidx(h_gidx_buffer.data(), 25); - ASSERT_EQ(shard.row_stride, n_cols); + ASSERT_EQ(shard.row_stride, kNCols); std::vector solution = { 0, 3, 8, 9, 14, 17, 20, 21, @@ -79,20 +79,20 @@ TEST(GpuHist, BuildGidxDense) { 2, 4, 8, 10, 14, 15, 19, 22, 1, 4, 7, 10, 14, 16, 19, 21, }; - for (size_t i = 0; i < n_rows * n_cols; ++i) { + for (size_t i = 0; i < kNRows * kNCols; ++i) { ASSERT_EQ(solution[i], 
gidx[i]); } } TEST(GpuHist, BuildGidxSparse) { - int const n_rows = 16, n_cols = 8; + int constexpr kNRows = 16, kNCols = 8; TrainParam param; param.max_depth = 1; param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, n_rows, param); - BuildGidx(&shard, n_rows, n_cols, 0.9f); + DeviceShard shard(0, 0, kNRows, param); + BuildGidx(&shard, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer; h_gidx_buffer = shard.gidx_buffer.AsVector(); @@ -106,7 +106,7 @@ TEST(GpuHist, BuildGidxSparse) { 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; - for (size_t i = 0; i < n_rows * shard.row_stride; ++i) { + for (size_t i = 0; i < kNRows * shard.row_stride; ++i) { ASSERT_EQ(solution[i], gidx[i]); } } @@ -128,27 +128,27 @@ std::vector GetHostHistGpair() { template void TestBuildHist(GPUHistBuilderBase& builder) { - int const n_rows = 16, n_cols = 8; + int const kNRows = 16, kNCols = 8; TrainParam param; param.max_depth = 6; param.n_gpus = 1; param.max_leaves = 0; - DeviceShard shard(0, 0, n_rows, param); + DeviceShard shard(0, 0, kNRows, param); - BuildGidx(&shard, n_rows, n_cols); + BuildGidx(&shard, kNRows, kNCols); xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); - std::vector h_gpair(n_rows); - for (size_t i = 0; i < h_gpair.size(); ++i) { + std::vector h_gpair(kNRows); + for (auto &gpair : h_gpair) { bst_float grad = dist(&gen); bst_float hess = dist(&gen); - h_gpair[i] = GradientPair(grad, hess); + gpair = GradientPair(grad, hess); } - thrust::device_vector gpair (n_rows); + thrust::device_vector gpair (kNRows); gpair = h_gpair; int num_symbols = shard.n_bins + 1; @@ -164,7 +164,7 @@ void TestBuildHist(GPUHistBuilderBase& builder) { num_symbols); shard.ridx_segments.resize(1); - shard.ridx_segments[0] = Segment(0, n_rows); + shard.ridx_segments[0] = Segment(0, kNRows); shard.hist.AllocateHistogram(0); shard.gpair.copy(gpair.begin(), gpair.end()); thrust::sequence(shard.ridx.CurrentDVec().tbegin(), @@ -175,11 +175,11 @@ void TestBuildHist(GPUHistBuilderBase& builder) { auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair - thrust::host_vector h_result (d_hist.data.size()/2); + thrust::host_vector h_result (d_hist.Data().size() / 2); size_t data_size = sizeof(GradientSumT) / (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)); - data_size *= d_hist.data.size(); + data_size *= d_hist.Data().size(); dh::safe_cuda(cudaMemcpy(h_result.data(), node_histogram.data(), data_size, cudaMemcpyDeviceToHost)); @@ -224,8 +224,8 @@ common::HistCutMatrix GetHostCutMatrix () { // TODO(trivialfis): This test is over simplified. TEST(GpuHist, EvaluateSplits) { - constexpr int n_rows = 16; - constexpr int n_cols = 8; + constexpr int kNRows = 16; + constexpr int kNCols = 8; TrainParam param; param.max_depth = 1; @@ -240,14 +240,15 @@ TEST(GpuHist, EvaluateSplits) { param.reg_lambda = 0; param.max_delta_step = 0.0; - for (size_t i = 0; i < n_cols; ++i) { + for (size_t i = 0; i < kNCols; ++i) { param.monotone_constraints.emplace_back(0); } int max_bins = 4; // Initialize DeviceShard - std::unique_ptr> shard {new DeviceShard(0, 0, n_rows, param)}; + std::unique_ptr> shard { + new DeviceShard(0, 0, kNRows, param)}; // Initialize DeviceShard::node_sum_gradients shard->node_sum_gradients = {{6.4f, 12.8f}}; @@ -257,17 +258,17 @@ TEST(GpuHist, EvaluateSplits) { // Copy cut matrix to device. 
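The h_result copy in TestBuildHist above, like AllReduceHist earlier, relies on a histogram of GradientSumT being bit-identical to a flat array of its ValueT (two floats per GradientPair). A small sketch of that assumption with a compile-time check (GradientPairLite is a hypothetical stand-in):

```cpp
#include <cstddef>

struct GradientPairLite {  // hypothetical stand-in for GradientPair
  float grad;
  float hess;
};

static_assert(sizeof(GradientPairLite) == 2 * sizeof(float),
              "histogram entries must reinterpret cleanly as float[]");

// Count of primitive values seen by cudaMemcpy/NCCL for n_bins bins,
// matching n_bins * (sizeof(GradientSumT) / sizeof(ValueT)) above.
constexpr std::size_t FlatValueCount(std::size_t n_bins) {
  return n_bins * (sizeof(GradientPairLite) / sizeof(float));
}
```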
DeviceShard::DeviceHistCutMatrix cut; shard->ba.Allocate(0, - &(shard->cut_.feature_segments), cmat.row_ptr.size(), - &(shard->cut_.min_fvalue), cmat.min_val.size(), - &(shard->cut_.gidx_fvalue_map), 24, - &(shard->monotone_constraints), n_cols); - shard->cut_.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); - shard->cut_.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); + &(shard->d_cut.feature_segments), cmat.row_ptr.size(), + &(shard->d_cut.min_fvalue), cmat.min_val.size(), + &(shard->d_cut.gidx_fvalue_map), 24, + &(shard->monotone_constraints), kNCols); + shard->d_cut.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); + shard->d_cut.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); shard->monotone_constraints.copy(param.monotone_constraints.begin(), param.monotone_constraints.end()); // Initialize DeviceShard::hist - shard->hist.Init(0, (max_bins - 1) * n_cols); + shard->hist.Init(0, (max_bins - 1) * kNCols); shard->hist.AllocateHistogram(0); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. @@ -278,16 +279,16 @@ TEST(GpuHist, EvaluateSplits) { hist.push_back(pair.GetHess()); } - ASSERT_EQ(shard->hist.data.size(), hist.size()); + ASSERT_EQ(shard->hist.Data().size(), hist.size()); thrust::copy(hist.begin(), hist.end(), - shard->hist.data.begin()); + shard->hist.Data().begin()); // Initialize GPUHistMaker GPUHistMakerSpecialised hist_maker = GPUHistMakerSpecialised(); hist_maker.param_ = param; hist_maker.shards_.push_back(std::move(shard)); - hist_maker.column_sampler_.Init(n_cols, + hist_maker.column_sampler_.Init(kNCols, param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree, @@ -295,8 +296,8 @@ TEST(GpuHist, EvaluateSplits) { RegTree tree; MetaInfo info; - info.num_row_ = n_rows; - info.num_col_ = n_cols; + info.num_row_ = kNRows; + info.num_col_ = kNCols; hist_maker.info_ = &info; hist_maker.node_value_constraints_.resize(1); @@ -313,30 +314,30 @@ TEST(GpuHist, EvaluateSplits) { TEST(GpuHist, ApplySplit) { GPUHistMakerSpecialised hist_maker = GPUHistMakerSpecialised(); - int constexpr nid = 0; - int constexpr n_rows = 16; - int constexpr n_cols = 8; + int constexpr kNId = 0; + int constexpr kNRows = 16; + int constexpr kNCols = 8; TrainParam param; std::vector> args = {}; param.InitAllowUnknown(args); // Initialize shard - for (size_t i = 0; i < n_cols; ++i) { + for (size_t i = 0; i < kNCols; ++i) { param.monotone_constraints.emplace_back(0); } hist_maker.shards_.resize(1); - hist_maker.shards_[0].reset(new DeviceShard(0, 0, n_rows, param)); + hist_maker.shards_[0].reset(new DeviceShard(0, 0, kNRows, param)); auto& shard = hist_maker.shards_.at(0); shard->ridx_segments.resize(3); // 3 nodes. 
shard->node_sum_gradients.resize(3); - shard->ridx_segments[0] = Segment(0, n_rows); - shard->ba.Allocate(0, &(shard->ridx), n_rows, - &(shard->position), n_rows); - shard->row_stride = n_cols; + shard->ridx_segments[0] = Segment(0, kNRows); + shard->ba.Allocate(0, &(shard->ridx), kNRows, + &(shard->position), kNRows); + shard->row_stride = kNCols; thrust::sequence(shard->ridx.CurrentDVec().tbegin(), shard->ridx.CurrentDVec().tend()); // Initialize GPUHistMaker @@ -349,31 +350,30 @@ TEST(GpuHist, ApplySplit) { GradientPair(8.2, 2.8), GradientPair(6.3, 3.6), GPUTrainingParam(param)); GPUHistMakerSpecialised::ExpandEntry candidate_entry {0, 0, candidate, 0}; - candidate_entry.nid = nid; + candidate_entry.nid = kNId; auto const& nodes = tree.GetNodes(); - size_t n_nodes = nodes.size(); // Used to get bin_id in update position. common::HistCutMatrix cmat = GetHostCutMatrix(); hist_maker.hmat_ = cmat; MetaInfo info; - info.num_row_ = n_rows; - info.num_col_ = n_cols; - info.num_nonzero_ = n_rows * n_cols; // Dense + info.num_row_ = kNRows; + info.num_col_ = kNCols; + info.num_nonzero_ = kNRows * kNCols; // Dense // Initialize gidx int n_bins = 24; - int row_stride = n_cols; + int row_stride = kNCols; int num_symbols = n_bins + 1; size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( - row_stride * n_rows, num_symbols); + row_stride * kNRows, num_symbols); shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes); common::CompressedBufferWriter wr(num_symbols); - std::vector h_gidx (n_rows * row_stride); + std::vector h_gidx (kNRows * row_stride); std::iota(h_gidx.begin(), h_gidx.end(), 0); std::vector h_gidx_compressed (compressed_size_bytes); @@ -387,10 +387,10 @@ TEST(GpuHist, ApplySplit) { hist_maker.ApplySplit(candidate_entry, &tree); hist_maker.UpdatePosition(candidate_entry, &tree); - ASSERT_FALSE(tree[nid].IsLeaf()); + ASSERT_FALSE(tree[kNId].IsLeaf()); - int left_nidx = tree[nid].LeftChild(); - int right_nidx = tree[nid].RightChild(); + int left_nidx = tree[kNId].LeftChild(); + int right_nidx = tree[kNId].RightChild(); ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0); ASSERT_EQ(shard->ridx_segments[left_nidx].end, 6); diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc index 8206a39be..cc7981ef4 100644 --- a/tests/cpp/tree/test_prune.cc +++ b/tests/cpp/tree/test_prune.cc @@ -13,14 +13,14 @@ namespace xgboost { namespace tree { TEST(Updater, Prune) { - int constexpr n_rows = 32, n_cols = 16; + int constexpr kNRows = 32, kNCols = 16; std::vector> cfg; - cfg.push_back(std::pair( - "num_feature", std::to_string(n_cols))); - cfg.push_back(std::pair( + cfg.emplace_back(std::pair( + "num_feature", std::to_string(kNCols))); + cfg.emplace_back(std::pair( "min_split_loss", "10")); - cfg.push_back(std::pair( + cfg.emplace_back(std::pair( "silent", "1")); // These data are just place holders. 
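For the compressed gidx buffer that the ApplySplit test fills above: each entry stores a bin id out of num_symbols = n_bins + 1 possibilities (the extra symbol is null_gidx_value, marking missing entries), so only ceil(log2(num_symbols)) bits per entry are needed. The core of the size math as a sketch (the real CompressedBufferWriter::CalculateBufferSize additionally pads for word-aligned writes):

```cpp
#include <cstddef>

// Bits needed to tell apart symbols in [0, num_symbols).
inline std::size_t SymbolBitsLite(std::size_t num_symbols) {
  std::size_t bits = 1;
  while ((std::size_t{1} << bits) < num_symbols) {
    ++bits;
  }
  return bits;
}

// Unpadded byte count for n packed symbols.
inline std::size_t PackedBytesLite(std::size_t n, std::size_t num_symbols) {
  return (n * SymbolBitsLite(num_symbols) + 7) / 8;
}
```

With the test's 24 bins, the 25 symbols fit in 5 bits per entry, versus 32 bits for an uncompressed index.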
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 819860e8a..f1f567198 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -133,12 +133,12 @@ class QuantileHistMock : public QuantileHistMaker { std::vector row_gpairs = { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} }; - size_t constexpr max_bins = 4; - auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3); + size_t constexpr kMaxBins = 4; + auto dmat = CreateDMatrix(kNRows, kNCols, 0, 3); // dense, no missing values common::GHistIndexMatrix gmat; - gmat.Init((*dmat).get(), max_bins); + gmat.Init((*dmat).get(), kMaxBins); RealImpl::InitData(gmat, row_gpairs, *(*dmat), tree); hist_.AddHistRow(0); @@ -167,7 +167,8 @@ class QuantileHistMock : public QuantileHistMaker { // 2) no regularization, i.e. set min_child_weight, reg_lambda, reg_alpha, // and max_delta_step to 0. bst_float best_split_gain = 0.0f; - size_t best_split_threshold, best_split_feature; + size_t best_split_threshold = std::numeric_limits::max(); + size_t best_split_feature = std::numeric_limits::max(); // Enumerate all features for (size_t fid = 0; fid < num_feature; ++fid) { const size_t bin_id_min = gmat.cut.row_ptr[fid]; @@ -213,56 +214,56 @@ class QuantileHistMock : public QuantileHistMaker { } }; - int static constexpr n_rows = 8, n_cols = 16; - std::shared_ptr *dmat; - const std::vector > cfg; + int static constexpr kNRows = 8, kNCols = 16; + std::shared_ptr *dmat_; + const std::vector > cfg_; std::shared_ptr builder_; public: explicit QuantileHistMock( const std::vector >& args) : - cfg{args} { + cfg_{args} { QuantileHistMaker::Init(args); builder_.reset( new BuilderMock( param_, std::move(pruner_), std::unique_ptr(spliteval_->GetHostClone()))); - dmat = CreateDMatrix(n_rows, n_cols, 0.8, 3); + dmat_ = CreateDMatrix(kNRows, kNCols, 0.8, 3); } - ~QuantileHistMock() { delete dmat; } + ~QuantileHistMock() override { delete dmat_; } - static size_t GetNumColumns() { return n_cols; } + static size_t GetNumColumns() { return kNCols; } void TestInitData() { - size_t constexpr max_bins = 4; + size_t constexpr kMaxBins = 4; common::GHistIndexMatrix gmat; - gmat.Init((*dmat).get(), max_bins); + gmat.Init((*dmat_).get(), kMaxBins); RegTree tree = RegTree(); - tree.param.InitAllowUnknown(cfg); + tree.param.InitAllowUnknown(cfg_); std::vector gpair = { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; - builder_->TestInitData(gmat, gpair, dmat->get(), tree); + builder_->TestInitData(gmat, gpair, dmat_->get(), tree); } void TestBuildHist() { RegTree tree = RegTree(); - tree.param.InitAllowUnknown(cfg); + tree.param.InitAllowUnknown(cfg_); - size_t constexpr max_bins = 4; + size_t constexpr kMaxBins = 4; common::GHistIndexMatrix gmat; - gmat.Init((*dmat).get(), max_bins); + gmat.Init((*dmat_).get(), kMaxBins); - builder_->TestBuildHist(0, gmat, *(*dmat).get(), tree); + builder_->TestBuildHist(0, gmat, *(*dmat_).get(), tree); } void TestEvaluateSplit() { RegTree tree = RegTree(); - tree.param.InitAllowUnknown(cfg); + tree.param.InitAllowUnknown(cfg_); builder_->TestEvaluateSplit(gmatb_, tree); } diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index cbd06d609..029e4479a 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc @@ -13,15 +13,15 @@ namespace xgboost { namespace tree { 
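TestEvaluateSplit in the hunk above enumerates every feature and bin boundary and checks the chosen split by brute force. The quantity being maximized reduces, with the test's regularizers zeroed out, to the standard second-order gain; a sketch (the constant 1/2 and the per-split penalty gamma are dropped since they do not change the argmax):

```cpp
// Second-order split gain with only L2 regularization; the test above
// zeroes min_child_weight, reg_lambda, reg_alpha, and max_delta_step.
double SplitGainLite(double grad_left, double hess_left, double grad_right,
                     double hess_right, double reg_lambda) {
  auto score = [reg_lambda](double g, double h) {
    return (g * g) / (h + reg_lambda);
  };
  return score(grad_left, hess_left) + score(grad_right, hess_right) -
         score(grad_left + grad_right, hess_left + hess_right);
}
```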
TEST(Updater, Refresh) { - int constexpr n_rows = 8, n_cols = 16; + int constexpr kNRows = 8, kNCols = 16; HostDeviceVector gpair = { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; - auto dmat = CreateDMatrix(n_rows, n_cols, 0.4, 3); + auto dmat = CreateDMatrix(kNRows, kNCols, 0.4, 3); std::vector> cfg { {"reg_alpha", "0.0"}, - {"num_feature", std::to_string(n_cols)}, + {"num_feature", std::to_string(kNCols)}, {"reg_lambda", "1"}}; RegTree tree = RegTree();
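Last, a style note touching several of the test hunks above (test_prune, test_refresh, test_regression_obj, test_data): rewriting push_back(T(...)) as emplace_back(T(...)) or emplace_back(std::make_pair(...)) still constructs a temporary and moves it in; emplace_back only pays off when the constructor arguments are forwarded directly. A small sketch:

```cpp
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::string, std::string>> cfg;
  // Still materializes a temporary pair, then moves it in:
  cfg.emplace_back(std::make_pair("num_feature", "16"));
  // Constructs the pair in place from the forwarded arguments:
  cfg.emplace_back("num_feature", "16");
  return 0;
}
```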