sync up May15 2023

2023-05-15 18:59:18 +02:00
parent b066accad6 7375bd058b
commit 8cad8c693c
37 changed files with 628 additions and 398 deletions
--- a/src/cli_main.cc
+++ b/src/cli_main.cc
@@ -4,9 +4,6 @@
 * \brief The command line interface program of xgboost.
 *  This file is not included in dynamic library.
 */
-#define _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_DEPRECATE
-
 #if !defined(NOMINMAX) && defined(_WIN32)
 #define NOMINMAX
 #endif  // !defined(NOMINMAX)
--- a/src/collective/in_memory_handler.cc
+++ b/src/collective/in_memory_handler.cc
@@ -222,15 +222,15 @@ void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string*

  std::unique_lock<std::mutex> lock(mutex_);

-  LOG(INFO) << functor.name << " rank " << rank << ": waiting for current sequence number";
+  LOG(DEBUG) << functor.name << " rank " << rank << ": waiting for current sequence number";
  cv_.wait(lock, [this, sequence_number] { return sequence_number_ == sequence_number; });

-  LOG(INFO) << functor.name << " rank " << rank << ": handling request";
+  LOG(DEBUG) << functor.name << " rank " << rank << ": handling request";
  functor(input, bytes, &buffer_);
  received_++;

  if (received_ == world_size_) {
-    LOG(INFO) << functor.name << " rank " << rank << ": all requests received";
+    LOG(DEBUG) << functor.name << " rank " << rank << ": all requests received";
    output->assign(buffer_);
    sent_++;
    lock.unlock();
@@ -238,15 +238,15 @@ void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string*
    return;
  }

-  LOG(INFO) << functor.name << " rank " << rank << ": waiting for all clients";
+  LOG(DEBUG) << functor.name << " rank " << rank << ": waiting for all clients";
  cv_.wait(lock, [this] { return received_ == world_size_; });

-  LOG(INFO) << functor.name << " rank " << rank << ": sending reply";
+  LOG(DEBUG) << functor.name << " rank " << rank << ": sending reply";
  output->assign(buffer_);
  sent_++;

  if (sent_ == world_size_) {
-    LOG(INFO) << functor.name << " rank " << rank << ": all replies sent";
+    LOG(DEBUG) << functor.name << " rank " << rank << ": all replies sent";
    sent_ = 0;
    received_ = 0;
    buffer_.clear();
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1355,14 +1355,12 @@ class CUDAStream {
  cudaStream_t stream_;

 public:
-  CUDAStream() {
-    dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
-  }
-  ~CUDAStream() {
-    dh::safe_cuda(cudaStreamDestroy(stream_));
-  }
+  CUDAStream() { dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); }
+  ~CUDAStream() { dh::safe_cuda(cudaStreamDestroy(stream_)); }
+
+  [[nodiscard]] CUDAStreamView View() const { return CUDAStreamView{stream_}; }
+  [[nodiscard]] cudaStream_t Handle() const { return stream_; }

-  CUDAStreamView View() const { return CUDAStreamView{stream_}; }
  void Sync() { this->View().Sync(); }
 };

--- a/src/common/device_helpers.hip.h
+++ b/src/common/device_helpers.hip.h
@@ -1273,14 +1273,12 @@ class CUDAStream {
  hipStream_t stream_;

 public:
-  CUDAStream() {
-    dh::safe_cuda(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking));
-  }
-  ~CUDAStream() {
-    dh::safe_cuda(hipStreamDestroy(stream_));
-  }
+  CUDAStream() { dh::safe_cuda(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking)); }
+  ~CUDAStream() { dh::safe_cuda(hipStreamDestroy(stream_)); }
+
+  [[nodiscard]] CUDAStreamView View() const { return CUDAStreamView{stream_}; }
+  [[nodiscard]] hipStream_t Handle() const { return stream_; }

-  CUDAStreamView View() const { return CUDAStreamView{stream_}; }
  void Sync() { this->View().Sync(); }
 };

--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -183,14 +183,28 @@ class PartitionBuilder {
    SetNRightElems(node_in_set, range.begin(), n_right);
  }

+  template <bool any_missing, typename ColumnType, typename Predicate>
+  void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
+                  BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
+    auto& column = *p_column;  // fill both bit vectors for every row in row_indices
+    for (auto const row_id : row_indices) {
+      auto const bin_id = column[row_id - base_rowid];  // index is relative to base_rowid
+      if (any_missing && bin_id == ColumnType::kMissingId) {
+        missing_bits->Set(row_id - base_rowid);  // missing bin: flag it, skip the predicate
+      } else if (pred(row_id, bin_id)) {
+        decision_bits->Set(row_id - base_rowid);  // pred(row_id, bin_id) held for this row
+      }
+    }
+  }
+
  /**
   * @brief When data is split by column, we don't have all the features locally on the current
   * worker, so we go through all the rows and mark the bit vectors on whether the decision is made
   * to go right, or if the feature value used for the split is missing.
   */
-  template <typename ExpandEntry>
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
  void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
-                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat,
                const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
                BitVector* decision_bits, BitVector* missing_bits) {
    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
@@ -204,7 +218,7 @@ class PartitionBuilder {
      for (auto row_id : rid_span) {
        auto gidx = gmat.GetGindex(row_id, fid);
        if (gidx > -1) {
-          bool go_left = false;
+          bool go_left;
          if (is_cat) {
            go_left = Decision(node_cats, cut_values[gidx]);
          } else {
@@ -218,7 +232,27 @@ class PartitionBuilder {
        }
      }
    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+      auto pred_hist = [&](auto ridx, auto bin_id) {  // per-row test passed to MaskKernel
+        if (any_cat && is_cat) {  // categorical split: decide from the cut value
+          auto gidx = gmat.GetGindex(ridx, fid);
+          CHECK_GT(gidx, -1);  // asserts the looked-up index is non-negative
+          return Decision(node_cats, cut_values[gidx]);
+        } else {
+          return bin_id <= split_cond;  // otherwise compare the bin index with split_cond
+        }
+      };
+
+      if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
+        auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
+        MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
+                                pred_hist);
+      } else {
+        CHECK_EQ(any_missing, true);
+        auto column =
+            column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
+        MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
+                                pred_hist);
+      }
    }
  }

@@ -238,7 +272,7 @@ class PartitionBuilder {
    std::size_t nid = nodes[node_in_set].nid;
    bool default_left = tree[nid].DefaultLeft();

-    auto pred_approx = [&](auto ridx) {
+    auto pred = [&](auto ridx) {
      bool go_left = default_left;
      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
      if (!is_missing) {
@@ -248,11 +282,7 @@ class PartitionBuilder {
    };

    std::pair<size_t, size_t> child_nodes_sizes;
-    if (!column_matrix.IsInitialized()) {
-      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
-    } else {
-      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
-    }
+    child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred);

    const size_t n_left  = child_nodes_sizes.first;
    const size_t n_right = child_nodes_sizes.second;
--- a/src/common/transform_iterator.h
+++ b/src/common/transform_iterator.h
@@ -26,9 +26,9 @@ class IndexTransformIter {

 public:
  using iterator_category = std::random_access_iterator_tag;  // NOLINT
-  using value_type = std::result_of_t<Fn(std::size_t)>;       // NOLINT
+  using reference = std::result_of_t<Fn(std::size_t)>;        // NOLINT
+  using value_type = std::remove_cv_t<std::remove_reference_t<reference>>; // NOLINT
  using difference_type = detail::ptrdiff_t;                  // NOLINT
-  using reference = std::add_lvalue_reference_t<value_type>;  // NOLINT
  using pointer = std::add_pointer_t<value_type>;             // NOLINT

 public:
@@ -43,8 +43,8 @@ class IndexTransformIter {
    return *this;
  }

-  value_type operator*() const { return fn_(iter_); }
-  value_type operator[](std::size_t i) const {
+  reference operator*() const { return fn_(iter_); }
+  reference operator[](std::size_t i) const {
    auto iter = *this + i;
    return *iter;
  }
--- a/src/data/array_interface.cu
+++ b/src/data/array_interface.cu
@@ -1,11 +1,15 @@
-/*!
- * Copyright 2021 by Contributors
+/**
+ * Copyright 2021-2023, XGBoost Contributors
 */
+#include <cstdint>  // for int64_t
+
 #include "../common/common.h"
+#include "../common/device_helpers.cuh"  // for DefaultStream, CUDAEvent
 #include "array_interface.h"
+#include "xgboost/logging.h"

 namespace xgboost {
-void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) {
+void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
  switch (stream) {
    case 0:
      /**
@@ -22,12 +26,15 @@ void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) {
      break;
    case 2:
      // default per-thread stream
-    default:
+    default: {
+      dh::CUDAEvent e;
 #if defined(XGBOOST_USE_CUDA)
-      dh::safe_cuda(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
+      e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
 #elif defined(XGBOOST_USE_HIP)
-      dh::safe_cuda(hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)));
+      e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
 #endif
+      dh::DefaultStream().Wait(e);
+    }
  }
 }

--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -166,7 +166,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(Context const* ctx,
  }
  if (!gradient_index_ || detail::RegenGHist(batch_param_, param)) {
    // GIDX page doesn't exist, generate it
-    LOG(INFO) << "Generating new Gradient Index.";
+    LOG(DEBUG) << "Generating new Gradient Index.";
    // These places can ask for a CSR gidx:
    // - CPU Hist: the ctx must be on CPU.
    // - IterativeDMatrix::InitFromCPU: The ctx must be on CPU.
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -38,19 +38,21 @@ class ColumnSplitHelper {
    missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
  }

-  template <typename ExpandEntry>
+  template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
  void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
                 GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
-                 std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
+                 std::vector<ExpandEntry> const& nodes,
+                 std::vector<int32_t> const& split_conditions, RegTree const* p_tree) {
    // When data is split by column, we don't have all the feature values in the local worker, so
    // we first collect all the decisions and whether the feature is missing into bit vectors.
    std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
    std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
    common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
      const int32_t nid = nodes[node_in_set].nid;
-      partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
-                                   (*row_set_collection_)[nid].begin, &decision_bits_,
-                                   &missing_bits_);
+      bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
+      partition_builder_->MaskRows<BinIdxType, any_missing, any_cat>(
+          node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
+          (*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_);
    });

    // Then aggregate the bit vectors across all the workers.
@@ -217,7 +219,8 @@ class CommonRowPartitioner {
    // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
    // Store results in intermediate buffers from partition_builder_
    if (is_col_split_) {
-      column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
+      column_split_helper_.Partition<BinIdxType, any_missing, any_cat>(
+          space, ctx->Threads(), gmat, column_matrix, nodes, split_conditions, p_tree);
    } else {
      common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
        size_t begin = r.begin();
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -412,6 +412,7 @@ class HistEvaluator {
    tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
                             tree[candidate.nid].SplitIndex(), left_weight,
                             right_weight);
+    evaluator = tree_evaluator_.GetEvaluator();

    snode_.resize(tree.GetNodes().size());
    snode_.at(left_child).stats = candidate.split.left_sum;
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -49,6 +49,8 @@ class TreeEvaluator {
      monotone_.HostVector().resize(n_features, 0);
      has_constraint_ = false;
    } else {
+      CHECK_LE(p.monotone_constraints.size(), n_features)
+          << "The size of monotone constraint should be less or equal to the number of features.";
      monotone_.HostVector() = p.monotone_constraints;
      monotone_.HostVector().resize(n_features, 0);
      // Initialised to some small size, can grow if needed