Merge branch 'master' into dev-hui

2023-03-08 00:39:33 +01:00
parent f286ae5bfa f236640427
commit ed45aa2816
221 changed files with 3122 additions and 1486 deletions
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@@ -1,10 +1,32 @@
-/*!
- * Copyright 2022 by XGBoost Contributors
+/**
+ * Copyright 2022-2023 by XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_ALGORITHM_H_
 #define XGBOOST_COMMON_ALGORITHM_H_
-#include <algorithm>  // std::upper_bound
-#include <cinttypes>  // std::size_t
+#include <algorithm>          // upper_bound, stable_sort, sort, max
+#include <cinttypes>          // size_t
+#include <functional>         // less
+#include <iterator>           // iterator_traits, distance
+#include <vector>             // vector
+
+#include "numeric.h"          // Iota
+#include "xgboost/context.h"  // Context
+
+// clang with libstdc++ works as well
+#if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__sun) && !defined(sun) && \
+    !defined(__APPLE__) && __has_include(<omp.h>)
+#define GCC_HAS_PARALLEL 1
+#endif  // GCC_HAS_PARALLEL
+
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#define MSVC_HAS_PARALLEL 1
+#endif  // MSC
+
+#if defined(GCC_HAS_PARALLEL)
+#include <parallel/algorithm>
+#elif defined(MSVC_HAS_PARALLEL)
+#include <ppl.h>
+#endif  // GCC_HAS_PARALLEL / MSVC_HAS_PARALLEL

 namespace xgboost {
 namespace common {
@@ -13,6 +35,63 @@ auto SegmentId(It first, It last, Idx idx) {
  std::size_t segment_id = std::upper_bound(first, last, idx) - 1 - first;
  return segment_id;
 }
+
+template <typename Iter, typename Comp>  // Stable sort of [begin, end) ordered by comp.
+void StableSort(Context const *ctx, Iter begin, Iter end, Comp &&comp) {
+  if (ctx->Threads() > 1) {  // more than one thread requested: try a parallel sort
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::stable_sort(begin, end, comp,  // thread count is taken from ctx
+                                __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#else
+    // Fall back to the serial sort: the only stable sort in MSVC PPL is a radix sort.
+    std::stable_sort(begin, end, comp);
+#endif  // defined(GCC_HAS_PARALLEL)
+  } else {
+    std::stable_sort(begin, end, comp);  // at most one thread: serial sort
+  }
+}
+
+template <typename Iter, typename Comp>  // Unstable sort of [begin, end) ordered by comp.
+void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) {
+  if (ctx->Threads() > 1) {  // more than one thread requested: try a parallel sort
+#if defined(GCC_HAS_PARALLEL)
+    __gnu_parallel::sort(begin, end, comp, __gnu_parallel::default_parallel_tag(ctx->Threads()));
+#elif defined(MSVC_HAS_PARALLEL)
+    auto n = std::distance(begin, end);
+    // Use the chunk size as a hint for the number of threads. The concurrency module accepts
+    // no local policy/scheduler input.
+    std::size_t chunk_size = n / ctx->Threads();
+    // Never go below 2048, which is the default of msvc ppl as of v2022.
+    chunk_size = std::max(chunk_size, static_cast<std::size_t>(2048));
+    concurrency::parallel_sort(begin, end, comp, chunk_size);
+#else
+    std::sort(begin, end, comp);  // no parallel backend was detected: serial sort
+#endif  // defined(GCC_HAS_PARALLEL) / defined(MSVC_HAS_PARALLEL)
+  } else {
+    std::sort(begin, end, comp);  // at most one thread: serial sort
+  }
+}
+
+template <typename Idx, typename Iter, typename V = typename std::iterator_traits<Iter>::value_type,
+          typename Comp = std::less<V>>  // Returns indices that stably order [begin, end) by comp.
+std::vector<Idx> ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less<V>{}) {
+  CHECK(ctx->IsCPU());  // CPU contexts only
+  auto n = std::distance(begin, end);
+  std::vector<Idx> result(n);  // one index per input element
+  Iota(ctx, result.begin(), result.end(), 0);  // presumably fills 0, 1, ..., n - 1; see numeric.h
+  auto op = [&](Idx const &l, Idx const &r) { return comp(begin[l], begin[r]); };  // by value
+  StableSort(ctx, result.begin(), result.end(), op);  // sorts the indices, not the input
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
+
+#if defined(GCC_HAS_PARALLEL)
+#undef GCC_HAS_PARALLEL
+#endif  // defined(GCC_HAS_PARALLEL)
+
+#if defined(MSVC_HAS_PARALLEL)
+#undef MSVC_HAS_PARALLEL
+#endif  // defined(MSVC_HAS_PARALLEL)
+
 #endif  // XGBOOST_COMMON_ALGORITHM_H_
--- a/src/common/categorical.h
+++ b/src/common/categorical.h
@@ -42,9 +42,9 @@ constexpr inline bst_cat_t OutOfRangeCat() {

 inline XGBOOST_DEVICE bool InvalidCat(float cat) {
  constexpr auto kMaxCat = OutOfRangeCat();
-  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat, "");
-  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1, "");
-  static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat, "");
+  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat)) == kMaxCat);
+  static_assert(static_cast<bst_cat_t>(static_cast<float>(kMaxCat + 1)) != kMaxCat + 1);
+  static_assert(static_cast<float>(kMaxCat + 1) == kMaxCat);
  return cat < 0 || cat >= kMaxCat;
 }

--- a/src/common/charconv.cc
+++ b/src/common/charconv.cc
@@ -270,7 +270,9 @@ struct RyuPowLogUtils {
   */
  static uint32_t MulPow5InvDivPow2(const uint32_t m, const uint32_t q,
                                    const int32_t j) noexcept(true) {
-    return MulShift(m, kFloatPow5InvSplit[q], j);
+    static_assert(sizeof(kFloatPow5InvSplit) == 55 * sizeof(std::uint64_t));
+    assert(q < 55);
+    return MulShift(m, kFloatPow5InvSplit[q], j);  // NOLINT
  }

  /*
@@ -495,12 +497,10 @@ class PowerBaseComputer {
                             static_cast<int32_t>(IEEE754::kFloatBias) -
                             static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
                             static_cast<int32_t>(2);
-      static_assert(static_cast<int32_t>(1) -
-                            static_cast<int32_t>(IEEE754::kFloatBias) -
-                            static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
-                            static_cast<int32_t>(2) ==
-                        -151,
-                    "");
+      static_assert(static_cast<int32_t>(1) - static_cast<int32_t>(IEEE754::kFloatBias) -
+                        static_cast<int32_t>(IEEE754::kFloatMantissaBits) -
+                        static_cast<int32_t>(2) ==
+                    -151);
      mantissa_base2 = f.mantissa;
    } else {
      base2_range.exponent = static_cast<int32_t>(f.exponent) - IEEE754::kFloatBias -
@@ -544,7 +544,7 @@ class RyuPrinter {
    // Function precondition: v is not a 10-digit number.
    // (f2s: 9 digits are sufficient for round-tripping.)
    // (d2fixed: We print 9-digit blocks.)
-    static_assert(100000000 == Tens(8), "");
+    static_assert(100000000 == Tens(8));
    assert(v < Tens(9));
    if (v >= Tens(8)) {
      return 9;
@@ -911,7 +911,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
  // the bias and also special-case the value 0.
  int32_t shift = (f_e2 == 0 ? 1 : f_e2) - exp_b2 - IEEE754::kFloatBias -
                  IEEE754::kFloatMantissaBits;
-  assert(shift >= 0);
+  assert(shift >= 1);

  // We need to round up if the exact value is more than 0.5 above the value we
  // computed. That's equivalent to checking if the last removed bit was 1 and
@@ -920,7 +920,7 @@ from_chars_result FromCharFloatImpl(const char *buffer, const int len,
  //
  // We need to update trailingZeros given that we have the exact output
  // exponent ieee_e2 now.
-  trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;
+  trailing_zeros &= (mantissa_b2 & ((1u << (shift - 1)) - 1)) == 0;  // NOLINT
  uint32_t lastRemovedBit = (mantissa_b2 >> (shift - 1)) & 1;
  bool roundup = (lastRemovedBit != 0) &&
                 (!trailing_zeros || (((mantissa_b2 >> shift) & 1) != 0));
--- a/src/common/charconv.h
+++ b/src/common/charconv.h
@@ -87,7 +87,7 @@ inline to_chars_result to_chars(char *first, char *last, int64_t value) { // NOL
  if (value < 0) {
    *first = '-';
    std::advance(first, 1);
-    unsigned_value = uint64_t(~value) + uint64_t(1);
+    unsigned_value = static_cast<uint64_t>(~value) + static_cast<uint64_t>(1);
  }
  return detail::ToCharsUnsignedImpl(first, last, unsigned_value);
 }
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
    feature_offsets_[fid] = accum_index;
  }

-  SetTypeSize(gmat.max_num_bins);
+  SetTypeSize(gmat.MaxNumBinPerFeat());
  auto storage_size =
      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
  index_.resize(storage_size, 0);
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -188,17 +188,6 @@ inline void SetDevice(std::int32_t device) {
 }
 #endif

-template <typename Idx, typename Container,
-          typename V = typename Container::value_type,
-          typename Comp = std::less<V>>
-std::vector<Idx> ArgSort(Container const &array, Comp comp = std::less<V>{}) {
-  std::vector<Idx> result(array.size());
-  std::iota(result.begin(), result.end(), 0);
-  auto op = [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); };
-  XGBOOST_PARALLEL_STABLE_SORT(result.begin(), result.end(), op);
-  return result;
-}
-
 /**
 * Last index of a group in a CSR style of index pointer.
 */
@@ -206,31 +195,6 @@ template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
  return indptr[group + 1] - 1;
 }
-
-/**
- * \brief A CRTP (curiously recurring template pattern) helper function.
- *
- * https://www.fluentcpp.com/2017/05/19/crtp-helper/
- *
- * Does two things:
- * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class.
- * 2. Avoids having to `static_cast` in a lot of places.
- *
- * \tparam T The derived class in a CRTP hierarchy.
- */
-template <typename T>
-struct Crtp {
-  T &Underlying() { return static_cast<T &>(*this); }
-  T const &Underlying() const { return static_cast<T const &>(*this); }
-};
-
-/**
- * \brief C++17 std::as_const
- */
-template <typename T>
-typename std::add_const<T>::type &AsConst(T &v) noexcept {  // NOLINT(runtime/references)
-  return v;
-}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -1,12 +1,13 @@
-/*!
- * Copyright 2017 by Contributors
+/**
+ * Copyright 2017-2023 by XGBoost Contributors
 * \file compressed_iterator.h
 */
 #pragma once
 #include <xgboost/base.h>
-#include <cmath>
-#include <cstddef>
+
 #include <algorithm>
+#include <cmath>
+#include <cstddef>  // for size_t

 #include "common.h"

@@ -36,7 +37,7 @@ static const int kPadding = 4;  // Assign padding so we can read slightly off
 // The number of bits required to represent a given unsigned range
 inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
  auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
-  return common::Max(static_cast<size_t>(bits), size_t(1));
+  return common::Max(static_cast<size_t>(bits), static_cast<std::size_t>(1));
 }
 }  // namespace detail

--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -20,6 +20,7 @@

 #include <algorithm>
 #include <chrono>
+#include <cstddef>  // for size_t
 #include <cub/cub.cuh>
 #include <cub/util_allocator.cuh>
 #include <numeric>
@@ -178,7 +179,7 @@ inline size_t MaxSharedMemory(int device_idx) {
  dh::safe_cuda(cudaDeviceGetAttribute
                (&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                 device_idx));
-  return size_t(max_shared_memory);
+  return static_cast<std::size_t>(max_shared_memory);
 }

 /**
@@ -195,7 +196,7 @@ inline size_t MaxSharedMemoryOptin(int device_idx) {
  dh::safe_cuda(cudaDeviceGetAttribute
                (&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                 device_idx));
-  return size_t(max_shared_memory);
+  return static_cast<std::size_t>(max_shared_memory);
 }

 inline void CheckComputeCapability() {
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -46,7 +46,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
  if (!use_sorted) {
    HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                  HostSketchContainer::UseGroup(info),
-                                  m->Info().data_split_mode == DataSplitMode::kCol, n_threads);
+                                  m->IsColumnSplit(), n_threads);
    for (auto const& page : m->GetBatches<SparsePage>()) {
      container.PushRowPage(page, info, hessian);
    }
@@ -54,7 +54,7 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
  } else {
    SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                    HostSketchContainer::UseGroup(info),
-                                    m->Info().data_split_mode == DataSplitMode::kCol, n_threads};
+                                    m->IsColumnSplit(), n_threads};
    for (auto const& page : m->GetBatches<SortedCSCPage>()) {
      container.PushColPage(page, info, hessian);
    }
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -1,33 +1,31 @@
-/*!
- * Copyright 2018~2020 XGBoost contributors
+/**
+ * Copyright 2018~2023 by XGBoost contributors
 */
-
-#include <xgboost/logging.h>
-
+#include <thrust/binary_search.h>
 #include <thrust/copy.h>
+#include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/sort.h>
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
+#include <xgboost/logging.h>

+#include <cstddef>  // for size_t
 #include <memory>
 #include <mutex>
 #include <utility>
 #include <vector>

+#include "categorical.h"
 #include "device_helpers.cuh"
-#include "hist_util.h"
 #include "hist_util.cuh"
+#include "hist_util.h"
 #include "math.h"  // NOLINT
 #include "quantile.h"
-#include "categorical.h"
 #include "xgboost/host_device_vector.h"

-
 namespace xgboost {
 namespace common {

@@ -318,7 +316,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
    size_t batch_nnz = batch.data.Size();
    auto const& info = dmat->Info();
    for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch_nnz, size_t(begin + sketch_batch_num_elements));
+      size_t end = std::min(batch_nnz, static_cast<std::size_t>(begin + sketch_batch_num_elements));
      if (has_weights) {
        bool is_ranking = HostSketchContainer::UseGroup(dmat->Info());
        dh::caching_device_vector<uint32_t> groups(info.group_ptr_.cbegin(),
--- a/src/common/hist_util.cuh
+++ b/src/common/hist_util.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 XGBoost contributors
+/**
+ * Copyright 2020-2023 by XGBoost contributors
 *
 * \brief Front end and utilities for GPU based sketching.  Works on sliding window
 *        instead of stream.
@@ -9,11 +9,13 @@

 #include <thrust/host_vector.h>

+#include <cstddef>  // for size_t
+
+#include "../data/device_adapter.cuh"
+#include "device_helpers.cuh"
 #include "hist_util.h"
 #include "quantile.cuh"
-#include "device_helpers.cuh"
 #include "timer.h"
-#include "../data/device_adapter.cuh"

 namespace xgboost {
 namespace common {
@@ -304,7 +306,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
        device, num_cuts_per_feature, true);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
+      size_t end =
+          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
      ProcessWeightedSlidingWindow(batch, info,
                                   num_cuts_per_feature,
                                   HostSketchContainer::UseGroup(info), missing, device, num_cols, begin, end,
@@ -316,7 +319,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
        num_rows, num_cols, std::numeric_limits<size_t>::max(),
        device, num_cuts_per_feature, false);
    for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
-      size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
+      size_t end =
+          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
      ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
                           sketch_container, num_cuts_per_feature);
    }
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -50,7 +50,7 @@ size_t PeekableInStream::PeekRead(void* dptr, size_t size) {
  }
 }

-FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream), pointer_{0} {
+FixedSizeStream::FixedSizeStream(PeekableInStream* stream) : PeekableInStream(stream) {
  size_t constexpr kInitialSize = 4096;
  size_t size{kInitialSize}, total{0};
  buffer_.clear();
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -27,8 +27,7 @@ using MemoryBufferStream = rabit::utils::MemoryBufferStream;
 */
 class PeekableInStream : public dmlc::Stream {
 public:
-  explicit PeekableInStream(dmlc::Stream* strm)
-      : strm_(strm), buffer_ptr_(0) {}
+  explicit PeekableInStream(dmlc::Stream* strm) : strm_(strm) {}

  size_t Read(void* dptr, size_t size) override;
  virtual size_t PeekRead(void* dptr, size_t size);
@@ -41,7 +40,7 @@ class PeekableInStream : public dmlc::Stream {
  /*! \brief input stream */
  dmlc::Stream *strm_;
  /*! \brief current buffer pointer */
-  size_t buffer_ptr_;
+  size_t buffer_ptr_{0};
  /*! \brief internal buffer */
  std::string buffer_;
 };
@@ -72,7 +71,7 @@ class FixedSizeStream : public PeekableInStream {
  void Take(std::string* out);

 private:
-  size_t pointer_;
+  size_t pointer_{0};
  std::string buffer_;
 };

--- a/src/common/json.cc
+++ b/src/common/json.cc
@@ -710,10 +710,10 @@ void Json::Dump(Json json, JsonWriter* writer) {
  writer->Save(json);
 }

-static_assert(std::is_nothrow_move_constructible<Json>::value, "");
-static_assert(std::is_nothrow_move_constructible<Object>::value, "");
-static_assert(std::is_nothrow_move_constructible<Array>::value, "");
-static_assert(std::is_nothrow_move_constructible<String>::value, "");
+static_assert(std::is_nothrow_move_constructible<Json>::value);
+static_assert(std::is_nothrow_move_constructible<Object>::value);
+static_assert(std::is_nothrow_move_constructible<Array>::value);
+static_assert(std::is_nothrow_move_constructible<String>::value);

 Json UBJReader::ParseArray() {
  auto marker = PeekNextChar();
--- a/src/common/numeric.cc
+++ b/src/common/numeric.cc
@@ -14,7 +14,7 @@ double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
  if (ctx->IsCPU()) {
    auto const& h_values = values.ConstHostVector();
    auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0);
-    static_assert(std::is_same<decltype(result), double>::value, "");
+    static_assert(std::is_same<decltype(result), double>::value);
    return result;
  }
  return cuda_impl::Reduce(ctx, values);
--- a/src/common/numeric.h
+++ b/src/common/numeric.h
@@ -42,8 +42,8 @@ void RunLengthEncode(Iter begin, Iter end, std::vector<Idx>* p_out) {
 */
 template <typename InIt, typename OutIt, typename T>
 void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
-  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
-  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
+  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value);
+  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value);
  // The number of threads is pegged to the batch size. If the OMP block is parallelized
  // on anything other than the batch/block size, it should be reassigned
  auto n = static_cast<size_t>(std::distance(begin, end));
--- a/src/common/partition_builder.h
+++ b/src/common/partition_builder.h
@@ -31,6 +31,8 @@ namespace common {
 // BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature
 template<size_t BlockSize>
 class PartitionBuilder {
+  using BitVector = RBitField8;
+
 public:
  template<typename Func>
  void Init(const size_t n_tasks, size_t n_nodes, Func funcNTask) {
@@ -121,27 +123,11 @@ class PartitionBuilder {
    bool default_left = tree[nid].DefaultLeft();
    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
    auto node_cats = tree.NodeCats(nid);
-
-    auto const& index = gmat.index;
    auto const& cut_values = gmat.cut.Values();
-    auto const& cut_ptrs = gmat.cut.Ptrs();
-
-    auto gidx_calc = [&](auto ridx) {
-      auto begin = gmat.RowIdx(ridx);
-      if (gmat.IsDense()) {
-        return static_cast<bst_bin_t>(index[begin + fid]);
-      }
-      auto end = gmat.RowIdx(ridx + 1);
-      auto f_begin = cut_ptrs[fid];
-      auto f_end = cut_ptrs[fid + 1];
-      // bypassing the column matrix as we need the cut value instead of bin idx for categorical
-      // features.
-      return BinarySearchBin(begin, end, index, f_begin, f_end);
-    };

    auto pred_hist = [&](auto ridx, auto bin_id) {
      if (any_cat && is_cat) {
-        auto gidx = gidx_calc(ridx);
+        auto gidx = gmat.GetGindex(ridx, fid);
        bool go_left = default_left;
        if (gidx > -1) {
          go_left = Decision(node_cats, cut_values[gidx]);
@@ -153,7 +139,7 @@ class PartitionBuilder {
    };

    auto pred_approx = [&](auto ridx) {
-      auto gidx = gidx_calc(ridx);
+      auto gidx = gmat.GetGindex(ridx, fid);
      bool go_left = default_left;
      if (gidx > -1) {
        if (is_cat) {
@@ -199,6 +185,84 @@ class PartitionBuilder {
    SetNRightElems(node_in_set, range.begin(), n_right);
  }

+  /**
+   * @brief When data is split by column, not all features are available on the current worker.
+   * For every row in the range, set its bit in `decision_bits` when the local split decision is
+   * to go left, or its bit in `missing_bits` when the feature value used for the split is missing.
+   */
+  void MaskRows(const size_t node_in_set, std::vector<xgboost::tree::CPUExpandEntry> const &nodes,
+                const common::Range1d range, GHistIndexMatrix const& gmat,
+                const common::ColumnMatrix& column_matrix,
+                const RegTree& tree, const size_t* rid,
+                BitVector* decision_bits, BitVector* missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());  // rows to visit
+    std::size_t nid = nodes[node_in_set].nid;
+    bst_feature_t fid = tree[nid].SplitIndex();  // feature used by this node's split
+    bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical;
+    auto node_cats = tree.NodeCats(nid);
+    auto const& cut_values = gmat.cut.Values();
+
+    if (!column_matrix.IsInitialized()) {
+      for (auto row_id : rid_span) {
+        auto gidx = gmat.GetGindex(row_id, fid);
+        if (gidx > -1) {  // a bin was found for this row and feature
+          bool go_left = false;
+          if (is_cat) {
+            go_left = Decision(node_cats, cut_values[gidx]);
+          } else {
+            go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
+          }
+          if (go_left) {
+            decision_bits->Set(row_id - gmat.base_rowid);  // bits are indexed relative to the page
+          }
+        } else {
+          missing_bits->Set(row_id - gmat.base_rowid);  // no bin: record the value as missing
+        }
+      }
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+  }
+
+  /**
+   * @brief Once the decision and missing bits from all the workers have been aggregated, use
+   * them to partition the rows: missing rows follow the node's default direction.
+   */
+  void PartitionByMask(const size_t node_in_set,
+                       std::vector<xgboost::tree::CPUExpandEntry> const& nodes,
+                       const common::Range1d range, GHistIndexMatrix const& gmat,
+                       const common::ColumnMatrix& column_matrix, const RegTree& tree,
+                       const size_t* rid, BitVector const& decision_bits,
+                       BitVector const& missing_bits) {
+    common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());  // rows to split
+    common::Span<size_t> left = GetLeftBuffer(node_in_set, range.begin(), range.end());
+    common::Span<size_t> right = GetRightBuffer(node_in_set, range.begin(), range.end());
+    std::size_t nid = nodes[node_in_set].nid;
+    bool default_left = tree[nid].DefaultLeft();
+
+    auto pred_approx = [&](auto ridx) {  // true when the row goes to the left child
+      bool go_left = default_left;  // kept as-is when the row's missing bit is set
+      bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
+      if (!is_missing) {
+        go_left = decision_bits.Check(ridx - gmat.base_rowid);  // a set decision bit means left
+      }
+      return go_left;
+    };
+
+    std::pair<size_t, size_t> child_nodes_sizes;  // (n_left, n_right)
+    if (!column_matrix.IsInitialized()) {
+      child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
+    } else {
+      LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
+    }
+
+    const size_t n_left  = child_nodes_sizes.first;
+    const size_t n_right = child_nodes_sizes.second;
+
+    SetNLeftElems(node_in_set, range.begin(), n_left);  // record the child sizes for this range
+    SetNRightElems(node_in_set, range.begin(), n_right);
+  }
+
  // allocate thread local memory, should be called for each specific task
  void AllocateForTask(size_t id) {
    if (mem_blocks_[id].get() == nullptr) {
--- a/src/common/quantile.cu
+++ b/src/common/quantile.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost Contributors
+/**
+ * Copyright 2020-2023 by XGBoost Contributors
 */
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -109,7 +109,7 @@ void PruneImpl(common::Span<SketchContainer::OffsetT const> cuts_ptr,
 template <typename T, typename U>
 void CopyTo(Span<T> out, Span<U> src) {
  CHECK_EQ(out.size(), src.size());
-  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value, "");
+  static_assert(std::is_same<std::remove_cv_t<T>, std::remove_cv_t<T>>::value);
  dh::safe_cuda(cudaMemcpyAsync(out.data(), src.data(),
                                out.size_bytes(),
                                cudaMemcpyDefault));
@@ -143,7 +143,7 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
      thrust::make_zip_iterator(thrust::make_tuple(b_ind_iter, place_holder));

  dh::XGBCachingDeviceAllocator<Tuple> alloc;
-  static_assert(sizeof(Tuple) == sizeof(SketchEntry), "");
+  static_assert(sizeof(Tuple) == sizeof(SketchEntry));
  // We reuse the memory for storing merge path.
  common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
  // Determine the merge path, 0 if element is from x, 1 if it's from y.
--- a/src/common/random.cc
+++ b/src/common/random.cc
@@ -24,8 +24,9 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
    for (size_t i = 0; i < h_features.size(); ++i) {
      weights[i] = feature_weights_[h_features[i]];
    }
+    CHECK(ctx_);
    new_features.HostVector() =
-        WeightedSamplingWithoutReplacement(p_features->HostVector(), weights, n);
+        WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
  } else {
    new_features.Resize(features.size());
    std::copy(features.begin(), features.end(), new_features.HostVector().begin());
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -20,7 +20,9 @@
 #include <vector>

 #include "../collective/communicator-inl.h"
+#include "algorithm.h"  // ArgSort
 #include "common.h"
+#include "xgboost/context.h"  // Context
 #include "xgboost/host_device_vector.h"

 namespace xgboost {
@@ -87,8 +89,8 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
 * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
 */
 template <typename T>
-std::vector<T> WeightedSamplingWithoutReplacement(
-    std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
+std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vector<T> const& array,
+                                                  std::vector<float> const& weights, size_t n) {
  // ES sampling.
  CHECK_EQ(array.size(), weights.size());
  std::vector<float> keys(weights.size());
@@ -100,7 +102,7 @@ std::vector<T> WeightedSamplingWithoutReplacement(
    auto k = std::log(u) / w;
    keys[i] = k;
  }
-  auto ind = ArgSort<size_t>(Span<float>{keys}, std::greater<>{});
+  auto ind = ArgSort<std::size_t>(ctx, keys.data(), keys.data() + keys.size(), std::greater<>{});
  ind.resize(n);

  std::vector<T> results(ind.size());
@@ -126,6 +128,7 @@ class ColumnSampler {
  float colsample_bytree_{1.0f};
  float colsample_bynode_{1.0f};
  GlobalRandomEngine rng_;
+  Context const* ctx_;

 public:
  std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
@@ -157,12 +160,13 @@ class ColumnSampler {
   * \param colsample_bytree
   * \param skip_index_0      (Optional) True to skip index 0.
   */
-  void Init(int64_t num_col, std::vector<float> feature_weights, float colsample_bynode,
-            float colsample_bylevel, float colsample_bytree) {
+  void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
    feature_weights_ = std::move(feature_weights);
    colsample_bylevel_ = colsample_bylevel;
    colsample_bytree_ = colsample_bytree;
    colsample_bynode_ = colsample_bynode;
+    ctx_ = ctx;

    if (feature_set_tree_ == nullptr) {
      feature_set_tree_ = std::make_shared<HostDeviceVector<bst_feature_t>>();
--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -77,14 +77,14 @@ class RowSetCollection {
    if (row_indices_.empty()) {  // edge case: empty instance set
      constexpr size_t* kBegin = nullptr;
      constexpr size_t* kEnd = nullptr;
-      static_assert(kEnd - kBegin == 0, "");
-      elem_of_each_node_.emplace_back(Elem(kBegin, kEnd, 0));
+      static_assert(kEnd - kBegin == 0);
+      elem_of_each_node_.emplace_back(kBegin, kEnd, 0);
      return;
    }

    const size_t* begin = dmlc::BeginPtr(row_indices_);
    const size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size();
-    elem_of_each_node_.emplace_back(Elem(begin, end, 0));
+    elem_of_each_node_.emplace_back(begin, end, 0);
  }

  std::vector<size_t>* Data() { return &row_indices_; }
--- a/src/common/stats.cc
+++ b/src/common/stats.cc
@@ -35,11 +35,11 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
    auto iter = linalg::cbegin(ti_v);
    float q{0};
    if (opt_weights.Empty()) {
-      q = common::Quantile(0.5, iter, iter + ti_v.Size());
+      q = common::Quantile(ctx, 0.5, iter, iter + ti_v.Size());
    } else {
      CHECK_NE(t_v.Shape(1), 0);
      auto w_it = common::MakeIndexTransformIter([&](std::size_t i) { return opt_weights[i]; });
-      q = common::WeightedQuantile(0.5, iter, iter + ti_v.Size(), w_it);
+      q = common::WeightedQuantile(ctx, 0.5, iter, iter + ti_v.Size(), w_it);
    }
    h_out(i) = q;
  }
--- a/src/common/stats.h
+++ b/src/common/stats.h
@@ -4,46 +4,52 @@
 #ifndef XGBOOST_COMMON_STATS_H_
 #define XGBOOST_COMMON_STATS_H_
 #include <algorithm>
-#include <iterator>
+#include <iterator>  // for distance
 #include <limits>
 #include <vector>

+#include "algorithm.h"           // for StableSort
 #include "common.h"              // AssertGPUSupport, OptionalWeights
 #include "optional_weight.h"     // OptionalWeights
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/context.h"     // Context
-#include "xgboost/linalg.h"
-#include "xgboost/logging.h"  // CHECK_GE
+#include "xgboost/linalg.h"      // TensorView,VectorView
+#include "xgboost/logging.h"     // CHECK_GE

 namespace xgboost {
 namespace common {

 /**
- * \brief Percentile with masked array using linear interpolation.
+ * @brief Quantile using linear interpolation.
 *
 *   https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
 *
- * \param alpha Percentile, must be in range [0, 1].
+ * \param alpha Quantile, must be in range [0, 1].
 * \param begin Iterator begin for input array.
 * \param end   Iterator end for input array.
 *
 * \return The result of interpolation.
 */
 template <typename Iter>
-float Quantile(double alpha, Iter const& begin, Iter const& end) {
+float Quantile(Context const* ctx, double alpha, Iter const& begin, Iter const& end) {
  CHECK(alpha >= 0 && alpha <= 1);
  auto n = static_cast<double>(std::distance(begin, end));
  if (n == 0) {
    return std::numeric_limits<float>::quiet_NaN();
  }

-  std::vector<size_t> sorted_idx(n);
+  std::vector<std::size_t> sorted_idx(n);
  std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
-  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
-                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+  if (omp_in_parallel()) {
+    std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                     [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  } else {
+    StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
+               [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  }

  auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
-  static_assert(std::is_same<decltype(val(0)), float>::value, "");
+  static_assert(std::is_same<decltype(val(0)), float>::value);

  if (alpha <= (1 / (n + 1))) {
    return val(0);
@@ -51,7 +57,7 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
  if (alpha >= (n / (n + 1))) {
    return val(sorted_idx.size() - 1);
  }
-  assert(n != 0 && "The number of rows in a leaf can not be zero.");
+
  double x = alpha * static_cast<double>((n + 1));
  double k = std::floor(x) - 1;
  CHECK_GE(k, 0);
@@ -66,30 +72,35 @@ float Quantile(double alpha, Iter const& begin, Iter const& end) {
 * \brief Calculate the weighted quantile with step function. Unlike the unweighted
 *        version, no interpolation is used.
 *
- *   See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
+ *   See https://aakinshin.net/posts/weighted-quantiles/ for some discussions on computing
 *   weighted quantile with interpolation.
 */
 template <typename Iter, typename WeightIter>
-float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
+float WeightedQuantile(Context const* ctx, double alpha, Iter begin, Iter end, WeightIter w_begin) {
  auto n = static_cast<double>(std::distance(begin, end));
  if (n == 0) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  std::vector<size_t> sorted_idx(n);
  std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
-  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
-                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+  if (omp_in_parallel()) {
+    std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                     [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  } else {
+    StableSort(ctx, sorted_idx.begin(), sorted_idx.end(),
+               [&](std::size_t l, std::size_t r) { return *(begin + l) < *(begin + r); });
+  }

  auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };

  std::vector<float> weight_cdf(n);  // S_n
  // weighted cdf is sorted during construction
-  weight_cdf[0] = *(weights + sorted_idx[0]);
+  weight_cdf[0] = *(w_begin + sorted_idx[0]);
  for (size_t i = 1; i < n; ++i) {
-    weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
+    weight_cdf[i] = weight_cdf[i - 1] + w_begin[sorted_idx[i]];
  }
  float thresh = weight_cdf.back() * alpha;
-  size_t idx =
+  std::size_t idx =
      std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
  idx = std::min(idx, static_cast<size_t>(n - 1));
  return val(idx);