rocm enable for v2.0.1

2023-10-27 18:50:28 -07:00
parent 2e7e9d3b2d a408254c2f
commit 782b73f2bb
447 changed files with 13518 additions and 8719 deletions
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -10,6 +10,7 @@
 #include <dmlc/omp.h>

 #include <cmath>
+#include <cstdint>
 #include <iostream>
 #include <string>
 #include <utility>
@@ -90,8 +91,6 @@ namespace xgboost {

 /*! \brief unsigned integer type used for feature index. */
 using bst_uint = uint32_t;  // NOLINT
-/*! \brief integer type. */
-using bst_int = int32_t;    // NOLINT
 /*! \brief unsigned long integers */
 using bst_ulong = uint64_t;  // NOLINT
 /*! \brief float type, used for storing statistics */
@@ -112,19 +111,23 @@ using bst_row_t = std::size_t;   // NOLINT
 /*! \brief Type for tree node index. */
 using bst_node_t = std::int32_t;      // NOLINT
 /*! \brief Type for ranking group index. */
-using bst_group_t = std::uint32_t;      // NOLINT
+using bst_group_t = std::uint32_t;  // NOLINT
 /**
 * \brief Type for indexing into output targets.
 */
 using bst_target_t = std::uint32_t;  // NOLINT
 /**
- * brief Type for indexing boosted layers.
+ * @brief Type for indexing boosted layers.
 */
 using bst_layer_t = std::int32_t;  // NOLINT
 /**
 * \brief Type for indexing trees.
 */
 using bst_tree_t = std::int32_t;  // NOLINT
+/**
+ * @brief Ordinal of a CUDA device.
+ */
+using bst_d_ordinal_t = std::int16_t;  // NOLINT

 namespace detail {
 /*! \brief Implementation of gradient statistics pair. Template specialisation
@@ -133,9 +136,9 @@ namespace detail {
 template <typename T>
 class GradientPairInternal {
  /*! \brief gradient statistics */
-  T grad_;
+  T grad_{0};
  /*! \brief second order gradient statistics */
-  T hess_;
+  T hess_{0};

  XGBOOST_DEVICE void SetGrad(T g) { grad_ = g; }
  XGBOOST_DEVICE void SetHess(T h) { hess_ = h; }
@@ -152,7 +155,7 @@ class GradientPairInternal {
    a += b;
  }

-  XGBOOST_DEVICE GradientPairInternal() : grad_(0), hess_(0) {}
+  GradientPairInternal() = default;

  XGBOOST_DEVICE GradientPairInternal(T grad, T hess) {
    SetGrad(grad);
@@ -268,10 +271,11 @@ class GradientPairInt64 {
  GradientPairInt64() = default;

  // Copy constructor if of same value type, marked as default to be trivially_copyable
-  GradientPairInt64(const GradientPairInt64 &g) = default;
+  GradientPairInt64(GradientPairInt64 const &g) = default;
+  GradientPairInt64 &operator=(GradientPairInt64 const &g) = default;

-  XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
-  XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }
+  [[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; }
+  [[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; }

  XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) {
    grad_ += rhs.grad_;
@@ -320,17 +324,6 @@ using omp_ulong = dmlc::omp_ulong;  // NOLINT
 using bst_omp_uint = dmlc::omp_uint;  // NOLINT
 /*! \brief Type used for representing version number in binary form.*/
 using XGBoostVersionT = int32_t;
-
-/*!
- * \brief define compatible keywords in g++
- *  Used to support g++-4.6 and g++4.7
- */
-#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
-#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
-#define override
-#define final
-#endif  // __GNUC__ == 4 && __GNUC_MINOR__ < 8
-#endif  // DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__)
 }  // namespace xgboost

 #endif  // XGBOOST_BASE_H_
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -478,7 +478,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
 * \param config   JSON encoded parameters for DMatrix construction.  Accepted fields are:
 *   - missing:      Which value to represent missing value
 *   - nthread (optional): Number of threads used for initializing DMatrix.
- *   - max_bin (optional):  Maximum number of bins for building histogram.
+ *   - max_bin (optional): Maximum number of bins for building histogram.
 * \param out      The created Device Quantile DMatrix
 *
 * \return 0 when success, -1 when failure happens
@@ -810,7 +810,7 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
 */
 XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);

-/*!
+/**
 * \brief Get the predictors from DMatrix as CSR matrix for testing.  If this is a
 *        quantized DMatrix, quantized values are returned instead.
 *
@@ -819,8 +819,10 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
 * XGBoost. This is to avoid allocating a huge memory buffer that can not be freed until
 * exiting the thread.
 *
+ * @since 1.7.0
+ *
 * \param handle the handle to the DMatrix
- * \param config Json configuration string. At the moment it should be an empty document,
+ * \param config JSON configuration string. At the moment it should be an empty document,
 *               preserved for future use.
 * \param out_indptr  indptr of output CSR matrix.
 * \param out_indices Column index of output CSR matrix.
@@ -831,6 +833,24 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
 XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
                                  bst_ulong *out_indptr, unsigned *out_indices, float *out_data);

+/**
+ * @brief Export the quantile cuts used for training histogram-based models like `hist` and
+ *        `approx`. Useful for model compression.
+ *
+ * @since 2.0.0
+ *
+ * @param handle the handle to the DMatrix
+ * @param config JSON configuration string. At the moment it should be an empty document,
+ *               preserved for future use.
+ *
+ * @param out_indptr indptr of output CSC matrix represented by a JSON encoded
+ *                   __(cuda_)array_interface__.
+ * @param out_data   Data value of CSC matrix represented by a JSON encoded
+ *                   __(cuda_)array_interface__.
+ */
+XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
+                                     char const **out_indptr, char const **out_data);
+
 /** @} */  // End of DMatrix

 /**
@@ -1067,6 +1087,9 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat
 /**
 * \brief Inplace prediction from CPU dense matrix.
 *
+ * \note If the booster is configured to run on a CUDA device, XGBoost falls back to running
+ *       prediction with DMatrix and emits a performance warning.
+ *
 * \param handle        Booster handle.
 * \param values        JSON encoded __array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1091,6 +1114,9 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
 /**
 * \brief Inplace prediction from CPU CSR matrix.
 *
+ * \note If the booster is configured to run on a CUDA device, XGBoost falls back to running
+ *       prediction with DMatrix and emits a performance warning.
+ *
 * \param handle        Booster handle.
 * \param indptr        JSON encoded __array_interface__ to row pointer in CSR.
 * \param indices       JSON encoded __array_interface__ to column indices in CSR.
@@ -1116,6 +1142,9 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch
 /**
 * \brief Inplace prediction from CUDA Dense matrix (cupy in Python).
 *
+ * \note If the booster is configured to run on a CPU, XGBoost falls back to running
+ *       prediction with DMatrix and emits a performance warning.
+ *
 * \param handle        Booster handle
 * \param values        JSON encoded __cuda_array_interface__ to values.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1137,6 +1166,9 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu
 /**
 * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python).
 *
+ * \note If the booster is configured to run on a CPU, XGBoost falls back to running
+ *       prediction with DMatrix and emits a performance warning.
+ *
 * \param handle        Booster handle
 * \param values        List of __cuda_array_interface__ for all columns encoded in JSON list.
 * \param config        See \ref XGBoosterPredictFromDMatrix for more info.
@@ -1189,7 +1221,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
 * \brief Load model from existing file
 *
 * \param handle handle
- * \param fname File URI or file name.
+ * \param fname File URI or file name. The string must be UTF-8 encoded.
 * \return 0 when success, -1 when failure happens
 */
 XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
@@ -1198,7 +1230,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
 * \brief Save model into existing file
 *
 * \param handle handle
- * \param fname File URI or file name.
+ * \param fname File URI or file name. The string must be UTF-8 encoded.
 * \return 0 when success, -1 when failure happens
 */
 XGB_DLL int XGBoosterSaveModel(BoosterHandle handle,
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -1,67 +1,192 @@
-/*!
- * Copyright 2014-2022 by Contributors
+/**
+ * Copyright 2014-2023, XGBoost Contributors
 * \file context.h
 */
 #ifndef XGBOOST_CONTEXT_H_
 #define XGBOOST_CONTEXT_H_

-#include <xgboost/logging.h>
-#include <xgboost/parameter.h>
+#include <xgboost/base.h>       // for bst_d_ordinal_t
+#include <xgboost/logging.h>    // for CHECK_GE
+#include <xgboost/parameter.h>  // for XGBoostParameter

-#include <memory>  // std::shared_ptr
-#include <string>
+#include <cstdint>      // for int16_t, int32_t, int64_t
+#include <memory>       // for shared_ptr
+#include <string>       // for string, to_string
+#include <type_traits>  // for invoke_result_t, is_same_v, underlying_type_t

 namespace xgboost {

 struct CUDAContext;

+// symbolic names
+struct DeviceSym {
+  static auto constexpr CPU() { return "cpu"; }
+  static auto constexpr CUDA() { return "cuda"; }
+};
+
+/**
+ * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in
+ *        viewing types like `linalg::TensorView`.
+ */
+struct DeviceOrd {
+  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
+  // CUDA device ordinal.
+  bst_d_ordinal_t ordinal{-1};
+
+  [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
+  [[nodiscard]] bool IsCPU() const { return device == kCPU; }
+
+  DeviceOrd() = default;
+  constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
+
+  DeviceOrd(DeviceOrd const& that) = default;
+  DeviceOrd& operator=(DeviceOrd const& that) = default;
+  DeviceOrd(DeviceOrd&& that) = default;
+  DeviceOrd& operator=(DeviceOrd&& that) = default;
+
+  /**
+   * @brief Constructor for CPU.
+   */
+  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
+  /**
+   * @brief Constructor for CUDA device.
+   *
+   * @param ordinal CUDA device ordinal.
+   */
+  [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
+
+  [[nodiscard]] bool operator==(DeviceOrd const& that) const {
+    return device == that.device && ordinal == that.ordinal;
+  }
+  [[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); }
+  /**
+   * @brief Get a string representation of the device and the ordinal.
+   */
+  [[nodiscard]] std::string Name() const {
+    switch (device) {
+      case DeviceOrd::kCPU:
+        return DeviceSym::CPU();
+      case DeviceOrd::kCUDA:
+        return DeviceSym::CUDA() + (':' + std::to_string(ordinal));
+      default: {
+        LOG(FATAL) << "Unknown device.";
+        return "";
+      }
+    }
+  }
+};
+
+static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
+
+/**
+ * @brief Runtime context for XGBoost. Contains information like threads and device.
+ */
 struct Context : public XGBoostParameter<Context> {
+ private:
+  std::string device{DeviceSym::CPU()};  // NOLINT
+  // The device object for the current context. We are in the middle of replacing the
+  // `gpu_id` with this device field.
+  DeviceOrd device_{DeviceOrd::CPU()};
+
 public:
  // Constant representing the device ID of CPU.
-  static std::int32_t constexpr kCpuId = -1;
+  static bst_d_ordinal_t constexpr kCpuId = -1;
+  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
  static std::int64_t constexpr kDefaultSeed = 0;

 public:
  Context();

+  template <typename Container>
+  Args UpdateAllowUnknown(Container const& kwargs) {
+    auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
+    this->SetDeviceOrdinal(kwargs);
+    return args;
+  }
+
+  std::int32_t gpu_id{kCpuId};
+  // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
+  std::int32_t nthread{0};  // NOLINT
  // stored random seed
  std::int64_t seed{kDefaultSeed};
  // whether seed the PRNG each iteration
  bool seed_per_iteration{false};
-  // number of threads to use if OpenMP is enabled
-  // if equals 0, use system default
-  std::int32_t nthread{0};
-  // primary device, -1 means no gpu.
-  std::int32_t gpu_id{kCpuId};
  // fail when gpu_id is invalid
  bool fail_on_invalid_gpu_id{false};
  bool validate_parameters{false};

-  /*!
-   * \brief Configure the parameter `gpu_id'.
+  /**
+   * @brief Configure the parameter `gpu_id`.
   *
-   * \param require_gpu  Whether GPU is explicitly required from user.
+   * @param require_gpu Whether GPU is explicitly required by the user through other
+   *                    configurations.
   */
  void ConfigureGpuId(bool require_gpu);
-  /*!
-   * Return automatically chosen threads.
+  /**
+   * @brief Returns the automatically chosen number of threads based on the `nthread`
+   *        parameter and the system setting.
   */
-  std::int32_t Threads() const;
+  [[nodiscard]] std::int32_t Threads() const;
+  /**
+   * @brief Is XGBoost running on CPU?
+   */
+  [[nodiscard]] bool IsCPU() const { return Device().IsCPU(); }
+  /**
+   * @brief Is XGBoost running on a CUDA device?
+   */
+  [[nodiscard]] bool IsCUDA() const { return Device().IsCUDA(); }
+  /**
+   * @brief Get the current device and ordinal.
+   */
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
+  /**
+   * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU.
+   */
+  [[nodiscard]] bst_d_ordinal_t Ordinal() const { return Device().ordinal; }
+  /**
+   * @brief Name of the current device.
+   */
+  [[nodiscard]] std::string DeviceName() const { return Device().Name(); }
+  /**
+   * @brief Get a CUDA device context for allocator and stream.
+   */
+  [[nodiscard]] CUDAContext const* CUDACtx() const;

-  bool IsCPU() const { return gpu_id == kCpuId; }
-  bool IsCUDA() const { return !IsCPU(); }
-
-  CUDAContext const* CUDACtx() const;
-  // Make a CUDA context based on the current context.
-  Context MakeCUDA(std::int32_t device = 0) const {
+  /**
+   * @brief Make a CUDA context based on the current context.
+   *
+   * @param ordinal The CUDA device ordinal.
+   */
+  [[nodiscard]] Context MakeCUDA(bst_d_ordinal_t ordinal = 0) const {
    Context ctx = *this;
-    ctx.gpu_id = device;
-    return ctx;
+    return ctx.SetDevice(DeviceOrd::CUDA(ordinal));
  }
-  Context MakeCPU() const {
+  /**
+   * @brief Make a CPU context based on the current context.
+   */
+  [[nodiscard]] Context MakeCPU() const {
    Context ctx = *this;
-    ctx.gpu_id = kCpuId;
-    return ctx;
+    return ctx.SetDevice(DeviceOrd::CPU());
+  }
+  /**
+   * @brief Call function based on the current device.
+   */
+  template <typename CPUFn, typename CUDAFn>
+  decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const {
+    static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
+    switch (this->Device().device) {
+      case DeviceOrd::kCPU:
+        return cpu_fn();
+      case DeviceOrd::kCUDA:
+        return cuda_fn();
+      default:
+        // Do not use the device name as this is likely an internal error, the name
+        // wouldn't be valid.
+        LOG(FATAL) << "Unknown device type:"
+                   << static_cast<std::underlying_type_t<DeviceOrd::Type>>(this->Device().device);
+        break;
+    }
+    return std::invoke_result_t<CPUFn>();
  }

  // declare parameters
@@ -73,11 +198,9 @@ struct Context : public XGBoostParameter<Context> {
    DMLC_DECLARE_FIELD(seed_per_iteration)
        .set_default(false)
        .describe("Seed PRNG determnisticly via iterator number.");
+    DMLC_DECLARE_FIELD(device).set_default(DeviceSym::CPU()).describe("Device ordinal.");
    DMLC_DECLARE_FIELD(nthread).set_default(0).describe("Number of threads to use.");
    DMLC_DECLARE_ALIAS(nthread, n_jobs);
-
-    DMLC_DECLARE_FIELD(gpu_id).set_default(-1).set_lower_bound(-1).describe(
-        "The primary GPU device ordinal.");
    DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id)
        .set_default(false)
        .describe("Fail with error when gpu_id is invalid.");
@@ -87,9 +210,17 @@ struct Context : public XGBoostParameter<Context> {
  }

 private:
-  // mutable for lazy initialization for cuda context to avoid initializing CUDA at load.
-  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl
-  // while trying to hide CUDA code from host compiler.
+  void SetDeviceOrdinal(Args const& kwargs);
+  Context& SetDevice(DeviceOrd d) {
+    this->device_ = d;
+    this->gpu_id = d.ordinal;  // this can be removed once we move away from `gpu_id`.
+    this->device = d.Name();
+    return *this;
+  }
+
+  // mutable for lazy cuda context initialization. This avoids initializing CUDA at load.
+  // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define
+  // p_impl while trying to hide CUDA code from the host compiler.
  mutable std::shared_ptr<CUDAContext> cuctx_;
  // cached value for CFS CPU limit. (used in containerized env)
  std::int32_t cfs_cpu_count_;  // NOLINT
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -185,10 +185,10 @@ class MetaInfo {
    return data_split_mode == DataSplitMode::kRow;
  }

-  /*! \brief Whether the data is split column-wise. */
-  bool IsColumnSplit() const {
-    return data_split_mode == DataSplitMode::kCol;
-  }
+  /** @brief Whether the data is split column-wise. */
+  bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; }
+  /** @brief Whether this is learning-to-rank data. */
+  bool IsRanking() const { return !group_ptr_.empty(); }

  /*!
   * \brief A convenient method to check if we are doing vertical federated learning, which requires
@@ -249,7 +249,7 @@ struct BatchParam {
  /**
   * \brief Hessian, used for sketching with future approx implementation.
   */
-  common::Span<float> hess;
+  common::Span<float const> hess;
  /**
   * \brief Whether should we force DMatrix to regenerate the batch.  Only used for
   *        GHistIndex.
@@ -279,10 +279,10 @@ struct BatchParam {
   *   Get batch with sketch weighted by hessian.  The batch will be regenerated if the
   *   span is changed, so caller should keep the span for each iteration.
   */
-  BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
+  BatchParam(bst_bin_t max_bin, common::Span<float const> hessian, bool regenerate)
      : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}

-  bool ParamNotEqual(BatchParam const& other) const {
+  [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
    // Check non-floating parameters.
    bool cond = max_bin != other.max_bin;
    // Check sparse thresh.
@@ -293,11 +293,11 @@ struct BatchParam {

    return cond;
  }
-  bool Initialized() const { return max_bin != 0; }
+  [[nodiscard]] bool Initialized() const { return max_bin != 0; }
  /**
   * \brief Make a copy of self for DMatrix to describe how its existing index was generated.
   */
-  BatchParam MakeCache() const {
+  [[nodiscard]] BatchParam MakeCache() const {
    auto p = *this;
    // These parameters have nothing to do with how the gradient index was generated in the
    // first place.
@@ -319,7 +319,7 @@ struct HostSparsePageView {
            static_cast<Inst::index_type>(size)};
  }

-  size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
+  [[nodiscard]] size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
 };

 /*!
@@ -337,7 +337,7 @@ class SparsePage {
  /*! \brief an instance of sparse vector in the batch */
  using Inst = common::Span<Entry const>;

-  HostSparsePageView GetView() const {
+  [[nodiscard]] HostSparsePageView GetView() const {
    return {offset.ConstHostSpan(), data.ConstHostSpan()};
  }

@@ -353,12 +353,12 @@ class SparsePage {
  virtual ~SparsePage() = default;

  /*! \return Number of instances in the page. */
-  inline size_t Size() const {
+  [[nodiscard]] size_t Size() const {
    return offset.Size() == 0 ? 0 : offset.Size() - 1;
  }

  /*! \return estimation of memory cost of this page */
-  inline size_t MemCostBytes() const {
+  [[nodiscard]] size_t MemCostBytes() const {
    return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
  }

@@ -376,7 +376,7 @@ class SparsePage {
    base_rowid = row_id;
  }

-  SparsePage GetTranspose(int num_columns, int32_t n_threads) const;
+  [[nodiscard]] SparsePage GetTranspose(int num_columns, int32_t n_threads) const;

  /**
   * \brief Sort the column index.
@@ -385,7 +385,7 @@ class SparsePage {
  /**
   * \brief Check wether the column index is sorted.
   */
-  bool IsIndicesSorted(int32_t n_threads) const;
+  [[nodiscard]] bool IsIndicesSorted(int32_t n_threads) const;
  /**
   * \brief Reindex the column index with an offset.
   */
@@ -440,49 +440,7 @@ class SortedCSCPage : public SparsePage {
  explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
 };

-class EllpackPageImpl;
-/*!
- * \brief A page stored in ELLPACK format.
- *
- * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
- * including CUDA-specific implementation details in the header.
- */
-class EllpackPage {
- public:
-  /*!
-   * \brief Default constructor.
-   *
-   * This is used in the external memory case. An empty ELLPACK page is constructed with its content
-   * set later by the reader.
-   */
-  EllpackPage();
-
-  /*!
-   * \brief Constructor from an existing DMatrix.
-   *
-   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
-   * in CSR format.
-   */
-  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
-
-  /*! \brief Destructor. */
-  ~EllpackPage();
-
-  EllpackPage(EllpackPage&& that);
-
-  /*! \return Number of instances in the page. */
-  size_t Size() const;
-
-  /*! \brief Set the base row id for this page. */
-  void SetBaseRowId(std::size_t row_id);
-
-  const EllpackPageImpl* Impl() const { return impl_.get(); }
-  EllpackPageImpl* Impl() { return impl_.get(); }
-
- private:
-  std::unique_ptr<EllpackPageImpl> impl_;
-};
-
+class EllpackPage;
 class GHistIndexMatrix;

 template<typename T>
@@ -492,7 +450,7 @@ class BatchIteratorImpl {
  virtual ~BatchIteratorImpl() = default;
  virtual const T& operator*() const = 0;
  virtual BatchIteratorImpl& operator++() = 0;
-  virtual bool AtEnd() const = 0;
+  [[nodiscard]] virtual bool AtEnd() const = 0;
  virtual std::shared_ptr<T const> Page() const = 0;
 };

@@ -519,12 +477,12 @@ class BatchIterator {
    return !impl_->AtEnd();
  }

-  bool AtEnd() const {
+  [[nodiscard]] bool AtEnd() const {
    CHECK(impl_ != nullptr);
    return impl_->AtEnd();
  }

-  std::shared_ptr<T const> Page() const {
+  [[nodiscard]] std::shared_ptr<T const> Page() const {
    return impl_->Page();
  }

@@ -563,15 +521,15 @@ class DMatrix {
    this->Info().SetInfo(ctx, key, StringView{interface_str});
  }
  /*! \brief meta information of the dataset */
-  virtual const MetaInfo& Info() const = 0;
+  [[nodiscard]] virtual const MetaInfo& Info() const = 0;

  /*! \brief Get thread local memory for returning data from DMatrix. */
-  XGBAPIThreadLocalEntry& GetThreadLocal() const;
+  [[nodiscard]] XGBAPIThreadLocalEntry& GetThreadLocal() const;
  /**
   * \brief Get the context object of this DMatrix.  The context is created during construction of
   *        DMatrix with user specified `nthread` parameter.
   */
-  virtual Context const* Ctx() const = 0;
+  [[nodiscard]] virtual Context const* Ctx() const = 0;

  /**
   * \brief Gets batches. Use range based for loop over BatchSet to access individual batches.
@@ -583,16 +541,16 @@ class DMatrix {
  template <typename T>
  BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
  template <typename T>
-  bool PageExists() const;
+  [[nodiscard]] bool PageExists() const;

  // the following are column meta data, should be able to answer them fast.
  /*! \return Whether the data columns single column block. */
-  virtual bool SingleColBlock() const = 0;
+  [[nodiscard]] virtual bool SingleColBlock() const = 0;
  /*! \brief virtual destructor */
  virtual ~DMatrix();

  /*! \brief Whether the matrix is dense. */
-  bool IsDense() const {
+  [[nodiscard]] bool IsDense() const {
    return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
  }

@@ -695,9 +653,9 @@ class DMatrix {
                                                      BatchParam const& param) = 0;
  virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;

-  virtual bool EllpackExists() const = 0;
-  virtual bool GHistIndexExists() const = 0;
-  virtual bool SparsePageExists() const = 0;
+  [[nodiscard]] virtual bool EllpackExists() const = 0;
+  [[nodiscard]] virtual bool GHistIndexExists() const = 0;
+  [[nodiscard]] virtual bool SparsePageExists() const = 0;
 };

 template <>
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -149,18 +149,14 @@ class GradientBooster : public Model, public Configurable {
   * \param layer_begin Beginning of boosted tree layer used for prediction.
   * \param layer_end   End of booster layer. 0 means do not limit trees.
   * \param approximate use a faster (inconsistent) approximation of SHAP values
-   * \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on).
-   * \param condition_feature feature to condition on (i.e. fix) during calculations
   */
-  virtual void PredictContribution(DMatrix* dmat,
-                                   HostDeviceVector<bst_float>* out_contribs,
-                                   unsigned layer_begin, unsigned layer_end,
-                                   bool approximate = false, int condition = 0,
-                                   unsigned condition_feature = 0) = 0;
+  virtual void PredictContribution(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
+                                   bst_layer_t layer_begin, bst_layer_t layer_end,
+                                   bool approximate = false) = 0;

-  virtual void PredictInteractionContributions(
-      DMatrix *dmat, HostDeviceVector<bst_float> *out_contribs,
-      unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
+  virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector<float>* out_contribs,
+                                               bst_layer_t layer_begin, bst_layer_t layer_end,
+                                               bool approximate) = 0;

  /*!
   * \brief dump the model in the requested format
--- a/include/xgboost/global_config.h
+++ b/include/xgboost/global_config.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020 by Contributors
+/**
+ * Copyright 2020-2023, XGBoost Contributors
 * \file global_config.h
 * \brief Global configuration for XGBoost
 * \author Hyunsu Cho
@@ -7,24 +7,22 @@
 #ifndef XGBOOST_GLOBAL_CONFIG_H_
 #define XGBOOST_GLOBAL_CONFIG_H_

-#include <xgboost/parameter.h>
-#include <vector>
-#include <string>
+#include <dmlc/thread_local.h>  // for ThreadLocalStore
+#include <xgboost/parameter.h>  // for XGBoostParameter
+
+#include <cstdint>  // for int32_t

 namespace xgboost {
-class Json;
-
 struct GlobalConfiguration : public XGBoostParameter<GlobalConfiguration> {
-  int verbosity { 1 };
-  bool use_rmm { false };
+  std::int32_t verbosity{1};
+  bool use_rmm{false};
  DMLC_DECLARE_PARAMETER(GlobalConfiguration) {
    DMLC_DECLARE_FIELD(verbosity)
        .set_range(0, 3)
        .set_default(1)  // shows only warning
        .describe("Flag to print out detailed breakdown of runtime.");
-    DMLC_DECLARE_FIELD(use_rmm)
-        .set_default(false)
-        .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
+    DMLC_DECLARE_FIELD(use_rmm).set_default(false).describe(
+        "Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost");
  }
 };

--- a/include/xgboost/host_device_vector.h
+++ b/include/xgboost/host_device_vector.h
@@ -49,11 +49,12 @@
 #ifndef XGBOOST_HOST_DEVICE_VECTOR_H_
 #define XGBOOST_HOST_DEVICE_VECTOR_H_

-#include <initializer_list>
-#include <vector>
-#include <type_traits>
+#include <xgboost/context.h>  // for DeviceOrd
+#include <xgboost/span.h>     // for Span

-#include "span.h"
+#include <initializer_list>
+#include <type_traits>
+#include <vector>

 namespace xgboost {

@@ -133,6 +134,7 @@ class HostDeviceVector {
  GPUAccess DeviceAccess() const;

  void SetDevice(int device) const;
+  void SetDevice(DeviceOrd device) const;

  void Resize(size_t new_size, T v = T());

--- a/include/xgboost/json.h
+++ b/include/xgboost/json.h
@@ -664,11 +664,11 @@ Object ToJson(Parameter const& param) {
 template <typename Parameter>
 Args FromJson(Json const& obj, Parameter* param) {
  auto const& j_param = get<Object const>(obj);
-  std::map<std::string, std::string> m;
+  Args args;
  for (auto const& kv : j_param) {
-    m[kv.first] = get<String const>(kv.second);
+    args.emplace_back(kv.first, get<String const>(kv.second));
  }
-  return param->UpdateAllowUnknown(m);
+  return param->UpdateAllowUnknown(args);
 }
 }  // namespace xgboost
 #endif  // XGBOOST_JSON_H_
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -110,15 +110,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   * \param approx_contribs whether to approximate the feature contributions for speed
   * \param pred_interactions whether to compute the feature pair contributions
   */
-  virtual void Predict(std::shared_ptr<DMatrix> data,
-                       bool output_margin,
-                       HostDeviceVector<bst_float> *out_preds,
-                       unsigned layer_begin,
-                       unsigned layer_end,
-                       bool training = false,
-                       bool pred_leaf = false,
-                       bool pred_contribs = false,
-                       bool approx_contribs = false,
+  virtual void Predict(std::shared_ptr<DMatrix> data, bool output_margin,
+                       HostDeviceVector<bst_float>* out_preds, bst_layer_t layer_begin,
+                       bst_layer_t layer_end, bool training = false, bool pred_leaf = false,
+                       bool pred_contribs = false, bool approx_contribs = false,
                       bool pred_interactions = false) = 0;

  /*!
@@ -132,8 +127,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   * \param          layer_end   End of booster layer. 0 means do not limit trees.
   */
  virtual void InplacePredict(std::shared_ptr<DMatrix> p_m, PredictionType type, float missing,
-                              HostDeviceVector<bst_float>** out_preds, uint32_t layer_begin,
-                              uint32_t layer_end) = 0;
+                              HostDeviceVector<float>** out_preds, bst_layer_t layer_begin,
+                              bst_layer_t layer_end) = 0;

  /*!
   * \brief Calculate feature score.  See doc in C API for outputs.
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -574,7 +574,9 @@ template <typename Container, typename... S,
          std::enable_if_t<!common::detail::IsSpan<Container>::value &&
                           !std::is_pointer_v<Container>> * = nullptr>
 auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) {  // NOLINT
-  using T = typename Container::value_type;
+  using T = std::conditional_t<std::is_const_v<Container>,
+                               std::add_const_t<typename Container::value_type>,
+                               typename Container::value_type>;
  std::size_t in_shape[sizeof...(S)];
  detail::IndexToArr(in_shape, std::forward<S>(shape)...);
  return TensorView<T, sizeof...(S)>{data, in_shape, ctx->gpu_id};
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -6,24 +6,22 @@
 */
 #pragma once
 #include <xgboost/base.h>
-#include <xgboost/cache.h>  // DMatrixCache
+#include <xgboost/cache.h>    // for DMatrixCache
+#include <xgboost/context.h>  // for Context
 #include <xgboost/context.h>
 #include <xgboost/data.h>
 #include <xgboost/host_device_vector.h>

-#include <functional>  // std::function
-#include <memory>
+#include <functional>  // for function
+#include <memory>      // for shared_ptr
 #include <string>
-#include <thread>   // for get_id
 #include <utility>  // for make_pair
 #include <vector>

 // Forward declarations
-namespace xgboost {
-namespace gbm {
+namespace xgboost::gbm {
 struct GBTreeModel;
-}  // namespace gbm
-}  // namespace xgboost
+}  // namespace xgboost::gbm

 namespace xgboost {
 /**
@@ -41,9 +39,8 @@ struct PredictionCacheEntry {
   *
   * \param v Added versions.
   */
-  void Update(std::uint32_t v) {
-    version += v;
-  }
+  void Update(std::uint32_t v) { version += v; }
+  void Reset() { version = 0; }
 };

 /**
--- a/include/xgboost/version_config.h
+++ b/include/xgboost/version_config.h
@@ -6,6 +6,6 @@

 #define XGBOOST_VER_MAJOR 2  /* NOLINT */
 #define XGBOOST_VER_MINOR 0  /* NOLINT */
-#define XGBOOST_VER_PATCH 0  /* NOLINT */
+#define XGBOOST_VER_PATCH 1  /* NOLINT */

 #endif  // XGBOOST_VERSION_CONFIG_H_