merge 23Mar01

2023-05-02 00:05:58 +02:00
parent 313a74b582 08ce495b5d
commit 5446c501af
258 changed files with 7471 additions and 5379 deletions
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -38,7 +38,7 @@ typedef uint64_t bst_ulong;  // NOLINT(*)
 */

 /**
- * @defgroup Library
+ * @defgroup Library Library
 *
 * These functions are used to obtain general information about XGBoost including version,
 * build info and current global configuration.
@@ -112,7 +112,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
 /**@}*/

 /**
- * @defgroup DMatrix
+ * @defgroup DMatrix DMatrix
 *
 * @brief DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms
 *        including training, prediction and explanation. There are a few variants of
@@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
 /*!
 * \brief load a data matrix
 * \param config JSON encoded parameters for DMatrix construction.  Accepted fields are:
- *   - uri: The URI of the input file.
+
+ *   - uri: The URI of the input file. The URI parameter `format` is required when loading text data.
+ *          \verbatim embed:rst:leading-asterisk
+ *            See :doc:`/tutorials/input_format` for more info.
+ *          \endverbatim
 *   - silent (optional): Whether to print message during loading. Default to true.
 *   - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
 *     file is split accordingly; otherwise this is only an indicator on how the file was split
@@ -200,7 +204,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
 * \return 0 when success, -1 when failure happens
 */
 XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
-                                   bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
+                                   bst_ulong nrow, char const *config, DMatrixHandle *out);

 /*!
 * \brief create a matrix content from CSC format
@@ -281,7 +285,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *
                                                  DMatrixHandle *out);

 /**
- * @defgroup Streaming
+ * @defgroup Streaming Streaming
 * @ingroup DMatrix
 *
 * @brief Quantile DMatrix and external memory DMatrix can be created from batches of
@@ -431,7 +435,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
 * - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
 * - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle.
 * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *           `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
+ *           \ref XGDMatrixCreateFromCallback, along with other parameters encoded as a JSON object.
 * - Step 3: Call appropriate data setters in `next` functions.
 *
 * \param iter    A handle to external data iterator.
@@ -830,7 +834,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
 /** @} */  // End of DMatrix

 /**
- * @defgroup Booster
+ * @defgroup Booster Booster
 *
 * @brief The `Booster` class is the gradient-boosted model for XGBoost.
 * @{
@@ -953,7 +957,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle d
 */

 /**
- * @defgroup Prediction
+ * @defgroup Prediction Prediction
 * @ingroup Booster
 *
 * @brief These functions are used for running prediction and explanation algorithms.
@@ -1155,7 +1159,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v


 /**
- * @defgroup Serialization
+ * @defgroup Serialization Serialization
 * @ingroup Booster
 *
 * @brief There are multiple ways to serialize a Booster object depending on the use case.
@@ -1490,7 +1494,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 /**@}*/  // End of Booster

 /**
- * @defgroup Collective
+ * @defgroup Collective Collective
 *
 * @brief Experimental support for exposing internal communicator in XGBoost.
 *
--- a/include/xgboost/context.h
+++ b/include/xgboost/context.h
@@ -50,7 +50,19 @@ struct Context : public XGBoostParameter<Context> {

  bool IsCPU() const { return gpu_id == kCpuId; }
  bool IsCUDA() const { return !IsCPU(); }
+
  CUDAContext const* CUDACtx() const;
+  // Make a CUDA context based on the current context.
+  Context MakeCUDA(std::int32_t device = 0) const {
+    Context ctx = *this;
+    ctx.gpu_id = device;
+    return ctx;
+  }
+  Context MakeCPU() const {
+    Context ctx = *this;
+    ctx.gpu_id = kCpuId;
+    return ctx;
+  }

  // declare parameters
  DMLC_DECLARE_PARAMETER(Context) {
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) 2015-2022 by XGBoost Contributors
+/**
+ * Copyright 2015-2023 by XGBoost Contributors
 * \file data.h
 * \brief The input data structure of xgboost.
 * \author Tianqi Chen
@@ -196,6 +196,14 @@ class MetaInfo {
   */
  bool IsVerticalFederated() const;

+  /*!
+   * \brief A convenient method to check if the MetaInfo should contain labels.
+   *
+   * Normally we assume labels are available everywhere. The only exception is in vertical federated
+   * learning where labels are only available on worker 0.
+   */
+  bool ShouldHaveLabels() const;
+
 private:
  void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
  void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@@ -230,44 +238,72 @@ struct Entry {
  }
 };

-/*!
- * \brief Parameters for constructing batches.
+/**
+ * \brief Parameters for constructing histogram index batches.
 */
 struct BatchParam {
-  /*! \brief The GPU device to use. */
-  int gpu_id {-1};
-  /*! \brief Maximum number of bins per feature for histograms. */
+  /**
+   * \brief Maximum number of bins per feature for histograms.
+   */
  bst_bin_t max_bin{0};
-  /*! \brief Hessian, used for sketching with future approx implementation. */
+  /**
+   * \brief Hessian, used for sketching with future approx implementation.
+   */
  common::Span<float> hess;
-  /*! \brief Whether should DMatrix regenerate the batch.  Only used for GHistIndex. */
-  bool regen {false};
-  /*! \brief Parameter used to generate column matrix for hist. */
+  /**
+   * \brief Whether we should force DMatrix to regenerate the batch.  Only used for
+   *        GHistIndex.
+   */
+  bool regen{false};
+  /**
+   * \brief Forbid regenerating the gradient index. Used for internal validation.
+   */
+  bool forbid_regen{false};
+  /**
+   * \brief Parameter used to generate column matrix for hist.
+   */
  double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};

+  /**
+   * \brief Exact or others that don't need histogram.
+   */
  BatchParam() = default;
-  // GPU Hist
-  BatchParam(int32_t device, bst_bin_t max_bin)
-      : gpu_id{device}, max_bin{max_bin} {}
-  // Hist
+  /**
+   * \brief Used by the hist tree method.
+   */
  BatchParam(bst_bin_t max_bin, double sparse_thresh)
      : max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
-  // Approx
  /**
-   * \brief Get batch with sketch weighted by hessian.  The batch will be regenerated if
-   *        the span is changed, so caller should keep the span for each iteration.
+   * \brief Used by the approx tree method.
+   *
+   *   Get batch with sketch weighted by hessian.  The batch will be regenerated if the
+   *   span is changed, so caller should keep the span for each iteration.
   */
  BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
      : max_bin{max_bin}, hess{hessian}, regen{regenerate} {}

-  bool operator!=(BatchParam const& other) const {
-    if (hess.empty() && other.hess.empty()) {
-      return gpu_id != other.gpu_id || max_bin != other.max_bin;
-    }
-    return gpu_id != other.gpu_id || max_bin != other.max_bin || hess.data() != other.hess.data();
+  bool ParamNotEqual(BatchParam const& other) const {
+    // Check non-floating parameters.
+    bool cond = max_bin != other.max_bin;
+    // Check sparse thresh.
+    bool l_nan = std::isnan(sparse_thresh);
+    bool r_nan = std::isnan(other.sparse_thresh);
+    bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
+    cond |= st_chg;
+
+    return cond;
  }
-  bool operator==(BatchParam const& other) const {
-    return !(*this != other);
+  bool Initialized() const { return max_bin != 0; }
+  /**
+   * \brief Make a copy of self for DMatrix to describe how its existing index was generated.
+   */
+  BatchParam MakeCache() const {
+    auto p = *this;
+    // These parameters have nothing to do with how the gradient index was generated in the
+    // first place.
+    p.regen = false;
+    p.forbid_regen = false;
+    return p;
  }
 };

@@ -427,7 +463,7 @@ class EllpackPage {
   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
   * in CSR format.
   */
-  explicit EllpackPage(DMatrix* dmat, const BatchParam& param);
+  explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);

  /*! \brief Destructor. */
  ~EllpackPage();
@@ -543,7 +579,9 @@ class DMatrix {
  template <typename T>
  BatchSet<T> GetBatches();
  template <typename T>
-  BatchSet<T> GetBatches(const BatchParam& param);
+  BatchSet<T> GetBatches(Context const* ctx);
+  template <typename T>
+  BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
  template <typename T>
  bool PageExists() const;

@@ -558,21 +596,17 @@ class DMatrix {
    return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
  }

-  /*!
+  /**
   * \brief Load DMatrix from URI.
+   *
   * \param uri The URI of input.
   * \param silent Whether to print information during loading.
   * \param data_split_mode In distributed mode, split the input according to this mode; otherwise,
   *                        it's just an indicator on how the input was split beforehand.
-   * \param file_format The format type of the file, used for dmlc::Parser::Create.
-   *   By default "auto" will be able to load in both local binary file.
-   * \param page_size Page size for external memory.
   * \return The created DMatrix.
   */
-  static DMatrix* Load(const std::string& uri,
-                       bool silent = true,
-                       DataSplitMode data_split_mode = DataSplitMode::kRow,
-                       const std::string& file_format = "auto");
+  static DMatrix* Load(const std::string& uri, bool silent = true,
+                       DataSplitMode data_split_mode = DataSplitMode::kRow);

  /**
   * \brief Creates a new DMatrix from an external data adapter.
@@ -654,18 +688,19 @@ class DMatrix {

 protected:
  virtual BatchSet<SparsePage> GetRowBatches() = 0;
-  virtual BatchSet<CSCPage> GetColumnBatches() = 0;
-  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
-  virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
-  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) = 0;
-  virtual BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) = 0;
+  virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) = 0;
+  virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
+  virtual BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
+                                                      BatchParam const& param) = 0;
+  virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;

  virtual bool EllpackExists() const = 0;
  virtual bool GHistIndexExists() const = 0;
  virtual bool SparsePageExists() const = 0;
 };

-template<>
+template <>
 inline BatchSet<SparsePage> DMatrix::GetBatches() {
  return GetRowBatches();
 }
@@ -680,34 +715,39 @@ inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
  return this->GHistIndexExists();
 }

-template<>
+template <>
 inline bool DMatrix::PageExists<SparsePage>() const {
  return this->SparsePageExists();
 }

-template<>
-inline BatchSet<CSCPage> DMatrix::GetBatches() {
-  return GetColumnBatches();
-}
-
-template<>
-inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
-  return GetSortedColumnBatches();
-}
-
-template<>
-inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
-  return GetEllpackBatches(param);
+template <>
+inline BatchSet<SparsePage> DMatrix::GetBatches(Context const*) {
+  return GetRowBatches();
 }

 template <>
-inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(const BatchParam& param) {
-  return GetGradientIndex(param);
+inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetColumnBatches(ctx);
 }

 template <>
-inline BatchSet<ExtSparsePage> DMatrix::GetBatches() {
-  return GetExtBatches(BatchParam{});
+inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
+  return GetSortedColumnBatches(ctx);
+}
+
+template <>
+inline BatchSet<EllpackPage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetEllpackBatches(ctx, param);
+}
+
+template <>
+inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetGradientIndex(ctx, param);
+}
+
+template <>
+inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
+  return GetExtBatches(ctx, param);
 }
 }  // namespace xgboost

--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -567,7 +567,7 @@ class RegTree : public Model {
     * \brief drop the trace after fill, must be called after fill.
     * \param inst The sparse instance to drop.
     */
-    void Drop(const SparsePage::Inst& inst);
+    void Drop();
    /*!
     * \brief returns the size of the feature vector
     * \return the size of the feature vector
@@ -807,13 +807,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
  has_missing_ = data_.size() != feature_count;
 }

-inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
-  for (auto const& entry : inst) {
-    if (entry.index >= data_.size()) {
-      continue;
-    }
-    data_[entry.index].flag = -1;
-  }
+inline void RegTree::FVec::Drop() {
+  Entry e{};
+  e.flag = -1;
+  std::fill_n(data_.data(), data_.size(), e);
  has_missing_ = true;
 }