merge 23Mar01

This commit is contained in:
amdsc21
2023-05-02 00:05:58 +02:00
258 changed files with 7471 additions and 5379 deletions

View File

@@ -38,7 +38,7 @@ typedef uint64_t bst_ulong; // NOLINT(*)
*/
/**
* @defgroup Library
* @defgroup Library Library
*
* These functions are used to obtain general information about XGBoost including version,
* build info and current global configuration.
@@ -112,7 +112,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
/**@}*/
/**
* @defgroup DMatrix
* @defgroup DMatrix DMatrix
*
* @brief DMatrix is the baisc data storage for XGBoost used by all XGBoost algorithms
* including both training, prediction and explanation. There are a few variants of
@@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
/*!
* \brief load a data matrix
* \param config JSON encoded parameters for DMatrix construction. Accepted fields are:
* - uri: The URI of the input file.
* - uri: The URI of the input file. The URI parameter `format` is required when loading text data.
* \verbatim embed:rst:leading-asterisk
* See :doc:`/tutorials/input_format` for more info.
* \endverbatim
* - silent (optional): Whether to print message during loading. Default to true.
* - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
* file is split accordingly; otherwise this is only an indicator on how the file was split
@@ -200,7 +204,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
* \return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
bst_ulong nrow, char const *config, DMatrixHandle *out);
/*!
* \brief create a matrix content from CSC format
@@ -281,7 +285,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *
DMatrixHandle *out);
/**
* @defgroup Streaming
* @defgroup Streaming Streaming
* @ingroup DMatrix
*
* @brief Quantile DMatrix and external memory DMatrix can be created from batches of
@@ -431,7 +435,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
* - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
* - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle.
* - Step 2: Pass the iterator handle, proxy handle and 2 methods into
* `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
* \ref XGDMatrixCreateFromCallback, along with other parameters encoded as a JSON object.
* - Step 3: Call appropriate data setters in `next` functions.
*
* \param iter A handle to external data iterator.
@@ -830,7 +834,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
/** @} */ // End of DMatrix
/**
* @defgroup Booster
* @defgroup Booster Booster
*
* @brief The `Booster` class is the gradient-boosted model for XGBoost.
* @{
@@ -953,7 +957,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle d
*/
/**
* @defgroup Prediction
* @defgroup Prediction Prediction
* @ingroup Booster
*
* @brief These functions are used for running prediction and explanation algorithms.
@@ -1155,7 +1159,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
/**
* @defgroup Serialization
* @defgroup Serialization Serialization
* @ingroup Booster
*
* @brief There are multiple ways to serialize a Booster object depending on the use case.
@@ -1490,7 +1494,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
/**@}*/ // End of Booster
/**
* @defgroup Collective
* @defgroup Collective Collective
*
* @brief Experimental support for exposing internal communicator in XGBoost.
*

View File

@@ -50,7 +50,19 @@ struct Context : public XGBoostParameter<Context> {
bool IsCPU() const { return gpu_id == kCpuId; }
bool IsCUDA() const { return !IsCPU(); }
CUDAContext const* CUDACtx() const;
// Make a CUDA context based on the current context.
Context MakeCUDA(std::int32_t device = 0) const {
Context ctx = *this;
ctx.gpu_id = device;
return ctx;
}
Context MakeCPU() const {
Context ctx = *this;
ctx.gpu_id = kCpuId;
return ctx;
}
// declare parameters
DMLC_DECLARE_PARAMETER(Context) {

View File

@@ -1,5 +1,5 @@
/*!
* Copyright (c) 2015-2022 by XGBoost Contributors
/**
* Copyright 2015-2023 by XGBoost Contributors
* \file data.h
* \brief The input data structure of xgboost.
* \author Tianqi Chen
@@ -196,6 +196,14 @@ class MetaInfo {
*/
bool IsVerticalFederated() const;
/*!
* \brief A convenient method to check if the MetaInfo should contain labels.
*
* Normally we assume labels are available everywhere. The only exception is in vertical federated
* learning where labels are only available on worker 0.
*/
bool ShouldHaveLabels() const;
private:
void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
@@ -230,44 +238,72 @@ struct Entry {
}
};
/*!
* \brief Parameters for constructing batches.
/**
* \brief Parameters for constructing histogram index batches.
*/
struct BatchParam {
/*! \brief The GPU device to use. */
int gpu_id {-1};
/*! \brief Maximum number of bins per feature for histograms. */
/**
* \brief Maximum number of bins per feature for histograms.
*/
bst_bin_t max_bin{0};
/*! \brief Hessian, used for sketching with future approx implementation. */
/**
* \brief Hessian, used for sketching with future approx implementation.
*/
common::Span<float> hess;
/*! \brief Whether should DMatrix regenerate the batch. Only used for GHistIndex. */
bool regen {false};
/*! \brief Parameter used to generate column matrix for hist. */
/**
* \brief Whether should we force DMatrix to regenerate the batch. Only used for
* GHistIndex.
*/
bool regen{false};
/**
* \brief Forbid regenerating the gradient index. Used for internal validation.
*/
bool forbid_regen{false};
/**
* \brief Parameter used to generate column matrix for hist.
*/
double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
/**
* \brief Exact or others that don't need histogram.
*/
BatchParam() = default;
// GPU Hist
BatchParam(int32_t device, bst_bin_t max_bin)
: gpu_id{device}, max_bin{max_bin} {}
// Hist
/**
* \brief Used by the hist tree method.
*/
BatchParam(bst_bin_t max_bin, double sparse_thresh)
: max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
// Approx
/**
* \brief Get batch with sketch weighted by hessian. The batch will be regenerated if
* the span is changed, so caller should keep the span for each iteration.
* \brief Used by the approx tree method.
*
* Get batch with sketch weighted by hessian. The batch will be regenerated if the
* span is changed, so caller should keep the span for each iteration.
*/
BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
: max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
bool operator!=(BatchParam const& other) const {
if (hess.empty() && other.hess.empty()) {
return gpu_id != other.gpu_id || max_bin != other.max_bin;
}
return gpu_id != other.gpu_id || max_bin != other.max_bin || hess.data() != other.hess.data();
bool ParamNotEqual(BatchParam const& other) const {
// Check non-floating parameters.
bool cond = max_bin != other.max_bin;
// Check sparse thresh.
bool l_nan = std::isnan(sparse_thresh);
bool r_nan = std::isnan(other.sparse_thresh);
bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
cond |= st_chg;
return cond;
}
bool operator==(BatchParam const& other) const {
return !(*this != other);
bool Initialized() const { return max_bin != 0; }
/**
* \brief Make a copy of self for DMatrix to describe how its existing index was generated.
*/
BatchParam MakeCache() const {
auto p = *this;
// These parameters have nothing to do with how the gradient index was generated in the
// first place.
p.regen = false;
p.forbid_regen = false;
return p;
}
};
@@ -427,7 +463,7 @@ class EllpackPage {
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(DMatrix* dmat, const BatchParam& param);
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
/*! \brief Destructor. */
~EllpackPage();
@@ -543,7 +579,9 @@ class DMatrix {
template <typename T>
BatchSet<T> GetBatches();
template <typename T>
BatchSet<T> GetBatches(const BatchParam& param);
BatchSet<T> GetBatches(Context const* ctx);
template <typename T>
BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
template <typename T>
bool PageExists() const;
@@ -558,21 +596,17 @@ class DMatrix {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}
/*!
/**
* \brief Load DMatrix from URI.
*
* \param uri The URI of input.
* \param silent Whether print information during loading.
* \param data_split_mode In distributed mode, split the input according this mode; otherwise,
* it's just an indicator on how the input was split beforehand.
* \param file_format The format type of the file, used for dmlc::Parser::Create.
* By default "auto" will be able to load in both local binary file.
* \param page_size Page size for external memory.
* \return The created DMatrix.
*/
static DMatrix* Load(const std::string& uri,
bool silent = true,
DataSplitMode data_split_mode = DataSplitMode::kRow,
const std::string& file_format = "auto");
static DMatrix* Load(const std::string& uri, bool silent = true,
DataSplitMode data_split_mode = DataSplitMode::kRow);
/**
* \brief Creates a new DMatrix from an external data adapter.
@@ -654,18 +688,19 @@ class DMatrix {
protected:
virtual BatchSet<SparsePage> GetRowBatches() = 0;
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
virtual BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) = 0;
virtual BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) = 0;
virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) = 0;
virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
virtual BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
BatchParam const& param) = 0;
virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
virtual bool EllpackExists() const = 0;
virtual bool GHistIndexExists() const = 0;
virtual bool SparsePageExists() const = 0;
};
template<>
template <>
inline BatchSet<SparsePage> DMatrix::GetBatches() {
return GetRowBatches();
}
@@ -680,34 +715,39 @@ inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
return this->GHistIndexExists();
}
template<>
template <>
inline bool DMatrix::PageExists<SparsePage>() const {
return this->SparsePageExists();
}
template<>
inline BatchSet<CSCPage> DMatrix::GetBatches() {
return GetColumnBatches();
}
template<>
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
return GetSortedColumnBatches();
}
template<>
inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
return GetEllpackBatches(param);
template <>
inline BatchSet<SparsePage> DMatrix::GetBatches(Context const*) {
return GetRowBatches();
}
template <>
inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(const BatchParam& param) {
return GetGradientIndex(param);
inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
return GetColumnBatches(ctx);
}
template <>
inline BatchSet<ExtSparsePage> DMatrix::GetBatches() {
return GetExtBatches(BatchParam{});
inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
return GetSortedColumnBatches(ctx);
}
template <>
inline BatchSet<EllpackPage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
return GetEllpackBatches(ctx, param);
}
template <>
inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
return GetGradientIndex(ctx, param);
}
template <>
inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
return GetExtBatches(ctx, param);
}
} // namespace xgboost

View File

@@ -567,7 +567,7 @@ class RegTree : public Model {
* \brief drop the trace after fill, must be called after fill.
* \param inst The sparse instance to drop.
*/
void Drop(const SparsePage::Inst& inst);
void Drop();
/*!
* \brief returns the size of the feature vector
* \return the size of the feature vector
@@ -807,13 +807,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
has_missing_ = data_.size() != feature_count;
}
inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
for (auto const& entry : inst) {
if (entry.index >= data_.size()) {
continue;
}
data_[entry.index].flag = -1;
}
inline void RegTree::FVec::Drop() {
Entry e{};
e.flag = -1;
std::fill_n(data_.data(), data_.size(), e);
has_missing_ = true;
}