merge 23Mar01
This commit is contained in:
@@ -38,7 +38,7 @@ typedef uint64_t bst_ulong; // NOLINT(*)
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup Library
|
||||
* @defgroup Library Library
|
||||
*
|
||||
* These functions are used to obtain general information about XGBoost including version,
|
||||
* build info and current global configuration.
|
||||
@@ -112,7 +112,7 @@ XGB_DLL int XGBGetGlobalConfig(char const **out_config);
|
||||
/**@}*/
|
||||
|
||||
/**
|
||||
* @defgroup DMatrix
|
||||
* @defgroup DMatrix DMatrix
|
||||
*
|
||||
* @brief DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms
|
||||
* including training, prediction and explanation. There are a few variants of
|
||||
@@ -138,7 +138,11 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
|
||||
/*!
|
||||
* \brief load a data matrix
|
||||
* \param config JSON encoded parameters for DMatrix construction. Accepted fields are:
|
||||
* - uri: The URI of the input file.
|
||||
|
||||
* - uri: The URI of the input file. The URI parameter `format` is required when loading text data.
|
||||
* \verbatim embed:rst:leading-asterisk
|
||||
* See :doc:`/tutorials/input_format` for more info.
|
||||
* \endverbatim
|
||||
* - silent (optional): Whether to print message during loading. Default to true.
|
||||
* - data_split_mode (optional): Whether to split by row or column. In distributed mode, the
|
||||
* file is split accordingly; otherwise this is only an indicator on how the file was split
|
||||
@@ -200,7 +204,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr
|
||||
* \return 0 when success, -1 when failure happens
|
||||
*/
|
||||
XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
|
||||
bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
|
||||
bst_ulong nrow, char const *config, DMatrixHandle *out);
|
||||
|
||||
/*!
|
||||
* \brief create a matrix content from CSC format
|
||||
@@ -281,7 +285,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, char const *
|
||||
DMatrixHandle *out);
|
||||
|
||||
/**
|
||||
* @defgroup Streaming
|
||||
* @defgroup Streaming Streaming
|
||||
* @ingroup DMatrix
|
||||
*
|
||||
* @brief Quantile DMatrix and external memory DMatrix can be created from batches of
|
||||
@@ -431,7 +435,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
|
||||
* - Step 0: Define a data iterator with 2 methods: `reset` and `next`.
|
||||
* - Step 1: Create a DMatrix proxy by \ref XGProxyDMatrixCreate and hold the handle.
|
||||
* - Step 2: Pass the iterator handle, proxy handle and 2 methods into
|
||||
* `XGDMatrixCreateFromCallback`, along with other parameters encoded as a JSON object.
|
||||
* \ref XGDMatrixCreateFromCallback, along with other parameters encoded as a JSON object.
|
||||
* - Step 3: Call appropriate data setters in `next` functions.
|
||||
*
|
||||
* \param iter A handle to external data iterator.
|
||||
@@ -830,7 +834,7 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
|
||||
/** @} */ // End of DMatrix
|
||||
|
||||
/**
|
||||
* @defgroup Booster
|
||||
* @defgroup Booster Booster
|
||||
*
|
||||
* @brief The `Booster` class is the gradient-boosted model for XGBoost.
|
||||
* @{
|
||||
@@ -953,7 +957,7 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle d
|
||||
*/
|
||||
|
||||
/**
|
||||
* @defgroup Prediction
|
||||
* @defgroup Prediction Prediction
|
||||
* @ingroup Booster
|
||||
*
|
||||
* @brief These functions are used for running prediction and explanation algorithms.
|
||||
@@ -1155,7 +1159,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v
|
||||
|
||||
|
||||
/**
|
||||
* @defgroup Serialization
|
||||
* @defgroup Serialization Serialization
|
||||
* @ingroup Booster
|
||||
*
|
||||
* @brief There are multiple ways to serialize a Booster object depending on the use case.
|
||||
@@ -1490,7 +1494,7 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
|
||||
/**@}*/ // End of Booster
|
||||
|
||||
/**
|
||||
* @defgroup Collective
|
||||
* @defgroup Collective Collective
|
||||
*
|
||||
* @brief Experimental support for exposing internal communicator in XGBoost.
|
||||
*
|
||||
|
||||
@@ -50,7 +50,19 @@ struct Context : public XGBoostParameter<Context> {
|
||||
|
||||
bool IsCPU() const { return gpu_id == kCpuId; }
|
||||
bool IsCUDA() const { return !IsCPU(); }
|
||||
|
||||
CUDAContext const* CUDACtx() const;
|
||||
// Make a CUDA context based on the current context.
|
||||
Context MakeCUDA(std::int32_t device = 0) const {
|
||||
Context ctx = *this;
|
||||
ctx.gpu_id = device;
|
||||
return ctx;
|
||||
}
|
||||
Context MakeCPU() const {
|
||||
Context ctx = *this;
|
||||
ctx.gpu_id = kCpuId;
|
||||
return ctx;
|
||||
}
|
||||
|
||||
// declare parameters
|
||||
DMLC_DECLARE_PARAMETER(Context) {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*!
|
||||
* Copyright (c) 2015-2022 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2015-2023 by XGBoost Contributors
|
||||
* \file data.h
|
||||
* \brief The input data structure of xgboost.
|
||||
* \author Tianqi Chen
|
||||
@@ -196,6 +196,14 @@ class MetaInfo {
|
||||
*/
|
||||
bool IsVerticalFederated() const;
|
||||
|
||||
/*!
|
||||
* \brief A convenient method to check if the MetaInfo should contain labels.
|
||||
*
|
||||
* Normally we assume labels are available everywhere. The only exception is in vertical federated
|
||||
* learning where labels are only available on worker 0.
|
||||
*/
|
||||
bool ShouldHaveLabels() const;
|
||||
|
||||
private:
|
||||
void SetInfoFromHost(Context const& ctx, StringView key, Json arr);
|
||||
void SetInfoFromCUDA(Context const& ctx, StringView key, Json arr);
|
||||
@@ -230,44 +238,72 @@ struct Entry {
|
||||
}
|
||||
};
|
||||
|
||||
/*!
|
||||
* \brief Parameters for constructing batches.
|
||||
/**
|
||||
* \brief Parameters for constructing histogram index batches.
|
||||
*/
|
||||
struct BatchParam {
|
||||
/*! \brief The GPU device to use. */
|
||||
int gpu_id {-1};
|
||||
/*! \brief Maximum number of bins per feature for histograms. */
|
||||
/**
|
||||
* \brief Maximum number of bins per feature for histograms.
|
||||
*/
|
||||
bst_bin_t max_bin{0};
|
||||
/*! \brief Hessian, used for sketching with future approx implementation. */
|
||||
/**
|
||||
* \brief Hessian, used for sketching with future approx implementation.
|
||||
*/
|
||||
common::Span<float> hess;
|
||||
/*! \brief Whether should DMatrix regenerate the batch. Only used for GHistIndex. */
|
||||
bool regen {false};
|
||||
/*! \brief Parameter used to generate column matrix for hist. */
|
||||
/**
|
||||
* \brief Whether we should force DMatrix to regenerate the batch. Only used for
|
||||
* GHistIndex.
|
||||
*/
|
||||
bool regen{false};
|
||||
/**
|
||||
* \brief Forbid regenerating the gradient index. Used for internal validation.
|
||||
*/
|
||||
bool forbid_regen{false};
|
||||
/**
|
||||
* \brief Parameter used to generate column matrix for hist.
|
||||
*/
|
||||
double sparse_thresh{std::numeric_limits<double>::quiet_NaN()};
|
||||
|
||||
/**
|
||||
* \brief Exact or others that don't need histogram.
|
||||
*/
|
||||
BatchParam() = default;
|
||||
// GPU Hist
|
||||
BatchParam(int32_t device, bst_bin_t max_bin)
|
||||
: gpu_id{device}, max_bin{max_bin} {}
|
||||
// Hist
|
||||
/**
|
||||
* \brief Used by the hist tree method.
|
||||
*/
|
||||
BatchParam(bst_bin_t max_bin, double sparse_thresh)
|
||||
: max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
|
||||
// Approx
|
||||
/**
|
||||
* \brief Get batch with sketch weighted by hessian. The batch will be regenerated if
|
||||
* the span is changed, so caller should keep the span for each iteration.
|
||||
* \brief Used by the approx tree method.
|
||||
*
|
||||
* Get batch with sketch weighted by hessian. The batch will be regenerated if the
|
||||
* span is changed, so caller should keep the span for each iteration.
|
||||
*/
|
||||
BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
|
||||
: max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
|
||||
|
||||
bool operator!=(BatchParam const& other) const {
|
||||
if (hess.empty() && other.hess.empty()) {
|
||||
return gpu_id != other.gpu_id || max_bin != other.max_bin;
|
||||
}
|
||||
return gpu_id != other.gpu_id || max_bin != other.max_bin || hess.data() != other.hess.data();
|
||||
bool ParamNotEqual(BatchParam const& other) const {
|
||||
// Check non-floating parameters.
|
||||
bool cond = max_bin != other.max_bin;
|
||||
// Check sparse thresh.
|
||||
bool l_nan = std::isnan(sparse_thresh);
|
||||
bool r_nan = std::isnan(other.sparse_thresh);
|
||||
bool st_chg = (l_nan != r_nan) || (!l_nan && !r_nan && (sparse_thresh != other.sparse_thresh));
|
||||
cond |= st_chg;
|
||||
|
||||
return cond;
|
||||
}
|
||||
bool operator==(BatchParam const& other) const {
|
||||
return !(*this != other);
|
||||
bool Initialized() const { return max_bin != 0; }
|
||||
/**
|
||||
* \brief Make a copy of self for DMatrix to describe how its existing index was generated.
|
||||
*/
|
||||
BatchParam MakeCache() const {
|
||||
auto p = *this;
|
||||
// These parameters have nothing to do with how the gradient index was generated in the
|
||||
// first place.
|
||||
p.regen = false;
|
||||
p.forbid_regen = false;
|
||||
return p;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -427,7 +463,7 @@ class EllpackPage {
|
||||
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
|
||||
* in CSR format.
|
||||
*/
|
||||
explicit EllpackPage(DMatrix* dmat, const BatchParam& param);
|
||||
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
|
||||
|
||||
/*! \brief Destructor. */
|
||||
~EllpackPage();
|
||||
@@ -543,7 +579,9 @@ class DMatrix {
|
||||
template <typename T>
|
||||
BatchSet<T> GetBatches();
|
||||
template <typename T>
|
||||
BatchSet<T> GetBatches(const BatchParam& param);
|
||||
BatchSet<T> GetBatches(Context const* ctx);
|
||||
template <typename T>
|
||||
BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
|
||||
template <typename T>
|
||||
bool PageExists() const;
|
||||
|
||||
@@ -558,21 +596,17 @@ class DMatrix {
|
||||
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
|
||||
}
|
||||
|
||||
/*!
|
||||
/**
|
||||
* \brief Load DMatrix from URI.
|
||||
*
|
||||
* \param uri The URI of input.
|
||||
* \param silent Whether to print information during loading.
|
||||
* \param data_split_mode In distributed mode, split the input according to this mode; otherwise,
|
||||
* it's just an indicator on how the input was split beforehand.
|
||||
* \param file_format The format type of the file, used for dmlc::Parser::Create.
|
||||
* By default "auto" will be able to load in both local binary file.
|
||||
* \param page_size Page size for external memory.
|
||||
* \return The created DMatrix.
|
||||
*/
|
||||
static DMatrix* Load(const std::string& uri,
|
||||
bool silent = true,
|
||||
DataSplitMode data_split_mode = DataSplitMode::kRow,
|
||||
const std::string& file_format = "auto");
|
||||
static DMatrix* Load(const std::string& uri, bool silent = true,
|
||||
DataSplitMode data_split_mode = DataSplitMode::kRow);
|
||||
|
||||
/**
|
||||
* \brief Creates a new DMatrix from an external data adapter.
|
||||
@@ -654,18 +688,19 @@ class DMatrix {
|
||||
|
||||
protected:
|
||||
virtual BatchSet<SparsePage> GetRowBatches() = 0;
|
||||
virtual BatchSet<CSCPage> GetColumnBatches() = 0;
|
||||
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches() = 0;
|
||||
virtual BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) = 0;
|
||||
virtual BatchSet<GHistIndexMatrix> GetGradientIndex(const BatchParam& param) = 0;
|
||||
virtual BatchSet<ExtSparsePage> GetExtBatches(BatchParam const& param) = 0;
|
||||
virtual BatchSet<CSCPage> GetColumnBatches(Context const* ctx) = 0;
|
||||
virtual BatchSet<SortedCSCPage> GetSortedColumnBatches(Context const* ctx) = 0;
|
||||
virtual BatchSet<EllpackPage> GetEllpackBatches(Context const* ctx, BatchParam const& param) = 0;
|
||||
virtual BatchSet<GHistIndexMatrix> GetGradientIndex(Context const* ctx,
|
||||
BatchParam const& param) = 0;
|
||||
virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
|
||||
|
||||
virtual bool EllpackExists() const = 0;
|
||||
virtual bool GHistIndexExists() const = 0;
|
||||
virtual bool SparsePageExists() const = 0;
|
||||
};
|
||||
|
||||
template<>
|
||||
template <>
|
||||
inline BatchSet<SparsePage> DMatrix::GetBatches() {
|
||||
return GetRowBatches();
|
||||
}
|
||||
@@ -680,34 +715,39 @@ inline bool DMatrix::PageExists<GHistIndexMatrix>() const {
|
||||
return this->GHistIndexExists();
|
||||
}
|
||||
|
||||
template<>
|
||||
template <>
|
||||
inline bool DMatrix::PageExists<SparsePage>() const {
|
||||
return this->SparsePageExists();
|
||||
}
|
||||
|
||||
template<>
|
||||
inline BatchSet<CSCPage> DMatrix::GetBatches() {
|
||||
return GetColumnBatches();
|
||||
}
|
||||
|
||||
template<>
|
||||
inline BatchSet<SortedCSCPage> DMatrix::GetBatches() {
|
||||
return GetSortedColumnBatches();
|
||||
}
|
||||
|
||||
template<>
|
||||
inline BatchSet<EllpackPage> DMatrix::GetBatches(const BatchParam& param) {
|
||||
return GetEllpackBatches(param);
|
||||
template <>
|
||||
inline BatchSet<SparsePage> DMatrix::GetBatches(Context const*) {
|
||||
return GetRowBatches();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(const BatchParam& param) {
|
||||
return GetGradientIndex(param);
|
||||
inline BatchSet<CSCPage> DMatrix::GetBatches(Context const* ctx) {
|
||||
return GetColumnBatches(ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline BatchSet<ExtSparsePage> DMatrix::GetBatches() {
|
||||
return GetExtBatches(BatchParam{});
|
||||
inline BatchSet<SortedCSCPage> DMatrix::GetBatches(Context const* ctx) {
|
||||
return GetSortedColumnBatches(ctx);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline BatchSet<EllpackPage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
|
||||
return GetEllpackBatches(ctx, param);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline BatchSet<GHistIndexMatrix> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
|
||||
return GetGradientIndex(ctx, param);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline BatchSet<ExtSparsePage> DMatrix::GetBatches(Context const* ctx, BatchParam const& param) {
|
||||
return GetExtBatches(ctx, param);
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -567,7 +567,7 @@ class RegTree : public Model {
|
||||
* \brief drop the trace after fill, must be called after fill.
|
||||
* \param inst The sparse instance to drop.
|
||||
*/
|
||||
void Drop(const SparsePage::Inst& inst);
|
||||
void Drop();
|
||||
/*!
|
||||
* \brief returns the size of the feature vector
|
||||
* \return the size of the feature vector
|
||||
@@ -807,13 +807,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
|
||||
has_missing_ = data_.size() != feature_count;
|
||||
}
|
||||
|
||||
inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
|
||||
for (auto const& entry : inst) {
|
||||
if (entry.index >= data_.size()) {
|
||||
continue;
|
||||
}
|
||||
data_[entry.index].flag = -1;
|
||||
}
|
||||
inline void RegTree::FVec::Drop() {
|
||||
Entry e{};
|
||||
e.flag = -1;
|
||||
std::fill_n(data_.data(), data_.size(), e);
|
||||
has_missing_ = true;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user