Support exporting cut values (#9356)

This commit is contained in:
Jiaming Yuan
2023-07-08 15:32:41 +08:00
committed by GitHub
parent c3124813e8
commit 20c52f07d2
28 changed files with 722 additions and 101 deletions

View File

@@ -810,7 +810,7 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle,
*/
XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
/*!
/**
* \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a
* quantized DMatrix, quantized values are returned instead.
*
@@ -819,8 +819,10 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
* XGBoost. This is to avoid allocating a huge memory buffer that can not be freed until
* exiting the thread.
*
* @since 1.7.0
*
* \param handle the handle to the DMatrix
* \param config Json configuration string. At the moment it should be an empty document,
* \param config JSON configuration string. At the moment it should be an empty document,
* preserved for future use.
* \param out_indptr indptr of output CSR matrix.
* \param out_indices Column index of output CSR matrix.
@@ -831,6 +833,24 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out);
XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config,
bst_ulong *out_indptr, unsigned *out_indices, float *out_data);
/**
* @brief Export the quantile cuts used for training histogram-based models like `hist` and
* `approx`. Useful for model compression.
*
* @since 2.0.0
*
* @param handle the handle to the DMatrix
* @param config JSON configuration string. At the moment it should be an empty document,
* preserved for future use.
*
* @param out_indptr indptr of output CSC matrix represented by a JSON encoded
* __(cuda_)array_interface__.
* @param out_data Data value of CSC matrix represented by a JSON encoded
* __(cuda_)array_interface__.
*/
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
char const **out_indptr, char const **out_data);
/** @} */ // End of DMatrix
/**

View File

@@ -282,7 +282,7 @@ struct BatchParam {
BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
: max_bin{max_bin}, hess{hessian}, regen{regenerate} {}
bool ParamNotEqual(BatchParam const& other) const {
[[nodiscard]] bool ParamNotEqual(BatchParam const& other) const {
// Check non-floating parameters.
bool cond = max_bin != other.max_bin;
// Check sparse thresh.
@@ -293,11 +293,11 @@ struct BatchParam {
return cond;
}
bool Initialized() const { return max_bin != 0; }
[[nodiscard]] bool Initialized() const { return max_bin != 0; }
/**
* \brief Make a copy of self for DMatrix to describe how its existing index was generated.
*/
BatchParam MakeCache() const {
[[nodiscard]] BatchParam MakeCache() const {
auto p = *this;
// These parameters have nothing to do with how the gradient index was generated in the
// first place.
@@ -319,7 +319,7 @@ struct HostSparsePageView {
static_cast<Inst::index_type>(size)};
}
size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
[[nodiscard]] size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; }
};
/*!
@@ -337,7 +337,7 @@ class SparsePage {
/*! \brief an instance of sparse vector in the batch */
using Inst = common::Span<Entry const>;
HostSparsePageView GetView() const {
[[nodiscard]] HostSparsePageView GetView() const {
return {offset.ConstHostSpan(), data.ConstHostSpan()};
}
@@ -353,12 +353,12 @@ class SparsePage {
virtual ~SparsePage() = default;
/*! \return Number of instances in the page. */
inline size_t Size() const {
[[nodiscard]] size_t Size() const {
return offset.Size() == 0 ? 0 : offset.Size() - 1;
}
/*! \return estimation of memory cost of this page */
inline size_t MemCostBytes() const {
[[nodiscard]] size_t MemCostBytes() const {
return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry);
}
@@ -376,7 +376,7 @@ class SparsePage {
base_rowid = row_id;
}
SparsePage GetTranspose(int num_columns, int32_t n_threads) const;
[[nodiscard]] SparsePage GetTranspose(int num_columns, int32_t n_threads) const;
/**
* \brief Sort the column index.
@@ -385,7 +385,7 @@ class SparsePage {
/**
* \brief Check whether the column index is sorted.
*/
bool IsIndicesSorted(int32_t n_threads) const;
[[nodiscard]] bool IsIndicesSorted(int32_t n_threads) const;
/**
* \brief Reindex the column index with an offset.
*/
@@ -440,49 +440,7 @@ class SortedCSCPage : public SparsePage {
explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {}
};
class EllpackPageImpl;
/*!
* \brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
public:
/*!
* \brief Default constructor.
*
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
* set later by the reader.
*/
EllpackPage();
/*!
* \brief Constructor from an existing DMatrix.
*
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
/*! \brief Destructor. */
~EllpackPage();
EllpackPage(EllpackPage&& that);
/*! \return Number of instances in the page. */
size_t Size() const;
/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id);
const EllpackPageImpl* Impl() const { return impl_.get(); }
EllpackPageImpl* Impl() { return impl_.get(); }
private:
std::unique_ptr<EllpackPageImpl> impl_;
};
class EllpackPage;
class GHistIndexMatrix;
template<typename T>
@@ -492,7 +450,7 @@ class BatchIteratorImpl {
virtual ~BatchIteratorImpl() = default;
virtual const T& operator*() const = 0;
virtual BatchIteratorImpl& operator++() = 0;
virtual bool AtEnd() const = 0;
[[nodiscard]] virtual bool AtEnd() const = 0;
virtual std::shared_ptr<T const> Page() const = 0;
};
@@ -519,12 +477,12 @@ class BatchIterator {
return !impl_->AtEnd();
}
bool AtEnd() const {
[[nodiscard]] bool AtEnd() const {
CHECK(impl_ != nullptr);
return impl_->AtEnd();
}
std::shared_ptr<T const> Page() const {
[[nodiscard]] std::shared_ptr<T const> Page() const {
return impl_->Page();
}
@@ -563,15 +521,15 @@ class DMatrix {
this->Info().SetInfo(ctx, key, StringView{interface_str});
}
/*! \brief meta information of the dataset */
virtual const MetaInfo& Info() const = 0;
[[nodiscard]] virtual const MetaInfo& Info() const = 0;
/*! \brief Get thread local memory for returning data from DMatrix. */
XGBAPIThreadLocalEntry& GetThreadLocal() const;
[[nodiscard]] XGBAPIThreadLocalEntry& GetThreadLocal() const;
/**
* \brief Get the context object of this DMatrix. The context is created during construction of
* DMatrix with user specified `nthread` parameter.
*/
virtual Context const* Ctx() const = 0;
[[nodiscard]] virtual Context const* Ctx() const = 0;
/**
* \brief Gets batches. Use a range-based for loop over BatchSet to access individual batches.
@@ -583,16 +541,16 @@ class DMatrix {
template <typename T>
BatchSet<T> GetBatches(Context const* ctx, const BatchParam& param);
template <typename T>
bool PageExists() const;
[[nodiscard]] bool PageExists() const;
// the following are column meta data, should be able to answer them fast.
/*! \return Whether the data columns form a single column block. */
virtual bool SingleColBlock() const = 0;
[[nodiscard]] virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */
virtual ~DMatrix();
/*! \brief Whether the matrix is dense. */
bool IsDense() const {
[[nodiscard]] bool IsDense() const {
return Info().num_nonzero_ == Info().num_row_ * Info().num_col_;
}
@@ -695,9 +653,9 @@ class DMatrix {
BatchParam const& param) = 0;
virtual BatchSet<ExtSparsePage> GetExtBatches(Context const* ctx, BatchParam const& param) = 0;
virtual bool EllpackExists() const = 0;
virtual bool GHistIndexExists() const = 0;
virtual bool SparsePageExists() const = 0;
[[nodiscard]] virtual bool EllpackExists() const = 0;
[[nodiscard]] virtual bool GHistIndexExists() const = 0;
[[nodiscard]] virtual bool SparsePageExists() const = 0;
};
template <>