Support exporting cut values (#9356)
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
*/
|
||||
#include "xgboost/c_api.h"
|
||||
|
||||
#include <algorithm> // for copy
|
||||
#include <algorithm> // for copy, transform
|
||||
#include <cinttypes> // for strtoimax
|
||||
#include <cmath> // for nan
|
||||
#include <cstring> // for strcmp
|
||||
@@ -20,9 +20,11 @@
|
||||
#include "../collective/communicator-inl.h" // for Allreduce, Broadcast, Finalize, GetProcessor...
|
||||
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
|
||||
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
|
||||
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
|
||||
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
|
||||
#include "../data/ellpack_page.h" // for EllpackPage
|
||||
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
|
||||
#include "../data/simple_dmatrix.h" // for SimpleDMatrix
|
||||
#include "c_api_error.h" // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
|
||||
@@ -785,6 +787,104 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
|
||||
API_END();
|
||||
}
|
||||
|
||||
namespace {
|
||||
template <typename Page>
|
||||
void GetCutImpl(Context const *ctx, std::shared_ptr<DMatrix> p_m,
|
||||
std::vector<std::uint64_t> *p_indptr, std::vector<float> *p_data) {
|
||||
auto &indptr = *p_indptr;
|
||||
auto &data = *p_data;
|
||||
for (auto const &page : p_m->GetBatches<Page>(ctx, {})) {
|
||||
auto const &cut = page.Cuts();
|
||||
|
||||
auto const &ptrs = cut.Ptrs();
|
||||
indptr.resize(ptrs.size());
|
||||
|
||||
auto const &vals = cut.Values();
|
||||
auto const &mins = cut.MinValues();
|
||||
|
||||
bst_feature_t n_features = p_m->Info().num_col_;
|
||||
auto ft = p_m->Info().feature_types.ConstHostSpan();
|
||||
std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(),
|
||||
[](auto t) { return t == FeatureType::kCategorical; });
|
||||
data.resize(vals.size() + n_features - n_categories); // |vals| + |mins|
|
||||
std::size_t i{0}, n_numeric{0};
|
||||
for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) {
|
||||
CHECK_LT(i, data.size());
|
||||
bool is_numeric = !common::IsCat(ft, fidx);
|
||||
if (is_numeric) {
|
||||
data[i] = mins[fidx];
|
||||
i++;
|
||||
}
|
||||
auto beg = ptrs[fidx];
|
||||
auto end = ptrs[fidx + 1];
|
||||
CHECK_LE(end, data.size());
|
||||
std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i);
|
||||
i += (end - beg);
|
||||
// shift by min values.
|
||||
indptr[fidx] = ptrs[fidx] + n_numeric;
|
||||
if (is_numeric) {
|
||||
n_numeric++;
|
||||
}
|
||||
}
|
||||
CHECK_EQ(n_numeric, n_features - n_categories);
|
||||
|
||||
indptr.back() = data.size();
|
||||
CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
/**
 * Export the quantile cuts of a DMatrix as two array-interface strings.
 *
 * The call fails with a fatal log when neither a GHistIndexMatrix page nor an EllpackPage exists
 * for the matrix. When a GHistIndexMatrix page exists it is preferred; otherwise the EllpackPage
 * is used.
 *
 * @param handle      Handle of the DMatrix to query.
 * @param config      JSON string; it is parsed here but its content is not otherwise read.
 * @param out_indptr  Receives the array-interface string describing the per-feature offsets.
 * @param out_data    Receives the array-interface string describing the cut values.
 *
 * The returned C strings point into buffers obtained from `p_m->GetThreadLocal()`; those buffers
 * are cleared and refilled by the next call to this function.
 */
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
                                    char const **out_indptr, char const **out_data) {
  API_BEGIN();
  CHECK_HANDLE();

  auto p_m = CastDMatrixHandle(handle);

  xgboost_CHECK_C_ARG_PTR(config);
  xgboost_CHECK_C_ARG_PTR(out_indptr);
  xgboost_CHECK_C_ARG_PTR(out_data);

  // Parsed but not consulted further in this function.
  auto jconfig = Json::Load(StringView{config});

  bool const has_quantile_page =
      p_m->PageExists<GHistIndexMatrix>() || p_m->PageExists<EllpackPage>();
  if (!has_quantile_page) {
    LOG(FATAL) << "The quantile cut hasn't been generated yet. Unless this is a `QuantileDMatrix`, "
                  "quantile cut is generated during training.";
  }

  // Buffers that back the arrays handed to the caller.
  auto &cut_values = p_m->GetThreadLocal().ret_vec_float;
  auto &cut_offsets = p_m->GetThreadLocal().ret_vec_u64;

  if (p_m->PageExists<GHistIndexMatrix>()) {
    auto cpu_ctx = p_m->Ctx()->IsCPU() ? *p_m->Ctx() : p_m->Ctx()->MakeCPU();
    GetCutImpl<GHistIndexMatrix>(&cpu_ctx, p_m, &cut_offsets, &cut_values);
  } else {
    auto cuda_ctx = p_m->Ctx()->IsCUDA() ? *p_m->Ctx() : p_m->Ctx()->MakeCUDA(0);
    GetCutImpl<EllpackPage>(&cuda_ctx, p_m, &cut_offsets, &cut_values);
  }

  // Default-constructed context used to build the tensor views over the two buffers.
  Context view_ctx;

  auto &interface_strs = p_m->GetThreadLocal().ret_vec_str;
  interface_strs.clear();

  // Index 0: offsets, index 1: values.
  interface_strs.emplace_back(linalg::ArrayInterfaceStr(linalg::MakeTensorView(
      &view_ctx, common::Span{cut_offsets.data(), cut_offsets.size()}, cut_offsets.size())));
  interface_strs.emplace_back(linalg::ArrayInterfaceStr(linalg::MakeTensorView(
      &view_ctx, common::Span{cut_values.data(), cut_values.size()}, cut_values.size())));

  // Expose the strings as C pointers in the same order.
  auto &c_strs = p_m->GetThreadLocal().ret_vec_charp;
  c_strs.resize(interface_strs.size());
  for (std::size_t k = 0; k < interface_strs.size(); ++k) {
    c_strs[k] = interface_strs[k].c_str();
  }

  *out_indptr = c_strs[0];
  *out_data = c_strs[1];
  API_END();
}
|
||||
|
||||
// xgboost implementation
|
||||
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[],
|
||||
xgboost::bst_ulong len,
|
||||
|
||||
@@ -24,6 +24,8 @@ struct XGBAPIThreadLocalEntry {
|
||||
std::vector<const char *> ret_vec_charp;
|
||||
/*! \brief returning float vector. */
|
||||
std::vector<float> ret_vec_float;
|
||||
/*! \brief returning uint vector. */
|
||||
std::vector<std::uint64_t> ret_vec_u64;
|
||||
/*! \brief temp variable of gradient pairs. */
|
||||
std::vector<GradientPair> tmp_gpair;
|
||||
/*! \brief Temp variable for returning prediction result. */
|
||||
|
||||
@@ -455,7 +455,7 @@ class ArrayInterface {
|
||||
|
||||
// Construct from a std::string by wrapping it in a StringView and delegating to the
// StringView constructor.
explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {}
|
||||
|
||||
// Parse `str` with Json::Load and delegate to the constructor that accepts the parsed value
// (class name spelled with its explicit template argument).
explicit ArrayInterface(StringView str) : ArrayInterface<D>{Json::Load(str)} {}
|
||||
// Parse `str` with Json::Load and delegate to the constructor that accepts the parsed value.
explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {}
|
||||
|
||||
void AssignType(StringView typestr) {
|
||||
using T = ArrayInterfaceHandler::Type;
|
||||
|
||||
@@ -3,12 +3,20 @@
|
||||
*/
|
||||
#ifndef XGBOOST_USE_CUDA
|
||||
|
||||
#include "ellpack_page.h"
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
// dummy implementation of EllpackPage in case CUDA is not used
|
||||
namespace xgboost {
|
||||
|
||||
class EllpackPageImpl {};
|
||||
class EllpackPageImpl {
|
||||
common::HistogramCuts cuts_;
|
||||
|
||||
public:
|
||||
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
|
||||
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
|
||||
};
|
||||
|
||||
EllpackPage::EllpackPage() = default;
|
||||
|
||||
@@ -32,6 +40,17 @@ size_t EllpackPage::Size() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Mutable accessor for the cuts. This version unconditionally logs a fatal error first; the
 * return statement is only reached if LOG(FATAL) hands control back (presumably it does not).
 */
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  return this->Impl()->Cuts();
}
|
||||
|
||||
/**
 * Read-only accessor for the cuts. This version unconditionally logs a fatal error first; the
 * return statement is only reached if LOG(FATAL) hands control back (presumably it does not).
 */
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
  LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
                "EllpackPage is required";
  return this->Impl()->Cuts();
}
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
#include <thrust/iterator/transform_output_iterator.h>
|
||||
|
||||
#include <algorithm> // for copy
|
||||
#include <utility> // for move
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/cuda_context.cuh"
|
||||
#include "../common/hist_util.cuh"
|
||||
@@ -11,6 +15,7 @@
|
||||
#include "../common/transform_iterator.h" // MakeIndexTransformIter
|
||||
#include "./ellpack_page.cuh"
|
||||
#include "device_adapter.cuh" // for HasInfInData
|
||||
#include "ellpack_page.h"
|
||||
#include "gradient_index.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
@@ -29,6 +34,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); }
|
||||
|
||||
void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }
|
||||
|
||||
/**
 * Mutable accessor for the cuts held by the implementation object. Checks that the
 * implementation pointer is set before forwarding to it.
 */
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
  CHECK(impl_);
  return this->Impl()->Cuts();
}
|
||||
|
||||
/**
 * Read-only accessor for the cuts held by the implementation object. Checks that the
 * implementation pointer is set before forwarding to it.
 */
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
  CHECK(impl_);
  return this->Impl()->Cuts();
}
|
||||
|
||||
// Bin each input data entry, store the bin indices in compressed form.
|
||||
__global__ void CompressBinEllpackKernel(
|
||||
common::CompressedBufferWriter wr,
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
/*!
|
||||
* Copyright 2019 by XGBoost Contributors
|
||||
/**
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
*/
|
||||
|
||||
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
#define XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
|
||||
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
|
||||
|
||||
#include <thrust/binary_search.h>
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include "../common/categorical.h"
|
||||
#include "../common/compressed_iterator.h"
|
||||
#include "../common/device_helpers.cuh"
|
||||
#include "../common/hist_util.h"
|
||||
#include "../common/categorical.h"
|
||||
#include <thrust/binary_search.h>
|
||||
#include "ellpack_page.h"
|
||||
|
||||
namespace xgboost {
|
||||
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
|
||||
@@ -194,8 +195,8 @@ class EllpackPageImpl {
|
||||
base_rowid = row_id;
|
||||
}
|
||||
|
||||
/*! \return Mutable reference to the member `cuts_`. */
common::HistogramCuts& Cuts() { return cuts_; }
|
||||
/*! \return Read-only reference to the member `cuts_`. */
common::HistogramCuts const& Cuts() const { return cuts_; }
|
||||
/*! \return Mutable reference to the member `cuts_`; the result must not be discarded. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
|
||||
/*! \return Read-only reference to the member `cuts_`; the result must not be discarded. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
|
||||
|
||||
/*! \return Estimation of memory cost of this page. */
|
||||
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
|
||||
@@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) {
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_
|
||||
|
||||
59
src/data/ellpack_page.h
Normal file
59
src/data/ellpack_page.h
Normal file
@@ -0,0 +1,59 @@
|
||||
/**
|
||||
* Copyright 2017-2023 by XGBoost Contributors
|
||||
*/
|
||||
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
#define XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
|
||||
#include <memory> // for unique_ptr
|
||||
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for DMatrix, BatchParam
|
||||
|
||||
namespace xgboost {
|
||||
class EllpackPageImpl;
|
||||
/**
|
||||
* @brief A page stored in ELLPACK format.
|
||||
*
|
||||
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
|
||||
* including CUDA-specific implementation details in the header.
|
||||
*/
|
||||
class EllpackPage {
|
||||
public:
|
||||
/**
|
||||
* @brief Default constructor.
|
||||
*
|
||||
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
|
||||
* set later by the reader.
|
||||
*/
|
||||
EllpackPage();
|
||||
/**
|
||||
* @brief Constructor from an existing DMatrix.
|
||||
*
|
||||
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
|
||||
* in CSR format.
|
||||
*/
|
||||
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
|
||||
|
||||
/*! \brief Destructor. */
|
||||
~EllpackPage();
|
||||
|
||||
EllpackPage(EllpackPage&& that);
|
||||
|
||||
/*! \return Number of instances in the page. */
|
||||
[[nodiscard]] size_t Size() const;
|
||||
|
||||
/*! \brief Set the base row id for this page. */
|
||||
void SetBaseRowId(std::size_t row_id);
|
||||
|
||||
[[nodiscard]] const EllpackPageImpl* Impl() const { return impl_.get(); }
|
||||
EllpackPageImpl* Impl() { return impl_.get(); }
|
||||
|
||||
[[nodiscard]] common::HistogramCuts& Cuts();
|
||||
[[nodiscard]] common::HistogramCuts const& Cuts() const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<EllpackPageImpl> impl_;
|
||||
};
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
|
||||
@@ -5,10 +5,10 @@
|
||||
#include <utility>
|
||||
|
||||
#include "ellpack_page.cuh"
|
||||
#include "ellpack_page.h" // for EllpackPage
|
||||
#include "ellpack_page_source.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
namespace xgboost::data {
|
||||
void EllpackPageSource::Fetch() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
if (!this->ReadCache()) {
|
||||
@@ -27,5 +27,4 @@ void EllpackPageSource::Fetch() {
|
||||
this->WriteCache();
|
||||
}
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
|
||||
@@ -6,17 +6,17 @@
|
||||
#define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
|
||||
|
||||
#include <xgboost/data.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "../common/hist_util.h"
|
||||
#include "ellpack_page.h" // for EllpackPage
|
||||
#include "sparse_page_source.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace xgboost::data {
|
||||
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
|
||||
bool is_dense_;
|
||||
size_t row_stride_;
|
||||
@@ -53,7 +53,6 @@ inline void EllpackPageSource::Fetch() {
|
||||
common::AssertGPUSupport();
|
||||
}
|
||||
#endif // !defined(XGBOOST_USE_CUDA)
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
|
||||
#endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
|
||||
|
||||
@@ -245,6 +245,9 @@ class GHistIndexMatrix {
|
||||
std::vector<float> const& values, std::vector<float> const& mins,
|
||||
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
|
||||
|
||||
/*! \return Mutable reference to the member `cut`; the result must not be discarded. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
|
||||
/*! \return Read-only reference to the member `cut`; the result must not be discarded. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<common::ColumnMatrix> columns_;
|
||||
std::vector<size_t> hit_count_tloc_;
|
||||
|
||||
@@ -16,7 +16,8 @@
|
||||
#include "../common/threading_utils.h"
|
||||
#include "./simple_batch_iterator.h"
|
||||
#include "adapter.h"
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
#include "ellpack_page.h" // for EllpackPage
|
||||
#include "gradient_index.h"
|
||||
#include "xgboost/c_api.h"
|
||||
#include "xgboost/data.h"
|
||||
|
||||
@@ -165,7 +165,10 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
|
||||
|
||||
BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
|
||||
const BatchParam ¶m) {
|
||||
CHECK_GE(param.max_bin, 2);
|
||||
if (param.Initialized()) {
|
||||
CHECK_GE(param.max_bin, 2);
|
||||
}
|
||||
detail::CheckEmpty(batch_param_, param);
|
||||
auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
|
||||
this->InitializeSparsePage(ctx);
|
||||
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
/**
|
||||
* Copyright 2021-2023 by XGBoost contributors
|
||||
*/
|
||||
#include <memory>
|
||||
|
||||
#include "../common/hist_util.cuh"
|
||||
#include "batch_utils.h" // for CheckEmpty, RegenGHist
|
||||
#include "ellpack_page.cuh"
|
||||
@@ -11,7 +13,9 @@ namespace xgboost::data {
|
||||
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
const BatchParam& param) {
|
||||
CHECK(ctx->IsCUDA());
|
||||
CHECK_GE(param.max_bin, 2);
|
||||
if (param.Initialized()) {
|
||||
CHECK_GE(param.max_bin, 2);
|
||||
}
|
||||
detail::CheckEmpty(batch_param_, param);
|
||||
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
|
||||
size_t row_stride = 0;
|
||||
@@ -21,8 +25,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
|
||||
cache_info_.erase(id);
|
||||
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
|
||||
std::unique_ptr<common::HistogramCuts> cuts;
|
||||
cuts.reset(
|
||||
new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
|
||||
cuts = std::make_unique<common::HistogramCuts>(
|
||||
common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0));
|
||||
this->InitializeSparsePage(ctx); // reset after use.
|
||||
|
||||
row_stride = GetRowStride(this);
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "../common/io.h"
|
||||
#include "../common/timer.h"
|
||||
#include "../data/ellpack_page.cuh"
|
||||
#include "../data/ellpack_page.h"
|
||||
#include "constraints.cuh"
|
||||
#include "driver.h"
|
||||
#include "gpu_hist/evaluate_splits.cuh"
|
||||
|
||||
Reference in New Issue
Block a user