Support exporting cut values (#9356)

This commit is contained in:
Jiaming Yuan
2023-07-08 15:32:41 +08:00
committed by GitHub
parent c3124813e8
commit 20c52f07d2
28 changed files with 722 additions and 101 deletions

View File

@@ -3,7 +3,7 @@
*/
#include "xgboost/c_api.h"
#include <algorithm> // for copy
#include <algorithm> // for copy, transform
#include <cinttypes> // for strtoimax
#include <cmath> // for nan
#include <cstring> // for strcmp
@@ -20,9 +20,11 @@
#include "../collective/communicator-inl.h" // for Allreduce, Broadcast, Finalize, GetProcessor...
#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
#include "../data/ellpack_page.h" // for EllpackPage
#include "../data/proxy_dmatrix.h" // for DMatrixProxy
#include "../data/simple_dmatrix.h" // for SimpleDMatrix
#include "c_api_error.h" // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
@@ -785,6 +787,104 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config
API_END();
}
namespace {
/**
 * @brief Flatten the quantile cuts of the first batch of `Page` into two buffers.
 *
 * For each feature the output stores that feature's cut values. Numeric features are
 * additionally prefixed by their minimum value; categorical features are not.
 *
 * @tparam Page     Page type exposing `Cuts()` (instantiated with GHistIndexMatrix and
 *                  EllpackPage by the caller).
 * @param ctx       Context forwarded to `DMatrix::GetBatches`.
 * @param p_m       DMatrix whose cuts are exported.
 * @param p_indptr  Output: per-feature offsets into `*p_data`; has one entry more than
 *                  the number of features and its last entry equals `p_data->size()`.
 * @param p_data    Output: concatenated `[min,] cut values...` for every feature.
 */
template <typename Page>
void GetCutImpl(Context const *ctx, std::shared_ptr<DMatrix> p_m,
                std::vector<std::uint64_t> *p_indptr, std::vector<float> *p_data) {
  auto &indptr = *p_indptr;
  auto &data = *p_data;
  for (auto const &page : p_m->GetBatches<Page>(ctx, {})) {
    auto const &cut = page.Cuts();

    auto const &ptrs = cut.Ptrs();
    auto const &vals = cut.Values();
    auto const &mins = cut.MinValues();

    bst_feature_t n_features = p_m->Info().num_col_;
    // Validate the layout before indexing: the loop reads ptrs[fidx + 1] and mins[fidx],
    // and `indptr.back()` below requires a non-empty vector.
    CHECK_EQ(ptrs.size(), static_cast<std::size_t>(n_features) + 1);
    CHECK_EQ(mins.size(), static_cast<std::size_t>(n_features));
    indptr.resize(ptrs.size());

    auto ft = p_m->Info().feature_types.ConstHostSpan();
    std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(),
                                             [](auto t) { return t == FeatureType::kCategorical; });
    data.resize(vals.size() + n_features - n_categories);  // |vals| + |mins|
    std::size_t i{0}, n_numeric{0};

    for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) {
      CHECK_LT(i, data.size());
      bool is_numeric = !common::IsCat(ft, fidx);
      if (is_numeric) {
        // Numeric features get their minimum value in front of the cut values.
        data[i] = mins[fidx];
        i++;
      }
      auto beg = ptrs[fidx];
      auto end = ptrs[fidx + 1];
      // `beg`/`end` index into `vals`, so the read must be bounded by `vals`; the write
      // is bounded separately by `data`.
      CHECK_LE(beg, end);
      CHECK_LE(end, vals.size());
      CHECK_LE(i + (end - beg), data.size());
      std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i);
      i += (end - beg);
      // shift by min values.
      indptr[fidx] = ptrs[fidx] + n_numeric;
      if (is_numeric) {
        n_numeric++;
      }
    }
    CHECK_EQ(n_numeric, n_features - n_categories);

    indptr.back() = data.size();
    CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories);
    // Only the first batch is inspected.
    break;
  }
}
} // namespace
/**
 * Export the quantile cuts of a DMatrix as two array-interface JSON strings.
 *
 * `*out_indptr` receives the per-feature offsets (uint64) and `*out_data` the cut values
 * (float), both produced by GetCutImpl. The strings are stored in the handle's
 * thread-local return buffers (see GetThreadLocal()); presumably they stay valid only
 * until another call reuses those buffers.
 *
 * A fatal error is logged when neither a GHistIndexMatrix nor an EllpackPage exists yet.
 */
XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config,
                                    char const **out_indptr, char const **out_data) {
  API_BEGIN();
  CHECK_HANDLE();

  auto p_m = CastDMatrixHandle(handle);

  xgboost_CHECK_C_ARG_PTR(config);
  xgboost_CHECK_C_ARG_PTR(out_indptr);
  xgboost_CHECK_C_ARG_PTR(out_data);

  // The parsed configuration is not consulted below; it is still loaded here.
  auto jconfig = Json::Load(StringView{config});

  bool const has_ghist = p_m->PageExists<GHistIndexMatrix>();
  if (!has_ghist && !p_m->PageExists<EllpackPage>()) {
    LOG(FATAL) << "The quantile cut hasn't been generated yet. Unless this is a `QuantileDMatrix`, "
                  "quantile cut is generated during training.";
  }

  // Thread-local buffers backing the returned arrays.
  auto &entry = p_m->GetThreadLocal();
  auto &cut_values = entry.ret_vec_float;
  auto &cut_ptrs = entry.ret_vec_u64;

  if (has_ghist) {
    // Prefer the histogram index; it is read with a CPU context.
    auto const *dm_ctx = p_m->Ctx();
    auto cpu_ctx = dm_ctx->IsCPU() ? *dm_ctx : dm_ctx->MakeCPU();
    GetCutImpl<GHistIndexMatrix>(&cpu_ctx, p_m, &cut_ptrs, &cut_values);
  } else {
    // Otherwise only the ELLPACK page exists; it is read with a CUDA context.
    auto const *dm_ctx = p_m->Ctx();
    auto cuda_ctx = dm_ctx->IsCUDA() ? *dm_ctx : dm_ctx->MakeCUDA(0);
    GetCutImpl<EllpackPage>(&cuda_ctx, p_m, &cut_ptrs, &cut_values);
  }

  // Create a CPU context for the tensor views over the host buffers.
  Context ctx;

  auto &json_strs = entry.ret_vec_str;
  json_strs.clear();
  auto const ptrs_view = linalg::MakeTensorView(
      &ctx, common::Span{cut_ptrs.data(), cut_ptrs.size()}, cut_ptrs.size());
  json_strs.emplace_back(linalg::ArrayInterfaceStr(ptrs_view));
  auto const values_view = linalg::MakeTensorView(
      &ctx, common::Span{cut_values.data(), cut_values.size()}, cut_values.size());
  json_strs.emplace_back(linalg::ArrayInterfaceStr(values_view));

  // Expose the JSON strings as C strings, in the same order.
  auto &c_strs = entry.ret_vec_charp;
  c_strs.resize(json_strs.size());
  for (std::size_t k = 0; k < json_strs.size(); ++k) {
    c_strs[k] = json_strs[k].c_str();
  }

  *out_indptr = c_strs[0];
  *out_data = c_strs[1];
  API_END();
}
// xgboost implementation
XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[],
xgboost::bst_ulong len,

View File

@@ -24,6 +24,8 @@ struct XGBAPIThreadLocalEntry {
std::vector<const char *> ret_vec_charp;
/*! \brief returning float vector. */
std::vector<float> ret_vec_float;
/*! \brief returning uint vector. */
std::vector<std::uint64_t> ret_vec_u64;
/*! \brief temp variable of gradient pairs. */
std::vector<GradientPair> tmp_gpair;
/*! \brief Temp variable for returning prediction result. */

View File

@@ -455,7 +455,7 @@ class ArrayInterface {
explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {}
explicit ArrayInterface(StringView str) : ArrayInterface<D>{Json::Load(str)} {}
explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {}
void AssignType(StringView typestr) {
using T = ArrayInterfaceHandler::Type;

View File

@@ -3,12 +3,20 @@
*/
#ifndef XGBOOST_USE_CUDA
#include "ellpack_page.h"
#include <xgboost/data.h>
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {
class EllpackPageImpl {};
/**
 * Stand-in for the ELLPACK implementation in builds where XGBOOST_USE_CUDA is not
 * defined. It carries nothing but a cut container so that `Cuts()` has an object to
 * hand out.
 */
class EllpackPageImpl {
 public:
  /** Read-only access to the stored cuts. */
  [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
  /** Mutable access to the stored cuts. */
  [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }

 private:
  common::HistogramCuts cuts_;
};
EllpackPage::EllpackPage() = default;
@@ -32,6 +40,17 @@ size_t EllpackPage::Size() const {
return 0;
}
// Non-CUDA stub (this file is compiled only when XGBOOST_USE_CUDA is not defined):
// logs a fatal error because an EllpackPage cannot be used in such a build.
// NOTE(review): the return statement is presumably unreachable and exists only so the
// function returns a value on every path — confirm LOG(FATAL) never returns.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return impl_->Cuts();
}
// Const overload of the non-CUDA stub: logs the same fatal error before the (presumably
// unreachable) forwarding return.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return impl_->Cuts();
}
} // namespace xgboost
#endif // XGBOOST_USE_CUDA

View File

@@ -4,6 +4,10 @@
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <algorithm> // for copy
#include <utility> // for move
#include <vector> // for vector
#include "../common/categorical.h"
#include "../common/cuda_context.cuh"
#include "../common/hist_util.cuh"
@@ -11,6 +15,7 @@
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh"
#include "device_adapter.cuh" // for HasInfInData
#include "ellpack_page.h"
#include "gradient_index.h"
#include "xgboost/data.h"
@@ -29,6 +34,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); }
void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }
// Mutable access to the cuts owned by the implementation object.
// The CHECK rejects a page whose `impl_` pointer is empty before it is dereferenced.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
CHECK(impl_);
return impl_->Cuts();
}
// Read-only access to the cuts owned by the implementation object.
// The CHECK rejects a page whose `impl_` pointer is empty before it is dereferenced.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
CHECK(impl_);
return impl_->Cuts();
}
// Bin each input data entry, store the bin indices in compressed form.
__global__ void CompressBinEllpackKernel(
common::CompressedBufferWriter wr,

View File

@@ -1,17 +1,18 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
#include <thrust/binary_search.h>
#include <xgboost/data.h>
#include "../common/categorical.h"
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/categorical.h"
#include <thrust/binary_search.h>
#include "ellpack_page.h"
namespace xgboost {
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
@@ -194,8 +195,8 @@ class EllpackPageImpl {
base_rowid = row_id;
}
common::HistogramCuts& Cuts() { return cuts_; }
common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Mutable reference to the histogram cuts stored in `cuts_`. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
/*! \return Read-only reference to the histogram cuts stored in `cuts_`. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Estimation of memory cost of this page. */
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
@@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) {
}
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_

59
src/data/ellpack_page.h Normal file
View File

@@ -0,0 +1,59 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#include <memory> // for unique_ptr
#include "../common/hist_util.h" // for HistogramCuts
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchParam
namespace xgboost {
class EllpackPageImpl;
/**
* @brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
public:
/**
* @brief Default constructor.
*
* This is used in the external memory case. An empty ELLPACK page is constructed with its content
* set later by the reader.
*/
EllpackPage();
/**
* @brief Constructor from an existing DMatrix.
*
* This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
* in CSR format.
*/
explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param);
/*! \brief Destructor. */
~EllpackPage();
/*! \brief Move constructor; defined out of line, where EllpackPageImpl is a complete type. */
EllpackPage(EllpackPage&& that);
/*! \return Number of instances in the page. */
[[nodiscard]] size_t Size() const;
/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id);
/*!
* \return Non-owning pointer to the implementation object (`impl_.get()`), read-only variant.
* NOTE(review): presumably null for a page without an implementation — confirm.
*/
[[nodiscard]] const EllpackPageImpl* Impl() const { return impl_.get(); }
/*! \return Non-owning pointer to the implementation object (`impl_.get()`), mutable variant. */
EllpackPageImpl* Impl() { return impl_.get(); }
/*!
* \return Mutable reference to the histogram cuts; the definitions forward to the
* implementation object's `Cuts()`.
*/
[[nodiscard]] common::HistogramCuts& Cuts();
/*! \return Read-only reference to the histogram cuts. */
[[nodiscard]] common::HistogramCuts const& Cuts() const;
private:
/*! \brief Owning pointer to the implementation, which is only forward-declared in this header. */
std::unique_ptr<EllpackPageImpl> impl_;
};
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_

View File

@@ -5,10 +5,10 @@
#include <utility>
#include "ellpack_page.cuh"
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_));
if (!this->ReadCache()) {
@@ -27,5 +27,4 @@ void EllpackPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -6,17 +6,17 @@
#define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
#include <xgboost/data.h>
#include <memory>
#include <string>
#include <utility>
#include "../common/common.h"
#include "../common/hist_util.h"
#include "ellpack_page.h" // for EllpackPage
#include "sparse_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
bool is_dense_;
size_t row_stride_;
@@ -53,7 +53,6 @@ inline void EllpackPageSource::Fetch() {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_

View File

@@ -245,6 +245,9 @@ class GHistIndexMatrix {
std::vector<float> const& values, std::vector<float> const& mins,
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
/** @brief Mutable access to the histogram cuts stored in the `cut` member. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
/** @brief Read-only access to the histogram cuts stored in the `cut` member. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
private:
std::unique_ptr<common::ColumnMatrix> columns_;
std::vector<size_t> hit_count_tloc_;

View File

@@ -16,7 +16,8 @@
#include "../common/threading_utils.h"
#include "./simple_batch_iterator.h"
#include "adapter.h"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.h" // for EllpackPage
#include "gradient_index.h"
#include "xgboost/c_api.h"
#include "xgboost/data.h"

View File

@@ -165,7 +165,10 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
const BatchParam &param) {
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
this->InitializeSparsePage(ctx);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {

View File

@@ -1,6 +1,8 @@
/**
* Copyright 2021-2023 by XGBoost contributors
*/
#include <memory>
#include "../common/hist_util.cuh"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.cuh"
@@ -11,7 +13,9 @@ namespace xgboost::data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
const BatchParam& param) {
CHECK(ctx->IsCUDA());
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
size_t row_stride = 0;
@@ -21,8 +25,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
cache_info_.erase(id);
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
std::unique_ptr<common::HistogramCuts> cuts;
cuts.reset(
new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
cuts = std::make_unique<common::HistogramCuts>(
common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0));
this->InitializeSparsePage(ctx); // reset after use.
row_stride = GetRowStride(this);

View File

@@ -21,6 +21,7 @@
#include "../common/io.h"
#include "../common/timer.h"
#include "../data/ellpack_page.cuh"
#include "../data/ellpack_page.h"
#include "constraints.cuh"
#include "driver.h"
#include "gpu_hist/evaluate_splits.cuh"