Support exporting cut values (#9356)

This commit is contained in:
Jiaming Yuan
2023-07-08 15:32:41 +08:00
committed by GitHub
parent c3124813e8
commit 20c52f07d2
28 changed files with 722 additions and 101 deletions

View File

@@ -455,7 +455,7 @@ class ArrayInterface {
// Construct from a std::string by delegating to the StringView overload.
explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {}
explicit ArrayInterface(StringView str) : ArrayInterface<D>{Json::Load(str)} {}
// Construct from a string view: the text is parsed with Json::Load and the result is
// forwarded to another ArrayInterface constructor (not visible in this hunk).
explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {}
void AssignType(StringView typestr) {
using T = ArrayInterfaceHandler::Type;

View File

@@ -3,12 +3,20 @@
*/
#ifndef XGBOOST_USE_CUDA
#include "ellpack_page.h"
#include <xgboost/data.h>
// dummy implementation of EllpackPage in case CUDA is not used
namespace xgboost {
class EllpackPageImpl {};
/**
 * @brief Placeholder implementation of EllpackPageImpl.
 *
 * It only stores a HistogramCuts object and exposes it through the Cuts() accessors.
 */
class EllpackPageImpl {
 public:
  /** @return Mutable reference to the stored histogram cuts. */
  [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
  /** @return Read-only reference to the stored histogram cuts. */
  [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }

 private:
  common::HistogramCuts cuts_;
};
EllpackPage::EllpackPage() = default;
@@ -32,6 +40,17 @@ size_t EllpackPage::Size() const {
return 0;
}
// Mutable accessor for the histogram cuts in a build without CUDA: it reports a fatal
// error because EllpackPage is not usable in this configuration.
// NOTE(review): the trailing return presumably exists only so the function has a return
// statement; it is expected to be unreachable if LOG(FATAL) never returns -- confirm.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return impl_->Cuts();
}
// Read-only accessor for the histogram cuts in a build without CUDA: it reports a fatal
// error because EllpackPage is not usable in this configuration.
// NOTE(review): the trailing return presumably exists only so the function has a return
// statement; it is expected to be unreachable if LOG(FATAL) never returns -- confirm.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return impl_->Cuts();
}
} // namespace xgboost
#endif // XGBOOST_USE_CUDA

View File

@@ -4,6 +4,10 @@
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <algorithm> // for copy
#include <utility> // for move
#include <vector> // for vector
#include "../common/categorical.h"
#include "../common/cuda_context.cuh"
#include "../common/hist_util.cuh"
@@ -11,6 +15,7 @@
#include "../common/transform_iterator.h" // MakeIndexTransformIter
#include "./ellpack_page.cuh"
#include "device_adapter.cuh" // for HasInfInData
#include "ellpack_page.h"
#include "gradient_index.h"
#include "xgboost/data.h"
@@ -29,6 +34,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); }
void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }
// Mutable access to the histogram cuts, forwarded to the implementation object.
// CHECK(impl_) verifies the implementation pointer is set before it is dereferenced.
[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() {
CHECK(impl_);
return impl_->Cuts();
}
// Read-only access to the histogram cuts, forwarded to the implementation object.
// CHECK(impl_) verifies the implementation pointer is set before it is dereferenced.
[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const {
CHECK(impl_);
return impl_->Cuts();
}
// Bin each input data entry, store the bin indices in compressed form.
__global__ void CompressBinEllpackKernel(
common::CompressedBufferWriter wr,

View File

@@ -1,17 +1,18 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_
#define XGBOOST_DATA_ELLPACK_PAGE_CUH_
#include <thrust/binary_search.h>
#include <xgboost/data.h>
#include "../common/categorical.h"
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/categorical.h"
#include <thrust/binary_search.h>
#include "ellpack_page.h"
namespace xgboost {
/** \brief Struct for accessing and manipulating an ELLPACK matrix on the
@@ -194,8 +195,8 @@ class EllpackPageImpl {
base_rowid = row_id;
}
common::HistogramCuts& Cuts() { return cuts_; }
common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Mutable reference to the histogram cuts stored in cuts_. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; }
/*! \return Read-only reference to the histogram cuts stored in cuts_. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; }
/*! \return Estimation of memory cost of this page. */
static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ;
@@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) {
}
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_
#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_

59
src/data/ellpack_page.h Normal file
View File

@@ -0,0 +1,59 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_
#include <memory> // for unique_ptr
#include "../common/hist_util.h" // for HistogramCuts
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, BatchParam
namespace xgboost {
class EllpackPageImpl;
/**
* @brief A page stored in ELLPACK format.
*
* This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid
* including CUDA-specific implementation details in the header.
*/
class EllpackPage {
 public:
  /**
   * @brief Construct an empty page.
   *
   * Meant for the external memory case: the page starts out empty and the reader sets
   * its content afterwards.
   */
  EllpackPage();

  /**
   * @brief Build a page from an existing DMatrix.
   *
   * Meant for the in-memory case, where the page is created from a DMatrix that is in
   * CSR format.
   */
  explicit EllpackPage(Context const* ctx, DMatrix* dmat, BatchParam const& param);

  /** @brief Destructor (defined out of line). */
  ~EllpackPage();

  /** @brief Move constructor (defined out of line). */
  EllpackPage(EllpackPage&& that);

  /** @return Number of instances in the page. */
  [[nodiscard]] std::size_t Size() const;

  /** @brief Set the base row id for this page. */
  void SetBaseRowId(std::size_t row_id);

  /** @return Read-only pointer to the implementation object; no ownership is transferred. */
  [[nodiscard]] EllpackPageImpl const* Impl() const { return impl_.get(); }
  /** @return Mutable pointer to the implementation object; no ownership is transferred. */
  EllpackPageImpl* Impl() { return impl_.get(); }

  /** @return Mutable reference to the histogram cuts (defined out of line). */
  [[nodiscard]] common::HistogramCuts& Cuts();
  /** @return Read-only reference to the histogram cuts (defined out of line). */
  [[nodiscard]] common::HistogramCuts const& Cuts() const;

 private:
  // Owning pointer to the implementation; EllpackPageImpl is only forward declared here.
  std::unique_ptr<EllpackPageImpl> impl_;
};
} // namespace xgboost
#endif // XGBOOST_DATA_ELLPACK_PAGE_H_

View File

@@ -5,10 +5,10 @@
#include <utility>
#include "ellpack_page.cuh"
#include "ellpack_page.h" // for EllpackPage
#include "ellpack_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_));
if (!this->ReadCache()) {
@@ -27,5 +27,4 @@ void EllpackPageSource::Fetch() {
this->WriteCache();
}
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -6,17 +6,17 @@
#define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_
#include <xgboost/data.h>
#include <memory>
#include <string>
#include <utility>
#include "../common/common.h"
#include "../common/hist_util.h"
#include "ellpack_page.h" // for EllpackPage
#include "sparse_page_source.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
bool is_dense_;
size_t row_stride_;
@@ -53,7 +53,6 @@ inline void EllpackPageSource::Fetch() {
common::AssertGPUSupport();
}
#endif // !defined(XGBOOST_USE_CUDA)
} // namespace data
} // namespace xgboost
} // namespace xgboost::data
#endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_

View File

@@ -245,6 +245,9 @@ class GHistIndexMatrix {
std::vector<float> const& values, std::vector<float> const& mins,
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const;
/** @return Mutable reference to the histogram cuts held in the `cut` member. */
[[nodiscard]] common::HistogramCuts& Cuts() { return cut; }
/** @return Read-only reference to the histogram cuts held in the `cut` member. */
[[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; }
private:
std::unique_ptr<common::ColumnMatrix> columns_;
std::vector<size_t> hit_count_tloc_;

View File

@@ -16,7 +16,8 @@
#include "../common/threading_utils.h"
#include "./simple_batch_iterator.h"
#include "adapter.h"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.h" // for EllpackPage
#include "gradient_index.h"
#include "xgboost/c_api.h"
#include "xgboost/data.h"

View File

@@ -165,7 +165,10 @@ BatchSet<SortedCSCPage> SparsePageDMatrix::GetSortedColumnBatches(Context const
BatchSet<GHistIndexMatrix> SparsePageDMatrix::GetGradientIndex(Context const *ctx,
const BatchParam &param) {
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_);
this->InitializeSparsePage(ctx);
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) {

View File

@@ -1,6 +1,8 @@
/**
* Copyright 2021-2023 by XGBoost contributors
*/
#include <memory>
#include "../common/hist_util.cuh"
#include "batch_utils.h" // for CheckEmpty, RegenGHist
#include "ellpack_page.cuh"
@@ -11,7 +13,9 @@ namespace xgboost::data {
BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
const BatchParam& param) {
CHECK(ctx->IsCUDA());
CHECK_GE(param.max_bin, 2);
if (param.Initialized()) {
CHECK_GE(param.max_bin, 2);
}
detail::CheckEmpty(batch_param_, param);
auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
size_t row_stride = 0;
@@ -21,8 +25,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
cache_info_.erase(id);
MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_);
std::unique_ptr<common::HistogramCuts> cuts;
cuts.reset(
new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)});
cuts = std::make_unique<common::HistogramCuts>(
common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0));
this->InitializeSparsePage(ctx); // reset after use.
row_stride = GetRowStride(this);