Use mmap for external memory. (#9282)

- Have basic infrastructure for mmap.
- Release file write handle.
This commit is contained in:
Jiaming Yuan
2023-06-19 18:52:55 +08:00
committed by GitHub
parent d8beb517ed
commit ee6809e642
16 changed files with 599 additions and 275 deletions

View File

@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
NoSampling::NoSampling(BatchParam batch_param) : batch_param_(std::move(batch_param)) {}
GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) {
return {dmat->Info().num_row_, page_, gpair};
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
return {dmat->Info().num_row_, page, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
size_t n_rows, BatchParam batch_param)
: batch_param_{std::move(batch_param)},
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
n_rows)) {}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
: batch_param_{std::move(batch_param)} {}
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair,
DMatrix* dmat) {
if (!page_concatenated_) {
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
page_.reset(nullptr);
size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl();
if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_);
}
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
offset += num_elements;
}
@@ -175,8 +178,8 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
return {dmat->Info().num_row_, page_.get(), gpair};
}
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
: page_(page), subsample_(subsample) {}
UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
: batch_param_{std::move(batch_param)}, subsample_(subsample) {}
GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) {
@@ -185,7 +188,8 @@ GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<Gra
thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<std::size_t>(0),
BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
return {dmat->Info().num_row_, page_, gpair};
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
return {dmat->Info().num_row_, page, gpair};
}
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows,
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam&,
GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
float subsample)
: page_(page),
subsample_(subsample),
: subsample_(subsample),
batch_param_{std::move(batch_param)},
threshold_(n_rows + 1, 0.0f),
grad_sum_(n_rows, 0.0f) {}
@@ -252,18 +254,19 @@ GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
// Perform Poisson sampling in place.
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
return {n_rows, page_, gpair};
return {n_rows, page, gpair};
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
size_t n_rows,
BatchParam batch_param,
float subsample)
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
BatchParam batch_param,
float subsample)
: batch_param_(std::move(batch_param)),
subsample_(subsample),
threshold_(n_rows + 1, 0.0f),
@@ -273,16 +276,15 @@ ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
common::Span<GradientPair> gpair,
DMatrix* dmat) {
size_t n_rows = dmat->Info().num_row_;
auto cuctx = ctx->CUDACtx();
bst_row_t n_rows = dmat->Info().num_row_;
size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
// Perform Poisson sampling in place.
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0),
dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_),
threshold_index,
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
PoissonSampling(dh::ToSpan(threshold_), threshold_index,
RandomWeight(common::GlobalRandom()())));
// Count the sampled rows.
@@ -290,16 +292,15 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
// Compact gradient pairs.
gpair_.resize(sample_rows);
thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
// Index the sample rows.
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
sample_row_index_.begin());
thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
sample_row_index_.begin(),
sample_row_index_.begin(),
ClearEmptyRows());
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
IsNonZero());
thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
sample_row_index_.begin());
thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
sample_row_index_.begin(), ClearEmptyRows());
auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
auto first_page = (*batch_iterator.begin()).Impl();
@@ -317,13 +318,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method) {
GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
const BatchParam& batch_param, float subsample,
int sampling_method, bool is_external_memory) {
// The ctx is kept here for future development of stream-based operations.
monitor_.Init("gradient_based_sampler");
bool is_sampling = subsample < 1.0;
bool is_external_memory = page->n_rows != n_rows;
if (is_sampling) {
switch (sampling_method) {
@@ -331,24 +332,24 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
if (is_external_memory) {
strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new UniformSampling(page, subsample));
strategy_.reset(new UniformSampling(batch_param, subsample));
}
break;
case TrainParam::kGradientBased:
if (is_external_memory) {
strategy_.reset(
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
} else {
strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
}
break;
default:LOG(FATAL) << "unknown sampling method";
default:
LOG(FATAL) << "unknown sampling method";
}
} else {
if (is_external_memory) {
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
} else {
strategy_.reset(new NoSampling(page));
strategy_.reset(new NoSampling(batch_param));
}
}
}
@@ -362,11 +363,11 @@ GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
return sample;
}
size_t GradientBasedSampler::CalculateThresholdIndex(
common::Span<GradientPair> gpair, common::Span<float> threshold,
common::Span<float> grad_sum, size_t sample_rows) {
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
std::numeric_limits<float>::max());
size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
common::Span<float> threshold,
common::Span<float> grad_sum,
size_t sample_rows) {
thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
CombineGradientPair());
thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
@@ -379,6 +380,5 @@ size_t GradientBasedSampler::CalculateThresholdIndex(
thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
return thrust::distance(dh::tbegin(grad_sum), min) + 1;
}
}; // namespace tree
}; // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019 by XGBoost Contributors
/**
* Copyright 2019-2023, XGBoost Contributors
*/
#pragma once
#include <xgboost/base.h>
@@ -32,37 +32,36 @@ class SamplingStrategy {
/*! \brief No sampling in in-memory mode. */
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(EllpackPageImpl const* page);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
BatchParam batch_param);
explicit NoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
std::unique_ptr<EllpackPageImpl> page_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
explicit ExternalMemoryNoSampling(BatchParam batch_param);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
BatchParam batch_param_;
std::unique_ptr<EllpackPageImpl> page_{nullptr};
bool page_concatenated_{false};
};
/*! \brief Uniform sampling in in-memory mode. */
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(EllpackPageImpl const* page, float subsample);
UniformSampling(BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
BatchParam batch_param_;
float subsample_;
};
@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in in-memory mode.. */
class GradientBasedSampling : public SamplingStrategy {
public:
GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
float subsample);
GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
DMatrix* dmat) override;
private:
EllpackPageImpl const* page_;
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
private:
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
dh::device_vector<float> threshold_;
dh::device_vector<float> grad_sum_;
std::unique_ptr<EllpackPageImpl> page_;
dh::device_vector<GradientPair> gpair_;
dh::caching_device_vector<size_t> sample_row_index_;
dh::device_vector<size_t> sample_row_index_;
};
/*! \brief Draw a sample of rows from a DMatrix.
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
*/
class GradientBasedSampler {
public:
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
const BatchParam& batch_param, float subsample, int sampling_method);
GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
float subsample, int sampling_method, bool is_external_memory);
/*! \brief Sample from a DMatrix based on the given gradient pairs. */
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);

View File

@@ -176,7 +176,7 @@ struct GPUHistMakerDevice {
Context const* ctx_;
public:
EllpackPageImpl const* page;
EllpackPageImpl const* page{nullptr};
common::Span<FeatureType const> feature_types;
BatchParam batch_param;
@@ -205,41 +205,41 @@ struct GPUHistMakerDevice {
std::unique_ptr<FeatureGroups> feature_groups;
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
BatchParam _batch_param)
: evaluator_{_param, n_features, ctx->gpu_id},
ctx_(ctx),
page(_page),
feature_types{_feature_types},
param(std::move(_param)),
column_sampler(column_sampler_seed),
interaction_constraints(param, n_features),
batch_param(std::move(_batch_param)) {
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
param.sampling_method));
sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample,
param.sampling_method, is_external_memory));
if (!param.monotone_constraints.empty()) {
// Copy assigning an empty vector causes an exception in MSVC debug builds
monotone_constraints = param.monotone_constraints;
}
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientSumT)));
}
~GPUHistMakerDevice() { // NOLINT
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
}
void InitFeatureGroupsOnce() {
if (!feature_groups) {
CHECK(page);
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientSumT)));
}
}
// Reset values for each update iteration
// Note that the column sampler must be passed by value because it is not
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
@@ -247,26 +247,30 @@ struct GPUHistMakerDevice {
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
ctx_->gpu_id);
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
dh::safe_cuda(cudaMemcpyAsync(
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
quantiser.reset(new GradientQuantiser(this->gpair));
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
hist.Reset();
this->InitFeatureGroupsOnce();
}
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
@@ -808,12 +812,11 @@ class GPUHistMaker : public TreeUpdater {
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
info_->feature_types.SetDevice(ctx_->gpu_id);
maker.reset(new GPUHistMakerDevice<GradientSumT>(
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
column_sampling_seed, info_->num_col_, batch_param));
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampling_seed, info_->num_col_, batch_param));
p_last_fmat_ = dmat;
initialised_ = true;