Use mmap for external memory. (#9282)

- Have basic infrastructure for mmap.
- Release file write handle.
This commit is contained in:
Jiaming Yuan
2023-06-19 18:52:55 +08:00
committed by GitHub
parent d8beb517ed
commit ee6809e642
16 changed files with 599 additions and 275 deletions

View File

@@ -176,7 +176,7 @@ struct GPUHistMakerDevice {
Context const* ctx_;
public:
EllpackPageImpl const* page;
EllpackPageImpl const* page{nullptr};
common::Span<FeatureType const> feature_types;
BatchParam batch_param;
@@ -205,41 +205,41 @@ struct GPUHistMakerDevice {
std::unique_ptr<FeatureGroups> feature_groups;
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
BatchParam _batch_param)
: evaluator_{_param, n_features, ctx->gpu_id},
ctx_(ctx),
page(_page),
feature_types{_feature_types},
param(std::move(_param)),
column_sampler(column_sampler_seed),
interaction_constraints(param, n_features),
batch_param(std::move(_batch_param)) {
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
param.sampling_method));
sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample,
param.sampling_method, is_external_memory));
if (!param.monotone_constraints.empty()) {
// Copy assigning an empty vector causes an exception in MSVC debug builds
monotone_constraints = param.monotone_constraints;
}
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientSumT)));
}
~GPUHistMakerDevice() { // NOLINT
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
}
void InitFeatureGroupsOnce() {
if (!feature_groups) {
CHECK(page);
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
sizeof(GradientSumT)));
}
}
// Reset values for each update iteration
// Note that the column sampler must be passed by value because it is not
// thread safe
void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
auto const& info = dmat->Info();
this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(),
@@ -247,26 +247,30 @@ struct GPUHistMakerDevice {
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
ctx_->gpu_id);
this->interaction_constraints.Reset();
if (d_gpair.size() != dh_gpair->Size()) {
d_gpair.resize(dh_gpair->Size());
}
dh::safe_cuda(cudaMemcpyAsync(
d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(),
dh_gpair->Size() * sizeof(GradientPair),
cudaMemcpyDeviceToDevice));
auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat);
page = sample.page;
gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
quantiser.reset(new GradientQuantiser(this->gpair));
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
// Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
hist.Reset();
this->InitFeatureGroupsOnce();
}
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
@@ -808,12 +812,11 @@ class GPUHistMaker : public TreeUpdater {
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
info_->feature_types.SetDevice(ctx_->gpu_id);
maker.reset(new GPUHistMakerDevice<GradientSumT>(
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
column_sampling_seed, info_->num_col_, batch_param));
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampling_seed, info_->num_col_, batch_param));
p_last_fmat_ = dmat;
initialised_ = true;