Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache can not be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
Jiaming Yuan
2021-07-16 12:33:31 +08:00
committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

View File

@@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
if (rnd_(i) <= p) {
return gpair / p;
} else {
return GradientPair();
return {};
}
}
}
@@ -143,13 +143,13 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(EllpackPageImpl* page) : page_(page) {}
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
return {dmat->Info().num_row_, page_, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param)
: batch_param_(batch_param),
@@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
return {dmat->Info().num_row_, page_.get(), gpair};
}
UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample)
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
: page_(page), subsample_(subsample) {}
GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
@@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DM
return {dmat->Info().num_row_, page_, gpair};
}
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page,
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample)
@@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page,
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam&,
float subsample)
@@ -257,7 +257,7 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpa
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
EllpackPageImpl* page,
EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample)
@@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample,

View File

@@ -16,7 +16,7 @@ struct GradientBasedSample {
/*!\brief Number of sampled rows. */
size_t sample_rows;
/*!\brief Sampled rows in ELLPACK format. */
EllpackPageImpl* page;
EllpackPageImpl const* page;
/*!\brief Gradient pairs for the sampled rows. */
common::Span<GradientPair> gpair;
};
@@ -31,17 +31,17 @@ class SamplingStrategy {
/*! \brief No sampling in in-memory mode. */
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(EllpackPageImpl* page);
explicit NoSampling(EllpackPageImpl const* page);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
ExternalMemoryNoSampling(EllpackPageImpl* page,
ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
@@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
/*! \brief Uniform sampling in in-memory mode. */
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(EllpackPageImpl* page, float subsample);
UniformSampling(EllpackPageImpl const* page, float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
float subsample_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryUniformSampling : public SamplingStrategy {
public:
ExternalMemoryUniformSampling(EllpackPageImpl* page,
ExternalMemoryUniformSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* original_page_;
EllpackPageImpl const* original_page_;
BatchParam batch_param_;
float subsample_;
std::unique_ptr<EllpackPageImpl> page_;
@@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in in-memory mode.. */
class GradientBasedSampling : public SamplingStrategy {
public:
GradientBasedSampling(EllpackPageImpl* page,
GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
@@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in external memory mode.. */
class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
public:
ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* original_page_;
EllpackPageImpl const* original_page_;
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
@@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
*/
class GradientBasedSampler {
public:
GradientBasedSampler(EllpackPageImpl* page,
GradientBasedSampler(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample,

View File

@@ -163,7 +163,7 @@ class DeviceHistogram {
template <typename GradientSumT>
struct GPUHistMakerDevice {
int device_id;
EllpackPageImpl* page;
EllpackPageImpl const* page;
common::Span<FeatureType const> feature_types;
BatchParam batch_param;
@@ -199,7 +199,7 @@ struct GPUHistMakerDevice {
dh::caching_device_vector<uint32_t> node_categories;
GPUHistMakerDevice(int _device_id,
EllpackPageImpl* _page,
EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types,
bst_uint _n_rows,
TrainParam _param,
@@ -488,7 +488,7 @@ struct GPUHistMakerDevice {
}
}
void FinalisePositionInPage(EllpackPageImpl *page,
void FinalisePositionInPage(EllpackPageImpl const *page,
const common::Span<RegTree::Node> d_nodes,
common::Span<FeatureType const> d_feature_types,
common::Span<uint32_t const> categories,
@@ -812,7 +812,6 @@ class GPUHistMakerSpecialised {
BatchParam batch_param{
device_,
param_.max_bin,
generic_param_->gpu_page_size
};
auto page = (*dmat->GetBatches<EllpackPage>(batch_param).begin()).Impl();
dh::safe_cuda(cudaSetDevice(device_));