Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache can not be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
Jiaming Yuan
2021-07-16 12:33:31 +08:00
committed by GitHub
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions

View File

@@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
if (rnd_(i) <= p) {
return gpair / p;
} else {
return GradientPair();
return {};
}
}
}
@@ -143,13 +143,13 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
CombineGradientPair combine_;
};
NoSampling::NoSampling(EllpackPageImpl* page) : page_(page) {}
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
return {dmat->Info().num_row_, page_, gpair};
}
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param)
: batch_param_(batch_param),
@@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
return {dmat->Info().num_row_, page_.get(), gpair};
}
UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample)
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
: page_(page), subsample_(subsample) {}
GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
@@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DM
return {dmat->Info().num_row_, page_, gpair};
}
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page,
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample)
@@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page,
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam&,
float subsample)
@@ -257,7 +257,7 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpa
}
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
EllpackPageImpl* page,
EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample)
@@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
}
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample,

View File

@@ -16,7 +16,7 @@ struct GradientBasedSample {
/*!\brief Number of sampled rows. */
size_t sample_rows;
/*!\brief Sampled rows in ELLPACK format. */
EllpackPageImpl* page;
EllpackPageImpl const* page;
/*!\brief Gradient pairs for the sampled rows. */
common::Span<GradientPair> gpair;
};
@@ -31,17 +31,17 @@ class SamplingStrategy {
/*! \brief No sampling in in-memory mode. */
class NoSampling : public SamplingStrategy {
public:
explicit NoSampling(EllpackPageImpl* page);
explicit NoSampling(EllpackPageImpl const* page);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
};
/*! \brief No sampling in external memory mode. */
class ExternalMemoryNoSampling : public SamplingStrategy {
public:
ExternalMemoryNoSampling(EllpackPageImpl* page,
ExternalMemoryNoSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
@@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
/*! \brief Uniform sampling in in-memory mode. */
class UniformSampling : public SamplingStrategy {
public:
UniformSampling(EllpackPageImpl* page, float subsample);
UniformSampling(EllpackPageImpl const* page, float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
float subsample_;
};
/*! \brief Uniform sampling in external memory mode. */
class ExternalMemoryUniformSampling : public SamplingStrategy {
public:
ExternalMemoryUniformSampling(EllpackPageImpl* page,
ExternalMemoryUniformSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* original_page_;
EllpackPageImpl const* original_page_;
BatchParam batch_param_;
float subsample_;
std::unique_ptr<EllpackPageImpl> page_;
@@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in in-memory mode. */
class GradientBasedSampling : public SamplingStrategy {
public:
GradientBasedSampling(EllpackPageImpl* page,
GradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* page_;
EllpackPageImpl const* page_;
float subsample_;
dh::caching_device_vector<float> threshold_;
dh::caching_device_vector<float> grad_sum_;
@@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy {
/*! \brief Gradient-based sampling in external memory mode. */
class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
public:
ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample);
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
private:
EllpackPageImpl* original_page_;
EllpackPageImpl const* original_page_;
BatchParam batch_param_;
float subsample_;
dh::caching_device_vector<float> threshold_;
@@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
*/
class GradientBasedSampler {
public:
GradientBasedSampler(EllpackPageImpl* page,
GradientBasedSampler(EllpackPageImpl const* page,
size_t n_rows,
const BatchParam& batch_param,
float subsample,