Rewrite sparse dmatrix using callbacks. (#7092)
- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves.
- Remove use of threaded iterator and IO queue.
- Remove `page_size`.
- Make sure the number of pages in memory is bounded.
- Make sure the cache cannot be violated.
- Provide an interface for internal algorithms to process data asynchronously.
This commit is contained in:
@@ -131,7 +131,7 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
|
||||
if (rnd_(i) <= p) {
|
||||
return gpair / p;
|
||||
} else {
|
||||
return GradientPair();
|
||||
return {};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -143,13 +143,13 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t, Gra
|
||||
CombineGradientPair combine_;
|
||||
};
|
||||
|
||||
NoSampling::NoSampling(EllpackPageImpl* page) : page_(page) {}
|
||||
NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
|
||||
|
||||
GradientBasedSample NoSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl* page,
|
||||
ExternalMemoryNoSampling::ExternalMemoryNoSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param)
|
||||
: batch_param_(batch_param),
|
||||
@@ -171,7 +171,7 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(common::Span<GradientPair>
|
||||
return {dmat->Info().num_row_, page_.get(), gpair};
|
||||
}
|
||||
|
||||
UniformSampling::UniformSampling(EllpackPageImpl* page, float subsample)
|
||||
UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
|
||||
: page_(page), subsample_(subsample) {}
|
||||
|
||||
GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DMatrix* dmat) {
|
||||
@@ -183,7 +183,7 @@ GradientBasedSample UniformSampling::Sample(common::Span<GradientPair> gpair, DM
|
||||
return {dmat->Info().num_row_, page_, gpair};
|
||||
}
|
||||
|
||||
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl* page,
|
||||
ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample)
|
||||
@@ -231,7 +231,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::Span<GradientP
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl* page,
|
||||
GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam&,
|
||||
float subsample)
|
||||
@@ -257,7 +257,7 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span<GradientPair> gpa
|
||||
}
|
||||
|
||||
ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(
|
||||
EllpackPageImpl* page,
|
||||
EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample)
|
||||
@@ -313,7 +313,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::Span<Gra
|
||||
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
|
||||
}
|
||||
|
||||
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
|
||||
GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample,
|
||||
|
||||
@@ -16,7 +16,7 @@ struct GradientBasedSample {
|
||||
/*!\brief Number of sampled rows. */
|
||||
size_t sample_rows;
|
||||
/*!\brief Sampled rows in ELLPACK format. */
|
||||
EllpackPageImpl* page;
|
||||
EllpackPageImpl const* page;
|
||||
/*!\brief Gradient pairs for the sampled rows. */
|
||||
common::Span<GradientPair> gpair;
|
||||
};
|
||||
@@ -31,17 +31,17 @@ class SamplingStrategy {
|
||||
/*! \brief No sampling in in-memory mode. */
|
||||
class NoSampling : public SamplingStrategy {
|
||||
public:
|
||||
explicit NoSampling(EllpackPageImpl* page);
|
||||
explicit NoSampling(EllpackPageImpl const* page);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl* page_;
|
||||
EllpackPageImpl const* page_;
|
||||
};
|
||||
|
||||
/*! \brief No sampling in external memory mode. */
|
||||
class ExternalMemoryNoSampling : public SamplingStrategy {
|
||||
public:
|
||||
ExternalMemoryNoSampling(EllpackPageImpl* page,
|
||||
ExternalMemoryNoSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
@@ -55,25 +55,25 @@ class ExternalMemoryNoSampling : public SamplingStrategy {
|
||||
/*! \brief Uniform sampling in in-memory mode. */
|
||||
class UniformSampling : public SamplingStrategy {
|
||||
public:
|
||||
UniformSampling(EllpackPageImpl* page, float subsample);
|
||||
UniformSampling(EllpackPageImpl const* page, float subsample);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl* page_;
|
||||
EllpackPageImpl const* page_;
|
||||
float subsample_;
|
||||
};
|
||||
|
||||
/*! \brief Uniform sampling in external memory mode. */
|
||||
class ExternalMemoryUniformSampling : public SamplingStrategy {
|
||||
public:
|
||||
ExternalMemoryUniformSampling(EllpackPageImpl* page,
|
||||
ExternalMemoryUniformSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl* original_page_;
|
||||
EllpackPageImpl const* original_page_;
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
std::unique_ptr<EllpackPageImpl> page_;
|
||||
@@ -84,14 +84,14 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
|
||||
/*! \brief Gradient-based sampling in in-memory mode. */
|
||||
class GradientBasedSampling : public SamplingStrategy {
|
||||
public:
|
||||
GradientBasedSampling(EllpackPageImpl* page,
|
||||
GradientBasedSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl* page_;
|
||||
EllpackPageImpl const* page_;
|
||||
float subsample_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
dh::caching_device_vector<float> grad_sum_;
|
||||
@@ -100,14 +100,14 @@ class GradientBasedSampling : public SamplingStrategy {
|
||||
/*! \brief Gradient-based sampling in external memory mode. */
|
||||
class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
public:
|
||||
ExternalMemoryGradientBasedSampling(EllpackPageImpl* page,
|
||||
ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample);
|
||||
GradientBasedSample Sample(common::Span<GradientPair> gpair, DMatrix* dmat) override;
|
||||
|
||||
private:
|
||||
EllpackPageImpl* original_page_;
|
||||
EllpackPageImpl const* original_page_;
|
||||
BatchParam batch_param_;
|
||||
float subsample_;
|
||||
dh::caching_device_vector<float> threshold_;
|
||||
@@ -128,7 +128,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
|
||||
*/
|
||||
class GradientBasedSampler {
|
||||
public:
|
||||
GradientBasedSampler(EllpackPageImpl* page,
|
||||
GradientBasedSampler(EllpackPageImpl const* page,
|
||||
size_t n_rows,
|
||||
const BatchParam& batch_param,
|
||||
float subsample,
|
||||
|
||||
Reference in New Issue
Block a user