From 29bfa94bb6a99721f3ab9e0c4f05ee2df4345294 Mon Sep 17 00:00:00 2001 From: Ginko Balboa Date: Fri, 24 Dec 2021 04:15:35 +0100 Subject: [PATCH] Fix external memory with gpu_hist and subsampling combination bug. (#7481) Instead of accessing data from the `original_page_`, access the data from the first page of the available batch. fix #7476 Co-authored-by: jiamingy --- src/tree/gpu_hist/gradient_based_sampler.cu | 32 ++++++++--------- src/tree/gpu_hist/gradient_based_sampler.cuh | 8 ++--- tests/python-gpu/test_gpu_data_iterator.py | 17 ++++++--- tests/python/test_data_iterator.py | 37 ++++++++++++++++---- 4 files changed, 61 insertions(+), 33 deletions(-) diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 42329b492..676497336 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -185,12 +185,10 @@ GradientBasedSample UniformSampling::Sample(common::Span gpair, DM return {dmat->Info().num_row_, page_, gpair}; } -ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(EllpackPageImpl const* page, - size_t n_rows, +ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample) - : original_page_(page), - batch_param_(std::move(batch_param)), + : batch_param_(std::move(batch_param)), subsample_(subsample), sample_row_index_(n_rows) {} @@ -218,15 +216,17 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(common::SpanGetBatches(batch_param_); + auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating page_.reset(new EllpackPageImpl( - batch_param_.gpu_id, original_page_->Cuts(), original_page_->is_dense, - original_page_->row_stride, sample_rows)); + batch_param_.gpu_id, first_page->Cuts(), first_page->is_dense, + first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); - for (auto& batch : dmat->GetBatches(batch_param_)) { + for (auto& batch : batch_iterator) { page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); } @@ -259,12 +259,10 @@ GradientBasedSample GradientBasedSampling::Sample(common::Span gpa } ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling( - EllpackPageImpl const* page, size_t n_rows, BatchParam batch_param, float subsample) - : original_page_(page), - batch_param_(std::move(batch_param)), + : batch_param_(std::move(batch_param)), subsample_(subsample), threshold_(n_rows + 1, 0.0f), grad_sum_(n_rows, 0.0f), @@ -300,15 +298,17 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(common::SpanGetBatches(batch_param_); + auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(batch_param_.gpu_id, original_page_->Cuts(), - original_page_->is_dense, - original_page_->row_stride, sample_rows)); + page_.reset(new EllpackPageImpl(batch_param_.gpu_id, first_page->Cuts(), + first_page->is_dense, + first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); - for (auto& batch : dmat->GetBatches(batch_param_)) { + for (auto& batch : batch_iterator) { page_->Compact(batch_param_.gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); } @@ -329,7 +329,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page, switch (sampling_method) { case TrainParam::kUniform: if (is_external_memory) { - strategy_.reset(new ExternalMemoryUniformSampling(page, n_rows, batch_param, subsample)); + strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample)); } else { strategy_.reset(new UniformSampling(page, subsample)); } @@ -337,7 +337,7 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl const* page, case TrainParam::kGradientBased: if (is_external_memory) { strategy_.reset( - new ExternalMemoryGradientBasedSampling(page, n_rows, batch_param, subsample)); + new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample)); } else { strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample)); } diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh index 989d994f9..5be6c71de 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cuh +++ b/src/tree/gpu_hist/gradient_based_sampler.cuh @@ -66,14 +66,12 @@ class UniformSampling : public SamplingStrategy { /*! \brief No sampling in external memory mode. */ class ExternalMemoryUniformSampling : public SamplingStrategy { public: - ExternalMemoryUniformSampling(EllpackPageImpl const* page, - size_t n_rows, + ExternalMemoryUniformSampling(size_t n_rows, BatchParam batch_param, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl const* original_page_; BatchParam batch_param_; float subsample_; std::unique_ptr page_; @@ -100,14 +98,12 @@ class GradientBasedSampling : public SamplingStrategy { /*! \brief Gradient-based sampling in external memory mode.. */ class ExternalMemoryGradientBasedSampling : public SamplingStrategy { public: - ExternalMemoryGradientBasedSampling(EllpackPageImpl const* page, - size_t n_rows, + ExternalMemoryGradientBasedSampling(size_t n_rows, BatchParam batch_param, float subsample); GradientBasedSample Sample(common::Span gpair, DMatrix* dmat) override; private: - EllpackPageImpl const* original_page_; BatchParam batch_param_; float subsample_; dh::caching_device_vector threshold_; diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py index 5b9130301..2975834f9 100644 --- a/tests/python-gpu/test_gpu_data_iterator.py +++ b/tests/python-gpu/test_gpu_data_iterator.py @@ -17,16 +17,23 @@ def test_gpu_single_batch() -> None: @pytest.mark.skipif(**no_cupy()) @given( - strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13) + strategies.integers(0, 1024), + strategies.integers(1, 7), + strategies.integers(0, 13), + strategies.booleans(), ) @settings(deadline=None) def test_gpu_data_iterator( - n_samples_per_batch: int, n_features: int, n_batches: int + n_samples_per_batch: int, n_features: int, n_batches: int, subsample: bool ) -> None: - run_data_iterator(n_samples_per_batch, n_features, n_batches, "gpu_hist", True) - run_data_iterator(n_samples_per_batch, n_features, n_batches, "gpu_hist", False) + run_data_iterator( + n_samples_per_batch, n_features, n_batches, "gpu_hist", subsample, True + ) + run_data_iterator( + n_samples_per_batch, n_features, n_batches, "gpu_hist", subsample, False + ) def test_cpu_data_iterator() -> None: """Make sure CPU algorithm can handle GPU inputs""" - run_data_iterator(1024, 2, 3, "approx", True) + run_data_iterator(1024, 2, 3, "approx", False, True) diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 1e43fcf77..946127d13 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -68,9 +68,14 @@ def run_data_iterator( n_features: int, n_batches: int, tree_method: str, + subsample: bool, use_cupy: bool, ) -> None: n_rounds = 2 + # The test is more difficult to pass if the subsample rate is smaller as the root_sum + # is accumulated in parallel. Reductions with different number of entries lead to + # different floating point errors. + subsample_rate = 0.8 if subsample else 1.0 it = IteratorForTest( *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy) @@ -84,9 +89,19 @@ def run_data_iterator( assert Xy.num_row() == n_samples_per_batch * n_batches assert Xy.num_col() == n_features + parameters = { + "tree_method": tree_method, + "max_depth": 2, + "subsample": subsample_rate, + "seed": 0, + } + + if tree_method == "gpu_hist": + parameters["sampling_method"] = "gradient_based" + results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {} from_it = xgb.train( - {"tree_method": tree_method, "max_depth": 2}, + parameters, Xy, num_boost_round=n_rounds, evals=[(Xy, "Train")], @@ -102,7 +117,7 @@ def run_data_iterator( results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {} from_arrays = xgb.train( - {"tree_method": tree_method, "max_depth": 2}, + parameters, Xy, num_boost_round=n_rounds, evals=[(Xy, "Train")], @@ -126,11 +141,21 @@ def run_data_iterator( @given( - strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13) + strategies.integers(0, 1024), + strategies.integers(1, 7), + strategies.integers(0, 13), + strategies.booleans(), ) @settings(deadline=None) def test_data_iterator( - n_samples_per_batch: int, n_features: int, n_batches: int + n_samples_per_batch: int, + n_features: int, + n_batches: int, + subsample: bool, ) -> None: - run_data_iterator(n_samples_per_batch, n_features, n_batches, "approx", False) - run_data_iterator(n_samples_per_batch, n_features, n_batches, "hist", False) + run_data_iterator( + n_samples_per_batch, n_features, n_batches, "approx", subsample, False + ) + run_data_iterator( + n_samples_per_batch, n_features, n_batches, "hist", subsample, False + )