[EM] Handle base idx in GPU histogram. (#10549)

2024-07-11 03:26:30 +08:00
parent 34b154c284
commit 5f910cd4ff
8 changed files with 167 additions and 71 deletions
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -1,8 +1,7 @@
 /**
 * Copyright 2020-2024, XGBoost Contributors
 */
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/reduce.h>
+#include <thrust/iterator/transform_iterator.h>  // for make_transform_iterator

 #include <algorithm>
 #include <cstdint>  // uint32_t, int32_t
@@ -101,9 +100,8 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span<GradientPa
                                 static_cast<T>(1) / to_floating_point_.GetHess());
 }

-XGBOOST_DEV_INLINE void
-AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
-               xgboost::GradientPairInt64 const &gpair) {
+XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64* dest,
+                                             xgboost::GradientPairInt64 const& gpair) {
  auto dst_ptr = reinterpret_cast<int64_t *>(dest);
  auto g = gpair.GetQuantisedGrad();
  auto h = gpair.GetQuantisedHess();
@@ -131,7 +129,9 @@ template <int kBlockThreads, int kItemsPerThread,
 class HistogramAgent {
  GradientPairInt64* smem_arr_;
  GradientPairInt64* d_node_hist_;
-  dh::LDGIterator<const RowPartitioner::RowIndexT> d_ridx_;
+  using Idx = RowPartitioner::RowIndexT;
+
+  dh::LDGIterator<const Idx> d_ridx_;
  const GradientPair* d_gpair_;
  const FeatureGroup group_;
  const EllpackDeviceAccessor& matrix_;
@@ -142,8 +142,7 @@ class HistogramAgent {
 public:
  __device__ HistogramAgent(GradientPairInt64* smem_arr,
                            GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group,
-                            const EllpackDeviceAccessor& matrix,
-                            common::Span<const RowPartitioner::RowIndexT> d_ridx,
+                            const EllpackDeviceAccessor& matrix, common::Span<const Idx> d_ridx,
                            const GradientQuantiser& rounding, const GradientPair* d_gpair)
      : smem_arr_(smem_arr),
        d_node_hist_(d_node_hist),
@@ -154,15 +153,15 @@ class HistogramAgent {
        n_elements_(feature_stride_ * d_ridx.size()),
        rounding_(rounding),
        d_gpair_(d_gpair) {}
+
  __device__ void ProcessPartialTileShared(std::size_t offset) {
    for (std::size_t idx = offset + threadIdx.x;
         idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_);
         idx += kBlockThreads) {
-      int ridx = d_ridx_[idx / feature_stride_];
-      int gidx =
-          matrix_
-              .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_] -
-          group_.start_bin;
+      Idx ridx = d_ridx_[idx / feature_stride_];
+      Idx midx = (ridx - matrix_.base_rowid) * matrix_.row_stride + group_.start_feature +
+                  idx % feature_stride_;
+      bst_bin_t gidx = matrix_.gidx_iter[midx] - group_.start_bin;
      if (matrix_.is_dense || gidx != matrix_.NumBins()) {
        auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
        AtomicAddGpairShared(smem_arr_ + gidx, adjusted);
@@ -188,8 +187,8 @@ class HistogramAgent {
 #pragma unroll
    for (int i = 0; i < kItemsPerThread; i++) {
      gpair[i] = d_gpair_[ridx[i]];
-      gidx[i] = matrix_.gidx_iter[ridx[i] * matrix_.row_stride + group_.start_feature +
-                                 idx[i] % feature_stride_];
+      gidx[i] = matrix_.gidx_iter[(ridx[i] - matrix_.base_rowid) * matrix_.row_stride +
+                                  group_.start_feature + idx[i] % feature_stride_];
    }
 #pragma unroll
    for (int i = 0; i < kItemsPerThread; i++) {
@@ -200,7 +199,7 @@ class HistogramAgent {
    }
  }
  __device__ void BuildHistogramWithShared() {
-    dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64());
+    dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64{});
    __syncthreads();

    std::size_t offset = blockIdx.x * kItemsPerTile;
@@ -219,10 +218,9 @@ class HistogramAgent {

  __device__ void BuildHistogramWithGlobal() {
    for (auto idx : dh::GridStrideRange(static_cast<std::size_t>(0), n_elements_)) {
-      int ridx = d_ridx_[idx / feature_stride_];
-      int gidx =
-          matrix_
-              .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_];
+      Idx ridx = d_ridx_[idx / feature_stride_];
+      bst_bin_t gidx = matrix_.gidx_iter[(ridx - matrix_.base_rowid) * matrix_.row_stride +
+                                         group_.start_feature + idx % feature_stride_];
      if (matrix_.is_dense || gidx != matrix_.NumBins()) {
        auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]);
        AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted);
@@ -231,8 +229,7 @@ class HistogramAgent {
  }
 };

-template <bool use_shared_memory_histograms, int kBlockThreads,
-          int kItemsPerThread>
+template <bool use_shared_memory_histograms, int kBlockThreads, int kItemsPerThread>
 __global__ void __launch_bounds__(kBlockThreads)
    SharedMemHistKernel(const EllpackDeviceAccessor matrix,
                        const FeatureGroupsAccessor feature_groups,
@@ -251,6 +248,7 @@ __global__ void __launch_bounds__(kBlockThreads)
    agent.BuildHistogramWithGlobal();
  }
 }
+
 namespace {
 constexpr std::int32_t kBlockThreads = 1024;
 constexpr std::int32_t kItemsPerThread = 8;
--- a/src/tree/gpu_hist/histogram.cuh
+++ b/src/tree/gpu_hist/histogram.cuh
@@ -78,5 +78,4 @@ class DeviceHistogramBuilder {
                      common::Span<GradientPairInt64> histogram, GradientQuantiser rounding);
 };
 }  // namespace xgboost::tree
-
 #endif  // HISTOGRAM_CUH_
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -1,28 +1,23 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2024, XGBoost contributors
 */
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/iterator/transform_output_iterator.h>
-#include <thrust/sequence.h>
+#include <thrust/sequence.h>  // for sequence

-#include <vector>
+#include <vector>  // for vector

-#include "../../common/device_helpers.cuh"
+#include "../../common/cuda_context.cuh"    // for CUDAContext
+#include "../../common/device_helpers.cuh"  // for CopyDeviceSpanToVector, ToSpan
 #include "row_partitioner.cuh"

-namespace xgboost {
-namespace tree {
-
-RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
-    : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
+namespace xgboost::tree {
+RowPartitioner::RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid)
+    : device_idx_(ctx->Device()), ridx_(n_samples), ridx_tmp_(n_samples) {
  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
-  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
-  thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
+  ridx_segments_.emplace_back(NodePositionInfo{Segment(0, n_samples)});
+  thrust::sequence(ctx->CUDACtx()->CTP(), ridx_.data(), ridx_.data() + ridx_.size(), base_rowid);
 }

-RowPartitioner::~RowPartitioner() {
-  dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
-}
+RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); }

 common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
  auto segment = ridx_segments_.at(nidx).segment;
@@ -39,6 +34,4 @@ std::vector<RowPartitioner::RowIndexT> RowPartitioner::GetRowsHost(bst_node_t ni
  dh::CopyDeviceSpanToVector(&rows, span);
  return rows;
 }
-
-};  // namespace tree
-};  // namespace xgboost
+};  // namespace xgboost::tree
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -1,17 +1,17 @@
-/*!
- * Copyright 2017-2022 XGBoost contributors
+/**
+ * Copyright 2017-2024, XGBoost contributors
 */
 #pragma once
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>          // for make_counting_iterator
+#include <thrust/iterator/transform_output_iterator.h>  // for make_transform_output_iterator

-#include <limits>
-#include <vector>
+#include <algorithm>  // for max
+#include <vector>     // for vector

-#include "../../common/device_helpers.cuh"
-#include "xgboost/base.h"
-#include "xgboost/context.h"
-#include "xgboost/task.h"
-#include "xgboost/tree_model.h"
+#include "../../common/device_helpers.cuh"  // for MakeTransformIterator
+#include "xgboost/base.h"                   // for bst_idx_t
+#include "xgboost/context.h"                // for Context

 namespace xgboost {
 namespace tree {
@@ -223,7 +223,12 @@ class RowPartitioner {
  dh::PinnedMemory pinned2_;

 public:
-  RowPartitioner(DeviceOrd device_idx, size_t num_rows);
+  /**
+   * @param ctx Context for device ordinal and stream.
+   * @param n_samples The number of samples in each batch.
+   * @param base_rowid The base row index for the current batch.
+   */
+  RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid);
  ~RowPartitioner();
  RowPartitioner(const RowPartitioner&) = delete;
  RowPartitioner& operator=(const RowPartitioner&) = delete;
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -251,7 +251,8 @@ struct GPUHistMakerDevice {
    quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, dmat->Info());

    row_partitioner.reset();  // Release the device memory first before reallocating
-    row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
+    CHECK_EQ(page->base_rowid, 0);
+    row_partitioner = std::make_unique<RowPartitioner>(ctx_, sample.sample_rows, page->base_rowid);

    // Init histogram
    hist.Init(ctx_->Device(), page->Cuts().TotalBins());