Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, parts of which have already been merged into XGBoost. The new implementation features:
- Federated learning support for both CPU and GPU.
- NCCL support for GPU communication.
- Support for more data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both the tracker and the workers.
- Exhaustive tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for the Python and JVM packages (see the Python sketch after this list).
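
As a rough illustration of the last two points, here is a hedged sketch of driving the reusable tracker and the collective communicator from Python. The exact API surface (`RabitTracker` constructor arguments, `worker_args`, `wait_for`) follows the Python package around this change and should be treated as an approximation, not a definitive reference:

```python
from xgboost import collective
from xgboost.tracker import RabitTracker

# One tracker process serves Python and JVM workers alike.
# n_workers=1 keeps this sketch runnable inside a single process.
tracker = RabitTracker(n_workers=1, host_ip="127.0.0.1")
tracker.start()
args = tracker.worker_args()  # connection info to hand to each worker

# Worker side: in a real deployment one such block runs per worker process.
with collective.CommunicatorContext(**args):
    print("rank", collective.get_rank(), "of", collective.get_world_size())

tracker.wait_for()  # bounded wait for workers to finish (improved timeout handling)
```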
Author: Jiaming Yuan
Date: 2024-05-20 11:56:23 +08:00
Committed by: GitHub
Commit: a5a58102e5 (parent ba9b4cb1ee)
195 changed files with 2768 additions and 9234 deletions

@@ -1,9 +1,9 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
  */
 #include <thrust/copy.h>  // for copy
 #include <thrust/scan.h>
 #include <algorithm>
 #include <cassert>
 #include <cub/cub.cuh>  // NOLINT
 #include <limits>
@@ -11,7 +11,7 @@
#include <tuple>
#include <utility>
#include "../collective/communicator-inl.cuh"
#include "../collective/allreduce.h"
#include "../common/algorithm.cuh" // SegmentedArgSort
#include "../common/optional_weight.h" // OptionalWeights
#include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads
@@ -201,13 +201,16 @@ void Transpose(common::Span<float const> in, common::Span<float> out, size_t m,
   });
 }
 
-double ScaleClasses(Context const *ctx, common::Span<double> results,
+double ScaleClasses(Context const *ctx, bool is_column_split, common::Span<double> results,
                     common::Span<double> local_area, common::Span<double> tp,
                     common::Span<double> auc, size_t n_classes) {
-  if (collective::IsDistributed()) {
-    int32_t device = dh::CurrentDevice();
+  // With vertical federated learning, only the root has labels; the other
+  // parties do not run the evaluation metrics.
+  if (collective::IsDistributed() && !(is_column_split && collective::IsFederated())) {
+    std::int32_t device = dh::CurrentDevice();
     CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device);
-    collective::AllReduce<collective::Operation::kSum>(device, results.data(), results.size());
+    auto rc = collective::Allreduce(
+        ctx, linalg::MakeVec(results.data(), results.size(), ctx->Device()), collective::Op::kSum);
   }
   auto reduce_in = dh::MakeTransformIterator<Pair>(
       thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
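
The hunk above is representative of the interface change throughout: the old type-templated `AllReduce<Operation::kSum>` on raw device pointers becomes a single `collective::Allreduce` over a `linalg` vector, which returns a result object instead of aborting internally. The same unified reduction is reachable from Python; a hedged sketch, assuming the `xgboost.collective.allreduce`/`Op` surface at the time of this change:

```python
import numpy as np

from xgboost import collective

# Runs inside a CommunicatorContext (see the tracker sketch above). The
# reduction is routed to RABIT, NCCL, or the federated backend depending
# on how the communicator was configured.
local = np.array([1.0, 2.0, 3.0])
total = collective.allreduce(local, collective.Op.SUM)  # elementwise sum across workers
print(total)  # with a single worker, identical to `local`
```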
@@ -334,7 +337,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
   auto local_area = d_results.subspan(0, n_classes);
   auto tp = d_results.subspan(2 * n_classes, n_classes);
   auto auc = d_results.subspan(3 * n_classes, n_classes);
-  return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes);
+  return ScaleClasses(ctx, info.IsColumnSplit(), d_results, local_area, tp, auc, n_classes);
 }
 
 /**
@@ -438,7 +441,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
       tp[c] = 1.0f;
     }
   });
-  return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes);
+  return ScaleClasses(ctx, info.IsColumnSplit(), d_results, local_area, tp, auc, n_classes);
 }
 
 void MultiClassSortedIdx(Context const *ctx, common::Span<float const> predts,
@@ -835,7 +838,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
   InitCacheOnce<false>(predts, p_cache);
 
   dh::device_vector<bst_group_t> group_ptr(info.group_ptr_.size());
-  thrust::copy(info.group_ptr_.begin(), info.group_ptr_.end(), group_ptr.begin());
+  thrust::copy(info.group_ptr_.begin(), info.group_ptr_.end(), group_ptr.begin());  // NOLINT
   auto d_group_ptr = dh::ToSpan(group_ptr);
   CHECK_GE(info.group_ptr_.size(), 1) << "Must have at least 1 query group for LTR.";
   size_t n_groups = info.group_ptr_.size() - 1;