Initial support for column-split cpu predictor (#8676)

2023-01-17 14:33:13 -08:00 · 2023-01-17 14:33:13 -08:00 · 78396f8a6e
commit 78396f8a6e
parent 980233e648
4 changed files with 334 additions and 11 deletions
--- a/src/collective/communicator.h
+++ b/src/collective/communicator.h
@ -207,6 +207,8 @@ class Communicator {
      result = CommunicatorType::kRabit;
    } else if (!CompareStringsCaseInsensitive("federated", str)) {
      result = CommunicatorType::kFederated;
    } else if (!CompareStringsCaseInsensitive("in-memory", str)) {
      result = CommunicatorType::kInMemory;
    } else {
      LOG(FATAL) << "Unknown communicator type " << str;
    }
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@ -8,12 +8,12 @@
 #include <limits>
 #include <mutex>
 #include "../collective/communicator-inl.h"
 #include "../common/categorical.h"
 #include "../common/math.h"
 #include "../common/threading_utils.h"
 #include "../data/adapter.h"
 #include "../data/gradient_index.h"
 #include "../data/proxy_dmatrix.h"
 #include "../gbm/gbtree_model.h"
 #include "cpu_treeshap.h"  // CalculateContributions
 #include "predict_fn.h"
@ -23,7 +23,6 @@
 #include "xgboost/logging.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
 namespace xgboost {
 namespace predictor {
@ -284,16 +283,277 @@ void FillNodeMeanValues(RegTree const* tree, std::vector<float>* mean_values) {
  FillNodeMeanValues(tree, 0, mean_values);
 }
-class CPUPredictor : public Predictor {
+namespace {
- protected:
+// init thread buffers
-  // init thread buffers
 // Grows `*out` so that it holds at least `nthread` FVec entries. A vector that is already large
 // enough is left untouched, i.e. the buffer is never shrunk by this helper.
+static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
-  static void InitThreadTemp(int nthread, std::vector<RegTree::FVec> *out) {
+  int prev_thread_temp_size = out->size();
-    int prev_thread_temp_size = out->size();
+  if (prev_thread_temp_size < nthread) {
-    if (prev_thread_temp_size < nthread) {
+    out->resize(nthread, RegTree::FVec());
-      out->resize(nthread, RegTree::FVec());
+  }
 }
 }  // anonymous namespace
 /**
 * @brief A helper class for prediction when the DMatrix is split by column.
 *
 * When data is split by column, a local DMatrix only contains a subset of features. All the workers
 * in a distributed/federated environment need to cooperate to produce a prediction. This is done in
 * two passes with the help of bit vectors.
 *
 * First pass:
 * for each tree:
 *   for each row:
 *     for each node:
 *       if the feature is available and its value does not take the left branch (numerical:
 *       fvalue >= split condition; categorical: the category test fails), mark the decision bit
 *       if the feature is missing, mark the missing bit
 *
 * Once the two bit vectors are populated, run allreduce on both, using bitwise OR for the decision
 * bits, and bitwise AND for the missing bits.
 *
 * Second pass:
 * for each tree:
 *   for each row:
 *     find the leaf node using the decision and missing bits, return the leaf value
 *
 * The size of the decision/missing bit vector is:
 *   number of rows in a batch * sum(number of nodes in each tree)
 */
 class ColumnSplitHelper {
 public:
  ColumnSplitHelper(std::int32_t n_threads, gbm::GBTreeModel const &model, uint32_t tree_begin,
                    uint32_t tree_end)
      : n_threads_{n_threads}, model_{model}, tree_begin_{tree_begin}, tree_end_{tree_end} {
    auto const n_trees = tree_end_ - tree_begin_;
    tree_sizes_.resize(n_trees);
    tree_offsets_.resize(n_trees);
    for (auto i = 0; i < n_trees; i++) {
      auto const &tree = *model_.trees[tree_begin_ + i];
      tree_sizes_[i] = tree.GetNodes().size();
    }
    // std::exclusive_scan (only available in c++17) equivalent to get tree offsets.
    tree_offsets_[0] = 0;
    for (auto i = 1; i < n_trees; i++) {
      tree_offsets_[i] = tree_offsets_[i - 1] + tree_sizes_[i - 1];
    }
    bits_per_row_ = tree_offsets_.back() + tree_sizes_.back();
    InitThreadTemp(n_threads_ * kBlockOfRowsSize, &feat_vecs_);
  }
  // Disable copy (and move) semantics.
  ColumnSplitHelper(ColumnSplitHelper const &) = delete;
  ColumnSplitHelper &operator=(ColumnSplitHelper const &) = delete;
  ColumnSplitHelper(ColumnSplitHelper &&) noexcept = delete;
  ColumnSplitHelper &operator=(ColumnSplitHelper &&) noexcept = delete;
  void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
    CHECK(xgboost::collective::IsDistributed())
        << "column-split prediction is only supported for distributed training";
    for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
      CHECK_EQ(out_preds->size(),
               p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group);
      PredictBatchKernel<SparsePageView, kBlockOfRowsSize>(SparsePageView{&batch}, out_preds);
    }
  }
 private:
  using BitVector = RBitField8;
  void InitBitVectors(std::size_t n_rows) {
    n_rows_ = n_rows;
    auto const size = BitVector::ComputeStorageSize(bits_per_row_ * n_rows_);
    decision_storage_.resize(size);
    decision_bits_ = BitVector(common::Span<BitVector::value_type>(decision_storage_));
    missing_storage_.resize(size);
    missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
  }
  void ClearBitVectors() {
    std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
    std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
  }
  std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, std::size_t node_id) const {
    size_t tree_index = tree_id - tree_begin_;
    return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id;
  }
  void AllreduceBitVectors() {
    collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
                                                             decision_storage_.size());
    collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
                                                              missing_storage_.size());
  }
  void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) {
    auto const &tree = *model_.trees[tree_id];
    auto const &cats = tree.GetCategoriesMatrix();
    auto const has_categorical = tree.HasCategoricalSplit();
    for (auto nid = 0; nid < tree.GetNodes().size(); nid++) {
      auto const &node = tree[nid];
      if (node.IsDeleted() || node.IsLeaf()) {
        continue;
      }
      auto const bit_index = BitIndex(tree_id, row_id, nid);
      unsigned split_index = node.SplitIndex();
      if (feat.IsMissing(split_index)) {
        missing_bits_.Set(bit_index);
        continue;
      }
      auto const fvalue = feat.GetFvalue(split_index);
      if (has_categorical && common::IsCat(cats.split_type, nid)) {
        auto const node_categories =
            cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
        if (!common::Decision(node_categories, fvalue)) {
          decision_bits_.Set(bit_index);
        }
        continue;
      }
      if (fvalue >= node.SplitCond()) {
        decision_bits_.Set(bit_index);
      }
    }
  }
  void MaskAllTrees(std::size_t batch_offset, std::size_t fvec_offset, std::size_t block_size) {
    for (auto tree_id = tree_begin_; tree_id < tree_end_; ++tree_id) {
      for (size_t i = 0; i < block_size; ++i) {
        MaskOneTree(feat_vecs_[fvec_offset + i], tree_id, batch_offset + i);
      }
    }
  }
  bst_node_t GetNextNode(RegTree::Node const &node, std::size_t bit_index) {
    if (missing_bits_.Check(bit_index)) {
      return node.DefaultChild();
    } else {
      return node.LeftChild() + decision_bits_.Check(bit_index);
    }
  }
  bst_node_t GetLeafIndex(RegTree const &tree, std::size_t tree_id, std::size_t row_id) {
    bst_node_t nid = 0;
    while (!tree[nid].IsLeaf()) {
      auto const bit_index = BitIndex(tree_id, row_id, nid);
      nid = GetNextNode(tree[nid], bit_index);
    }
    return nid;
  }
  bst_float PredictOneTree(std::size_t tree_id, std::size_t row_id) {
    auto const &tree = *model_.trees[tree_id];
    auto const leaf = GetLeafIndex(tree, tree_id, row_id);
    return tree[leaf].LeafValue();
  }
  void PredictAllTrees(std::vector<bst_float> *out_preds, std::size_t batch_offset,
                       std::size_t predict_offset, std::size_t num_group, std::size_t block_size) {
    auto &preds = *out_preds;
    for (size_t tree_id = tree_begin_; tree_id < tree_end_; ++tree_id) {
      auto const gid = model_.tree_info[tree_id];
      for (size_t i = 0; i < block_size; ++i) {
        preds[(predict_offset + i) * num_group + gid] += PredictOneTree(tree_id, batch_offset + i);
      }
    }
  }
  template <typename DataView, size_t block_of_rows_size>
  void PredictBatchKernel(DataView batch, std::vector<bst_float> *out_preds) {
    auto const num_group = model_.learner_model_param->num_output_group;
    CHECK_EQ(model_.param.size_leaf_vector, 0) << "size_leaf_vector is enforced to 0 so far";
    // parallel over local batch
    auto const nsize = batch.Size();
    auto const num_feature = model_.learner_model_param->num_feature;
    auto const n_blocks = common::DivRoundUp(nsize, block_of_rows_size);
    InitBitVectors(nsize);
    // auto block_id has the same type as `n_blocks`.
    common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
      auto const batch_offset = block_id * block_of_rows_size;
      auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
      auto const fvec_offset = omp_get_thread_num() * block_of_rows_size;
      FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset, &feat_vecs_);
      MaskAllTrees(batch_offset, fvec_offset, block_size);
      FVecDrop(block_size, batch_offset, &batch, fvec_offset, &feat_vecs_);
    });
    AllreduceBitVectors();
    // auto block_id has the same type as `n_blocks`.
    common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
      auto const batch_offset = block_id * block_of_rows_size;
      auto const block_size = std::min(nsize - batch_offset, block_of_rows_size);
      PredictAllTrees(out_preds, batch_offset, batch_offset + batch.base_rowid, num_group,
                      block_size);
    });
    ClearBitVectors();
  }
  static std::size_t constexpr kBlockOfRowsSize = 64;
  std::int32_t const n_threads_;
  gbm::GBTreeModel const &model_;
  uint32_t const tree_begin_;
  uint32_t const tree_end_;
  std::vector<std::size_t> tree_sizes_{};
  std::vector<std::size_t> tree_offsets_{};
  std::size_t bits_per_row_{};
  std::vector<RegTree::FVec> feat_vecs_{};
  std::size_t n_rows_;
  /**
   * @brief Stores decision bit for each split node.
   *
   * Conceptually it's a 3-dimensional bit matrix:
   *   - 1st dimension is the tree index, from `tree_begin_` to `tree_end_`.
   *   - 2nd dimension is the row index, for each row in the batch.
   *   - 3rd dimension is the node id, for each node in the tree.
   *
   * Since we have to ship the whole thing over the wire to do an allreduce, the matrix is flattened
   * into a 1-dimensional array.
   *
   * First, it's divided by the tree index:
   *
   * [ tree 0 ] [ tree 1 ] ...
   *
   * Then each tree is divided by row:
   *
   * [             tree 0              ] [           tree 1     ] ...
   * [ row 0 ] [ row 1 ] ... [ row n-1 ] [ row 0 ] ...
   *
   * Finally, each row is divided by the node id:
   *
   * [                             tree 0                                         ]
   * [              row 0                 ] [        row 1           ] ...
   * [ node 0 ] [ node 1 ] ... [ node n-1 ] [ node 0 ] ...
   *
   * The first two dimensions are fixed length, while the last dimension is variable length since
   * each tree may have a different number of nodes. We precompute the tree offsets, which are the
   * cumulative sums of tree sizes. The index of tree t, row r, node n is:
   *   index(t, r, n) = tree_offsets[t] * n_rows + r * tree_sizes[t] + n 
   */
  std::vector<BitVector::value_type> decision_storage_{};
  BitVector decision_bits_{};
  /**
   * @brief Stores whether the feature is missing for each split node.
   *
   * See above for the storage layout.
   */
  std::vector<BitVector::value_type> missing_storage_{};
  BitVector missing_bits_{};
 };
 class CPUPredictor : public Predictor {
 protected:
  void PredictGHistIndex(DMatrix *p_fmat, gbm::GBTreeModel const &model, int32_t tree_begin,
                         int32_t tree_end, std::vector<bst_float> *out_preds) const {
    auto const n_threads = this->ctx_->Threads();
@ -323,6 +583,12 @@ class CPUPredictor : public Predictor {
  void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds,
                      gbm::GBTreeModel const &model, int32_t tree_begin, int32_t tree_end) const {
    if (p_fmat->Info().data_split_mode == DataSplitMode::kCol) {
      ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end);
      helper.PredictDMatrix(p_fmat, out_preds);
      return;
    }
    if (!p_fmat->PageExists<SparsePage>()) {
      this->PredictGHistIndex(p_fmat, model, tree_begin, tree_end, out_preds);
      return;
--- a/tests/cpp/collective/test_communicator.cc
+++ b/tests/cpp/collective/test_communicator.cc
@ -12,14 +12,17 @@ namespace collective {
 TEST(CommunicatorFactory, TypeFromEnv) {
  // Checks how the communicator type is derived from the XGBOOST_COMMUNICATOR environment variable.
  // The first expectation runs before this test sets the variable (assumes the surrounding
  // environment leaves it unset -- TODO confirm).
  EXPECT_EQ(CommunicatorType::kUnknown, Communicator::GetTypeFromEnv());
  // An unrecognised name is expected to be reported as dmlc::Error.
  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "foo");
  EXPECT_THROW(Communicator::GetTypeFromEnv(), dmlc::Error);
  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "rabit");
  EXPECT_EQ(CommunicatorType::kRabit, Communicator::GetTypeFromEnv());
  // Mixed-case spellings are expected to resolve to the same type.
  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "Federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromEnv());
-  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "foo");
+  dmlc::SetEnv<std::string>("XGBOOST_COMMUNICATOR", "In-Memory");
-  EXPECT_THROW(Communicator::GetTypeFromEnv(), dmlc::Error);
+  EXPECT_EQ(CommunicatorType::kInMemory, Communicator::GetTypeFromEnv());
 }
 TEST(CommunicatorFactory, TypeFromArgs) {
@ -32,6 +35,9 @@ TEST(CommunicatorFactory, TypeFromArgs) {
  config["xgboost_communicator"] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(config));
  config["xgboost_communicator"] = String("in-memory");
  EXPECT_EQ(CommunicatorType::kInMemory, Communicator::GetTypeFromConfig(config));
  config["xgboost_communicator"] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(config), dmlc::Error);
 }
@ -46,6 +52,9 @@ TEST(CommunicatorFactory, TypeFromArgsUpperCase) {
  config["XGBOOST_COMMUNICATOR"] = String("federated");
  EXPECT_EQ(CommunicatorType::kFederated, Communicator::GetTypeFromConfig(config));
  config["XGBOOST_COMMUNICATOR"] = String("in-memory");
  EXPECT_EQ(CommunicatorType::kInMemory, Communicator::GetTypeFromConfig(config));
  config["XGBOOST_COMMUNICATOR"] = String("foo");
  EXPECT_THROW(Communicator::GetTypeFromConfig(config), dmlc::Error);
 }
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@ -4,6 +4,9 @@
 #include <gtest/gtest.h>
 #include <xgboost/predictor.h>
 #include <thread>
 #include "../../../src/collective/communicator-inl.h"
 #include "../../../src/data/adapter.h"
 #include "../../../src/data/proxy_dmatrix.h"
 #include "../../../src/gbm/gbtree.h"
@ -86,6 +89,49 @@ TEST(CpuPredictor, Basic) {
  }
 }
 TEST(CpuPredictor, ColumnSplit) {
  // Every in-memory worker predicts on its own column slice of one shared matrix; after the
  // workers cooperate, each of them must see the same prediction for every row.
  size_t constexpr kRows = 5;
  size_t constexpr kCols = 5;
  size_t constexpr kWorldSize = 2;
  size_t constexpr kSliceSize = (kCols + 1) / kWorldSize;
  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
  // Body of a single worker; `rank` identifies it inside the in-memory communicator.
  auto const run_worker = [=, &dmat](int rank) {
    Json config{JsonObject()};
    config["xgboost_communicator"] = String("in-memory");
    config["in_memory_world_size"] = kWorldSize;
    config["in_memory_rank"] = rank;
    xgboost::collective::Init(config);
    auto lparam = CreateEmptyGenericParam(GPUIDX);
    std::unique_ptr<Predictor> cpu_predictor{Predictor::Create("cpu_predictor", &lparam)};
    LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
    Context ctx;
    ctx.UpdateAllowUnknown(Args{});
    gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
    // Test predict batch
    PredictionCacheEntry out_predictions;
    cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
    // The output is sized from the full matrix, the prediction only sees this worker's columns.
    std::unique_ptr<DMatrix> local_slice{dmat->SliceCol(rank * kSliceSize, kSliceSize)};
    cpu_predictor->PredictBatch(local_slice.get(), &out_predictions, model, 0);
    std::vector<float>& h_preds = out_predictions.predictions.HostVector();
    auto const n_preds = out_predictions.predictions.Size();
    for (size_t idx = 0; idx < n_preds; ++idx) {
      ASSERT_EQ(h_preds[idx], 1.5);
    }
    xgboost::collective::Finalize();
  };
  std::vector<std::thread> workers;
  for (int rank = 0; rank < static_cast<int>(kWorldSize); ++rank) {
    workers.emplace_back(run_worker, rank);
  }
  for (auto& worker : workers) {
    worker.join();
  }
 }
 TEST(CpuPredictor, IterationRange) {
  TestIterationRange("cpu_predictor");