Optimize GPU evaluation function for categorical data. (#7705)

* Use transform and cache.
2022-02-28 17:46:29 +08:00
parent 18a4af63aa
commit 1d468e20a4
3 changed files with 77 additions and 30 deletions
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -51,6 +51,12 @@ class GPUHistEvaluator {
  dh::CUDAStream copy_stream_;
  // storage for sorted index of feature histogram, used for sort based splits.
  dh::device_vector<bst_feature_t> cat_sorted_idx_;
+  // cached input for sorting the histogram, used for sort based splits.
+  using SortPair = thrust::tuple<uint32_t, double>;
+  dh::device_vector<SortPair> sort_input_;
+  // cache for feature index
+  dh::device_vector<bst_feature_t> feature_idx_;
+  // Training param used for evaluation
  TrainParam param_;
  // whether the input data requires sort based split, which is more complicated so we try
  // to avoid it if possible.
@@ -95,6 +101,13 @@ class GPUHistEvaluator {
    return dh::ToSpan(cat_sorted_idx_);
  }

+  auto SortInput(EvaluateSplitInputs<GradientSumT> left) {  // view of cached sort input
+    if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {  // root, sorted idx non-empty
+      return dh::ToSpan(sort_input_).first(left.feature_values.size());  // leading part only
+    }
+    return dh::ToSpan(sort_input_);  // every other case: span over the entire cache
+  }
+
 public:
  GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
      : tree_evaluator_{param, n_features, device}, param_{param} {}