Make QuantileDMatrix default to sklearn estimators. (#8220)

2022-09-13 13:52:19 +08:00
parent a2686543a9
commit bdf265076d
6 changed files with 91 additions and 39 deletions
--- a/src/data/iterative_dmatrix.cc
+++ b/src/data/iterative_dmatrix.cc
@@ -7,6 +7,7 @@

 #include "../common/column_matrix.h"
 #include "../common/hist_util.h"
+#include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
@@ -14,6 +15,38 @@
 namespace xgboost {
 namespace data {

+IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
+                                   std::shared_ptr<DMatrix> ref, DataIterResetCallback* reset,
+                                   XGDMatrixCallbackNext* next, float missing, int nthread,
+                                   bst_bin_t max_bin)
+    : proxy_{proxy}, reset_{reset}, next_{next} {
+  // Reset the iterator and fetch the first batch; at least one batch must be available.
+  auto iter =
+      DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_handle, reset_, next_};
+  iter.Reset();
+  bool valid = iter.Next();
+  CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
+
+  auto d = MakeProxy(proxy_)->DeviceIdx();  // device index reported by the proxy
+
+  StringView msg{"All batch should be on the same device."};
+  if (batch_param_.gpu_id != Context::kCpuId) {  // NOTE(review): read before it is set below
+    CHECK_EQ(d, batch_param_.gpu_id) << msg;
+  }
+
+  batch_param_ = BatchParam{d, max_bin};
+  // Hardcoded to the default sparse threshold shared with tree::TrainParam.
+  batch_param_.sparse_thresh = tree::TrainParam::DftSparseThreshold();
+
+  ctx_.UpdateAllowUnknown(
+      Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
+  if (ctx_.IsCPU()) {  // dispatch to the CPU or CUDA initializer based on the context
+    this->InitFromCPU(iter_handle, missing, ref);
+  } else {
+    this->InitFromCUDA(iter_handle, missing, ref);
+  }
+}
+
 void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
                    common::HistogramCuts* p_cuts) {
  CHECK(ref_);
@@ -199,6 +232,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
  if (n_batches == 1) {
    this->info_ = std::move(proxy->Info());
    this->info_.num_nonzero_ = nnz;
+    this->info_.num_col_ = n_features;  // proxy might be empty.
    CHECK_EQ(proxy->Info().labels.Size(), 0);
  }
 }
@@ -210,6 +244,10 @@ BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const&
    ghist_ = std::make_shared<GHistIndexMatrix>(&ctx_, Info(), *ellpack_, param);
  }

+  if (param.sparse_thresh != tree::TrainParam::DftSparseThreshold()) {
+    LOG(WARNING) << "`sparse_threshold` can not be changed when `QuantileDMatrix` is used instead "
+                    "of `DMatrix`.";
+  }
  auto begin_iter =
      BatchIterator<GHistIndexMatrix>(new SimpleBatchIteratorImpl<GHistIndexMatrix>(ghist_));
  return BatchSet<GHistIndexMatrix>(begin_iter);
--- a/src/data/iterative_dmatrix.cu
+++ b/src/data/iterative_dmatrix.cu
@@ -173,8 +173,15 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& para
  }
  if (!ellpack_ && ghist_) {
    ellpack_.reset(new EllpackPage());
-    this->ctx_.gpu_id = param.gpu_id;
-    this->Info().feature_types.SetDevice(param.gpu_id);
+    // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU
+    // ID: use the one in `param` first, then fall back to the current device if still on CPU.
+    if (this->ctx_.IsCPU()) {
+      this->ctx_.gpu_id = param.gpu_id;
+    }
+    if (this->ctx_.IsCPU()) {
+      this->ctx_.gpu_id = dh::CurrentDevice();
+    }
+    this->Info().feature_types.SetDevice(this->ctx_.gpu_id);
    *ellpack_->Impl() =
        EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
  }
--- a/src/data/iterative_dmatrix.h
+++ b/src/data/iterative_dmatrix.h
@@ -75,30 +75,7 @@ class IterativeDMatrix : public DMatrix {
  explicit IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
                            std::shared_ptr<DMatrix> ref, DataIterResetCallback *reset,
                            XGDMatrixCallbackNext *next, float missing, int nthread,
-                            bst_bin_t max_bin)
-      : proxy_{proxy}, reset_{reset}, next_{next} {
-    // fetch the first batch
-    auto iter =
-        DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_handle, reset_, next_};
-    iter.Reset();
-    bool valid = iter.Next();
-    CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
-
-    auto d = MakeProxy(proxy_)->DeviceIdx();
-    if (batch_param_.gpu_id != Context::kCpuId) {
-      CHECK_EQ(d, batch_param_.gpu_id) << "All batch should be on the same device.";
-    }
-    batch_param_ = BatchParam{d, max_bin};
-    batch_param_.sparse_thresh = 0.2;  // default from TrainParam
-
-    ctx_.UpdateAllowUnknown(
-        Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
-    if (ctx_.IsCPU()) {
-      this->InitFromCPU(iter_handle, missing, ref);
-    } else {
-      this->InitFromCUDA(iter_handle, missing, ref);
-    }
-  }
+                            bst_bin_t max_bin);
  ~IterativeDMatrix() override = default;

  bool EllpackExists() const override { return static_cast<bool>(ellpack_); }
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -78,7 +78,9 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
  // ------ From CPU quantile histogram -------.
  // percentage threshold for treating a feature as sparse
  // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse
-  double sparse_threshold;
+  static constexpr double DftSparseThreshold() { return 0.2; }  // default `sparse_threshold`
+
+  double sparse_threshold{DftSparseThreshold()};

  // declare the parameters
  DMLC_DECLARE_PARAMETER(TrainParam) {
@@ -182,7 +184,9 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
                  "See tutorial for more information");

    // ------ From cpu quantile histogram -------.
-    DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2)
+    DMLC_DECLARE_FIELD(sparse_threshold)
+        .set_range(0, 1.0)
+        .set_default(DftSparseThreshold())
        .describe("percentage threshold for treating a feature as sparse");

    // add alias of parameters