sync Jun 5

2023-06-07 02:43:21 +02:00
parent 9ee1852d4e 0cba2cdbb0
commit af8845405a
56 changed files with 531 additions and 2106 deletions
--- a/tests/buildkite/conftest.sh
+++ b/tests/buildkite/conftest.sh
@@ -24,7 +24,7 @@ set -x

 CUDA_VERSION=11.8.0
 NCCL_VERSION=2.16.5-1
-RAPIDS_VERSION=23.02
+RAPIDS_VERSION=23.04
 SPARK_VERSION=3.4.0
 JDK_VERSION=8

--- a/tests/buildkite/update-rapids.sh
+++ b/tests/buildkite/update-rapids.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Rewrite RAPIDS_VERSION in the conftest.sh next to this script to the latest cuML release (major.minor).
+set -euo pipefail
+LATEST_RAPIDS_VERSION=$(gh api repos/rapidsai/cuml/releases/latest --jq '.name' | sed -e 's/^v\([[:digit:]]\+\.[[:digit:]]\+\).*/\1/')
+echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION"
+# The sed above passes an unrecognized release name through unchanged; refuse to write that into conftest.sh.
+[[ "$LATEST_RAPIDS_VERSION" =~ ^[0-9]+\.[0-9]+$ ]] || { echo "Unexpected RAPIDS version: $LATEST_RAPIDS_VERSION" >&2; exit 1; }
+# Use `&&` so a failed cd aborts the script instead of silently resolving to the current directory.
+PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P )
+sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" "$PARENT_PATH/conftest.sh"
--- a/tests/ci_build/deploy_jvm_packages.sh
+++ b/tests/ci_build/deploy_jvm_packages.sh
@@ -18,8 +18,17 @@ rm -rf $(find . -name target)
 rm -rf ../build/

 # Re-build package without Mock Rabit
+# Maven profiles:
+# `default`           includes modules: xgboost4j, xgboost4j-spark, xgboost4j-flink, xgboost4j-example
+# `gpu`               includes modules: xgboost4j-gpu, xgboost4j-spark-gpu, sets `use.cuda = ON`
+# `scala-2.13`        sets the Scala binary version to 2.13
+# `release-to-s3`     sets maven deployment targets
+
 # Deploy to S3 bucket xgboost-maven-repo
-mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests
+mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
+# Deploy scala 2.13 to S3 bucket xgboost-maven-repo
+mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests
+

 set +x
 set +e
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -90,7 +90,7 @@ def check_cmd_print_failure_assistance(cmd: List[str]) -> bool:

    subprocess.run([cmd[0], "--version"])
    msg = """
-Please run the following command on your machine to address the formatting error:
+Please run the following command on your machine to address the error:

    """
    msg += " ".join(cmd)
--- a/tests/cpp/metric/test_rank_metric.h
+++ b/tests/cpp/metric/test_rank_metric.h
@@ -17,34 +17,30 @@
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/json.h"                // for Json, String, Object

-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {

 inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) {
-  // When the limit for precision is not given, it takes the limit at
-  // std::numeric_limits<unsigned>::max(); hence all values are very small
-  // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
  auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("pre", &ctx);
+  std::unique_ptr<xgboost::Metric> metric{Metric::Create("pre", &ctx)};
  ASSERT_STREQ(metric->Name(), "pre");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
-              0, 1e-7);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5, 1e-7);
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
+      0.5, 1e-7);

-  delete metric;
-  metric = xgboost::Metric::Create("pre@2", &ctx);
+  metric.reset(xgboost::Metric::Create("pre@2", &ctx));
  ASSERT_STREQ(metric->Name(), "pre@2");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}, {}, {}, data_split_mode),
-              0.5f, 0.001f);
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
+  EXPECT_NEAR(
+      GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
+      0.5f, 0.001f);

-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode));
+  EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));

-  delete metric;
+  metric.reset(xgboost::Metric::Create("pre@4", &ctx));
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f},
+                            {0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f}, {}, {}, data_split_mode),
+              0.5f, 1e-7);
 }

 inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) {
@@ -187,5 +183,4 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo
  ndcg = metric->Evaluate(predt, p_fmat);
  ASSERT_NEAR(ndcg, 1.0, kRtEps);
 }
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -17,13 +17,15 @@
 #include "test_predictor.h"

 namespace xgboost {
-TEST(CpuPredictor, Basic) {
+
+namespace {
+void TestBasic(DMatrix* dmat) {
  auto lparam = CreateEmptyGenericParam(GPUIDX);
  std::unique_ptr<Predictor> cpu_predictor =
      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));

-  size_t constexpr kRows = 5;
-  size_t constexpr kCols = 5;
+  size_t const kRows = dmat->Info().num_row_;
+  size_t const kCols = dmat->Info().num_col_;

  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};

@@ -31,12 +33,10 @@ TEST(CpuPredictor, Basic) {
  ctx.UpdateAllowUnknown(Args{});
  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

-  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-
  // Test predict batch
  PredictionCacheEntry out_predictions;
  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
+  cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0);

  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
@@ -44,26 +44,32 @@ TEST(CpuPredictor, Basic) {
  }

  // Test predict instance
-  auto const &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
+  auto const& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
  auto page = batch.GetView();
  for (size_t i = 0; i < batch.Size(); i++) {
    std::vector<float> instance_out_predictions;
-    cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model);
+    cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model, 0,
+                                   dmat->Info().IsColumnSplit());
    ASSERT_EQ(instance_out_predictions[0], 1.5);
  }

  // Test predict leaf
  HostDeviceVector<float> leaf_out_predictions;
-  cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
+  cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
  for (auto v : h_leaf_out_predictions) {
    ASSERT_EQ(v, 0);
  }

+  if (dmat->Info().IsColumnSplit()) {
+    // Predict contribution is not supported for column split.
+    return;
+  }
+
  // Test predict contribution
  HostDeviceVector<float> out_contribution_hdv;
  auto& out_contribution = out_contribution_hdv.HostVector();
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
+  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model);
  ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
  for (size_t i = 0; i < out_contribution.size(); ++i) {
    auto const& contri = out_contribution[i];
@@ -76,8 +82,7 @@ TEST(CpuPredictor, Basic) {
    }
  }
  // Test predict contribution (approximate method)
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model,
-                                     0, nullptr, true);
+  cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true);
  for (size_t i = 0; i < out_contribution.size(); ++i) {
    auto const& contri = out_contribution[i];
    // shift 1 for bias, as test tree is a decision dump, only global bias is
@@ -89,41 +94,32 @@ TEST(CpuPredictor, Basic) {
    }
  }
 }
+}  // anonymous namespace

-namespace {
-void TestColumnSplitPredictBatch() {
+TEST(CpuPredictor, Basic) {
  size_t constexpr kRows = 5;
  size_t constexpr kCols = 5;
  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+  TestBasic(dmat.get());
+}
+
+namespace {
+void TestColumnSplit() {
+  size_t constexpr kRows = 5;
+  size_t constexpr kCols = 5;
+  auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
+
  auto const world_size = collective::GetWorldSize();
  auto const rank = collective::GetRank();
+  dmat = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};

-  auto lparam = CreateEmptyGenericParam(GPUIDX);
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
-
-  LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
-
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
-
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  auto sliced = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};
-  cpu_predictor->PredictBatch(sliced.get(), &out_predictions, model, 0);
-
-  std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
-  for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
-    ASSERT_EQ(out_predictions_h[i], 1.5);
-  }
+  TestBasic(dmat.get());
 }
 }  // anonymous namespace

-TEST(CpuPredictor, ColumnSplit) {
+TEST(CpuPredictor, ColumnSplitBasic) {
  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplitPredictBatch);
+  RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit);
 }

 TEST(CpuPredictor, IterationRange) {
@@ -133,69 +129,8 @@ TEST(CpuPredictor, IterationRange) {
 TEST(CpuPredictor, ExternalMemory) {
  size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
  size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
-
  std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
-  auto lparam = CreateEmptyGenericParam(GPUIDX);
-
-  std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
-
-  LearnerModelParam mparam{MakeMP(dmat->Info().num_col_, .0, 1)};
-
-  Context ctx;
-  ctx.UpdateAllowUnknown(Args{});
-  gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
-
-  // Test predict batch
-  PredictionCacheEntry out_predictions;
-  cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
-  cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
-  std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
-  ASSERT_EQ(out_predictions.predictions.Size(), dmat->Info().num_row_);
-  for (const auto& v : out_predictions_h) {
-    ASSERT_EQ(v, 1.5);
-  }
-
-  // Test predict leaf
-  HostDeviceVector<float> leaf_out_predictions;
-  cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
-  auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
-  ASSERT_EQ(h_leaf_out_predictions.size(), dmat->Info().num_row_);
-  for (const auto& v : h_leaf_out_predictions) {
-    ASSERT_EQ(v, 0);
-  }
-
-  // Test predict contribution
-  HostDeviceVector<float> out_contribution_hdv;
-  auto& out_contribution = out_contribution_hdv.HostVector();
-  cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
-  ASSERT_EQ(out_contribution.size(), dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
-
-  // Test predict contribution (approximate method)
-  HostDeviceVector<float> out_contribution_approximate_hdv;
-  auto& out_contribution_approximate = out_contribution_approximate_hdv.HostVector();
-  cpu_predictor->PredictContribution(
-      dmat.get(), &out_contribution_approximate_hdv, model, 0, nullptr, true);
-  ASSERT_EQ(out_contribution_approximate.size(),
-            dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
-  for (size_t i = 0; i < out_contribution.size(); ++i) {
-    auto const& contri = out_contribution[i];
-    // shift 1 for bias, as test tree is a decision dump, only global bias is filled with LeafValue().
-    if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
-      ASSERT_EQ(out_contribution.back(), 1.5f);
-    } else {
-      ASSERT_EQ(contri, 0);
-    }
-  }
+  TestBasic(dmat.get());
 }

 TEST(CpuPredictor, InplacePredict) {
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -5,7 +5,7 @@ import pytest

 import xgboost
 from xgboost import testing as tm
-from xgboost.testing.metrics import check_quantile_error
+from xgboost.testing.metrics import check_precision_score, check_quantile_error

 sys.path.append("tests/python")
 import test_eval_metrics as test_em  # noqa
@@ -59,6 +59,9 @@ class TestGPUEvalMetrics:
    def test_pr_auc_ltr(self):
        self.cpu_test.run_pr_auc_ltr("gpu_hist")

+    def test_precision_score(self):
+        check_precision_score("gpu_hist")
+
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_quantile_error(self) -> None:
        check_quantile_error("gpu_hist")
--- a/tests/python/test_eval_metrics.py
+++ b/tests/python/test_eval_metrics.py
@@ -3,7 +3,7 @@ import pytest

 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.metrics import check_quantile_error
+from xgboost.testing.metrics import check_precision_score, check_quantile_error

 rng = np.random.RandomState(1337)

@@ -315,6 +315,9 @@ class TestEvalMetrics:
    def test_pr_auc_ltr(self):
        self.run_pr_auc_ltr("hist")

+    def test_precision_score(self):
+        check_precision_score("hist")
+
    @pytest.mark.skipif(**tm.no_sklearn())
    def test_quantile_error(self) -> None:
        check_quantile_error("hist")
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -55,6 +55,38 @@ class TestQuantileDMatrix:
        r = np.arange(1.0, n_samples)
        np.testing.assert_allclose(Xy.get_data().toarray()[1:, 0], r)

+    def test_error(self):
+        from sklearn.model_selection import train_test_split
+
+        rng = np.random.default_rng(1994)
+        X, y = make_categorical(
+            n_samples=128, n_features=2, n_categories=3, onehot=False
+        )
+        reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
+        w = rng.uniform(0, 1, size=y.shape[0])
+
+        X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
+            X, y, w, random_state=1994
+        )
+
+        with pytest.raises(ValueError, match="sample weight"):
+            reg.fit(
+                X,
+                y,
+                sample_weight=w_train,
+                eval_set=[(X_test, y_test)],
+                sample_weight_eval_set=[w_test],
+            )
+
+        with pytest.raises(ValueError, match="sample weight"):
+            reg.fit(
+                X_train,
+                y_train,
+                sample_weight=w,
+                eval_set=[(X_test, y_test)],
+                sample_weight_eval_set=[w_test],
+            )
+
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])
    def test_with_iterator(self, sparsity: float) -> None:
        n_samples_per_batch = 317