Re-implement ROC-AUC. (#6747)

* Re-implement ROC-AUC. * Binary * MultiClass * LTR * Add documentation. This PR resolves a few issues: - Define a value when the dataset is invalid, which can happen when the dataset is empty or when it contains only positive or only negative labels. - Define ROC-AUC for multi-class classification. - Define a weighted average value for the distributed setting. - Provide a correct implementation for the learning-to-rank task. The previous implementation was just binary classification averaged across groups, which doesn't measure ordered learning to rank.
2021-03-20 16:52:40 +08:00
parent 4ee8340e79
commit bcc0277338
27 changed files with 1622 additions and 461 deletions
--- a/tests/cpp/common/test_common.cc
+++ b/tests/cpp/common/test_common.cc
@@ -1,11 +1,12 @@
 #include <gtest/gtest.h>
+#include <xgboost/span.h>
 #include "../../../src/common/common.h"

 namespace xgboost {
 namespace common {
 TEST(ArgSort, Basic) {
  std::vector<float> inputs {3.0, 2.0, 1.0};
-  auto ret = ArgSort<bst_feature_t>(inputs);
+  auto ret = ArgSort<bst_feature_t>(Span<float>{inputs});
  std::vector<bst_feature_t> sol{2, 1, 0};
  ASSERT_EQ(ret, sol);
 }
--- a/tests/cpp/common/test_ranking_utils.cu
+++ b/tests/cpp/common/test_ranking_utils.cu
@@ -0,0 +1,66 @@
+#include <gtest/gtest.h>
+#include "../../../src/common/ranking_utils.cuh"
+#include "../../../src/common/device_helpers.cuh"
+
+namespace xgboost {
+namespace common {
+
+TEST(SegmentedTrapezoidThreads, Basic) {  // Thread counts for 3 equal segments at h = 1, 2, 7.
+  size_t constexpr kElements = 24, kGroups = 3;
+  dh::device_vector<size_t> offset_ptr(kGroups + 1, 0);
+  offset_ptr[0] = 0;  // Segment boundaries: three groups of 8 elements each.
+  offset_ptr[1] = 8;
+  offset_ptr[2] = 16;
+  offset_ptr[kGroups] = kElements;
+
+  size_t h = 1;
+  dh::device_vector<size_t> thread_ptr(kGroups + 1, 0);
+  size_t total = SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  ASSERT_EQ(total, kElements - kGroups);  // h = 1: returned total is 24 - 3 = 21.
+
+  h = 2;
+  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  std::vector<size_t> h_thread_ptr(thread_ptr.size());
+  thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());  // into std::vector
+  for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
+    ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 13);  // consecutive diff; 13 == 7 + 6
+  }
+
+  h = 7;
+  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
+  for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
+    ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 28);  // 28 == 7 + 6 + 5 + 4 + 3 + 2 + 1
+  }
+}
+
+TEST(SegmentedTrapezoidThreads, Unravel) {  // Linear index -> (i, j) pair for kN = 8.
+  size_t i = 0, j = 0;
+  size_t constexpr kN = 8;
+
+  UnravelTrapeziodIdx(6, kN, &i, &j);  // Expected pairs match a row-major walk over all i < j.
+  ASSERT_EQ(i, 0);
+  ASSERT_EQ(j, 7);
+
+  UnravelTrapeziodIdx(12, kN, &i, &j);
+  ASSERT_EQ(i, 1);
+  ASSERT_EQ(j, 7);
+
+  UnravelTrapeziodIdx(15, kN, &i, &j);
+  ASSERT_EQ(i, 2);
+  ASSERT_EQ(j, 5);
+
+  UnravelTrapeziodIdx(21, kN, &i, &j);
+  ASSERT_EQ(i, 3);
+  ASSERT_EQ(j, 7);
+
+  UnravelTrapeziodIdx(25, kN, &i, &j);
+  ASSERT_EQ(i, 5);
+  ASSERT_EQ(j, 6);
+
+  UnravelTrapeziodIdx(27, kN, &i, &j);  // Largest index checked here: the pair (6, 7).
+  ASSERT_EQ(i, 6);
+  ASSERT_EQ(j, 7);
+}
+}  // namespace common
+}  // namespace xgboost
--- a/tests/cpp/metric/test_auc.cc
+++ b/tests/cpp/metric/test_auc.cc
@@ -0,0 +1,133 @@
+#include <xgboost/metric.h>
+#include "../helpers.h"
+
+namespace xgboost {
+namespace metric {
+
+TEST(Metric, DeclareUnifiedTest(BinaryAUC)) {  // "auc" metric without group information.
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr {Metric::Create("auc", &tparam)};
+  Metric * metric = uni_ptr.get();
+  ASSERT_STREQ(metric->Name(), "auc");
+
+  // Binary. NOTE(review): args appear to be (metric, predictions, labels[, weights]) - confirm.
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {1, 0}), 0.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {0, 1}), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {0, 1}), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {0, 0}, {1, 0}), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 1}, {1, 0}), 0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric, {1, 0, 0}, {0, 0, 1}), 0.25f, 1e-10);
+
+  // Invalid dataset: the metric is expected to return NaN.
+  MetaInfo info;
+  info.labels_ = {0, 0};
+  float auc = metric->Eval({1, 1}, info, false);
+  ASSERT_TRUE(std::isnan(auc));  // labels hold a single value only
+  info.labels_ = HostDeviceVector<float>{};
+  auc = metric->Eval(HostDeviceVector<float>{}, info, false);
+  ASSERT_TRUE(std::isnan(auc));  // empty predictions and empty labels
+
+  EXPECT_NEAR(GetMetricEval(metric, {0, 1, 0, 1}, {0, 1, 0, 1}), 1.0f, 1e-10);
+
+  // AUC with instance weights
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {0.9f, 0.1f, 0.4f, 0.3f},
+                            {0,    0,    1,    1},
+                            {1.0f, 3.0f, 2.0f, 4.0f}),
+              0.75f, 0.001f);
+
+  // regression test case: only four distinct values in the first list, so it has many ties
+  ASSERT_NEAR(GetMetricEval(
+                  metric,
+                  {0.79523796, 0.5201713,  0.79523796, 0.24273258, 0.53452194,
+                   0.53452194, 0.24273258, 0.5201713,  0.79523796, 0.53452194,
+                   0.24273258, 0.53452194, 0.79523796, 0.5201713,  0.24273258,
+                   0.5201713,  0.5201713,  0.53452194, 0.5201713,  0.53452194},
+                  {0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0}),
+              0.5, 1e-10);
+}
+
+TEST(Metric, DeclareUnifiedTest(MultiAUC)) {  // "auc" metric with three label classes.
+  auto tparam = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> uni_ptr{
+      Metric::Create("auc", &tparam)};
+  auto metric = uni_ptr.get();
+
+  // MultiClass
+  // 3x3. NOTE(review): presumably one row of three class scores per sample - confirm.
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f, // p_0
+                                0.0f, 1.0f, 0.0f, // p_1
+                                0.0f, 0.0f, 1.0f  // p_2
+                            },
+                            {0, 1, 2}),
+              1.0f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f, // p_0
+                                0.0f, 1.0f, 0.0f, // p_1
+                                0.0f, 0.0f, 1.0f  // p_2
+                            },
+                            {2, 1, 0}),
+              0.5f, 1e-10);
+  EXPECT_NEAR(GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f, // p_0
+                                0.0f, 1.0f, 0.0f, // p_1
+                                0.0f, 0.0f, 1.0f  // p_2
+                            },
+                            {2, 0, 1}),
+              0.25f, 1e-10);
+
+  // invalid dataset: a class without any sample must yield NaN
+  float auc = GetMetricEval(metric,
+                            {
+                                1.0f, 0.0f, 0.0f, // p_0
+                                0.0f, 1.0f, 0.0f, // p_1
+                                0.0f, 0.0f, 1.0f  // p_2
+                            },
+                            {0, 1, 1});  // no class 2.
+  EXPECT_TRUE(std::isnan(auc)) << auc;  // streams the actual value on failure
+}
+
+TEST(Metric, DeclareUnifiedTest(RankingAUC)) {  // "auc" metric with group boundaries supplied.
+  auto tparam = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<Metric> metric{Metric::Create("auc", &tparam)};
+
+  // single group: the last argument {0, 4} spans all four entries
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0.7f, 0.2f, 0.3f, 0.6f},
+                            {1.0f, 0.8f, 0.4f, 0.2f}, /*weights=*/{},
+                            {0, 4}),
+              0.5f, 1e-10);
+
+  // multi group: {0, 3, 6} splits the six entries into two groups of three
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},
+                            {0, 1, 2, 0, 1, 2}, /*weights=*/{}, {0, 3, 6}),
+              1.0f, 1e-10);
+
+  EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1, 2, 0, 1, 2},  // same data, two weights given
+                            {0, 1, 2, 0, 1, 2}, /*weights=*/{1.0f, 2.0f},
+                            {0, 3, 6}),
+              1.0f, 1e-10);
+
+  // AUC metric for grouped datasets - exception scenarios (NaN is expected, not a throw)
+  ASSERT_TRUE(std::isnan(
+      GetMetricEval(metric.get(), {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3})));
+
+  // regression case: 16 entries, groups {0, 7, 16}
+  HostDeviceVector<float> predt{0.33935383, 0.5149714,  0.32138085, 1.4547751,
+                                1.2010975,  0.42651367, 0.23104341, 0.83610827,
+                                0.8494239,  0.07136688, 0.5623144,  0.8086237,
+                                1.5066161,  -4.094787,  0.76887935, -2.4082742};
+  std::vector<bst_group_t> groups{0, 7, 16};
+  std::vector<float> labels{1., 0., 0., 1., 2., 1., 0., 0.,
+                            0., 0., 0., 0., 1., 0., 1., 0.};
+
+  EXPECT_NEAR(GetMetricEval(metric.get(), std::move(predt), labels,
+                            /*weights=*/{}, groups),
+              0.769841f, 1e-6);
+}
+}  // namespace metric
+}  // namespace xgboost
--- a/tests/cpp/metric/test_auc.cu
+++ b/tests/cpp/metric/test_auc.cu
@@ -0,0 +1,5 @@
+/*!
+ * Copyright 2021 XGBoost contributors
+ */
+// Dummy file to keep the CUDA conditional compile trick.
+#include "test_auc.cc"
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -24,49 +24,6 @@ TEST(Metric, AMS) {
 }
 #endif

-TEST(Metric, DeclareUnifiedTest(AUC)) {
-  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
-  xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam);
-  ASSERT_STREQ(metric->Name(), "auc");
-  EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.1f, 0.9f, 0.1f, 0.9f},
-                            {  0,   0,   1,   1}),
-              0.5f, 0.001f);
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 0}, {0, 0}));
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {1, 1}));
-
-  // AUC with instance weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.9f, 0.1f, 0.4f, 0.3f},
-                            {0,    0,    1,    1},
-                            {1.0f, 3.0f, 2.0f, 4.0f}),
-              0.75f, 0.001f);
-
-  // AUC for a ranking task without weights
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.9f, 0.1f, 0.4f, 0.3f, 0.7f},
-                            {0,    1,    0,    1,    1},
-                            {},
-                            {0, 2, 5}),
-              0.25f, 0.001f);
-
-  // AUC for a ranking task with weights/group
-  EXPECT_NEAR(GetMetricEval(metric,
-                            {0.9f, 0.1f, 0.4f, 0.3f, 0.7f},
-                            {1,    0,    1,    0,    0},
-                            {1, 2},
-                            {0, 2, 5}),
-              0.75f, 0.001f);
-
-  // AUC metric for grouped datasets - exception scenarios
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1, 2}, {0, 0, 0}, {}, {0, 2, 3}));
-  EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1, 2}, {1, 1, 1}, {}, {0, 2, 3}));
-
-  delete metric;
-}
-
 TEST(Metric, DeclareUnifiedTest(AUCPR)) {
  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
  xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam);
--- a/tests/python-gpu/conftest.py
+++ b/tests/python-gpu/conftest.py
@@ -42,6 +42,7 @@ def local_cuda_cluster(request, pytestconfig):
 def pytest_addoption(parser):
    parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool')

+
 def pytest_collection_modifyitems(config, items):
    if config.getoption('--use-rmm-pool'):
        blocklist = [
@@ -53,3 +54,9 @@ def pytest_collection_modifyitems(config, items):
        for item in items:
            if any(item.nodeid.startswith(x) for x in blocklist):
                item.add_marker(skip_mark)
+
+    # mark dask tests as `mgpu`.
+    mgpu_mark = pytest.mark.mgpu
+    for item in items:
+        if item.nodeid.startswith("python-gpu/test_gpu_with_dask.py"):
+            item.add_marker(mgpu_mark)
--- a/tests/python-gpu/test_gpu_eval_metrics.py
+++ b/tests/python-gpu/test_gpu_eval_metrics.py
@@ -0,0 +1,47 @@
+import sys
+import xgboost
+import pytest
+
+sys.path.append("tests/python")
+import test_eval_metrics as test_em  # noqa
+
+
+class TestGPUEvalMetrics:
+    cpu_test = test_em.TestEvalMetrics()
+
+    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    def test_roc_auc_binary(self, n_samples):
+        self.cpu_test.run_roc_auc_binary("gpu_hist", n_samples)
+
+    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    def test_roc_auc_multi(self, n_samples):
+        self.cpu_test.run_roc_auc_multi("gpu_hist", n_samples)
+
+    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    def test_roc_auc_ltr(self, n_samples):
+        import numpy as np
+
+        rng = np.random.RandomState(1994)
+        n_samples = n_samples
+        n_features = 10
+        X = rng.randn(n_samples, n_features)
+        y = rng.randint(0, 16, size=n_samples)
+        group = np.array([n_samples // 2, n_samples // 2])
+
+        Xy = xgboost.DMatrix(X, y, group=group)
+
+        cpu = xgboost.train(
+            {"tree_method": "hist", "eval_metric": "auc", "objective": "rank:ndcg"},
+            Xy,
+            num_boost_round=10,
+        )
+        cpu_auc = float(cpu.eval(Xy).split(":")[1])
+
+        gpu = xgboost.train(
+            {"tree_method": "gpu_hist", "eval_metric": "auc", "objective": "rank:ndcg"},
+            Xy,
+            num_boost_round=10,
+        )
+        gpu_auc = float(gpu.eval(Xy).split(":")[1])
+
+        np.testing.assert_allclose(cpu_auc, gpu_auc)
--- a/tests/python-gpu/test_gpu_ranking.py
+++ b/tests/python-gpu/test_gpu_ranking.py
@@ -5,6 +5,10 @@ import itertools
 import shutil
 import urllib.request
 import zipfile
+import sys
+sys.path.append("tests/python")
+
+import testing as tm            # noqa


 class TestRanking:
@@ -15,9 +19,9 @@ class TestRanking:
        """
        from sklearn.datasets import load_svmlight_files
        # download the test data
-        cls.dpath = 'demo/rank/'
+        cls.dpath = os.path.join(tm.PROJECT_ROOT, "demo/rank/")
        src = 'https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip'
-        target = cls.dpath + '/MQ2008.zip'
+        target = os.path.join(cls.dpath, "MQ2008.zip")

        if os.path.exists(cls.dpath) and os.path.exists(target):
            print("Skipping dataset download...")
@@ -79,8 +83,8 @@ class TestRanking:
        Cleanup test artifacts from download and unpacking
        :return:
        """
-        os.remove(cls.dpath + "MQ2008.zip")
-        shutil.rmtree(cls.dpath + "MQ2008")
+        os.remove(os.path.join(cls.dpath, "MQ2008.zip"))
+        shutil.rmtree(os.path.join(cls.dpath, "MQ2008"))

    @classmethod
    def __test_training_with_rank_objective(cls, rank_objective, metric_name, tolerance=1e-02):
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -17,6 +17,8 @@ if sys.platform.startswith("win"):

 sys.path.append("tests/python")
 from test_with_dask import run_empty_dmatrix_reg      # noqa
+from test_with_dask import run_empty_dmatrix_auc      # noqa
+from test_with_dask import run_auc                    # noqa
 from test_with_dask import run_boost_from_prediction  # noqa
 from test_with_dask import run_dask_classifier        # noqa
 from test_with_dask import run_empty_dmatrix_cls      # noqa
@@ -286,6 +288,15 @@ class TestDistributedGPU:
            run_empty_dmatrix_reg(client, parameters)
            run_empty_dmatrix_cls(client, parameters)

+    def test_empty_dmatrix_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
+        with Client(local_cuda_cluster) as client:
+            n_workers = len(_get_client_workers(client))  # passed on to size the validation set
+            run_empty_dmatrix_auc(client, "gpu_hist", n_workers)
+
+    def test_auc(self, local_cuda_cluster: LocalCUDACluster) -> None:
+        with Client(local_cuda_cluster) as client:
+            run_auc(client, "gpu_hist")  # shared helper imported from test_with_dask
+
    def test_data_initialization(self, local_cuda_cluster: LocalCUDACluster) -> None:
        with Client(local_cuda_cluster) as client:
            X, y, _ = generate_array()
--- a/tests/python/test_eval_metrics.py
+++ b/tests/python/test_eval_metrics.py
@@ -123,3 +123,90 @@ class TestEvalMetrics:
        gamma_dev = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1].split(":")[0])
        skl_gamma_dev = mean_gamma_deviance(y, score)
        np.testing.assert_allclose(gamma_dev, skl_gamma_dev, rtol=1e-6)
+
+    def run_roc_auc_binary(self, tree_method, n_samples):
+        import numpy as np
+        from sklearn.datasets import make_classification
+        from sklearn.metrics import roc_auc_score
+
+        rng = np.random.RandomState(1994)
+        n_samples = n_samples
+        n_features = 10
+
+        X, y = make_classification(
+            n_samples,
+            n_features,
+            n_informative=n_features,
+            n_redundant=0,
+            random_state=rng
+        )
+        Xy = xgb.DMatrix(X, y)
+        booster = xgb.train(
+            {
+                "tree_method": tree_method,
+                "eval_metric": "auc",
+                "objective": "binary:logistic",
+            },
+            Xy,
+            num_boost_round=8,
+        )
+        score = booster.predict(Xy)
+        skl_auc = roc_auc_score(y, score)
+        auc = float(booster.eval(Xy).split(":")[1])
+        np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
+
+        X = rng.randn(*X.shape)
+        score = booster.predict(xgb.DMatrix(X))
+        skl_auc = roc_auc_score(y, score)
+        auc = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1])
+        np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    def test_roc_auc(self, n_samples):  # binary ROC-AUC check with the "hist" tree method
+        self.run_roc_auc_binary("hist", n_samples)
+
+    def run_roc_auc_multi(self, tree_method, n_samples):
+        import numpy as np
+        from sklearn.datasets import make_classification
+        from sklearn.metrics import roc_auc_score
+
+        rng = np.random.RandomState(1994)
+        n_samples = n_samples
+        n_features = 10
+        n_classes = 4
+
+        X, y = make_classification(
+            n_samples,
+            n_features,
+            n_informative=n_features,
+            n_redundant=0,
+            n_classes=n_classes,
+            random_state=rng
+        )
+
+        Xy = xgb.DMatrix(X, y)
+        booster = xgb.train(
+            {
+                "tree_method": tree_method,
+                "eval_metric": "auc",
+                "objective": "multi:softprob",
+                "num_class": n_classes,
+            },
+            Xy,
+            num_boost_round=8,
+        )
+        score = booster.predict(Xy)
+        skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
+        auc = float(booster.eval(Xy).split(":")[1])
+        np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
+
+        X = rng.randn(*X.shape)
+        score = booster.predict(xgb.DMatrix(X))
+        skl_auc = roc_auc_score(y, score, average="weighted", multi_class="ovr")
+        auc = float(booster.eval(xgb.DMatrix(X, y)).split(":")[1])
+        np.testing.assert_allclose(skl_auc, auc, rtol=1e-6)
+
+    @pytest.mark.parametrize("n_samples", [4, 100, 1000])
+    def test_roc_auc_multi(self, n_samples):
+        self.run_roc_auc_multi("hist", n_samples)
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -9,6 +9,7 @@ import scipy
 import json
 from typing import List, Tuple, Dict, Optional, Type, Any
 import asyncio
+from functools import partial
 from concurrent.futures import ThreadPoolExecutor
 import tempfile
 from sklearn.datasets import make_classification
@@ -528,9 +529,106 @@ def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None:
    _check_outputs(out, predictions)


+def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> None:
+    from sklearn import datasets  # AUC eval with fewer validation rows than workers.
+    n_samples = 100
+    n_features = 97
+    rng = np.random.RandomState(1994)
+
+    make_classification = partial(
+        datasets.make_classification,
+        n_features=n_features,
+        random_state=rng
+    )
+
+    # binary
+    X_, y_ = make_classification(n_samples=n_samples, random_state=rng)
+    X = dd.from_array(X_, chunksize=10)
+    y = dd.from_array(y_, chunksize=10)
+
+    n_samples = n_workers - 1  # presumably leaves at least one worker without validation rows
+    valid_X_, valid_y_ = make_classification(n_samples=n_samples, random_state=rng)
+    valid_X = dd.from_array(valid_X_, chunksize=n_samples)
+    valid_y = dd.from_array(valid_y_, chunksize=n_samples)
+
+    cls = xgb.dask.DaskXGBClassifier(
+        tree_method=tree_method, n_estimators=2, use_label_encoder=False
+    )
+    cls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])  # no assertion: must not raise
+
+    # multiclass
+    X_, y_ = make_classification(  # NOTE(review): n_samples is n_workers - 1 here, not 100
+        n_samples=n_samples,       # confirm that such a small training set is intended
+        n_classes=10,
+        n_informative=n_features,
+        n_redundant=0,
+        n_repeated=0
+    )
+    X = dd.from_array(X_, chunksize=10)
+    y = dd.from_array(y_, chunksize=10)
+
+    n_samples = n_workers - 1
+    valid_X_, valid_y_ = make_classification(
+        n_samples=n_samples,
+        n_classes=10,
+        n_informative=n_features,
+        n_redundant=0,
+        n_repeated=0
+    )
+    valid_X = dd.from_array(valid_X_, chunksize=n_samples)
+    valid_y = dd.from_array(valid_y_, chunksize=n_samples)
+
+    cls = xgb.dask.DaskXGBClassifier(
+        tree_method=tree_method, n_estimators=2, use_label_encoder=False
+    )
+    cls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])  # no assertion: must not raise
+
+
+def test_empty_dmatrix_auc() -> None:  # CPU ("hist") run on a two-worker local cluster
+    with LocalCluster(n_workers=2) as cluster:
+        with Client(cluster) as client:
+            run_empty_dmatrix_auc(client, "hist", 2)  # 2 matches n_workers of the cluster above
+
+
+def run_auc(client: "Client", tree_method: str) -> None:  # dask vs. single-node "auc" history
+    from sklearn import datasets
+    n_samples = 100
+    n_features = 97
+    rng = np.random.RandomState(1994)
+    X_, y_ = datasets.make_classification(
+        n_samples=n_samples, n_features=n_features, random_state=rng
+    )
+    X = dd.from_array(X_, chunksize=10)  # dask copies of the same training arrays
+    y = dd.from_array(y_, chunksize=10)
+
+    valid_X_, valid_y_ = datasets.make_classification(
+        n_samples=n_samples, n_features=n_features, random_state=rng
+    )
+    valid_X = dd.from_array(valid_X_, chunksize=10)
+    valid_y = dd.from_array(valid_y_, chunksize=10)
+
+    cls = xgb.XGBClassifier(  # single-node reference model fitted on the in-memory arrays
+        tree_method=tree_method, n_estimators=2, use_label_encoder=False
+    )
+    cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)])
+
+    dcls = xgb.dask.DaskXGBClassifier(  # distributed model with the same parameters
+        tree_method=tree_method, n_estimators=2, use_label_encoder=False
+    )
+    dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)])
+
+    approx = dcls.evals_result()["validation_0"]["auc"]
+    exact = cls.evals_result()["validation_0"]["auc"]
+    for i in range(2):  # both models use n_estimators=2
+        # Approximate check: the dask AUC only has to be within 0.06 of the single-node AUC.
+        assert np.abs(approx[i] - exact[i]) <= 0.06
+
+
+def test_auc(client: "Client") -> None:  # NOTE(review): `client` is presumably a fixture - confirm
+    run_auc(client, "hist")
+
 # No test for Exact, as empty DMatrix handling are mostly for distributed
 # environment and Exact doesn't support it.
-
 def test_empty_dmatrix_hist() -> None:
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client: