Extract dask and spark tests into distributed tests. (#8395)

- Move test files.
- Run spark and dask separately to prevent conflicts.
- Gather common code into the testing module.
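
A minimal sketch of driving the two relocated suites in separate interpreter processes so the dask and spark tests cannot interfere with each other; the suite paths follow the directory names referenced elsewhere in this diff and are otherwise an assumption, not part of the commit itself.

import subprocess
import sys

# Assumed suite locations under the new tests/test_distributed layout.
SUITES = (
    "tests/test_distributed/test_with_dask",
    "tests/test_distributed/test_with_spark",
)

for suite in SUITES:
    # One interpreter per suite keeps dask and spark state fully isolated.
    subprocess.run([sys.executable, "-m", "pytest", suite], check=True)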
Author: Jiaming Yuan
Date: 2022-10-28 16:24:32 +08:00
Committed by: GitHub
Parent: f73520bfff
Commit: cfd2a9f872
34 changed files with 405 additions and 337 deletions


@@ -0,0 +1,17 @@
#!/bin/bash
set -e
rm -f ./*.model* ./agaricus* ./*.pem
world_size=$(nvidia-smi -L | wc -l)
# Generate server and client certificates.
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out server-cert.pem -subj "/C=US/CN=localhost"
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
# Split train and test files manually to simulate a federated environment.
split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train-
split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test-
python test_federated.py "${world_size}"
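
For reference, a small illustration (not part of the script) of the shard names that `split -d` produces above and that test_federated.py later opens by rank, assuming a world size of 2:

# Illustration only: with world_size=2, the split commands yield
# agaricus.txt.train-00 / agaricus.txt.train-01 (and the same pattern for the test file);
# each federated worker then loads the shard matching its rank via the "%02d" suffix.
world_size = 2
for rank in range(world_size):
    print("agaricus.txt.train-%02d" % rank)  # agaricus.txt.train-00, agaricus.txt.train-01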


@@ -0,0 +1,87 @@
#!/usr/bin/python
import multiprocessing
import sys
import time
import xgboost.federated
import xgboost as xgb
SERVER_KEY = 'server-key.pem'
SERVER_CERT = 'server-cert.pem'
CLIENT_KEY = 'client-key.pem'
CLIENT_CERT = 'client-cert.pem'
def run_server(port: int, world_size: int, with_ssl: bool) -> None:
if with_ssl:
xgboost.federated.run_federated_server(port, world_size, SERVER_KEY, SERVER_CERT,
CLIENT_CERT)
else:
xgboost.federated.run_federated_server(port, world_size)
def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu: bool) -> None:
communicator_env = {
'xgboost_communicator': 'federated',
'federated_server_address': f'localhost:{port}',
'federated_world_size': world_size,
'federated_rank': rank
}
if with_ssl:
communicator_env['federated_server_cert'] = SERVER_CERT
communicator_env['federated_client_key'] = CLIENT_KEY
communicator_env['federated_client_cert'] = CLIENT_CERT
# Always call this before using the distributed module.
with xgb.collective.CommunicatorContext(**communicator_env):
# Load the per-rank file; data is not sharded automatically in federated mode.
dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank)
dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank)
# Specify parameters via map; definitions are the same as in the C++ version.
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
if with_gpu:
param['tree_method'] = 'gpu_hist'
param['gpu_id'] = rank
# Specify validation sets to watch performance.
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 20
# Run training; all the features in the training API are available.
bst = xgb.train(param, dtrain, num_round, evals=watchlist,
early_stopping_rounds=2)
# Save the model; only rank 0 writes it out.
if xgb.collective.get_rank() == 0:
bst.save_model("test.model.json")
xgb.collective.communicator_print("Finished training\n")
def run_federated(with_ssl: bool = True, with_gpu: bool = False) -> None:
port = 9091
world_size = int(sys.argv[1])
server = multiprocessing.Process(target=run_server, args=(port, world_size, with_ssl))
server.start()
time.sleep(1)
if not server.is_alive():
raise Exception("Error starting Federated Learning server")
workers = []
for rank in range(world_size):
worker = multiprocessing.Process(target=run_worker,
args=(port, world_size, rank, with_ssl, with_gpu))
workers.append(worker)
worker.start()
for worker in workers:
worker.join()
server.terminate()
if __name__ == '__main__':
run_federated(with_ssl=True, with_gpu=False)
run_federated(with_ssl=False, with_gpu=False)
run_federated(with_ssl=True, with_gpu=True)
run_federated(with_ssl=False, with_gpu=True)


@@ -0,0 +1 @@


@@ -0,0 +1,42 @@
from typing import Generator, Sequence
import pytest
from xgboost import testing as tm
@pytest.fixture(scope="session", autouse=True)
def setup_rmm_pool(request, pytestconfig: pytest.Config) -> None:
tm.setup_rmm_pool(request, pytestconfig)
@pytest.fixture(scope="class")
def local_cuda_client(request, pytestconfig: pytest.Config) -> Generator:
kwargs = {}
if hasattr(request, "param"):
kwargs.update(request.param)
if pytestconfig.getoption("--use-rmm-pool"):
if tm.no_rmm()["condition"]:
raise ImportError("The --use-rmm-pool option requires the RMM package")
import rmm
kwargs["rmm_pool_size"] = "2GB"
if tm.no_dask_cuda()["condition"]:
raise ImportError("The local_cuda_cluster fixture requires dask_cuda package")
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
yield Client(LocalCUDACluster(**kwargs))
def pytest_addoption(parser: pytest.Parser) -> None:
parser.addoption(
"--use-rmm-pool", action="store_true", default=False, help="Use RMM pool"
)
def pytest_collection_modifyitems(config: pytest.Config, items: Sequence) -> None:
# mark dask tests as `mgpu`.
mgpu_mark = pytest.mark.mgpu
for item in items:
item.add_marker(mgpu_mark)
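
A minimal sketch (hypothetical test, not part of this commit) of how a test in this directory can consume the `local_cuda_client` fixture defined above:

from dask import array as da

import xgboost as xgb


class TestFixtureUsageSketch:
    def test_train_on_local_cuda_cluster(self, local_cuda_client) -> None:
        # Train a tiny model on the CUDA cluster client provided by the fixture.
        X = da.random.random((1000, 10), chunks=(100, 10))
        y = da.random.random(1000, chunks=(100,))
        dtrain = xgb.dask.DaskDMatrix(local_cuda_client, X, y)
        out = xgb.dask.train(
            local_cuda_client,
            {"tree_method": "gpu_hist"},
            dtrain,
            num_boost_round=2,
        )
        assert isinstance(out["booster"], xgb.Booster)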


@@ -0,0 +1,563 @@
"""Copyright 2019-2022 XGBoost contributors"""
import asyncio
import os
import subprocess
from collections import OrderedDict
from inspect import signature
from typing import Any, Dict, Type, TypeVar, Union
import numpy as np
import pytest
from hypothesis import given, note, settings, strategies
from hypothesis._settings import duration
from xgboost.testing.params import hist_parameter_strategy
import xgboost as xgb
from xgboost import testing as tm
pytestmark = [
pytest.mark.skipif(**tm.no_dask()),
pytest.mark.skipif(**tm.no_dask_cuda()),
]
from ..test_with_dask.test_with_dask import generate_array
from ..test_with_dask.test_with_dask import kCols as random_cols
from ..test_with_dask.test_with_dask import (
make_categorical,
run_auc,
run_boost_from_prediction,
run_boost_from_prediction_multi_class,
run_categorical,
run_dask_classifier,
run_empty_dmatrix_auc,
run_empty_dmatrix_cls,
run_empty_dmatrix_reg,
run_tree_stats,
suppress,
)
try:
import cudf
import dask.dataframe as dd
from dask import array as da
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from xgboost import dask as dxgb
except ImportError:
pass
def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None:
import cupy as cp
cp.cuda.runtime.setDevice(0)
_X, _y, _ = generate_array()
X = dd.from_dask_array(_X)
y = dd.from_dask_array(_y)
X = X.map_partitions(cudf.from_pandas)
y = y.map_partitions(cudf.from_pandas)
dtrain = DMatrixT(client, X, y)
out = dxgb.train(
client,
{"tree_method": "gpu_hist", "debug_synchronize": True},
dtrain=dtrain,
evals=[(dtrain, "X")],
num_boost_round=4,
)
assert isinstance(out["booster"], dxgb.Booster)
assert len(out["history"]["X"]["rmse"]) == 4
predictions = dxgb.predict(client, out, dtrain)
assert isinstance(predictions.compute(), np.ndarray)
series_predictions = dxgb.inplace_predict(client, out, X)
assert isinstance(series_predictions, dd.Series)
single_node = out["booster"].predict(xgb.DMatrix(X.compute()))
cp.testing.assert_allclose(single_node, predictions.compute())
np.testing.assert_allclose(single_node, series_predictions.compute().to_numpy())
predt = dxgb.predict(client, out, X)
assert isinstance(predt, dd.Series)
T = TypeVar("T")
def is_df(part: T) -> T:
assert isinstance(part, cudf.DataFrame), part
return part
predt.map_partitions(is_df, meta=dd.utils.make_meta({"prediction": "f4"}))
cp.testing.assert_allclose(predt.values.compute(), single_node)
# Make sure the output can be integrated back into the original dataframe.
X["predict"] = predictions
X["inplace_predict"] = series_predictions
has_null = X.isnull().values.any().compute()
assert bool(has_null) is False
def run_with_dask_array(DMatrixT: Type, client: Client) -> None:
import cupy as cp
cp.cuda.runtime.setDevice(0)
X, y, _ = generate_array()
X = X.map_blocks(cp.asarray)
y = y.map_blocks(cp.asarray)
dtrain = DMatrixT(client, X, y)
out = dxgb.train(
client,
{"tree_method": "gpu_hist", "debug_synchronize": True},
dtrain=dtrain,
evals=[(dtrain, "X")],
num_boost_round=2,
)
from_dmatrix = dxgb.predict(client, out, dtrain).compute()
inplace_predictions = dxgb.inplace_predict(client, out, X).compute()
single_node = out["booster"].predict(xgb.DMatrix(X.compute()))
np.testing.assert_allclose(single_node, from_dmatrix)
device = cp.cuda.runtime.getDevice()
assert device == inplace_predictions.device.id
single_node = cp.array(single_node)
assert device == single_node.device.id
cp.testing.assert_allclose(single_node, inplace_predictions)
def to_cp(x: Any, DMatrixT: Type) -> Any:
import cupy
if isinstance(x, np.ndarray) and DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
X = cupy.array(x)
else:
X = x
return X
def run_gpu_hist(
params: Dict,
num_rounds: int,
dataset: tm.TestDataset,
DMatrixT: Type,
client: Client,
) -> None:
params["tree_method"] = "gpu_hist"
params = dataset.set_params(params)
# It doesn't make sense to distribute a completely
# empty dataset.
if dataset.X.shape[0] == 0:
return
chunk = 128
X = to_cp(dataset.X, DMatrixT)
X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
y = to_cp(dataset.y, DMatrixT)
y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk, dataset.y.shape[1])
y = da.from_array(y, chunks=y_chunk)
if dataset.w is not None:
w = to_cp(dataset.w, DMatrixT)
w = da.from_array(w, chunks=(chunk,))
else:
w = None
if DMatrixT is dxgb.DaskDeviceQuantileDMatrix:
m = DMatrixT(
client, data=X, label=y, weight=w, max_bin=params.get("max_bin", 256)
)
else:
m = DMatrixT(client, data=X, label=y, weight=w)
history = dxgb.train(
client,
params=params,
dtrain=m,
num_boost_round=num_rounds,
evals=[(m, "train")],
)["history"]["train"][dataset.metric]
note(history)
# See note on `ObjFunction::UpdateTreeLeaf`.
update_leaf = dataset.name.endswith("-l1")
if update_leaf:
assert history[0] + 1e-2 >= history[-1]
return
else:
assert tm.non_increasing(history)
def test_tree_stats() -> None:
with LocalCUDACluster(n_workers=1) as cluster:
with Client(cluster) as client:
local = run_tree_stats(client, "gpu_hist")
with LocalCUDACluster(n_workers=2) as cluster:
with Client(cluster) as client:
distributed = run_tree_stats(client, "gpu_hist")
assert local == distributed
class TestDistributedGPU:
@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction(self, local_cuda_client: Client) -> None:
import cudf
from sklearn.datasets import load_breast_cancer, load_iris
X_, y_ = load_breast_cancer(return_X_y=True)
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
run_boost_from_prediction(X, y, "gpu_hist", local_cuda_client)
X_, y_ = load_iris(return_X_y=True)
X = dd.from_array(X_, chunksize=50).map_partitions(cudf.from_pandas)
y = dd.from_array(y_, chunksize=50).map_partitions(cudf.from_pandas)
run_boost_from_prediction_multi_class(X, y, "gpu_hist", local_cuda_client)
@pytest.mark.skipif(**tm.no_dask_cudf())
def test_dask_dataframe(self, local_cuda_client: Client) -> None:
run_with_dask_dataframe(dxgb.DaskDMatrix, local_cuda_client)
run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, local_cuda_client)
@pytest.mark.skipif(**tm.no_dask_cudf())
def test_categorical(self, local_cuda_client: Client) -> None:
import dask_cudf
X, y = make_categorical(local_cuda_client, 10000, 30, 13)
X = dask_cudf.from_dask_dataframe(X)
X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
run_categorical(local_cuda_client, "gpu_hist", X, X_onehot, y)
@given(
params=hist_parameter_strategy,
num_rounds=strategies.integers(1, 20),
dataset=tm.dataset_strategy,
dmatrix_type=strategies.sampled_from(
[dxgb.DaskDMatrix, dxgb.DaskDeviceQuantileDMatrix]
),
)
@settings(
deadline=duration(seconds=120),
max_examples=20,
suppress_health_check=suppress,
print_blob=True,
)
@pytest.mark.skipif(**tm.no_cupy())
def test_gpu_hist(
self,
params: Dict,
num_rounds: int,
dataset: tm.TestDataset,
dmatrix_type: type,
local_cuda_client: Client,
) -> None:
run_gpu_hist(params, num_rounds, dataset, dmatrix_type, local_cuda_client)
@pytest.mark.skipif(**tm.no_cupy())
def test_dask_array(self, local_cuda_client: Client) -> None:
run_with_dask_array(dxgb.DaskDMatrix, local_cuda_client)
run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, local_cuda_client)
@pytest.mark.skipif(**tm.no_cupy())
def test_early_stopping(self, local_cuda_client: Client) -> None:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
X, y = da.from_array(X), da.from_array(y)
m = dxgb.DaskDMatrix(local_cuda_client, X, y)
valid = dxgb.DaskDMatrix(local_cuda_client, X, y)
early_stopping_rounds = 5
booster = dxgb.train(
local_cuda_client,
{
"objective": "binary:logistic",
"eval_metric": "error",
"tree_method": "gpu_hist",
},
m,
evals=[(valid, "Valid")],
num_boost_round=1000,
early_stopping_rounds=early_stopping_rounds,
)["booster"]
assert hasattr(booster, "best_score")
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
valid_X = X
valid_y = y
cls = dxgb.DaskXGBClassifier(
objective="binary:logistic",
tree_method="gpu_hist",
eval_metric="error",
n_estimators=100,
)
cls.client = local_cuda_client
cls.fit(
X,
y,
early_stopping_rounds=early_stopping_rounds,
eval_set=[(valid_X, valid_y)],
)
booster = cls.get_booster()
dump = booster.get_dump(dump_format="json")
assert len(dump) - booster.best_iteration == early_stopping_rounds + 1
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.parametrize("model", ["boosting"])
def test_dask_classifier(self, model: str, local_cuda_client: Client) -> None:
import dask_cudf
X_, y_, w_ = generate_array(with_weights=True)
y_ = (y_ * 10).astype(np.int32)
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
run_dask_classifier(X, y, w, model, "gpu_hist", local_cuda_client, 10)
def test_empty_dmatrix(self, local_cuda_client: Client) -> None:
parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}
run_empty_dmatrix_reg(local_cuda_client, parameters)
run_empty_dmatrix_cls(local_cuda_client, parameters)
@pytest.mark.skipif(**tm.no_dask_cudf())
def test_empty_partition(self, local_cuda_client: Client) -> None:
import cudf
import cupy
import dask_cudf
mult = 100
df = cudf.DataFrame(
{
"a": [1, 2, 3, 4, 5.1] * mult,
"b": [10, 15, 29.3, 30, 31] * mult,
"y": [10, 20, 30, 40.0, 50] * mult,
}
)
parameters = {"tree_method": "gpu_hist", "debug_synchronize": True}
empty = df.iloc[:0]
ddf = dask_cudf.concat(
[dask_cudf.from_cudf(empty, npartitions=1)]
+ [dask_cudf.from_cudf(df, npartitions=3)]
+ [dask_cudf.from_cudf(df, npartitions=3)]
)
X = ddf[ddf.columns.difference(["y"])]
y = ddf[["y"]]
dtrain = dxgb.DaskDeviceQuantileDMatrix(local_cuda_client, X, y)
bst_empty = xgb.dask.train(
local_cuda_client, parameters, dtrain, evals=[(dtrain, "train")]
)
predt_empty = dxgb.predict(local_cuda_client, bst_empty, X).compute().values
ddf = dask_cudf.concat(
[dask_cudf.from_cudf(df, npartitions=3)]
+ [dask_cudf.from_cudf(df, npartitions=3)]
)
X = ddf[ddf.columns.difference(["y"])]
y = ddf[["y"]]
dtrain = dxgb.DaskDeviceQuantileDMatrix(local_cuda_client, X, y)
bst = xgb.dask.train(
local_cuda_client, parameters, dtrain, evals=[(dtrain, "train")]
)
predt = dxgb.predict(local_cuda_client, bst, X).compute().values
cupy.testing.assert_allclose(predt, predt_empty)
predt = dxgb.predict(local_cuda_client, bst, dtrain).compute()
cupy.testing.assert_allclose(predt, predt_empty)
predt = dxgb.inplace_predict(local_cuda_client, bst, X).compute().values
cupy.testing.assert_allclose(predt, predt_empty)
df = df.to_pandas()
empty = df.iloc[:0]
ddf = dd.concat(
[dd.from_pandas(empty, npartitions=1)]
+ [dd.from_pandas(df, npartitions=3)]
+ [dd.from_pandas(df, npartitions=3)]
)
X = ddf[ddf.columns.difference(["y"])]
y = ddf[["y"]]
predt_empty = cupy.asnumpy(predt_empty)
predt = dxgb.predict(local_cuda_client, bst_empty, X).compute().values
np.testing.assert_allclose(predt, predt_empty)
in_predt = (
dxgb.inplace_predict(local_cuda_client, bst_empty, X).compute().values
)
np.testing.assert_allclose(predt, in_predt)
def test_empty_dmatrix_auc(self, local_cuda_client: Client) -> None:
n_workers = len(tm.get_client_workers(local_cuda_client))
run_empty_dmatrix_auc(local_cuda_client, "gpu_hist", n_workers)
def test_auc(self, local_cuda_client: Client) -> None:
run_auc(local_cuda_client, "gpu_hist")
def test_data_initialization(self, local_cuda_client: Client) -> None:
X, y, _ = generate_array()
fw = da.random.random((random_cols,))
fw = fw - fw.min()
m = dxgb.DaskDMatrix(local_cuda_client, X, y, feature_weights=fw)
workers = tm.get_client_workers(local_cuda_client)
rabit_args = local_cuda_client.sync(
dxgb._get_rabit_args, len(workers), None, local_cuda_client
)
def worker_fn(worker_addr: str, data_ref: Dict) -> None:
with dxgb.CommunicatorContext(**rabit_args):
local_dtrain = dxgb._dmatrix_from_list_of_parts(**data_ref, nthread=7)
fw_rows = local_dtrain.get_float_info("feature_weights").shape[0]
assert fw_rows == local_dtrain.num_col()
futures = []
for i in range(len(workers)):
futures.append(
local_cuda_client.submit(
worker_fn,
workers[i],
m._create_fn_args(workers[i]),
pure=False,
workers=[workers[i]],
)
)
local_cuda_client.gather(futures)
def test_interface_consistency(self) -> None:
sig = OrderedDict(signature(dxgb.DaskDMatrix).parameters)
del sig["client"]
ddm_names = list(sig.keys())
sig = OrderedDict(signature(dxgb.DaskQuantileDMatrix).parameters)
del sig["client"]
del sig["max_bin"]
del sig["ref"]
ddqdm_names = list(sig.keys())
assert len(ddm_names) == len(ddqdm_names)
# between dask
for i in range(len(ddm_names)):
assert ddm_names[i] == ddqdm_names[i]
sig = OrderedDict(signature(xgb.DMatrix).parameters)
del sig["nthread"] # no nthread in dask
dm_names = list(sig.keys())
sig = OrderedDict(signature(xgb.QuantileDMatrix).parameters)
del sig["nthread"]
del sig["max_bin"]
del sig["ref"]
dqdm_names = list(sig.keys())
# between single node
assert len(dm_names) == len(dqdm_names)
for i in range(len(dm_names)):
assert dm_names[i] == dqdm_names[i]
# ddm <-> dm
for i in range(len(ddm_names)):
assert ddm_names[i] == dm_names[i]
# dqdm <-> ddqdm
for i in range(len(ddqdm_names)):
assert ddqdm_names[i] == dqdm_names[i]
sig = OrderedDict(signature(xgb.XGBRanker.fit).parameters)
ranker_names = list(sig.keys())
sig = OrderedDict(signature(xgb.dask.DaskXGBRanker.fit).parameters)
dranker_names = list(sig.keys())
for rn, drn in zip(ranker_names, dranker_names):
assert rn == drn
def run_quantile(self, name: str, local_cuda_client: Client) -> None:
exe = None
for possible_path in {
"./testxgboost",
"./build/testxgboost",
"../build/testxgboost",
"../gpu-build/testxgboost",
}:
if os.path.exists(possible_path):
exe = possible_path
assert exe, "No testxgboost executable found."
test = "--gtest_filter=GPUQuantile." + name
def runit(
worker_addr: str, rabit_args: Dict[str, Union[int, str]]
) -> subprocess.CompletedProcess:
# Set up the environment for running the C++ part.
env = os.environ.copy()
env['DMLC_TRACKER_PORT'] = str(rabit_args['DMLC_TRACKER_PORT'])
env["DMLC_TRACKER_URI"] = str(rabit_args["DMLC_TRACKER_URI"])
return subprocess.run([str(exe), test], env=env, stdout=subprocess.PIPE)
workers = tm.get_client_workers(local_cuda_client)
rabit_args = local_cuda_client.sync(
dxgb._get_rabit_args, len(workers), None, local_cuda_client
)
futures = local_cuda_client.map(
runit, workers, pure=False, workers=workers, rabit_args=rabit_args
)
results = local_cuda_client.gather(futures)
for ret in results:
msg = ret.stdout.decode("utf-8")
assert msg.find("1 test from GPUQuantile") != -1, msg
assert ret.returncode == 0, msg
@pytest.mark.gtest
def test_quantile_basic(self, local_cuda_client: Client) -> None:
self.run_quantile("AllReduceBasic", local_cuda_client)
@pytest.mark.gtest
def test_quantile_same_on_all_workers(self, local_cuda_client: Client) -> None:
self.run_quantile("SameOnAllWorkers", local_cuda_client)
@pytest.mark.skipif(**tm.no_cupy())
def test_with_asyncio(local_cuda_client: Client) -> None:
address = local_cuda_client.scheduler.address
output = asyncio.run(run_from_dask_array_asyncio(address))
assert isinstance(output["booster"], xgb.Booster)
assert isinstance(output["history"], dict)
async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainReturnT:
async with Client(scheduler_address, asynchronous=True) as client:
import cupy as cp
X, y, _ = generate_array()
X = X.map_blocks(cp.array)
y = y.map_blocks(cp.array)
m = await xgb.dask.DaskDeviceQuantileDMatrix(client, X, y)
output = await xgb.dask.train(client, {"tree_method": "gpu_hist"}, dtrain=m)
with_m = await xgb.dask.predict(client, output, m)
with_X = await xgb.dask.predict(client, output, X)
inplace = await xgb.dask.inplace_predict(client, output, X)
assert isinstance(with_m, da.Array)
assert isinstance(with_X, da.Array)
assert isinstance(inplace, da.Array)
cp.testing.assert_allclose(
await client.compute(with_m), await client.compute(with_X)
)
cp.testing.assert_allclose(
await client.compute(with_m), await client.compute(inplace)
)
client.shutdown()
return output


@@ -0,0 +1,10 @@
from typing import Sequence
import pytest
def pytest_collection_modifyitems(config: pytest.Config, items: Sequence) -> None:
# Mark all tests collected here as `mgpu`.
mgpu_mark = pytest.mark.mgpu
for item in items:
item.add_marker(mgpu_mark)


@@ -0,0 +1,16 @@
#!/bin/bash
# This script is only intended for running XGBoost tests on the official CI, where we have
# access to a 4-GPU cluster. The discovery command below is for running tests on a local
# machine, where the driver and the GPU worker might be the same machine, for ease of development.
if ! command -v nvidia-smi &> /dev/null
then
# default to 4 GPUs
echo "{\"name\":\"gpu\",\"addresses\":[\"0\",\"1\",\"2\",\"3\"]}"
exit
else
# https://github.com/apache/spark/blob/master/examples/src/main/scripts/getGpusResources.sh
ADDRS=`nvidia-smi --query-gpu=index --format=csv,noheader | sed -e ':a' -e 'N' -e'$!ba' -e 's/\n/","/g'`
echo {\"name\": \"gpu\", \"addresses\":[\"$ADDRS\"]}
fi


@@ -0,0 +1,16 @@
import pytest
from xgboost import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_spark())
from ..test_with_spark.test_data import run_dmatrix_ctor
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.parametrize(
"is_feature_cols,is_qdm",
[(True, True), (True, False), (False, True), (False, False)],
)
def test_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool) -> None:
run_dmatrix_ctor(is_feature_cols, is_qdm, on_gpu=True)


@@ -0,0 +1,224 @@
import json
import logging
import subprocess
import pytest
import sklearn
from xgboost import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_spark())
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
def get_devices():
"""This works only if driver is the same machine of worker."""
completed = subprocess.run(gpu_discovery_script_path, stdout=subprocess.PIPE)
assert completed.returncode == 0, "Failed to execute discovery script."
msg = completed.stdout.decode("utf-8")
result = json.loads(msg)
addresses = result["addresses"]
return addresses
executor_gpu_amount = len(get_devices())
executor_cores = executor_gpu_amount
num_workers = executor_gpu_amount
@pytest.fixture(scope="module", autouse=True)
def spark_session_with_gpu():
spark_config = {
"spark.master": f"local-cluster[1, {executor_gpu_amount}, 1024]",
"spark.python.worker.reuse": "false",
"spark.driver.host": "127.0.0.1",
"spark.task.maxFailures": "1",
"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
"spark.sql.pyspark.jvmStacktrace.enabled": "true",
"spark.cores.max": executor_cores,
"spark.task.cpus": "1",
"spark.executor.cores": executor_cores,
"spark.worker.resource.gpu.amount": executor_gpu_amount,
"spark.task.resource.gpu.amount": "1",
"spark.executor.resource.gpu.amount": executor_gpu_amount,
"spark.worker.resource.gpu.discoveryScript": gpu_discovery_script_path,
}
builder = SparkSession.builder.appName("xgboost spark python API Tests with GPU")
for k, v in spark_config.items():
builder.config(k, v)
spark = builder.getOrCreate()
logging.getLogger("pyspark").setLevel(logging.INFO)
# We run a dummy job so that we block until the workers have connected to the master
spark.sparkContext.parallelize(
range(num_workers), num_workers
).barrier().mapPartitions(lambda _: []).collect()
yield spark
spark.stop()
@pytest.fixture
def spark_iris_dataset(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_iris()
train_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, num_workers), ["features", "label"]
)
test_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, num_workers), ["features", "label"]
)
return train_df, test_df
@pytest.fixture
def spark_iris_dataset_feature_cols(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_iris()
train_rows = [
(*features.tolist(), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, num_workers),
[*data.feature_names, "label"],
)
test_rows = [
(*features.tolist(), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, num_workers),
[*data.feature_names, "label"],
)
return train_df, test_df, data.feature_names
@pytest.fixture
def spark_diabetes_dataset(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_diabetes()
train_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, num_workers), ["features", "label"]
)
test_rows = [
(Vectors.dense(features), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, num_workers), ["features", "label"]
)
return train_df, test_df
@pytest.fixture
def spark_diabetes_dataset_feature_cols(spark_session_with_gpu):
spark = spark_session_with_gpu
data = sklearn.datasets.load_diabetes()
train_rows = [
(*features.tolist(), float(label))
for features, label in zip(data.data[0::2], data.target[0::2])
]
train_df = spark.createDataFrame(
spark.sparkContext.parallelize(train_rows, num_workers),
[*data.feature_names, "label"],
)
test_rows = [
(*features.tolist(), float(label))
for features, label in zip(data.data[1::2], data.target[1::2])
]
test_df = spark.createDataFrame(
spark.sparkContext.parallelize(test_rows, num_workers),
[*data.feature_names, "label"],
)
return train_df, test_df, data.feature_names
def test_sparkxgb_classifier_with_gpu(spark_iris_dataset):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
classifier = SparkXGBClassifier(use_gpu=True, num_workers=num_workers)
train_df, test_df = spark_iris_dataset
model = classifier.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1 = evaluator.evaluate(pred_result_df)
assert f1 >= 0.97
def test_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature_cols):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
train_df, test_df, feature_names = spark_iris_dataset_feature_cols
classifier = SparkXGBClassifier(
features_col=feature_names, use_gpu=True, num_workers=num_workers
)
model = classifier.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1 = evaluator.evaluate(pred_result_df)
assert f1 >= 0.97
def test_cv_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature_cols):
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
train_df, test_df, feature_names = spark_iris_dataset_feature_cols
classifier = SparkXGBClassifier(
features_col=feature_names, use_gpu=True, num_workers=num_workers
)
grid = ParamGridBuilder().addGrid(classifier.max_depth, [6, 8]).build()
evaluator = MulticlassClassificationEvaluator(metricName="f1")
cv = CrossValidator(
estimator=classifier, evaluator=evaluator, estimatorParamMaps=grid, numFolds=3
)
cvModel = cv.fit(train_df)
pred_result_df = cvModel.transform(test_df)
f1 = evaluator.evaluate(pred_result_df)
assert f1 >= 0.97
def test_sparkxgb_regressor_with_gpu(spark_diabetes_dataset):
from pyspark.ml.evaluation import RegressionEvaluator
regressor = SparkXGBRegressor(use_gpu=True, num_workers=num_workers)
train_df, test_df = spark_diabetes_dataset
model = regressor.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(pred_result_df)
assert rmse <= 65.0
def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature_cols):
from pyspark.ml.evaluation import RegressionEvaluator
train_df, test_df, feature_names = spark_diabetes_dataset_feature_cols
regressor = SparkXGBRegressor(
features_col=feature_names, use_gpu=True, num_workers=num_workers
)
model = regressor.fit(train_df)
pred_result_df = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(pred_result_df)
assert rmse <= 65.0


@@ -0,0 +1 @@

File diff suppressed because it is too large.


@@ -0,0 +1,156 @@
from typing import List
import numpy as np
import pandas as pd
import pytest
from xgboost import testing as tm
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
from xgboost.spark.data import (
_read_csr_matrix_from_unwrapped_spark_vec,
alias,
create_dmatrix_from_partitions,
stack_series,
)
from xgboost import DMatrix, QuantileDMatrix
def test_stack() -> None:
a = pd.DataFrame({"a": [[1, 2], [3, 4]]})
b = stack_series(a["a"])
assert b.shape == (2, 2)
a = pd.DataFrame({"a": [[1], [3]]})
b = stack_series(a["a"])
assert b.shape == (2, 1)
a = pd.DataFrame({"a": [np.array([1, 2]), np.array([3, 4])]})
b = stack_series(a["a"])
assert b.shape == (2, 2)
a = pd.DataFrame({"a": [np.array([1]), np.array([3])]})
b = stack_series(a["a"])
assert b.shape == (2, 1)
def run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None:
rng = np.random.default_rng(0)
dfs: List[pd.DataFrame] = []
n_features = 16
n_samples_per_batch = 16
n_batches = 10
feature_types = ["float"] * n_features
for i in range(n_batches):
X = rng.normal(loc=0, size=256).reshape(n_samples_per_batch, n_features)
y = rng.normal(loc=0, size=n_samples_per_batch)
m = rng.normal(loc=0, size=n_samples_per_batch)
w = rng.normal(loc=0.5, scale=0.5, size=n_samples_per_batch)
w -= w.min()
valid = rng.binomial(n=1, p=0.5, size=16).astype(np.bool_)
df = pd.DataFrame(
{alias.label: y, alias.margin: m, alias.weight: w, alias.valid: valid}
)
if is_feature_cols:
for j in range(X.shape[1]):
df[f"feat-{j}"] = pd.Series(X[:, j])
else:
df[alias.data] = pd.Series(list(X))
dfs.append(df)
kwargs = {"feature_types": feature_types}
device_id = 0 if on_gpu else None
cols = [f"feat-{i}" for i in range(n_features)]
feature_cols = cols if is_feature_cols else None
train_Xy, valid_Xy = create_dmatrix_from_partitions(
iter(dfs),
feature_cols,
gpu_id=device_id,
use_qdm=is_qdm,
kwargs=kwargs,
enable_sparse_data_optim=False,
has_validation_col=True,
)
if is_qdm:
assert isinstance(train_Xy, QuantileDMatrix)
assert isinstance(valid_Xy, QuantileDMatrix)
else:
assert not isinstance(train_Xy, QuantileDMatrix)
assert isinstance(train_Xy, DMatrix)
assert not isinstance(valid_Xy, QuantileDMatrix)
assert isinstance(valid_Xy, DMatrix)
assert valid_Xy is not None
assert valid_Xy.num_row() + train_Xy.num_row() == n_samples_per_batch * n_batches
assert train_Xy.num_col() == n_features
assert valid_Xy.num_col() == n_features
df = pd.concat(dfs, axis=0)
df_train = df.loc[~df[alias.valid], :]
df_valid = df.loc[df[alias.valid], :]
assert df_train.shape[0] == train_Xy.num_row()
assert df_valid.shape[0] == valid_Xy.num_row()
# margin
np.testing.assert_allclose(
df_train[alias.margin].to_numpy(), train_Xy.get_base_margin()
)
np.testing.assert_allclose(
df_valid[alias.margin].to_numpy(), valid_Xy.get_base_margin()
)
# weight
np.testing.assert_allclose(df_train[alias.weight].to_numpy(), train_Xy.get_weight())
np.testing.assert_allclose(df_valid[alias.weight].to_numpy(), valid_Xy.get_weight())
# label
np.testing.assert_allclose(df_train[alias.label].to_numpy(), train_Xy.get_label())
np.testing.assert_allclose(df_valid[alias.label].to_numpy(), valid_Xy.get_label())
np.testing.assert_equal(train_Xy.feature_types, feature_types)
np.testing.assert_equal(valid_Xy.feature_types, feature_types)
@pytest.mark.parametrize(
"is_feature_cols,is_qdm",
[(True, True), (True, False), (False, True), (False, False)],
)
def test_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool) -> None:
run_dmatrix_ctor(is_feature_cols, is_qdm, on_gpu=False)
def test_read_csr_matrix_from_unwrapped_spark_vec() -> None:
from scipy.sparse import csr_matrix
pd1 = pd.DataFrame(
{
"featureVectorType": [0, 1, 1, 0],
"featureVectorSize": [3, None, None, 3],
"featureVectorIndices": [
np.array([0, 2], dtype=np.int32),
None,
None,
np.array([1, 2], dtype=np.int32),
],
"featureVectorValues": [
np.array([3.0, 0.0], dtype=np.float64),
np.array([13.0, 14.0, 0.0], dtype=np.float64),
np.array([0.0, 24.0, 25.0], dtype=np.float64),
np.array([0.0, 35.0], dtype=np.float64),
],
}
)
sm = _read_csr_matrix_from_unwrapped_spark_vec(pd1)
assert isinstance(sm, csr_matrix)
np.testing.assert_array_equal(
sm.data, [3.0, 0.0, 13.0, 14.0, 0.0, 0.0, 24.0, 25.0, 0.0, 35.0]
)
np.testing.assert_array_equal(sm.indptr, [0, 2, 5, 8, 10])
np.testing.assert_array_equal(sm.indices, [0, 2, 0, 1, 2, 0, 1, 2, 1, 2])
assert sm.shape == (4, 3)

File diff suppressed because it is too large.


@@ -0,0 +1,449 @@
import json
import os
import random
import sys
import uuid
import numpy as np
import pytest
from xgboost import testing as tm
pytestmark = pytest.mark.skipif(**tm.no_spark())
from pyspark.ml.linalg import Vectors
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from xgboost.spark.utils import _get_max_num_concurrent_tasks
from .utils import SparkLocalClusterTestCase
class XgboostLocalClusterTestCase(SparkLocalClusterTestCase):
def setUp(self):
random.seed(2020)
self.n_workers = _get_max_num_concurrent_tasks(self.session.sparkContext)
# The following code uses the xgboost Python library to train an xgb model and predict.
#
# >>> import numpy as np
# >>> import xgboost
# >>> X = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]])
# >>> y = np.array([0, 1])
# >>> reg1 = xgboost.XGBRegressor()
# >>> reg1.fit(X, y)
# >>> reg1.predict(X)
# array([8.8363886e-04, 9.9911636e-01], dtype=float32)
# >>> def custom_lr(boosting_round, num_boost_round):
# ... return 1.0 / (boosting_round + 1)
# ...
# >>> reg1.fit(X, y, callbacks=[xgboost.callback.reset_learning_rate(custom_lr)])
# >>> reg1.predict(X)
# array([0.02406833, 0.97593164], dtype=float32)
# >>> reg2 = xgboost.XGBRegressor(max_depth=5, n_estimators=10)
# >>> reg2.fit(X, y)
# >>> reg2.predict(X, ntree_limit=5)
# array([0.22185263, 0.77814734], dtype=float32)
self.reg_params = {"max_depth": 5, "n_estimators": 10, "ntree_limit": 5}
self.reg_df_train = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
],
["features", "label"],
)
self.reg_df_test = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0.0, 0.2219, 0.02406),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1.0, 0.7781, 0.9759),
],
[
"features",
"expected_prediction",
"expected_prediction_with_params",
"expected_prediction_with_callbacks",
],
)
# Distributed section
# Binary classification
self.cls_df_train_distributed = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
(Vectors.dense(4.0, 5.0, 6.0), 0),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1),
]
* 100,
["features", "label"],
)
self.cls_df_test_distributed = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0, [0.9949826, 0.0050174]),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0050174, 0.9949826]),
(Vectors.dense(4.0, 5.0, 6.0), 0, [0.9949826, 0.0050174]),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0050174, 0.9949826]),
],
["features", "expected_label", "expected_probability"],
)
# Binary classification with different num_estimators
self.cls_df_test_distributed_lower_estimators = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0, [0.9735, 0.0265]),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, [0.0265, 0.9735]),
(Vectors.dense(4.0, 5.0, 6.0), 0, [0.9735, 0.0265]),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, [0.0265, 0.9735]),
],
["features", "expected_label", "expected_probability"],
)
# Multiclass classification
self.cls_df_train_distributed_multiclass = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
(Vectors.dense(4.0, 5.0, 6.0), 0),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
]
* 100,
["features", "label"],
)
self.cls_df_test_distributed_multiclass = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0, [4.294563, -2.449409, -2.449409]),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
1,
[-2.3796105, 3.669014, -2.449409],
),
(Vectors.dense(4.0, 5.0, 6.0), 0, [4.294563, -2.449409, -2.449409]),
(
Vectors.sparse(3, {1: 6.0, 2: 7.5}),
2,
[-2.3796105, -2.449409, 3.669014],
),
],
["features", "expected_label", "expected_margins"],
)
# Regression
self.reg_df_train_distributed = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1),
(Vectors.dense(4.0, 5.0, 6.0), 0),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 2),
]
* 100,
["features", "label"],
)
self.reg_df_test_distributed = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 1.533e-04),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 9.999e-01),
(Vectors.dense(4.0, 5.0, 6.0), 1.533e-04),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1.999e00),
],
["features", "expected_label"],
)
# Adding weight and validation
self.clf_params_with_eval_dist = {
"validation_indicator_col": "isVal",
"early_stopping_rounds": 1,
"eval_metric": "logloss",
}
self.clf_params_with_weight_dist = {"weight_col": "weight"}
self.cls_df_train_distributed_with_eval_weight = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
(Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
]
* 100,
["features", "label", "isVal", "weight"],
)
self.cls_df_test_distributed_with_eval_weight = self.session.createDataFrame(
[
(
Vectors.dense(1.0, 2.0, 3.0),
[0.9955, 0.0044],
[0.9904, 0.0096],
[0.9903, 0.0097],
),
],
[
"features",
"expected_prob_with_weight",
"expected_prob_with_eval",
"expected_prob_with_weight_and_eval",
],
)
self.clf_best_score_eval = 0.009677
self.clf_best_score_weight_and_eval = 0.006626
self.reg_params_with_eval_dist = {
"validation_indicator_col": "isVal",
"early_stopping_rounds": 1,
"eval_metric": "rmse",
}
self.reg_params_with_weight_dist = {"weight_col": "weight"}
self.reg_df_train_distributed_with_eval_weight = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 0, False, 1.0),
(Vectors.sparse(3, {1: 1.0, 2: 5.5}), 1, False, 2.0),
(Vectors.dense(4.0, 5.0, 6.0), 0, True, 1.0),
(Vectors.sparse(3, {1: 6.0, 2: 7.5}), 1, True, 2.0),
]
* 100,
["features", "label", "isVal", "weight"],
)
self.reg_df_test_distributed_with_eval_weight = self.session.createDataFrame(
[
(Vectors.dense(1.0, 2.0, 3.0), 4.583e-05, 5.239e-05, 6.03e-05),
(
Vectors.sparse(3, {1: 1.0, 2: 5.5}),
9.9997e-01,
9.99947e-01,
9.9995e-01,
),
],
[
"features",
"expected_prediction_with_weight",
"expected_prediction_with_eval",
"expected_prediction_with_weight_and_eval",
],
)
self.reg_best_score_eval = 5.239e-05
self.reg_best_score_weight_and_eval = 4.810e-05
def test_regressor_basic_with_params(self):
regressor = SparkXGBRegressor(**self.reg_params)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_params, atol=1e-3
)
)
def test_callbacks(self):
from xgboost.callback import LearningRateScheduler
path = os.path.join(self.tempdir, str(uuid.uuid4()))
def custom_learning_rate(boosting_round):
return 1.0 / (boosting_round + 1)
cb = [LearningRateScheduler(custom_learning_rate)]
regressor = SparkXGBRegressor(callbacks=cb)
# Test the save/load of the estimator instead of the model, since
# the callbacks param only exists in the estimator but not in the model
regressor.save(path)
regressor = SparkXGBRegressor.load(path)
model = regressor.fit(self.reg_df_train)
pred_result = model.transform(self.reg_df_test).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_callbacks, atol=1e-3
)
)
def test_classifier_distributed_basic(self):
classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
model = classifier.fit(self.cls_df_train_distributed)
pred_result = model.transform(self.cls_df_test_distributed).collect()
for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
self.assertTrue(
np.allclose(row.expected_probability, row.probability, atol=1e-3)
)
def test_classifier_distributed_multiclass(self):
# There is no built-in multiclass option for external storage
classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=100)
model = classifier.fit(self.cls_df_train_distributed_multiclass)
pred_result = model.transform(self.cls_df_test_distributed_multiclass).collect()
for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
self.assertTrue(
np.allclose(row.expected_margins, row.rawPrediction, atol=1e-3)
)
def test_regressor_distributed_basic(self):
regressor = SparkXGBRegressor(num_workers=self.n_workers, n_estimators=100)
model = regressor.fit(self.reg_df_train_distributed)
pred_result = model.transform(self.reg_df_test_distributed).collect()
for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
def test_classifier_distributed_weight_eval(self):
# with weight
classifier = SparkXGBClassifier(
num_workers=self.n_workers,
n_estimators=100,
**self.clf_params_with_weight_dist
)
model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.cls_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.allclose(row.probability, row.expected_prob_with_weight, atol=1e-3)
)
# with eval only
classifier = SparkXGBClassifier(
num_workers=self.n_workers,
n_estimators=100,
**self.clf_params_with_eval_dist
)
model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.cls_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.allclose(row.probability, row.expected_prob_with_eval, atol=1e-3)
)
assert np.isclose(
float(model.get_booster().attributes()["best_score"]),
self.clf_best_score_eval,
rtol=1e-3,
)
# with both weight and eval
classifier = SparkXGBClassifier(
num_workers=self.n_workers,
n_estimators=100,
**self.clf_params_with_eval_dist,
**self.clf_params_with_weight_dist
)
model = classifier.fit(self.cls_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.cls_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.allclose(
row.probability, row.expected_prob_with_weight_and_eval, atol=1e-3
)
)
assert np.isclose(
float(model.get_booster().attributes()["best_score"]),
self.clf_best_score_weight_and_eval,
rtol=1e-3,
)
def test_regressor_distributed_weight_eval(self):
# with weight
regressor = SparkXGBRegressor(
num_workers=self.n_workers,
n_estimators=100,
**self.reg_params_with_weight_dist
)
model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.reg_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction, row.expected_prediction_with_weight, atol=1e-3
)
)
# with eval only
regressor = SparkXGBRegressor(
num_workers=self.n_workers,
n_estimators=100,
**self.reg_params_with_eval_dist
)
model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.reg_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.isclose(row.prediction, row.expected_prediction_with_eval, atol=1e-3)
)
assert np.isclose(
float(model.get_booster().attributes()["best_score"]),
self.reg_best_score_eval,
rtol=1e-3,
)
# with both weight and eval
regressor = SparkXGBRegressor(
num_workers=self.n_workers,
n_estimators=100,
use_external_storage=False,
**self.reg_params_with_eval_dist,
**self.reg_params_with_weight_dist
)
model = regressor.fit(self.reg_df_train_distributed_with_eval_weight)
pred_result = model.transform(
self.reg_df_test_distributed_with_eval_weight
).collect()
for row in pred_result:
self.assertTrue(
np.isclose(
row.prediction,
row.expected_prediction_with_weight_and_eval,
atol=1e-3,
)
)
assert np.isclose(
float(model.get_booster().attributes()["best_score"]),
self.reg_best_score_weight_and_eval,
rtol=1e-3,
)
def test_num_estimators(self):
classifier = SparkXGBClassifier(num_workers=self.n_workers, n_estimators=10)
model = classifier.fit(self.cls_df_train_distributed)
pred_result = model.transform(
self.cls_df_test_distributed_lower_estimators
).collect()
print(pred_result)
for row in pred_result:
self.assertTrue(np.isclose(row.expected_label, row.prediction, atol=1e-3))
self.assertTrue(
np.allclose(row.expected_probability, row.probability, atol=1e-3)
)
def test_distributed_params(self):
classifier = SparkXGBClassifier(num_workers=self.n_workers, max_depth=7)
model = classifier.fit(self.cls_df_train_distributed)
self.assertTrue(hasattr(classifier, "max_depth"))
self.assertEqual(classifier.getOrDefault(classifier.max_depth), 7)
booster_config = json.loads(model.get_booster().save_config())
max_depth = booster_config["learner"]["gradient_booster"]["updater"][
"grow_histmaker"
]["train_param"]["max_depth"]
self.assertEqual(int(max_depth), 7)
def test_repartition(self):
# The following test case has a few partitioned datasets that are either
# well partitioned relative to the number of workers that the user wants
# or poorly partitioned. We only want to repartition when the dataset
# is poorly partitioned so _repartition_needed is true in those instances.
classifier = SparkXGBClassifier(num_workers=self.n_workers)
basic = self.cls_df_train_distributed
self.assertTrue(classifier._repartition_needed(basic))
bad_repartitioned = basic.repartition(self.n_workers + 1)
self.assertTrue(classifier._repartition_needed(bad_repartitioned))
good_repartitioned = basic.repartition(self.n_workers)
self.assertFalse(classifier._repartition_needed(good_repartitioned))
# Now test that _repartition_needed returns True when force_repartition is set,
# regardless of whether the data is already well partitioned.
classifier = SparkXGBClassifier(
num_workers=self.n_workers, force_repartition=True
)
good_repartitioned = basic.repartition(self.n_workers)
self.assertTrue(classifier._repartition_needed(good_repartitioned))


@@ -0,0 +1,143 @@
import contextlib
import logging
import shutil
import sys
import tempfile
import unittest
import pytest
from six import StringIO
from xgboost import testing as tm
pytestmark = [pytest.mark.skipif(**tm.no_spark())]
from pyspark.sql import SparkSession, SQLContext
from xgboost.spark.utils import _get_default_params_from_func
class UtilsTest(unittest.TestCase):
def test_get_default_params(self):
class Foo:
def func1(self, x, y, key1=None, key2="val2", key3=0, key4=None):
pass
unsupported_params = {"key2", "key4"}
expected_default_params = {
"key1": None,
"key3": 0,
}
actual_default_params = _get_default_params_from_func(
Foo.func1, unsupported_params
)
self.assertEqual(
len(expected_default_params.keys()), len(actual_default_params.keys())
)
for k, v in actual_default_params.items():
self.assertEqual(expected_default_params[k], v)
@contextlib.contextmanager
def patch_stdout():
"""patch stdout and give an output"""
sys_stdout = sys.stdout
io_out = StringIO()
sys.stdout = io_out
try:
yield io_out
finally:
sys.stdout = sys_stdout
@contextlib.contextmanager
def patch_logger(name):
"""patch logger and give an output"""
io_out = StringIO()
log = logging.getLogger(name)
handler = logging.StreamHandler(io_out)
log.addHandler(handler)
try:
yield io_out
finally:
log.removeHandler(handler)
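# A minimal usage sketch for patch_logger above; illustrative only, the logger name and
# log message are hypothetical and not part of the original file.
def _example_patch_logger_usage() -> None:
    with patch_logger("xgboost.spark") as output:
        logging.getLogger("xgboost.spark").warning("validation matrix is empty")
        assert "validation matrix is empty" in output.getvalue()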
class TestTempDir(object):
@classmethod
def make_tempdir(cls):
"""
:param dir: Root directory in which to create the temp directory
"""
cls.tempdir = tempfile.mkdtemp(prefix="sparkdl_tests")
@classmethod
def remove_tempdir(cls):
shutil.rmtree(cls.tempdir)
class TestSparkContext(object):
@classmethod
def setup_env(cls, spark_config):
builder = SparkSession.builder.appName("xgboost spark python API Tests")
for k, v in spark_config.items():
builder.config(k, v)
spark = builder.getOrCreate()
logging.getLogger("pyspark").setLevel(logging.INFO)
cls.sc = spark.sparkContext
cls.session = spark
@classmethod
def tear_down_env(cls):
cls.session.stop()
cls.session = None
cls.sc.stop()
cls.sc = None
class SparkTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.setup_env(
{
"spark.master": "local[4]",
"spark.python.worker.reuse": "false",
"spark.driver.host": "127.0.0.1",
"spark.task.maxFailures": "1",
"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
"spark.sql.pyspark.jvmStacktrace.enabled": "true",
}
)
cls.make_tempdir()
@classmethod
def tearDownClass(cls):
cls.remove_tempdir()
cls.tear_down_env()
class SparkLocalClusterTestCase(TestSparkContext, TestTempDir, unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.setup_env(
{
"spark.master": "local-cluster[2, 2, 1024]",
"spark.python.worker.reuse": "false",
"spark.driver.host": "127.0.0.1",
"spark.task.maxFailures": "1",
"spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled": "false",
"spark.sql.pyspark.jvmStacktrace.enabled": "true",
"spark.cores.max": "4",
"spark.task.cpus": "1",
"spark.executor.cores": "2",
}
)
cls.make_tempdir()
# We run a dummy job so that we block until the workers have connected to the master
cls.sc.parallelize(range(4), 4).barrier().mapPartitions(lambda _: []).collect()
@classmethod
def tearDownClass(cls):
cls.remove_tempdir()
cls.tear_down_env()