sync up May 15 2023

amdsc21 2023-05-15 18:59:18 +02:00
commit 8cad8c693c
37 changed files with 628 additions and 398 deletions

View File

@@ -66,7 +66,7 @@ jobs:
cd python-package
python --version
python -m build --sdist
pip install -v ./dist/xgboost-*.tar.gz
pip install -v ./dist/xgboost-*.tar.gz --config-settings use_openmp=False
cd ..
python -c 'import xgboost'
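
This workflow change exercises the new `--config-settings` plumbing: pip forwards each `key=value` pair to the PEP 517 backend as a dict of strings. A minimal sketch of the boolean parsing, with `parse_bool_setting` as a hypothetical helper name; the accepted truthy spellings ("true", "1", "on") mirror the packager diff further down:

```python
# Hypothetical helper showing how a string setting such as
# `use_openmp=False` can be turned into a boolean.
from typing import Dict, Optional


def parse_bool_setting(
    config_settings: Optional[Dict[str, str]], name: str, default: bool = True
) -> bool:
    """Interpret a --config-settings entry as a boolean."""
    if not config_settings or name not in config_settings:
        return default
    return config_settings[name].lower() in ("true", "1", "on")


print(parse_bool_setting({"use_openmp": "False"}, "use_openmp"))  # False
```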

View File

@@ -43,9 +43,38 @@ In the admin CLI, run the following command:
submit_job horizontal-xgboost
```
Make a note of the job id:
```console
Submitted job: 28309e77-a7c5-45e6-b2bc-c2e3655122d8
```
On both workers, you should see train and eval losses printed:
```console
[10:45:41] [0] eval-logloss:0.22646 train-logloss:0.23316
[10:45:41] [1] eval-logloss:0.13776 train-logloss:0.13654
[10:45:41] [2] eval-logloss:0.08036 train-logloss:0.08243
[10:45:41] [3] eval-logloss:0.05830 train-logloss:0.05645
[10:45:41] [4] eval-logloss:0.03825 train-logloss:0.04148
[10:45:41] [5] eval-logloss:0.02660 train-logloss:0.02958
[10:45:41] [6] eval-logloss:0.01386 train-logloss:0.01918
[10:45:41] [7] eval-logloss:0.01018 train-logloss:0.01331
[10:45:41] [8] eval-logloss:0.00847 train-logloss:0.01112
[10:45:41] [9] eval-logloss:0.00691 train-logloss:0.00662
[10:45:41] [10] eval-logloss:0.00543 train-logloss:0.00503
[10:45:41] [11] eval-logloss:0.00445 train-logloss:0.00420
[10:45:41] [12] eval-logloss:0.00336 train-logloss:0.00355
[10:45:41] [13] eval-logloss:0.00277 train-logloss:0.00280
[10:45:41] [14] eval-logloss:0.00252 train-logloss:0.00244
[10:45:41] [15] eval-logloss:0.00177 train-logloss:0.00193
[10:45:41] [16] eval-logloss:0.00156 train-logloss:0.00161
[10:45:41] [17] eval-logloss:0.00135 train-logloss:0.00142
[10:45:41] [18] eval-logloss:0.00123 train-logloss:0.00125
[10:45:41] [19] eval-logloss:0.00106 train-logloss:0.00107
```
Once the training finishes, the model files should be written to
`/tmp/nvflare/poc/site-1/run_1/test.model.json` and `/tmp/nvflare/poc/site-2/run_1/test.model.json`
respectively.
`/tmp/nvflare/poc/site-1/${job_id}/test.model.json` and `/tmp/nvflare/poc/site-2/${job_id}/test.model.json`
respectively, where `job_id` is the UUID printed when we ran `submit_job`.
Finally, shut down everything from the admin CLI, using `admin` as the password:
```shell

View File

@@ -63,8 +63,8 @@ class XGBoostTrainer(Executor):
}
with xgb.collective.CommunicatorContext(**communicator_env):
# Load the file; it will not be sharded in federated mode.
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
dtrain = xgb.DMatrix('agaricus.txt.train?format=libsvm')
dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm')
# Specify parameters via map; definitions are the same as in the C++ version
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
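
For context, the `?format=libsvm` suffix is the URI hint newer XGBoost versions require to pick a text parser when loading from a file path. A short, hedged example of the same calls outside the federated setup, assuming the agaricus files sit in the working directory:

```python
import xgboost as xgb

# The ?format= hint selects the text parser for file-path inputs.
dtrain = xgb.DMatrix("agaricus.txt.train?format=libsvm")
dtest = xgb.DMatrix("agaricus.txt.test?format=libsvm")
param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}
booster = xgb.train(param, dtrain, num_boost_round=2,
                    evals=[(dtest, "eval"), (dtrain, "train")])
```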

View File

@@ -2,7 +2,7 @@
set -e
rm -fr ./agaricus* ./*.pem ./poc
rm -fr ./agaricus* ./*.pem /tmp/nvflare
world_size=2
@@ -11,15 +11,15 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se
openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost"
# Split train and test files manually to simulate a federated environment.
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.train agaricus.txt.train-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../data/agaricus.txt.test agaricus.txt.test-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.train agaricus.txt.train-site-
split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test agaricus.txt.test-site-
nvflare poc -n 2 --prepare
mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost
cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/
for id in $(eval echo "{1..$world_size}"); do
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$id"/
cp agaricus.txt.train-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.train
cp agaricus.txt.test-site-"$id" /tmp/nvflare/poc/site-"$id"/agaricus.txt.test
for (( site=1; site<=world_size; site++ )); do
cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/
cp agaricus.txt.train-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.train
cp agaricus.txt.test-site-"$site" /tmp/nvflare/poc/site-"$site"/agaricus.txt.test
done

View File

@@ -143,7 +143,7 @@ extensions = [
"sphinx.ext.intersphinx",
"sphinx_gallery.gen_gallery",
"breathe",
"recommonmark",
"myst_parser",
]
sphinx_gallery_conf = {
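
`myst_parser` replaces the deprecated `recommonmark` extension for Markdown support in Sphinx. A minimal sketch of the corresponding `conf.py` fragment; the explicit `source_suffix` mapping is an assumption, since recent MyST versions register `.md` on their own:

```python
# conf.py sketch: only the Markdown-related pieces.
extensions = [
    "myst_parser",
]
source_suffix = {
    ".rst": "restructuredtext",
    ".md": "markdown",
}
```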

View File

@@ -1,14 +1,15 @@
sphinx>=5.2.1
sphinx
mock
sphinx_rtd_theme>=1.0.0
breathe
scikit-learn
sh>=1.12.14
matplotlib>=2.1
sh
matplotlib
graphviz
numpy
recommonmark
myst-parser
xgboost_ray
sphinx-gallery
pyspark
cloudpickle
cloudpickle
setuptools

View File

@@ -150,7 +150,7 @@ inline LINALG_HD int Popc(uint64_t v) {
return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__) || defined(__HIP_PLATFORM_AMD__)
return __builtin_popcountll(v);
#elif defined(_MSC_VER) && _defined(_M_X64)
#elif defined(_MSC_VER) && defined(_M_X64)
return __popcnt64(v);
#else
return NativePopc(v);

View File

@@ -129,7 +129,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>3.0.1</version>
<version>3.1.0</version>
<executions>
<execution>
<id>sign-artifacts</id>
@@ -427,7 +427,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>3.0.0</version>
<version>3.1.0</version>
<configuration>
<skipTests>false</skipTests>
<useSystemClassLoader>false</useSystemClassLoader>

View File

@@ -48,12 +48,6 @@ pom_template = """
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${{scala.binary.version}}</artifactId>
<version>2.6.20</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${{scala.binary.version}}</artifactId>

View File

@@ -1,26 +0,0 @@
[MASTER]
ignore=tests
extension-pkg-whitelist=numpy
disable=unexpected-special-method-signature,too-many-nested-blocks,useless-object-inheritance,import-outside-toplevel,unsubscriptable-object,attribute-defined-outside-init
dummy-variables-rgx=(unused|)_.*
reports=no
[BASIC]
# Enforce naming convention
const-naming-style=UPPER_CASE
class-naming-style=PascalCase
function-naming-style=snake_case
method-naming-style=snake_case
attr-naming-style=snake_case
argument-naming-style=snake_case
variable-naming-style=snake_case
class-attribute-naming-style=snake_case
# Allow single-letter variables
variable-rgx=[a-zA-Z_][a-z0-9_]{0,30}$

View File

@@ -26,23 +26,18 @@ class BuildConfiguration: # pylint: disable=R0902
# Special option: See explanation below
use_system_libxgboost: bool = False
def _set_config_setting(
self, config_settings: Dict[str, Any], field_name: str
) -> None:
if field_name in config_settings:
def _set_config_setting(self, config_settings: Dict[str, Any]) -> None:
for field_name in config_settings:
setattr(
self,
field_name,
(config_settings[field_name].lower() in ["true", "1", "on"]),
)
else:
raise ValueError(f"Field {field_name} is not a valid config_settings")
def update(self, config_settings: Optional[Dict[str, Any]]) -> None:
"""Parse config_settings from Pip (or other PEP 517 frontend)"""
if config_settings is not None:
for field_name in [x.name for x in dataclasses.fields(self)]:
self._set_config_setting(config_settings, field_name)
self._set_config_setting(config_settings)
def get_cmake_args(self) -> List[str]:
"""Convert build configuration to CMake args"""

View File

@@ -130,20 +130,21 @@ def locate_or_build_libxgboost(
"""Locate libxgboost; if not exist, build it"""
logger = logging.getLogger("xgboost.packager.locate_or_build_libxgboost")
libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
if libxgboost is not None:
return libxgboost
if build_config.use_system_libxgboost:
# Find libxgboost from system prefix
sys_prefix = pathlib.Path(sys.prefix).absolute().resolve()
libxgboost = sys_prefix / "lib" / _lib_name()
if not libxgboost.exists():
libxgboost_sys = sys_prefix / "lib" / _lib_name()
if not libxgboost_sys.exists():
raise RuntimeError(
f"use_system_libxgboost was specified but {_lib_name()} is "
f"not found in {libxgboost.parent}"
f"not found in {libxgboost_sys.parent}"
)
logger.info("Using system XGBoost: %s", str(libxgboost))
logger.info("Using system XGBoost: %s", str(libxgboost_sys))
return libxgboost_sys
libxgboost = locate_local_libxgboost(toplevel_dir, logger=logger)
if libxgboost is not None:
return libxgboost
if toplevel_dir.joinpath("cpp_src").exists():
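
The reordering above makes `use_system_libxgboost` take priority over a previously built local library. A condensed, hedged sketch of the resulting lookup order, with `find_local` and the library file name as placeholders:

```python
import pathlib
import sys
from typing import Callable, Optional


def locate_libxgboost(
    use_system: bool,
    lib_name: str,  # e.g. "libxgboost.so"; the real name varies by platform
    find_local: Callable[[], Optional[pathlib.Path]],
) -> Optional[pathlib.Path]:
    """System prefix first (when requested), then a local build, else None."""
    if use_system:
        lib = pathlib.Path(sys.prefix).absolute().resolve() / "lib" / lib_name
        if not lib.exists():
            raise RuntimeError(f"{lib_name} is not found in {lib.parent}")
        return lib
    return find_local()  # None here means: fall through to a source build
```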

View File

@@ -79,7 +79,8 @@ def build_wheel(
libxgboost = locate_or_build_libxgboost(
TOPLEVEL_DIR, build_dir=build_dir, build_config=build_config
)
copy_with_logging(libxgboost, lib_path, logger=logger)
if not build_config.use_system_libxgboost:
copy_with_logging(libxgboost, lib_path, logger=logger)
with cd(workspace):
wheel_name = hatchling.build.build_wheel(

View File

@@ -9,13 +9,13 @@ build-backend = "packager.pep517"
name = "xgboost"
version = "2.0.0-dev"
authors = [
{name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu"},
{name = "Jiaming Yuan", email = "jm.yuan@outlook.com"}
{ name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" },
{ name = "Jiaming Yuan", email = "jm.yuan@outlook.com" }
]
description = "XGBoost Python Package"
readme = {file = "README.rst", content-type = "text/x-rst"}
readme = { file = "README.rst", content-type = "text/x-rst" }
requires-python = ">=3.8"
license = {text = "Apache-2.0"}
license = { text = "Apache-2.0" }
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Development Status :: 5 - Production/Stable",
@@ -24,13 +24,18 @@ classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10"
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11"
]
dependencies = [
"numpy",
"scipy"
]
[project.urls]
documentation = "https://xgboost.readthedocs.io/en/stable/"
repository = "https://github.com/dmlc/xgboost"
[project.optional-dependencies]
pandas = ["pandas"]
scikit-learn = ["scikit-learn"]
@@ -40,3 +45,39 @@ plotting = ["graphviz", "matplotlib"]
pyspark = ["pyspark", "scikit-learn", "cloudpickle"]
[tool.hatch.build.targets.wheel.hooks.custom]
[tool.isort]
profile = "black"
[tool.mypy]
ignore_missing_imports = true
disallow_untyped_defs = true
follow_imports = "silent"
[tool.pylint.main]
ignore = ["tests"]
extension-pkg-whitelist = ["numpy"]
disable = [
"attribute-defined-outside-init",
"import-outside-toplevel",
"too-many-nested-blocks",
"unexpected-special-method-signature",
"unsubscriptable-object",
"useless-object-inheritance"
]
dummy-variables-rgx = "(unused|)_.*"
reports = false
[tool.pylint.basic]
# Enforce naming convention
const-naming-style = "UPPER_CASE"
class-naming-style = "PascalCase"
function-naming-style = "snake_case"
method-naming-style = "snake_case"
attr-naming-style = "snake_case"
argument-naming-style = "snake_case"
variable-naming-style = "snake_case"
class-attribute-naming-style = "snake_case"
# Allow single-letter variables
variable-rgx = "[a-zA-Z_][a-z0-9_]{0,30}$"

View File

@@ -1,7 +0,0 @@
[metadata]
description_file = README.rst
[mypy]
ignore_missing_imports = True
disallow_untyped_defs = True
follow_imports = silent

View File

@@ -200,12 +200,6 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) {
connect_retry = atoi(val);
}
if (!strcmp(name, "rabit_bootstrap_cache")) {
rabit_bootstrap_cache = utils::StringToBool(val);
}
if (!strcmp(name, "rabit_debug")) {
rabit_debug = utils::StringToBool(val);
}
if (!strcmp(name, "rabit_timeout")) {
rabit_timeout = utils::StringToBool(val);
}

View File

@@ -487,10 +487,6 @@ class AllreduceBase : public IEngine {
int world_size; // NOLINT
// connect retry time
int connect_retry; // NOLINT
// enable bootstrap cache 0 false 1 true
bool rabit_bootstrap_cache = false; // NOLINT
// enable detailed logging
bool rabit_debug = false; // NOLINT
// by default, if rabit worker not recover in half an hour exit
std::chrono::seconds timeout_sec{std::chrono::seconds{1800}}; // NOLINT
// flag to enable rabit_timeout

View File

@@ -4,9 +4,6 @@
* \brief The command line interface program of xgboost.
* This file is not included in dynamic library.
*/
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)

View File

@@ -222,15 +222,15 @@ void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string*
std::unique_lock<std::mutex> lock(mutex_);
LOG(INFO) << functor.name << " rank " << rank << ": waiting for current sequence number";
LOG(DEBUG) << functor.name << " rank " << rank << ": waiting for current sequence number";
cv_.wait(lock, [this, sequence_number] { return sequence_number_ == sequence_number; });
LOG(INFO) << functor.name << " rank " << rank << ": handling request";
LOG(DEBUG) << functor.name << " rank " << rank << ": handling request";
functor(input, bytes, &buffer_);
received_++;
if (received_ == world_size_) {
LOG(INFO) << functor.name << " rank " << rank << ": all requests received";
LOG(DEBUG) << functor.name << " rank " << rank << ": all requests received";
output->assign(buffer_);
sent_++;
lock.unlock();
@@ -238,15 +238,15 @@ void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string*
return;
}
LOG(INFO) << functor.name << " rank " << rank << ": waiting for all clients";
LOG(DEBUG) << functor.name << " rank " << rank << ": waiting for all clients";
cv_.wait(lock, [this] { return received_ == world_size_; });
LOG(INFO) << functor.name << " rank " << rank << ": sending reply";
LOG(DEBUG) << functor.name << " rank " << rank << ": sending reply";
output->assign(buffer_);
sent_++;
if (sent_ == world_size_) {
LOG(INFO) << functor.name << " rank " << rank << ": all replies sent";
LOG(DEBUG) << functor.name << " rank " << rank << ": all replies sent";
sent_ = 0;
received_ = 0;
buffer_.clear();
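
These handler messages were demoted from INFO to DEBUG, so they no longer show up at the default verbosity. From the Python package they can still be surfaced by raising the global verbosity:

```python
import xgboost as xgb

# 0 = silent, 1 = warning, 2 = info, 3 = debug
xgb.set_config(verbosity=3)
```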

View File

@@ -1355,14 +1355,12 @@ class CUDAStream {
cudaStream_t stream_;
public:
CUDAStream() {
dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
}
~CUDAStream() {
dh::safe_cuda(cudaStreamDestroy(stream_));
}
CUDAStream() { dh::safe_cuda(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)); }
~CUDAStream() { dh::safe_cuda(cudaStreamDestroy(stream_)); }
[[nodiscard]] CUDAStreamView View() const { return CUDAStreamView{stream_}; }
[[nodiscard]] cudaStream_t Handle() const { return stream_; }
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
void Sync() { this->View().Sync(); }
};

View File

@@ -1273,14 +1273,12 @@ class CUDAStream {
hipStream_t stream_;
public:
CUDAStream() {
dh::safe_cuda(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking));
}
~CUDAStream() {
dh::safe_cuda(hipStreamDestroy(stream_));
}
CUDAStream() { dh::safe_cuda(hipStreamCreateWithFlags(&stream_, hipStreamNonBlocking)); }
~CUDAStream() { dh::safe_cuda(hipStreamDestroy(stream_)); }
[[nodiscard]] CUDAStreamView View() const { return CUDAStreamView{stream_}; }
[[nodiscard]] hipStream_t Handle() const { return stream_; }
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
void Sync() { this->View().Sync(); }
};

View File

@@ -183,14 +183,28 @@ class PartitionBuilder {
SetNRightElems(node_in_set, range.begin(), n_right);
}
template <bool any_missing, typename ColumnType, typename Predicate>
void MaskKernel(ColumnType* p_column, common::Span<const size_t> row_indices, size_t base_rowid,
BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) {
auto& column = *p_column;
for (auto const row_id : row_indices) {
auto const bin_id = column[row_id - base_rowid];
if (any_missing && bin_id == ColumnType::kMissingId) {
missing_bits->Set(row_id - base_rowid);
} else if (pred(row_id, bin_id)) {
decision_bits->Set(row_id - base_rowid);
}
}
}
/**
* @brief When data is split by column, we don't have all the features locally on the current
* worker, so we go through all the rows and set bit vectors marking whether the decision is to
* go right and whether the feature value used for the split is missing.
*/
template <typename ExpandEntry>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void MaskRows(const size_t node_in_set, std::vector<ExpandEntry> const& nodes,
const common::Range1d range, GHistIndexMatrix const& gmat,
const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat,
const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid,
BitVector* decision_bits, BitVector* missing_bits) {
common::Span<const size_t> rid_span(rid + range.begin(), rid + range.end());
@@ -204,7 +218,7 @@ class PartitionBuilder {
for (auto row_id : rid_span) {
auto gidx = gmat.GetGindex(row_id, fid);
if (gidx > -1) {
bool go_left = false;
bool go_left;
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx]);
} else {
@@ -218,7 +232,27 @@
}
}
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
auto pred_hist = [&](auto ridx, auto bin_id) {
if (any_cat && is_cat) {
auto gidx = gmat.GetGindex(ridx, fid);
CHECK_GT(gidx, -1);
return Decision(node_cats, cut_values[gidx]);
} else {
return bin_id <= split_cond;
}
};
if (column_matrix.GetColumnType(fid) == xgboost::common::kDenseColumn) {
auto column = column_matrix.DenseColumn<BinIdxType, any_missing>(fid);
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
pred_hist);
} else {
CHECK_EQ(any_missing, true);
auto column =
column_matrix.SparseColumn<BinIdxType>(fid, rid_span.front() - gmat.base_rowid);
MaskKernel<any_missing>(&column, rid_span, gmat.base_rowid, decision_bits, missing_bits,
pred_hist);
}
}
}
@@ -238,7 +272,7 @@
std::size_t nid = nodes[node_in_set].nid;
bool default_left = tree[nid].DefaultLeft();
auto pred_approx = [&](auto ridx) {
auto pred = [&](auto ridx) {
bool go_left = default_left;
bool is_missing = missing_bits.Check(ridx - gmat.base_rowid);
if (!is_missing) {
@@ -248,11 +282,7 @@
};
std::pair<size_t, size_t> child_nodes_sizes;
if (!column_matrix.IsInitialized()) {
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred_approx);
} else {
LOG(FATAL) << "Column data split is only supported for the `approx` tree method";
}
child_nodes_sizes = PartitionRangeKernel(rid_span, left, right, pred);
const size_t n_left = child_nodes_sizes.first;
const size_t n_right = child_nodes_sizes.second;

View File

@@ -26,9 +26,9 @@ class IndexTransformIter {
public:
using iterator_category = std::random_access_iterator_tag; // NOLINT
using value_type = std::result_of_t<Fn(std::size_t)>; // NOLINT
using reference = std::result_of_t<Fn(std::size_t)>; // NOLINT
using value_type = std::remove_cv_t<std::remove_reference_t<reference>>; // NOLINT
using difference_type = detail::ptrdiff_t; // NOLINT
using reference = std::add_lvalue_reference_t<value_type>; // NOLINT
using pointer = std::add_pointer_t<value_type>; // NOLINT
public:
@@ -43,8 +43,8 @@
return *this;
}
value_type operator*() const { return fn_(iter_); }
value_type operator[](std::size_t i) const {
reference operator*() const { return fn_(iter_); }
reference operator[](std::size_t i) const {
auto iter = *this + i;
return *iter;
}

View File

@@ -1,11 +1,15 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include <cstdint> // for int64_t
#include "../common/common.h"
#include "../common/device_helpers.cuh" // for DefaultStream, CUDAEvent
#include "array_interface.h"
#include "xgboost/logging.h"
namespace xgboost {
void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) {
void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
switch (stream) {
case 0:
/**
@@ -22,12 +26,15 @@ void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) {
break;
case 2:
// default per-thread stream
default:
default: {
dh::CUDAEvent e;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(stream)));
e.Record(dh::CUDAStreamView{reinterpret_cast<cudaStream_t>(stream)});
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipStreamSynchronize(reinterpret_cast<hipStream_t>(stream)));
e.Record(dh::CUDAStreamView{reinterpret_cast<hipStream_t>(stream)});
#endif
dh::DefaultStream().Wait(e);
}
}
}

View File

@@ -166,7 +166,7 @@ BatchSet<GHistIndexMatrix> SimpleDMatrix::GetGradientIndex(Context const* ctx,
}
if (!gradient_index_ || detail::RegenGHist(batch_param_, param)) {
// GIDX page doesn't exist, generate it
LOG(INFO) << "Generating new Gradient Index.";
LOG(DEBUG) << "Generating new Gradient Index.";
// These places can ask for a CSR gidx:
// - CPU Hist: the ctx must be on CPU.
// - IterativeDMatrix::InitFromCPU: The ctx must be on CPU.

View File

@@ -38,19 +38,21 @@ class ColumnSplitHelper {
missing_bits_ = BitVector(common::Span<BitVector::value_type>(missing_storage_));
}
template <typename ExpandEntry>
template <typename BinIdxType, bool any_missing, bool any_cat, typename ExpandEntry>
void Partition(common::BlockedSpace2d const& space, std::int32_t n_threads,
GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix,
std::vector<ExpandEntry> const& nodes, RegTree const* p_tree) {
std::vector<ExpandEntry> const& nodes,
std::vector<int32_t> const& split_conditions, RegTree const* p_tree) {
// When data is split by column, we don't have all the feature values on the local worker, so
// we first collect the decisions, and whether each feature value is missing, into bit vectors.
std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
const int32_t nid = nodes[node_in_set].nid;
partition_builder_->MaskRows(node_in_set, nodes, r, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_,
&missing_bits_);
bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
partition_builder_->MaskRows<BinIdxType, any_missing, any_cat>(
node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
(*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_);
});
// Then aggregate the bit vectors across all the workers.
@@ -217,7 +219,8 @@ class CommonRowPartitioner {
// 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node
// Store results in intermediate buffers from partition_builder_
if (is_col_split_) {
column_split_helper_.Partition(space, ctx->Threads(), gmat, column_matrix, nodes, p_tree);
column_split_helper_.Partition<BinIdxType, any_missing, any_cat>(
space, ctx->Threads(), gmat, column_matrix, nodes, split_conditions, p_tree);
} else {
common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) {
size_t begin = r.begin();

View File

@@ -412,6 +412,7 @@ class HistEvaluator {
tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
tree[candidate.nid].SplitIndex(), left_weight,
right_weight);
evaluator = tree_evaluator_.GetEvaluator();
snode_.resize(tree.GetNodes().size());
snode_.at(left_child).stats = candidate.split.left_sum;

View File

@@ -49,6 +49,8 @@ class TreeEvaluator {
monotone_.HostVector().resize(n_features, 0);
has_constraint_ = false;
} else {
CHECK_LE(p.monotone_constraints.size(), n_features)
<< "The size of the monotone constraint should be less than or equal to the number of features.";
monotone_.HostVector() = p.monotone_constraints;
monotone_.HostVector().resize(n_features, 0);
// Initialised to some small size, can grow if needed
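
The new check rejects a constraint vector longer than the feature count. On the Python side this corresponds to the `monotone_constraints` training parameter; a small, hedged usage example on synthetic data, with one entry per feature:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.random((256, 2))
y = X[:, 0] - X[:, 1] + rng.normal(scale=0.01, size=256)
dtrain = xgb.DMatrix(X, label=y)

# One constraint per feature: +1 = increasing, -1 = decreasing.
params = {"tree_method": "hist", "monotone_constraints": "(1,-1)"}
booster = xgb.train(params, dtrain, num_boost_round=10)
```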

View File

@@ -1,227 +1,225 @@
import argparse
import os
import pathlib
import subprocess
import sys
from collections import Counter
from multiprocessing import Pool, cpu_count
from typing import Dict, Tuple
from typing import Dict, List, Tuple
from pylint import epylint
from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
CURDIR = os.path.normpath(os.path.abspath(os.path.dirname(__file__)))
SRCPATH = os.path.normpath(
os.path.join(CURDIR, os.path.pardir, os.path.pardir, "python-package")
)
class LintersPaths:
"""The paths each linter run on."""
BLACK = (
# core
"python-package/",
# tests
"tests/python/test_config.py",
"tests/python/test_data_iterator.py",
"tests/python/test_dt.py",
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/test_distributed/test_with_spark/",
"tests/test_distributed/test_gpu_with_spark/",
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/categorical.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/sklearn_parallel.py",
"demo/guide-python/spark_estimator_examples.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
"tests/ci_build/test_utils.py",
"tests/ci_build/change_version.py",
)
ISORT = (
# core
"python-package/",
# tests
"tests/test_distributed/",
"tests/python/",
"tests/python-gpu/",
"tests/ci_build/",
# demo
"demo/",
# misc
"dev/",
"doc/",
)
MYPY = (
# core
"python-package/",
# tests
"tests/python/test_dt.py",
"tests/python/test_data_iterator.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/test_distributed/test_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/external_memory.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
"tests/ci_build/test_utils.py",
"tests/ci_build/change_version.py",
)
def check_cmd_print_failure_assistance(cmd: List[str]) -> bool:
if subprocess.run(cmd).returncode == 0:
return True
subprocess.run([cmd[0], "--version"])
msg = """
Please run the following command on your machine to address the formatting error:
"""
msg += " ".join(cmd)
print(msg, file=sys.stderr)
return False
@record_time
@cd(PY_PACKAGE)
def run_black(rel_path: str, fix: bool) -> bool:
if fix:
cmd = ["black", "-q", rel_path]
else:
cmd = ["black", "-q", "--check", rel_path]
ret = subprocess.run(cmd).returncode
if ret != 0:
subprocess.run(["black", "--version"])
msg = """
Please run the following command on your machine to address the formatting error:
cmd = ["black", "-q", os.path.join(ROOT, rel_path)]
if not fix:
cmd += ["--check"]
"""
msg += " ".join(cmd)
print(msg, file=sys.stderr)
return False
return True
return check_cmd_print_failure_assistance(cmd)
@record_time
@cd(PY_PACKAGE)
def run_isort(rel_path: str, fix: bool) -> bool:
if fix:
cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path]
else:
cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
ret = subprocess.run(cmd).returncode
if ret != 0:
subprocess.run(["isort", "--version"])
msg = """
Please run the following command on your machine to address the formatting error:
# Isort gets confused when trying to find the config file, so it is specified explicitly.
cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)]
if not fix:
cmd += ["--check"]
"""
msg += " ".join(cmd)
print(msg, file=sys.stderr)
return False
return True
return check_cmd_print_failure_assistance(cmd)
@record_time
@cd(PY_PACKAGE)
def run_mypy(rel_path: str) -> bool:
path = os.path.join(ROOT, rel_path)
ret = subprocess.run(["mypy", path])
if ret.returncode != 0:
return False
return True
cmd = ["mypy", os.path.join(ROOT, rel_path)]
return check_cmd_print_failure_assistance(cmd)
class PyLint:
"""A helper for running pylint, mostly copied from dmlc-core/scripts."""
def __init__(self) -> None:
self.pypackage_root = os.path.join(ROOT, "python-package/")
self.pylint_cats = set(["error", "warning", "convention", "refactor"])
self.pylint_opts = [
"--extension-pkg-whitelist=numpy",
"--rcfile=" + os.path.join(self.pypackage_root, ".pylintrc"),
]
MESSAGE_CATEGORIES = {
"Fatal",
"Error",
"Warning",
"Convention",
"Refactor",
"Information",
}
MESSAGE_PREFIX_TO_CATEGORY = {
category[0]: category for category in MESSAGE_CATEGORIES
}
def run(self, path: str) -> Tuple[Dict, str, str]:
(pylint_stdout, pylint_stderr) = epylint.py_run(
" ".join([str(path)] + self.pylint_opts), return_std=True
@classmethod
@cd(PY_PACKAGE)
def get_summary(cls, path: str) -> Tuple[str, Dict[str, int], str, str, bool]:
"""Get the summary of pylint's errors, warnings, etc."""
ret = subprocess.run(["pylint", path], capture_output=True)
stdout = ret.stdout.decode("utf-8")
emap: Dict[str, int] = Counter()
for line in stdout.splitlines():
if ":" in line and (
category := cls.MESSAGE_PREFIX_TO_CATEGORY.get(
line.split(":")[-2].strip()[0]
)
):
emap[category] += 1
return path, emap, stdout, ret.stderr.decode("utf-8"), ret.returncode == 0
@staticmethod
def print_summary_map(result_map: Dict[str, Dict[str, int]]) -> int:
"""Print summary of certain result map."""
if len(result_map) == 0:
return 0
ftype = "Python"
nfail = sum(map(bool, result_map.values()))
print(
f"====={len(result_map) - nfail}/{len(result_map)} {ftype} files passed check====="
)
emap = {}
err = pylint_stderr.read()
for fname, emap in result_map.items():
if emap:
print(
f"{fname}: {sum(emap.values())} Errors of {len(emap)} Categories map={emap}"
)
return nfail
out = []
for line in pylint_stdout:
out.append(line)
key = line.split(":")[-1].split("(")[0].strip()
if key not in self.pylint_cats:
continue
if key not in emap:
emap[key] = 1
else:
emap[key] += 1
return {path: emap}, err, "\n".join(out)
def __call__(self) -> bool:
@classmethod
def run(cls) -> bool:
"""Run pylint with parallelization on a batch of paths."""
all_errors: Dict[str, Dict[str, int]] = {}
def print_summary_map(result_map: Dict[str, Dict[str, int]]) -> int:
"""Print summary of certain result map."""
if len(result_map) == 0:
return 0
ftype = "Python"
npass = sum(1 for x in result_map.values() if len(x) == 0)
print(f"====={npass}/{len(result_map)} {ftype} files passed check=====")
for fname, emap in result_map.items():
if len(emap) == 0:
continue
print(
f"{fname}: {sum(emap.values())} Errors of {len(emap)} Categories map={str(emap)}"
)
return len(result_map) - npass
all_scripts = []
for root, dirs, files in os.walk(self.pypackage_root):
for f in files:
if f.endswith(".py"):
all_scripts.append(os.path.join(root, f))
with Pool(cpu_count()) as pool:
error_maps = pool.map(self.run, all_scripts)
for emap, err, out in error_maps:
error_maps = pool.map(
cls.get_summary,
(os.fspath(file) for file in pathlib.Path(PY_PACKAGE).glob("**/*.py")),
)
for path, emap, out, err, succeeded in error_maps:
all_errors[path] = emap
if succeeded:
continue
print(out)
if len(err) != 0:
print(err)
all_errors.update(emap)
nerr = print_summary_map(all_errors)
nerr = cls.print_summary_map(all_errors)
return nerr == 0
@record_time
def run_pylint() -> bool:
return PyLint()()
return PyLint.run()
@record_time
def main(args: argparse.Namespace) -> None:
if args.format == 1:
black_results = [
run_black(path, args.fix)
for path in [
# core
"python-package/",
# tests
"tests/python/test_config.py",
"tests/python/test_data_iterator.py",
"tests/python/test_dt.py",
"tests/python/test_predict.py",
"tests/python/test_quantile_dmatrix.py",
"tests/python/test_tree_regularization.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/ci_build/lint_python.py",
"tests/test_distributed/test_with_spark/",
"tests/test_distributed/test_gpu_with_spark/",
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/categorical.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/sklearn_parallel.py",
"demo/guide-python/spark_estimator_examples.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
"tests/ci_build/test_utils.py",
"tests/ci_build/change_version.py",
]
]
black_results = [run_black(path, args.fix) for path in LintersPaths.BLACK]
if not all(black_results):
sys.exit(-1)
isort_results = [
run_isort(path, args.fix)
for path in [
# core
"python-package/",
# tests
"tests/test_distributed/",
"tests/python/",
"tests/python-gpu/",
"tests/ci_build/",
# demo
"demo/",
# misc
"dev/",
"doc/",
]
]
isort_results = [run_isort(path, args.fix) for path in LintersPaths.ISORT]
if not all(isort_results):
sys.exit(-1)
if args.type_check == 1:
if not all(
run_mypy(path)
for path in [
# core
"python-package/",
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/external_memory.py",
"demo/guide-python/cat_in_the_dat.py",
"demo/guide-python/feature_weights.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",
# tests
"tests/python/test_dt.py",
"tests/python/test_data_iterator.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/test_distributed/test_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_spark/test_data.py",
"tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py",
# CI
"tests/ci_build/lint_python.py",
"tests/ci_build/test_r_package.py",
"tests/ci_build/test_utils.py",
"tests/ci_build/change_version.py",
]
):
subprocess.check_call(["mypy", "--version"])
mypy_results = [run_mypy(path) for path in LintersPaths.MYPY]
if not all(mypy_results):
sys.exit(-1)
if args.pylint == 1:
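
The rewritten `get_summary` counts message categories by peeling the message ID out of pylint's plain-text output instead of going through the removed `epylint` wrapper. A standalone sketch of that parsing, assuming pylint's default format `path:line:col: C0114: message (symbol)`:

```python
from collections import Counter
from typing import Dict

PREFIX_TO_CATEGORY = {
    "F": "Fatal", "E": "Error", "W": "Warning",
    "C": "Convention", "R": "Refactor", "I": "Information",
}


def summarize(stdout: str) -> Dict[str, int]:
    """Count pylint messages per category from its text output."""
    emap: Dict[str, int] = Counter()
    for line in stdout.splitlines():
        parts = line.split(":")
        if len(parts) < 4:
            continue
        msg_id = parts[-2].strip()
        if msg_id and (category := PREFIX_TO_CATEGORY.get(msg_id[0])):
            emap[category] += 1
    return emap


sample = "m.py:1:0: C0114: Missing module docstring (missing-module-docstring)"
print(summarize(sample))  # Counter({'Convention': 1})
```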

View File

@@ -26,6 +26,60 @@ class InMemoryCommunicatorTest : public ::testing::Test {
static void Allgather(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllgather(comm, rank);
}
static void AllreduceMax(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceMax(comm, rank);
}
static void AllreduceMin(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceMin(comm, rank);
}
static void AllreduceSum(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceSum(comm);
}
static void AllreduceBitwiseAND(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceBitwiseAND(comm, rank);
}
static void AllreduceBitwiseOR(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceBitwiseOR(comm, rank);
}
static void AllreduceBitwiseXOR(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyAllreduceBitwiseXOR(comm, rank);
}
static void Broadcast(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
VerifyBroadcast(comm, rank);
}
static void Mixture(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
for (auto i = 0; i < 5; i++) {
VerifyAllgather(comm, rank);
VerifyAllreduceMax(comm, rank);
VerifyAllreduceMin(comm, rank);
VerifyAllreduceSum(comm);
VerifyAllreduceBitwiseAND(comm, rank);
VerifyAllreduceBitwiseOR(comm, rank);
VerifyAllreduceBitwiseXOR(comm, rank);
VerifyBroadcast(comm, rank);
}
}
protected:
static void VerifyAllgather(InMemoryCommunicator &comm, int rank) {
char buffer[kWorldSize] = {'a', 'b', 'c'};
buffer[rank] = '0' + rank;
comm.AllGather(buffer, kWorldSize);
@@ -34,8 +88,7 @@ class InMemoryCommunicatorTest : public ::testing::Test {
}
}
static void AllreduceMax(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceMax(InMemoryCommunicator &comm, int rank) {
int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank};
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMax);
int expected[] = {3, 4, 5, 6, 7};
@@ -44,8 +97,7 @@ class InMemoryCommunicatorTest : public ::testing::Test {
}
}
static void AllreduceMin(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceMin(InMemoryCommunicator &comm, int rank) {
int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank};
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMin);
int expected[] = {1, 2, 3, 4, 5};
@@ -54,8 +106,7 @@ class InMemoryCommunicatorTest : public ::testing::Test {
}
}
static void AllreduceSum(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceSum(InMemoryCommunicator &comm) {
int buffer[] = {1, 2, 3, 4, 5};
comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
int expected[] = {3, 6, 9, 12, 15};
@@ -64,16 +115,14 @@ class InMemoryCommunicatorTest : public ::testing::Test {
}
}
static void AllreduceBitwiseAND(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceBitwiseAND(InMemoryCommunicator &comm, int rank) {
std::bitset<2> original(rank);
auto buffer = original.to_ulong();
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseAND);
EXPECT_EQ(buffer, 0UL);
}
static void AllreduceBitwiseOR(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceBitwiseOR(InMemoryCommunicator &comm, int rank) {
std::bitset<2> original(rank);
auto buffer = original.to_ulong();
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseOR);
@@ -82,8 +131,7 @@ class InMemoryCommunicatorTest : public ::testing::Test {
EXPECT_EQ(actual, expected);
}
static void AllreduceBitwiseXOR(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyAllreduceBitwiseXOR(InMemoryCommunicator &comm, int rank) {
std::bitset<3> original(rank * 2);
auto buffer = original.to_ulong();
comm.AllReduce(&buffer, 1, DataType::kUInt32, Operation::kBitwiseXOR);
@@ -92,8 +140,7 @@ class InMemoryCommunicatorTest : public ::testing::Test {
EXPECT_EQ(actual, expected);
}
static void Broadcast(int rank) {
InMemoryCommunicator comm{kWorldSize, rank};
static void VerifyBroadcast(InMemoryCommunicator &comm, int rank) {
if (rank == 0) {
std::string buffer{"hello"};
comm.Broadcast(&buffer[0], buffer.size(), 0);
@@ -105,7 +152,6 @@ class InMemoryCommunicatorTest : public ::testing::Test {
}
}
protected:
static int const kWorldSize{3};
};
@@ -173,5 +219,7 @@ TEST_F(InMemoryCommunicatorTest, AllreduceBitwiseXOR) { Verify(&AllreduceBitwise
TEST_F(InMemoryCommunicatorTest, Broadcast) { Verify(&Broadcast); }
TEST_F(InMemoryCommunicatorTest, Mixture) { Verify(&Mixture); }
} // namespace collective
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
@@ -22,31 +22,19 @@ TEST(ArrayInterface, Stream) {
HostDeviceVector<float> storage;
auto arr_str = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage);
#if defined(XGBOOST_USE_CUDA)
cudaStream_t stream;
cudaStreamCreate(&stream);
#elif defined(XGBOOST_USE_HIP)
hipStream_t stream;
hipStreamCreate(&stream);
#endif
dh::CUDAStream stream;
auto j_arr =Json::Load(StringView{arr_str});
j_arr["stream"] = Integer(reinterpret_cast<int64_t>(stream));
auto j_arr = Json::Load(StringView{arr_str});
j_arr["stream"] = Integer(reinterpret_cast<int64_t>(stream.Handle()));
Json::Dump(j_arr, &arr_str);
dh::caching_device_vector<uint64_t> out(1, 0);
uint64_t dur = 1e9;
dh::LaunchKernel{1, 1, 0, stream}(SleepForTest, out.data().get(), dur);
std::uint64_t dur = 1e9;
dh::LaunchKernel{1, 1, 0, stream.View()}(SleepForTest, out.data().get(), dur);
ArrayInterface<2> arr(arr_str);
auto t = out[0];
CHECK_GE(t, dur);
#if defined(XGBOOST_USE_CUDA)
cudaStreamDestroy(stream);
#elif defined(XGBOOST_USE_HIP)
hipStreamDestroy(stream);
#endif
}
TEST(ArrayInterface, Ptr) {

View File

@@ -497,23 +497,32 @@ inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); }
template <typename Function, typename... Args>
void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&&... args) {
auto run = [&](auto rank) {
Json config{JsonObject()};
config["xgboost_communicator"] = String("in-memory");
config["in_memory_world_size"] = world_size;
config["in_memory_rank"] = rank;
xgboost::collective::Init(config);
std::forward<Function>(function)(std::forward<Args>(args)...);
xgboost::collective::Finalize();
};
#if defined(_OPENMP)
#pragma omp parallel num_threads(world_size)
{
auto rank = omp_get_thread_num();
run(rank);
}
#else
std::vector<std::thread> threads;
for (auto rank = 0; rank < world_size; rank++) {
threads.emplace_back([&, rank]() {
Json config{JsonObject()};
config["xgboost_communicator"] = String("in-memory");
config["in_memory_world_size"] = world_size;
config["in_memory_rank"] = rank;
xgboost::collective::Init(config);
std::forward<Function>(function)(std::forward<Args>(args)...);
xgboost::collective::Finalize();
});
threads.emplace_back(run, rank);
}
for (auto& thread : threads) {
thread.join();
}
#endif
}
class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test {
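
The helper now prefers OpenMP threads and falls back to `std::thread` only when OpenMP is unavailable, but the shape is unchanged: one thread per rank, each initializing its own communicator. A minimal, purely illustrative Python analogue of that shape:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable


def run_per_rank(world_size: int, fn: Callable[[int], None]) -> None:
    """Run fn(rank) once per rank, each on its own thread."""
    with ThreadPoolExecutor(max_workers=world_size) as pool:
        # Materialize the results so worker exceptions propagate.
        list(pool.map(fn, range(world_size)))


run_per_rank(3, lambda rank: print(f"rank {rank} running"))
```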

View File

@@ -3,6 +3,7 @@
*/
#pragma once
#include <dmlc/omp.h>
#include <grpcpp/server_builder.h>
#include <gtest/gtest.h>
#include <xgboost/json.h>
@@ -61,24 +62,33 @@ class BaseFederatedTest : public ::testing::Test {
template <typename Function, typename... Args>
void RunWithFederatedCommunicator(int32_t world_size, std::string const& server_address,
Function&& function, Args&&... args) {
auto run = [&](auto rank) {
Json config{JsonObject()};
config["xgboost_communicator"] = String("federated");
config["federated_server_address"] = String(server_address);
config["federated_world_size"] = world_size;
config["federated_rank"] = rank;
xgboost::collective::Init(config);
std::forward<Function>(function)(std::forward<Args>(args)...);
xgboost::collective::Finalize();
};
#if defined(_OPENMP)
#pragma omp parallel num_threads(world_size)
{
auto rank = omp_get_thread_num();
run(rank);
}
#else
std::vector<std::thread> threads;
for (auto rank = 0; rank < world_size; rank++) {
threads.emplace_back([&, rank]() {
Json config{JsonObject()};
config["xgboost_communicator"] = String("federated");
config["federated_server_address"] = String(server_address);
config["federated_world_size"] = world_size;
config["federated_rank"] = rank;
xgboost::collective::Init(config);
std::forward<Function>(function)(std::forward<Args>(args)...);
xgboost::collective::Finalize();
});
threads.emplace_back(run, rank);
}
for (auto& thread : threads) {
thread.join();
}
#endif
}
} // namespace xgboost

View File

@@ -20,32 +20,6 @@ TEST(AllreduceBase, InitTask)
EXPECT_EQ(base.task_id, "1");
}
TEST(AllreduceBase, InitWithCacheOn)
{
rabit::engine::AllreduceBase base;
std::string rabit_task_id = "rabit_task_id=1";
char cmd[rabit_task_id.size()+1];
std::copy(rabit_task_id.begin(), rabit_task_id.end(), cmd);
cmd[rabit_task_id.size()] = '\0';
std::string rabit_bootstrap_cache = "rabit_bootstrap_cache=1";
char cmd2[rabit_bootstrap_cache.size()+1];
std::copy(rabit_bootstrap_cache.begin(), rabit_bootstrap_cache.end(), cmd2);
cmd2[rabit_bootstrap_cache.size()] = '\0';
std::string rabit_debug = "rabit_debug=1";
char cmd3[rabit_debug.size()+1];
std::copy(rabit_debug.begin(), rabit_debug.end(), cmd3);
cmd3[rabit_debug.size()] = '\0';
char* argv[] = {cmd, cmd2, cmd3};
base.Init(3, argv);
EXPECT_EQ(base.task_id, "1");
EXPECT_TRUE(base.rabit_bootstrap_cache);
EXPECT_EQ(base.rabit_debug, 1);
}
TEST(AllreduceBase, InitWithRingReduce)
{
rabit::engine::AllreduceBase base;

View File

@@ -6,6 +6,8 @@
#include <string>
#include "../../../src/tree/constraints.h"
#include "../../../src/tree/hist/evaluate_splits.h"
#include "../helpers.h"
namespace xgboost {
namespace tree {
@@ -56,5 +58,37 @@ TEST(CPUFeatureInteractionConstraint, Basic) {
ASSERT_FALSE(constraints.Query(1, 5));
}
TEST(CPUMonoConstraint, Basic) {
std::size_t kRows{64}, kCols{16};
Context ctx;
TrainParam param;
std::vector<std::int32_t> mono(kCols, 1);
I32Array arr;
for (std::size_t i = 0; i < kCols; ++i) {
arr.GetArray().push_back(mono[i]);
}
Json jarr{std::move(arr)};
std::string str_mono;
Json::Dump(jarr, &str_mono);
str_mono.front() = '(';
str_mono.back() = ')';
param.UpdateAllowUnknown(Args{{"monotone_constraints", str_mono}});
auto Xy = RandomDataGenerator{kRows, kCols, 0.0}.GenerateDMatrix(true);
auto sampler = std::make_shared<common::ColumnSampler>();
HistEvaluator<CPUExpandEntry> evaluator{&ctx, &param, Xy->Info(), sampler};
evaluator.InitRoot(GradStats{2.0, 2.0});
SplitEntry split;
split.Update(1.0f, 0, 3.0, false, false, GradStats{1.0, 1.0}, GradStats{1.0, 1.0});
CPUExpandEntry entry{0, 0, split};
RegTree tree{1, static_cast<bst_feature_t>(kCols)};
evaluator.ApplyTreeSplit(entry, &tree);
ASSERT_TRUE(evaluator.Evaluator().has_constraint);
}
} // namespace tree
} // namespace xgboost

View File

@@ -90,13 +90,16 @@ void TestColumnSplit(int32_t rows, bst_feature_t cols, RegTree const& expected_t
param.Init(Args{});
updater->Update(&param, p_gradients.get(), sliced.get(), position, {&tree});
EXPECT_EQ(tree.NumExtraNodes(), 10);
EXPECT_EQ(tree[0].SplitIndex(), 1);
ASSERT_EQ(tree.NumExtraNodes(), 10);
ASSERT_EQ(tree[0].SplitIndex(), 1);
EXPECT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
EXPECT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
ASSERT_NE(tree[tree[0].LeftChild()].SplitIndex(), 0);
ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
EXPECT_EQ(tree, expected_tree);
FeatureMap fmap;
auto json = tree.DumpModel(fmap, false, "json");
auto expected_json = expected_tree.DumpModel(fmap, false, "json");
ASSERT_EQ(json, expected_json);
}
} // anonymous namespace

View File

@@ -19,6 +19,8 @@
#include "xgboost/data.h"
namespace xgboost::tree {
namespace {
template <typename ExpandEntry>
void TestPartitioner(bst_target_t n_targets) {
std::size_t n_samples = 1024, base_rowid = 0;
@@ -86,8 +88,117 @@ void TestPartitioner(bst_target_t n_targets) {
}
}
}
} // anonymous namespace
TEST(QuantileHist, Partitioner) { TestPartitioner<CPUExpandEntry>(1); }
TEST(QuantileHist, MultiPartitioner) { TestPartitioner<MultiExpandEntry>(3); }
namespace {
template <typename ExpandEntry>
void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples,
bst_feature_t n_features, size_t base_rowid,
std::shared_ptr<DMatrix> Xy, float min_value, float mid_value,
CommonRowPartitioner const& expected_mid_partitioner) {
auto dmat =
std::unique_ptr<DMatrix>{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
Context ctx;
ctx.InitAllowUnknown(Args{});
std::vector<ExpandEntry> candidates{{0, 0}};
candidates.front().split.loss_chg = 0.4;
auto cuts = common::SketchOnDMatrix(&ctx, dmat.get(), 64);
for (auto const& page : Xy->GetBatches<SparsePage>()) {
GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
bst_feature_t const split_ind = 0;
common::ColumnMatrix column_indices;
column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
{
RegTree tree{n_targets, n_features};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
GetSplit(&tree, min_value, &candidates);
} else {
GetMultiSplitForTest(&tree, min_value, &candidates);
}
partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
ASSERT_EQ(partitioner.Size(), 3);
ASSERT_EQ(partitioner[1].Size(), 0);
ASSERT_EQ(partitioner[2].Size(), n_samples);
}
{
RegTree tree{n_targets, n_features};
CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, true};
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
GetSplit(&tree, mid_value, &candidates);
} else {
GetMultiSplitForTest(&tree, mid_value, &candidates);
}
auto left_nidx = tree.LeftChild(RegTree::kRoot);
partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
auto elem = partitioner[left_nidx];
ASSERT_LT(elem.Size(), n_samples);
ASSERT_GT(elem.Size(), 1);
auto expected_elem = expected_mid_partitioner[left_nidx];
ASSERT_EQ(elem.Size(), expected_elem.Size());
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
ASSERT_EQ(*it, *eit);
}
auto right_nidx = tree.RightChild(RegTree::kRoot);
elem = partitioner[right_nidx];
expected_elem = expected_mid_partitioner[right_nidx];
ASSERT_EQ(elem.Size(), expected_elem.Size());
for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) {
ASSERT_EQ(*it, *eit);
}
}
}
}
template <typename ExpandEntry>
void TestColumnSplitPartitioner(bst_target_t n_targets) {
std::size_t n_samples = 1024, base_rowid = 0;
bst_feature_t n_features = 16;
auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true);
std::vector<ExpandEntry> candidates{{0, 0}};
candidates.front().split.loss_chg = 0.4;
Context ctx;
ctx.InitAllowUnknown(Args{});
auto cuts = common::SketchOnDMatrix(&ctx, Xy.get(), 64);
float min_value, mid_value;
CommonRowPartitioner mid_partitioner{&ctx, n_samples, base_rowid, false};
for (auto const& page : Xy->GetBatches<SparsePage>()) {
GHistIndexMatrix gmat(page, {}, cuts, 64, true, 0.5, ctx.Threads());
bst_feature_t const split_ind = 0;
common::ColumnMatrix column_indices;
column_indices.InitFromSparse(page, gmat, 0.5, ctx.Threads());
min_value = gmat.cut.MinValues()[split_ind];
auto ptr = gmat.cut.Ptrs()[split_ind + 1];
mid_value = gmat.cut.Values().at(ptr / 2);
RegTree tree{n_targets, n_features};
if constexpr (std::is_same<ExpandEntry, CPUExpandEntry>::value) {
GetSplit(&tree, mid_value, &candidates);
} else {
GetMultiSplitForTest(&tree, mid_value, &candidates);
}
mid_partitioner.UpdatePosition<false, true>(&ctx, gmat, column_indices, candidates, &tree);
}
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, VerifyColumnSplitPartitioner<ExpandEntry>, n_targets,
n_samples, n_features, base_rowid, Xy, min_value, mid_value, mid_partitioner);
}
} // anonymous namespace
TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner<CPUExpandEntry>(1); }
TEST(QuantileHist, MultiPartitionerColSplit) { TestColumnSplitPartitioner<MultiExpandEntry>(3); }
} // namespace xgboost::tree