Support column-wise data split with in-memory inputs (#9628)

--------- Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
2023-10-16 21:16:39 -07:00
parent 4d1607eefd
commit da6803b75b
12 changed files with 307 additions and 27 deletions
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -108,6 +108,7 @@ TEST(CAPI, XGDMatrixCreateFromCSR) {
  Json::Dump(data_arr, &sdata);
  Json config{Object{}};
  config["missing"] = Number{std::numeric_limits<float>::quiet_NaN()};
+  config["data_split_mode"] = Integer{static_cast<int64_t>(DataSplitMode::kCol)};
  Json::Dump(config, &sconfig);

  DMatrixHandle handle;
@@ -120,6 +121,8 @@ TEST(CAPI, XGDMatrixCreateFromCSR) {
  ASSERT_EQ(n, 3);
  ASSERT_EQ(XGDMatrixNumNonMissing(handle, &n), 0);
  ASSERT_EQ(n, 3);
+  ASSERT_EQ(XGDMatrixDataSplitMode(handle, &n), 0);
+  ASSERT_EQ(n, static_cast<int64_t>(DataSplitMode::kCol));

  std::shared_ptr<xgboost::DMatrix> *pp_fmat =
      static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
--- a/tests/cpp/data/test_metainfo.cc
+++ b/tests/cpp/data/test_metainfo.cc
@@ -74,6 +74,49 @@ TEST(MetaInfo, GetSetFeature) {
  // Other conditions are tested in `SaveLoadBinary`.
 }

+namespace {
+// Checks MetaInfo::SetFeatureInfo for the "feature_type" and "feature_name" keys when
+// data_split_mode is DataSplitMode::kCol.
+//
+// For each key the call is expected to throw dmlc::Error while num_col_ holds only the 2
+// locally supplied columns, and to succeed once num_col_ is 2 * world_size. The expected
+// vectors are literals with 6 entries, i.e. they are written for a world size of 3.
+void VerifyGetSetFeatureColumnSplit() {
+  constexpr int kCols = 2;
+
+  xgboost::MetaInfo info;
+  info.data_split_mode = DataSplitMode::kCol;
+  auto const world_size = collective::GetWorldSize();
+
+  // Collects the c_str() pointer of every element. The pointers are only valid while the
+  // source vector is alive and unmodified.
+  auto const as_c_strs = [](std::vector<std::string> const &strs) {
+    std::vector<char const *> ptrs;
+    ptrs.reserve(strs.size());
+    for (auto const &s : strs) {
+      ptrs.push_back(s.c_str());
+    }
+    return ptrs;
+  };
+
+  // Sets `key` twice: first with num_col_ == kCols (expected to throw), then with
+  // num_col_ == kCols * world_size (expected to succeed).
+  auto const set_with_local_then_global_width = [&](char const *key,
+                                                    std::vector<char const *> &values) {
+    info.num_col_ = kCols;
+    EXPECT_THROW(info.SetFeatureInfo(key, values.data(), values.size()), dmlc::Error);
+    info.num_col_ = kCols * world_size;
+    EXPECT_NO_THROW(info.SetFeatureInfo(key, values.data(), values.size()));
+  };
+
+  // Feature types: the two supplied entries are expected to appear three times over.
+  std::vector<std::string> const types{u8"float", u8"c"};
+  auto c_types = as_c_strs(types);
+  set_with_local_then_global_width(u8"feature_type", c_types);
+
+  std::vector<std::string> const want_type_names{u8"float", u8"c",     u8"float",
+                                                 u8"c",     u8"float", u8"c"};
+  EXPECT_EQ(info.feature_type_names, want_type_names);
+
+  auto constexpr kNum = xgboost::FeatureType::kNumerical;
+  auto constexpr kCat = xgboost::FeatureType::kCategorical;
+  std::vector<xgboost::FeatureType> const want_types{kNum, kCat, kNum, kCat, kNum, kCat};
+  EXPECT_EQ(info.feature_types.HostVector(), want_types);
+
+  // Feature names: expected to come back with a "0.", "1." or "2." prefix per copy
+  // (presumably the index of the contributing worker -- confirm against SetFeatureInfo).
+  std::vector<std::string> const names{u8"feature0", u8"feature1"};
+  auto c_names = as_c_strs(names);
+  set_with_local_then_global_width(u8"feature_name", c_names);
+
+  std::vector<std::string> const want_names{u8"0.feature0", u8"0.feature1", u8"1.feature0",
+                                            u8"1.feature1", u8"2.feature0", u8"2.feature1"};
+  EXPECT_EQ(info.feature_names, want_names);
+}
+}  // anonymous namespace
+
+// Hands VerifyGetSetFeatureColumnSplit to RunWithInMemoryCommunicator with a world size of 3.
+TEST(MetaInfo, GetSetFeatureColumnSplit) {
+  constexpr int kWorldSize = 3;
+  RunWithInMemoryCommunicator(kWorldSize, VerifyGetSetFeatureColumnSplit);
+}
+
 TEST(MetaInfo, SaveLoadBinary) {
  xgboost::MetaInfo info;
  xgboost::Context ctx;
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import tempfile

 import numpy as np
@@ -9,6 +10,7 @@ from scipy.sparse import csr_matrix, rand

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode
 from xgboost.testing.data import np_dtypes

 rng = np.random.RandomState(1)
@@ -467,3 +469,97 @@ class TestDMatrix:
            m0 = xgb.DMatrix(orig)
            m1 = xgb.DMatrix(x)
            assert tm.predictor_equal(m0, m1)
+
+
+class TestDMatrixColumnSplit:
+    def test_numpy(self):
+        def verify_numpy():
+            data = np.random.randn(5, 5)
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+            assert dm.feature_names is None
+            assert dm.feature_types is None
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_numpy)
+
+    def test_numpy_feature_names(self):
+        def verify_numpy_feature_names():
+            world_size = xgb.collective.get_world_size()
+            data = np.random.randn(5, 5)
+            feature_names = [f'feature{x}' for x in range(5)]
+            feature_types = ['float'] * 5
+            dm = xgb.DMatrix(data, feature_names=feature_names, feature_types=feature_types,
+                             data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * world_size
+            assert len(dm.feature_names) == 5 * world_size
+            assert len(dm.feature_types) == 5 * world_size
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_numpy_feature_names)
+
+    def test_csr(self):
+        def verify_csr():
+            indptr = np.array([0, 2, 3, 6])
+            indices = np.array([0, 2, 2, 0, 1, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_csr)
+
+    def test_csc(self):
+        def verify_csc():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_csc)
+
+    def test_coo(self):
+        def verify_coo():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_coo)
+
+    def test_list(self):
+        def verify_list():
+            data = [
+                [1, 2, 3, 4, 5],
+                [6, 7, 8, 9, 10],
+                [11, 12, 13, 14, 15],
+                [16, 17, 18, 19, 20],
+                [21, 22, 23, 24, 25]
+            ]
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_list)
+
+    def test_tuple(self):
+        def verify_tuple():
+            data = (
+                (1, 2, 3, 4, 5),
+                (6, 7, 8, 9, 10),
+                (11, 12, 13, 14, 15),
+                (16, 17, 18, 19, 20),
+                (21, 22, 23, 24, 25)
+            )
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_tuple)
--- a/tests/python/test_with_arrow.py
+++ b/tests/python/test_with_arrow.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import unittest

 import numpy as np
@@ -6,6 +7,7 @@ import pytest

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode

 try:
    import pandas as pd
@@ -97,3 +99,17 @@ class TestArrowTable:
        y_np_low = dtrain.get_float_info("label_lower_bound")
        np.testing.assert_equal(y_np_up, y_upper_bound.to_pandas().values)
        np.testing.assert_equal(y_np_low, y_lower_bound.to_pandas().values)
+
+
+class TestArrowTableColumnSplit:
+    def test_arrow_table(self):
+        def verify_arrow_table():
+            df = pd.DataFrame(
+                [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]], columns=["a", "b", "c", "d"]
+            )
+            table = pa.Table.from_pandas(df)
+            dm = xgb.DMatrix(table, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 2
+            assert dm.num_col() == 4 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_arrow_table)