Support column-wise data split with in-memory inputs (#9628)
--------- Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import numpy as np
|
||||
@@ -9,6 +10,7 @@ from scipy.sparse import csr_matrix, rand
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
from xgboost.core import DataSplitMode
|
||||
from xgboost.testing.data import np_dtypes
|
||||
|
||||
# Module-level NumPy RandomState seeded with 1, so values drawn from it are
# reproducible from run to run.
rng = np.random.RandomState(1)
|
||||
@@ -467,3 +469,97 @@ class TestDMatrix:
|
||||
m0 = xgb.DMatrix(orig)
|
||||
m1 = xgb.DMatrix(x)
|
||||
assert tm.predictor_equal(m0, m1)
|
||||
|
||||
|
||||
class TestDMatrixColumnSplit:
    """DMatrix construction from in-memory inputs with ``DataSplitMode.COL``.

    Each test hands a checker function to ``tm.run_with_rabit`` with a world
    size of 3. The checker asserts that the DMatrix reports the input's row
    count and a column count equal to the input's column count multiplied by
    the collective world size.
    """

    def test_numpy(self):
        """Dense NumPy array without feature names or types."""

        def check_dense():
            n_rows, n_cols = 5, 5
            features = np.random.randn(n_rows, n_cols)
            matrix = xgb.DMatrix(features, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == n_rows
            assert matrix.num_col() == n_cols * xgb.collective.get_world_size()
            # No names or types were supplied, so none are expected back.
            assert matrix.feature_names is None
            assert matrix.feature_types is None

        tm.run_with_rabit(world_size=3, test_fn=check_dense)

    def test_numpy_feature_names(self):
        """Dense NumPy array with explicit feature names and types."""

        def check_named_features():
            n_workers = xgb.collective.get_world_size()
            n_rows, n_cols = 5, 5
            features = np.random.randn(n_rows, n_cols)
            names = [f"feature{idx}" for idx in range(n_cols)]
            types = ["float"] * n_cols
            matrix = xgb.DMatrix(
                features,
                feature_names=names,
                feature_types=types,
                data_split_mode=DataSplitMode.COL,
            )

            # The name and type lists are expected to be as long as the
            # reported column count.
            expected_cols = n_cols * n_workers
            assert matrix.num_row() == n_rows
            assert matrix.num_col() == expected_cols
            assert len(matrix.feature_names) == expected_cols
            assert len(matrix.feature_types) == expected_cols

        tm.run_with_rabit(world_size=3, test_fn=check_named_features)

    def test_csr(self):
        """SciPy CSR matrix built from (data, indices, indptr)."""

        def check_csr():
            row_ptr = np.array([0, 2, 3, 6])
            col_idx = np.array([0, 2, 2, 0, 1, 2])
            values = np.array([1, 2, 3, 4, 5, 6])
            sparse = scipy.sparse.csr_matrix(
                (values, col_idx, row_ptr), shape=(3, 3)
            )
            matrix = xgb.DMatrix(sparse, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == 3
            assert matrix.num_col() == 3 * xgb.collective.get_world_size()

        tm.run_with_rabit(world_size=3, test_fn=check_csr)

    def test_csc(self):
        """SciPy CSC matrix built from (data, (row, col)) coordinates."""

        def check_csc():
            row_idx = np.array([0, 2, 2, 0, 1, 2])
            col_idx = np.array([0, 0, 1, 2, 2, 2])
            values = np.array([1, 2, 3, 4, 5, 6])
            sparse = scipy.sparse.csc_matrix(
                (values, (row_idx, col_idx)), shape=(3, 3)
            )
            matrix = xgb.DMatrix(sparse, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == 3
            assert matrix.num_col() == 3 * xgb.collective.get_world_size()

        tm.run_with_rabit(world_size=3, test_fn=check_csc)

    def test_coo(self):
        """SciPy COO matrix built from (data, (row, col)) coordinates."""

        def check_coo():
            row_idx = np.array([0, 2, 2, 0, 1, 2])
            col_idx = np.array([0, 0, 1, 2, 2, 2])
            values = np.array([1, 2, 3, 4, 5, 6])
            sparse = scipy.sparse.coo_matrix(
                (values, (row_idx, col_idx)), shape=(3, 3)
            )
            matrix = xgb.DMatrix(sparse, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == 3
            assert matrix.num_col() == 3 * xgb.collective.get_world_size()

        tm.run_with_rabit(world_size=3, test_fn=check_coo)

    def test_list(self):
        """Nested Python lists forming a 5x5 table of the integers 1..25."""

        def check_nested_list():
            # Rows are [1..5], [6..10], ..., [21..25].
            rows = [list(range(first, first + 5)) for first in range(1, 26, 5)]
            matrix = xgb.DMatrix(rows, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == 5
            assert matrix.num_col() == 5 * xgb.collective.get_world_size()

        tm.run_with_rabit(world_size=3, test_fn=check_nested_list)

    def test_tuple(self):
        """Nested Python tuples forming a 5x5 table of the integers 1..25."""

        def check_nested_tuple():
            # Rows are (1..5), (6..10), ..., (21..25).
            rows = tuple(
                tuple(range(first, first + 5)) for first in range(1, 26, 5)
            )
            matrix = xgb.DMatrix(rows, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == 5
            assert matrix.num_col() == 5 * xgb.collective.get_world_size()

        tm.run_with_rabit(world_size=3, test_fn=check_nested_tuple)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -6,6 +7,7 @@ import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
from xgboost.core import DataSplitMode
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
@@ -97,3 +99,17 @@ class TestArrowTable:
|
||||
y_np_low = dtrain.get_float_info("label_lower_bound")
|
||||
np.testing.assert_equal(y_np_up, y_upper_bound.to_pandas().values)
|
||||
np.testing.assert_equal(y_np_low, y_lower_bound.to_pandas().values)
|
||||
|
||||
|
||||
class TestArrowTableColumnSplit:
    """DMatrix construction from a pyarrow Table with ``DataSplitMode.COL``."""

    def test_arrow_table(self):
        """A 2-row, 4-column Arrow table keeps its rows; columns scale with world size."""

        def check_arrow_table():
            column_names = ["a", "b", "c", "d"]
            records = [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]]
            frame = pd.DataFrame(records, columns=column_names)
            arrow_table = pa.Table.from_pandas(frame)
            matrix = xgb.DMatrix(arrow_table, data_split_mode=DataSplitMode.COL)

            assert matrix.num_row() == len(records)
            assert (
                matrix.num_col()
                == len(column_names) * xgb.collective.get_world_size()
            )

        tm.run_with_rabit(world_size=3, test_fn=check_arrow_table)
|
||||
|
||||
Reference in New Issue
Block a user