diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 391f2bf9f..da795c9bf 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -8,6 +8,7 @@ import importlib.util import multiprocessing import os import platform +import queue import socket import sys import threading @@ -942,13 +943,20 @@ def project_root(path: str) -> str: return normpath(os.path.join(demo_dir(path), os.path.pardir)) -def run_with_rabit(world_size: int, test_fn: Callable) -> None: - tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size) - tracker.start(world_size) +def run_with_rabit( + world_size: int, test_fn: Callable[..., Any], *args: Any, **kwargs: Any +) -> None: + exception_queue: queue.Queue = queue.Queue() def run_worker(rabit_env: Dict[str, Union[str, int]]) -> None: - with xgb.collective.CommunicatorContext(**rabit_env): - test_fn() + try: + with xgb.collective.CommunicatorContext(**rabit_env): + test_fn(*args, **kwargs) + except Exception as e: # pylint: disable=broad-except + exception_queue.put(e) + + tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size) + tracker.start(world_size) workers = [] for _ in range(world_size): @@ -957,5 +965,20 @@ def run_with_rabit(world_size: int, test_fn: Callable) -> None: worker.start() for worker in workers: worker.join() + assert exception_queue.empty(), f"Worker failed: {exception_queue.get()}" tracker.join() + + +def column_split_feature_names( + feature_names: List[Union[str, int]], world_size: int +) -> List[str]: + """Get the global list of feature names from the local feature names.""" + return [ + f"{rank}.{feature}" for rank in range(world_size) for feature in feature_names + ] + + +def is_windows() -> bool: + """Check if the current platform is Windows.""" + return platform.system() == "Windows" diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index e6cfb462b..4cd4de8c1 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -19,11 +19,13 @@ class LintersPaths: # tests "tests/python/test_config.py", "tests/python/test_data_iterator.py", + "tests/python/test_dmatrix.py", "tests/python/test_dt.py", "tests/python/test_predict.py", "tests/python/test_quantile_dmatrix.py", "tests/python/test_tree_regularization.py", "tests/python/test_shap.py", + "tests/python/test_with_pandas.py", "tests/python-gpu/test_gpu_data_iterator.py", "tests/python-gpu/test_gpu_prediction.py", "tests/python-gpu/load_pickle.py", diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 51bee5669..05a9af3b0 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -1,3 +1,4 @@ +import csv import os import sys import tempfile @@ -15,7 +16,7 @@ from xgboost.testing.data import np_dtypes rng = np.random.RandomState(1) -dpath = 'demo/data/' +dpath = "demo/data/" rng = np.random.RandomState(1994) @@ -67,12 +68,13 @@ def set_base_margin_info(DType, DMatrixT, tm: str): class TestDMatrix: def test_warn_missing(self): from xgboost import data + with pytest.warns(UserWarning): - data._warn_unused_missing('uri', 4) + data._warn_unused_missing("uri", 4) with pytest.warns(None) as record: - data._warn_unused_missing('uri', None) - data._warn_unused_missing('uri', np.nan) + data._warn_unused_missing("uri", None) + data._warn_unused_missing("uri", np.nan) assert len(record) == 0 @@ -106,7 +108,7 @@ class TestDMatrix: with pytest.raises(ValueError): xgb.DMatrix(data) # object dtype - data = np.array([['a', 'b'], ['c', 'd']]) + data = np.array([["a", "b"], ["c", "d"]]) with pytest.raises(ValueError): xgb.DMatrix(data) @@ -148,18 +150,18 @@ class TestDMatrix: y = np.array([12, 34, 56], np.float32)[::2] from_view = xgb.DMatrix(np.array([[]]), label=y).get_label() from_array = xgb.DMatrix(np.array([[]]), label=y + 0).get_label() - assert (from_view.shape == from_array.shape) + assert from_view.shape == from_array.shape assert (from_view == from_array).all() # Sliced UInt array z = np.array([12, 34, 56], np.uint32)[::2] dmat = xgb.DMatrix(np.array([[]])) - dmat.set_uint_info('group', z) - from_view = dmat.get_uint_info('group_ptr') + dmat.set_uint_info("group", z) + from_view = dmat.get_uint_info("group_ptr") dmat = xgb.DMatrix(np.array([[]])) - dmat.set_uint_info('group', z + 0) - from_array = dmat.get_uint_info('group_ptr') - assert (from_view.shape == from_array.shape) + dmat.set_uint_info("group", z + 0) + from_array = dmat.get_uint_info("group_ptr") + assert from_view.shape == from_array.shape assert (from_view == from_array).all() def test_slice(self): @@ -181,9 +183,11 @@ class TestDMatrix: # Slicing works with label and other meta info fields np.testing.assert_equal(sliced.get_label(), y[1:7]) - np.testing.assert_equal(sliced.get_float_info('feature_weights'), fw) + np.testing.assert_equal(sliced.get_float_info("feature_weights"), fw) np.testing.assert_equal(sliced.get_base_margin(), base_margin[1:7, :].flatten()) - np.testing.assert_equal(sliced.get_base_margin(), sliced.get_float_info('base_margin')) + np.testing.assert_equal( + sliced.get_base_margin(), sliced.get_float_info("base_margin") + ) # Slicing a DMatrix results into a DMatrix that's equivalent to a DMatrix that's # constructed from the corresponding NumPy slice @@ -191,11 +195,15 @@ class TestDMatrix: d2.set_base_margin(base_margin[1:7, :]) eval_res = {} _ = xgb.train( - {'num_class': 3, 'objective': 'multi:softprob', - 'eval_metric': 'mlogloss'}, + {"num_class": 3, "objective": "multi:softprob", "eval_metric": "mlogloss"}, d, - num_boost_round=2, evals=[(d2, 'd2'), (sliced, 'sliced')], evals_result=eval_res) - np.testing.assert_equal(eval_res['d2']['mlogloss'], eval_res['sliced']['mlogloss']) + num_boost_round=2, + evals=[(d2, "d2"), (sliced, "sliced")], + evals_result=eval_res, + ) + np.testing.assert_equal( + eval_res["d2"]["mlogloss"], eval_res["sliced"]["mlogloss"] + ) ridxs_arr = np.array(ridxs)[1:] # handles numpy slice correctly sliced = d.slice(ridxs_arr) @@ -206,17 +214,17 @@ class TestDMatrix: # different length with pytest.raises(ValueError): - xgb.DMatrix(data, feature_names=list('abcdef')) + xgb.DMatrix(data, feature_names=list("abcdef")) # contains duplicates with pytest.raises(ValueError): - xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'd']) + xgb.DMatrix(data, feature_names=["a", "b", "c", "d", "d"]) # contains symbol with pytest.raises(ValueError): - xgb.DMatrix(data, feature_names=['a', 'b', 'c', 'd', 'e<1']) + xgb.DMatrix(data, feature_names=["a", "b", "c", "d", "e<1"]) dm = xgb.DMatrix(data) - dm.feature_names = list('abcde') - assert dm.feature_names == list('abcde') + dm.feature_names = list("abcde") + assert dm.feature_names == list("abcde") assert dm.slice([0, 1]).num_col() == dm.num_col() assert dm.slice([0, 1]).feature_names == dm.feature_names @@ -224,11 +232,11 @@ class TestDMatrix: with pytest.raises(ValueError, match=r"Duplicates found: \['bar'\]"): dm.feature_names = ["bar"] * (data.shape[1] - 2) + ["a", "b"] - dm.feature_types = list('qiqiq') - assert dm.feature_types == list('qiqiq') + dm.feature_types = list("qiqiq") + assert dm.feature_types == list("qiqiq") with pytest.raises(ValueError): - dm.feature_types = list('abcde') + dm.feature_types = list("abcde") # reset dm.feature_names = None @@ -240,20 +248,23 @@ class TestDMatrix: data = np.random.randn(100, 5) target = np.array([0, 1] * 50) - cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'], - [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']] + cases = [ + ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"], + ["要因1", "要因2", "要因3", "要因4", "要因5"], + ] for features in cases: - dm = xgb.DMatrix(data, label=target, - feature_names=features) + dm = xgb.DMatrix(data, label=target, feature_names=features) assert dm.feature_names == features assert dm.num_row() == 100 assert dm.num_col() == 5 - params = {'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} + params = { + "objective": "multi:softprob", + "eval_metric": "mlogloss", + "eta": 0.3, + "num_class": 3, + } bst = xgb.train(params, dm, num_boost_round=10) scores = bst.get_fscore() @@ -264,22 +275,19 @@ class TestDMatrix: bst.predict(dm) # different feature name must raises error - dm = xgb.DMatrix(dummy, feature_names=list('abcde')) + dm = xgb.DMatrix(dummy, feature_names=list("abcde")) with pytest.raises(ValueError): bst.predict(dm) @pytest.mark.skipif(**tm.no_pandas()) def test_save_binary(self): import pandas as pd + with tempfile.TemporaryDirectory() as tmpdir: - path = os.path.join(tmpdir, 'm.dmatrix') - data = pd.DataFrame({ - "a": [0, 1], - "b": [2, 3], - "c": [4, 5] - }) + path = os.path.join(tmpdir, "m.dmatrix") + data = pd.DataFrame({"a": [0, 1], "b": [2, 3], "c": [4, 5]}) m0 = xgb.DMatrix(data.loc[:, ["a", "b"]], data["c"]) - assert m0.feature_names == ['a', 'b'] + assert m0.feature_names == ["a", "b"] m0.save_binary(path) m1 = xgb.DMatrix(path) assert m0.feature_names == m1.feature_names @@ -287,10 +295,10 @@ class TestDMatrix: def test_get_info(self): dtrain, _ = tm.load_agaricus(__file__) - dtrain.get_float_info('label') - dtrain.get_float_info('weight') - dtrain.get_float_info('base_margin') - dtrain.get_uint_info('group_ptr') + dtrain.get_float_info("label") + dtrain.get_float_info("weight") + dtrain.get_float_info("base_margin") + dtrain.get_uint_info("group_ptr") group_len = np.array([2, 3, 4]) dtrain.set_group(group_len) @@ -305,7 +313,7 @@ class TestDMatrix: Xy = xgb.DMatrix(X, y) Xy.set_info(qid=qid) - group_ptr = Xy.get_uint_info('group_ptr') + group_ptr = Xy.get_uint_info("group_ptr") assert group_ptr[0] == 0 assert group_ptr[-1] == rows @@ -317,11 +325,11 @@ class TestDMatrix: X = rng.randn(kRows, kCols) m = xgb.DMatrix(X) m.set_info(feature_weights=fw) - np.testing.assert_allclose(fw, m.get_float_info('feature_weights')) + np.testing.assert_allclose(fw, m.get_float_info("feature_weights")) # Handle empty - m.set_info(feature_weights=np.empty((0, ))) + m.set_info(feature_weights=np.empty((0,))) - assert m.get_float_info('feature_weights').shape[0] == 0 + assert m.get_float_info("feature_weights").shape[0] == 0 fw -= 1 @@ -331,13 +339,13 @@ class TestDMatrix: def test_sparse_dmatrix_csr(self): nrow = 100 ncol = 1000 - x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) + x = rand(nrow, ncol, density=0.0005, format="csr", random_state=rng) assert x.indices.max() < ncol x.data[:] = 1 dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) - watchlist = [(dtrain, 'train')] - param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0} + watchlist = [(dtrain, "train")] + param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0} bst = xgb.train(param, dtrain, 5, watchlist) bst.predict(dtrain) @@ -369,13 +377,13 @@ class TestDMatrix: def test_sparse_dmatrix_csc(self): nrow = 1000 ncol = 100 - x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng) + x = rand(nrow, ncol, density=0.0005, format="csc", random_state=rng) assert x.indices.max() < nrow - 1 x.data[:] = 1 dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) - watchlist = [(dtrain, 'train')] - param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0} + watchlist = [(dtrain, "train")] + param = {"max_depth": 3, "objective": "binary:logistic", "verbosity": 0} bst = xgb.train(param, dtrain, 5, watchlist) bst.predict(dtrain) @@ -389,6 +397,7 @@ class TestDMatrix: xgb.DMatrix(d) from scipy import sparse + rng = np.random.RandomState(1994) X = rng.rand(10, 10) y = rng.rand(10) @@ -402,7 +411,7 @@ class TestDMatrix: n_features = 10 X, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False) X = X.values.astype(np.float32) - feature_types = ['c'] * n_features + feature_types = ["c"] * n_features assert isinstance(X, np.ndarray) Xy = xgb.DMatrix(X, y, feature_types=feature_types) @@ -410,10 +419,11 @@ class TestDMatrix: def test_scipy_categorical(self): from scipy import sparse + n_features = 10 X, y = tm.make_categorical(10, n_features, n_categories=4, onehot=False) X = X.values.astype(np.float32) - feature_types = ['c'] * n_features + feature_types = ["c"] * n_features X[1, 3] = np.NAN X[2, 4] = np.NAN @@ -433,7 +443,7 @@ class TestDMatrix: np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types)) def test_uri_categorical(self): - path = os.path.join(dpath, 'agaricus.txt.train') + path = os.path.join(dpath, "agaricus.txt.train") feature_types = ["q"] * 5 + ["c"] + ["q"] * 120 Xy = xgb.DMatrix( path + "?indexing_mode=1&format=libsvm", feature_types=feature_types @@ -471,6 +481,7 @@ class TestDMatrix: assert tm.predictor_equal(m0, m1) +@pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") class TestDMatrixColumnSplit: def test_numpy(self): def verify_numpy(): @@ -487,14 +498,22 @@ class TestDMatrixColumnSplit: def verify_numpy_feature_names(): world_size = xgb.collective.get_world_size() data = np.random.randn(5, 5) - feature_names = [f'feature{x}' for x in range(5)] - feature_types = ['float'] * 5 - dm = xgb.DMatrix(data, feature_names=feature_names, feature_types=feature_types, - data_split_mode=DataSplitMode.COL) + feature_names = [f"feature{x}" for x in range(5)] + feature_types = ["float"] * 5 + dm = xgb.DMatrix( + data, + feature_names=feature_names, + feature_types=feature_types, + data_split_mode=DataSplitMode.COL, + ) assert dm.num_row() == 5 assert dm.num_col() == 5 * world_size assert len(dm.feature_names) == 5 * world_size + assert dm.feature_names == tm.column_split_feature_names( + feature_names, world_size + ) assert len(dm.feature_types) == 5 * world_size + assert dm.feature_types == ["float"] * 5 * world_size tm.run_with_rabit(world_size=3, test_fn=verify_numpy_feature_names) @@ -534,6 +553,23 @@ class TestDMatrixColumnSplit: tm.run_with_rabit(world_size=3, test_fn=verify_coo) + def test_uri(self): + def verify_uri(): + rank = xgb.collective.get_rank() + data = np.random.rand(5, 5) + filename = f"test_data_{rank}.csv" + with open(filename, mode="w", newline="") as file: + writer = csv.writer(file) + for row in data: + writer.writerow(row) + dtrain = xgb.DMatrix( + f"{filename}?format=csv", data_split_mode=DataSplitMode.COL + ) + assert dtrain.num_row() == 5 + assert dtrain.num_col() == 5 * xgb.collective.get_world_size() + + tm.run_with_rabit(world_size=3, test_fn=verify_uri) + def test_list(self): def verify_list(): data = [ @@ -541,7 +577,7 @@ class TestDMatrixColumnSplit: [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20], - [21, 22, 23, 24, 25] + [21, 22, 23, 24, 25], ] dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL) assert dm.num_row() == 5 @@ -556,7 +592,7 @@ class TestDMatrixColumnSplit: (6, 7, 8, 9, 10), (11, 12, 13, 14, 15), (16, 17, 18, 19, 20), - (21, 22, 23, 24, 25) + (21, 22, 23, 24, 25), ) dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL) assert dm.num_row() == 5 diff --git a/tests/python/test_with_arrow.py b/tests/python/test_with_arrow.py index fdc4c7dbe..4d12f32df 100644 --- a/tests/python/test_with_arrow.py +++ b/tests/python/test_with_arrow.py @@ -1,6 +1,5 @@ import os import sys -import unittest import numpy as np import pytest @@ -101,6 +100,7 @@ class TestArrowTable: np.testing.assert_equal(y_np_low, y_lower_bound.to_pandas().values) +@pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") class TestArrowTableColumnSplit: def test_arrow_table(self): def verify_arrow_table(): diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index a23a66b63..6a9ed4a84 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -1,3 +1,4 @@ +import sys from typing import Type import numpy as np @@ -6,6 +7,7 @@ from test_dmatrix import set_base_margin_info import xgboost as xgb from xgboost import testing as tm +from xgboost.core import DataSplitMode from xgboost.testing.data import pd_arrow_dtypes, pd_dtypes try: @@ -17,114 +19,194 @@ except ImportError: pytestmark = pytest.mark.skipif(**tm.no_pandas()) -dpath = 'demo/data/' +dpath = "demo/data/" rng = np.random.RandomState(1994) class TestPandas: - def test_pandas(self): - df = pd.DataFrame([[1, 2., True], [2, 3., False]], - columns=['a', 'b', 'c']) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['a', 'b', 'c'] - assert dm.feature_types == ['int', 'float', 'i'] + def test_pandas(self, data_split_mode=DataSplitMode.ROW): + world_size = xgb.collective.get_world_size() + df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2]), data_split_mode=data_split_mode) assert dm.num_row() == 2 - assert dm.num_col() == 3 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["a", "b", "c"] + assert dm.feature_types == ["int", "float", "i"] + assert dm.num_col() == 3 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["a", "b", "c"], world_size + ) + assert dm.feature_types == ["int", "float", "i"] * world_size + assert dm.num_col() == 3 * world_size np.testing.assert_array_equal(dm.get_label(), np.array([1, 2])) # overwrite feature_names and feature_types - dm = xgb.DMatrix(df, label=pd.Series([1, 2]), - feature_names=['x', 'y', 'z'], - feature_types=['q', 'q', 'q']) - assert dm.feature_names == ['x', 'y', 'z'] - assert dm.feature_types == ['q', 'q', 'q'] + dm = xgb.DMatrix( + df, + label=pd.Series([1, 2]), + feature_names=["x", "y", "z"], + feature_types=["q", "q", "q"], + data_split_mode=data_split_mode, + ) assert dm.num_row() == 2 - assert dm.num_col() == 3 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["x", "y", "z"] + assert dm.feature_types == ["q", "q", "q"] + assert dm.num_col() == 3 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["x", "y", "z"], world_size + ) + assert dm.feature_types == ["q", "q", "q"] * world_size + assert dm.num_col() == 3 * world_size # incorrect dtypes - df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], - columns=['a', 'b', 'c']) + df = pd.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"]) with pytest.raises(ValueError): - xgb.DMatrix(df) + xgb.DMatrix(df, data_split_mode=data_split_mode) # numeric columns - df = pd.DataFrame([[1, 2., True], [2, 3., False]]) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['0', '1', '2'] - assert dm.feature_types == ['int', 'float', 'i'] + df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2]), data_split_mode=data_split_mode) assert dm.num_row() == 2 - assert dm.num_col() == 3 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["0", "1", "2"] + assert dm.feature_types == ["int", "float", "i"] + assert dm.num_col() == 3 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["0", "1", "2"], world_size + ) + assert dm.feature_types == ["int", "float", "i"] * world_size + assert dm.num_col() == 3 * world_size np.testing.assert_array_equal(dm.get_label(), np.array([1, 2])) - df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) - dm = xgb.DMatrix(df, label=pd.Series([1, 2])) - assert dm.feature_names == ['4', '5', '6'] - assert dm.feature_types == ['int', 'float', 'int'] + df = pd.DataFrame([[1, 2.0, 1], [2, 3.0, 1]], columns=[4, 5, 6]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2]), data_split_mode=data_split_mode) assert dm.num_row() == 2 - assert dm.num_col() == 3 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["4", "5", "6"] + assert dm.feature_types == ["int", "float", "int"] + assert dm.num_col() == 3 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["4", "5", "6"], world_size + ) + assert dm.feature_types == ["int", "float", "int"] * world_size + assert dm.num_col() == 3 * world_size - df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]}) dummies = pd.get_dummies(df) # B A_X A_Y A_Z # 0 1 1 0 0 # 1 2 0 1 0 # 2 3 0 0 1 - result, _, _ = xgb.data._transform_pandas_df(dummies, - enable_categorical=False) - exp = np.array([[1., 1., 0., 0.], - [2., 0., 1., 0.], - [3., 0., 0., 1.]]) + result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False) + exp = np.array( + [[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]] + ) np.testing.assert_array_equal(result, exp) - dm = xgb.DMatrix(dummies) - assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z'] - if int(pd.__version__[0]) >= 2: - assert dm.feature_types == ['int', 'i', 'i', 'i'] + dm = xgb.DMatrix(dummies, data_split_mode=data_split_mode) + assert dm.num_row() == 3 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"] + if int(pd.__version__[0]) >= 2: + assert dm.feature_types == ["int", "i", "i", "i"] + else: + assert dm.feature_types == ["int", "int", "int", "int"] + assert dm.num_col() == 4 else: - assert dm.feature_types == ['int', 'int', 'int', 'int'] - assert dm.num_row() == 3 - assert dm.num_col() == 4 + assert dm.feature_names == tm.column_split_feature_names( + ["B", "A_X", "A_Y", "A_Z"], world_size + ) + if int(pd.__version__[0]) >= 2: + assert dm.feature_types == ["int", "i", "i", "i"] * world_size + else: + assert dm.feature_types == ["int", "int", "int", "int"] * world_size + assert dm.num_col() == 4 * world_size - df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]}) - dm = xgb.DMatrix(df) - assert dm.feature_names == ['A=1', 'A=2'] - assert dm.feature_types == ['int', 'int'] + df = pd.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]}) + dm = xgb.DMatrix(df, data_split_mode=data_split_mode) assert dm.num_row() == 3 - assert dm.num_col() == 2 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["A=1", "A=2"] + assert dm.feature_types == ["int", "int"] + assert dm.num_col() == 2 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["A=1", "A=2"], world_size + ) + assert dm.feature_types == ["int", "int"] * world_size + assert dm.num_col() == 2 * world_size df_int = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=[9, 10]) - dm_int = xgb.DMatrix(df_int) + dm_int = xgb.DMatrix(df_int, data_split_mode=data_split_mode) df_range = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1)) - dm_range = xgb.DMatrix(df_range) - assert dm_int.feature_names == ['9', '10'] # assert not "9 " + dm_range = xgb.DMatrix(df_range, data_split_mode=data_split_mode) + if data_split_mode == DataSplitMode.ROW: + assert dm_int.feature_names == ["9", "10"] # assert not "9 " + else: + assert dm_int.feature_names == tm.column_split_feature_names( + ["9", "10"], world_size + ) assert dm_int.feature_names == dm_range.feature_names # test MultiIndex as columns df = pd.DataFrame( - [ - (1, 2, 3, 4, 5, 6), - (6, 5, 4, 3, 2, 1) - ], - columns=pd.MultiIndex.from_tuples(( - ('a', 1), ('a', 2), ('a', 3), - ('b', 1), ('b', 2), ('b', 3), - )) + [(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)], + columns=pd.MultiIndex.from_tuples( + ( + ("a", 1), + ("a", 2), + ("a", 3), + ("b", 1), + ("b", 2), + ("b", 3), + ) + ), ) - dm = xgb.DMatrix(df) - assert dm.feature_names == ['a 1', 'a 2', 'a 3', 'b 1', 'b 2', 'b 3'] - assert dm.feature_types == ['int', 'int', 'int', 'int', 'int', 'int'] + dm = xgb.DMatrix(df, data_split_mode=data_split_mode) assert dm.num_row() == 2 - assert dm.num_col() == 6 + if data_split_mode == DataSplitMode.ROW: + assert dm.feature_names == ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"] + assert dm.feature_types == ["int", "int", "int", "int", "int", "int"] + assert dm.num_col() == 6 + else: + assert dm.feature_names == tm.column_split_feature_names( + ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"], world_size + ) + assert ( + dm.feature_types + == ["int", "int", "int", "int", "int", "int"] * world_size + ) + assert dm.num_col() == 6 * world_size # test Index as columns df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2])) - Xy = xgb.DMatrix(df) - np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"])) + Xy = xgb.DMatrix(df, data_split_mode=data_split_mode) + if data_split_mode == DataSplitMode.ROW: + np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"])) + else: + np.testing.assert_equal( + np.array(Xy.feature_names), + np.array(tm.column_split_feature_names(["1", "2"], world_size)), + ) + + # test pandas series + data_series = pd.Series([1, 2, 3, 4, 5]) + dm = xgb.DMatrix(data_series, data_split_mode=data_split_mode) + assert dm.num_row() == 5 + if data_split_mode == DataSplitMode.ROW: + assert dm.num_col() == 1 + else: + assert dm.num_col() == 1 * world_size def test_slice(self): rng = np.random.RandomState(1994) rows = 100 X = rng.randint(3, 7, size=rows) - X = pd.DataFrame({'f0': X}) + X = pd.DataFrame({"f0": X}) y = rng.randn(rows) ridxs = [1, 2, 3, 4, 5, 6] m = xgb.DMatrix(X, y) @@ -132,15 +214,16 @@ class TestPandas: assert m.feature_types == sliced.feature_types - def test_pandas_categorical(self): + def test_pandas_categorical(self, data_split_mode=DataSplitMode.ROW): + world_size = xgb.collective.get_world_size() rng = np.random.RandomState(1994) rows = 100 X = rng.randint(3, 7, size=rows) X = pd.Series(X, dtype="category") - X = pd.DataFrame({'f0': X}) + X = pd.DataFrame({"f0": X}) y = rng.randn(rows) - m = xgb.DMatrix(X, y, enable_categorical=True) - assert m.feature_types[0] == 'c' + m = xgb.DMatrix(X, y, enable_categorical=True, data_split_mode=data_split_mode) + assert m.feature_types[0] == "c" X_0 = ["f", "o", "o"] X_1 = [4, 3, 2] @@ -159,22 +242,29 @@ class TestPandas: assert not np.any(arr == -1.0) X = X["f0"] - y = y[:X.shape[0]] + y = y[: X.shape[0]] with pytest.raises(ValueError, match=r".*enable_categorical.*"): - xgb.DMatrix(X, y) + xgb.DMatrix(X, y, data_split_mode=data_split_mode) - Xy = xgb.DMatrix(X, y, enable_categorical=True) + Xy = xgb.DMatrix(X, y, enable_categorical=True, data_split_mode=data_split_mode) assert Xy.num_row() == 3 - assert Xy.num_col() == 1 + if data_split_mode == DataSplitMode.ROW: + assert Xy.num_col() == 1 + else: + assert Xy.num_col() == 1 * world_size def test_pandas_sparse(self): import pandas as pd + rows = 100 X = pd.DataFrame( - {"A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)), - "B": pd.arrays.SparseArray(np.random.randn(rows)), - "C": pd.arrays.SparseArray(np.random.permutation( - [True, False] * (rows // 2)))} + { + "A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)), + "B": pd.arrays.SparseArray(np.random.randn(rows)), + "C": pd.arrays.SparseArray( + np.random.permutation([True, False] * (rows // 2)) + ), + } ) y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) dtrain = xgb.DMatrix(X, y) @@ -183,27 +273,36 @@ class TestPandas: predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense())) np.testing.assert_allclose(predt_sparse, predt_dense) - def test_pandas_label(self): + def test_pandas_label(self, data_split_mode=DataSplitMode.ROW): + world_size = xgb.collective.get_world_size() # label must be a single column - df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]}) with pytest.raises(ValueError): - xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float') + xgb.data._transform_pandas_df(df, False, None, None, "label", "float") # label must be supported dtype - df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) + df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)}) with pytest.raises(ValueError): - xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float') + xgb.data._transform_pandas_df(df, False, None, None, "label", "float") - df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) - result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, - 'label', 'float') - np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], - dtype=float)) - dm = xgb.DMatrix(np.random.randn(3, 2), label=df) + df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)}) + result, _, _ = xgb.data._transform_pandas_df( + df, False, None, None, "label", "float" + ) + np.testing.assert_array_equal( + result, np.array([[1.0], [2.0], [3.0]], dtype=float) + ) + dm = xgb.DMatrix( + np.random.randn(3, 2), label=df, data_split_mode=data_split_mode + ) assert dm.num_row() == 3 - assert dm.num_col() == 2 + if data_split_mode == DataSplitMode.ROW: + assert dm.num_col() == 2 + else: + assert dm.num_col() == 2 * world_size - def test_pandas_weight(self): + def test_pandas_weight(self, data_split_mode=DataSplitMode.ROW): + world_size = xgb.collective.get_world_size() kRows = 32 kCols = 8 @@ -211,11 +310,13 @@ class TestPandas: y = np.random.randn(kRows) w = np.random.uniform(size=kRows).astype(np.float32) w_pd = pd.DataFrame(w) - data = xgb.DMatrix(X, y, weight=w_pd) + data = xgb.DMatrix(X, y, weight=w_pd, data_split_mode=data_split_mode) assert data.num_row() == kRows - assert data.num_col() == kCols - + if data_split_mode == DataSplitMode.ROW: + assert data.num_col() == kCols + else: + assert data.num_col() == kCols * world_size np.testing.assert_array_equal(data.get_weight(), w) def test_base_margin(self): @@ -223,81 +324,128 @@ class TestPandas: def test_cv_as_pandas(self): dm, _ = tm.load_agaricus(__file__) - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': 'error'} + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + "eval_metric": "error", + } cv = xgb.cv(params, dm, num_boost_round=10, nfold=10) assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) + exp = pd.Index( + ["test-error-mean", "test-error-std", "train-error-mean", "train-error-std"] + ) assert len(cv.columns.intersection(exp)) == 4 # show progress log (result is the same as above) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - verbose_eval=True) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, verbose_eval=True) assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) + exp = pd.Index( + ["test-error-mean", "test-error-std", "train-error-mean", "train-error-std"] + ) assert len(cv.columns.intersection(exp)) == 4 - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - verbose_eval=True, show_stdv=False) + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, verbose_eval=True, show_stdv=False + ) assert isinstance(cv, pd.DataFrame) - exp = pd.Index([u'test-error-mean', u'test-error-std', - u'train-error-mean', u'train-error-std']) + exp = pd.Index( + ["test-error-mean", "test-error-std", "train-error-mean", "train-error-std"] + ) assert len(cv.columns.intersection(exp)) == 4 - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': 'auc'} + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + "eval_metric": "auc", + } cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] + assert "eval_metric" in params + assert "auc" in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': ['auc']} + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + "eval_metric": ["auc"], + } cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] + assert "eval_metric" in params + assert "auc" in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, early_stopping_rounds=1) - assert 'eval_metric' in params - assert 'auc' in cv.columns[0] + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + "eval_metric": ["auc"], + } + cv = xgb.cv( + params, + dm, + num_boost_round=10, + nfold=10, + as_pandas=True, + early_stopping_rounds=1, + ) + assert "eval_metric" in params + assert "auc" in cv.columns[0] assert cv.shape[0] < 10 - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, metrics='auc') - assert 'auc' in cv.columns[0] + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + } + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics="auc" + ) + assert "auc" in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic'} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, metrics=['auc']) - assert 'auc' in cv.columns[0] + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + } + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=["auc"] + ) + assert "auc" in cv.columns[0] - params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, - 'objective': 'binary:logistic', 'eval_metric': ['auc']} - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, metrics='error') - assert 'eval_metric' in params - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] + params = { + "max_depth": 2, + "eta": 1, + "verbosity": 0, + "objective": "binary:logistic", + "eval_metric": ["auc"], + } + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics="error" + ) + assert "eval_metric" in params + assert "auc" not in cv.columns[0] + assert "error" in cv.columns[0] - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, metrics=['error']) - assert 'eval_metric' in params - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=["error"] + ) + assert "eval_metric" in params + assert "auc" not in cv.columns[0] + assert "error" in cv.columns[0] params = list(params.items()) - cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, - as_pandas=True, metrics=['error']) + cv = xgb.cv( + params, dm, num_boost_round=10, nfold=10, as_pandas=True, metrics=["error"] + ) assert isinstance(params, list) - assert 'auc' not in cv.columns[0] - assert 'error' in cv.columns[0] + assert "auc" not in cv.columns[0] + assert "error" in cv.columns[0] @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix]) def test_nullable_type(self, DMatrixT) -> None: @@ -358,3 +506,60 @@ class TestPandas: if y is not None: np.testing.assert_allclose(m_orig.get_label(), m_etype.get_label()) np.testing.assert_allclose(m_etype.get_label(), y.values) + + @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") + def test_pandas_column_split(self): + tm.run_with_rabit( + world_size=3, test_fn=self.test_pandas, data_split_mode=DataSplitMode.COL + ) + + @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") + def test_pandas_categorical_column_split(self): + tm.run_with_rabit( + world_size=3, + test_fn=self.test_pandas_categorical, + data_split_mode=DataSplitMode.COL, + ) + + @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") + def test_pandas_sparse_column_split(self): + rows = 100 + X = pd.DataFrame( + { + "A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)), + "B": pd.arrays.SparseArray(np.random.randn(rows)), + "C": pd.arrays.SparseArray( + np.random.permutation([True, False] * (rows // 2)) + ), + } + ) + y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows))) + + def verify_pandas_sparse(): + dtrain = xgb.DMatrix(X, y, data_split_mode=DataSplitMode.COL) + booster = xgb.train({}, dtrain, num_boost_round=4) + predt_sparse = booster.predict( + xgb.DMatrix(X, data_split_mode=DataSplitMode.COL) + ) + predt_dense = booster.predict( + xgb.DMatrix(X.sparse.to_dense(), data_split_mode=DataSplitMode.COL) + ) + np.testing.assert_allclose(predt_sparse, predt_dense) + + tm.run_with_rabit(world_size=3, test_fn=verify_pandas_sparse) + + @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") + def test_pandas_label_column_split(self): + tm.run_with_rabit( + world_size=3, + test_fn=self.test_pandas_label, + data_split_mode=DataSplitMode.COL, + ) + + @pytest.mark.skipif(tm.is_windows(), reason="Rabit does not run on windows") + def test_pandas_weight_column_split(self): + tm.run_with_rabit( + world_size=3, + test_fn=self.test_pandas_weight, + data_split_mode=DataSplitMode.COL, + )