diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index e85c1769f..0bc17c052 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -37,6 +37,7 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost import RabitTracker
 from xgboost.core import ArrayLike
+from xgboost.data import is_pd_cat_dtype
 from xgboost.sklearn import SklObjective
 from xgboost.testing.data import (
     get_california_housing,
@@ -403,7 +404,6 @@ def make_categorical(
     X, y
     """
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype

     rng = np.random.RandomState(1994)

@@ -431,8 +431,8 @@ def make_categorical(
                 low=0, high=n_samples - 1, size=int(n_samples * sparsity)
             )
             df.iloc[index, i] = np.nan
-            if is_categorical_dtype(df.dtypes[i]):
-                assert n_categories == np.unique(df.dtypes[i].categories).size
+            if is_pd_cat_dtype(df.dtypes.iloc[i]):
+                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

     if onehot:
         df = pd.get_dummies(df)
diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py
index 1e0b9b0d1..c6ba8256d 100644
--- a/python-package/xgboost/testing/updater.py
+++ b/python-package/xgboost/testing/updater.py
@@ -8,6 +8,7 @@ import numpy as np

 import xgboost as xgb
 import xgboost.testing as tm
+from xgboost.data import is_pd_cat_dtype


 def get_basescore(model: xgb.XGBModel) -> float:
@@ -166,8 +167,6 @@ def check_cut(
     n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
 ) -> None:
     """Check the cut values."""
-    from pandas.api.types import is_categorical_dtype
-
     assert data.shape[0] == indptr[-1]
     assert data.shape[0] == n_entries

@@ -177,18 +176,18 @@
         end = int(indptr[i])
         for j in range(beg + 1, end):
             assert data[j] > data[j - 1]
-            if is_categorical_dtype(dtypes[i - 1]):
+            if is_pd_cat_dtype(dtypes.iloc[i - 1]):
                 assert data[j] == data[j - 1] + 1


 def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     """Check with optional cupy."""
-    from pandas.api.types import is_categorical_dtype
+    import pandas as pd

     n_samples = 1024
     n_features = 14
     max_bin = 16
-    dtypes = [np.float32] * n_features
+    dtypes = pd.Series([np.float32] * n_features)

     # numerical
     X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
@@ -237,7 +236,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     X, y = tm.make_categorical(
         n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
     )
-    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
+    n_cat_features = len([0 for dtype in X.dtypes if is_pd_cat_dtype(dtype)])
     n_num_features = n_features - n_cat_features
     n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
     # - qdm
diff --git a/tests/test_distributed/test_with_dask/test_external_memory.py b/tests/test_distributed/test_with_dask/test_external_memory.py
index cf475d90f..820d73b56 100644
--- a/tests/test_distributed/test_with_dask/test_external_memory.py
+++ b/tests/test_distributed/test_with_dask/test_external_memory.py
@@ -54,7 +54,7 @@ def run_external_memory(worker_id: int, n_workers: int, comm_args: dict) -> None
     X = concat(lx)
     yconcat = concat(ly)
     wconcat = concat(lw)
-    Xy = xgb.DMatrix(X, yconcat, wconcat, nthread=n_threads)
+    Xy = xgb.DMatrix(X, yconcat, weight=wconcat, nthread=n_threads)

     results_local: xgb.callback.TrainingCallback.EvalsLog = {}
     booster = xgb.train(
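The `is_pd_cat_dtype` swap above tracks a pandas deprecation: `pandas.api.types.is_categorical_dtype` is deprecated since pandas 2.1 in favor of an `isinstance` check against `pd.CategoricalDtype`, and positional `Series[i]` lookups are likewise deprecated in favor of `Series.iloc[i]` (hence `df.dtypes.iloc[i]`). A minimal sketch of a version-guarded helper in the spirit of `xgboost.data.is_pd_cat_dtype` (the shipped implementation may differ in details):

```python
from typing import Any

import pandas as pd
from packaging.version import Version


def is_pd_cat_dtype(dtype: Any) -> bool:
    """Return True if ``dtype`` is a pandas categorical dtype (sketch)."""
    if Version(pd.__version__) >= Version("2.1.0"):
        # Modern spelling; the old predicate warns on pandas >= 2.1.
        return isinstance(dtype, pd.CategoricalDtype)
    # Fallback for pandas < 2.1.
    from pandas.api.types import is_categorical_dtype

    return is_categorical_dtype(dtype)
```

The `DMatrix` fix in `test_external_memory.py` is related housekeeping: in recent XGBoost releases the constructor parameters after `label` are keyword-only, so `weight` has to be passed by name rather than as a third positional argument.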
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index af5924b84..6ec242696 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -155,6 +155,10 @@ def deterministic_repartition(
     m: Margin,
     divisions,
 ) -> Tuple[dd.DataFrame, dd.Series, Margin]:
+    """Try to partition the dataframes according to divisions; this doesn't
+    guarantee reproducibility.
+
+    """
     X, y, margin = (
         dd.repartition(X, divisions=divisions, force=True),
         dd.repartition(y, divisions=divisions, force=True),
@@ -434,7 +438,7 @@
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin = xgb.dask.inplace_predict(
         client, model_0.get_booster(), X, predict_type="margin"
     )
@@ -448,7 +452,9 @@
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     predictions_1 = xgb.dask.predict(
         client,
         model_1.get_booster(),
@@ -464,7 +470,7 @@
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2 = xgb.dask.inplace_predict(
         client, model_2.get_booster(), X, predict_type="margin"
     )
@@ -492,45 +498,46 @@

     model_0 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin: dd.Series = model_0.predict(X, output_margin=True)

     model_1 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     predictions_1: dd.Series = model_1.predict(X, base_margin=margin)

     model_2 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=8,
+        n_estimators=6,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2: dd.Series = model_2.predict(X)

-    predt_1 = predictions_1.compute()
-    predt_2 = predictions_2.compute()
-    if hasattr(predt_1, "to_numpy"):
-        predt_1 = predt_1.to_numpy()
-    if hasattr(predt_2, "to_numpy"):
-        predt_2 = predt_2.to_numpy()
-    np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)
+    logloss_concat = (
+        model_0.evals_result()["validation_0"]["logloss"]
+        + model_1.evals_result()["validation_0"]["logloss"]
+    )
+    logloss_2 = model_2.evals_result()["validation_0"]["logloss"]
+    np.testing.assert_allclose(logloss_concat, logloss_2, rtol=1e-4)

     margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
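The rewritten assertion at the end of `run_boost_from_prediction` checks the training-continuation invariant directly instead of comparing raw predictions, which are brittle across dask repartitions: the concatenated eval logs of an n-round model plus an n-round margin-continued model should match one 2n-round model. A rough single-machine sketch of the same invariant (not part of the patch; assumes the default `logloss` metric for binary classification and a deterministic `hist` tree method):

```python
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=512, random_state=1994)

# Three rounds, recording the eval log.
model_0 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
model_0.fit(X, y, eval_set=[(X, y)])
margin = model_0.predict(X, output_margin=True)

# Three more rounds, boosted from model_0's margin.
model_1 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
model_1.fit(
    X, y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
)

# One six-round model trained in a single shot.
model_2 = xgb.XGBClassifier(n_estimators=6, learning_rate=0.3)
model_2.fit(X, y, eval_set=[(X, y)])

# Rounds 0-2 of model_0 followed by the three continued rounds of model_1
# should trace the same logloss curve as the six rounds of model_2.
logloss_concat = (
    model_0.evals_result()["validation_0"]["logloss"]
    + model_1.evals_result()["validation_0"]["logloss"]
)
logloss_2 = model_2.evals_result()["validation_0"]["logloss"]
np.testing.assert_allclose(logloss_concat, logloss_2, rtol=1e-4)
```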