Reduce warnings and flakiness in tests. (#10659)

- Fix warnings in tests.
- Try to reduce the flakiness of the dask tests.
Jiaming Yuan 2024-08-03 07:32:47 +08:00 committed by GitHub
parent 2e7ba900ef
commit a185b693dc
4 changed files with 32 additions and 26 deletions


@@ -37,6 +37,7 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost import RabitTracker
 from xgboost.core import ArrayLike
+from xgboost.data import is_pd_cat_dtype
 from xgboost.sklearn import SklObjective
 from xgboost.testing.data import (
     get_california_housing,
@@ -403,7 +404,6 @@ def make_categorical(
     X, y
     """
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype

     rng = np.random.RandomState(1994)
@@ -431,8 +431,8 @@ def make_categorical(
                 low=0, high=n_samples - 1, size=int(n_samples * sparsity)
             )
             df.iloc[index, i] = np.nan
-            if is_categorical_dtype(df.dtypes[i]):
-                assert n_categories == np.unique(df.dtypes[i].categories).size
+            if is_pd_cat_dtype(df.dtypes.iloc[i]):
+                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

     if onehot:
         df = pd.get_dummies(df)
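
The two pandas fixes above track upstream deprecations: pandas 2.1 deprecated pandas.api.types.is_categorical_dtype in favour of an isinstance check against pd.CategoricalDtype, and integer keys on a Series (df.dtypes[i]) now emit a FutureWarning in favour of .iloc. A minimal sketch of both replacements; the helper here only approximates xgboost.data.is_pd_cat_dtype, which also handles older pandas releases:

# Sketch: the isinstance check is pandas' recommended replacement for the
# deprecated is_categorical_dtype.
import pandas as pd

def is_cat_dtype(dtype) -> bool:
    """Return True if dtype is a pandas categorical dtype."""
    return isinstance(dtype, pd.CategoricalDtype)

df = pd.DataFrame({"c": pd.Categorical(["a", "b", "a"]), "n": [1.0, 2.0, 3.0]})
# .iloc makes the positional access explicit, avoiding the FutureWarning that
# df.dtypes[0] emits on newer pandas.
assert is_cat_dtype(df.dtypes.iloc[0])
assert not is_cat_dtype(df.dtypes.iloc[1])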


@@ -8,6 +8,7 @@ import numpy as np
 import xgboost as xgb
 import xgboost.testing as tm
+from xgboost.data import is_pd_cat_dtype


 def get_basescore(model: xgb.XGBModel) -> float:
@@ -166,8 +167,6 @@ def check_cut(
     n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
 ) -> None:
     """Check the cut values."""
-    from pandas.api.types import is_categorical_dtype
-
     assert data.shape[0] == indptr[-1]
     assert data.shape[0] == n_entries
@@ -177,18 +176,18 @@ def check_cut(
         end = int(indptr[i])
         for j in range(beg + 1, end):
             assert data[j] > data[j - 1]
-            if is_categorical_dtype(dtypes[i - 1]):
+            if is_pd_cat_dtype(dtypes.iloc[i - 1]):
                 assert data[j] == data[j - 1] + 1


 def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     """Check with optional cupy."""
-    from pandas.api.types import is_categorical_dtype
+    import pandas as pd

     n_samples = 1024
     n_features = 14
     max_bin = 16
-    dtypes = [np.float32] * n_features
+    dtypes = pd.Series([np.float32] * n_features)

     # numerical
     X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
@@ -237,7 +236,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     X, y = tm.make_categorical(
         n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
     )
-    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
+    n_cat_features = len([0 for dtype in X.dtypes if is_pd_cat_dtype(dtype)])
     n_num_features = n_features - n_cat_features
     n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
     # - qdm
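
Turning dtypes into a pandas Series keeps check_cut agnostic to its caller: DataFrame.dtypes is already a Series, so the numerical-only path now supports the same .iloc access instead of plain list indexing. A one-liner sketch of the equivalence:

import numpy as np
import pandas as pd

# A plain list has no .iloc; the Series wrapper gives the numerical path the
# same access pattern as DataFrame.dtypes.
dtypes = pd.Series([np.float32] * 3)
assert dtypes.iloc[1] == np.float32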


@@ -54,7 +54,7 @@ def run_external_memory(worker_id: int, n_workers: int, comm_args: dict) -> None
         X = concat(lx)
         yconcat = concat(ly)
         wconcat = concat(lw)
-        Xy = xgb.DMatrix(X, yconcat, wconcat, nthread=n_threads)
+        Xy = xgb.DMatrix(X, yconcat, weight=wconcat, nthread=n_threads)
         results_local: xgb.callback.TrainingCallback.EvalsLog = {}
         booster = xgb.train(
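
The DMatrix call now passes the weight by keyword. Recent XGBoost releases make the constructor arguments after label keyword-only and warn on positional use, which is presumably the warning silenced here. A minimal sketch with made-up data:

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 2))
y = rng.integers(0, 2, size=8).astype(np.float32)
w = np.ones(8, dtype=np.float32)
# weight must be named; xgb.DMatrix(X, y, w) emits a deprecation warning.
Xy = xgb.DMatrix(X, y, weight=w)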


@@ -155,6 +155,10 @@ def deterministic_repartition(
     m: Margin,
     divisions,
 ) -> Tuple[dd.DataFrame, dd.Series, Margin]:
+    """Try to partition the dataframes according to divisions; this doesn't
+    guarantee reproducibility.
+    """
+
     X, y, margin = (
         dd.repartition(X, divisions=divisions, force=True),
         dd.repartition(y, divisions=divisions, force=True),
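
deterministic_repartition forces a shared set of divisions onto X, y, and the optional margin so their rows stay aligned the same way before each fit. A toy sketch of the underlying call on a local frame with a range index:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"a": range(10)})
ddf = dd.from_pandas(pdf, npartitions=3)
# force=True allows divisions that differ from the current ones; repartitioning
# every input with the same divisions keeps them row-aligned.
ddf = dd.repartition(ddf, divisions=[0, 5, 9], force=True)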
@@ -434,7 +438,7 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin = xgb.dask.inplace_predict(
         client, model_0.get_booster(), X, predict_type="margin"
     )
@@ -448,7 +452,9 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     predictions_1 = xgb.dask.predict(
         client,
         model_1.get_booster(),
@@ -464,7 +470,7 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2 = xgb.dask.inplace_predict(
         client, model_2.get_booster(), X, predict_type="margin"
     )
@@ -492,45 +498,46 @@ def run_boost_from_prediction(
     model_0 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin: dd.Series = model_0.predict(X, output_margin=True)

     model_1 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     predictions_1: dd.Series = model_1.predict(X, base_margin=margin)

     model_2 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=8,
+        n_estimators=6,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2: dd.Series = model_2.predict(X)

-    predt_1 = predictions_1.compute()
-    predt_2 = predictions_2.compute()
-    if hasattr(predt_1, "to_numpy"):
-        predt_1 = predt_1.to_numpy()
-    if hasattr(predt_2, "to_numpy"):
-        predt_2 = predt_2.to_numpy()
-    np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)
+    logloss_concat = (
+        model_0.evals_result()["validation_0"]["logloss"]
+        + model_1.evals_result()["validation_0"]["logloss"]
+    )
+    logloss_2 = model_2.evals_result()["validation_0"]["logloss"]
+    np.testing.assert_allclose(logloss_concat, logloss_2, rtol=1e-4)

     margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
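
The flakiness fix swaps an elementwise comparison of floating-point predictions, which is sensitive to partitioning and reduction order under dask, for a comparison of per-round evaluation logloss, and halves the boosting rounds to speed the test up. The asserted invariant: 3 rounds plus 3 more rounds boosted from the first model's margin should match 6 rounds trained in one go. A single-machine sketch of the same invariant through the plain sklearn API (scikit-learn's make_classification is used here only for illustration):

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=256, random_state=0)

# 3 rounds, then 3 more starting from the first model's margin.
m0 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
m0.fit(X, y, eval_set=[(X, y)])
margin = m0.predict(X, output_margin=True)

m1 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
m1.fit(X, y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin])

# 6 rounds in one go.
m2 = xgb.XGBClassifier(n_estimators=6, learning_rate=0.3)
m2.fit(X, y, eval_set=[(X, y)])

# The concatenated logloss histories should agree round by round.
logloss_concat = (
    m0.evals_result()["validation_0"]["logloss"]
    + m1.evals_result()["validation_0"]["logloss"]
)
np.testing.assert_allclose(
    logloss_concat, m2.evals_result()["validation_0"]["logloss"], rtol=1e-4
)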