Reduce warnings and flakiness in tests. (#10659)
- Fix warnings in tests.
- Try to reduce the flakiness of the dask test.

This commit is contained in:
parent 2e7ba900ef
commit a185b693dc
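Several of the warnings addressed below come from pandas deprecations: `pandas.api.types.is_categorical_dtype` is deprecated in pandas 2.1+, and the recommended replacement is an `isinstance` check against `pd.CategoricalDtype`. The tests switch to XGBoost's own `is_pd_cat_dtype` helper from `xgboost.data`. As a minimal sketch of the idea only (the real helper in `xgboost.data` may handle more cases), such a check can be written as:

import pandas as pd


def is_pd_cat_dtype_sketch(dtype) -> bool:
    """Return True when ``dtype`` is a pandas categorical dtype.

    Covers the common case handled by the deprecated
    ``pandas.api.types.is_categorical_dtype`` without triggering its warning.
    """
    return isinstance(dtype, pd.CategoricalDtype)


df = pd.DataFrame({"a": pd.Categorical(["x", "y", "x"]), "b": [1.0, 2.0, 3.0]})
assert is_pd_cat_dtype_sketch(df.dtypes.iloc[0])
assert not is_pd_cat_dtype_sketch(df.dtypes.iloc[1])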
@@ -37,6 +37,7 @@ from scipy import sparse
 import xgboost as xgb
 from xgboost import RabitTracker
 from xgboost.core import ArrayLike
+from xgboost.data import is_pd_cat_dtype
 from xgboost.sklearn import SklObjective
 from xgboost.testing.data import (
     get_california_housing,
@@ -403,7 +404,6 @@ def make_categorical(
         X, y
     """
     import pandas as pd
-    from pandas.api.types import is_categorical_dtype

     rng = np.random.RandomState(1994)

@@ -431,8 +431,8 @@ def make_categorical(
                 low=0, high=n_samples - 1, size=int(n_samples * sparsity)
             )
             df.iloc[index, i] = np.nan
-            if is_categorical_dtype(df.dtypes[i]):
-                assert n_categories == np.unique(df.dtypes[i].categories).size
+            if is_pd_cat_dtype(df.dtypes.iloc[i]):
+                assert n_categories == np.unique(df.dtypes.iloc[i].categories).size

     if onehot:
         df = pd.get_dummies(df)
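The switch from `df.dtypes[i]` to `df.dtypes.iloc[i]` addresses another pandas deprecation: `df.dtypes` is a Series indexed by column labels, and indexing it with a bare integer relies on the implicit positional fallback that newer pandas releases warn about. A small illustration (hypothetical column names, assuming pandas 2.x):

import pandas as pd

# df.dtypes is indexed by column labels; with string labels an integer key
# is interpreted positionally, which pandas now warns about.  ``.iloc`` is
# always positional and stays silent.
df = pd.DataFrame({"f0": [1.0], "f1": pd.Categorical(["a"])})
dtype_by_position = df.dtypes.iloc[1]  # CategoricalDtype, no warning
dtype_by_label = df.dtypes["f1"]       # same value, looked up by label
assert dtype_by_position == dtype_by_label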
@@ -8,6 +8,7 @@ import numpy as np

 import xgboost as xgb
 import xgboost.testing as tm
+from xgboost.data import is_pd_cat_dtype


 def get_basescore(model: xgb.XGBModel) -> float:
@@ -166,8 +167,6 @@ def check_cut(
     n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any
 ) -> None:
     """Check the cut values."""
-    from pandas.api.types import is_categorical_dtype
-
     assert data.shape[0] == indptr[-1]
     assert data.shape[0] == n_entries

@@ -177,18 +176,18 @@ def check_cut(
         end = int(indptr[i])
         for j in range(beg + 1, end):
             assert data[j] > data[j - 1]
-            if is_categorical_dtype(dtypes[i - 1]):
+            if is_pd_cat_dtype(dtypes.iloc[i - 1]):
                 assert data[j] == data[j - 1] + 1


 def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     """Check with optional cupy."""
-    from pandas.api.types import is_categorical_dtype
+    import pandas as pd

     n_samples = 1024
     n_features = 14
     max_bin = 16
-    dtypes = [np.float32] * n_features
+    dtypes = pd.Series([np.float32] * n_features)

     # numerical
     X, y, w = tm.make_regression(n_samples, n_features, use_cupy=use_cupy)
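Because `check_cut` now indexes `dtypes` with `.iloc`, the caller in `check_get_quantile_cut_device` wraps the plain list of dtypes in a `pd.Series`; a Python list has no `.iloc` accessor. A two-line illustration:

import numpy as np
import pandas as pd

dtypes = pd.Series([np.float32] * 14)
# Positional access works on the Series; ``([np.float32] * 14).iloc`` would fail.
assert dtypes.iloc[0] is np.float32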
@@ -237,7 +236,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
     X, y = tm.make_categorical(
         n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5
     )
-    n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)])
+    n_cat_features = len([0 for dtype in X.dtypes if is_pd_cat_dtype(dtype)])
     n_num_features = n_features - n_cat_features
     n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features
     # - qdm
@ -54,7 +54,7 @@ def run_external_memory(worker_id: int, n_workers: int, comm_args: dict) -> None
|
|||||||
X = concat(lx)
|
X = concat(lx)
|
||||||
yconcat = concat(ly)
|
yconcat = concat(ly)
|
||||||
wconcat = concat(lw)
|
wconcat = concat(lw)
|
||||||
Xy = xgb.DMatrix(X, yconcat, wconcat, nthread=n_threads)
|
Xy = xgb.DMatrix(X, yconcat, weight=wconcat, nthread=n_threads)
|
||||||
|
|
||||||
results_local: xgb.callback.TrainingCallback.EvalsLog = {}
|
results_local: xgb.callback.TrainingCallback.EvalsLog = {}
|
||||||
booster = xgb.train(
|
booster = xgb.train(
|
||||||
|
|||||||
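In `run_external_memory`, `weight` is now passed by keyword, which presumably silences the warning recent XGBoost releases emit when optional `DMatrix` arguments beyond `data` and `label` are supplied positionally, and it also makes the call unambiguous. A minimal sketch of the keyword style with synthetic data (not the test's actual inputs):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(64, 4))
y = rng.integers(0, 2, size=64)
w = rng.uniform(0.5, 1.5, size=64)

# Keyword form mirrors the change above: weight is named explicitly instead
# of being the third positional argument.
Xy = xgb.DMatrix(X, y, weight=w, nthread=2)
booster = xgb.train({"objective": "binary:logistic"}, Xy, num_boost_round=2)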
@@ -155,6 +155,10 @@ def deterministic_repartition(
     m: Margin,
     divisions,
 ) -> Tuple[dd.DataFrame, dd.Series, Margin]:
+    """Try to partition the dataframes according to divisions. This doesn't
+    guarantee reproducibility.
+
+    """
     X, y, margin = (
         dd.repartition(X, divisions=divisions, force=True),
         dd.repartition(y, divisions=divisions, force=True),
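`deterministic_repartition` forces `X`, `y`, and the precomputed margin onto the same dask divisions before training, presumably because the comparison in these tests is sensitive to how rows are split across workers; as the new docstring notes, it only makes the partitioning consistent, not fully reproducible. A standalone sketch of the alignment idea with toy data (local scheduler assumed, not the test's fixtures):

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"x": range(10), "y": range(10)})
X = dd.from_pandas(pdf[["x"]], npartitions=3)
y = dd.from_pandas(pdf["y"], npartitions=2)  # deliberately misaligned

# Repartition both collections along the same divisions so that row i of X
# and row i of y always land in the same partition.
divisions = (0, 4, 8, 9)
X = X.repartition(divisions=divisions, force=True)
y = y.repartition(divisions=divisions, force=True)
assert X.divisions == y.divisions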
@@ -434,7 +438,7 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin = xgb.dask.inplace_predict(
         client, model_0.get_booster(), X, predict_type="margin"
     )
@@ -448,7 +452,9 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     predictions_1 = xgb.dask.predict(
         client,
         model_1.get_booster(),
@@ -464,7 +470,7 @@ def run_boost_from_prediction_multi_class(
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2 = xgb.dask.inplace_predict(
         client, model_2.get_booster(), X, predict_type="margin"
     )
@@ -492,45 +498,46 @@ def run_boost_from_prediction(

     model_0 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_0.fit(X=X, y=y)
+    model_0.fit(X=X, y=y, eval_set=[(X, y)])
     margin: dd.Series = model_0.predict(X, output_margin=True)

     model_1 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=4,
+        n_estimators=3,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
-    model_1.fit(X=X, y=y, base_margin=margin)
+    model_1.fit(
+        X=X, y=y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
+    )
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
     predictions_1: dd.Series = model_1.predict(X, base_margin=margin)

     model_2 = xgb.dask.DaskXGBClassifier(
         learning_rate=0.3,
-        n_estimators=8,
+        n_estimators=6,
         tree_method=tree_method,
         max_bin=512,
         device=device,
     )
     X, y, _ = deterministic_repartition(client, X, y, None, divisions)
-    model_2.fit(X=X, y=y)
+    model_2.fit(X=X, y=y, eval_set=[(X, y)])
     predictions_2: dd.Series = model_2.predict(X)

-    predt_1 = predictions_1.compute()
-    predt_2 = predictions_2.compute()
-    if hasattr(predt_1, "to_numpy"):
-        predt_1 = predt_1.to_numpy()
-    if hasattr(predt_2, "to_numpy"):
-        predt_2 = predt_2.to_numpy()
-    np.testing.assert_allclose(predt_1, predt_2, atol=1e-5)
+    logloss_concat = (
+        model_0.evals_result()["validation_0"]["logloss"]
+        + model_1.evals_result()["validation_0"]["logloss"]
+    )
+    logloss_2 = model_2.evals_result()["validation_0"]["logloss"]
+    np.testing.assert_allclose(logloss_concat, logloss_2, rtol=1e-4)

     margined = xgb.dask.DaskXGBClassifier(n_estimators=4)
     X, y, margin = deterministic_repartition(client, X, y, margin, divisions)
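The flakiness fix is in this final comparison: instead of asserting that the raw predictions of the boosted-from-prediction model match a from-scratch model elementwise (which is numerically touchy under dask), the test now trains model_0 for 3 rounds, continues with model_1 for 3 more rounds from model_0's margin, and checks that the concatenated evaluation logloss matches a 6-round model_2 up to rtol=1e-4. The same idea written against the single-node sklearn interface, with hypothetical toy data rather than the test's fixtures:

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=512, n_features=8, random_state=0)

# 3 rounds, then 3 more rounds continued from the first model's margin ...
model_0 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
model_0.fit(X, y, eval_set=[(X, y)])
margin = model_0.predict(X, output_margin=True)

model_1 = xgb.XGBClassifier(n_estimators=3, learning_rate=0.3)
model_1.fit(
    X, y, base_margin=margin, eval_set=[(X, y)], base_margin_eval_set=[margin]
)

# ... should follow the same training trajectory as 6 rounds from scratch.
model_2 = xgb.XGBClassifier(n_estimators=6, learning_rate=0.3)
model_2.fit(X, y, eval_set=[(X, y)])

logloss_concat = (
    model_0.evals_result()["validation_0"]["logloss"]
    + model_1.evals_result()["validation_0"]["logloss"]
)
logloss_2 = model_2.evals_result()["validation_0"]["logloss"]
np.testing.assert_allclose(logloss_concat, logloss_2, rtol=1e-4)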