diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index 3315d3d8f..da9441aee 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -137,16 +137,13 @@ def test_from_dask_array() -> None:
             prediction = prediction.compute()
 
             booster: xgb.Booster = result['booster']
-            single_node_predt = booster.predict(
-                xgb.DMatrix(X.compute())
-            )
+            single_node_predt = booster.predict(xgb.DMatrix(X.compute()))
             np.testing.assert_allclose(prediction, single_node_predt)
 
             config = json.loads(booster.save_config())
             assert int(config['learner']['generic_param']['nthread']) == 5
 
-            from_arr = xgb.dask.predict(
-                client, model=booster, data=X)
+            from_arr = xgb.dask.predict(client, model=booster, data=X)
 
             assert isinstance(from_arr, da.Array)
             assert np.all(single_node_predt == from_arr.compute())
@@ -477,23 +474,6 @@ def test_dask_classifier(model: str, client: "Client") -> None:
     run_dask_classifier(X, y_bin, w, model, None, client, 2)
 
 
-@pytest.mark.skipif(**tm.no_sklearn())
-def test_sklearn_grid_search(client: "Client") -> None:
-    from sklearn.model_selection import GridSearchCV
-    X, y, _ = generate_array()
-    reg = xgb.dask.DaskXGBRegressor(learning_rate=0.1,
-                                    tree_method='hist')
-    reg.client = client
-    model = GridSearchCV(reg, {'max_depth': [2, 4],
-                               'n_estimators': [5, 10]},
-                         cv=2, verbose=1)
-    model.fit(X, y)
-    # Expect unique results for each parameter value This confirms
-    # sklearn is able to successfully update the parameter
-    means = model.cv_results_['mean_test_score']
-    assert len(means) == len(set(means))
-
-
 def test_empty_dmatrix_training_continuation(client: "Client") -> None:
     kRows, kCols = 1, 97
     X = dd.from_array(np.random.randn(kRows, kCols))
@@ -714,18 +694,11 @@ def test_auc(client: "Client") -> None:
 
 
 # No test for Exact, as empty DMatrix handling are mostly for distributed
 # environment and Exact doesn't support it.
-def test_empty_dmatrix_hist() -> None:
+@pytest.mark.parametrize("tree_method", ["hist", "approx"])
+def test_empty_dmatrix(tree_method) -> None:
     with LocalCluster(n_workers=kWorkers) as cluster:
         with Client(cluster) as client:
-            parameters = {'tree_method': 'hist'}
-            run_empty_dmatrix_reg(client, parameters)
-            run_empty_dmatrix_cls(client, parameters)
-
-
-def test_empty_dmatrix_approx() -> None:
-    with LocalCluster(n_workers=kWorkers) as cluster:
-        with Client(cluster) as client:
-            parameters = {'tree_method': 'approx'}
+            parameters = {'tree_method': tree_method}
             run_empty_dmatrix_reg(client, parameters)
             run_empty_dmatrix_cls(client, parameters)
@@ -1102,12 +1075,12 @@ class TestWithDask:
             os.remove(after_fname)
 
     def run_updater_test(
-            self,
-            client: "Client",
-            params: Dict,
-            num_rounds: int,
-            dataset: tm.TestDataset,
-            tree_method: str
+        self,
+        client: "Client",
+        params: Dict,
+        num_rounds: int,
+        dataset: tm.TestDataset,
+        tree_method: str
     ) -> None:
         params['tree_method'] = tree_method
         params = dataset.set_params(params)
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 9d63b160c..4ab86b7e2 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -17,16 +17,6 @@ pytestmark = pytest.mark.skipif(**tm.no_sklearn())
 from sklearn.utils.estimator_checks import parametrize_with_checks
 
 
-class TemporaryDirectory(object):
-    """Context manager for tempfile.mkdtemp()"""
-    def __enter__(self):
-        self.name = tempfile.mkdtemp()
-        return self.name
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        shutil.rmtree(self.name)
-
-
 def test_binary_classification():
     from sklearn.datasets import load_digits
     from sklearn.model_selection import KFold
@@ -509,7 +499,7 @@ def test_classification_with_custom_objective():
     assert is_called[0]
 
 
-def test_sklearn_api():
+def run_sklearn_api(booster, error, n_est):
     from sklearn.datasets import load_iris
     from sklearn.model_selection import train_test_split
 
@@ -517,30 +507,18 @@
     tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target,
                                               train_size=120, test_size=0.2)
 
-    classifier = xgb.XGBClassifier(booster='gbtree', n_estimators=10)
+    classifier = xgb.XGBClassifier(booster=booster, n_estimators=n_est)
     classifier.fit(tr_d, tr_l)
 
     preds = classifier.predict(te_d)
     labels = te_l
     err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l)
-    assert err < 0.2
+    assert err < error
 
 
-def test_sklearn_api_gblinear():
-    from sklearn.datasets import load_iris
-    from sklearn.model_selection import train_test_split
-
-    iris = load_iris()
-    tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target,
-                                              train_size=120)
-
-    classifier = xgb.XGBClassifier(booster='gblinear', n_estimators=100)
-    classifier.fit(tr_d, tr_l)
-
-    preds = classifier.predict(te_d)
-    labels = te_l
-    err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l)
-    assert err < 0.5
+def test_sklearn_api():
+    run_sklearn_api("gbtree", 0.2, 10)
+    run_sklearn_api("gblinear", 0.5, 100)
 
 
 @pytest.mark.skipif(**tm.no_matplotlib())
@@ -721,7 +699,7 @@ def test_sklearn_get_default_params():
     assert cls.get_params()['base_score'] is not None
 
 
-def test_validation_weights_xgbmodel():
+def run_validation_weights(model):
     from sklearn.datasets import make_hastie_10_2
 
     # prepare training and test data
     X, y = make_hastie_10_2(n_samples=2000, random_state=42)
     labels, y = np.unique(y, return_inverse=True)
     X_train, X_test = X[:1600], X[1600:]
     y_train, y_test = y[:1600], y[1600:]
 
     # instantiate model
     param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
                   'random_state': 123}
-    clf = xgb.sklearn.XGBModel(**param_dist)
+    clf = model(**param_dist)
 
     # train it using instance weights only in the training set
     weights_train = np.random.choice([1, 2], len(X_train))
@@ -778,49 +756,9 @@
             sample_weight_eval_set=[weights_train])
 
 
-def test_validation_weights_xgbclassifier():
-    from sklearn.datasets import make_hastie_10_2
-
-    # prepare training and test data
-    X, y = make_hastie_10_2(n_samples=2000, random_state=42)
-    labels, y = np.unique(y, return_inverse=True)
-    X_train, X_test = X[:1600], X[1600:]
-    y_train, y_test = y[:1600], y[1600:]
-
-    # instantiate model
-    param_dist = {'objective': 'binary:logistic', 'n_estimators': 2,
-                  'random_state': 123}
-    clf = xgb.sklearn.XGBClassifier(**param_dist)
-
-    # train it using instance weights only in the training set
-    weights_train = np.random.choice([1, 2], len(X_train))
-    clf.fit(X_train, y_train,
-            sample_weight=weights_train,
-            eval_set=[(X_test, y_test)],
-            eval_metric='logloss',
-            verbose=False)
-
-    # evaluate logloss metric on test set *without* using weights
-    evals_result_without_weights = clf.evals_result()
-    logloss_without_weights = evals_result_without_weights[
-        "validation_0"]["logloss"]
-
-    # now use weights for the test set
-    np.random.seed(0)
-    weights_test = np.random.choice([1, 2], len(X_test))
-    clf.fit(X_train, y_train,
-            sample_weight=weights_train,
-            eval_set=[(X_test, y_test)],
-            sample_weight_eval_set=[weights_test],
-            eval_metric='logloss',
-            verbose=False)
-    evals_result_with_weights = clf.evals_result()
-    logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"]
-
-    # check that the logloss in the test set is actually different
-    # when using weights than when not using them
-    assert all((logloss_with_weights[i] != logloss_without_weights[i]
-                for i in [0, 1]))
+def test_validation_weights():
+    run_validation_weights(xgb.XGBModel)
+    run_validation_weights(xgb.XGBClassifier)
 
 
 def save_load_model(model_path):
@@ -862,16 +800,16 @@
 
 
 def test_save_load_model():
-    with TemporaryDirectory() as tempdir:
+    with tempfile.TemporaryDirectory() as tempdir:
         model_path = os.path.join(tempdir, 'digits.model')
         save_load_model(model_path)
 
-    with TemporaryDirectory() as tempdir:
+    with tempfile.TemporaryDirectory() as tempdir:
         model_path = os.path.join(tempdir, 'digits.model.json')
         save_load_model(model_path)
 
     from sklearn.datasets import load_digits
-    with TemporaryDirectory() as tempdir:
+    with tempfile.TemporaryDirectory() as tempdir:
         model_path = os.path.join(tempdir, 'digits.model.json')
         digits = load_digits(n_class=2)
         y = digits['target']
@@ -949,7 +887,7 @@ def test_XGBClassifier_resume():
     from sklearn.datasets import load_breast_cancer
     from sklearn.metrics import log_loss
 
-    with TemporaryDirectory() as tempdir:
+    with tempfile.TemporaryDirectory() as tempdir:
         model1_path = os.path.join(tempdir, 'test_XGBClassifier.model')
         model1_booster_path = os.path.join(tempdir,
                                            'test_XGBClassifier.booster')
@@ -1089,7 +1027,7 @@ def test_pandas_input():
 
 
 def run_feature_weights(X, y, fw, model=xgb.XGBRegressor):
-    with TemporaryDirectory() as tmpdir:
+    with tempfile.TemporaryDirectory() as tmpdir:
         colsample_bynode = 0.5
         reg = model(tree_method='hist',
                     colsample_bynode=colsample_bynode)