This makes GPU Hist robust in distributed environments, as some workers might not be associated with any data in either training or evaluation.

* Disable rabit mock test for now. See #5012.
* Disable dask-cudf test at prediction for now. See #5003.
* Launch dask jobs on all workers, even those that might not have any data (see the sketch below).
* Check for 0 rows in elementwise evaluation metrics. Using AUC and AUC-PR still throws an error; see #4663 for a robust fix.
* Add tests for edge cases.
* Add a `LaunchKernel` wrapper that handles zero-sized grids.
* Move some parts of the allreducer into a .cu file.
* Don't validate feature names when the booster is empty.
* Sync the number of columns in DMatrix, as `num_feature` is required to be the same across all workers in data-split mode.
* Filtering in the dask interface now by default syncs all boosters that are not empty, instead of using rank 0.
* Fix Jenkins' GPU tests.
* Install dask-cuda from source in Jenkins' tests. Now all tests are actually running.
* Restore the GPU Hist tree synchronization test.
* Check the UUID of running devices. The check is only performed on CUDA version >= 10.x, as 9.x doesn't have the UUID field.
* Fix CMake policy and project variables: use xgboost_SOURCE_DIR uniformly, and add a policy for CMake >= 3.13.
* Fix copying data to CPU.
* Fix a race condition in the CPU predictor.
* Fix duplicated DMatrix construction.
* Don't download an extra NCCL in the CI script.
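The gist of launching work on every worker, as a minimal sketch (illustrative only, not the actual `xgboost.dask` implementation; `parts_for` is a hypothetical helper and the scheduler address is a placeholder):

```python
import numpy as np
import xgboost as xgb
from dask.distributed import Client

def train_on_worker(params, partitions, n_features):
    # Every worker runs this task, even when it holds no local data. In the
    # real dask interface this keeps the rabit allreduce ring complete; here,
    # workers without partitions simply train on a zero-row DMatrix, which
    # GPU Hist must handle gracefully.
    if partitions:
        X = np.vstack([p[0] for p in partitions])
        y = np.concatenate([p[1] for p in partitions])
    else:
        X, y = np.empty((0, n_features)), np.empty(0)
    dtrain = xgb.DMatrix(X, y)
    return xgb.train(params, dtrain, num_boost_round=10)

def parts_for(addr):
    # Hypothetical helper: return the (X, y) partitions held by `addr`,
    # possibly an empty list.
    return []

client = Client('tcp://scheduler:8786')  # placeholder address
params = {'tree_method': 'gpu_hist'}
futures = [client.submit(train_on_worker, params, parts_for(addr), 10,
                         workers=[addr])
           for addr in client.scheduler_info()['workers']]
```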
from __future__ import print_function

import glob
import itertools as it
import os
import sys

import numpy as np

import xgboost as xgb

try:
    from sklearn import datasets
    from sklearn.preprocessing import scale
except ImportError:
    # sklearn is optional at import time; the dataset helpers below need it.
    pass


class Dataset:
    def __init__(self, name, get_dataset, objective, metric,
                 has_weights=False, use_external_memory=False):
        self.name = name
        self.objective = objective
        self.metric = metric
        if has_weights:
            self.X, self.y, self.w = get_dataset()
        else:
            self.X, self.y = get_dataset()
            self.w = None
        self.use_external_memory = use_external_memory
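
# Example (illustrative): a hypothetical custom entry. The loader is invoked
# eagerly in __init__ and must return (X, y), or (X, y, w) when
# has_weights=True:
#
#   def get_synthetic():
#       rng = np.random.RandomState(0)
#       return rng.randn(100, 5), rng.randn(100)
#
#   synthetic = Dataset("Synthetic", get_synthetic, "reg:squarederror", "rmse")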


def get_boston():
    data = datasets.load_boston()
    return data.data, data.target


def get_digits():
    data = datasets.load_digits()
    return data.data, data.target


def get_cancer():
    data = datasets.load_breast_cancer()
    return data.data, data.target


def get_sparse():
    rng = np.random.RandomState(199)
    n = 5000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    # Zero out ~75% of the entries, then store the result as CSR.
    X = np.array([[0.0 if rng.uniform(0, 1) < sparsity else x
                   for x in x_row] for x_row in X])
    from scipy import sparse
    X = sparse.csr_matrix(X)
    return X, y


def get_sparse_weights():
    return get_weights_regression(1, 10)


def get_small_weights():
    return get_weights_regression(1e-6, 1e-5)


def get_weights_regression(min_weight, max_weight):
    rng = np.random.RandomState(199)
    n = 10000
    sparsity = 0.25
    X, y = datasets.make_regression(n, random_state=rng)
    # Replace ~25% of the entries with missing values.
    X = np.array([[np.nan if rng.uniform(0, 1) < sparsity else x
                   for x in x_row] for x_row in X])
    w = np.array([rng.uniform(min_weight, max_weight) for _ in range(n)])
    return X, y, w


def train_dataset(dataset, param_in, num_rounds=10, scale_features=False):
    param = param_in.copy()
    param["objective"] = dataset.objective
    if dataset.objective == "multi:softmax":
        param["num_class"] = int(np.max(dataset.y) + 1)
    param["eval_metric"] = dataset.metric

    if scale_features:
        # Centering is only valid for dense data.
        X = scale(dataset.X, with_mean=isinstance(dataset.X, np.ndarray))
    else:
        X = dataset.X

    if dataset.use_external_memory:
        np.savetxt('tmptmp_1234.csv',
                   np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
                   delimiter=',')
        dtrain = xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
                             weight=dataset.w)
    else:
        dtrain = xgb.DMatrix(X, dataset.y, weight=dataset.w)

    print("Training on dataset: " + dataset.name, file=sys.stderr)
    print("Using parameters: " + str(param), file=sys.stderr)
    res = {}
    bst = xgb.train(param, dtrain, num_rounds, [(dtrain, 'train')],
                    evals_result=res, verbose_eval=False)

    # Free the booster and dmatrix so we can delete temporary files.
    bst_copy = bst.copy()
    del bst
    del dtrain

    # Clean up temporary files.
    if dataset.use_external_memory:
        for f in glob.glob("tmptmp_*"):
            os.remove(f)

    return {"dataset": dataset, "bst": bst_copy, "param": param.copy(),
            "eval": res['train'][dataset.metric]}


def parameter_combinations(variable_param):
    """
    Enumerate all possible combinations of parameters
    """
    result = []
    names = sorted(variable_param)
    combinations = it.product(*(variable_param[name] for name in names))
    for combination in combinations:
        param = {}
        for i, name in enumerate(names):
            param[name] = combination[i]
        result.append(param)
    return result
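
# Example (illustrative):
#
#   parameter_combinations({'max_depth': [2, 4], 'eta': [0.1, 0.3]})
#   # -> [{'eta': 0.1, 'max_depth': 2}, {'eta': 0.1, 'max_depth': 4},
#   #     {'eta': 0.3, 'max_depth': 2}, {'eta': 0.3, 'max_depth': 4}]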


def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False):
    """
    Run the given parameters on a range of datasets. The objective and eval
    metric are set automatically per dataset.
    """
    # Named test_datasets to avoid shadowing the sklearn `datasets` module.
    test_datasets = [
        Dataset("Boston", get_boston, "reg:squarederror", "rmse"),
        Dataset("Digits", get_digits, "multi:softmax", "merror"),
        Dataset("Cancer", get_cancer, "binary:logistic", "error"),
        Dataset("Sparse regression", get_sparse, "reg:squarederror", "rmse"),
        Dataset("Sparse regression with weights", get_sparse_weights,
                "reg:squarederror", "rmse", has_weights=True),
        Dataset("Small weights regression", get_small_weights,
                "reg:squarederror", "rmse", has_weights=True),
        Dataset("Boston External Memory", get_boston,
                "reg:squarederror", "rmse",
                use_external_memory=True)
    ]

    results = []
    for d in test_datasets:
        if select_datasets is None or d.name in select_datasets:
            results.append(
                train_dataset(d, param, num_rounds=num_rounds,
                              scale_features=scale_features))
    return results
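
# Example (illustrative): restrict the suite to the two Boston variants with
# a hypothetical parameter set.
#
#   results = run_suite({'tree_method': 'hist', 'max_depth': 3},
#                       select_datasets=['Boston', 'Boston External Memory'])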


def non_increasing(L, tolerance):
    # True iff no step in the sequence increases by `tolerance` or more.
    return all((y - x) < tolerance for x, y in zip(L, L[1:]))


def assert_results_non_increasing(results, tolerance=1e-5):
    for r in results:
        assert non_increasing(r['eval'], tolerance), r
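

if __name__ == '__main__':
    # Minimal smoke test (illustrative): sweep a tiny hypothetical grid and
    # check that the train metric never worsens beyond the tolerance.
    for params in parameter_combinations({'max_depth': [2, 4]}):
        params['tree_method'] = 'hist'
        assert_results_non_increasing(run_suite(params, num_rounds=5))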