Quantile DMatrix for CPU. (#8130)

- Add a new `QuantileDMatrix` that works for both CPU and GPU.
- Deprecate `DeviceQuantileDMatrix`.
This commit is contained in:
Jiaming Yuan
2022-08-02 15:51:23 +08:00
committed by GitHub
parent 2cba1d9fcc
commit d87f69215e
14 changed files with 521 additions and 117 deletions

View File

@@ -1,32 +1,12 @@
import xgboost as xgb
from xgboost.data import SingleBatchInternalIter as SingleBatch
import numpy as np
from testing import IteratorForTest, non_increasing
from typing import Tuple, List
from testing import IteratorForTest, non_increasing, make_batches
import pytest
from hypothesis import given, strategies, settings
from scipy.sparse import csr_matrix
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` dense batches of random features and labels.

    Every batch holds ``n_samples_per_batch`` rows; feature batches have
    ``n_features`` columns.  The generator is seeded with 1994, so repeated
    calls with the same arguments return the same data.

    When ``use_cupy`` is true the arrays are drawn from ``cupy.random``
    instead of ``numpy.random``.

    Returns
    -------
    A ``(features, labels)`` pair of lists, one entry per batch.
    """
    if use_cupy:
        import cupy

        generator = cupy.random.RandomState(1994)
    else:
        generator = np.random.RandomState(1994)

    features = []
    labels = []
    for _ in range(n_batches):
        # Features are drawn before labels so the random stream is consumed
        # in a fixed order.
        batch_features = generator.randn(n_samples_per_batch, n_features)
        batch_labels = generator.randn(n_samples_per_batch)
        features.append(batch_features)
        labels.append(batch_labels)
    return features, labels
def test_single_batch(tree_method: str = "approx") -> None:
from sklearn.datasets import load_breast_cancer
@@ -111,8 +91,8 @@ def run_data_iterator(
if not subsample:
assert non_increasing(results_from_it["Train"]["rmse"])
X, y = it.as_arrays()
Xy = xgb.DMatrix(X, y)
X, y, w = it.as_arrays()
Xy = xgb.DMatrix(X, y, weight=w)
assert Xy.num_row() == n_samples_per_batch * n_batches
assert Xy.num_col() == n_features

View File

@@ -0,0 +1,212 @@
from typing import Dict, List, Any
import numpy as np
import pytest
from scipy import sparse
from testing import IteratorForTest, make_batches, make_batches_sparse, make_categorical
import xgboost as xgb
class TestQuantileDMatrix:
    """Tests for building and training with ``xgb.QuantileDMatrix``."""

    @staticmethod
    def _make_iterator(
        n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
    ) -> IteratorForTest:
        """Build a batch iterator over dense (``sparsity == 0.0``) or sparse data."""
        if sparsity == 0.0:
            batches = make_batches(n_samples_per_batch, n_features, n_batches, False)
        else:
            batches = make_batches_sparse(
                n_samples_per_batch, n_features, n_batches, sparsity
            )
        # The batches unpack to (X, y, w); the trailing ``None`` is passed as
        # the iterator's fourth positional argument.
        return IteratorForTest(*batches, None)

    @staticmethod
    def _make_data(
        rng: Any, n_samples: int, n_features: int, tree_method: str, enable_cat: bool
    ) -> Any:
        """Return ``(X, y)``: categorical data if ``enable_cat``, else normal draws.

        For ``tree_method == "gpu_hist"`` the categorical frames are converted
        with ``cudf.from_pandas``.
        """
        if enable_cat:
            X, y = make_categorical(
                n_samples, n_features, n_categories=13, onehot=False
            )
            if tree_method == "gpu_hist":
                import cudf

                X = cudf.from_pandas(X)
                y = cudf.from_pandas(y)
        else:
            X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
                n_samples, n_features
            )
            y = rng.normal(0, 3, size=n_samples)
        return X, y

    @staticmethod
    def _eval_history(
        tree_method: str, dtrain: Any, evals: List[Any]
    ) -> Dict[str, Dict[str, List[float]]]:
        """Train with ``tree_method`` and return the recorded evaluation results."""
        results: Dict[str, Dict[str, List[float]]] = {}
        xgb.train(
            {"tree_method": tree_method},
            dtrain,
            evals=evals,
            evals_result=results,
        )
        return results

    def test_basic(self) -> None:
        """Construct from dense and sparse input and check the reported shape."""
        n_samples = 234
        n_features = 8

        # Seeded so that a failing run can be reproduced with the same data.
        rng = np.random.default_rng(1994)
        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
            n_samples, n_features
        )
        y = rng.normal(0, 3, size=n_samples)
        Xy = xgb.QuantileDMatrix(X, y)
        assert Xy.num_row() == n_samples
        assert Xy.num_col() == n_features

        for density in (0.1, 0.8):
            X = sparse.random(
                n_samples,
                n_features,
                density=density,
                format="csr",
                random_state=1994,
            )
            Xy = xgb.QuantileDMatrix(X, y)
            assert Xy.num_row() == n_samples
            assert Xy.num_col() == n_features

    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])
    def test_with_iterator(self, sparsity: float) -> None:
        """Construct from a batch iterator and check the reported shape."""
        n_samples_per_batch = 317
        n_features = 8
        n_batches = 7

        it = self._make_iterator(n_samples_per_batch, n_features, n_batches, sparsity)
        Xy = xgb.QuantileDMatrix(it)
        assert Xy.num_row() == n_samples_per_batch * n_batches
        assert Xy.num_col() == n_features

    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
    def test_training(self, sparsity: float) -> None:
        """Compare a model trained from an iterator with one trained from arrays."""
        n_samples_per_batch = 317
        n_features = 8
        n_batches = 7

        it = self._make_iterator(n_samples_per_batch, n_features, n_batches, sparsity)

        parameters = {"tree_method": "hist", "max_bin": 256}
        Xy_it = xgb.QuantileDMatrix(it, max_bin=parameters["max_bin"])
        from_it = xgb.train(parameters, Xy_it)

        X, y, w = it.as_arrays()
        w_it = Xy_it.get_weight()
        np.testing.assert_allclose(w_it, w)

        Xy_arr = xgb.DMatrix(X, y, weight=w)
        from_arr = xgb.train(parameters, Xy_arr)
        np.testing.assert_allclose(from_arr.predict(Xy_it), from_it.predict(Xy_arr))

        # Shift the labels so the smallest one is 0.01 (strictly positive);
        # presumably required by the reg:gamma objective used below.
        y -= y.min()
        y += 0.01
        Xy = xgb.QuantileDMatrix(X, y, weight=w)
        parameters = {
            "tree_method": "approx",
            "max_bin": 256,
            "objective": "reg:gamma",
        }
        # Only the call expected to fail lives inside the ``raises`` block.
        with pytest.raises(ValueError, match=r"Only.*hist.*"):
            xgb.train(parameters, Xy)

    def run_ref_dmatrix(self, rng: Any, tree_method: str, enable_cat: bool) -> None:
        """Check the ``ref`` argument of ``QuantileDMatrix`` for one configuration.

        Parameters
        ----------
        rng : random generator providing ``normal``; used for numerical data.
        tree_method : value of the ``tree_method`` training parameter.
        enable_cat : whether to generate and enable categorical features.
        """
        n_samples, n_features = 2048, 17
        X, y = self._make_data(rng, n_samples, n_features, tree_method, enable_cat)

        # Use ref
        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
        qdm_results = self._eval_history(
            tree_method, Xy, [(Xy, "Train"), (Xy_valid, "valid")]
        )
        np.testing.assert_allclose(
            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
        )

        # No ref
        Xy_valid = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        qdm_results = self._eval_history(
            tree_method, Xy, [(Xy, "Train"), (Xy_valid, "valid")]
        )
        np.testing.assert_allclose(
            qdm_results["Train"]["rmse"], qdm_results["valid"]["rmse"]
        )

        # Different number of features
        Xy = xgb.QuantileDMatrix(X, y, enable_categorical=enable_cat)
        dXy = xgb.DMatrix(X, y, enable_categorical=enable_cat)

        n_samples, n_features = 256, 15
        X = rng.normal(loc=0, scale=3, size=n_samples * n_features).reshape(
            n_samples, n_features
        )
        y = rng.normal(0, 3, size=n_samples)
        with pytest.raises(ValueError, match=r".*features\."):
            xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)

        # Compare training results
        n_samples, n_features = 256, 17
        X, y = self._make_data(rng, n_samples, n_features, tree_method, enable_cat)
        Xy_valid = xgb.QuantileDMatrix(X, y, ref=Xy, enable_categorical=enable_cat)
        # use DMatrix as ref
        Xy_valid_d = xgb.QuantileDMatrix(X, y, ref=dXy, enable_categorical=enable_cat)
        dXy_valid = xgb.DMatrix(X, y, enable_categorical=enable_cat)

        qdm_results = self._eval_history(
            tree_method, Xy, [(Xy, "Train"), (Xy_valid, "valid")]
        )
        dm_results = self._eval_history(
            tree_method,
            dXy,
            [(dXy, "Train"), (dXy_valid, "valid"), (Xy_valid_d, "dvalid")],
        )
        np.testing.assert_allclose(
            dm_results["Train"]["rmse"], qdm_results["Train"]["rmse"]
        )
        np.testing.assert_allclose(
            dm_results["valid"]["rmse"], qdm_results["valid"]["rmse"]
        )
        np.testing.assert_allclose(
            dm_results["dvalid"]["rmse"], qdm_results["valid"]["rmse"]
        )

    def test_ref_dmatrix(self) -> None:
        """Run the ``ref`` checks with ``hist``, with and without categories."""
        rng = np.random.RandomState(1994)
        self.run_ref_dmatrix(rng, "hist", True)
        self.run_ref_dmatrix(rng, "hist", False)

    def test_predict(self) -> None:
        """Predictions on ``QuantileDMatrix`` must match those on ``DMatrix``."""
        n_samples, n_features = 16, 2
        X, y = make_categorical(
            n_samples, n_features, n_categories=13, onehot=False
        )
        Xy = xgb.DMatrix(X, y, enable_categorical=True)

        booster = xgb.train({"tree_method": "hist"}, Xy)

        Xy = xgb.DMatrix(X, y, enable_categorical=True)
        a = booster.predict(Xy)
        qXy = xgb.QuantileDMatrix(X, y, enable_categorical=True)
        b = booster.predict(qXy)
        np.testing.assert_allclose(a, b)

View File

@@ -1382,6 +1382,42 @@ class TestWithDask:
num_rounds = 30
self.run_updater_test(client, params, num_rounds, dataset, 'hist')
def test_quantile_dmatrix(self, client: Client) -> None:
    """Training history with DaskQuantileDMatrix must match DaskDMatrix.

    The same categorical data is used for the training and validation
    sets in both runs; the quantile validation set is built with the
    quantile training set as ``ref``.
    """
    X, y = make_categorical(client, 10000, 30, 13)

    def fit_history(d_train, d_valid):
        # Ten boosting rounds with ``hist``; returns the evaluation history.
        result = xgb.dask.train(
            client,
            {"tree_method": "hist"},
            d_train,
            num_boost_round=10,
            evals=[(d_train, "Train"), (d_valid, "Valid")]
        )
        return result["history"]

    Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)
    valid_Xy = xgb.dask.DaskDMatrix(client, X, y, enable_categorical=True)
    dmatrix_hist = fit_history(Xy, valid_Xy)

    Xy = xgb.dask.DaskQuantileDMatrix(client, X, y, enable_categorical=True)
    valid_Xy = xgb.dask.DaskQuantileDMatrix(
        client, X, y, enable_categorical=True, ref=Xy
    )
    quantile_hist = fit_history(Xy, valid_Xy)

    for split in ("Train", "Valid"):
        np.testing.assert_allclose(
            quantile_hist[split]["rmse"], dmatrix_hist[split]["rmse"]
        )
@given(params=exact_parameter_strategy,
dataset=tm.dataset_strategy)
@settings(deadline=None, suppress_health_check=suppress, print_blob=True)

View File

@@ -1,11 +1,11 @@
from concurrent.futures import ThreadPoolExecutor
import os
import multiprocessing
from typing import Tuple, Union
from typing import Tuple, Union, List, Sequence, Callable
import urllib
import zipfile
import sys
from typing import Optional
from typing import Optional, Dict, Any
from contextlib import contextmanager
from io import StringIO
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
@@ -180,79 +180,148 @@ def skip_s390x():
class IteratorForTest(xgb.core.DataIter):
    """Data iterator that feeds pre-built batches to XGBoost one at a time.

    Parameters
    ----------
    X : sequence of feature batches (dense arrays or CSR matrices).
    y : sequence of label batches; must have the same length as ``X``.
    w : optional sequence of per-batch sample weights, or ``None``.
    cache : forwarded unchanged to ``xgb.core.DataIter.__init__``.
    """

    def __init__(
        self,
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
        cache: Optional[str] = "./"
    ) -> None:
        assert len(X) == len(y)

        self.X = X
        self.y = y
        self.w = w
        self.it = 0  # index of the next batch to hand out
        super().__init__(cache)

    def _has_weights(self) -> bool:
        """Return True when a non-empty weight sequence was supplied."""
        return self.w is not None and len(self.w) > 0

    def next(self, input_data: Callable) -> int:
        """Pass the next batch to ``input_data``; return 0 when exhausted, else 1."""
        if self.it == len(self.X):
            return 0
        # Use copy to make sure the iterator doesn't hold a reference to the data.
        input_data(
            data=self.X[self.it].copy(),
            label=self.y[self.it].copy(),
            weight=self.w[self.it].copy() if self._has_weights() else None,
        )
        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it += 1
        return 1

    def reset(self) -> None:
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(
        self,
    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], np.ndarray, Optional[np.ndarray]]:
        """Concatenate all batches into single ``(X, y, w)`` objects.

        ``X`` is stacked as CSR when the first batch is a ``csr_matrix`` and
        concatenated along axis 0 otherwise.  ``w`` is ``None`` when the
        iterator was built without weights.
        """
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
            X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
        # Weights are optional; concatenating ``None`` would raise.
        w = np.concatenate(self.w, axis=0) if self._has_weights() else None
        return X, y, w
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` dense batches of features, labels and weights.

    Features and labels come from ``randn``; weights are uniform on
    ``[0, 1)``.  The generator is seeded with 1994, so repeated calls with
    the same arguments return the same data.  When ``use_cupy`` is true the
    arrays are drawn from ``cupy.random`` instead of ``numpy.random``.

    Returns
    -------
    A ``(features, labels, weights)`` triple of lists, one entry per batch.
    """
    if use_cupy:
        import cupy

        generator = cupy.random.RandomState(1994)
    else:
        generator = np.random.RandomState(1994)

    features = []
    labels = []
    weights = []
    for _ in range(n_batches):
        # Draw order (features, labels, weights) fixes how the random
        # stream is consumed.
        batch_features = generator.randn(n_samples_per_batch, n_features)
        batch_labels = generator.randn(n_samples_per_batch)
        batch_weights = generator.uniform(low=0, high=1, size=n_samples_per_batch)
        features.append(batch_features)
        labels.append(batch_labels)
        weights.append(batch_weights)
    return features, labels, weights
def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` batches with CSR features, labels and weights.

    Each feature batch is a float32 CSR matrix with density
    ``1.0 - sparsity``.  Labels come from ``randn`` and weights are uniform
    on ``[0, 1)``.  One generator seeded with 1994 drives all draws.

    Returns
    -------
    A ``(features, labels, weights)`` triple of lists, one entry per batch.
    """
    generator = np.random.RandomState(1994)
    density = 1.0 - sparsity

    def draw_batch():
        # Draw order (features, labels, weights) fixes how the shared random
        # stream is consumed.
        batch_features = sparse.random(
            n_samples_per_batch,
            n_features,
            density,
            format="csr",
            dtype=np.float32,
            random_state=generator,
        )
        batch_labels = generator.randn(n_samples_per_batch)
        batch_weights = generator.uniform(low=0, high=1, size=n_samples_per_batch)
        return batch_features, batch_labels, batch_weights

    batches = [draw_batch() for _ in range(n_batches)]
    features = [batch[0] for batch in batches]
    labels = [batch[1] for batch in batches]
    weights = [batch[2] for batch in batches]
    return features, labels, weights
# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
    """A named dataset bundled with the objective and metric used to test it.

    Parameters
    ----------
    name : label returned by ``repr``.
    get_dataset : zero-argument callable returning ``(X, y)``.
    objective : value stored into ``params["objective"]`` by ``set_params``.
    metric : value stored into ``params["eval_metric"]`` by ``set_params``.
    """

    def __init__(
        self, name: str, get_dataset: Callable, objective: str, metric: str
    ) -> None:
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
        # Optional sample weights and base margin; unset by default.
        self.w: Optional[np.ndarray] = None
        self.margin: Optional[np.ndarray] = None

    def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:
        """Write objective and metric into ``params_in`` in place and return it."""
        params_in['objective'] = self.objective
        params_in['eval_metric'] = self.metric
        if self.objective == "multi:softmax":
            # Class count is derived from the largest label value.
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self) -> xgb.DMatrix:
        """Return the dataset as a ``DMatrix`` with categorical support enabled."""
        return xgb.DMatrix(
            self.X, self.y, self.w, base_margin=self.margin, enable_categorical=True
        )

    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
        """Return the dataset as a ``DeviceQuantileDMatrix`` built from ``cp`` arrays."""
        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

    def get_external_dmat(self) -> xgb.DMatrix:
        """Return a ``DMatrix`` built from the data split into 10 iterator batches."""
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor = []
        response = []
        weight = []
        for i in range(n_batches):
            beg = i * per_batch
            end = min((i + 1) * per_batch, n_samples)
            # Every batch must contain at least one row.
            assert end != beg
            X = self.X[beg: end, ...]
            y = self.y[beg: end]
            w = self.w[beg: end] if self.w is not None else None
            predictor.append(X)
            response.append(y)
            if w is not None:
                weight.append(w)

        # An empty weight list means the dataset has no weights.
        it = IteratorForTest(predictor, response, weight if weight else None)
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
        return self.name

View File

@@ -1,4 +1,3 @@
import os
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score