Export Python Interface for external memory. (#7070)
* Add Python iterator interface. * Add tests. * Add demo. * Add documentation. * Handle empty dataset.
This commit is contained in:
@@ -77,12 +77,23 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
|
||||
|
||||
for (size_t i = 0; i < iterators.size(); ++i) {
|
||||
ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
|
||||
if (i != iterators.size() - 1) {
|
||||
ASSERT_EQ(iterators[i].use_count(), 1);
|
||||
} else {
|
||||
// The last batch is still being held by sparse page DMatrix.
|
||||
ASSERT_EQ(iterators[i].use_count(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
// make sure it's const and the caller can not modify the content of page.
|
||||
for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
|
||||
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
|
||||
}
|
||||
|
||||
// The above iteration clears out all references inside DMatrix.
|
||||
for (auto const& ptr : iterators) {
|
||||
ASSERT_TRUE(ptr.unique());
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SparsePageDMatrix, EllpackPageContent) {
|
||||
|
||||
32
tests/python-gpu/test_gpu_data_iterator.py
Normal file
32
tests/python-gpu/test_gpu_data_iterator.py
Normal file
@@ -0,0 +1,32 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
from hypothesis import given, strategies, settings
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
sys.path.append("tests/python")
|
||||
from test_data_iterator import SingleBatch, make_batches
|
||||
from test_data_iterator import test_single_batch as cpu_single_batch
|
||||
from test_data_iterator import run_data_iterator
|
||||
from testing import IteratorForTest, no_cupy
|
||||
|
||||
|
||||
def test_gpu_single_batch() -> None:
    """Run the shared single-batch iterator test with the GPU tree method."""
    cpu_single_batch("gpu_hist")
|
||||
|
||||
|
||||
@pytest.mark.skipif(**no_cupy())
@given(
    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)
)
@settings(deadline=None)
def test_gpu_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Round-trip the data iterator with ``gpu_hist``, feeding cupy then numpy."""
    for use_cupy in (True, False):
        run_data_iterator(
            n_samples_per_batch, n_features, n_batches, "gpu_hist", use_cupy
        )
|
||||
|
||||
|
||||
def test_cpu_data_iterator() -> None:
    """Make sure CPU algorithm can handle GPU inputs"""
    run_data_iterator(1024, 2, 3, "approx", True)
|
||||
@@ -9,7 +9,7 @@ import test_demos as td # noqa
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
def test_data_iterator():
    """Run the quantile data-iterator demo and check it exits successfully.

    Fix: the block contained two consecutive assignments to ``script`` (a
    collapsed diff); the first, pointing at ``data_iterator.py``, was dead
    code immediately overwritten and has been removed.
    """
    script = os.path.join(td.PYTHON_DEMO_DIR, 'quantile_data_iterator.py')
    cmd = ['python', script]
    # check_call raises CalledProcessError on a non-zero exit status.
    subprocess.check_call(cmd)
|
||||
|
||||
|
||||
@@ -112,7 +112,6 @@ class TestGPUUpdaters:
|
||||
tm.dataset_strategy)
|
||||
@settings(deadline=None)
|
||||
def test_external_memory(self, param, num_rounds, dataset):
|
||||
pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
|
||||
# We cannot handle empty dataset yet
|
||||
assume(len(dataset.y) > 0)
|
||||
param['tree_method'] = 'gpu_hist'
|
||||
|
||||
135
tests/python/test_data_iterator.py
Normal file
135
tests/python/test_data_iterator.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import xgboost as xgb
|
||||
from xgboost.data import SingleBatchInternalIter as SingleBatch
|
||||
import numpy as np
|
||||
from testing import IteratorForTest
|
||||
from typing import Tuple, List
|
||||
import pytest
|
||||
from hypothesis import given, strategies, settings
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
|
||||
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` random ``(X, y)`` batches with a fixed seed.

    Batches are cupy arrays when ``use_cupy`` is set, numpy arrays otherwise.
    """
    if use_cupy:
        import cupy

        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)

    predictors, responses = [], []
    # Draw X then y per batch so the RNG stream matches the original ordering.
    for _ in range(n_batches):
        predictors.append(rng.randn(n_samples_per_batch, n_features))
        responses.append(rng.randn(n_samples_per_batch))
    return predictors, responses
|
||||
|
||||
|
||||
def test_single_batch(tree_method: str = "approx") -> None:
    """Train from a single-batch iterator and from equivalent in-memory
    DMatrix inputs, asserting the resulting model dumps are identical."""
    from sklearn.datasets import load_breast_cancer

    n_rounds = 10

    def train(dtrain: xgb.DMatrix) -> xgb.Booster:
        # Identical params and rounds for every variant so dumps are comparable.
        return xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=n_rounds)

    X, y = load_breast_cancer(return_X_y=True)
    X = X.astype(np.float32)
    y = y.astype(np.float32)
    from_it = train(xgb.DMatrix(SingleBatch(data=X, label=y)))
    from_dmat = train(xgb.DMatrix(X, y))
    assert from_it.get_dump() == from_dmat.get_dump()

    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X = X.astype(np.float32)
    from_pd = train(xgb.DMatrix(SingleBatch(data=X, label=y)))
    # remove feature info to generate exact same text representation.
    from_pd.feature_names = None
    from_pd.feature_types = None
    assert from_pd.get_dump() == from_it.get_dump()

    X, y = load_breast_cancer(return_X_y=True)
    from_it = train(xgb.DMatrix(SingleBatch(data=csr_matrix(X), label=y)))

    X, y = load_breast_cancer(return_X_y=True)
    from_np = train(xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0))
    assert from_np.get_dump() == from_it.get_dump()
|
||||
|
||||
|
||||
def run_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tree_method: str,
    use_cupy: bool,
) -> None:
    """Train once through the iterator and once from concatenated arrays,
    then check both runs agree on predictions and the training RMSE history."""
    n_rounds = 2

    it = IteratorForTest(
        *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy)
    )
    # An iterator yielding zero batches must be rejected up front.
    if n_batches == 0:
        with pytest.raises(ValueError, match="1 batch"):
            xgb.DMatrix(it)
        return

    def fit(dtrain: xgb.DMatrix, log) -> xgb.Booster:
        # Shared training configuration for both runs.
        return xgb.train(
            {"tree_method": tree_method, "max_depth": 2},
            dtrain,
            num_boost_round=n_rounds,
            evals=[(dtrain, "Train")],
            evals_result=log,
            verbose_eval=False,
        )

    Xy = xgb.DMatrix(it)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features

    results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {}
    it_predt = fit(Xy, results_from_it).predict(Xy)

    X, y = it.as_arrays()
    Xy = xgb.DMatrix(X, y)
    assert Xy.num_row() == n_samples_per_batch * n_batches
    assert Xy.num_col() == n_features

    results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {}
    arr_predt = fit(Xy, results_from_arrays).predict(Xy)

    if tree_method != "gpu_hist":
        rtol = 1e-1  # flaky
    else:
        np.testing.assert_allclose(it_predt, arr_predt, rtol=1e-3)
        rtol = 1e-6

    np.testing.assert_allclose(
        results_from_it["Train"]["rmse"],
        results_from_arrays["Train"]["rmse"],
        rtol=rtol,
    )
|
||||
|
||||
|
||||
@given(
    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)
)
@settings(deadline=None)
def test_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Round-trip the data iterator on CPU with the approx and hist methods."""
    for tree_method in ("approx", "hist"):
        run_data_iterator(
            n_samples_per_batch, n_features, n_batches, tree_method, False
        )
|
||||
@@ -8,7 +8,7 @@ from io import StringIO
|
||||
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
|
||||
from xgboost.compat import DASK_INSTALLED
|
||||
import pytest
|
||||
import tempfile
|
||||
import gc
|
||||
import xgboost as xgb
|
||||
import numpy as np
|
||||
import platform
|
||||
@@ -143,10 +143,35 @@ def skip_s390x():
|
||||
return {"condition": condition, "reason": reason}
|
||||
|
||||
|
||||
class IteratorForTest(xgb.core.DataIter):
    """A ``DataIter`` that streams a fixed list of (X, y) batches to XGBoost."""

    def __init__(self, X, y):
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.it = 0  # index of the next batch to emit
        super().__init__("./")

    def next(self, input_data):
        """Feed the next batch; return 1 while data remains, 0 when exhausted."""
        if self.it == len(self.X):
            return 0
        # Hand over copies so this iterator holds no reference to the fed data,
        # then force a collection to check XGBoost doesn't read freed memory.
        input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
        gc.collect()
        self.it += 1
        return 1

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(self):
        """Concatenate all batches back into single (X, y) arrays."""
        return np.concatenate(self.X, axis=0), np.concatenate(self.y, axis=0)
|
||||
|
||||
|
||||
# Contains a dataset in numpy format as well as the relevant objective and metric
|
||||
class TestDataset:
|
||||
def __init__(self, name, get_dataset, objective, metric
|
||||
):
|
||||
def __init__(self, name, get_dataset, objective, metric):
|
||||
self.name = name
|
||||
self.objective = objective
|
||||
self.metric = metric
|
||||
@@ -171,16 +196,23 @@ class TestDataset:
|
||||
return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)
|
||||
|
||||
def get_external_dmat(self):
    """Build a DMatrix through the external-memory data-iterator interface.

    Fix: the block contained two implementations concatenated by a collapsed
    diff — the removed tempfile/CSV-URI version ``return``ed first, making the
    iterator-based version unreachable dead code. Only the iterator-based
    implementation (the commit's intent) is kept.

    The dataset is split into 10 roughly equal batches that are streamed to
    XGBoost via ``IteratorForTest``.
    """
    n_samples = self.X.shape[0]
    n_batches = 10
    # +1 rounds up so the final batch absorbs the remainder.
    per_batch = n_samples // n_batches + 1

    predictor = []
    response = []
    for i in range(n_batches):
        beg = i * per_batch
        end = min((i + 1) * per_batch, n_samples)
        assert end != beg  # every batch must be non-empty
        X = self.X[beg: end, ...]
        y = self.y[beg: end]
        predictor.append(X)
        response.append(y)

    it = IteratorForTest(predictor, response)
    return xgb.DMatrix(it)
|
||||
|
||||
def __repr__(self):
    # The dataset's name doubles as its readable representation.
    return self.name
|
||||
|
||||
Reference in New Issue
Block a user