Export Python Interface for external memory. (#7070)

* Add Python iterator interface.
* Add tests.
* Add demo.
* Add documents.
* Handle empty dataset.
This commit is contained in:
Jiaming Yuan
2021-07-22 15:15:53 +08:00
committed by GitHub
parent e64ee6592f
commit e6088366df
34 changed files with 961 additions and 200 deletions

View File

@@ -77,12 +77,23 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {
for (size_t i = 0; i < iterators.size(); ++i) {
ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
if (i != iterators.size() - 1) {
ASSERT_EQ(iterators[i].use_count(), 1);
} else {
// The last batch is still being held by sparse page DMatrix.
ASSERT_EQ(iterators[i].use_count(), 2);
}
}
// make sure it's const and the caller can not modify the content of page.
for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
}
// The above iteration clears out all references inside DMatrix.
for (auto const& ptr : iterators) {
ASSERT_TRUE(ptr.unique());
}
}
TEST(SparsePageDMatrix, EllpackPageContent) {

View File

@@ -0,0 +1,32 @@
import numpy as np
import xgboost as xgb
from hypothesis import given, strategies, settings
import pytest
import sys
sys.path.append("tests/python")
from test_data_iterator import SingleBatch, make_batches
from test_data_iterator import test_single_batch as cpu_single_batch
from test_data_iterator import run_data_iterator
from testing import IteratorForTest, no_cupy
def test_gpu_single_batch() -> None:
    """Run the shared single-batch iterator test with the ``gpu_hist`` tree method."""
    tree_method = "gpu_hist"
    cpu_single_batch(tree_method)
@pytest.mark.skipif(**no_cupy())
@given(
    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)
)
@settings(deadline=None)
def test_gpu_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Property test of the data iterator with the ``gpu_hist`` tree method.

    Hypothesis draws the batch size (0..1024), the number of features (1..7)
    and the number of batches (0..13).  Each drawn configuration is run twice:
    once with the final ``run_data_iterator`` argument set to ``True`` and once
    with it set to ``False``.
    """
    for use_cupy in (True, False):
        run_data_iterator(
            n_samples_per_batch, n_features, n_batches, "gpu_hist", use_cupy
        )
# This test requests cupy-generated batches (last argument ``True``), so it
# must be skipped, not errored, on machines where cupy is not installed.
@pytest.mark.skipif(**no_cupy())
def test_cpu_data_iterator() -> None:
    """Make sure CPU algorithm can handle GPU inputs"""
    run_data_iterator(1024, 2, 3, "approx", True)

View File

@@ -9,7 +9,7 @@ import test_demos as td # noqa
@pytest.mark.skipif(**tm.no_cupy())
def test_data_iterator():
    """Run the ``quantile_data_iterator.py`` demo script in a subprocess.

    ``subprocess.check_call`` raises ``CalledProcessError`` when the script
    exits with a non-zero status, which fails the test.
    """
    # Only the script that is actually executed is assigned; the previous
    # assignment to 'data_iterator.py' was immediately overwritten.
    script = os.path.join(td.PYTHON_DEMO_DIR, 'quantile_data_iterator.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)

View File

@@ -112,7 +112,6 @@ class TestGPUUpdaters:
tm.dataset_strategy)
@settings(deadline=None)
def test_external_memory(self, param, num_rounds, dataset):
pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
# We cannot handle empty dataset yet
assume(len(dataset.y) > 0)
param['tree_method'] = 'gpu_hist'

View File

@@ -0,0 +1,135 @@
import xgboost as xgb
from xgboost.data import SingleBatchInternalIter as SingleBatch
import numpy as np
from testing import IteratorForTest
from typing import Tuple, List
import pytest
from hypothesis import given, strategies, settings
from scipy.sparse import csr_matrix
def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """Generate ``n_batches`` batches of normally distributed features and labels.

    Parameters
    ----------
    n_samples_per_batch :
        Number of rows in every batch.
    n_features :
        Number of columns in every feature batch.
    n_batches :
        Number of batches to generate; ``0`` yields two empty lists.
    use_cupy :
        Draw the data from ``cupy.random.RandomState`` instead of
        ``numpy.random.RandomState``.  ``cupy`` is imported only when requested.

    Returns
    -------
    A ``(features, labels)`` pair of lists, one entry per batch.  The random
    state is seeded with 1994, so repeated calls return the same values.
    """
    if use_cupy:
        import cupy

        random_state = cupy.random.RandomState(1994)
    else:
        random_state = np.random.RandomState(1994)

    features: List[np.ndarray] = []
    labels: List[np.ndarray] = []
    for _ in range(n_batches):
        # Features are drawn before labels for each batch; the order fixes
        # which values of the random stream end up where.
        features.append(random_state.randn(n_samples_per_batch, n_features))
        labels.append(random_state.randn(n_samples_per_batch))
    return features, labels
def test_single_batch(tree_method: str = "approx") -> None:
    """Check that a ``SingleBatch``-backed DMatrix trains the same model as direct input.

    The breast cancer dataset is fed through ``SingleBatch`` as a numpy array,
    a pandas frame and a scipy CSR matrix; the text dumps of the resulting
    boosters are compared against each other.
    """
    from sklearn.datasets import load_breast_cancer

    n_rounds = 10

    def fit(dmat: xgb.DMatrix) -> xgb.Booster:
        # A fresh parameter dict is built for every training run.
        return xgb.train({"tree_method": tree_method}, dmat, num_boost_round=n_rounds)

    # numpy input: iterator-backed DMatrix against a directly constructed one.
    X, y = load_breast_cancer(return_X_y=True)
    X = X.astype(np.float32)
    y = y.astype(np.float32)
    from_it = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))
    from_dmat = fit(xgb.DMatrix(X, y))
    assert from_it.get_dump() == from_dmat.get_dump()

    # pandas input through the iterator.
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    X = X.astype(np.float32)
    from_pd = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))
    # remove feature info to generate exact same text representation.
    from_pd.feature_names = None
    from_pd.feature_types = None
    assert from_pd.get_dump() == from_it.get_dump()

    # CSR input through the iterator; ``from_it`` now refers to this model.
    X, y = load_breast_cancer(return_X_y=True)
    X = csr_matrix(X)
    from_it = fit(xgb.DMatrix(SingleBatch(data=X, label=y)))

    # Dense input with 0.0 declared as the missing value is expected to give
    # the same model as the CSR input.
    X, y = load_breast_cancer(return_X_y=True)
    from_np = fit(xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0))
    assert from_np.get_dump() == from_it.get_dump()
def run_data_iterator(
    n_samples_per_batch: int,
    n_features: int,
    n_batches: int,
    tree_method: str,
    use_cupy: bool,
) -> None:
    """Compare training from a batch iterator with training from concatenated arrays.

    Batches come from ``make_batches`` and are wrapped in ``IteratorForTest``.
    With zero batches, constructing the DMatrix must raise a ``ValueError``
    whose message matches ``"1 batch"``.  Otherwise a model is trained for two
    rounds on the iterator-backed DMatrix and on a DMatrix built from
    ``it.as_arrays()``, and the recorded training RMSE of both runs is compared.
    For ``gpu_hist`` the predictions are compared as well and a tighter
    tolerance is applied to the RMSE.
    """
    n_rounds = 2
    batches_X, batches_y = make_batches(
        n_samples_per_batch, n_features, n_batches, use_cupy
    )
    it = IteratorForTest(batches_X, batches_y)

    if n_batches == 0:
        with pytest.raises(ValueError, match="1 batch"):
            xgb.DMatrix(it)
        return

    expected_rows = n_samples_per_batch * n_batches

    def fit_and_predict(dmat):
        # Validate the shape, train while evaluating on the training data
        # itself, then predict on that same data.
        assert dmat.num_row() == expected_rows
        assert dmat.num_col() == n_features
        history: xgb.callback.EvaluationMonitor.EvalsLog = {}
        booster = xgb.train(
            {"tree_method": tree_method, "max_depth": 2},
            dmat,
            num_boost_round=n_rounds,
            evals=[(dmat, "Train")],
            evals_result=history,
            verbose_eval=False,
        )
        return booster.predict(dmat), history

    it_predt, results_from_it = fit_and_predict(xgb.DMatrix(it))

    X, y = it.as_arrays()
    arr_predt, results_from_arrays = fit_and_predict(xgb.DMatrix(X, y))

    if tree_method == "gpu_hist":
        np.testing.assert_allclose(it_predt, arr_predt, rtol=1e-3)
        rtol = 1e-6
    else:
        rtol = 1e-1  # flaky

    np.testing.assert_allclose(
        results_from_it["Train"]["rmse"],
        results_from_arrays["Train"]["rmse"],
        rtol=rtol,
    )
@given(
    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)
)
@settings(deadline=None)
def test_data_iterator(
    n_samples_per_batch: int, n_features: int, n_batches: int
) -> None:
    """Property test of the data iterator with the ``approx`` and ``hist`` tree methods.

    Hypothesis draws the batch size (0..1024), the number of features (1..7)
    and the number of batches (0..13); numpy batches are used for both methods.
    """
    for tree_method in ("approx", "hist"):
        run_data_iterator(
            n_samples_per_batch, n_features, n_batches, tree_method, False
        )

View File

@@ -8,7 +8,7 @@ from io import StringIO
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
from xgboost.compat import DASK_INSTALLED
import pytest
import tempfile
import gc
import xgboost as xgb
import numpy as np
import platform
@@ -143,10 +143,35 @@ def skip_s390x():
return {"condition": condition, "reason": reason}
class IteratorForTest(xgb.core.DataIter):
    """Data iterator for tests that serves pre-built batches one at a time.

    ``X`` and ``y`` are equally long sequences of per-batch features and
    labels.  ``"./"`` is passed to the base-class constructor.
    """

    def __init__(self, X, y):
        assert len(X) == len(y)
        self.X = X
        self.y = y
        # Index of the batch that the following ``next`` call will hand out.
        self.it = 0
        super().__init__("./")

    def next(self, input_data):
        """Pass the current batch to ``input_data``.

        Returns 1 after a batch was provided and 0 once all batches are used up.
        """
        if self.it != len(self.X):
            # Use copy to make sure the iterator doesn't hold a reference to the data.
            # The copies are created inline so nothing here keeps them alive
            # after the call returns.
            input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
            gc.collect()  # clear up the copy, see if XGBoost access freed memory.
            self.it += 1
            return 1
        return 0

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def as_arrays(self):
        """Return all feature batches and all label batches concatenated along axis 0."""
        return np.concatenate(self.X, axis=0), np.concatenate(self.y, axis=0)
# Contains a dataset in numpy format as well as the relevant objective and metric
class TestDataset:
def __init__(self, name, get_dataset, objective, metric
):
def __init__(self, name, get_dataset, objective, metric):
self.name = name
self.objective = objective
self.metric = metric
@@ -171,16 +196,23 @@ class TestDataset:
return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)
def get_external_dmat(self):
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, 'tmptmp_1234.csv')
np.savetxt(path,
np.hstack((self.y.reshape(len(self.y), 1), self.X)),
delimiter=',')
assert os.path.exists(path)
uri = path + '?format=csv&label_column=0#tmptmp_'
# The uri looks like:
# 'tmptmp_1234.csv?format=csv&label_column=0#tmptmp_'
return xgb.DMatrix(uri, weight=self.w, base_margin=self.margin)
n_samples = self.X.shape[0]
n_batches = 10
per_batch = n_samples // n_batches + 1
predictor = []
response = []
for i in range(n_batches):
beg = i * per_batch
end = min((i + 1) * per_batch, n_samples)
assert end != beg
X = self.X[beg: end, ...]
y = self.y[beg: end]
predictor.append(X)
response.append(y)
it = IteratorForTest(predictor, response)
return xgb.DMatrix(it)
def __repr__(self):
return self.name