Export Python Interface for external memory. (#7070)

* Add Python iterator interface. * Add tests. * Add demo. * Add documents. * Handle empty dataset.
2021-07-22 15:15:53 +08:00
parent e64ee6592f
commit e6088366df
34 changed files with 961 additions and 200 deletions
--- a/tests/cpp/data/test_sparse_page_dmatrix.cu
+++ b/tests/cpp/data/test_sparse_page_dmatrix.cu
@@ -77,12 +77,23 @@ TEST(SparsePageDMatrix, RetainEllpackPage) {

  for (size_t i = 0; i < iterators.size(); ++i) {
    ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector());
+    if (i != iterators.size() - 1) {
+      ASSERT_EQ(iterators[i].use_count(), 1);
+    } else {
+      // The last batch is still being held by sparse page DMatrix.
+      ASSERT_EQ(iterators[i].use_count(), 2);
+    }
  }

  // make sure it's const and the caller can not modify the content of page.
  for (auto& page : m->GetBatches<EllpackPage>({0, 32})) {
    static_assert(std::is_const<std::remove_reference_t<decltype(page)>>::value, "");
  }
+
+  // The above iteration clears out all references inside DMatrix.
+  for (auto const& ptr : iterators) {
+    ASSERT_TRUE(ptr.unique());
+  }
 }

 TEST(SparsePageDMatrix, EllpackPageContent) {
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -0,0 +1,32 @@
+import numpy as np
+import xgboost as xgb
+from hypothesis import given, strategies, settings
+import pytest
+import sys
+
+sys.path.append("tests/python")
+from test_data_iterator import SingleBatch, make_batches
+from test_data_iterator import test_single_batch as cpu_single_batch
+from test_data_iterator import run_data_iterator
+from testing import IteratorForTest, no_cupy
+
+
+def test_gpu_single_batch() -> None:  # Re-run the single-batch test from test_data_iterator with "gpu_hist".
+    cpu_single_batch("gpu_hist")
+
+
+@pytest.mark.skipif(**no_cupy())
+@given(
+    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)  # samples per batch, features, batches
+)
+@settings(deadline=None)
+def test_gpu_data_iterator(  # Property test: run the shared iterator check with the "gpu_hist" tree method.
+    n_samples_per_batch: int, n_features: int, n_batches: int
+) -> None:
+    run_data_iterator(n_samples_per_batch, n_features, n_batches, "gpu_hist", True)  # use_cupy=True: cupy batches
+    run_data_iterator(n_samples_per_batch, n_features, n_batches, "gpu_hist", False)  # use_cupy=False: numpy batches
+
+
+def test_cpu_data_iterator() -> None:
+    """Make sure CPU algorithm can handle GPU inputs"""
+    run_data_iterator(1024, 2, 3, "approx", True)  # 1024 samples per batch, 2 features, 3 batches, use_cupy=True
--- a/tests/python-gpu/test_gpu_demos.py
+++ b/tests/python-gpu/test_gpu_demos.py
@@ -9,7 +9,7 @@ import test_demos as td         # noqa

@pytest.mark.skipif(**tm.no_cupy())
 def test_data_iterator():
-    script = os.path.join(td.PYTHON_DEMO_DIR, 'data_iterator.py')
+    script = os.path.join(td.PYTHON_DEMO_DIR, 'quantile_data_iterator.py')
    cmd = ['python', script]
    subprocess.check_call(cmd)

--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -112,7 +112,6 @@ class TestGPUUpdaters:
           tm.dataset_strategy)
    @settings(deadline=None)
    def test_external_memory(self, param, num_rounds, dataset):
-        pytest.xfail(reason='TestGPUUpdaters::test_external_memory is flaky')
        # We cannot handle empty dataset yet
        assume(len(dataset.y) > 0)
        param['tree_method'] = 'gpu_hist'
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -0,0 +1,135 @@
+import xgboost as xgb
+from xgboost.data import SingleBatchInternalIter as SingleBatch
+import numpy as np
+from testing import IteratorForTest
+from typing import Tuple, List
+import pytest
+from hypothesis import given, strategies, settings
+from scipy.sparse import csr_matrix
+
+
+def make_batches(  # Build n_batches random batches; returns (list of X arrays, list of y arrays).
+    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
+) -> Tuple[List[np.ndarray], List[np.ndarray]]:
+    X = []
+    y = []
+    if use_cupy:
+        import cupy  # local import: cupy is only needed when use_cupy is True
+
+        rng = cupy.random.RandomState(1994)
+    else:
+        rng = np.random.RandomState(1994)  # same fixed seed (1994) as the cupy branch
+    for i in range(n_batches):
+        _X = rng.randn(n_samples_per_batch, n_features)  # one batch of features
+        _y = rng.randn(n_samples_per_batch)  # one label per sample
+        X.append(_X)
+        y.append(_y)
+    return X, y
+
+
+def test_single_batch(tree_method: str = "approx") -> None:  # Train through SingleBatch with several input types and compare model dumps.
+    from sklearn.datasets import load_breast_cancer
+
+    n_rounds = 10
+    X, y = load_breast_cancer(return_X_y=True)
+    X = X.astype(np.float32)
+    y = y.astype(np.float32)
+
+    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))  # float32 arrays wrapped in the single-batch iterator
+    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+
+    Xy = xgb.DMatrix(X, y)  # the same arrays passed to DMatrix directly, as the reference
+    from_dmat = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    assert from_it.get_dump() == from_dmat.get_dump()
+
+    X, y = load_breast_cancer(return_X_y=True, as_frame=True)  # same dataset requested with as_frame=True
+    X = X.astype(np.float32)
+    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
+    from_pd = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    # remove feature info to generate exact same text representation.
+    from_pd.feature_names = None
+    from_pd.feature_types = None
+
+    assert from_pd.get_dump() == from_it.get_dump()
+
+    X, y = load_breast_cancer(return_X_y=True)
+    X = csr_matrix(X)  # scipy CSR input
+    Xy = xgb.DMatrix(SingleBatch(data=X, label=y))
+    from_it = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)  # from_it is rebound to the CSR-trained model
+
+    X, y = load_breast_cancer(return_X_y=True)
+    Xy = xgb.DMatrix(SingleBatch(data=X, label=y), missing=0.0)  # dense input with 0.0 declared as the missing value
+    from_np = xgb.train({"tree_method": tree_method}, Xy, num_boost_round=n_rounds)
+    assert from_np.get_dump() == from_it.get_dump()  # compared against the CSR-trained model above
+
+
+def run_data_iterator(  # Train from an iterator-built DMatrix and from the concatenated arrays, then compare results.
+    n_samples_per_batch: int,
+    n_features: int,
+    n_batches: int,
+    tree_method: str,
+    use_cupy: bool,
+) -> None:
+    n_rounds = 2
+
+    it = IteratorForTest(
+        *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy)
+    )
+    if n_batches == 0:  # no batches: DMatrix construction is expected to raise ValueError
+        with pytest.raises(ValueError, match="1 batch"):
+            Xy = xgb.DMatrix(it)
+        return
+
+    Xy = xgb.DMatrix(it)  # DMatrix built from the batch iterator
+    assert Xy.num_row() == n_samples_per_batch * n_batches
+    assert Xy.num_col() == n_features
+
+    results_from_it: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    from_it = xgb.train(
+        {"tree_method": tree_method, "max_depth": 2},
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=results_from_it,
+        verbose_eval=False,
+    )
+    it_predt = from_it.predict(Xy)
+
+    X, y = it.as_arrays()  # all batches concatenated along axis 0
+    Xy = xgb.DMatrix(X, y)  # reference DMatrix built from the full arrays
+    assert Xy.num_row() == n_samples_per_batch * n_batches
+    assert Xy.num_col() == n_features
+
+    results_from_arrays: xgb.callback.EvaluationMonitor.EvalsLog = {}
+    from_arrays = xgb.train(
+        {"tree_method": tree_method, "max_depth": 2},
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=results_from_arrays,
+        verbose_eval=False,
+    )
+    arr_predt = from_arrays.predict(Xy)
+
+    if tree_method != "gpu_hist":
+        rtol = 1e-1  # flaky: loose tolerance for every tree method other than gpu_hist
+    else:
+        np.testing.assert_allclose(it_predt, arr_predt, rtol=1e-3)  # predictions are compared only for gpu_hist
+        rtol = 1e-6
+
+    np.testing.assert_allclose(  # per-round training RMSE must agree within rtol
+        results_from_it["Train"]["rmse"],
+        results_from_arrays["Train"]["rmse"],
+        rtol=rtol,
+    )
+
+
+@given(
+    strategies.integers(0, 1024), strategies.integers(1, 7), strategies.integers(0, 13)  # samples per batch, features, batches
+)
+@settings(deadline=None)
+def test_data_iterator(  # Property test: run the iterator check with the "approx" and "hist" tree methods.
+    n_samples_per_batch: int, n_features: int, n_batches: int
+) -> None:
+    run_data_iterator(n_samples_per_batch, n_features, n_batches, "approx", False)  # use_cupy=False: numpy batches
+    run_data_iterator(n_samples_per_batch, n_features, n_batches, "hist", False)  # use_cupy=False: numpy batches
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -8,7 +8,7 @@ from io import StringIO
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
 from xgboost.compat import DASK_INSTALLED
 import pytest
-import tempfile
+import gc
 import xgboost as xgb
 import numpy as np
 import platform
@@ -143,10 +143,35 @@ def skip_s390x():
    return {"condition": condition, "reason": reason}


+class IteratorForTest(xgb.core.DataIter):  # Test iterator that hands pre-built (X, y) batches to XGBoost one at a time.
+    def __init__(self, X, y):  # X, y: equal-length sequences of per-batch arrays
+        assert len(X) == len(y)
+        self.X = X
+        self.y = y
+        self.it = 0  # index of the next batch to hand out
+        super().__init__("./")  # NOTE(review): "./" is presumably DataIter's cache prefix — confirm
+
+    def next(self, input_data):  # Pass the current batch to input_data; return 1, or 0 once every batch is consumed.
+        if self.it == len(self.X):
+            return 0
+        # Use copy to make sure the iterator doesn't hold a reference to the data.
+        input_data(data=self.X[self.it].copy(), label=self.y[self.it].copy())
+        gc.collect()            # clear up the copy, see if XGBoost accesses freed memory.
+        self.it += 1
+        return 1
+
+    def reset(self):  # Rewind to the first batch.
+        self.it = 0
+
+    def as_arrays(self):  # Concatenate all batches along axis 0 and return them as one (X, y) pair.
+        X = np.concatenate(self.X, axis=0)
+        y = np.concatenate(self.y, axis=0)
+        return X, y
+
+
 # Contains a dataset in numpy format as well as the relevant objective and metric
 class TestDataset:
-    def __init__(self, name, get_dataset, objective, metric
-                 ):
+    def __init__(self, name, get_dataset, objective, metric):
        self.name = name
        self.objective = objective
        self.metric = metric
@@ -171,16 +196,23 @@ class TestDataset:
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

    def get_external_dmat(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            path = os.path.join(tmpdir, 'tmptmp_1234.csv')
-            np.savetxt(path,
-                       np.hstack((self.y.reshape(len(self.y), 1), self.X)),
-                       delimiter=',')
-            assert os.path.exists(path)
-            uri = path + '?format=csv&label_column=0#tmptmp_'
-            # The uri looks like:
-            # 'tmptmp_1234.csv?format=csv&label_column=0#tmptmp_'
-            return xgb.DMatrix(uri, weight=self.w, base_margin=self.margin)
+        n_samples = self.X.shape[0]
+        n_batches = 10
+        per_batch = n_samples // n_batches + 1  # rows per batch; the last batch may be shorter
+
+        predictor = []
+        response = []
+        for i in range(n_batches):
+            beg = i * per_batch
+            end = min((i + 1) * per_batch, n_samples)  # clamp the final slice to the dataset size
+            assert end != beg  # NOTE(review): fails when a trailing batch is empty, e.g. n_samples == 90 gives per_batch == 10
+            X = self.X[beg: end, ...]
+            y = self.y[beg: end]
+            predictor.append(X)
+            response.append(y)
+
+        it = IteratorForTest(predictor, response)
+        return xgb.DMatrix(it)  # NOTE(review): weight/base_margin are no longer passed, unlike the removed code — confirm intended

    def __repr__(self):
        return self.name