Use weakref instead of id for DataIter cache. (#9445)

- Fix the case where Python reuses the id of a freed object.
- Small optimization to the column matrix with QDM (QuantileDMatrix): use `realloc` instead of copying data.
2023-08-10 00:40:06 +08:00
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions
--- a/tests/python/test_data_iterator.py
+++ b/tests/python/test_data_iterator.py
@@ -1,4 +1,5 @@
-from typing import Callable, Dict, List
+import weakref
+from typing import Any, Callable, Dict, List

 import numpy as np
 import pytest
@@ -179,5 +180,18 @@ def test_data_cache() -> None:
    data = make_batches(n_samples_per_batch, n_features, n_batches, False)
    batches = [v[0] for v in data]
    it = IterForCacheTest(*batches)
+    transform = xgb.data._proxy_transform
+
+    called = 0
+
+    def mock(*args: Any, **kwargs: Any) -> Any:
+        nonlocal called
+        called += 1
+        return transform(*args, **kwargs)
+
+    xgb.data._proxy_transform = mock
    xgb.QuantileDMatrix(it)
-    assert it._input_id == id(batches[0])
+    assert it._data_ref is weakref.ref(batches[0])
+    assert called == 1
+
+    xgb.data._proxy_transform = transform
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -103,12 +103,29 @@ class TestQuantileDMatrix:
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
-                None
+                None,
            )
        Xy = xgb.QuantileDMatrix(it)
        assert Xy.num_row() == n_samples_per_batch * n_batches
        assert Xy.num_col() == n_features

+    def test_different_size(self) -> None:
+        n_samples_per_batch = 317
+        n_features = 8
+        n_batches = 7
+
+        it = IteratorForTest(
+            *make_batches(
+                n_samples_per_batch, n_features, n_batches, False, vary_size=True
+            ),
+            cache=None,
+        )
+        Xy = xgb.QuantileDMatrix(it)
+        assert Xy.num_row() == 2429
+        X, y, w = it.as_arrays()
+        Xy1 = xgb.QuantileDMatrix(X, y, weight=w)
+        assert predictor_equal(Xy, Xy1)
+
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
    def test_training(self, sparsity: float) -> None:
        n_samples_per_batch = 317
@@ -123,7 +140,7 @@ class TestQuantileDMatrix:
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
-                None
+                None,
            )

        parameters = {"tree_method": "hist", "max_bin": 256}