Use weakref instead of id for DataIter cache. (#9445)

- Fix case where Python reuses id from freed objects.
- Small optimization to column matrix with QDM by using `realloc` instead of copying data.
2023-08-10 00:40:06 +08:00
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions
--- a/tests/python/test_quantile_dmatrix.py
+++ b/tests/python/test_quantile_dmatrix.py
@@ -103,12 +103,29 @@ class TestQuantileDMatrix:
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
-                None
+                None,
            )
        Xy = xgb.QuantileDMatrix(it)
        assert Xy.num_row() == n_samples_per_batch * n_batches
        assert Xy.num_col() == n_features

+    def test_different_size(self) -> None:  # QuantileDMatrix from an iterator vs. from plain arrays
+        n_samples_per_batch = 317
+        n_features = 8
+        n_batches = 7
+
+        it = IteratorForTest(
+            *make_batches(
+                n_samples_per_batch, n_features, n_batches, False, vary_size=True  # presumably varies rows per batch; confirm in make_batches
+            ),
+            cache=None,
+        )
+        Xy = xgb.QuantileDMatrix(it)
+        assert Xy.num_row() == 2429  # note 317 * 7 == 2219, so the total is not n_samples_per_batch * n_batches
+        X, y, w = it.as_arrays()  # unpacked and passed as data, label and weight for the reference matrix
+        Xy1 = xgb.QuantileDMatrix(X, y, weight=w)
+        assert predictor_equal(Xy, Xy1)  # iterator-built and array-built matrices must compare equal
+
    @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
    def test_training(self, sparsity: float) -> None:
        n_samples_per_batch = 317
@@ -123,7 +140,7 @@ class TestQuantileDMatrix:
                *make_batches_sparse(
                    n_samples_per_batch, n_features, n_batches, sparsity
                ),
-                None
+                None,
            )

        parameters = {"tree_method": "hist", "max_bin": 256}