Use weakref instead of id for DataIter cache. (#9445)

- Fix the case where Python reuses the `id` of a freed object.
- Small optimization to the column matrix with QuantileDMatrix (QDM): use `realloc` instead of copying data.
This commit is contained in:
Jiaming Yuan
2023-08-10 00:40:06 +08:00
committed by GitHub
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions

View File

@@ -103,12 +103,29 @@ class TestQuantileDMatrix:
*make_batches_sparse(
n_samples_per_batch, n_features, n_batches, sparsity
),
None
None,
)
Xy = xgb.QuantileDMatrix(it)
assert Xy.num_row() == n_samples_per_batch * n_batches
assert Xy.num_col() == n_features
def test_different_size(self) -> None:
    """Build a QuantileDMatrix from an iterator created with ``vary_size=True``.

    The matrix built from the iterator must report the expected total row
    count and must compare equal (via ``predictor_equal``) to a matrix built
    directly from the arrays returned by the iterator's ``as_arrays()``.
    """
    rows_per_batch, feature_count, batch_count = 317, 8, 7

    batches = make_batches(
        rows_per_batch, feature_count, batch_count, False, vary_size=True
    )
    data_iter = IteratorForTest(*batches, cache=None)

    from_iter = xgb.QuantileDMatrix(data_iter)
    # NOTE(review): 2429 != 317 * 7 (2219), so the batches presumably do not
    # all have 317 rows when vary_size=True -- confirm against make_batches.
    assert from_iter.num_row() == 2429

    # Reference matrix built in one shot from the iterator's arrays.
    X, y, w = data_iter.as_arrays()
    from_arrays = xgb.QuantileDMatrix(X, y, weight=w)
    assert predictor_equal(from_iter, from_arrays)
@pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
def test_training(self, sparsity: float) -> None:
n_samples_per_batch = 317
@@ -123,7 +140,7 @@ class TestQuantileDMatrix:
*make_batches_sparse(
n_samples_per_batch, n_features, n_batches, sparsity
),
None
None,
)
parameters = {"tree_method": "hist", "max_bin": 256}