Use weakref instead of id for DataIter cache. (#9445)

- Fix case where Python reuses id from freed objects.
- Small optimization to column matrix with QDM by using `realloc` instead of copying data.
This commit is contained in:
Jiaming Yuan
2023-08-10 00:40:06 +08:00
committed by GitHub
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions

View File

@@ -42,6 +42,7 @@ class LintersPaths:
"demo/guide-python/feature_weights.py",
"demo/guide-python/sklearn_parallel.py",
"demo/guide-python/spark_estimator_examples.py",
"demo/guide-python/external_memory.py",
"demo/guide-python/individual_trees.py",
"demo/guide-python/quantile_regression.py",
"demo/guide-python/multioutput_regression.py",

View File

@@ -2,15 +2,26 @@
* Copyright 2018-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/base.h> // for bst_bin_t
#include <xgboost/context.h> // for Context
#include <xgboost/data.h> // for BatchIterator, BatchSet, DMatrix, Met...
#include "../../../src/common/column_matrix.h"
#include "../helpers.h"
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, uint16_t, uint8_t
#include <limits> // for numeric_limits
#include <memory> // for shared_ptr, __shared_ptr_access, allo...
#include <type_traits> // for remove_reference_t
#include "../../../src/common/column_matrix.h" // for ColumnMatrix, Column, DenseColumnIter
#include "../../../src/common/hist_util.h" // for DispatchBinType, BinTypeSize, Index
#include "../../../src/common/ref_resource_view.h" // for RefResourceView
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix
#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h" // for RandomDataGenerator, NumpyArrayIterFo...
namespace xgboost {
namespace common {
TEST(DenseColumn, Test) {
namespace xgboost::common {
TEST(ColumnMatrix, Basic) {
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -22,7 +33,7 @@ TEST(DenseColumn, Test) {
GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
ColumnMatrix column_matrix;
for (auto const& page : dmat->GetBatches<SparsePage>()) {
column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads());
}
ASSERT_GE(column_matrix.GetTypeSize(), last);
ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize);
@@ -59,7 +70,7 @@ void CheckSparseColumn(SparseColumnIter<BinIdxType>* p_col, const GHistIndexMatr
}
}
TEST(SparseColumn, Test) {
TEST(ColumnMatrix, SparseColumn) {
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -69,7 +80,7 @@ TEST(SparseColumn, Test) {
GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
ColumnMatrix column_matrix;
for (auto const& page : dmat->GetBatches<SparsePage>()) {
column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads());
}
common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
using T = decltype(dtype);
@@ -90,7 +101,7 @@ void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
}
}
TEST(DenseColumnWithMissing, Test) {
TEST(ColumnMatrix, DenseColumnWithMissing) {
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
@@ -100,7 +111,7 @@ TEST(DenseColumnWithMissing, Test) {
GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
ColumnMatrix column_matrix;
for (auto const& page : dmat->GetBatches<SparsePage>()) {
column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads());
}
ASSERT_TRUE(column_matrix.AnyMissing());
DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
@@ -110,5 +121,29 @@ TEST(DenseColumnWithMissing, Test) {
});
}
}
} // namespace common
} // namespace xgboost
// Regression test: after QDM construction grows the column matrix across batches
// (resize via realloc rather than copy — see commit message), the missing-value
// bitfield must be sized exactly for the final n_rows * n_cols element count.
TEST(ColumnMatrix, GrowMissing) {
  float sparsity = 0.5;  // half the entries missing, so AnyMissing() paths are exercised
  NumpyArrayIterForTest iter(sparsity);
  auto n_threads = 0;  // 0 lets the Context pick the default thread count
  bst_bin_t n_bins = 16;
  BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()};
  Context ctx;
  // Build a QuantileDMatrix-style matrix from the iterator; NaN marks missing values.
  auto m = std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
                                                    std::numeric_limits<float>::quiet_NaN(),
                                                    n_threads, n_bins);
  for (auto const& page : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
    auto const& column_matrix = page.Transpose();
    auto const& missing = column_matrix.Missing();
    // The missing bitfield storage must match the exact computed size for all
    // elements — no stale capacity from earlier, smaller batches.
    auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols();
    auto expected = std::remove_reference_t<decltype(missing)>::BitFieldT::ComputeStorageSize(n);
    auto got = missing.storage.size();
    ASSERT_EQ(expected, got);
    // Verify column contents survive the grow: dispatch on the stored bin width
    // and check each value against the gradient index page.
    DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
      using T = decltype(dtype);
      auto col = column_matrix.DenseColumn<T, true>(0);
      CheckColumWithMissingValue(col, page);
    });
  }
}
} // namespace xgboost::common

View File

@@ -119,6 +119,20 @@ TEST(IO, Resource) {
for (std::size_t i = n; i < 2 * n; ++i) {
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
}
ptr = malloc_resource->DataAs<std::uint8_t>();
std::fill_n(ptr, malloc_resource->Size(), 7);
if (force_malloc) {
malloc_resource->Resize<true>(n * 3, std::byte{3});
} else {
malloc_resource->Resize<false>(n * 3, std::byte{3});
}
for (std::size_t i = 0; i < n * 2; ++i) {
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 7);
}
for (std::size_t i = n * 2; i < n * 3; ++i) {
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 3);
}
};
test_malloc_resize(true);
test_malloc_resize(false);

View File

@@ -12,8 +12,7 @@
#include "../helpers.h"
#include "xgboost/data.h" // DMatrix
namespace xgboost {
namespace data {
namespace xgboost::data {
TEST(IterativeDMatrix, Ref) {
Context ctx;
TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
@@ -21,7 +20,7 @@ TEST(IterativeDMatrix, Ref) {
}
TEST(IterativeDMatrix, IsDense) {
int n_bins = 16;
bst_bin_t n_bins = 16;
auto test = [n_bins](float sparsity) {
NumpyArrayIterForTest iter(sparsity);
auto n_threads = 0;
@@ -38,5 +37,4 @@ TEST(IterativeDMatrix, IsDense) {
test(0.1);
test(1.0);
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -1,4 +1,5 @@
from typing import Callable, Dict, List
import weakref
from typing import Any, Callable, Dict, List
import numpy as np
import pytest
@@ -179,5 +180,18 @@ def test_data_cache() -> None:
data = make_batches(n_samples_per_batch, n_features, n_batches, False)
batches = [v[0] for v in data]
it = IterForCacheTest(*batches)
transform = xgb.data._proxy_transform
called = 0
def mock(*args: Any, **kwargs: Any) -> Any:
nonlocal called
called += 1
return transform(*args, **kwargs)
xgb.data._proxy_transform = mock
xgb.QuantileDMatrix(it)
assert it._input_id == id(batches[0])
assert it._data_ref is weakref.ref(batches[0])
assert called == 1
xgb.data._proxy_transform = transform

View File

@@ -103,12 +103,29 @@ class TestQuantileDMatrix:
*make_batches_sparse(
n_samples_per_batch, n_features, n_batches, sparsity
),
None
None,
)
Xy = xgb.QuantileDMatrix(it)
assert Xy.num_row() == n_samples_per_batch * n_batches
assert Xy.num_col() == n_features
def test_different_size(self) -> None:
    """QuantileDMatrix built from batches of varying sizes must match one
    built from the concatenated arrays."""
    n_samples_per_batch = 317
    n_features = 8
    n_batches = 7
    it = IteratorForTest(
        *make_batches(
            # vary_size=True makes each batch a different length, exercising
            # the column-matrix grow/realloc path.
            n_samples_per_batch, n_features, n_batches, False, vary_size=True
        ),
        cache=None,
    )
    Xy = xgb.QuantileDMatrix(it)
    # 2429 is the known total row count produced by make_batches with
    # vary_size=True for these parameters (not n_samples_per_batch * n_batches).
    assert Xy.num_row() == 2429
    # The iterator-built matrix must predict identically to one constructed
    # from the same data passed as whole arrays.
    X, y, w = it.as_arrays()
    Xy1 = xgb.QuantileDMatrix(X, y, weight=w)
    assert predictor_equal(Xy, Xy1)
@pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9])
def test_training(self, sparsity: float) -> None:
n_samples_per_batch = 317
@@ -123,7 +140,7 @@ class TestQuantileDMatrix:
*make_batches_sparse(
n_samples_per_batch, n_features, n_batches, sparsity
),
None
None,
)
parameters = {"tree_method": "hist", "max_bin": 256}