Use weakref instead of id for DataIter cache. (#9445)

- Fix case where Python reuses id from freed objects.
- Small optimization to column matrix with QDM by using `realloc` instead of copying data.
This commit is contained in:
Jiaming Yuan
2023-08-10 00:40:06 +08:00
committed by GitHub
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions

View File

@@ -8,7 +8,9 @@ from typing import (
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
@@ -20,8 +22,6 @@ import numpy as np
DataType = Any
# xgboost accepts some other possible types in practice due to historical reason, which is
# lesser tested. For now we encourage users to pass a simple list of string.
FeatureInfo = Sequence[str]
FeatureNames = FeatureInfo
FeatureTypes = FeatureInfo
@@ -97,6 +97,13 @@ else:
ctypes._Pointer,
]
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
# from being freed.
TransformedData = Tuple[
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
]
# template parameter
_T = TypeVar("_T")
_F = TypeVar("_F", bound=Callable[..., Any])

View File

@@ -9,6 +9,7 @@ import os
import re
import sys
import warnings
import weakref
from abc import ABC, abstractmethod
from collections.abc import Mapping
from enum import IntEnum, unique
@@ -51,6 +52,7 @@ from ._typing import (
FeatureTypes,
ModelIn,
NumpyOrCupy,
TransformedData,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
@@ -486,7 +488,16 @@ def _prediction_output(
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
"""The interface for user defined data iterator.
"""The interface for user defined data iterator. The iterator facilitates
distributed training, :py:class:`QuantileDMatrix`, and external memory support using
:py:class:`DMatrix`. Most of the time, users don't need to interact with this class
directly.
.. note::
The class caches some intermediate results using the `data` input (predictor
`X`) as key. Don't repeat the `X` for multiple batches with different metadata
(like `label`), make a copy if necessary.
Parameters
----------
@@ -510,13 +521,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
self._allow_host = True
self._release = release_data
# Stage data in Python until reset or next is called to avoid data being freed.
self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
self._input_id: int = 0
self._temporary_data: Optional[TransformedData] = None
self._data_ref: Optional[weakref.ReferenceType] = None
def get_callbacks(
self, allow_host: bool, enable_categorical: bool
) -> Tuple[Callable, Callable]:
"""Get callback functions for iterating in C."""
"""Get callback functions for iterating in C. This is an internal function."""
assert hasattr(self, "cache_prefix"), "__init__ is not called."
self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
self._reset_wrapper
@@ -591,7 +602,19 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
from .data import _proxy_transform, dispatch_proxy_set_data
# Reduce the amount of transformation that's needed for QuantileDMatrix.
if self._temporary_data is not None and id(data) == self._input_id:
#
# To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
# GPU. If the QDM has only one batch of input (most of the cases), we can
# avoid transforming the data repeatedly.
try:
ref = weakref.ref(data)
except TypeError:
ref = None
if (
self._temporary_data is not None
and ref is not None
and ref is self._data_ref
):
new, cat_codes, feature_names, feature_types = self._temporary_data
else:
new, cat_codes, feature_names, feature_types = _proxy_transform(
@@ -608,7 +631,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
feature_types=feature_types,
**kwargs,
)
self._input_id = id(data)
self._data_ref = ref
# pylint: disable=not-callable
return self._handle_exception(lambda: self.next(input_data), 0)
@@ -1134,7 +1157,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
testing purposes. If this is a quantized DMatrix then quantized values are
returned instead of input values.
.. versionadded:: 1.7.0
.. versionadded:: 1.7.0
"""
indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
@@ -1155,7 +1178,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
return ret
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
"""Get quantile cuts for quantization."""
"""Get quantile cuts for quantization.
.. versionadded:: 2.0.0
"""
n_features = self.num_col()
c_sindptr = ctypes.c_char_p()

View File

@@ -5,7 +5,7 @@ import ctypes
import json
import os
import warnings
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast
import numpy as np
@@ -17,6 +17,7 @@ from ._typing import (
FloatCompatible,
NumpyDType,
PandasDType,
TransformedData,
c_bst_ulong,
)
from .compat import DataFrame, lazy_isinstance
@@ -1268,12 +1269,7 @@ def _proxy_transform(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[
Union[bool, ctypes.c_void_p, np.ndarray],
Optional[list],
Optional[FeatureNames],
Optional[FeatureTypes],
]:
) -> TransformedData:
if _is_cudf_df(data) or _is_cudf_ser(data):
return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical

View File

@@ -230,7 +230,7 @@ class IteratorForTest(xgb.core.DataIter):
def as_arrays(
self,
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
if isinstance(self.X[0], sparse.csr_matrix):
X = sparse.vstack(self.X, format="csr")
else:
@@ -244,7 +244,12 @@ class IteratorForTest(xgb.core.DataIter):
def make_batches(
n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
n_samples_per_batch: int,
n_features: int,
n_batches: int,
use_cupy: bool = False,
*,
vary_size: bool = False,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
X = []
y = []
@@ -255,10 +260,11 @@ def make_batches(
rng = cupy.random.RandomState(1994)
else:
rng = np.random.RandomState(1994)
for _ in range(n_batches):
_X = rng.randn(n_samples_per_batch, n_features)
_y = rng.randn(n_samples_per_batch)
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
for i in range(n_batches):
n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
_X = rng.randn(n_samples, n_features)
_y = rng.randn(n_samples)
_w = rng.uniform(low=0, high=1, size=n_samples)
X.append(_X)
y.append(_y)
w.append(_w)