Use weakref instead of id for DataIter cache. (#9445)

- Fix case where Python reuses id from freed objects.
- Small optimization to column matrix with QDM by using `realloc` instead of copying data.
This commit is contained in:
Jiaming Yuan
2023-08-10 00:40:06 +08:00
committed by GitHub
parent d495a180d8
commit f05a23b41c
14 changed files with 193 additions and 63 deletions

View File

@@ -8,7 +8,9 @@ from typing import (
Callable,
Dict,
List,
Optional,
Sequence,
Tuple,
Type,
TypeVar,
Union,
@@ -20,8 +22,6 @@ import numpy as np
DataType = Any
# xgboost accepts some other possible types in practice due to historical reason, which is
# lesser tested. For now we encourage users to pass a simple list of string.
FeatureInfo = Sequence[str]
FeatureNames = FeatureInfo
FeatureTypes = FeatureInfo
@@ -97,6 +97,13 @@ else:
ctypes._Pointer,
]
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
# from being freed.
TransformedData = Tuple[
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
]
# template parameter
_T = TypeVar("_T")
_F = TypeVar("_F", bound=Callable[..., Any])

View File

@@ -9,6 +9,7 @@ import os
import re
import sys
import warnings
import weakref
from abc import ABC, abstractmethod
from collections.abc import Mapping
from enum import IntEnum, unique
@@ -51,6 +52,7 @@ from ._typing import (
FeatureTypes,
ModelIn,
NumpyOrCupy,
TransformedData,
c_bst_ulong,
)
from .compat import PANDAS_INSTALLED, DataFrame, py_str
@@ -486,7 +488,16 @@ def _prediction_output(
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
"""The interface for user defined data iterator.
"""The interface for user defined data iterator. The iterator facilitates
distributed training, :py:class:`QuantileDMatrix`, and external memory support using
:py:class:`DMatrix`. Most of the time, users don't need to interact with this class
directly.
.. note::
The class caches some intermediate results using the `data` input (predictor
`X`) as key. Don't repeat the `X` for multiple batches with different metadata
(like `label`), make a copy if necessary.
Parameters
----------
@@ -510,13 +521,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
self._allow_host = True
self._release = release_data
# Stage data in Python until reset or next is called to avoid data being freed.
self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
self._input_id: int = 0
self._temporary_data: Optional[TransformedData] = None
self._data_ref: Optional[weakref.ReferenceType] = None
def get_callbacks(
self, allow_host: bool, enable_categorical: bool
) -> Tuple[Callable, Callable]:
"""Get callback functions for iterating in C."""
"""Get callback functions for iterating in C. This is an internal function."""
assert hasattr(self, "cache_prefix"), "__init__ is not called."
self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
self._reset_wrapper
@@ -591,7 +602,19 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
from .data import _proxy_transform, dispatch_proxy_set_data
# Reduce the amount of transformation that's needed for QuantileDMatrix.
if self._temporary_data is not None and id(data) == self._input_id:
#
# To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
# GPU. If the QDM has only one batch of input (most of the cases), we can
# avoid transforming the data repeatedly.
try:
ref = weakref.ref(data)
except TypeError:
ref = None
if (
self._temporary_data is not None
and ref is not None
and ref is self._data_ref
):
new, cat_codes, feature_names, feature_types = self._temporary_data
else:
new, cat_codes, feature_names, feature_types = _proxy_transform(
@@ -608,7 +631,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
feature_types=feature_types,
**kwargs,
)
self._input_id = id(data)
self._data_ref = ref
# pylint: disable=not-callable
return self._handle_exception(lambda: self.next(input_data), 0)
@@ -1134,7 +1157,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
testing purposes. If this is a quantized DMatrix then quantized values are
returned instead of input values.
.. versionadded:: 1.7.0
.. versionadded:: 1.7.0
"""
indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
@@ -1155,7 +1178,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
return ret
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
"""Get quantile cuts for quantization."""
"""Get quantile cuts for quantization.
.. versionadded:: 2.0.0
"""
n_features = self.num_col()
c_sindptr = ctypes.c_char_p()

View File

@@ -5,7 +5,7 @@ import ctypes
import json
import os
import warnings
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast
import numpy as np
@@ -17,6 +17,7 @@ from ._typing import (
FloatCompatible,
NumpyDType,
PandasDType,
TransformedData,
c_bst_ulong,
)
from .compat import DataFrame, lazy_isinstance
@@ -1268,12 +1269,7 @@ def _proxy_transform(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
) -> Tuple[
Union[bool, ctypes.c_void_p, np.ndarray],
Optional[list],
Optional[FeatureNames],
Optional[FeatureTypes],
]:
) -> TransformedData:
if _is_cudf_df(data) or _is_cudf_ser(data):
return _transform_cudf_df(
data, feature_names, feature_types, enable_categorical

View File

@@ -230,7 +230,7 @@ class IteratorForTest(xgb.core.DataIter):
def as_arrays(
self,
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
if isinstance(self.X[0], sparse.csr_matrix):
X = sparse.vstack(self.X, format="csr")
else:
@@ -244,7 +244,12 @@ class IteratorForTest(xgb.core.DataIter):
def make_batches(
n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
n_samples_per_batch: int,
n_features: int,
n_batches: int,
use_cupy: bool = False,
*,
vary_size: bool = False,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
X = []
y = []
@@ -255,10 +260,11 @@ def make_batches(
rng = cupy.random.RandomState(1994)
else:
rng = np.random.RandomState(1994)
for _ in range(n_batches):
_X = rng.randn(n_samples_per_batch, n_features)
_y = rng.randn(n_samples_per_batch)
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
for i in range(n_batches):
n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
_X = rng.randn(n_samples, n_features)
_y = rng.randn(n_samples)
_w = rng.uniform(low=0, high=1, size=n_samples)
X.append(_X)
y.append(_y)
w.append(_w)