Use weakref instead of id for DataIter cache. (#9445)
- Fix case where Python reuses id from freed objects. - Small optimization to column matrix with QDM by using `realloc` instead of copying data.
This commit is contained in:
@@ -8,7 +8,9 @@ from typing import (
|
||||
Callable,
|
||||
Dict,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
Type,
|
||||
TypeVar,
|
||||
Union,
|
||||
@@ -20,8 +22,6 @@ import numpy as np
|
||||
|
||||
DataType = Any
|
||||
|
||||
# xgboost accepts some other possible types in practice due to historical reasons, which are
|
||||
# less tested. For now we encourage users to pass a simple list of strings.
|
||||
FeatureInfo = Sequence[str]
|
||||
FeatureNames = FeatureInfo
|
||||
FeatureTypes = FeatureInfo
|
||||
@@ -97,6 +97,13 @@ else:
|
||||
ctypes._Pointer,
|
||||
]
|
||||
|
||||
# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check.
|
||||
# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it
|
||||
# being freed.
|
||||
TransformedData = Tuple[
|
||||
Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes]
|
||||
]
|
||||
|
||||
# template parameter
|
||||
_T = TypeVar("_T")
|
||||
_F = TypeVar("_F", bound=Callable[..., Any])
|
||||
|
||||
@@ -9,6 +9,7 @@ import os
|
||||
import re
|
||||
import sys
|
||||
import warnings
|
||||
import weakref
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
from enum import IntEnum, unique
|
||||
@@ -51,6 +52,7 @@ from ._typing import (
|
||||
FeatureTypes,
|
||||
ModelIn,
|
||||
NumpyOrCupy,
|
||||
TransformedData,
|
||||
c_bst_ulong,
|
||||
)
|
||||
from .compat import PANDAS_INSTALLED, DataFrame, py_str
|
||||
@@ -486,7 +488,16 @@ def _prediction_output(
|
||||
|
||||
|
||||
class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
"""The interface for user defined data iterator.
|
||||
"""The interface for user defined data iterator. The iterator facilitates
|
||||
distributed training, :py:class:`QuantileDMatrix`, and external memory support using
|
||||
:py:class:`DMatrix`. Most of the time, users don't need to interact with this class
|
||||
directly.
|
||||
|
||||
.. note::
|
||||
|
||||
The class caches some intermediate results using the `data` input (predictor
|
||||
`X`) as key. Don't reuse the same `X` object for multiple batches with different metadata
|
||||
(like `label`), make a copy if necessary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -510,13 +521,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
self._allow_host = True
|
||||
self._release = release_data
|
||||
# Stage data in Python until reset or next is called to avoid data being freed.
|
||||
self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None
|
||||
self._input_id: int = 0
|
||||
self._temporary_data: Optional[TransformedData] = None
|
||||
self._data_ref: Optional[weakref.ReferenceType] = None
|
||||
|
||||
def get_callbacks(
|
||||
self, allow_host: bool, enable_categorical: bool
|
||||
) -> Tuple[Callable, Callable]:
|
||||
"""Get callback functions for iterating in C."""
|
||||
"""Get callback functions for iterating in C. This is an internal function."""
|
||||
assert hasattr(self, "cache_prefix"), "__init__ is not called."
|
||||
self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
|
||||
self._reset_wrapper
|
||||
@@ -591,7 +602,19 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
from .data import _proxy_transform, dispatch_proxy_set_data
|
||||
|
||||
# Reduce the amount of transformation that's needed for QuantileDMatrix.
|
||||
if self._temporary_data is not None and id(data) == self._input_id:
|
||||
#
|
||||
# To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on
|
||||
# GPU. If the QDM has only one batch of input (most of the cases), we can
|
||||
# avoid transforming the data repeatedly.
|
||||
try:
|
||||
ref = weakref.ref(data)
|
||||
except TypeError:
|
||||
ref = None
|
||||
if (
|
||||
self._temporary_data is not None
|
||||
and ref is not None
|
||||
and ref is self._data_ref
|
||||
):
|
||||
new, cat_codes, feature_names, feature_types = self._temporary_data
|
||||
else:
|
||||
new, cat_codes, feature_names, feature_types = _proxy_transform(
|
||||
@@ -608,7 +631,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
|
||||
feature_types=feature_types,
|
||||
**kwargs,
|
||||
)
|
||||
self._input_id = id(data)
|
||||
self._data_ref = ref
|
||||
|
||||
# pylint: disable=not-callable
|
||||
return self._handle_exception(lambda: self.next(input_data), 0)
|
||||
@@ -1134,7 +1157,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
testing purposes. If this is a quantized DMatrix then quantized values are
|
||||
returned instead of input values.
|
||||
|
||||
.. versionadded:: 1.7.0
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
"""
|
||||
indptr = np.empty(self.num_row() + 1, dtype=np.uint64)
|
||||
@@ -1155,7 +1178,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
|
||||
return ret
|
||||
|
||||
def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]:
|
||||
"""Get quantile cuts for quantization."""
|
||||
"""Get quantile cuts for quantization.
|
||||
|
||||
.. versionadded:: 2.0.0
|
||||
|
||||
"""
|
||||
n_features = self.num_col()
|
||||
|
||||
c_sindptr = ctypes.c_char_p()
|
||||
|
||||
@@ -5,7 +5,7 @@ import ctypes
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast
|
||||
from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -17,6 +17,7 @@ from ._typing import (
|
||||
FloatCompatible,
|
||||
NumpyDType,
|
||||
PandasDType,
|
||||
TransformedData,
|
||||
c_bst_ulong,
|
||||
)
|
||||
from .compat import DataFrame, lazy_isinstance
|
||||
@@ -1268,12 +1269,7 @@ def _proxy_transform(
|
||||
feature_names: Optional[FeatureNames],
|
||||
feature_types: Optional[FeatureTypes],
|
||||
enable_categorical: bool,
|
||||
) -> Tuple[
|
||||
Union[bool, ctypes.c_void_p, np.ndarray],
|
||||
Optional[list],
|
||||
Optional[FeatureNames],
|
||||
Optional[FeatureTypes],
|
||||
]:
|
||||
) -> TransformedData:
|
||||
if _is_cudf_df(data) or _is_cudf_ser(data):
|
||||
return _transform_cudf_df(
|
||||
data, feature_names, feature_types, enable_categorical
|
||||
|
||||
@@ -230,7 +230,7 @@ class IteratorForTest(xgb.core.DataIter):
|
||||
|
||||
def as_arrays(
|
||||
self,
|
||||
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
|
||||
) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
|
||||
if isinstance(self.X[0], sparse.csr_matrix):
|
||||
X = sparse.vstack(self.X, format="csr")
|
||||
else:
|
||||
@@ -244,7 +244,12 @@ class IteratorForTest(xgb.core.DataIter):
|
||||
|
||||
|
||||
def make_batches(
|
||||
n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
|
||||
n_samples_per_batch: int,
|
||||
n_features: int,
|
||||
n_batches: int,
|
||||
use_cupy: bool = False,
|
||||
*,
|
||||
vary_size: bool = False,
|
||||
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
|
||||
X = []
|
||||
y = []
|
||||
@@ -255,10 +260,11 @@ def make_batches(
|
||||
rng = cupy.random.RandomState(1994)
|
||||
else:
|
||||
rng = np.random.RandomState(1994)
|
||||
for _ in range(n_batches):
|
||||
_X = rng.randn(n_samples_per_batch, n_features)
|
||||
_y = rng.randn(n_samples_per_batch)
|
||||
_w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
|
||||
for i in range(n_batches):
|
||||
n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch
|
||||
_X = rng.randn(n_samples, n_features)
|
||||
_y = rng.randn(n_samples)
|
||||
_w = rng.uniform(low=0, high=1, size=n_samples)
|
||||
X.append(_X)
|
||||
y.append(_y)
|
||||
w.append(_w)
|
||||
|
||||
Reference in New Issue
Block a user