Move Python testing utilities into xgboost module. (#8379)

- Add typehints.
- Fixes for pylint.

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
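With the utilities now published as the ``xgboost.testing`` module, test suites can import them from the installed package rather than from a repository-relative helper file. A minimal sketch of the intended usage (the ``tm`` alias and the test name are illustrative conventions, not part of this commit):

    import pytest
    import xgboost.testing as tm

    @pytest.mark.skipif(**tm.no_sklearn())
    def test_something_with_sklearn() -> None:
        ...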
@@ -65,7 +65,7 @@ def _check_rf_callback(
)


-_SklObjective = Optional[
+SklObjective = Optional[
    Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]]
]

@@ -144,7 +144,7 @@ __model_doc = f"""
        Boosting learning rate (xgb's "eta")
    verbosity : Optional[int]
        The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
-    objective : {_SklObjective}
+    objective : {SklObjective}
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    booster: Optional[str]
@@ -546,7 +546,7 @@ class XGBModel(XGBModelBase):
        learning_rate: Optional[float] = None,
        n_estimators: int = 100,
        verbosity: Optional[int] = None,
-        objective: _SklObjective = None,
+        objective: SklObjective = None,
        booster: Optional[str] = None,
        tree_method: Optional[str] = None,
        n_jobs: Optional[int] = None,
@@ -1409,7 +1409,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
    def __init__(
        self,
        *,
-        objective: _SklObjective = "binary:logistic",
+        objective: SklObjective = "binary:logistic",
        use_label_encoder: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
@@ -1712,7 +1712,7 @@ class XGBRegressor(XGBModel, XGBRegressorBase):
    # pylint: disable=missing-docstring
    @_deprecate_positional_args
    def __init__(
-        self, *, objective: _SklObjective = "reg:squarederror", **kwargs: Any
+        self, *, objective: SklObjective = "reg:squarederror", **kwargs: Any
    ) -> None:
        super().__init__(objective=objective, **kwargs)
@@ -1,64 +0,0 @@
"""Utilities for defining Python tests."""

import socket
from platform import system
from typing import Any, TypedDict

PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})


def has_ipv6() -> bool:
    """Check whether IPv6 is enabled on this host."""
    # connection error in macos, still need some fixes.
    if system() not in ("Linux", "Windows"):
        return False

    if socket.has_ipv6:
        try:
            with socket.socket(
                socket.AF_INET6, socket.SOCK_STREAM
            ) as server, socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as client:
                server.bind(("::1", 0))
                port = server.getsockname()[1]
                server.listen()

                client.connect(("::1", port))
                conn, _ = server.accept()

                client.sendall("abc".encode())
                msg = conn.recv(3).decode()
                # if the code can be executed to this point, the message should be
                # correct.
                assert msg == "abc"
                return True
        except OSError:
            pass
    return False


def skip_ipv6() -> PytestSkip:
    """PyTest skip mark for IPv6."""
    return {"condition": not has_ipv6(), "reason": "IPv6 is required to be enabled."}


def timeout(sec: int, *args: Any, enable: bool = True, **kwargs: Any) -> Any:
    """Make a pytest mark for the `pytest-timeout` package.

    Parameters
    ----------
    sec :
        Timeout seconds.
    enable :
        Control whether timeout should be applied, used for debugging.

    Returns
    -------
    pytest.mark.timeout
    """
    import pytest  # pylint: disable=import-error

    # This is disabled for now due to regression caused by conflicts between federated
    # learning build and the CI container environment.
    if enable:
        return pytest.mark.timeout(sec, *args, **kwargs)
    return pytest.mark.timeout(None, *args, **kwargs)
python-package/xgboost/testing/__init__.py  (new file, 890 lines)
@@ -0,0 +1,890 @@
"""Utilities for defining Python tests. The module is private and subject to frequent
change without notice.

"""
# pylint: disable=invalid-name,missing-function-docstring,import-error
import gc
import importlib.util
import multiprocessing
import os
import platform
import socket
import sys
import urllib
import zipfile
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from io import StringIO
from platform import system
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    TypedDict,
    Union,
)

import numpy as np
import pytest
from scipy import sparse
from xgboost.core import ArrayLike
from xgboost.sklearn import SklObjective

import xgboost as xgb

hypothesis = pytest.importorskip("hypothesis")

# pylint:disable=wrong-import-position,wrong-import-order
from hypothesis import strategies
from hypothesis.extra.numpy import arrays

joblib = pytest.importorskip("joblib")
datasets = pytest.importorskip("sklearn.datasets")

Memory = joblib.Memory

memory = Memory("./cachedir", verbose=0)

PytestSkip = TypedDict("PytestSkip", {"condition": bool, "reason": str})


def has_ipv6() -> bool:
    """Check whether IPv6 is enabled on this host."""
    # connection error in macos, still need some fixes.
    if system() not in ("Linux", "Windows"):
        return False

    if socket.has_ipv6:
        try:
            with socket.socket(
                socket.AF_INET6, socket.SOCK_STREAM
            ) as server, socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as client:
                server.bind(("::1", 0))
                port = server.getsockname()[1]
                server.listen()

                client.connect(("::1", port))
                conn, _ = server.accept()

                client.sendall("abc".encode())
                msg = conn.recv(3).decode()
                # if the code can be executed to this point, the message should be
                # correct.
                assert msg == "abc"
                return True
        except OSError:
            pass
    return False


def no_mod(name: str) -> PytestSkip:
    spec = importlib.util.find_spec(name)
    return {"condition": spec is None, "reason": f"{name} is not installed."}


def no_ipv6() -> PytestSkip:
    """PyTest skip mark for IPv6."""
    return {"condition": not has_ipv6(), "reason": "IPv6 is required to be enabled."}


def no_ubjson() -> PytestSkip:
    return no_mod("ubjson")


def no_sklearn() -> PytestSkip:
    return no_mod("sklearn")


def no_dask() -> PytestSkip:
    return no_mod("dask")


def no_spark() -> PytestSkip:
    return no_mod("pyspark")


def no_pandas() -> PytestSkip:
    return no_mod("pandas")


def no_arrow() -> PytestSkip:
    return no_mod("pyarrow")


def no_modin() -> PytestSkip:
    return no_mod("modin")


def no_dt() -> PytestSkip:
    return no_mod("datatable")


def no_matplotlib() -> PytestSkip:
    reason = "Matplotlib is not installed."
    try:
        import matplotlib.pyplot as _  # noqa

        return {"condition": False, "reason": reason}
    except ImportError:
        return {"condition": True, "reason": reason}


def no_dask_cuda() -> PytestSkip:
    return no_mod("dask_cuda")


def no_cudf() -> PytestSkip:
    return no_mod("cudf")


def no_cupy() -> PytestSkip:
    return no_mod("cupy")


def no_dask_cudf() -> PytestSkip:
    return no_mod("dask_cudf")


def no_json_schema() -> PytestSkip:
    return no_mod("jsonschema")


def no_graphviz() -> PytestSkip:
    return no_mod("graphviz")


def no_multiple(*args: Any) -> PytestSkip:
    condition = False
    reason = ""
    for arg in args:
        condition = condition or arg["condition"]
        if arg["condition"]:
            reason = arg["reason"]
            break
    return {"condition": condition, "reason": reason}


def skip_s390x() -> PytestSkip:
    condition = platform.machine() == "s390x"
    reason = "Known to fail on s390x"
    return {"condition": condition, "reason": reason}
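# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# The PytestSkip dicts returned above unpack directly into pytest's skipif marker,
# and no_multiple() folds several requirements into a single condition.
@pytest.mark.skipif(**no_multiple(no_sklearn(), no_pandas()))
def _example_skip_usage() -> None:
    """Hypothetical test that only runs when scikit-learn and pandas are installed."""
    import sklearn  # noqa
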
class IteratorForTest(xgb.core.DataIter):
    """Iterator for testing streaming DMatrix. (external memory, quantile)"""

    def __init__(
        self,
        X: Sequence,
        y: Sequence,
        w: Optional[Sequence],
        cache: Optional[str] = "./",
    ) -> None:
        assert len(X) == len(y)
        self.X = X
        self.y = y
        self.w = w
        self.it = 0
        super().__init__(cache)

    def next(self, input_data: Callable) -> int:
        if self.it == len(self.X):
            return 0

        with pytest.raises(TypeError, match="keyword args"):
            input_data(self.X[self.it], self.y[self.it], None)

        # Use copy to make sure the iterator doesn't hold a reference to the data.
        input_data(
            data=self.X[self.it].copy(),
            label=self.y[self.it].copy(),
            weight=self.w[self.it].copy() if self.w else None,
        )
        gc.collect()  # clear up the copy, see if XGBoost access freed memory.
        self.it += 1
        return 1

    def reset(self) -> None:
        self.it = 0

    def as_arrays(
        self,
    ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]:
        if isinstance(self.X[0], sparse.csr_matrix):
            X = sparse.vstack(self.X, format="csr")
        else:
            X = np.concatenate(self.X, axis=0)
        y = np.concatenate(self.y, axis=0)
        if self.w:
            w = np.concatenate(self.w, axis=0)
        else:
            w = None
        return X, y, w


def make_batches(
    n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    w = []
    if use_cupy:
        import cupy

        rng = cupy.random.RandomState(1994)
    else:
        rng = np.random.RandomState(1994)
    for _ in range(n_batches):
        _X = rng.randn(n_samples_per_batch, n_features)
        _y = rng.randn(n_samples_per_batch)
        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w
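# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# make_batches() produces per-batch arrays that IteratorForTest can stream into a
# DMatrix, mirroring what TestDataset.get_external_dmat() does further below.
def _example_iterator_usage() -> xgb.DMatrix:
    X, y, w = make_batches(n_samples_per_batch=32, n_features=4, n_batches=3)
    it = IteratorForTest(X, y, w)
    return xgb.DMatrix(it)
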
def make_batches_sparse(
    n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float
) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]:
    X = []
    y = []
    w = []
    rng = np.random.RandomState(1994)
    for _ in range(n_batches):
        _X = sparse.random(
            n_samples_per_batch,
            n_features,
            1.0 - sparsity,
            format="csr",
            dtype=np.float32,
            random_state=rng,
        )
        _y = rng.randn(n_samples_per_batch)
        _w = rng.uniform(low=0, high=1, size=n_samples_per_batch)
        X.append(_X)
        y.append(_y)
        w.append(_w)
    return X, y, w


class TestDataset:
    """Contains a dataset in numpy format as well as the relevant objective and metric."""

    def __init__(
        self, name: str, get_dataset: Callable, objective: str, metric: str
    ) -> None:
        self.name = name
        self.objective = objective
        self.metric = metric
        self.X, self.y = get_dataset()
        self.w: Optional[np.ndarray] = None
        self.margin: Optional[np.ndarray] = None

    def set_params(self, params_in: Dict[str, Any]) -> Dict[str, Any]:
        params_in["objective"] = self.objective
        params_in["eval_metric"] = self.metric
        if self.objective == "multi:softmax":
            params_in["num_class"] = int(np.max(self.y) + 1)
        return params_in

    def get_dmat(self) -> xgb.DMatrix:
        return xgb.DMatrix(
            self.X,
            self.y,
            weight=self.w,
            base_margin=self.margin,
            enable_categorical=True,
        )

    def get_device_dmat(self) -> xgb.DeviceQuantileDMatrix:
        import cupy as cp

        w = None if self.w is None else cp.array(self.w)
        X = cp.array(self.X, dtype=np.float32)
        y = cp.array(self.y, dtype=np.float32)
        return xgb.DeviceQuantileDMatrix(X, y, w, base_margin=self.margin)

    def get_external_dmat(self) -> xgb.DMatrix:
        n_samples = self.X.shape[0]
        n_batches = 10
        per_batch = n_samples // n_batches + 1

        predictor = []
        response = []
        weight = []
        for i in range(n_batches):
            beg = i * per_batch
            end = min((i + 1) * per_batch, n_samples)
            assert end != beg
            X = self.X[beg:end, ...]
            y = self.y[beg:end]
            w = self.w[beg:end] if self.w is not None else None
            predictor.append(X)
            response.append(y)
            if w is not None:
                weight.append(w)

        it = IteratorForTest(predictor, response, weight if weight else None)
        return xgb.DMatrix(it)

    def __repr__(self) -> str:
        return self.name
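# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# A TestDataset bundles data with its objective/metric; set_params() injects them
# into a parameter dict before handing the DMatrix to xgb.train().
def _example_testdataset_usage() -> xgb.Booster:
    dataset = TestDataset(
        "cancer",
        lambda: datasets.load_breast_cancer(return_X_y=True),
        "binary:logistic",
        "logloss",
    )
    params = dataset.set_params({"tree_method": "hist"})
    return xgb.train(params, dataset.get_dmat(), num_boost_round=4)
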
@memory.cache
def get_california_housing() -> Tuple[np.ndarray, np.ndarray]:
    data = datasets.fetch_california_housing()
    return data.data, data.target


@memory.cache
def get_digits() -> Tuple[np.ndarray, np.ndarray]:
    data = datasets.load_digits()
    return data.data, data.target


@memory.cache
def get_cancer() -> Tuple[np.ndarray, np.ndarray]:
    return datasets.load_breast_cancer(return_X_y=True)


@memory.cache
def get_sparse() -> Tuple[np.ndarray, np.ndarray]:
    rng = np.random.RandomState(199)
    n = 2000
    sparsity = 0.75
    X, y = datasets.make_regression(n, random_state=rng)
    flag = rng.binomial(1, sparsity, X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if flag[i, j]:
                X[i, j] = np.nan
    return X, y


@memory.cache
def get_ames_housing() -> Tuple[np.ndarray, np.ndarray]:
    """
    Number of samples: 1460
    Number of features: 20
    Number of categorical features: 10
    Number of numerical features: 10
    """
    from sklearn.datasets import fetch_openml

    X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

    categorical_columns_subset: List[str] = [
        "BldgType",  # 5 cats, no nan
        "GarageFinish",  # 3 cats, nan
        "LotConfig",  # 5 cats, no nan
        "Functional",  # 7 cats, no nan
        "MasVnrType",  # 4 cats, nan
        "HouseStyle",  # 8 cats, no nan
        "FireplaceQu",  # 5 cats, nan
        "ExterCond",  # 5 cats, no nan
        "ExterQual",  # 4 cats, no nan
        "PoolQC",  # 3 cats, nan
    ]

    numerical_columns_subset: List[str] = [
        "3SsnPorch",
        "Fireplaces",
        "BsmtHalfBath",
        "HalfBath",
        "GarageCars",
        "TotRmsAbvGrd",
        "BsmtFinSF1",
        "BsmtFinSF2",
        "GrLivArea",
        "ScreenPorch",
    ]

    X = X[categorical_columns_subset + numerical_columns_subset]
    X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")
    return X, y


@memory.cache
def get_mq2008(
    dpath: str,
) -> Tuple[
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
    sparse.csr_matrix,
    np.ndarray,
    np.ndarray,
]:
    from sklearn.datasets import load_svmlight_files

    src = "https://s3-us-west-2.amazonaws.com/xgboost-examples/MQ2008.zip"
    target = dpath + "/MQ2008.zip"
    if not os.path.exists(target):
        urllib.request.urlretrieve(url=src, filename=target)

    with zipfile.ZipFile(target, "r") as f:
        f.extractall(path=dpath)

    (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    ) = load_svmlight_files(
        (
            dpath + "MQ2008/Fold1/train.txt",
            dpath + "MQ2008/Fold1/test.txt",
            dpath + "MQ2008/Fold1/vali.txt",
        ),
        query_id=True,
        zero_based=False,
    )

    return (
        x_train,
        y_train,
        qid_train,
        x_test,
        y_test,
        qid_test,
        x_valid,
        y_valid,
        qid_valid,
    )


@memory.cache
def make_categorical(
    n_samples: int,
    n_features: int,
    n_categories: int,
    onehot: bool,
    sparsity: float = 0.0,
) -> Tuple[ArrayLike, np.ndarray]:
    import pandas as pd

    rng = np.random.RandomState(1994)

    pd_dict = {}
    for i in range(n_features + 1):
        c = rng.randint(low=0, high=n_categories, size=n_samples)
        pd_dict[str(i)] = pd.Series(c, dtype=np.int64)

    df = pd.DataFrame(pd_dict)
    label = df.iloc[:, 0]
    df = df.iloc[:, 1:]
    for i in range(0, n_features):
        label += df.iloc[:, i]
    label += 1

    df = df.astype("category")
    categories = np.arange(0, n_categories)
    for col in df.columns:
        df[col] = df[col].cat.set_categories(categories)

    if sparsity > 0.0:
        for i in range(n_features):
            index = rng.randint(
                low=0, high=n_samples - 1, size=int(n_samples * sparsity)
            )
            df.iloc[index, i] = np.NaN
            assert n_categories == np.unique(df.dtypes[i].categories).size

    if onehot:
        return pd.get_dummies(df), label
    return df, label
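# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# The categorical frame can be fed directly to DMatrix with enable_categorical=True;
# passing onehot=True above yields a one-hot encoded frame instead.
def _example_categorical_usage() -> xgb.DMatrix:
    X, y = make_categorical(n_samples=128, n_features=3, n_categories=4, onehot=False)
    return xgb.DMatrix(X, y, enable_categorical=True)
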
def _cat_sampled_from() -> strategies.SearchStrategy:
    @strategies.composite
    def _make_cat(draw: Callable) -> Tuple[int, int, int, float]:
        n_samples = draw(strategies.integers(2, 512))
        n_features = draw(strategies.integers(1, 4))
        n_cats = draw(strategies.integers(1, 128))
        sparsity = draw(
            strategies.floats(
                min_value=0,
                max_value=1,
                allow_nan=False,
                allow_infinity=False,
                allow_subnormal=False,
            )
        )
        return n_samples, n_features, n_cats, sparsity

    def _build(args: Tuple[int, int, int, float]) -> TestDataset:
        n_samples = args[0]
        n_features = args[1]
        n_cats = args[2]
        sparsity = args[3]
        return TestDataset(
            f"{n_samples}x{n_features}-{n_cats}-{sparsity}",
            lambda: make_categorical(n_samples, n_features, n_cats, False, sparsity),
            "reg:squarederror",
            "rmse",
        )

    return _make_cat().map(_build)  # pylint: disable=no-member


categorical_dataset_strategy: strategies.SearchStrategy = _cat_sampled_from()


# pylint: disable=too-many-locals
@memory.cache
def make_sparse_regression(
    n_samples: int, n_features: int, sparsity: float, as_dense: bool
) -> Tuple[Union[sparse.csr_matrix], np.ndarray]:
    """Make sparse matrix.

    Parameters
    ----------

    as_dense:

        Return the matrix as np.ndarray with missing values filled by NaN

    """
    if not hasattr(np.random, "default_rng"):
        # old version of numpy on s390x
        rng = np.random.RandomState(1994)
        X = sparse.random(
            m=n_samples,
            n=n_features,
            density=1.0 - sparsity,
            random_state=rng,
            format="csr",
        )
        y = rng.normal(loc=0.0, scale=1.0, size=n_samples)
        return X, y

    # Use multi-thread to speed up the generation, convenient if you use this function
    # for benchmarking.
    n_threads = min(multiprocessing.cpu_count(), n_features)

    def random_csc(t_id: int) -> sparse.csc_matrix:
        rng = np.random.default_rng(1994 * t_id)
        thread_size = n_features // n_threads
        if t_id == n_threads - 1:
            n_features_tloc = n_features - t_id * thread_size
        else:
            n_features_tloc = thread_size

        X = sparse.random(
            m=n_samples,
            n=n_features_tloc,
            density=1.0 - sparsity,
            random_state=rng,
        ).tocsc()
        y = np.zeros((n_samples, 1))

        for i in range(X.shape[1]):
            size = X.indptr[i + 1] - X.indptr[i]
            if size != 0:
                y += X[:, i].toarray() * rng.random((n_samples, 1)) * 0.2

        return X, y

    futures = []
    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        for i in range(n_threads):
            futures.append(executor.submit(random_csc, i))

    X_results = []
    y_results = []
    for f in futures:
        X, y = f.result()
        X_results.append(X)
        y_results.append(y)

    assert len(y_results) == n_threads

    csr: sparse.csr_matrix = sparse.hstack(X_results, format="csr")
    y = np.asarray(y_results)
    y = y.reshape((y.shape[0], y.shape[1])).T
    y = np.sum(y, axis=1)

    assert csr.shape[0] == n_samples
    assert csr.shape[1] == n_features
    assert y.shape[0] == n_samples

    if as_dense:
        arr = csr.toarray()
        assert arr.shape[0] == n_samples
        assert arr.shape[1] == n_features
        arr[arr == 0] = np.nan
        return arr, y

    return csr, y


sparse_datasets_strategy = strategies.sampled_from(
    [
        TestDataset(
            "1e5x8-0.95-csr",
            lambda: make_sparse_regression(int(1e5), 8, 0.95, False),
            "reg:squarederror",
            "rmse",
        ),
        TestDataset(
            "1e5x8-0.5-csr",
            lambda: make_sparse_regression(int(1e5), 8, 0.5, False),
            "reg:squarederror",
            "rmse",
        ),
        TestDataset(
            "1e5x8-0.5-dense",
            lambda: make_sparse_regression(int(1e5), 8, 0.5, True),
            "reg:squarederror",
            "rmse",
        ),
        TestDataset(
            "1e5x8-0.05-csr",
            lambda: make_sparse_regression(int(1e5), 8, 0.05, False),
            "reg:squarederror",
            "rmse",
        ),
        TestDataset(
            "1e5x8-0.05-dense",
            lambda: make_sparse_regression(int(1e5), 8, 0.05, True),
            "reg:squarederror",
            "rmse",
        ),
    ]
)

_unweighted_datasets_strategy = strategies.sampled_from(
    [
        TestDataset(
            "calif_housing", get_california_housing, "reg:squarederror", "rmse"
        ),
        TestDataset(
            "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
        ),
        TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
        TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
        TestDataset(
            "mtreg",
            lambda: datasets.make_regression(n_samples=128, n_targets=3),
            "reg:squarederror",
            "rmse",
        ),
        TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
        TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
        TestDataset(
            "empty",
            lambda: (np.empty((0, 100)), np.empty(0)),
            "reg:squarederror",
            "rmse",
        ),
    ]
)


@strategies.composite
def _dataset_weight_margin(draw: Callable) -> TestDataset:
    data: TestDataset = draw(_unweighted_datasets_strategy)
    if draw(strategies.booleans()):
        data.w = draw(
            arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
        )
    if draw(strategies.booleans()):
        num_class = 1
        if data.objective == "multi:softmax":
            num_class = int(np.max(data.y) + 1)
        elif data.name == "mtreg":
            num_class = data.y.shape[1]

        data.margin = draw(
            arrays(
                np.float64,
                (data.y.shape[0] * num_class),
                elements=strategies.floats(0.5, 1.0),
            )
        )
        assert data.margin is not None
        if num_class != 1:
            data.margin = data.margin.reshape(data.y.shape[0], num_class)

    return data


# A strategy for drawing from a set of example datasets
# May add random weights to the dataset
dataset_strategy = _dataset_weight_margin()


def non_increasing(L: Sequence[float], tolerance: float = 1e-4) -> bool:
    return all((y - x) < tolerance for x, y in zip(L, L[1:]))


def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]:
    """Evaluation metric for xgb.train"""
    label = dtrain.get_label()
    r = np.zeros(predt.shape)
    gt = predt > 0.5
    if predt.size == 0:
        return "CustomErr", np.float64(0.0)
    r[gt] = 1 - label[gt]
    le = predt <= 0.5
    r[le] = label[le]
    return "CustomErr", np.sum(r)
def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> np.float64:
    """Evaluation metric that looks like metrics provided by sklearn."""
    r = np.zeros(y_score.shape)
    gt = y_score > 0.5
    r[gt] = 1 - y_true[gt]
    le = y_score <= 0.5
    r[le] = y_true[le]
    return np.sum(r)


def root_mean_square(y_true: np.ndarray, y_score: np.ndarray) -> float:
    err = y_score - y_true
    rmse = np.sqrt(np.dot(err, err) / y_score.size)
    return rmse


def softmax(x: np.ndarray) -> np.ndarray:
    e = np.exp(x)
    return e / np.sum(e)


def softprob_obj(classes: int) -> SklObjective:
    def objective(
        labels: np.ndarray, predt: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        rows = labels.shape[0]
        grad = np.zeros((rows, classes), dtype=float)
        hess = np.zeros((rows, classes), dtype=float)
        eps = 1e-6
        for r in range(predt.shape[0]):
            target = labels[r]
            p = softmax(predt[r, :])
            for c in range(predt.shape[1]):
                assert target >= 0 or target <= classes
                g = p[c] - 1.0 if c == target else p[c]
                h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
                grad[r, c] = g
                hess[r, c] = h

        grad = grad.reshape((rows * classes, 1))
        hess = hess.reshape((rows * classes, 1))
        return grad, hess

    return objective
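# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# softprob_obj() returns an SklObjective, so it plugs into the scikit-learn
# estimators updated in this commit as a custom multi-class objective.
def _example_softprob_usage() -> np.ndarray:
    X, y = datasets.load_digits(n_class=3, return_X_y=True)
    clf = xgb.XGBClassifier(objective=softprob_obj(3), n_estimators=2)
    clf.fit(X, y)
    return clf.predict(X)
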
class DirectoryExcursion:
    """Change directory. Change back and optionally clean up the directory on exit.

    """

    def __init__(self, path: os.PathLike, cleanup: bool = False):
        self.path = path
        self.curdir = os.path.normpath(os.path.abspath(os.path.curdir))
        self.cleanup = cleanup
        self.files: Set[str] = set()

    def __enter__(self) -> None:
        os.chdir(self.path)
        if self.cleanup:
            self.files = {
                os.path.join(root, f)
                for root, subdir, files in os.walk(os.path.expanduser(self.path))
                for f in files
            }

    def __exit__(self, *args: Any) -> None:
        os.chdir(self.curdir)
        if self.cleanup:
            files = {
                os.path.join(root, f)
                for root, subdir, files in os.walk(os.path.expanduser(self.path))
                for f in files
            }
            diff = files.difference(self.files)
            for f in diff:
                os.remove(f)
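# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# DirectoryExcursion temporarily switches the working directory; with cleanup=True,
# any file created inside the block is removed when the block exits.
def _example_directory_excursion(tmpdir: str) -> None:
    with DirectoryExcursion(tmpdir, cleanup=True):
        with open("model.json", "w", encoding="utf-8") as fd:
            fd.write("{}")
    # Back in the original directory here, and model.json has been deleted.
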
@contextmanager
def captured_output() -> Generator[Tuple[StringIO, StringIO], None, None]:
    """Reassign stdout temporarily in order to test printed statements
    Taken from:
    https://stackoverflow.com/questions/4219717/how-to-assert-output-with-nosetest-unittest-in-python

    Also works for pytest.

    """
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        yield sys.stdout, sys.stderr
    finally:
        sys.stdout, sys.stderr = old_out, old_err
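# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# captured_output() swaps in StringIO buffers so printed output can be asserted on.
def _example_captured_output() -> None:
    with captured_output() as (out, err):
        print("hello")
    assert out.getvalue().strip() == "hello"
    assert err.getvalue() == ""
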
def timeout(sec: int, *args: Any, enable: bool = True, **kwargs: Any) -> Any:
    """Make a pytest mark for the `pytest-timeout` package.

    Parameters
    ----------
    sec :
        Timeout seconds.
    enable :
        Control whether timeout should be applied, used for debugging.

    Returns
    -------
    pytest.mark.timeout
    """

    if enable:
        return pytest.mark.timeout(sec, *args, **kwargs)
    return pytest.mark.timeout(None, *args, **kwargs)
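# --- Illustrative usage (editor's sketch, not part of the original diff) ---
# timeout() builds a pytest-timeout marker; passing enable=False lifts the limit
# without touching the decorated test, which is handy when debugging.
@timeout(30)
def _example_timed_test() -> None:
    """Hypothetical test that must finish within 30 seconds."""
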
def demo_dir(path: str) -> str:
    """Look for the demo directory based on the test file name."""
    path = normpath(os.path.dirname(path))
    while True:
        subdirs = [f.path for f in os.scandir(path) if f.is_dir()]
        subdirs = [os.path.basename(d) for d in subdirs]
        if "demo" in subdirs:
            return os.path.join(path, "demo")
        new_path = normpath(os.path.join(path, os.path.pardir))
        assert new_path != path
        path = new_path


def normpath(path: str) -> str:
    return os.path.normpath(os.path.abspath(path))


def data_dir(path: str) -> str:
    return os.path.join(demo_dir(path), "data")


def project_root(path: str) -> str:
    return normpath(os.path.join(demo_dir(path), os.path.pardir))