"""Tests for running inplace prediction."""
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
from scipy import sparse
|
|
|
|
import xgboost as xgb
|
|
from xgboost import testing as tm
|
|
from xgboost.testing.data import np_dtypes, pd_dtypes
|
|
from xgboost.testing.shared import validate_leaf_output
|
|
|
|
|
|
def run_threaded_predict(X, rows, predict_func):
    """Run ``predict_func`` on row slices of ``X`` from a thread pool and check results."""
    results = []
    per_thread = 20
    with ThreadPoolExecutor(max_workers=10) as e:
        for i in range(0, rows, int(rows / per_thread)):
            if hasattr(X, "iloc"):
                predictor = X.iloc[i : i + per_thread, :]
            else:
                predictor = X[i : i + per_thread, ...]
            f = e.submit(predict_func, predictor)
            results.append(f)

    for f in results:
        assert f.result()


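# Usage sketch for the helper above (illustrative only; ``booster`` and ``X``
# here are hypothetical stand-ins for a trained model and a matching feature
# matrix):
#
#     def check_chunk(chunk):
#         return booster.inplace_predict(chunk).shape[0] == chunk.shape[0]
#
#     run_threaded_predict(X, X.shape[0], check_chunk)

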
def run_predict_leaf(device: str) -> np.ndarray:
    rows = 100
    cols = 4
    classes = 5
    num_parallel_tree = 4
    num_boost_round = 10
    rng = np.random.RandomState(1994)
    X = rng.randn(rows, cols)
    y = rng.randint(low=0, high=classes, size=rows)
    m = xgb.DMatrix(X, y)
    booster = xgb.train(
        {
            "num_parallel_tree": num_parallel_tree,
            "num_class": classes,
            "tree_method": "hist",
        },
        m,
        num_boost_round=num_boost_round,
    )

    booster.set_param({"device": device})
    empty = xgb.DMatrix(np.ones(shape=(0, cols)))
    empty_leaf = booster.predict(empty, pred_leaf=True)
    assert empty_leaf.shape[0] == 0

    leaf = booster.predict(m, pred_leaf=True, strict_shape=True)
    assert leaf.shape[0] == rows
    assert leaf.shape[1] == num_boost_round
    assert leaf.shape[2] == classes
    assert leaf.shape[3] == num_parallel_tree

    validate_leaf_output(leaf, num_parallel_tree)

    n_iters = 2
    sliced = booster.predict(
        m,
        pred_leaf=True,
        iteration_range=(0, n_iters),
        strict_shape=True,
    )
    first = sliced[0, ...]

    assert np.prod(first.shape) == classes * num_parallel_tree * n_iters

    # When there's only 1 tree, the output is a 1-dim vector.
    booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m)
    booster.set_param({"device": device})
    assert booster.predict(m, pred_leaf=True).shape == (rows,)

    return leaf


def test_predict_leaf() -> None:
    run_predict_leaf("cpu")


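# ``run_predict_leaf`` is parameterised on ``device`` so a GPU build can reuse
# it.  A hypothetical CUDA variant (assuming a CUDA-enabled build and device)
# would be:
#
#     def test_predict_leaf_gpu() -> None:
#         run_predict_leaf("cuda")

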
def test_predict_shape():
    from sklearn.datasets import fetch_california_housing

    X, y = fetch_california_housing(return_X_y=True)
    reg = xgb.XGBRegressor(n_estimators=1)
    reg.fit(X, y)
    predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
    assert len(predt.shape) == 2
    assert predt.shape[0] == X.shape[0]
    assert predt.shape[1] == 1

    contrib = reg.get_booster().predict(
        xgb.DMatrix(X), pred_contribs=True, strict_shape=True
    )
    assert len(contrib.shape) == 3
    assert contrib.shape[1] == 1

    contrib = reg.get_booster().predict(
        xgb.DMatrix(X), pred_contribs=True, approx_contribs=True
    )
    assert len(contrib.shape) == 2
    assert contrib.shape[1] == X.shape[1] + 1

    interaction = reg.get_booster().predict(
        xgb.DMatrix(X), pred_interactions=True, approx_contribs=True
    )
    assert len(interaction.shape) == 3
    assert interaction.shape[1] == X.shape[1] + 1
    assert interaction.shape[2] == X.shape[1] + 1

    interaction = reg.get_booster().predict(
        xgb.DMatrix(X), pred_interactions=True, approx_contribs=True, strict_shape=True
    )
    assert len(interaction.shape) == 4
    assert interaction.shape[1] == 1
    assert interaction.shape[2] == X.shape[1] + 1
    assert interaction.shape[3] == X.shape[1] + 1


class TestInplacePredict:
    """Tests for running inplace prediction"""

    @classmethod
    def setup_class(cls):
        cls.rows = 1000
        cls.cols = 10

        cls.missing = 11  # set to integer for testing

        cls.rng = np.random.RandomState(1994)

        cls.X = cls.rng.randn(cls.rows, cls.cols)
        missing_idx = [i for i in range(0, cls.cols, 4)]
        cls.X[:, missing_idx] = cls.missing  # set to be missing

        cls.y = cls.rng.randn(cls.rows)

        dtrain = xgb.DMatrix(cls.X, cls.y)
        cls.test = xgb.DMatrix(cls.X[:10, ...], missing=cls.missing)

        cls.num_boost_round = 10
        cls.booster = xgb.train(
            {"tree_method": "hist"}, dtrain, num_boost_round=cls.num_boost_round
        )

    def test_predict(self):
        booster = self.booster
        X = self.X
        test = self.test

        predt_from_array = booster.inplace_predict(X[:10, ...], missing=self.missing)
        predt_from_dmatrix = booster.predict(test)

        X_obj = X.copy().astype(object)

        assert X_obj.dtype.hasobject is True
        assert X.dtype.hasobject is False
        np.testing.assert_allclose(
            booster.inplace_predict(X_obj), booster.inplace_predict(X)
        )

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        predt_from_array = booster.inplace_predict(
            X[:10, ...], iteration_range=(0, 4), missing=self.missing
        )
        predt_from_dmatrix = booster.predict(test, iteration_range=(0, 4))

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        with pytest.raises(ValueError):
            booster.predict(test, iteration_range=(0, booster.num_boosted_rounds() + 2))

        default = booster.predict(test)

        range_full = booster.predict(test, iteration_range=(0, self.num_boost_round))
        np.testing.assert_allclose(range_full, default)

        range_full = booster.predict(
            test, iteration_range=(0, booster.num_boosted_rounds())
        )
        np.testing.assert_allclose(range_full, default)

        def predict_dense(x):
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_dense)

        def predict_csr(x):
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, self.rows, predict_csr)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_predict_pd(self):
        X = self.X
        # construct it in column-major style
        df = pd.DataFrame({str(i): X[:, i] for i in range(X.shape[1])})
        booster = self.booster
        df_predt = booster.inplace_predict(df)
        arr_predt = booster.inplace_predict(X)
        dmat_predt = booster.predict(xgb.DMatrix(X))

        X = df.values
        X = np.asfortranarray(X)
        fort_predt = booster.inplace_predict(X)

        np.testing.assert_allclose(dmat_predt, arr_predt)
        np.testing.assert_allclose(df_predt, arr_predt)
        np.testing.assert_allclose(fort_predt, arr_predt)

    def test_base_margin(self):
        booster = self.booster

        base_margin = self.rng.randn(self.rows)
        from_inplace = booster.inplace_predict(data=self.X, base_margin=base_margin)

        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_dtypes(self) -> None:
        for orig, x in np_dtypes(self.rows, self.cols):
            predt_orig = self.booster.inplace_predict(orig)
            predt = self.booster.inplace_predict(x)
            np.testing.assert_allclose(predt, predt_orig)

        # unsupported types
        for dtype in [
            np.bytes_,  # bytes strings (formerly the ``np.string_`` alias)
            np.complex64,
            np.complex128,
        ]:
            X: np.ndarray = np.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                self.booster.inplace_predict(X)

    @pytest.mark.skipif(**tm.no_pandas())
    def test_pd_dtypes(self) -> None:
        from pandas.api.types import is_bool_dtype

        for orig, x in pd_dtypes():
            dtypes = orig.dtypes if isinstance(orig, pd.DataFrame) else [orig.dtypes]
            if isinstance(orig, pd.DataFrame) and is_bool_dtype(dtypes[0]):
                continue
            y = np.arange(x.shape[0])
            Xy = xgb.DMatrix(orig, y, enable_categorical=True)
            booster = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)
            predt_orig = booster.inplace_predict(orig)
            predt = booster.inplace_predict(x)
            np.testing.assert_allclose(predt, predt_orig)
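

# For reference, the core equivalence exercised throughout this module, in its
# simplest form (a sketch; ``booster`` is any trained model and ``X`` any array
# matching the training schema):
#
#     inplace = booster.inplace_predict(X)      # predict directly on the array
#     copied = booster.predict(xgb.DMatrix(X))  # predict via an explicit DMatrix
#     np.testing.assert_allclose(inplace, copied)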