* Use normal predictor for dart booster. * Implement `inplace_predict` for dart. * Enable `dart` for dask interface now that it's thread-safe. * categorical data should be working out of box for dart now. The implementation is not very efficient as it has to pull back the data and apply weight for each tree, but still a significant improvement over previous implementation as now we no longer binary search for each sample. * Fix output prediction shape on dataframe.
159 lines
4.8 KiB
Python
159 lines
4.8 KiB
Python
'''Tests for running inplace prediction.'''
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
import numpy as np
|
|
from scipy import sparse
|
|
|
|
import xgboost as xgb
|
|
|
|
|
|
def run_threaded_predict(X, rows, predict_func):
|
|
results = []
|
|
per_thread = 20
|
|
with ThreadPoolExecutor(max_workers=10) as e:
|
|
for i in range(0, rows, int(rows / per_thread)):
|
|
if hasattr(X, 'iloc'):
|
|
predictor = X.iloc[i:i+per_thread, :]
|
|
else:
|
|
predictor = X[i:i+per_thread, ...]
|
|
f = e.submit(predict_func, predictor)
|
|
results.append(f)
|
|
|
|
for f in results:
|
|
assert f.result()
|
|
|
|
|
|
def verify_leaf_output(leaf: np.ndarray, num_parallel_tree: int):
|
|
for i in range(leaf.shape[0]): # n_samples
|
|
for j in range(leaf.shape[1]): # n_rounds
|
|
for k in range(leaf.shape[2]): # n_classes
|
|
tree_group = leaf[i, j, k, :]
|
|
assert tree_group.shape[0] == num_parallel_tree
|
|
# No sampling, all trees within forest are the same
|
|
assert np.all(tree_group == tree_group[0])
|
|
|
|
|
|
def run_predict_leaf(predictor):
|
|
rows = 100
|
|
cols = 4
|
|
classes = 5
|
|
num_parallel_tree = 4
|
|
num_boost_round = 10
|
|
rng = np.random.RandomState(1994)
|
|
X = rng.randn(rows, cols)
|
|
y = rng.randint(low=0, high=classes, size=rows)
|
|
m = xgb.DMatrix(X, y)
|
|
booster = xgb.train(
|
|
{
|
|
"num_parallel_tree": num_parallel_tree,
|
|
"num_class": classes,
|
|
"predictor": predictor,
|
|
"tree_method": "hist",
|
|
},
|
|
m,
|
|
num_boost_round=num_boost_round,
|
|
)
|
|
|
|
empty = xgb.DMatrix(np.ones(shape=(0, cols)))
|
|
empty_leaf = booster.predict(empty, pred_leaf=True)
|
|
assert empty_leaf.shape[0] == 0
|
|
|
|
leaf = booster.predict(m, pred_leaf=True, strict_shape=True)
|
|
assert leaf.shape[0] == rows
|
|
assert leaf.shape[1] == num_boost_round
|
|
assert leaf.shape[2] == classes
|
|
assert leaf.shape[3] == num_parallel_tree
|
|
|
|
verify_leaf_output(leaf, num_parallel_tree)
|
|
|
|
ntree_limit = 2
|
|
sliced = booster.predict(
|
|
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit, strict_shape=True
|
|
)
|
|
first = sliced[0, ...]
|
|
|
|
assert np.prod(first.shape) == classes * num_parallel_tree * ntree_limit
|
|
return leaf
|
|
|
|
|
|
def test_predict_leaf():
|
|
run_predict_leaf('cpu_predictor')
|
|
|
|
|
|
def test_predict_shape():
|
|
from sklearn.datasets import load_boston
|
|
X, y = load_boston(return_X_y=True)
|
|
reg = xgb.XGBRegressor(n_estimators=1)
|
|
reg.fit(X, y)
|
|
predt = reg.get_booster().predict(xgb.DMatrix(X), strict_shape=True)
|
|
assert len(predt.shape) == 2
|
|
assert predt.shape[0] == X.shape[0]
|
|
assert predt.shape[1] == 1
|
|
|
|
contrib = reg.get_booster().predict(
|
|
xgb.DMatrix(X), pred_contribs=True, strict_shape=True
|
|
)
|
|
assert len(contrib.shape) == 3
|
|
assert contrib.shape[1] == 1
|
|
|
|
|
|
class TestInplacePredict:
|
|
'''Tests for running inplace prediction'''
|
|
@classmethod
|
|
def setup_class(cls):
|
|
cls.rows = 100
|
|
cls.cols = 10
|
|
|
|
cls.rng = np.random.RandomState(1994)
|
|
|
|
cls.X = cls.rng.randn(cls.rows, cls.cols)
|
|
cls.y = cls.rng.randn(cls.rows)
|
|
|
|
dtrain = xgb.DMatrix(cls.X, cls.y)
|
|
|
|
cls.booster = xgb.train({'tree_method': 'hist'}, dtrain, num_boost_round=10)
|
|
|
|
cls.test = xgb.DMatrix(cls.X[:10, ...])
|
|
|
|
def test_predict(self):
|
|
booster = self.booster
|
|
X = self.X
|
|
test = self.test
|
|
|
|
predt_from_array = booster.inplace_predict(X[:10, ...])
|
|
predt_from_dmatrix = booster.predict(test)
|
|
|
|
np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
|
|
|
|
predt_from_array = booster.inplace_predict(X[:10, ...], iteration_range=(0, 4))
|
|
predt_from_dmatrix = booster.predict(test, ntree_limit=4)
|
|
|
|
np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)
|
|
|
|
def predict_dense(x):
|
|
inplace_predt = booster.inplace_predict(x)
|
|
d = xgb.DMatrix(x)
|
|
copied_predt = booster.predict(d)
|
|
return np.all(copied_predt == inplace_predt)
|
|
|
|
for i in range(10):
|
|
run_threaded_predict(X, self.rows, predict_dense)
|
|
|
|
def predict_csr(x):
|
|
inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
|
|
d = xgb.DMatrix(x)
|
|
copied_predt = booster.predict(d)
|
|
return np.all(copied_predt == inplace_predt)
|
|
|
|
for i in range(10):
|
|
run_threaded_predict(X, self.rows, predict_csr)
|
|
|
|
def test_base_margin(self):
|
|
booster = self.booster
|
|
|
|
base_margin = self.rng.randn(self.rows)
|
|
from_inplace = booster.inplace_predict(data=self.X, base_margin=base_margin)
|
|
|
|
dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
|
|
from_dmatrix = booster.predict(dtrain)
|
|
np.testing.assert_allclose(from_dmatrix, from_inplace)
|