This is already partially supported but was never properly tested, so the only way to use it was to call `numpy.ndarray.flatten` on the `base_margin` array before passing it to XGBoost. This PR adds proper support for most of the input data types, along with tests.
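As a rough sketch of what this enables (the multi-class setup, shapes, and parameter values below are illustrative assumptions, not taken from the PR), a multi-dimensional `base_margin` can now be passed directly instead of being flattened by hand first:

import numpy as np
import xgboost as xgb

# Hypothetical data: 100 rows, 10 features, 3 classes.
rng = np.random.RandomState(0)
X = rng.randn(100, 10)
y = rng.randint(0, 3, size=100)
margin = np.zeros((100, 3))  # one margin column per class

# Workaround before this change: flatten the margin by hand.
# dtrain = xgb.DMatrix(X, label=y, base_margin=margin.flatten())

# With this change the 2-dim array can be passed as-is:
dtrain = xgb.DMatrix(X, label=y, base_margin=margin)
booster = xgb.train(
    {"objective": "multi:softprob", "num_class": 3}, dtrain, num_boost_round=4
)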
The accompanying test file (150 lines, 4.5 KiB, Python):
import json
import os
import sys
import tempfile

import numpy as np
import pytest

import xgboost as xgb

sys.path.append("tests/python")
import testing as tm  # noqa
import test_with_sklearn as twskl  # noqa

pytestmark = pytest.mark.skipif(**tm.no_sklearn())

rng = np.random.RandomState(1994)


def test_gpu_binary_classification():
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier):
        for train_index, test_index in kf.split(X, y):
            xgb_model = cls(
                random_state=42, tree_method='gpu_hist',
                n_estimators=4, gpu_id='0').fit(X[train_index], y[train_index])
            preds = xgb_model.predict(X[test_index])
            labels = y[test_index]
            err = sum(1 for i in range(len(preds))
                      if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
            assert err < 0.1


@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction_gpu_hist():
    from sklearn.datasets import load_breast_cancer, load_digits
    import cupy as cp
    import cudf

    tree_method = "gpu_hist"
    X, y = load_breast_cancer(return_X_y=True)
    X, y = cp.array(X), cp.array(y)

    twskl.run_boost_from_prediction_binary(tree_method, X, y, None)
    twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame)

    X, y = load_digits(return_X_y=True)
    X, y = cp.array(X), cp.array(y)

    twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
    twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame)


def test_num_parallel_tree():
    twskl.run_boston_housing_rf_regression("gpu_hist")


@pytest.mark.skipif(**tm.no_pandas())
@pytest.mark.skipif(**tm.no_cudf())
@pytest.mark.skipif(**tm.no_sklearn())
def test_categorical():
    import pandas as pd
    import cudf
    import cupy as cp
    from sklearn.datasets import load_svmlight_file

    data_dir = os.path.join(tm.PROJECT_ROOT, "demo", "data")
    X, y = load_svmlight_file(os.path.join(data_dir, "agaricus.txt.train"))
    clf = xgb.XGBClassifier(
        tree_method="gpu_hist",
        enable_categorical=True,
        n_estimators=10,
    )
    X = pd.DataFrame(X.todense()).astype("category")
    clf.fit(X, y)

    with tempfile.TemporaryDirectory() as tempdir:
        model = os.path.join(tempdir, "categorical.json")
        clf.save_model(model)

        with open(model) as fd:
            categorical = json.load(fd)
        categories_sizes = np.array(
            categorical["learner"]["gradient_booster"]["model"]["trees"][0][
                "categories_sizes"
            ]
        )
        assert categories_sizes.shape[0] != 0
        np.testing.assert_allclose(categories_sizes, 1)

    def check_predt(X, y):
        # Train on categorical input and verify that inplace prediction
        # agrees with prediction through the fitted estimator.
        reg = xgb.XGBRegressor(
            tree_method="gpu_hist", enable_categorical=True, n_estimators=64
        )
        reg.fit(X, y)
        predts = reg.predict(X)
        booster = reg.get_booster()
        assert "c" in booster.feature_types
        assert len(booster.feature_types) == 1
        inp_predts = booster.inplace_predict(X)
        if isinstance(inp_predts, cp.ndarray):
            inp_predts = cp.asnumpy(inp_predts)
        np.testing.assert_allclose(predts, inp_predts)

    y = [1, 2, 3]
    X = pd.DataFrame({"f0": ["a", "b", "c"]})
    X["f0"] = X["f0"].astype("category")
    check_predt(X, y)

    X = cudf.DataFrame(X)
    check_predt(X, y)


@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_cudf())
def test_classifier():
    from sklearn.datasets import load_digits
    import cupy as cp
    import cudf

    X, y = load_digits(return_X_y=True)
    # Scale the labels so they are no longer the contiguous [0, n_classes)
    # range that the classifier expects.
    y *= 10

    clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1)

    # numpy
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
        clf.fit(X, y)

    # cupy
    X, y = cp.array(X), cp.array(y)
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
        clf.fit(X, y)

    # cudf
    X, y = cudf.DataFrame(X), cudf.DataFrame(y)
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
        clf.fit(X, y)

    # pandas
    X, y = load_digits(return_X_y=True, as_frame=True)
    y *= 10
    with pytest.raises(ValueError, match=r"Invalid classes.*"):
        clf.fit(X, y)