Cleanup Python GPU tests. (#9934)
* Cleanup Python GPU tests. - Remove the use of `gpu_hist` and `gpu_id` in cudf/cupy tests. - Move base margin test into the testing directory.
This commit is contained in:
@@ -1,19 +1,17 @@
|
||||
import json
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import xgboost as xgb
|
||||
from xgboost import testing as tm
|
||||
from xgboost.testing.data import run_base_margin_info
|
||||
|
||||
sys.path.append("tests/python")
|
||||
from test_dmatrix import set_base_margin_info
|
||||
cudf = pytest.importorskip("cudf")
|
||||
|
||||
|
||||
def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
|
||||
'''Test constructing DMatrix from cudf'''
|
||||
import cudf
|
||||
"""Test constructing DMatrix from cudf"""
|
||||
import pandas as pd
|
||||
|
||||
kRows = 80
|
||||
@@ -25,9 +23,7 @@ def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
|
||||
na[5, 0] = missing
|
||||
na[3, 1] = missing
|
||||
|
||||
pa = pd.DataFrame({'0': na[:, 0],
|
||||
'1': na[:, 1],
|
||||
'2': na[:, 2].astype(np.int32)})
|
||||
pa = pd.DataFrame({"0": na[:, 0], "1": na[:, 1], "2": na[:, 2].astype(np.int32)})
|
||||
|
||||
np_label = np.random.randn(kRows).astype(input_type)
|
||||
pa_label = pd.DataFrame(np_label)
|
||||
@@ -41,8 +37,7 @@ def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
|
||||
|
||||
|
||||
def _test_from_cudf(DMatrixT):
|
||||
'''Test constructing DMatrix from cudf'''
|
||||
import cudf
|
||||
"""Test constructing DMatrix from cudf"""
|
||||
dmatrix_from_cudf(np.float32, DMatrixT, np.NAN)
|
||||
dmatrix_from_cudf(np.float64, DMatrixT, np.NAN)
|
||||
|
||||
@@ -50,37 +45,38 @@ def _test_from_cudf(DMatrixT):
|
||||
dmatrix_from_cudf(np.int32, DMatrixT, -2)
|
||||
dmatrix_from_cudf(np.int64, DMatrixT, -3)
|
||||
|
||||
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
|
||||
cd = cudf.DataFrame({"x": [1, 2, 3], "y": [0.1, 0.2, 0.3]})
|
||||
dtrain = DMatrixT(cd)
|
||||
|
||||
assert dtrain.feature_names == ['x', 'y']
|
||||
assert dtrain.feature_types == ['int', 'float']
|
||||
assert dtrain.feature_names == ["x", "y"]
|
||||
assert dtrain.feature_types == ["int", "float"]
|
||||
|
||||
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
|
||||
series = cudf.DataFrame({"x": [1, 2, 3]}).iloc[:, 0]
|
||||
assert isinstance(series, cudf.Series)
|
||||
dtrain = DMatrixT(series)
|
||||
|
||||
assert dtrain.feature_names == ['x']
|
||||
assert dtrain.feature_types == ['int']
|
||||
assert dtrain.feature_names == ["x"]
|
||||
assert dtrain.feature_types == ["int"]
|
||||
|
||||
with pytest.raises(ValueError, match=r".*multi.*"):
|
||||
dtrain = DMatrixT(cd, label=cd)
|
||||
xgb.train({"tree_method": "gpu_hist", "objective": "multi:softprob"}, dtrain)
|
||||
xgb.train(
|
||||
{"tree_method": "hist", "device": "cuda", "objective": "multi:softprob"},
|
||||
dtrain,
|
||||
)
|
||||
|
||||
# Test when number of elements is less than 8
|
||||
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
|
||||
dtype=np.int32)})
|
||||
X = cudf.DataFrame({"x": cudf.Series([0, 1, 2, np.NAN, 4], dtype=np.int32)})
|
||||
dtrain = DMatrixT(X)
|
||||
assert dtrain.num_col() == 1
|
||||
assert dtrain.num_row() == 5
|
||||
|
||||
# Boolean is not supported.
|
||||
X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
|
||||
X_boolean = cudf.DataFrame({"x": cudf.Series([True, False])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean)
|
||||
|
||||
y_boolean = cudf.DataFrame({
|
||||
'x': cudf.Series([True, False, True, True, True])})
|
||||
y_boolean = cudf.DataFrame({"x": cudf.Series([True, False, True, True, True])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean, label=y_boolean)
|
||||
|
||||
@@ -88,6 +84,7 @@ def _test_from_cudf(DMatrixT):
|
||||
def _test_cudf_training(DMatrixT):
|
||||
import pandas as pd
|
||||
from cudf import DataFrame as df
|
||||
|
||||
np.random.seed(1)
|
||||
X = pd.DataFrame(np.random.randn(50, 10))
|
||||
y = pd.DataFrame(np.random.randn(50))
|
||||
@@ -97,21 +94,33 @@ def _test_cudf_training(DMatrixT):
|
||||
cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
|
||||
|
||||
evals_result_cudf = {}
|
||||
dtrain_cudf = DMatrixT(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
|
||||
base_margin=cudf_base_margin)
|
||||
params = {'gpu_id': 0, 'tree_method': 'gpu_hist'}
|
||||
xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
|
||||
evals_result=evals_result_cudf)
|
||||
dtrain_cudf = DMatrixT(
|
||||
df.from_pandas(X),
|
||||
df.from_pandas(y),
|
||||
weight=cudf_weights,
|
||||
base_margin=cudf_base_margin,
|
||||
)
|
||||
params = {"device": "cuda", "tree_method": "hist"}
|
||||
xgb.train(
|
||||
params,
|
||||
dtrain_cudf,
|
||||
evals=[(dtrain_cudf, "train")],
|
||||
evals_result=evals_result_cudf,
|
||||
)
|
||||
evals_result_np = {}
|
||||
dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
|
||||
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
|
||||
evals_result=evals_result_np)
|
||||
assert np.array_equal(evals_result_cudf["train"]["rmse"], evals_result_np["train"]["rmse"])
|
||||
xgb.train(
|
||||
params, dtrain_np, evals=[(dtrain_np, "train")], evals_result=evals_result_np
|
||||
)
|
||||
assert np.array_equal(
|
||||
evals_result_cudf["train"]["rmse"], evals_result_np["train"]["rmse"]
|
||||
)
|
||||
|
||||
|
||||
def _test_cudf_metainfo(DMatrixT):
|
||||
import pandas as pd
|
||||
from cudf import DataFrame as df
|
||||
|
||||
n = 100
|
||||
X = np.random.random((n, 2))
|
||||
dmat_cudf = DMatrixT(df.from_pandas(pd.DataFrame(X)))
|
||||
@@ -120,39 +129,53 @@ def _test_cudf_metainfo(DMatrixT):
|
||||
uints = np.array([4, 2, 8]).astype("uint32")
|
||||
cudf_floats = df.from_pandas(pd.DataFrame(floats))
|
||||
cudf_uints = df.from_pandas(pd.DataFrame(uints))
|
||||
dmat.set_float_info('weight', floats)
|
||||
dmat.set_float_info('label', floats)
|
||||
dmat.set_float_info('base_margin', floats)
|
||||
dmat.set_uint_info('group', uints)
|
||||
dmat.set_float_info("weight", floats)
|
||||
dmat.set_float_info("label", floats)
|
||||
dmat.set_float_info("base_margin", floats)
|
||||
dmat.set_uint_info("group", uints)
|
||||
dmat_cudf.set_info(weight=cudf_floats)
|
||||
dmat_cudf.set_info(label=cudf_floats)
|
||||
dmat_cudf.set_info(base_margin=cudf_floats)
|
||||
dmat_cudf.set_info(group=cudf_uints)
|
||||
|
||||
# Test setting info with cudf DataFrame
|
||||
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
|
||||
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
|
||||
assert np.array_equal(dmat.get_float_info('base_margin'),
|
||||
dmat_cudf.get_float_info('base_margin'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("weight"), dmat_cudf.get_float_info("weight")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("label"), dmat_cudf.get_float_info("label")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("base_margin"), dmat_cudf.get_float_info("base_margin")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_uint_info("group_ptr"), dmat_cudf.get_uint_info("group_ptr")
|
||||
)
|
||||
|
||||
# Test setting info with cudf Series
|
||||
dmat_cudf.set_info(weight=cudf_floats[cudf_floats.columns[0]])
|
||||
dmat_cudf.set_info(label=cudf_floats[cudf_floats.columns[0]])
|
||||
dmat_cudf.set_info(base_margin=cudf_floats[cudf_floats.columns[0]])
|
||||
dmat_cudf.set_info(group=cudf_uints[cudf_uints.columns[0]])
|
||||
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
|
||||
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
|
||||
assert np.array_equal(dmat.get_float_info('base_margin'),
|
||||
dmat_cudf.get_float_info('base_margin'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("weight"), dmat_cudf.get_float_info("weight")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("label"), dmat_cudf.get_float_info("label")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_float_info("base_margin"), dmat_cudf.get_float_info("base_margin")
|
||||
)
|
||||
assert np.array_equal(
|
||||
dmat.get_uint_info("group_ptr"), dmat_cudf.get_uint_info("group_ptr")
|
||||
)
|
||||
|
||||
set_base_margin_info(df, DMatrixT, "gpu_hist")
|
||||
run_base_margin_info(df, DMatrixT, "cuda")
|
||||
|
||||
|
||||
class TestFromColumnar:
|
||||
'''Tests for constructing DMatrix from data structure conforming Apache
|
||||
Arrow specification.'''
|
||||
"""Tests for constructing DMatrix from data structure conforming Apache
|
||||
Arrow specification."""
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_simple_dmatrix_from_cudf(self):
|
||||
@@ -180,7 +203,6 @@ Arrow specification.'''
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
|
||||
def test_cudf_categorical(self) -> None:
|
||||
import cudf
|
||||
n_features = 30
|
||||
_X, _y = tm.make_categorical(100, n_features, 17, False)
|
||||
X = cudf.from_pandas(_X)
|
||||
@@ -251,6 +273,7 @@ def test_cudf_training_with_sklearn():
|
||||
import pandas as pd
|
||||
from cudf import DataFrame as df
|
||||
from cudf import Series as ss
|
||||
|
||||
np.random.seed(1)
|
||||
X = pd.DataFrame(np.random.randn(50, 10))
|
||||
y = pd.DataFrame((np.random.randn(50) > 0).astype(np.int8))
|
||||
@@ -264,29 +287,34 @@ def test_cudf_training_with_sklearn():
|
||||
y_cudf_series = ss(data=y.iloc[:, 0])
|
||||
|
||||
for y_obj in [y_cudf, y_cudf_series]:
|
||||
clf = xgb.XGBClassifier(gpu_id=0, tree_method='gpu_hist')
|
||||
clf.fit(X_cudf, y_obj, sample_weight=cudf_weights, base_margin=cudf_base_margin,
|
||||
eval_set=[(X_cudf, y_obj)])
|
||||
clf = xgb.XGBClassifier(tree_method="hist", device="cuda:0")
|
||||
clf.fit(
|
||||
X_cudf,
|
||||
y_obj,
|
||||
sample_weight=cudf_weights,
|
||||
base_margin=cudf_base_margin,
|
||||
eval_set=[(X_cudf, y_obj)],
|
||||
)
|
||||
pred = clf.predict(X_cudf)
|
||||
assert np.array_equal(np.unique(pred), np.array([0, 1]))
|
||||
|
||||
|
||||
class IterForDMatrixTest(xgb.core.DataIter):
|
||||
'''A data iterator for XGBoost DMatrix.
|
||||
"""A data iterator for XGBoost DMatrix.
|
||||
|
||||
`reset` and `next` are required for any data iterator, other functions here
|
||||
are utilites for demonstration's purpose.
|
||||
|
||||
'''
|
||||
ROWS_PER_BATCH = 100 # data is splited by rows
|
||||
"""
|
||||
|
||||
ROWS_PER_BATCH = 100 # data is splited by rows
|
||||
BATCHES = 16
|
||||
|
||||
def __init__(self, categorical):
|
||||
'''Generate some random data for demostration.
|
||||
"""Generate some random data for demostration.
|
||||
|
||||
Actual data can be anything that is currently supported by XGBoost.
|
||||
'''
|
||||
import cudf
|
||||
"""
|
||||
self.rows = self.ROWS_PER_BATCH
|
||||
|
||||
if categorical:
|
||||
@@ -300,34 +328,37 @@ class IterForDMatrixTest(xgb.core.DataIter):
|
||||
rng = np.random.RandomState(1994)
|
||||
self._data = [
|
||||
cudf.DataFrame(
|
||||
{'a': rng.randn(self.ROWS_PER_BATCH),
|
||||
'b': rng.randn(self.ROWS_PER_BATCH)})] * self.BATCHES
|
||||
{
|
||||
"a": rng.randn(self.ROWS_PER_BATCH),
|
||||
"b": rng.randn(self.ROWS_PER_BATCH),
|
||||
}
|
||||
)
|
||||
] * self.BATCHES
|
||||
self._labels = [rng.randn(self.rows)] * self.BATCHES
|
||||
|
||||
self.it = 0 # set iterator to 0
|
||||
self.it = 0 # set iterator to 0
|
||||
super().__init__(cache_prefix=None)
|
||||
|
||||
def as_array(self):
|
||||
import cudf
|
||||
return cudf.concat(self._data)
|
||||
|
||||
def as_array_labels(self):
|
||||
return np.concatenate(self._labels)
|
||||
|
||||
def data(self):
|
||||
'''Utility function for obtaining current batch of data.'''
|
||||
"""Utility function for obtaining current batch of data."""
|
||||
return self._data[self.it]
|
||||
|
||||
def labels(self):
|
||||
'''Utility function for obtaining current batch of label.'''
|
||||
"""Utility function for obtaining current batch of label."""
|
||||
return self._labels[self.it]
|
||||
|
||||
def reset(self):
|
||||
'''Reset the iterator'''
|
||||
"""Reset the iterator"""
|
||||
self.it = 0
|
||||
|
||||
def next(self, input_data):
|
||||
'''Yield next batch of data'''
|
||||
"""Yield next batch of data"""
|
||||
if self.it == len(self._data):
|
||||
# Return 0 when there's no more batch.
|
||||
return 0
|
||||
@@ -341,7 +372,7 @@ class IterForDMatrixTest(xgb.core.DataIter):
|
||||
def test_from_cudf_iter(enable_categorical):
|
||||
rounds = 100
|
||||
it = IterForDMatrixTest(enable_categorical)
|
||||
params = {"tree_method": "gpu_hist"}
|
||||
params = {"tree_method": "hist", "device": "cuda"}
|
||||
|
||||
# Use iterator
|
||||
m_it = xgb.QuantileDMatrix(it, enable_categorical=enable_categorical)
|
||||
|
||||
Reference in New Issue
Block a user