Device dmatrix (#5420)

This commit is contained in:
Rory Mitchell
2020-03-28 14:42:21 +13:00
committed by GitHub
parent 780de49ddb
commit 13b10a6370
24 changed files with 915 additions and 310 deletions

View File

@@ -1,148 +0,0 @@
import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
import testing as tm
def dmatrix_from_cudf(input_type, missing=np.NAN):
'''Test constructing DMatrix from cudf'''
import cudf
import pandas as pd
kRows = 80
kCols = 3
na = np.random.randn(kRows, kCols)
na[:, 0:2] = na[:, 0:2].astype(input_type)
na[5, 0] = missing
na[3, 1] = missing
pa = pd.DataFrame({'0': na[:, 0],
'1': na[:, 1],
'2': na[:, 2].astype(np.int32)})
np_label = np.random.randn(kRows).astype(input_type)
pa_label = pd.DataFrame(np_label)
cd = cudf.from_pandas(pa)
cd_label = cudf.from_pandas(pa_label).iloc[:, 0]
dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
assert dtrain.num_col() == kCols
assert dtrain.num_row() == kRows
class TestFromColumnar:
'''Tests for constructing DMatrix from data structure conforming Apache
Arrow specification.'''
@pytest.mark.skipif(**tm.no_cudf())
def test_from_cudf(self):
'''Test constructing DMatrix from cudf'''
import cudf
dmatrix_from_cudf(np.float32, np.NAN)
dmatrix_from_cudf(np.float64, np.NAN)
dmatrix_from_cudf(np.int8, 2)
dmatrix_from_cudf(np.int32, -2)
dmatrix_from_cudf(np.int64, -3)
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
dtrain = xgb.DMatrix(cd)
assert dtrain.feature_names == ['x', 'y']
assert dtrain.feature_types == ['int', 'float']
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
assert isinstance(series, cudf.Series)
dtrain = xgb.DMatrix(series)
assert dtrain.feature_names == ['x']
assert dtrain.feature_types == ['int']
with pytest.raises(Exception):
dtrain = xgb.DMatrix(cd, label=cd)
# Test when number of elements is less than 8
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
dtype=np.int32)})
dtrain = xgb.DMatrix(X)
assert dtrain.num_col() == 1
assert dtrain.num_row() == 5
# Boolean is not supported.
X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
with pytest.raises(Exception):
dtrain = xgb.DMatrix(X_boolean)
y_boolean = cudf.DataFrame({
'x': cudf.Series([True, False, True, True, True])})
with pytest.raises(Exception):
dtrain = xgb.DMatrix(X_boolean, label=y_boolean)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_training(self):
from cudf import DataFrame as df
import pandas as pd
np.random.seed(1)
X = pd.DataFrame(np.random.randn(50, 10))
y = pd.DataFrame(np.random.randn(50))
weights = np.random.random(50) + 1.0
cudf_weights = df.from_pandas(pd.DataFrame(weights))
base_margin = np.random.random(50)
cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
evals_result_cudf = {}
dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
base_margin=cudf_base_margin)
params = {'gpu_id': 0}
xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
evals_result=evals_result_cudf)
evals_result_np = {}
dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
evals_result=evals_result_np)
assert np.array_equal(evals_result_cudf["train"]["rmse"], evals_result_np["train"]["rmse"])
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_metainfo(self):
from cudf import DataFrame as df
import pandas as pd
n = 100
X = np.random.random((n, 2))
dmat_cudf = xgb.DMatrix(X)
dmat = xgb.DMatrix(X)
floats = np.random.random(n)
uints = np.array([4, 2, 8]).astype("uint32")
cudf_floats = df.from_pandas(pd.DataFrame(floats))
cudf_uints = df.from_pandas(pd.DataFrame(uints))
dmat.set_float_info('weight', floats)
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cudf.set_interface_info('weight', cudf_floats)
dmat_cudf.set_interface_info('label', cudf_floats)
dmat_cudf.set_interface_info('base_margin', cudf_floats)
dmat_cudf.set_interface_info('group', cudf_uints)
# Test setting info with cudf DataFrame
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
# Test setting info with cudf Series
dmat_cudf.set_interface_info('weight', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('label', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('base_margin', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('group', cudf_uints[cudf_uints.columns[0]])
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))

View File

@@ -0,0 +1,172 @@
import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
import testing as tm
def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
'''Test constructing DMatrix from cudf'''
import cudf
import pandas as pd
kRows = 80
kCols = 3
na = np.random.randn(kRows, kCols)
na[:, 0:2] = na[:, 0:2].astype(input_type)
na[5, 0] = missing
na[3, 1] = missing
pa = pd.DataFrame({'0': na[:, 0],
'1': na[:, 1],
'2': na[:, 2].astype(np.int32)})
np_label = np.random.randn(kRows).astype(input_type)
pa_label = pd.DataFrame(np_label)
cd = cudf.from_pandas(pa)
cd_label = cudf.from_pandas(pa_label).iloc[:, 0]
dtrain = DMatrixT(cd, missing=missing, label=cd_label)
assert dtrain.num_col() == kCols
assert dtrain.num_row() == kRows
def _test_from_cudf(DMatrixT):
'''Test constructing DMatrix from cudf'''
import cudf
dmatrix_from_cudf(np.float32, DMatrixT, np.NAN)
dmatrix_from_cudf(np.float64, DMatrixT, np.NAN)
dmatrix_from_cudf(np.int8, DMatrixT, 2)
dmatrix_from_cudf(np.int32, DMatrixT, -2)
dmatrix_from_cudf(np.int64, DMatrixT, -3)
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
dtrain = DMatrixT(cd)
assert dtrain.feature_names == ['x', 'y']
assert dtrain.feature_types == ['int', 'float']
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
assert isinstance(series, cudf.Series)
dtrain = DMatrixT(series)
assert dtrain.feature_names == ['x']
assert dtrain.feature_types == ['int']
with pytest.raises(Exception):
dtrain = DMatrixT(cd, label=cd)
# Test when number of elements is less than 8
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
dtype=np.int32)})
dtrain = DMatrixT(X)
assert dtrain.num_col() == 1
assert dtrain.num_row() == 5
# Boolean is not supported.
X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
with pytest.raises(Exception):
dtrain = DMatrixT(X_boolean)
y_boolean = cudf.DataFrame({
'x': cudf.Series([True, False, True, True, True])})
with pytest.raises(Exception):
dtrain = DMatrixT(X_boolean, label=y_boolean)
def _test_cudf_training(DMatrixT):
from cudf import DataFrame as df
import pandas as pd
np.random.seed(1)
X = pd.DataFrame(np.random.randn(50, 10))
y = pd.DataFrame(np.random.randn(50))
weights = np.random.random(50) + 1.0
cudf_weights = df.from_pandas(pd.DataFrame(weights))
base_margin = np.random.random(50)
cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))
evals_result_cudf = {}
dtrain_cudf = DMatrixT(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
base_margin=cudf_base_margin)
params = {'gpu_id': 0, 'tree_method': 'gpu_hist'}
xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
evals_result=evals_result_cudf)
evals_result_np = {}
dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
evals_result=evals_result_np)
assert np.array_equal(evals_result_cudf["train"]["rmse"], evals_result_np["train"]["rmse"])
def _test_cudf_metainfo(DMatrixT):
from cudf import DataFrame as df
import pandas as pd
n = 100
X = np.random.random((n, 2))
dmat_cudf = DMatrixT(df.from_pandas(pd.DataFrame(X)))
dmat = xgb.DMatrix(X)
floats = np.random.random(n)
uints = np.array([4, 2, 8]).astype("uint32")
cudf_floats = df.from_pandas(pd.DataFrame(floats))
cudf_uints = df.from_pandas(pd.DataFrame(uints))
dmat.set_float_info('weight', floats)
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cudf.set_interface_info('weight', cudf_floats)
dmat_cudf.set_interface_info('label', cudf_floats)
dmat_cudf.set_interface_info('base_margin', cudf_floats)
dmat_cudf.set_interface_info('group', cudf_uints)
# Test setting info with cudf DataFrame
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
# Test setting info with cudf Series
dmat_cudf.set_interface_info('weight', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('label', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('base_margin', cudf_floats[cudf_floats.columns[0]])
dmat_cudf.set_interface_info('group', cudf_uints[cudf_uints.columns[0]])
assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
class TestFromColumnar:
'''Tests for constructing DMatrix from data structure conforming Apache
Arrow specification.'''
@pytest.mark.skipif(**tm.no_cudf())
def test_simple_dmatrix_from_cudf(self):
_test_from_cudf(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_device_dmatrix_from_cudf(self):
_test_from_cudf(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_training_simple_dmatrix(self):
_test_cudf_training(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_training_device_dmatrix(self):
_test_cudf_training(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_metainfo_simple_dmatrix(self):
_test_cudf_metainfo(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_metainfo_device_dmatrix(self):
_test_cudf_metainfo(xgb.DeviceQuantileDMatrix)

View File

@@ -7,7 +7,7 @@ sys.path.append("tests/python")
import testing as tm
def dmatrix_from_cupy(input_type, missing=np.NAN):
def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
'''Test constructing DMatrix from cupy'''
import cupy as cp
@@ -19,82 +19,106 @@ def dmatrix_from_cupy(input_type, missing=np.NAN):
X[5, 0] = missing
X[3, 1] = missing
y = cp.random.randn(kRows).astype(dtype=input_type)
dtrain = xgb.DMatrix(X, missing=missing, label=y)
dtrain = DMatrixT(X, missing=missing, label=y)
assert dtrain.num_col() == kCols
assert dtrain.num_row() == kRows
return dtrain
def _test_from_cupy(DMatrixT):
'''Test constructing DMatrix from cupy'''
import cupy as cp
dmatrix_from_cupy(np.float32, DMatrixT, np.NAN)
dmatrix_from_cupy(np.float64, DMatrixT, np.NAN)
dmatrix_from_cupy(np.uint8, DMatrixT, 2)
dmatrix_from_cupy(np.uint32, DMatrixT, 3)
dmatrix_from_cupy(np.uint64, DMatrixT, 4)
dmatrix_from_cupy(np.int8, DMatrixT, 2)
dmatrix_from_cupy(np.int32, DMatrixT, -2)
dmatrix_from_cupy(np.int64, DMatrixT, -3)
with pytest.raises(Exception):
X = cp.random.randn(2, 2, dtype="float32")
dtrain = DMatrixT(X, label=X)
def _test_cupy_training(DMatrixT):
import cupy as cp
np.random.seed(1)
cp.random.seed(1)
X = cp.random.randn(50, 10, dtype="float32")
y = cp.random.randn(50, dtype="float32")
weights = np.random.random(50) + 1
cupy_weights = cp.array(weights)
base_margin = np.random.random(50)
cupy_base_margin = cp.array(base_margin)
evals_result_cupy = {}
dtrain_cp = DMatrixT(X, y, weight=cupy_weights, base_margin=cupy_base_margin)
params = {'gpu_id': 0, 'nthread': 1, 'tree_method': 'gpu_hist'}
xgb.train(params, dtrain_cp, evals=[(dtrain_cp, "train")],
evals_result=evals_result_cupy)
evals_result_np = {}
dtrain_np = xgb.DMatrix(cp.asnumpy(X), cp.asnumpy(y), weight=weights,
base_margin=base_margin)
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
evals_result=evals_result_np)
assert np.array_equal(evals_result_cupy["train"]["rmse"], evals_result_np["train"]["rmse"])
def _test_cupy_metainfo(DMatrixT):
import cupy as cp
n = 100
X = np.random.random((n, 2))
dmat_cupy = DMatrixT(cp.array(X))
dmat = xgb.DMatrix(X)
floats = np.random.random(n)
uints = np.array([4, 2, 8]).astype("uint32")
cupy_floats = cp.array(floats)
cupy_uints = cp.array(uints)
dmat.set_float_info('weight', floats)
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cupy.set_interface_info('weight', cupy_floats)
dmat_cupy.set_interface_info('label', cupy_floats)
dmat_cupy.set_interface_info('base_margin', cupy_floats)
dmat_cupy.set_interface_info('group', cupy_uints)
# Test setting info with cupy
assert np.array_equal(dmat.get_float_info('weight'), dmat_cupy.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cupy.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cupy.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr'))
class TestFromArrayInterface:
'''Tests for constructing DMatrix from data structure conforming Apache
Arrow specification.'''
@pytest.mark.skipif(**tm.no_cupy())
def test_from_cupy(self):
'''Test constructing DMatrix from cupy'''
import cupy as cp
dmatrix_from_cupy(np.float32, np.NAN)
dmatrix_from_cupy(np.float64, np.NAN)
dmatrix_from_cupy(np.uint8, 2)
dmatrix_from_cupy(np.uint32, 3)
dmatrix_from_cupy(np.uint64, 4)
dmatrix_from_cupy(np.int8, 2)
dmatrix_from_cupy(np.int32, -2)
dmatrix_from_cupy(np.int64, -3)
with pytest.raises(Exception):
X = cp.random.randn(2, 2, dtype="float32")
dtrain = xgb.DMatrix(X, label=X)
def test_simple_dmat_from_cupy(self):
_test_from_cupy(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_training(self):
import cupy as cp
np.random.seed(1)
cp.random.seed(1)
X = cp.random.randn(50, 10, dtype="float32")
y = cp.random.randn(50, dtype="float32")
weights = np.random.random(50) + 1
cupy_weights = cp.array(weights)
base_margin = np.random.random(50)
cupy_base_margin = cp.array(base_margin)
evals_result_cupy = {}
dtrain_cp = xgb.DMatrix(X, y, weight=cupy_weights, base_margin=cupy_base_margin)
params = {'gpu_id': 0, 'nthread': 1}
xgb.train(params, dtrain_cp, evals=[(dtrain_cp, "train")],
evals_result=evals_result_cupy)
evals_result_np = {}
dtrain_np = xgb.DMatrix(cp.asnumpy(X), cp.asnumpy(y), weight=weights,
base_margin=base_margin)
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
evals_result=evals_result_np)
assert np.array_equal(evals_result_cupy["train"]["rmse"], evals_result_np["train"]["rmse"])
def test_device_dmat_from_cupy(self):
_test_from_cupy(xgb.DeviceQuantileDMatrix)
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_metainfo(self):
import cupy as cp
n = 100
X = np.random.random((n, 2))
dmat_cupy = xgb.DMatrix(X)
dmat = xgb.DMatrix(X)
floats = np.random.random(n)
uints = np.array([4, 2, 8]).astype("uint32")
cupy_floats = cp.array(floats)
cupy_uints = cp.array(uints)
dmat.set_float_info('weight', floats)
dmat.set_float_info('label', floats)
dmat.set_float_info('base_margin', floats)
dmat.set_uint_info('group', uints)
dmat_cupy.set_interface_info('weight', cupy_floats)
dmat_cupy.set_interface_info('label', cupy_floats)
dmat_cupy.set_interface_info('base_margin', cupy_floats)
dmat_cupy.set_interface_info('group', cupy_uints)
def test_cupy_training_device_dmat(self):
_test_cupy_training(xgb.DeviceQuantileDMatrix)
# Test setting info with cupy
assert np.array_equal(dmat.get_float_info('weight'), dmat_cupy.get_float_info('weight'))
assert np.array_equal(dmat.get_float_info('label'), dmat_cupy.get_float_info('label'))
assert np.array_equal(dmat.get_float_info('base_margin'),
dmat_cupy.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr'))
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_training_simple_dmat(self):
_test_cupy_training(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_metainfo_simple_dmat(self):
_test_cupy_metainfo(xgb.DMatrix)
@pytest.mark.skipif(**tm.no_cupy())
def test_cupy_metainfo_device_dmat(self):
_test_cupy_metainfo(xgb.DeviceQuantileDMatrix)

View File

@@ -2,9 +2,10 @@ import numpy as np
import sys
import unittest
import pytest
import xgboost
import xgboost as xgb
sys.path.append("tests/python")
import testing as tm
from regression_test_utilities import run_suite, parameter_combinations, \
assert_results_non_increasing
@@ -40,6 +41,19 @@ class TestGPU(unittest.TestCase):
cpu_results = run_suite(param, select_datasets=datasets)
assert_gpu_results(cpu_results, gpu_results)
@pytest.mark.skipif(**tm.no_cupy())
def test_gpu_hist_device_dmatrix(self):
# DeviceDMatrix does not currently accept sparse formats
device_dmatrix_datasets = ["Boston", "Cancer", "Digits"]
for param in test_param:
param['tree_method'] = 'gpu_hist'
gpu_results_device_dmatrix = run_suite(param, select_datasets=device_dmatrix_datasets,
DMatrixT=xgb.DeviceQuantileDMatrix,
dmatrix_params={'max_bin': param['max_bin']})
assert_results_non_increasing(gpu_results_device_dmatrix, 1e-2)
gpu_results = run_suite(param, select_datasets=device_dmatrix_datasets)
assert_gpu_results(gpu_results, gpu_results_device_dmatrix)
# NOTE(rongou): Because the `Boston` dataset is too small, this only tests external memory mode
# with a single page. To test multiple pages, set DMatrix::kPageSize to, say, 1024.
def test_external_memory(self):
@@ -61,20 +75,20 @@ class TestGPU(unittest.TestCase):
X = np.empty((kRows, kCols))
y = np.empty((kRows))
dtrain = xgboost.DMatrix(X, y)
dtrain = xgb.DMatrix(X, y)
bst = xgboost.train({'verbosity': 2,
'tree_method': 'gpu_hist',
'gpu_id': 0},
dtrain,
verbose_eval=True,
num_boost_round=6,
evals=[(dtrain, 'Train')])
bst = xgb.train({'verbosity': 2,
'tree_method': 'gpu_hist',
'gpu_id': 0},
dtrain,
verbose_eval=True,
num_boost_round=6,
evals=[(dtrain, 'Train')])
kRows = 100
X = np.random.randn(kRows, kCols)
dtest = xgboost.DMatrix(X)
dtest = xgb.DMatrix(X)
predictions = bst.predict(dtest)
np.testing.assert_allclose(predictions, 0.5, 1e-6)