Device dmatrix (#5420)
This commit is contained in:
@@ -44,7 +44,7 @@ case "$suite" in
|
||||
cudf)
|
||||
source activate cudf_test
|
||||
install_xgboost
|
||||
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_columnar.py tests/python-gpu/test_from_cupy.py
|
||||
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_cudf.py tests/python-gpu/test_from_cupy.py
|
||||
;;
|
||||
|
||||
cpu)
|
||||
|
||||
@@ -284,5 +284,28 @@ TEST(hist_util, AdapterDeviceSketchBatches) {
|
||||
ValidateCuts(cuts, dmat.get(), num_bins);
|
||||
}
|
||||
}
|
||||
|
||||
// Check sketching from adapter or DMatrix results in the same answer
|
||||
// Consistency here is useful for testing and user experience
|
||||
TEST(hist_util, SketchingEquivalent) {
|
||||
int bin_sizes[] = {2, 16, 256, 512};
|
||||
int sizes[] = {100, 1000, 1500};
|
||||
int num_columns = 5;
|
||||
for (auto num_rows : sizes) {
|
||||
auto x = GenerateRandom(num_rows, num_columns);
|
||||
auto dmat = GetDMatrixFromData(x, num_rows, num_columns);
|
||||
for (auto num_bins : bin_sizes) {
|
||||
auto dmat_cuts = DeviceSketch(0, dmat.get(), num_bins);
|
||||
auto x_device = thrust::device_vector<float>(x);
|
||||
auto adapter = AdapterFromData(x_device, num_rows, num_columns);
|
||||
auto adapter_cuts = AdapterDeviceSketch(
|
||||
&adapter, num_bins, std::numeric_limits<float>::quiet_NaN());
|
||||
EXPECT_EQ(dmat_cuts.Values(), adapter_cuts.Values());
|
||||
EXPECT_EQ(dmat_cuts.Ptrs(), adapter_cuts.Ptrs());
|
||||
EXPECT_EQ(dmat_cuts.MinValues(), adapter_cuts.MinValues());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
131
tests/cpp/data/test_device_dmatrix.cu
Normal file
131
tests/cpp/data/test_device_dmatrix.cu
Normal file
@@ -0,0 +1,131 @@
|
||||
|
||||
// Copyright (c) 2019 by Contributors
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/data.h>
|
||||
#include "../../../src/data/adapter.h"
|
||||
#include "../../../src/data/ellpack_page.cuh"
|
||||
#include "../../../src/data/device_dmatrix.h"
|
||||
#include "../helpers.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
#include "../../../src/gbm/gbtree_model.h"
|
||||
#include "../common/test_hist_util.h"
|
||||
#include "../../../src/common/compressed_iterator.h"
|
||||
#include "../../../src/common/math.h"
|
||||
#include "test_array_interface.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
// Builds a DeviceDMatrix from dense row-major device data and checks that every
// stored symbol equals the bin the page's own cuts assign to the source value.
TEST(DeviceDMatrix, RowMajor) {
  const int kNumRows = 1000;
  const int kNumColumns = 50;
  auto host_data = common::GenerateRandom(kNumRows, kNumColumns);
  thrust::device_vector<float> device_data(host_data);
  auto adapter = common::AdapterFromData(device_data, kNumRows, kNumColumns);

  const float kMissing = std::numeric_limits<float>::quiet_NaN();
  data::DeviceDMatrix dmat(&adapter, kMissing, 1, 256);

  auto &page = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto page_impl = page.Impl();
  common::CompressedIterator<uint32_t> symbols(
      page_impl->gidx_buffer.HostVector().data(), page_impl->NumSymbols());

  // Element `pos` of the flat input belongs to column `pos % kNumColumns`.
  for (unsigned long long pos = 0; pos < host_data.size(); ++pos) {
    const int feature_idx = pos % kNumColumns;
    EXPECT_EQ(page_impl->cuts_.SearchBin(host_data[pos], feature_idx),
              symbols[pos]);
  }

  EXPECT_EQ(dmat.Info().num_col_, kNumColumns);
  EXPECT_EQ(dmat.Info().num_row_, kNumRows);
  EXPECT_EQ(dmat.Info().num_nonzero_, kNumRows * kNumColumns);
}
|
||||
|
||||
// Same as RowMajor, but with three NaN entries that must be stored as the
// null symbol and excluded from the non-zero count.
TEST(DeviceDMatrix, RowMajorMissing) {
  const float kMissing = std::numeric_limits<float>::quiet_NaN();
  const int kNumRows = 10;
  const int kNumColumns = 2;

  auto host_data = common::GenerateRandom(kNumRows, kNumColumns);
  for (const int hole : {1, 5, 6}) {
    host_data[hole] = kMissing;
  }
  thrust::device_vector<float> device_data(host_data);
  auto adapter = common::AdapterFromData(device_data, kNumRows, kNumColumns);

  data::DeviceDMatrix dmat(&adapter, kMissing, 1, 256);

  auto &page = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto page_impl = page.Impl();
  common::CompressedIterator<uint32_t> symbols(
      page_impl->gidx_buffer.HostVector().data(), page_impl->NumSymbols());

  EXPECT_EQ(symbols[1], page_impl->GetDeviceAccessor(0).NullValue());
  EXPECT_EQ(symbols[5], page_impl->GetDeviceAccessor(0).NullValue());
  // null values get placed after valid values in a row, so the missing input
  // at flat index 6 shows up as a null symbol at index 7.
  EXPECT_EQ(symbols[7], page_impl->GetDeviceAccessor(0).NullValue());

  EXPECT_EQ(dmat.Info().num_col_, kNumColumns);
  EXPECT_EQ(dmat.Info().num_row_, kNumRows);
  EXPECT_EQ(dmat.Info().num_nonzero_, kNumRows * kNumColumns - 3);
}
|
||||
|
||||
// Builds a DeviceDMatrix from two columnar device arrays (one double, one
// uint32) described through a JSON array-interface string, then checks that
// each stored symbol matches the bin found by the page's own cuts.
TEST(DeviceDMatrix, ColumnMajor) {
  constexpr size_t kRows{100};
  std::vector<Json> columns;
  thrust::device_vector<double> d_data_0(kRows);
  thrust::device_vector<uint32_t> d_data_1(kRows);

  columns.emplace_back(GenerateDenseColumn<double>("<f8", kRows, &d_data_0));
  columns.emplace_back(GenerateDenseColumn<uint32_t>("<u4", kRows, &d_data_1));
  // Single source of truth for the column count: it is the row stride of the
  // compressed buffer as well as the expected MetaInfo column count.
  const size_t num_columns = columns.size();

  Json column_arr{columns};

  std::stringstream ss;
  Json::Dump(column_arr, &ss);
  std::string str = ss.str();

  data::CudfAdapter adapter(str);
  data::DeviceDMatrix dmat(&adapter, std::numeric_limits<float>::quiet_NaN(),
                           -1, 256);
  auto &batch = *dmat.GetBatches<EllpackPage>({0, 256, 0}).begin();
  auto impl = batch.Impl();
  common::CompressedIterator<uint32_t> iterator(
      impl->gidx_buffer.HostVector().data(), impl->NumSymbols());

  for (auto i = 0ull; i < kRows; i++) {
    for (auto j = 0ull; j < num_columns; j++) {
      // The page is indexed row-major even though the input was column-major.
      if (j == 0) {
        EXPECT_EQ(iterator[i * num_columns + j],
                  impl->cuts_.SearchBin(d_data_0[i], j));
      } else {
        EXPECT_EQ(iterator[i * num_columns + j],
                  impl->cuts_.SearchBin(d_data_1[i], j));
      }
    }
  }
  EXPECT_EQ(dmat.Info().num_col_, num_columns);
  EXPECT_EQ(dmat.Info().num_row_, kRows);
  EXPECT_EQ(dmat.Info().num_nonzero_, kRows * num_columns);
}
|
||||
|
||||
// Test equivalence with simple DMatrix
|
||||
TEST(DeviceDMatrix, Equivalent) {
|
||||
int bin_sizes[] = {2, 16, 256, 512};
|
||||
int sizes[] = {100, 1000, 1500};
|
||||
int num_columns = 5;
|
||||
for (auto num_rows : sizes) {
|
||||
auto x = common::GenerateRandom(num_rows, num_columns);
|
||||
for (auto num_bins : bin_sizes) {
|
||||
auto dmat = common::GetDMatrixFromData(x, num_rows, num_columns);
|
||||
auto x_device = thrust::device_vector<float>(x);
|
||||
auto adapter = common::AdapterFromData(x_device, num_rows, num_columns);
|
||||
data::DeviceDMatrix device_dmat(
|
||||
&adapter, std::numeric_limits<float>::quiet_NaN(), 1, num_bins);
|
||||
|
||||
const auto &batch = *dmat->GetBatches<EllpackPage>({0, num_bins}).begin();
|
||||
const auto &device_dmat_batch =
|
||||
*device_dmat.GetBatches<EllpackPage>({0, num_bins}).begin();
|
||||
|
||||
ASSERT_EQ(batch.Impl()->cuts_.Values(), device_dmat_batch.Impl()->cuts_.Values());
|
||||
ASSERT_EQ(batch.Impl()->gidx_buffer.HostVector(),
|
||||
device_dmat_batch.Impl()->gidx_buffer.HostVector());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,148 +0,0 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
|
||||
|
||||
def dmatrix_from_cudf(input_type, missing=np.nan):
    '''Build an ``xgb.DMatrix`` from a cudf DataFrame and check its shape.

    Parameters
    ----------
    input_type :
        NumPy dtype that the first two feature columns and the label are
        cast to before being handed to cudf.
    missing :
        Value written into two cells of the data and passed to the DMatrix
        constructor as its ``missing`` argument.  Spelled ``np.nan`` because
        the ``np.NAN`` alias no longer exists in NumPy 2.0.
    '''
    import cudf
    import pandas as pd

    kRows = 80
    kCols = 3

    na = np.random.randn(kRows, kCols)
    na[:, 0:2] = na[:, 0:2].astype(input_type)

    # Plant one missing value in each of the first two columns.
    na[5, 0] = missing
    na[3, 1] = missing

    pa = pd.DataFrame({'0': na[:, 0],
                       '1': na[:, 1],
                       '2': na[:, 2].astype(np.int32)})

    np_label = np.random.randn(kRows).astype(input_type)
    pa_label = pd.DataFrame(np_label)

    cd = cudf.from_pandas(pa)
    # The label is passed as a cudf Series (first column of the frame).
    cd_label = cudf.from_pandas(pa_label).iloc[:, 0]

    dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
    assert dtrain.num_col() == kCols
    assert dtrain.num_row() == kRows
|
||||
|
||||
|
||||
class TestFromColumnar:
    '''Tests for constructing DMatrix from data structure conforming Apache
    Arrow specification.'''

    @pytest.mark.skipif(**tm.no_cudf())
    def test_from_cudf(self):
        '''Test constructing DMatrix from cudf'''
        import cudf
        dmatrix_from_cudf(np.float32, np.nan)
        dmatrix_from_cudf(np.float64, np.nan)

        dmatrix_from_cudf(np.int8, 2)
        dmatrix_from_cudf(np.int32, -2)
        dmatrix_from_cudf(np.int64, -3)

        # Feature names and types are taken from the DataFrame columns.
        cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
        dtrain = xgb.DMatrix(cd)

        assert dtrain.feature_names == ['x', 'y']
        assert dtrain.feature_types == ['int', 'float']

        # A single cudf Series is accepted as data as well.
        series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
        assert isinstance(series, cudf.Series)
        dtrain = xgb.DMatrix(series)

        assert dtrain.feature_names == ['x']
        assert dtrain.feature_types == ['int']

        # A two-column frame is expected to be rejected as a label.
        with pytest.raises(Exception):
            xgb.DMatrix(cd, label=cd)

        # Test when number of elements is less than 8
        X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.nan, 4],
                                             dtype=np.int32)})
        dtrain = xgb.DMatrix(X)
        assert dtrain.num_col() == 1
        assert dtrain.num_row() == 5

        # Boolean is not supported.
        X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
        with pytest.raises(Exception):
            xgb.DMatrix(X_boolean)

        y_boolean = cudf.DataFrame({
            'x': cudf.Series([True, False, True, True, True])})
        # NOTE(review): X_boolean alone is already expected to raise above, so
        # this check does not isolate rejection of a boolean label.
        with pytest.raises(Exception):
            xgb.DMatrix(X_boolean, label=y_boolean)

    @pytest.mark.skipif(**tm.no_cudf())
    def test_cudf_training(self):
        '''Training on cudf input must give the same RMSE history as
        training on the equivalent pandas/NumPy input.'''
        from cudf import DataFrame as df
        import pandas as pd
        np.random.seed(1)
        X = pd.DataFrame(np.random.randn(50, 10))
        y = pd.DataFrame(np.random.randn(50))
        weights = np.random.random(50) + 1.0
        cudf_weights = df.from_pandas(pd.DataFrame(weights))
        base_margin = np.random.random(50)
        cudf_base_margin = df.from_pandas(pd.DataFrame(base_margin))

        evals_result_cudf = {}
        dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
                                  base_margin=cudf_base_margin)
        params = {'gpu_id': 0}
        xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
                  evals_result=evals_result_cudf)
        evals_result_np = {}
        dtrain_np = xgb.DMatrix(X, y, weight=weights, base_margin=base_margin)
        xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
                  evals_result=evals_result_np)
        assert np.array_equal(evals_result_cudf["train"]["rmse"], evals_result_np["train"]["rmse"])

    @pytest.mark.skipif(**tm.no_cudf())
    def test_cudf_metainfo(self):
        '''Meta info set from cudf objects must equal meta info set from the
        corresponding NumPy arrays.'''
        from cudf import DataFrame as df
        import pandas as pd
        n = 100
        X = np.random.random((n, 2))
        # Build the matrix under test from cudf data; the reference matrix
        # below is built from the NumPy array.
        dmat_cudf = xgb.DMatrix(df.from_pandas(pd.DataFrame(X)))
        dmat = xgb.DMatrix(X)
        floats = np.random.random(n)
        uints = np.array([4, 2, 8]).astype("uint32")
        cudf_floats = df.from_pandas(pd.DataFrame(floats))
        cudf_uints = df.from_pandas(pd.DataFrame(uints))
        dmat.set_float_info('weight', floats)
        dmat.set_float_info('label', floats)
        dmat.set_float_info('base_margin', floats)
        dmat.set_uint_info('group', uints)
        dmat_cudf.set_interface_info('weight', cudf_floats)
        dmat_cudf.set_interface_info('label', cudf_floats)
        dmat_cudf.set_interface_info('base_margin', cudf_floats)
        dmat_cudf.set_interface_info('group', cudf_uints)

        # Test setting info with cudf DataFrame
        assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
        assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
        assert np.array_equal(dmat.get_float_info('base_margin'),
                              dmat_cudf.get_float_info('base_margin'))
        assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))

        # Test setting info with cudf Series
        dmat_cudf.set_interface_info('weight', cudf_floats[cudf_floats.columns[0]])
        dmat_cudf.set_interface_info('label', cudf_floats[cudf_floats.columns[0]])
        dmat_cudf.set_interface_info('base_margin', cudf_floats[cudf_floats.columns[0]])
        dmat_cudf.set_interface_info('group', cudf_uints[cudf_uints.columns[0]])
        assert np.array_equal(dmat.get_float_info('weight'), dmat_cudf.get_float_info('weight'))
        assert np.array_equal(dmat.get_float_info('label'), dmat_cudf.get_float_info('label'))
        assert np.array_equal(dmat.get_float_info('base_margin'),
                              dmat_cudf.get_float_info('base_margin'))
        assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))
|
||||
172
tests/python-gpu/test_from_cudf.py
Normal file
172
tests/python-gpu/test_from_cudf.py
Normal file
@@ -0,0 +1,172 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
|
||||
|
||||
def dmatrix_from_cudf(input_type, DMatrixT, missing=np.nan):
    '''Build a ``DMatrixT`` from a cudf DataFrame and check its shape.

    Parameters
    ----------
    input_type :
        NumPy dtype that the first two feature columns and the label are
        cast to before being handed to cudf.
    DMatrixT :
        Callable used to construct the matrix; it is invoked as
        ``DMatrixT(data, missing=..., label=...)``.
    missing :
        Value written into two cells of the data and forwarded to the
        constructor as ``missing``.  Spelled ``np.nan`` because the
        ``np.NAN`` alias no longer exists in NumPy 2.0.
    '''
    import cudf
    import pandas as pd

    kRows = 80
    kCols = 3

    na = np.random.randn(kRows, kCols)
    na[:, 0:2] = na[:, 0:2].astype(input_type)

    # Plant one missing value in each of the first two columns.
    na[5, 0] = missing
    na[3, 1] = missing

    pa = pd.DataFrame({'0': na[:, 0],
                       '1': na[:, 1],
                       '2': na[:, 2].astype(np.int32)})

    np_label = np.random.randn(kRows).astype(input_type)
    pa_label = pd.DataFrame(np_label)

    cd = cudf.from_pandas(pa)
    # The label is passed as a cudf Series (first column of the frame).
    cd_label = cudf.from_pandas(pa_label).iloc[:, 0]

    dtrain = DMatrixT(cd, missing=missing, label=cd_label)
    assert dtrain.num_col() == kCols
    assert dtrain.num_row() == kRows
|
||||
|
||||
|
||||
def _test_from_cudf(DMatrixT):
|
||||
'''Test constructing DMatrix from cudf'''
|
||||
import cudf
|
||||
dmatrix_from_cudf(np.float32, DMatrixT, np.NAN)
|
||||
dmatrix_from_cudf(np.float64, DMatrixT, np.NAN)
|
||||
|
||||
dmatrix_from_cudf(np.int8, DMatrixT, 2)
|
||||
dmatrix_from_cudf(np.int32, DMatrixT, -2)
|
||||
dmatrix_from_cudf(np.int64, DMatrixT, -3)
|
||||
|
||||
cd = cudf.DataFrame({'x': [1, 2, 3], 'y': [0.1, 0.2, 0.3]})
|
||||
dtrain = DMatrixT(cd)
|
||||
|
||||
assert dtrain.feature_names == ['x', 'y']
|
||||
assert dtrain.feature_types == ['int', 'float']
|
||||
|
||||
series = cudf.DataFrame({'x': [1, 2, 3]}).iloc[:, 0]
|
||||
assert isinstance(series, cudf.Series)
|
||||
dtrain = DMatrixT(series)
|
||||
|
||||
assert dtrain.feature_names == ['x']
|
||||
assert dtrain.feature_types == ['int']
|
||||
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(cd, label=cd)
|
||||
|
||||
# Test when number of elements is less than 8
|
||||
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],
|
||||
dtype=np.int32)})
|
||||
dtrain = DMatrixT(X)
|
||||
assert dtrain.num_col() == 1
|
||||
assert dtrain.num_row() == 5
|
||||
|
||||
# Boolean is not supported.
|
||||
X_boolean = cudf.DataFrame({'x': cudf.Series([True, False])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean)
|
||||
|
||||
y_boolean = cudf.DataFrame({
|
||||
'x': cudf.Series([True, False, True, True, True])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = DMatrixT(X_boolean, label=y_boolean)
|
||||
|
||||
|
||||
def _test_cudf_training(DMatrixT):
    '''Train once on cudf input wrapped in ``DMatrixT`` and once on the same
    data in an ``xgb.DMatrix`` built from pandas/NumPy, then require the two
    training RMSE histories to be equal.'''
    from cudf import DataFrame as df
    import pandas as pd

    # Host-side inputs; the order of the random draws is fixed by the seed.
    np.random.seed(1)
    features = pd.DataFrame(np.random.randn(50, 10))
    labels = pd.DataFrame(np.random.randn(50))
    weights = np.random.random(50) + 1.0
    gpu_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    gpu_base_margin = df.from_pandas(pd.DataFrame(base_margin))

    params = {'gpu_id': 0, 'tree_method': 'gpu_hist'}

    def rmse_history(dmat):
        # Train with the matrix as its own evaluation set and hand back the
        # recorded metric values.
        history = {}
        xgb.train(params, dmat, evals=[(dmat, "train")],
                  evals_result=history)
        return history["train"]["rmse"]

    dtrain_cudf = DMatrixT(df.from_pandas(features), df.from_pandas(labels),
                           weight=gpu_weights, base_margin=gpu_base_margin)
    rmse_cudf = rmse_history(dtrain_cudf)

    dtrain_np = xgb.DMatrix(features, labels, weight=weights,
                            base_margin=base_margin)
    rmse_np = rmse_history(dtrain_np)

    assert np.array_equal(rmse_cudf, rmse_np)
|
||||
|
||||
|
||||
def _test_cudf_metainfo(DMatrixT):
    '''Set weight/label/base_margin/group on a ``DMatrixT`` from cudf
    objects and require the stored values to equal those of a reference
    ``xgb.DMatrix`` populated from NumPy arrays.'''
    from cudf import DataFrame as df
    import pandas as pd

    n = 100
    X = np.random.random((n, 2))
    dmat_cudf = DMatrixT(df.from_pandas(pd.DataFrame(X)))
    dmat = xgb.DMatrix(X)

    floats = np.random.random(n)
    uints = np.array([4, 2, 8]).astype("uint32")
    cudf_floats = df.from_pandas(pd.DataFrame(floats))
    cudf_uints = df.from_pandas(pd.DataFrame(uints))

    float_fields = ('weight', 'label', 'base_margin')

    # Reference values, set through the NumPy setters.
    for field in float_fields:
        dmat.set_float_info(field, floats)
    dmat.set_uint_info('group', uints)

    def set_from_cudf(float_values, uint_values):
        for field in float_fields:
            dmat_cudf.set_interface_info(field, float_values)
        dmat_cudf.set_interface_info('group', uint_values)

    def check_against_reference():
        for field in float_fields:
            assert np.array_equal(dmat.get_float_info(field),
                                  dmat_cudf.get_float_info(field))
        # Group sizes are read back under the 'group_ptr' key.
        assert np.array_equal(dmat.get_uint_info('group_ptr'),
                              dmat_cudf.get_uint_info('group_ptr'))

    # Test setting info with cudf DataFrame
    set_from_cudf(cudf_floats, cudf_uints)
    check_against_reference()

    # Test setting info with cudf Series
    set_from_cudf(cudf_floats[cudf_floats.columns[0]],
                  cudf_uints[cudf_uints.columns[0]])
    check_against_reference()
|
||||
|
||||
|
||||
class TestFromColumnar:
    '''Tests for constructing DMatrix from data structure conforming Apache
    Arrow specification.

    Each scenario runs twice: once with ``xgb.DMatrix`` and once with
    ``xgb.DeviceQuantileDMatrix``.'''

    # Shared marker: every test in this class is skipped under the
    # condition reported by ``tm.no_cudf()``.
    requires_cudf = pytest.mark.skipif(**tm.no_cudf())

    @requires_cudf
    def test_simple_dmatrix_from_cudf(self):
        '''Construction checks with ``xgb.DMatrix``.'''
        _test_from_cudf(xgb.DMatrix)

    @requires_cudf
    def test_device_dmatrix_from_cudf(self):
        '''Construction checks with ``xgb.DeviceQuantileDMatrix``.'''
        _test_from_cudf(xgb.DeviceQuantileDMatrix)

    @requires_cudf
    def test_cudf_training_simple_dmatrix(self):
        '''Training comparison with ``xgb.DMatrix``.'''
        _test_cudf_training(xgb.DMatrix)

    @requires_cudf
    def test_cudf_training_device_dmatrix(self):
        '''Training comparison with ``xgb.DeviceQuantileDMatrix``.'''
        _test_cudf_training(xgb.DeviceQuantileDMatrix)

    @requires_cudf
    def test_cudf_metainfo_simple_dmatrix(self):
        '''Meta info checks with ``xgb.DMatrix``.'''
        _test_cudf_metainfo(xgb.DMatrix)

    @requires_cudf
    def test_cudf_metainfo_device_dmatrix(self):
        '''Meta info checks with ``xgb.DeviceQuantileDMatrix``.'''
        _test_cudf_metainfo(xgb.DeviceQuantileDMatrix)
|
||||
@@ -7,7 +7,7 @@ sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
|
||||
|
||||
def dmatrix_from_cupy(input_type, missing=np.NAN):
|
||||
def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
|
||||
'''Test constructing DMatrix from cupy'''
|
||||
import cupy as cp
|
||||
|
||||
@@ -19,82 +19,106 @@ def dmatrix_from_cupy(input_type, missing=np.NAN):
|
||||
X[5, 0] = missing
|
||||
X[3, 1] = missing
|
||||
y = cp.random.randn(kRows).astype(dtype=input_type)
|
||||
dtrain = xgb.DMatrix(X, missing=missing, label=y)
|
||||
dtrain = DMatrixT(X, missing=missing, label=y)
|
||||
assert dtrain.num_col() == kCols
|
||||
assert dtrain.num_row() == kRows
|
||||
return dtrain
|
||||
|
||||
|
||||
def _test_from_cupy(DMatrixT):
|
||||
'''Test constructing DMatrix from cupy'''
|
||||
import cupy as cp
|
||||
dmatrix_from_cupy(np.float32, DMatrixT, np.NAN)
|
||||
dmatrix_from_cupy(np.float64, DMatrixT, np.NAN)
|
||||
|
||||
dmatrix_from_cupy(np.uint8, DMatrixT, 2)
|
||||
dmatrix_from_cupy(np.uint32, DMatrixT, 3)
|
||||
dmatrix_from_cupy(np.uint64, DMatrixT, 4)
|
||||
|
||||
dmatrix_from_cupy(np.int8, DMatrixT, 2)
|
||||
dmatrix_from_cupy(np.int32, DMatrixT, -2)
|
||||
dmatrix_from_cupy(np.int64, DMatrixT, -3)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
X = cp.random.randn(2, 2, dtype="float32")
|
||||
dtrain = DMatrixT(X, label=X)
|
||||
|
||||
|
||||
def _test_cupy_training(DMatrixT):
    '''Train once on cupy input wrapped in ``DMatrixT`` and once on the same
    data copied to NumPy in an ``xgb.DMatrix``, then require the two
    training RMSE histories to be equal.'''
    import cupy as cp

    # Seed both generators; features/labels come from cupy, weights and
    # base margin from NumPy.
    np.random.seed(1)
    cp.random.seed(1)
    features = cp.random.randn(50, 10, dtype="float32")
    labels = cp.random.randn(50, dtype="float32")
    weights = np.random.random(50) + 1
    gpu_weights = cp.array(weights)
    base_margin = np.random.random(50)
    gpu_base_margin = cp.array(base_margin)

    params = {'gpu_id': 0, 'nthread': 1, 'tree_method': 'gpu_hist'}

    def rmse_history(dmat):
        # Train with the matrix as its own evaluation set and hand back the
        # recorded metric values.
        history = {}
        xgb.train(params, dmat, evals=[(dmat, "train")],
                  evals_result=history)
        return history["train"]["rmse"]

    dtrain_cp = DMatrixT(features, labels, weight=gpu_weights,
                         base_margin=gpu_base_margin)
    rmse_cupy = rmse_history(dtrain_cp)

    dtrain_np = xgb.DMatrix(cp.asnumpy(features), cp.asnumpy(labels),
                            weight=weights, base_margin=base_margin)
    rmse_np = rmse_history(dtrain_np)

    assert np.array_equal(rmse_cupy, rmse_np)
|
||||
|
||||
|
||||
def _test_cupy_metainfo(DMatrixT):
    '''Set weight/label/base_margin/group on a ``DMatrixT`` from cupy arrays
    and require the stored values to equal those of a reference
    ``xgb.DMatrix`` populated from NumPy arrays.'''
    import cupy as cp

    n = 100
    X = np.random.random((n, 2))
    dmat_cupy = DMatrixT(cp.array(X))
    dmat = xgb.DMatrix(X)

    floats = np.random.random(n)
    uints = np.array([4, 2, 8]).astype("uint32")
    cupy_floats = cp.array(floats)
    cupy_uints = cp.array(uints)

    float_fields = ('weight', 'label', 'base_margin')

    # Reference values, set through the NumPy setters.
    for field in float_fields:
        dmat.set_float_info(field, floats)
    dmat.set_uint_info('group', uints)

    # Same values, set through the array-interface setter.
    for field in float_fields:
        dmat_cupy.set_interface_info(field, cupy_floats)
    dmat_cupy.set_interface_info('group', cupy_uints)

    # Test setting info with cupy
    for field in float_fields:
        assert np.array_equal(dmat.get_float_info(field),
                              dmat_cupy.get_float_info(field))
    # Group sizes are read back under the 'group_ptr' key.
    assert np.array_equal(dmat.get_uint_info('group_ptr'),
                          dmat_cupy.get_uint_info('group_ptr'))
|
||||
|
||||
|
||||
class TestFromArrayInterface:
|
||||
'''Tests for constructing DMatrix from data structure conforming Apache
|
||||
Arrow specification.'''
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_from_cupy(self):
|
||||
'''Test constructing DMatrix from cupy'''
|
||||
import cupy as cp
|
||||
dmatrix_from_cupy(np.float32, np.NAN)
|
||||
dmatrix_from_cupy(np.float64, np.NAN)
|
||||
|
||||
dmatrix_from_cupy(np.uint8, 2)
|
||||
dmatrix_from_cupy(np.uint32, 3)
|
||||
dmatrix_from_cupy(np.uint64, 4)
|
||||
|
||||
dmatrix_from_cupy(np.int8, 2)
|
||||
dmatrix_from_cupy(np.int32, -2)
|
||||
dmatrix_from_cupy(np.int64, -3)
|
||||
|
||||
with pytest.raises(Exception):
|
||||
X = cp.random.randn(2, 2, dtype="float32")
|
||||
dtrain = xgb.DMatrix(X, label=X)
|
||||
def test_simple_dmat_from_cupy(self):
|
||||
_test_from_cupy(xgb.DMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_cupy_training(self):
|
||||
import cupy as cp
|
||||
np.random.seed(1)
|
||||
cp.random.seed(1)
|
||||
X = cp.random.randn(50, 10, dtype="float32")
|
||||
y = cp.random.randn(50, dtype="float32")
|
||||
weights = np.random.random(50) + 1
|
||||
cupy_weights = cp.array(weights)
|
||||
base_margin = np.random.random(50)
|
||||
cupy_base_margin = cp.array(base_margin)
|
||||
|
||||
evals_result_cupy = {}
|
||||
dtrain_cp = xgb.DMatrix(X, y, weight=cupy_weights, base_margin=cupy_base_margin)
|
||||
params = {'gpu_id': 0, 'nthread': 1}
|
||||
xgb.train(params, dtrain_cp, evals=[(dtrain_cp, "train")],
|
||||
evals_result=evals_result_cupy)
|
||||
evals_result_np = {}
|
||||
dtrain_np = xgb.DMatrix(cp.asnumpy(X), cp.asnumpy(y), weight=weights,
|
||||
base_margin=base_margin)
|
||||
xgb.train(params, dtrain_np, evals=[(dtrain_np, "train")],
|
||||
evals_result=evals_result_np)
|
||||
assert np.array_equal(evals_result_cupy["train"]["rmse"], evals_result_np["train"]["rmse"])
|
||||
def test_device_dmat_from_cupy(self):
|
||||
_test_from_cupy(xgb.DeviceQuantileDMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_cupy_metainfo(self):
|
||||
import cupy as cp
|
||||
n = 100
|
||||
X = np.random.random((n, 2))
|
||||
dmat_cupy = xgb.DMatrix(X)
|
||||
dmat = xgb.DMatrix(X)
|
||||
floats = np.random.random(n)
|
||||
uints = np.array([4, 2, 8]).astype("uint32")
|
||||
cupy_floats = cp.array(floats)
|
||||
cupy_uints = cp.array(uints)
|
||||
dmat.set_float_info('weight', floats)
|
||||
dmat.set_float_info('label', floats)
|
||||
dmat.set_float_info('base_margin', floats)
|
||||
dmat.set_uint_info('group', uints)
|
||||
dmat_cupy.set_interface_info('weight', cupy_floats)
|
||||
dmat_cupy.set_interface_info('label', cupy_floats)
|
||||
dmat_cupy.set_interface_info('base_margin', cupy_floats)
|
||||
dmat_cupy.set_interface_info('group', cupy_uints)
|
||||
def test_cupy_training_device_dmat(self):
|
||||
_test_cupy_training(xgb.DeviceQuantileDMatrix)
|
||||
|
||||
# Test setting info with cupy
|
||||
assert np.array_equal(dmat.get_float_info('weight'), dmat_cupy.get_float_info('weight'))
|
||||
assert np.array_equal(dmat.get_float_info('label'), dmat_cupy.get_float_info('label'))
|
||||
assert np.array_equal(dmat.get_float_info('base_margin'),
|
||||
dmat_cupy.get_float_info('base_margin'))
|
||||
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr'))
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_cupy_training_simple_dmat(self):
|
||||
_test_cupy_training(xgb.DMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_cupy_metainfo_simple_dmat(self):
|
||||
_test_cupy_metainfo(xgb.DMatrix)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
|
||||
def test_cupy_metainfo_device_dmat(self):
|
||||
_test_cupy_metainfo(xgb.DeviceQuantileDMatrix)
|
||||
|
||||
@@ -2,9 +2,10 @@ import numpy as np
|
||||
import sys
|
||||
import unittest
|
||||
import pytest
|
||||
import xgboost
|
||||
import xgboost as xgb
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
from regression_test_utilities import run_suite, parameter_combinations, \
|
||||
assert_results_non_increasing
|
||||
|
||||
@@ -40,6 +41,19 @@ class TestGPU(unittest.TestCase):
|
||||
cpu_results = run_suite(param, select_datasets=datasets)
|
||||
assert_gpu_results(cpu_results, gpu_results)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cupy())
def test_gpu_hist_device_dmatrix(self):
    '''Run the gpu_hist suite with ``xgb.DeviceQuantileDMatrix`` input and
    compare it with the same suite run on the default matrix type.'''
    # DeviceDMatrix does not currently accept sparse formats
    dense_datasets = ["Boston", "Cancer", "Digits"]
    for param in test_param:
        # Note: this writes into the shared parameter dict.
        param['tree_method'] = 'gpu_hist'

        # The matrix is built with the same max_bin as the training params.
        device_results = run_suite(
            param,
            select_datasets=dense_datasets,
            DMatrixT=xgb.DeviceQuantileDMatrix,
            dmatrix_params={'max_bin': param['max_bin']})
        assert_results_non_increasing(device_results, 1e-2)

        baseline_results = run_suite(param, select_datasets=dense_datasets)
        assert_gpu_results(baseline_results, device_results)
|
||||
|
||||
# NOTE(rongou): Because the `Boston` dataset is too small, this only tests external memory mode
|
||||
# with a single page. To test multiple pages, set DMatrix::kPageSize to, say, 1024.
|
||||
def test_external_memory(self):
|
||||
@@ -61,20 +75,20 @@ class TestGPU(unittest.TestCase):
|
||||
X = np.empty((kRows, kCols))
|
||||
y = np.empty((kRows))
|
||||
|
||||
dtrain = xgboost.DMatrix(X, y)
|
||||
dtrain = xgb.DMatrix(X, y)
|
||||
|
||||
bst = xgboost.train({'verbosity': 2,
|
||||
'tree_method': 'gpu_hist',
|
||||
'gpu_id': 0},
|
||||
dtrain,
|
||||
verbose_eval=True,
|
||||
num_boost_round=6,
|
||||
evals=[(dtrain, 'Train')])
|
||||
bst = xgb.train({'verbosity': 2,
|
||||
'tree_method': 'gpu_hist',
|
||||
'gpu_id': 0},
|
||||
dtrain,
|
||||
verbose_eval=True,
|
||||
num_boost_round=6,
|
||||
evals=[(dtrain, 'Train')])
|
||||
|
||||
kRows = 100
|
||||
X = np.random.randn(kRows, kCols)
|
||||
|
||||
dtest = xgboost.DMatrix(X)
|
||||
dtest = xgb.DMatrix(X)
|
||||
predictions = bst.predict(dtest)
|
||||
np.testing.assert_allclose(predictions, 0.5, 1e-6)
|
||||
|
||||
|
||||
@@ -84,7 +84,8 @@ def get_weights_regression(min_weight, max_weight):
|
||||
return X, y, w
|
||||
|
||||
|
||||
def train_dataset(dataset, param_in, num_rounds=10, scale_features=False):
|
||||
def train_dataset(dataset, param_in, num_rounds=10, scale_features=False, DMatrixT=xgb.DMatrix,
|
||||
dmatrix_params={}):
|
||||
param = param_in.copy()
|
||||
param["objective"] = dataset.objective
|
||||
if dataset.objective == "multi:softmax":
|
||||
@@ -99,10 +100,13 @@ def train_dataset(dataset, param_in, num_rounds=10, scale_features=False):
|
||||
if dataset.use_external_memory:
|
||||
np.savetxt('tmptmp_1234.csv', np.hstack((dataset.y.reshape(len(dataset.y), 1), X)),
|
||||
delimiter=',')
|
||||
dtrain = xgb.DMatrix('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
|
||||
dtrain = DMatrixT('tmptmp_1234.csv?format=csv&label_column=0#tmptmp_',
|
||||
weight=dataset.w)
|
||||
elif DMatrixT is xgb.DeviceQuantileDMatrix:
|
||||
import cupy as cp
|
||||
dtrain = DMatrixT(cp.array(X), dataset.y, weight=dataset.w, **dmatrix_params)
|
||||
else:
|
||||
dtrain = xgb.DMatrix(X, dataset.y, weight=dataset.w)
|
||||
dtrain = DMatrixT(X, dataset.y, weight=dataset.w, **dmatrix_params)
|
||||
|
||||
print("Training on dataset: " + dataset.name, file=sys.stderr)
|
||||
print("Using parameters: " + str(param), file=sys.stderr)
|
||||
@@ -139,7 +143,8 @@ def parameter_combinations(variable_param):
|
||||
return result
|
||||
|
||||
|
||||
def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False):
|
||||
def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False,
|
||||
DMatrixT=xgb.DMatrix, dmatrix_params={}):
|
||||
"""
|
||||
Run the given parameters on a range of datasets. Objective and eval metric will be automatically set
|
||||
"""
|
||||
@@ -162,7 +167,8 @@ def run_suite(param, num_rounds=10, select_datasets=None, scale_features=False):
|
||||
for d in datasets:
|
||||
if select_datasets is None or d.name in select_datasets:
|
||||
results.append(
|
||||
train_dataset(d, param, num_rounds=num_rounds, scale_features=scale_features))
|
||||
train_dataset(d, param, num_rounds=num_rounds, scale_features=scale_features,
|
||||
DMatrixT=DMatrixT, dmatrix_params=dmatrix_params))
|
||||
return results
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user