Support dmatrix construction from cupy array (#5206)

This commit is contained in:
Rory Mitchell
2020-01-22 13:15:27 +13:00
committed by GitHub
parent 2a071cebc5
commit 9c56480c61
19 changed files with 522 additions and 158 deletions

View File

@@ -18,7 +18,7 @@ ENV PATH=/opt/python/bin:$PATH
# Create new Conda environment with cuDF and dask
RUN \
conda create -n cudf_test -c rapidsai -c nvidia -c numba -c conda-forge -c anaconda \
cudf=0.9 python=3.7 anaconda::cudatoolkit=$CUDA_VERSION dask dask-cuda
cudf=0.9 python=3.7 anaconda::cudatoolkit=$CUDA_VERSION dask dask-cuda cupy
# Install other Python packages
RUN \

View File

@@ -39,7 +39,7 @@ case "$suite" in
cudf)
source activate cudf_test
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_columnar.py
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_columnar.py tests/python-gpu/test_from_cupy.py
;;
cpu)

View File

@@ -8,7 +8,6 @@
#include "../../../src/common/bitfield.h"
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/data/simple_csr_source.h"
#include "../../../src/data/columnar.h"
namespace xgboost {
@@ -62,4 +61,24 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows,
column["typestr"] = String(typestr);
return column;
}
/*!
 * \brief Test helper: describe a device buffer as a 2-D array-interface JSON object.
 *
 * The storage behind \p p_data is overwritten through thrust::sequence before its
 * device address is recorded. The vector is not resized here, so the caller is
 * responsible for sizing it to match \p rows and \p cols.
 *
 * \param rows    First entry written to the "shape" field.
 * \param cols    Second entry written to the "shape" field.
 * \param typestr Value stored under "typestr" (the tests in this change pass "<f4").
 * \param p_data  Device vector whose storage the returned description points at.
 *
 * \return Json object carrying the "shape", "data", "version" and "typestr" fields.
 */
template <typename T>
Json Generate2dArrayInterface(int rows, int cols, std::string typestr,
                              thrust::device_vector<T>* p_data) {
  thrust::sequence(p_data->begin(), p_data->end());

  // "data" is a two element list: the raw device address stored as an integer,
  // followed by a boolean flag (false here; presumably the read-only marker of the
  // array interface -- confirm against the consumer).
  std::vector<Json> address_and_flag{
      Json(Integer(reinterpret_cast<Integer::Int>(p_data->data().get()))),
      Json(Boolean(false))};
  std::vector<Json> dims = {Json(static_cast<Integer::Int>(rows)),
                            Json(static_cast<Integer::Int>(cols))};

  Json interface_desc{Object()};
  interface_desc["shape"] = Array(dims);
  interface_desc["data"] = address_and_flag;
  interface_desc["version"] = Integer(static_cast<Integer::Int>(1));
  interface_desc["typestr"] = String(typestr);
  return interface_desc;
}
} // namespace xgboost

View File

@@ -7,7 +7,7 @@
#include "../helpers.h"
#include <thrust/device_vector.h>
#include "../../../src/data/device_adapter.cuh"
#include "test_columnar.h"
#include "test_array_interface.h"
using namespace xgboost; // NOLINT
void TestCudfAdapter()

View File

@@ -9,8 +9,7 @@
namespace xgboost {
template <typename T>
std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
constexpr size_t kRows = 16;
std::string PrepareData(std::string typestr, thrust::device_vector<T>* out, const size_t kRows=16) {
out->resize(kRows);
auto& d_data = *out;
@@ -66,7 +65,15 @@ TEST(MetaInfo, FromInterface) {
ASSERT_EQ(h_base_margin[i], d_data[i]);
}
EXPECT_ANY_THROW({info.SetInfo("group", str.c_str());});
thrust::device_vector<int> d_group_data;
std::string group_str = PrepareData<int>("<i4", &d_group_data, 4);
d_group_data[0] = 4;
d_group_data[1] = 3;
d_group_data[2] = 2;
d_group_data[3] = 1;
info.SetInfo("group", group_str.c_str());
std::vector<bst_group_t> expected_group_ptr = {0, 4, 7, 9, 10};
EXPECT_EQ(info.group_ptr_, expected_group_ptr);
}
TEST(MetaInfo, Group) {
@@ -83,4 +90,4 @@ TEST(MetaInfo, Group) {
ASSERT_EQ(h_group[i], d_data[i-1] + h_group[i-1]) << "i: " << i;
}
}
} // namespace xgboost
} // namespace xgboost

View File

@@ -6,7 +6,8 @@
#include <thrust/sequence.h>
#include "../../../src/data/device_adapter.cuh"
#include "../helpers.h"
#include "test_columnar.h"
#include "test_array_interface.h"
#include "../../../src/data/array_interface.h"
using namespace xgboost; // NOLINT
@@ -316,3 +317,55 @@ TEST(SimpleDMatrix, FromColumnarSparseBasic) {
}
}
}
// Dense case: a 50x10 float buffer filled by Generate2dArrayInterface is handed to
// CupyAdapter, and every entry must show up in the DMatrix at its row-major slot.
TEST(SimpleDMatrix, FromCupy) {
  int rows = 50;
  int cols = 10;
  thrust::device_vector<float> d_values(rows * cols);
  auto interface_json = Generate2dArrayInterface(rows, cols, "<f4", &d_values);

  // The adapter is constructed from the serialised JSON text.
  std::stringstream sstream;
  Json::Dump(interface_json, &sstream);
  std::string interface_str = sstream.str();

  data::CupyAdapter adapter(interface_str);
  // NOTE(review): -1 and 1 are presumably the missing-value marker and the thread
  // count -- confirm against the SimpleDMatrix constructor.
  data::SimpleDMatrix dmat(&adapter, -1, 1);

  auto const& info = dmat.Info();
  EXPECT_EQ(info.num_col_, cols);
  EXPECT_EQ(info.num_row_, rows);
  EXPECT_EQ(info.num_nonzero_, rows * cols);

  for (auto& page : dmat.GetBatches<SparsePage>()) {
    for (unsigned long long row = 0; row < page.Size(); ++row) {
      auto row_entries = page[row];
      for (unsigned long long k = 0; k < row_entries.size(); ++k) {
        auto const& entry = row_entries[k];
        // Value at (row, k) of the row-major buffer is row * cols + k.
        EXPECT_EQ(entry.fvalue, row * cols + k);
        EXPECT_EQ(entry.index, k);
      }
    }
  }
}
// Sparse case: two of the four entries of a 2x2 buffer are NaN and must be left
// out of the DMatrix, so each row is expected to keep exactly one value.
TEST(SimpleDMatrix, FromCupySparse) {
  int rows = 2;
  int cols = 2;
  thrust::device_vector<float> data(rows * cols);
  auto json_array_interface = Generate2dArrayInterface(rows, cols, "<f4", &data);
  // Written after the helper call because the helper overwrites the whole buffer.
  // The description only records the device address, so these stores are what the
  // adapter reads.
  data[1] = std::numeric_limits<float>::quiet_NaN();
  data[2] = std::numeric_limits<float>::quiet_NaN();

  std::stringstream ss;
  Json::Dump(json_array_interface, &ss);
  std::string str = ss.str();

  data::CupyAdapter adapter(str);
  // NOTE(review): -1 and 1 are presumably the missing-value marker and the thread
  // count -- confirm against the SimpleDMatrix constructor.
  data::SimpleDMatrix dmat(&adapter, -1, 1);
  EXPECT_EQ(dmat.Info().num_col_, cols);
  EXPECT_EQ(dmat.Info().num_row_, rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, rows * cols - 2);

  auto& batch = *dmat.GetBatches<SparsePage>().begin();
  // Fatal checks: the lines below index rows 0 and 1 and then element 0 of each
  // row, which would read out of bounds if the page or a row came back short.
  ASSERT_EQ(batch.Size(), rows);
  auto inst0 = batch[0];
  auto inst1 = batch[1];
  ASSERT_EQ(inst0.size(), 1);
  ASSERT_EQ(inst1.size(), 1);

  // Row 0 keeps buffer entry 0 (column 0); row 1 keeps buffer entry 3 (column 1).
  EXPECT_EQ(inst0[0].fvalue, 0.0f);
  EXPECT_EQ(inst0[0].index, 0);
  EXPECT_EQ(inst1[0].fvalue, 3.0f);
  EXPECT_EQ(inst1[0].index, 1);
}

View File

@@ -2,6 +2,7 @@ import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
import testing as tm
@@ -86,3 +87,64 @@ Arrow specification.'''
'x': cudf.Series([True, False, True, True, True])})
with pytest.raises(Exception):
dtrain = xgb.DMatrix(X_boolean, label=y_boolean)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_training(self):
    '''Train on cudf inputs and on the matching pandas/numpy inputs, then
    require the recorded train RMSE histories to be identical.'''
    from cudf import DataFrame as df
    import pandas as pd

    def train_rmse(params, dmat):
        # Run training with `dmat` as its own evaluation set and hand back
        # the RMSE history that xgboost records for it.
        history = {}
        xgb.train(params, dmat, evals=[(dmat, "train")],
                  evals_result=history)
        return history["train"]["rmse"]

    features = pd.DataFrame(np.random.randn(50, 10))
    labels = pd.DataFrame(np.random.randn(50))
    weights = np.random.random(50)
    base_margin = np.random.random(50)

    # Device-side run: every input goes through a cudf DataFrame.
    dtrain_cudf = xgb.DMatrix(
        df.from_pandas(features),
        df.from_pandas(labels),
        weight=df.from_pandas(pd.DataFrame(weights)),
        base_margin=df.from_pandas(pd.DataFrame(base_margin)))
    rmse_cudf = train_rmse({'gpu_id': 0}, dtrain_cudf)

    # Host-side run on the very same values.
    dtrain_np = xgb.DMatrix(features, labels, weight=weights,
                            base_margin=base_margin)
    rmse_np = train_rmse({}, dtrain_np)

    assert np.array_equal(rmse_cudf, rmse_np)
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_metainfo(self):
    '''Meta info set through cudf DataFrames or Series must read back equal
    to the same meta info set from numpy arrays.'''
    from cudf import DataFrame as df
    import pandas as pd

    n = 100
    X = np.random.random((n, 2))
    via_cudf = xgb.DMatrix(X)
    reference = xgb.DMatrix(X)
    floats = np.random.random(n)
    uints = np.array([4, 2, 8]).astype("uint32")
    cudf_floats = df.from_pandas(pd.DataFrame(floats))
    cudf_uints = df.from_pandas(pd.DataFrame(uints))

    float_fields = ('weight', 'label', 'base_margin')

    def fill_from_interface(float_values, group_values):
        # Push the float fields and then the group sizes into `via_cudf`.
        for field in float_fields:
            via_cudf.set_interface_info(field, float_values)
        via_cudf.set_interface_info('group', group_values)

    def assert_meta_matches():
        # Compare every float field plus the derived group pointer.
        for field in float_fields:
            assert np.array_equal(reference.get_float_info(field),
                                  via_cudf.get_float_info(field))
        assert np.array_equal(reference.get_uint_info('group_ptr'),
                              via_cudf.get_uint_info('group_ptr'))

    # Reference values come straight from the numpy arrays.
    for field in float_fields:
        reference.set_float_info(field, floats)
    reference.set_uint_info('group', uints)

    # Test setting info with cudf DataFrame
    fill_from_interface(cudf_floats, cudf_uints)
    assert_meta_matches()

    # Test setting info with cudf Series
    fill_from_interface(cudf_floats[cudf_floats.columns[0]],
                        cudf_uints[cudf_uints.columns[0]])
    assert_meta_matches()

View File

@@ -0,0 +1,97 @@
import numpy as np
import xgboost as xgb
import sys
import pytest
sys.path.append("tests/python")
import testing as tm
def dmatrix_from_cupy(input_type, missing=np.nan):
    '''Build a DMatrix from cupy inputs of the given dtype and check its shape.

    An 80 x 3 feature matrix is drawn on the host, cast to ``input_type`` and
    copied to the device. Two of its entries are then overwritten with
    ``missing``, and that same value is passed to ``xgb.DMatrix`` as its
    ``missing`` argument. The label vector is generated with cupy directly.

    Parameters
    ----------
    input_type :
        dtype used for both the feature matrix and the labels.
    missing :
        Value written into the two overwritten cells and forwarded to
        ``xgb.DMatrix``. Defaults to ``np.nan``.

    Returns
    -------
    The constructed ``xgb.DMatrix``, after asserting that it reports 80 rows
    and 3 columns.
    '''
    import cupy as cp
    kRows = 80
    kCols = 3
    np_X = np.random.randn(kRows, kCols).astype(dtype=input_type)
    X = cp.array(np_X)
    # Plant two cells holding the marker value.
    X[5, 0] = missing
    X[3, 1] = missing
    y = cp.random.randn(kRows).astype(dtype=input_type)
    dtrain = xgb.DMatrix(X, missing=missing, label=y)
    assert dtrain.num_col() == kCols
    assert dtrain.num_row() == kRows
    return dtrain
class TestFromArrayInterface:
    '''Tests for constructing DMatrix, and for setting its meta info, from
    cupy arrays.'''

    @pytest.mark.skipif(**tm.no_cupy())
    def test_from_cupy(self):
        '''Test constructing DMatrix from cupy'''
        import cupy as cp
        # Each call pairs an input dtype with the value used as the
        # missing marker for that dtype.
        dmatrix_from_cupy(np.float32, np.nan)
        dmatrix_from_cupy(np.float64, np.nan)
        dmatrix_from_cupy(np.uint8, 2)
        dmatrix_from_cupy(np.uint32, 3)
        dmatrix_from_cupy(np.uint64, 4)
        dmatrix_from_cupy(np.int8, 2)
        dmatrix_from_cupy(np.int32, -2)
        dmatrix_from_cupy(np.int64, -3)

        # The input is created outside the `raises` block so that only the
        # DMatrix construction (a 2 x 2 array passed as the label) can
        # satisfy the expectation; a failure while creating X must not.
        X = cp.random.randn(2, 2, dtype="float32")
        with pytest.raises(Exception):
            xgb.DMatrix(X, label=X)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training(self):
        '''Train on cupy inputs and on their host copies, then require the
        recorded train RMSE histories to be identical.'''
        import cupy as cp
        X = cp.random.randn(50, 10, dtype="float32")
        y = cp.random.randn(50, dtype="float32")
        weights = np.random.random(50)
        cupy_weights = cp.array(weights)
        base_margin = np.random.random(50)
        cupy_base_margin = cp.array(base_margin)

        # Run with every input living in a cupy array.
        evals_result_cupy = {}
        dtrain_cp = xgb.DMatrix(X, y, weight=cupy_weights,
                                base_margin=cupy_base_margin)
        xgb.train({'gpu_id': 0}, dtrain_cp, evals=[(dtrain_cp, "train")],
                  evals_result=evals_result_cupy)

        # Run on numpy copies of the very same values.
        evals_result_np = {}
        dtrain_np = xgb.DMatrix(cp.asnumpy(X), cp.asnumpy(y), weight=weights,
                                base_margin=base_margin)
        xgb.train({'gpu_id': 0}, dtrain_np, evals=[(dtrain_np, "train")],
                  evals_result=evals_result_np)

        assert np.array_equal(evals_result_cupy["train"]["rmse"],
                              evals_result_np["train"]["rmse"])

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo(self):
        '''Meta info set through cupy arrays must read back equal to the same
        meta info set from numpy arrays.'''
        import cupy as cp
        n = 100
        X = np.random.random((n, 2))
        # Both matrices are built from the same host array; only the meta
        # info of `dmat_cupy` goes through cupy.
        dmat_cupy = xgb.DMatrix(X)
        dmat = xgb.DMatrix(X)
        floats = np.random.random(n)
        uints = np.array([4, 2, 8]).astype("uint32")
        cupy_floats = cp.array(floats)
        cupy_uints = cp.array(uints)

        dmat.set_float_info('weight', floats)
        dmat.set_float_info('label', floats)
        dmat.set_float_info('base_margin', floats)
        dmat.set_uint_info('group', uints)
        dmat_cupy.set_interface_info('weight', cupy_floats)
        dmat_cupy.set_interface_info('label', cupy_floats)
        dmat_cupy.set_interface_info('base_margin', cupy_floats)
        dmat_cupy.set_interface_info('group', cupy_uints)

        # Test setting info with cupy
        assert np.array_equal(dmat.get_float_info('weight'),
                              dmat_cupy.get_float_info('weight'))
        assert np.array_equal(dmat.get_float_info('label'),
                              dmat_cupy.get_float_info('label'))
        assert np.array_equal(dmat.get_float_info('base_margin'),
                              dmat_cupy.get_float_info('base_margin'))
        assert np.array_equal(dmat.get_uint_info('group_ptr'),
                              dmat_cupy.get_uint_info('group_ptr'))

View File

@@ -48,6 +48,15 @@ def no_cudf():
'reason': 'CUDF is not installed'}
def no_cupy():
    '''Return the ``condition``/``reason`` keyword arguments used with
    ``pytest.mark.skipif`` to skip a test when cupy cannot be imported.'''
    try:
        import cupy as _  # noqa
    except ImportError:
        cupy_missing = True
    else:
        cupy_missing = False
    return {'condition': cupy_missing, 'reason': 'cupy is not installed.'}
def no_dask_cudf():
reason = 'dask_cudf is not installed.'
try:

View File

@@ -16,10 +16,9 @@ if [ ${TASK} == "python_test" ]; then
echo "-------------------------------"
conda activate python3
python --version
conda install numpy scipy pandas matplotlib scikit-learn
conda install numpy scipy pandas matplotlib scikit-learn dask
python -m pip install graphviz pytest pytest-cov codecov
python -m pip install dask distributed dask[dataframe]
python -m pip install datatable
python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1
codecov