Support dmatrix construction from cupy array (#5206)
This commit is contained in:
@@ -18,7 +18,7 @@ ENV PATH=/opt/python/bin:$PATH
|
||||
# Create new Conda environment with cuDF and dask
|
||||
RUN \
|
||||
conda create -n cudf_test -c rapidsai -c nvidia -c numba -c conda-forge -c anaconda \
|
||||
cudf=0.9 python=3.7 anaconda::cudatoolkit=$CUDA_VERSION dask dask-cuda
|
||||
cudf=0.9 python=3.7 anaconda::cudatoolkit=$CUDA_VERSION dask dask-cuda cupy
|
||||
|
||||
# Install other Python packages
|
||||
RUN \
|
||||
|
||||
@@ -39,7 +39,7 @@ case "$suite" in
|
||||
|
||||
cudf)
|
||||
source activate cudf_test
|
||||
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_columnar.py
|
||||
pytest -v -s --fulltrace -m "not mgpu" tests/python-gpu/test_from_columnar.py tests/python-gpu/test_from_cupy.py
|
||||
;;
|
||||
|
||||
cpu)
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "../../../src/common/bitfield.h"
|
||||
#include "../../../src/common/device_helpers.cuh"
|
||||
#include "../../../src/data/simple_csr_source.h"
|
||||
#include "../../../src/data/columnar.h"
|
||||
|
||||
namespace xgboost {
|
||||
|
||||
@@ -62,4 +61,24 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows,
|
||||
column["typestr"] = String(typestr);
|
||||
return column;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Json Generate2dArrayInterface(int rows, int cols, std::string typestr,
|
||||
thrust::device_vector<T>* p_data) {
|
||||
auto& data = *p_data;
|
||||
thrust::sequence(data.begin(), data.end());
|
||||
|
||||
Json array_interface{Object()};
|
||||
std::vector<Json> shape = {Json(static_cast<Integer::Int>(rows)),
|
||||
Json(static_cast<Integer::Int>(cols))};
|
||||
array_interface["shape"] = Array(shape);
|
||||
std::vector<Json> j_data{
|
||||
Json(Integer(reinterpret_cast<Integer::Int>(data.data().get()))),
|
||||
Json(Boolean(false))};
|
||||
array_interface["data"] = j_data;
|
||||
array_interface["version"] = Integer(static_cast<Integer::Int>(1));
|
||||
array_interface["typestr"] = String(typestr);
|
||||
return array_interface;
|
||||
}
|
||||
|
||||
} // namespace xgboost
|
||||
@@ -7,7 +7,7 @@
|
||||
#include "../helpers.h"
|
||||
#include <thrust/device_vector.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
#include "test_columnar.h"
|
||||
#include "test_array_interface.h"
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
void TestCudfAdapter()
|
||||
|
||||
@@ -9,8 +9,7 @@
|
||||
namespace xgboost {
|
||||
|
||||
template <typename T>
|
||||
std::string PrepareData(std::string typestr, thrust::device_vector<T>* out) {
|
||||
constexpr size_t kRows = 16;
|
||||
std::string PrepareData(std::string typestr, thrust::device_vector<T>* out, const size_t kRows=16) {
|
||||
out->resize(kRows);
|
||||
auto& d_data = *out;
|
||||
|
||||
@@ -66,7 +65,15 @@ TEST(MetaInfo, FromInterface) {
|
||||
ASSERT_EQ(h_base_margin[i], d_data[i]);
|
||||
}
|
||||
|
||||
EXPECT_ANY_THROW({info.SetInfo("group", str.c_str());});
|
||||
thrust::device_vector<int> d_group_data;
|
||||
std::string group_str = PrepareData<int>("<i4", &d_group_data, 4);
|
||||
d_group_data[0] = 4;
|
||||
d_group_data[1] = 3;
|
||||
d_group_data[2] = 2;
|
||||
d_group_data[3] = 1;
|
||||
info.SetInfo("group", group_str.c_str());
|
||||
std::vector<bst_group_t> expected_group_ptr = {0, 4, 7, 9, 10};
|
||||
EXPECT_EQ(info.group_ptr_, expected_group_ptr);
|
||||
}
|
||||
|
||||
TEST(MetaInfo, Group) {
|
||||
@@ -83,4 +90,4 @@ TEST(MetaInfo, Group) {
|
||||
ASSERT_EQ(h_group[i], d_data[i-1] + h_group[i-1]) << "i: " << i;
|
||||
}
|
||||
}
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -6,7 +6,8 @@
|
||||
#include <thrust/sequence.h>
|
||||
#include "../../../src/data/device_adapter.cuh"
|
||||
#include "../helpers.h"
|
||||
#include "test_columnar.h"
|
||||
#include "test_array_interface.h"
|
||||
#include "../../../src/data/array_interface.h"
|
||||
|
||||
using namespace xgboost; // NOLINT
|
||||
|
||||
@@ -316,3 +317,55 @@ TEST(SimpleDMatrix, FromColumnarSparseBasic) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Feeds a dense 50x10 float device buffer through data::CupyAdapter into a
// SimpleDMatrix, then checks the reported shape, the non-zero count and every
// stored entry.
TEST(SimpleDMatrix, FromCupy){
  int rows = 50;
  int cols = 10;
  thrust::device_vector<float> d_values(rows * cols);

  // Serialise the array interface into the string form the adapter is built from.
  auto interface_json = Generate2dArrayInterface(rows, cols, "<f4", &d_values);
  std::stringstream interface_stream;
  Json::Dump(interface_json, &interface_stream);
  std::string interface_str = interface_stream.str();

  data::CupyAdapter adapter(interface_str);
  data::SimpleDMatrix dmat(&adapter, -1, 1);

  EXPECT_EQ(dmat.Info().num_col_, cols);
  EXPECT_EQ(dmat.Info().num_row_, rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, rows*cols);

  // Entry (row, col) is expected to hold the value row * cols + col at feature
  // index col.
  for (auto& page : dmat.GetBatches<SparsePage>()) {
    for (auto row = 0ull; row < page.Size(); row++) {
      auto entries = page[row];
      for (auto col = 0ull; col < entries.size(); col++) {
        EXPECT_EQ(entries[col].fvalue, row * cols + col);
        EXPECT_EQ(entries[col].index, col);
      }
    }
  }
}
|
||||
|
||||
// Builds a 2x2 float device buffer, replaces the elements at flat positions 1
// and 2 with NaN, and checks that the resulting SimpleDMatrix keeps only the
// two remaining entries: value 0 at feature 0 in row 0, and value 3 at
// feature 1 in row 1.
TEST(SimpleDMatrix, FromCupySparse){
  int rows = 2;
  int cols = 2;
  thrust::device_vector< float> data(rows*cols);
  auto json_array_interface = Generate2dArrayInterface(rows, cols, "<f4", &data);
  // Overwrite after the helper has filled the buffer, so the NaNs survive.
  data[1] = std::numeric_limits<float>::quiet_NaN();
  data[2] = std::numeric_limits<float>::quiet_NaN();
  std::stringstream ss;
  Json::Dump(json_array_interface, &ss);
  std::string str = ss.str();
  data::CupyAdapter adapter(str);
  data::SimpleDMatrix dmat(&adapter, -1, 1);
  EXPECT_EQ(dmat.Info().num_col_, cols);
  EXPECT_EQ(dmat.Info().num_row_, rows);
  EXPECT_EQ(dmat.Info().num_nonzero_, rows * cols - 2);

  auto& batch = *dmat.GetBatches<SparsePage>().begin();
  // Fatal checks: the indexing below would read out of bounds if the page had
  // fewer rows, or a row had no entries, so stop the test instead of continuing.
  ASSERT_EQ(batch.Size(), rows);
  auto inst0 = batch[0];
  auto inst1 = batch[1];
  ASSERT_EQ(inst0.size(), 1);
  ASSERT_EQ(inst1.size(), 1);
  EXPECT_EQ(inst0[0].fvalue, 0.0f);
  EXPECT_EQ(inst0[0].index, 0);
  EXPECT_EQ(inst1[0].fvalue, 3.0f);
  EXPECT_EQ(inst1[0].index, 1);
}
|
||||
|
||||
@@ -2,6 +2,7 @@ import numpy as np
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
|
||||
@@ -86,3 +87,64 @@ Arrow specification.'''
|
||||
'x': cudf.Series([True, False, True, True, True])})
|
||||
with pytest.raises(Exception):
|
||||
dtrain = xgb.DMatrix(X_boolean, label=y_boolean)
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_training(self):
    '''Train once from cuDF inputs and once from the equivalent pandas/numpy
    inputs, then require identical "train" rmse histories.'''
    from cudf import DataFrame as df
    import pandas as pd

    # Host-side data; the random draws happen in this fixed order.
    features = pd.DataFrame(np.random.randn(50, 10))
    labels = pd.DataFrame(np.random.randn(50))
    weights = np.random.random(50)
    gpu_weights = df.from_pandas(pd.DataFrame(weights))
    base_margin = np.random.random(50)
    gpu_base_margin = df.from_pandas(pd.DataFrame(base_margin))

    # cuDF-backed run.
    history_cudf = {}
    dtrain_cudf = xgb.DMatrix(
        df.from_pandas(features),
        df.from_pandas(labels),
        weight=gpu_weights,
        base_margin=gpu_base_margin)
    xgb.train(
        {'gpu_id': 0},
        dtrain_cudf,
        evals=[(dtrain_cudf, "train")],
        evals_result=history_cudf)

    # Host-backed reference run.
    history_np = {}
    dtrain_np = xgb.DMatrix(
        features, labels, weight=weights, base_margin=base_margin)
    xgb.train(
        {},
        dtrain_np,
        evals=[(dtrain_np, "train")],
        evals_result=history_np)

    assert np.array_equal(history_cudf["train"]["rmse"], history_np["train"]["rmse"])
|
||||
|
||||
@pytest.mark.skipif(**tm.no_cudf())
def test_cudf_metainfo(self):
    '''Meta info set through set_interface_info from cuDF objects must match
    the same info set from numpy arrays, for both DataFrame and Series.'''
    from cudf import DataFrame as df
    import pandas as pd

    n = 100
    X = np.random.random((n, 2))
    dmat_cudf = xgb.DMatrix(X)
    dmat = xgb.DMatrix(X)
    floats = np.random.random(n)
    uints = np.array([4, 2, 8]).astype("uint32")
    cudf_floats = df.from_pandas(pd.DataFrame(floats))
    cudf_uints = df.from_pandas(pd.DataFrame(uints))

    float_fields = ('weight', 'label', 'base_margin')

    def assert_same_info():
        # Compare every float field, then the group pointer derived from 'group'.
        for field in float_fields:
            assert np.array_equal(dmat.get_float_info(field),
                                  dmat_cudf.get_float_info(field))
        assert np.array_equal(dmat.get_uint_info('group_ptr'),
                              dmat_cudf.get_uint_info('group_ptr'))

    # Reference values set from host arrays.
    for field in float_fields:
        dmat.set_float_info(field, floats)
    dmat.set_uint_info('group', uints)

    # Test setting info with cudf DataFrame
    for field in float_fields:
        dmat_cudf.set_interface_info(field, cudf_floats)
    dmat_cudf.set_interface_info('group', cudf_uints)
    assert_same_info()

    # Test setting info with cudf Series
    float_series = cudf_floats[cudf_floats.columns[0]]
    for field in float_fields:
        dmat_cudf.set_interface_info(field, float_series)
    dmat_cudf.set_interface_info('group', cudf_uints[cudf_uints.columns[0]])
    assert_same_info()
|
||||
|
||||
97
tests/python-gpu/test_from_cupy.py
Normal file
97
tests/python-gpu/test_from_cupy.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import sys
|
||||
import pytest
|
||||
|
||||
sys.path.append("tests/python")
|
||||
import testing as tm
|
||||
|
||||
|
||||
def dmatrix_from_cupy(input_type, missing=np.nan):
    '''Build a DMatrix from a cupy array of the given dtype and check its shape.

    Parameters
    ----------
    input_type :
        dtype the random feature matrix and the label vector are cast to.
    missing :
        Value written into two cells of the feature matrix and passed to
        ``xgb.DMatrix`` as ``missing``.  Defaults to NaN.  (``np.nan`` is
        used because the ``np.NAN`` alias was removed in NumPy 2.0.)

    Returns
    -------
    The constructed ``xgb.DMatrix``.
    '''
    import cupy as cp

    kRows = 80
    kCols = 3

    # Draw on the host, cast, then move to the device.
    np_X = np.random.randn(kRows, kCols).astype(dtype=input_type)
    X = cp.array(np_X)
    # Plant the missing marker in two cells.
    X[5, 0] = missing
    X[3, 1] = missing
    y = cp.random.randn(kRows).astype(dtype=input_type)
    dtrain = xgb.DMatrix(X, missing=missing, label=y)
    assert dtrain.num_col() == kCols
    assert dtrain.num_row() == kRows
    return dtrain
|
||||
|
||||
|
||||
class TestFromArrayInterface:
    '''Tests for constructing DMatrix, and setting its meta info, from cupy
    arrays.'''

    @pytest.mark.skipif(**tm.no_cupy())
    def test_from_cupy(self):
        '''Test constructing DMatrix from cupy'''
        import cupy as cp
        # Floating point inputs use NaN as the missing marker.  ``np.nan`` is
        # used because the ``np.NAN`` alias was removed in NumPy 2.0.
        dmatrix_from_cupy(np.float32, np.nan)
        dmatrix_from_cupy(np.float64, np.nan)

        # Integer inputs cannot hold NaN, so each gets an integer marker.
        dmatrix_from_cupy(np.uint8, 2)
        dmatrix_from_cupy(np.uint32, 3)
        dmatrix_from_cupy(np.uint64, 4)

        dmatrix_from_cupy(np.int8, 2)
        dmatrix_from_cupy(np.int32, -2)
        dmatrix_from_cupy(np.int64, -3)

        # Build the input outside the ``raises`` block so that only the
        # DMatrix construction can satisfy the expectation.  The call is
        # expected to fail, presumably because the label is 2-D.
        X = cp.random.randn(2, 2, dtype="float32")
        with pytest.raises(Exception):
            xgb.DMatrix(X, label=X)

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_training(self):
        '''Training from cupy inputs must give the same "train" rmse history
        as training from the equivalent numpy inputs.'''
        import cupy as cp
        X = cp.random.randn(50, 10, dtype="float32")
        y = cp.random.randn(50, dtype="float32")
        weights = np.random.random(50)
        cupy_weights = cp.array(weights)
        base_margin = np.random.random(50)
        cupy_base_margin = cp.array(base_margin)

        evals_result_cupy = {}
        dtrain_cp = xgb.DMatrix(X, y, weight=cupy_weights, base_margin=cupy_base_margin)
        xgb.train({'gpu_id': 0}, dtrain_cp, evals=[(dtrain_cp, "train")],
                  evals_result=evals_result_cupy)
        evals_result_np = {}
        dtrain_np = xgb.DMatrix(cp.asnumpy(X), cp.asnumpy(y), weight=weights,
                                base_margin=base_margin)
        xgb.train({'gpu_id': 0}, dtrain_np, evals=[(dtrain_np, "train")],
                  evals_result=evals_result_np)
        assert np.array_equal(evals_result_cupy["train"]["rmse"], evals_result_np["train"]["rmse"])

    @pytest.mark.skipif(**tm.no_cupy())
    def test_cupy_metainfo(self):
        '''Meta info set through set_interface_info from cupy arrays must
        match the same info set from numpy arrays.'''
        import cupy as cp
        n = 100
        X = np.random.random((n, 2))
        dmat_cupy = xgb.DMatrix(X)
        dmat = xgb.DMatrix(X)
        floats = np.random.random(n)
        uints = np.array([4, 2, 8]).astype("uint32")
        cupy_floats = cp.array(floats)
        cupy_uints = cp.array(uints)
        dmat.set_float_info('weight', floats)
        dmat.set_float_info('label', floats)
        dmat.set_float_info('base_margin', floats)
        dmat.set_uint_info('group', uints)
        dmat_cupy.set_interface_info('weight', cupy_floats)
        dmat_cupy.set_interface_info('label', cupy_floats)
        dmat_cupy.set_interface_info('base_margin', cupy_floats)
        dmat_cupy.set_interface_info('group', cupy_uints)

        # Test setting info with cupy
        assert np.array_equal(dmat.get_float_info('weight'), dmat_cupy.get_float_info('weight'))
        assert np.array_equal(dmat.get_float_info('label'), dmat_cupy.get_float_info('label'))
        assert np.array_equal(dmat.get_float_info('base_margin'),
                              dmat_cupy.get_float_info('base_margin'))
        assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr'))
|
||||
@@ -48,6 +48,15 @@ def no_cudf():
|
||||
'reason': 'CUDF is not installed'}
|
||||
|
||||
|
||||
def no_cupy():
    '''Return keyword arguments for ``pytest.mark.skipif`` that skip a test
    when cupy cannot be imported.

    The result is a dict with a boolean ``condition`` (True when the import
    fails with ImportError) and a fixed ``reason`` string.
    '''
    try:
        import cupy  # noqa
    except ImportError:
        unavailable = True
    else:
        unavailable = False
    return {'condition': unavailable, 'reason': 'cupy is not installed.'}
|
||||
|
||||
|
||||
def no_dask_cudf():
|
||||
reason = 'dask_cudf is not installed.'
|
||||
try:
|
||||
|
||||
@@ -16,10 +16,9 @@ if [ ${TASK} == "python_test" ]; then
|
||||
echo "-------------------------------"
|
||||
conda activate python3
|
||||
python --version
|
||||
conda install numpy scipy pandas matplotlib scikit-learn
|
||||
conda install numpy scipy pandas matplotlib scikit-learn dask
|
||||
|
||||
python -m pip install graphviz pytest pytest-cov codecov
|
||||
python -m pip install dask distributed dask[dataframe]
|
||||
python -m pip install datatable
|
||||
python -m pytest -v --fulltrace -s tests/python --cov=python-package/xgboost || exit -1
|
||||
codecov
|
||||
|
||||
Reference in New Issue
Block a user