Complete cudf support. (#4850)

* Handle missing values.
* Accept all floating point and integer types.
* Move to cudf 9.0 API.
* Remove requirement on `null_count`.
* Support arbitrary column types.
This commit is contained in:
Jiaming Yuan
2019-09-16 23:52:00 -04:00
committed by GitHub
parent 125bcec62e
commit 5374f52531
17 changed files with 702 additions and 339 deletions

View File

@@ -6,6 +6,35 @@ sys.path.append("tests/python")
import testing as tm
def dmatrix_from_cudf(input_type, missing=np.nan):
    '''Build a DMatrix from cudf frames and check its reported shape.

    Parameters
    ----------
    input_type :
        NumPy dtype passed to ``astype`` for the first two feature columns
        and for the label.
    missing :
        Value written into two feature cells and forwarded to
        ``xgb.DMatrix`` as its ``missing`` argument.  Defaults to NaN.

    Raises
    ------
    AssertionError
        If the DMatrix does not report ``kRows`` rows and ``kCols`` columns.
    '''
    # Imported inside the function so that merely importing this module does
    # not require cudf or pandas.
    import cudf
    import pandas as pd

    kRows = 80
    kCols = 3

    na = np.random.randn(kRows, kCols)
    # The cast result is assigned back into the randn array, so only the
    # stored values change; the array keeps its own dtype.
    na[:, 0:2] = na[:, 0:2].astype(input_type)
    # Plant one missing marker in each of the first two columns.
    na[5, 0] = missing
    na[3, 1] = missing

    # Column names are strings; the third column is explicitly int32 so the
    # frame mixes column types.
    pa = pd.DataFrame({'0': na[:, 0],
                       '1': na[:, 1],
                       '2': na[:, 2].astype(np.int32)})

    np_label = np.random.randn(kRows).astype(input_type)
    pa_label = pd.DataFrame(np_label)

    cd: cudf.DataFrame = cudf.from_pandas(pa)
    cd_label: cudf.DataFrame = cudf.from_pandas(pa_label)

    dtrain = xgb.DMatrix(cd, missing=missing, label=cd_label)
    assert dtrain.num_col() == kCols
    assert dtrain.num_row() == kRows
class TestFromColumnar:
'''Tests for constructing DMatrix from data structure conforming Apache
Arrow specification.'''
@@ -13,30 +42,13 @@ Arrow specification.'''
@pytest.mark.skipif(**tm.no_cudf())
def test_from_cudf(self):
    '''Test constructing DMatrix from cudf'''
    # Floating point inputs: NaN is the missing marker.
    dmatrix_from_cudf(np.float32, np.nan)
    dmatrix_from_cudf(np.float64, np.nan)

    # Unsigned integer inputs, each with a distinct integer missing marker.
    dmatrix_from_cudf(np.uint8, 2)
    dmatrix_from_cudf(np.uint32, 3)
    dmatrix_from_cudf(np.uint64, 4)

    # Signed integer inputs, including negative missing markers.
    dmatrix_from_cudf(np.int8, 2)
    dmatrix_from_cudf(np.int32, -2)
    dmatrix_from_cudf(np.int64, -3)