Support all pandas nullable integer types. (#8480)

- Enumerate all pandas integer types.
- Add tests for `None`, `nan`, and `pd.NA`.
This commit is contained in:
Jiaming Yuan
2022-11-28 22:38:16 +08:00
committed by GitHub
parent f2209c1fe4
commit d666ba775e
7 changed files with 145 additions and 113 deletions

View File

@@ -461,8 +461,4 @@ class TestDMatrix:
for orig, x in np_dtypes(n_samples, n_features):
m0 = xgb.DMatrix(orig)
m1 = xgb.DMatrix(x)
csr0 = m0.get_data()
csr1 = m1.get_data()
np.testing.assert_allclose(csr0.data, csr1.data)
np.testing.assert_allclose(csr0.indptr, csr1.indptr)
np.testing.assert_allclose(csr0.indices, csr1.indices)
assert tm.predictor_equal(m0, m1)

View File

@@ -10,6 +10,7 @@ from xgboost.testing import (
make_batches_sparse,
make_categorical,
make_sparse_regression,
predictor_equal,
)
from xgboost.testing.data import np_dtypes
@@ -246,11 +247,7 @@ class TestQuantileDMatrix:
for orig, x in np_dtypes(n_samples, n_features):
m0 = xgb.QuantileDMatrix(orig)
m1 = xgb.QuantileDMatrix(x)
csr0 = m0.get_data()
csr1 = m1.get_data()
np.testing.assert_allclose(csr0.data, csr1.data)
np.testing.assert_allclose(csr0.indptr, csr1.indptr)
np.testing.assert_allclose(csr0.indices, csr1.indices)
assert predictor_equal(m0, m1)
# unsupported types
for dtype in [

View File

@@ -4,6 +4,7 @@ import tempfile
import numpy as np
import pytest
from test_dmatrix import set_base_margin_info
from xgboost.testing.data import pd_dtypes
import xgboost as xgb
from xgboost import testing as tm
@@ -297,70 +298,22 @@ class TestPandas:
assert 'auc' not in cv.columns[0]
assert 'error' in cv.columns[0]
def test_nullable_type(self):
y = np.random.default_rng(0).random(4)
def test_nullable_type(self) -> None:
from pandas.api.types import is_categorical
def to_bytes(Xy: xgb.DMatrix) -> bytes:
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "Xy.dmatrix")
Xy.save_binary(path)
with open(path, "rb") as fd:
result = fd.read()
return result
for DMatrixT in (xgb.DMatrix, xgb.QuantileDMatrix):
for orig, df in pd_dtypes():
# Call the predicate on every column dtype: the bare function object is always
# truthy, which would switch categorical support on for every frame.
enable_categorical = any(is_categorical(dtype) for dtype in df.dtypes)
def test_int(dtype) -> bytes:
arr = pd.DataFrame(
{"f0": [1, 2, None, 3], "f1": [4, 3, None, 1]}, dtype=dtype
)
Xy = xgb.DMatrix(arr, y)
Xy.feature_types = None
return to_bytes(Xy)
m_orig = DMatrixT(orig, enable_categorical=enable_categorical)
# extension types
m_etype = DMatrixT(df, enable_categorical=enable_categorical)
# different from pd.BooleanDtype(), None is converted to False with bool
if any(dtype == "bool" for dtype in orig.dtypes):
assert not tm.predictor_equal(m_orig, m_etype)
else:
assert tm.predictor_equal(m_orig, m_etype)
b0 = test_int(np.float32)
b1 = test_int(pd.Int16Dtype())
assert b0 == b1
def test_bool(dtype) -> bytes:
arr = pd.DataFrame(
{"f0": [True, False, None, True], "f1": [False, True, None, True]},
dtype=dtype,
)
Xy = xgb.DMatrix(arr, y)
Xy.feature_types = None
return to_bytes(Xy)
b0 = test_bool(pd.BooleanDtype())
b1 = test_bool(bool)
assert b0 != b1 # None is converted to False with np.bool
data = {"f0": [1.0, 2.0, None, 3.0], "f1": [3.0, 2.0, None, 1.0]}
arr = np.array([data["f0"], data["f1"]]).T
Xy = xgb.DMatrix(arr, y)
Xy.feature_types = None
Xy.feature_names = None
from_np = to_bytes(Xy)
def test_float(dtype) -> bytes:
arr = pd.DataFrame(data, dtype=dtype)
Xy = xgb.DMatrix(arr, y)
Xy.feature_types = None
Xy.feature_names = None
return to_bytes(Xy)
b0 = test_float(pd.Float64Dtype())
b1 = test_float(float)
assert b0 == b1 # both are converted to NaN
assert b0 == from_np
def test_cat(dtype) -> bytes:
arr = pd.DataFrame(data, dtype=dtype)
if dtype is None:
arr = arr.astype("category")
Xy = xgb.DMatrix(arr, y, enable_categorical=True)
Xy.feature_types = None
return to_bytes(Xy)
b0 = test_cat(pd.CategoricalDtype())
b1 = test_cat(None)
assert b0 == b1
f0 = df["f0"]
with pytest.raises(ValueError, match="Label contains NaN"):
xgb.DMatrix(df, f0, enable_categorical=enable_categorical)