[Breaking] Accept multi-dim meta info. (#7405)

This PR changes base_margin into a 3-dim array, with one of the dimensions reserved for multi-target classification. It also makes a breaking change to binary serialization because of the extra dimension, together with a fix for saving the feature weights. Lastly, it unifies the prediction initialization between CPU and GPU. After this PR, the meta info setter in Python is based on the array interface.
This commit is contained in:
Jiaming Yuan
2021-11-18 23:02:54 +08:00
committed by GitHub
parent 9fb4338964
commit d33854af1b
25 changed files with 545 additions and 256 deletions

View File

@@ -17,7 +17,7 @@ rng = np.random.RandomState(1994)
def set_base_margin_info(DType, DMatrixT, tm: str):
rng = np.random.default_rng()
X = DType(rng.normal(0, 1.0, size=100).reshape(50, 2))
X = DType(rng.normal(0, 1.0, size=100).astype(np.float32).reshape(50, 2))
if hasattr(X, "iloc"):
y = X.iloc[:, 0]
else:
@@ -29,8 +29,35 @@ def set_base_margin_info(DType, DMatrixT, tm: str):
with pytest.raises(ValueError, match=r".*base_margin.*"):
xgb.train({"tree_method": tm}, Xy)
# FIXME(jiamingy): Currently the metainfo has no concept of shape. If you pass a
# base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined.
if not hasattr(X, "iloc"):
# column major matrix
got = DType(Xy.get_base_margin().reshape(50, 2))
assert (got == base_margin).all()
assert base_margin.T.flags.c_contiguous is False
assert base_margin.T.flags.f_contiguous is True
Xy.set_info(base_margin=base_margin.T)
got = DType(Xy.get_base_margin().reshape(2, 50))
assert (got == base_margin.T).all()
# Row vs col vec.
base_margin = y
Xy.set_base_margin(base_margin)
bm_col = Xy.get_base_margin()
Xy.set_base_margin(base_margin.reshape(1, base_margin.size))
bm_row = Xy.get_base_margin()
assert (bm_row == bm_col).all()
# type
base_margin = base_margin.astype(np.float64)
Xy.set_base_margin(base_margin)
bm_f64 = Xy.get_base_margin()
assert (bm_f64 == bm_col).all()
# too many dimensions
base_margin = X.reshape(2, 5, 2, 5)
with pytest.raises(ValueError, match=r".*base_margin.*"):
Xy.set_base_margin(base_margin)
class TestDMatrix:
@@ -141,6 +168,7 @@ class TestDMatrix:
# base margin is per-class in multi-class classifier
base_margin = rng.randn(100, 3).astype(np.float32)
d.set_base_margin(base_margin)
np.testing.assert_allclose(d.get_base_margin().reshape(100, 3), base_margin)
ridxs = [1, 2, 3, 4, 5, 6]
sliced = d.slice(ridxs)
@@ -154,7 +182,7 @@ class TestDMatrix:
# Slicing a DMatrix results into a DMatrix that's equivalent to a DMatrix that's
# constructed from the corresponding NumPy slice
d2 = xgb.DMatrix(X[1:7, :], y[1:7])
d2.set_base_margin(base_margin[1:7, :].flatten())
d2.set_base_margin(base_margin[1:7, :])
eval_res = {}
_ = xgb.train(
{'num_class': 3, 'objective': 'multi:softprob',
@@ -280,7 +308,7 @@ class TestDMatrix:
m.set_info(feature_weights=fw)
np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
# Handle empty
m.set_info(feature_weights=np.empty((0, 0)))
m.set_info(feature_weights=np.empty((0, )))
assert m.get_float_info('feature_weights').shape[0] == 0