Initial support for multi-target tree. (#8616)
* Implement multi-target for hist. - Add new hist tree builder. - Move data fetchers for tests. - Dispatch function calls in gbm based on the tree type.
This commit is contained in:
@@ -3,7 +3,7 @@ import os
|
||||
import subprocess
|
||||
import sys
|
||||
from multiprocessing import Pool, cpu_count
|
||||
from typing import Dict, Optional, Tuple
|
||||
from typing import Dict, Tuple
|
||||
|
||||
from pylint import epylint
|
||||
from test_utils import PY_PACKAGE, ROOT, cd, print_time, record_time
|
||||
@@ -15,8 +15,11 @@ SRCPATH = os.path.normpath(
|
||||
|
||||
|
||||
@record_time
|
||||
def run_black(rel_path: str) -> bool:
|
||||
cmd = ["black", "-q", "--check", rel_path]
|
||||
def run_black(rel_path: str, fix: bool) -> bool:
|
||||
if fix:
|
||||
cmd = ["black", "-q", rel_path]
|
||||
else:
|
||||
cmd = ["black", "-q", "--check", rel_path]
|
||||
ret = subprocess.run(cmd).returncode
|
||||
if ret != 0:
|
||||
subprocess.run(["black", "--version"])
|
||||
@@ -31,8 +34,11 @@ Please run the following command on your machine to address the formatting error
|
||||
|
||||
|
||||
@record_time
|
||||
def run_isort(rel_path: str) -> bool:
|
||||
cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
|
||||
def run_isort(rel_path: str, fix: bool) -> bool:
|
||||
if fix:
|
||||
cmd = ["isort", f"--src={SRCPATH}", "--profile=black", rel_path]
|
||||
else:
|
||||
cmd = ["isort", f"--src={SRCPATH}", "--check", "--profile=black", rel_path]
|
||||
ret = subprocess.run(cmd).returncode
|
||||
if ret != 0:
|
||||
subprocess.run(["isort", "--version"])
|
||||
@@ -132,7 +138,7 @@ def run_pylint() -> bool:
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
if args.format == 1:
|
||||
black_results = [
|
||||
run_black(path)
|
||||
run_black(path, args.fix)
|
||||
for path in [
|
||||
# core
|
||||
"python-package/",
|
||||
@@ -166,7 +172,7 @@ def main(args: argparse.Namespace) -> None:
|
||||
sys.exit(-1)
|
||||
|
||||
isort_results = [
|
||||
run_isort(path)
|
||||
run_isort(path, args.fix)
|
||||
for path in [
|
||||
# core
|
||||
"python-package/",
|
||||
@@ -230,6 +236,11 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--format", type=int, choices=[0, 1], default=1)
|
||||
parser.add_argument("--type-check", type=int, choices=[0, 1], default=1)
|
||||
parser.add_argument("--pylint", type=int, choices=[0, 1], default=1)
|
||||
parser.add_argument(
|
||||
"--fix",
|
||||
action="store_true",
|
||||
help="Fix the formatting issues instead of emitting an error.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
try:
|
||||
main(args)
|
||||
|
||||
@@ -412,7 +412,7 @@ std::pair<Json, Json> TestModelSlice(std::string booster) {
|
||||
j++;
|
||||
}
|
||||
|
||||
// CHECK sliced model doesn't have dependency on old one
|
||||
// CHECK sliced model doesn't have dependency on the old one
|
||||
learner.reset();
|
||||
CHECK_EQ(sliced->GetNumFeature(), kCols);
|
||||
|
||||
|
||||
@@ -473,7 +473,7 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint
|
||||
int32_t device = Context::kCpuId) {
|
||||
size_t shape[1]{1};
|
||||
LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
|
||||
n_groups, 1, MultiStrategy::kComposite);
|
||||
n_groups, 1, MultiStrategy::kOneOutputPerTree);
|
||||
return mparam;
|
||||
}
|
||||
|
||||
|
||||
@@ -428,7 +428,7 @@ void TestVectorLeafPrediction(Context const *ctx) {
|
||||
|
||||
LearnerModelParam mparam{static_cast<bst_feature_t>(kCols),
|
||||
linalg::Vector<float>{{0.5}, {1}, Context::kCpuId}, 1, 3,
|
||||
MultiStrategy::kMonolithic};
|
||||
MultiStrategy::kMultiOutputTree};
|
||||
|
||||
std::vector<std::unique_ptr<RegTree>> trees;
|
||||
trees.emplace_back(new RegTree{mparam.LeafLength(), mparam.num_feature});
|
||||
|
||||
@@ -124,11 +124,11 @@ TEST(MultiStrategy, Configure) {
|
||||
auto p_fmat = RandomDataGenerator{12ul, 3ul, 0.0}.GenerateDMatrix();
|
||||
p_fmat->Info().labels.Reshape(p_fmat->Info().num_row_, 2);
|
||||
std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
|
||||
learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "2"}});
|
||||
learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "2"}});
|
||||
learner->Configure();
|
||||
ASSERT_EQ(learner->Groups(), 2);
|
||||
|
||||
learner->SetParams(Args{{"multi_strategy", "monolithic"}, {"num_target", "0"}});
|
||||
learner->SetParams(Args{{"multi_strategy", "multi_output_tree"}, {"num_target", "0"}});
|
||||
ASSERT_THROW({ learner->Configure(); }, dmlc::Error);
|
||||
}
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -116,7 +116,7 @@ def test_with_mq2008(objective, metric) -> None:
|
||||
x_valid,
|
||||
y_valid,
|
||||
qid_valid,
|
||||
) = tm.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
|
||||
) = tm.data.get_mq2008(os.path.join(os.path.join(tm.demo_dir(__file__), "rank")))
|
||||
|
||||
if metric.find("map") != -1 or objective.find("map") != -1:
|
||||
y_train[y_train <= 1] = 0.0
|
||||
|
||||
@@ -32,6 +32,19 @@ def train_result(param, dmat: xgb.DMatrix, num_rounds: int) -> dict:
|
||||
return result
|
||||
|
||||
|
||||
class TestGPUUpdatersMulti:
|
||||
@given(
|
||||
hist_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
|
||||
)
|
||||
@settings(deadline=None, max_examples=50, print_blob=True)
|
||||
def test_hist(self, param, num_rounds, dataset):
|
||||
param["tree_method"] = "gpu_hist"
|
||||
param = dataset.set_params(param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
note(result)
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
|
||||
class TestGPUUpdaters:
|
||||
cputest = test_up.TestTreeMethod()
|
||||
|
||||
@@ -101,7 +114,7 @@ class TestGPUUpdaters:
|
||||
) -> None:
|
||||
cat_parameters.update(hist_parameters)
|
||||
dataset = tm.TestDataset(
|
||||
"ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
|
||||
"ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
|
||||
)
|
||||
cat_parameters["tree_method"] = "gpu_hist"
|
||||
results = train_result(cat_parameters, dataset.get_dmat(), 16)
|
||||
|
||||
@@ -15,13 +15,17 @@ rng = np.random.RandomState(1994)
|
||||
|
||||
|
||||
def json_model(model_path: str, parameters: dict) -> dict:
|
||||
X = np.random.random((10, 3))
|
||||
y = np.random.randint(2, size=(10,))
|
||||
datasets = pytest.importorskip("sklearn.datasets")
|
||||
|
||||
X, y = datasets.make_classification(64, n_features=8, n_classes=3, n_informative=6)
|
||||
if parameters.get("objective", None) == "multi:softmax":
|
||||
parameters["num_class"] = 3
|
||||
|
||||
dm1 = xgb.DMatrix(X, y)
|
||||
|
||||
bst = xgb.train(parameters, dm1)
|
||||
bst.save_model(model_path)
|
||||
|
||||
if model_path.endswith("ubj"):
|
||||
import ubjson
|
||||
with open(model_path, "rb") as ubjfd:
|
||||
@@ -326,24 +330,43 @@ class TestModels:
|
||||
from_ubjraw = xgb.Booster()
|
||||
from_ubjraw.load_model(ubj_raw)
|
||||
|
||||
old_from_json = from_jraw.save_raw(raw_format="deprecated")
|
||||
old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
|
||||
if parameters.get("multi_strategy", None) != "multi_output_tree":
|
||||
# old binary model is not supported.
|
||||
old_from_json = from_jraw.save_raw(raw_format="deprecated")
|
||||
old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
|
||||
|
||||
assert old_from_json == old_from_ubj
|
||||
assert old_from_json == old_from_ubj
|
||||
|
||||
raw_json = bst.save_raw(raw_format="json")
|
||||
pretty = json.dumps(json.loads(raw_json), indent=2) + "\n\n"
|
||||
bst.load_model(bytearray(pretty, encoding="ascii"))
|
||||
|
||||
old_from_json = from_jraw.save_raw(raw_format="deprecated")
|
||||
old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
|
||||
if parameters.get("multi_strategy", None) != "multi_output_tree":
|
||||
# old binary model is not supported.
|
||||
old_from_json = from_jraw.save_raw(raw_format="deprecated")
|
||||
old_from_ubj = from_ubjraw.save_raw(raw_format="deprecated")
|
||||
|
||||
assert old_from_json == old_from_ubj
|
||||
assert old_from_json == old_from_ubj
|
||||
|
||||
rng = np.random.default_rng()
|
||||
X = rng.random(size=from_jraw.num_features() * 10).reshape(
|
||||
(10, from_jraw.num_features())
|
||||
)
|
||||
predt_from_jraw = from_jraw.predict(xgb.DMatrix(X))
|
||||
predt_from_bst = bst.predict(xgb.DMatrix(X))
|
||||
np.testing.assert_allclose(predt_from_jraw, predt_from_bst)
|
||||
|
||||
@pytest.mark.parametrize("ext", ["json", "ubj"])
|
||||
def test_model_json_io(self, ext: str) -> None:
|
||||
parameters = {"booster": "gbtree", "tree_method": "hist"}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
parameters = {
|
||||
"booster": "gbtree",
|
||||
"tree_method": "hist",
|
||||
"multi_strategy": "multi_output_tree",
|
||||
"objective": "multi:softmax",
|
||||
}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
parameters = {"booster": "gblinear"}
|
||||
self.run_model_json_io(parameters, ext)
|
||||
parameters = {"booster": "dart", "tree_method": "hist"}
|
||||
|
||||
@@ -465,7 +465,7 @@ class TestCallbacks:
|
||||
assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".pkl"))
|
||||
|
||||
def test_callback_list(self):
|
||||
X, y = tm.get_california_housing()
|
||||
X, y = tm.data.get_california_housing()
|
||||
m = xgb.DMatrix(X, y)
|
||||
callbacks = [xgb.callback.EarlyStopping(rounds=10)]
|
||||
for i in range(4):
|
||||
|
||||
@@ -82,7 +82,7 @@ class TestRanking:
|
||||
"""
|
||||
cls.dpath = 'demo/rank/'
|
||||
(x_train, y_train, qid_train, x_test, y_test, qid_test,
|
||||
x_valid, y_valid, qid_valid) = tm.get_mq2008(cls.dpath)
|
||||
x_valid, y_valid, qid_valid) = tm.data.get_mq2008(cls.dpath)
|
||||
|
||||
# instantiate the matrices
|
||||
cls.dtrain = xgboost.DMatrix(x_train, y_train)
|
||||
|
||||
@@ -11,6 +11,7 @@ from xgboost import testing as tm
|
||||
from xgboost.testing.params import (
|
||||
cat_parameter_strategy,
|
||||
exact_parameter_strategy,
|
||||
hist_multi_parameter_strategy,
|
||||
hist_parameter_strategy,
|
||||
)
|
||||
from xgboost.testing.updater import check_init_estimation, check_quantile_loss
|
||||
@@ -18,11 +19,70 @@ from xgboost.testing.updater import check_init_estimation, check_quantile_loss
|
||||
|
||||
def train_result(param, dmat, num_rounds):
|
||||
result = {}
|
||||
xgb.train(param, dmat, num_rounds, [(dmat, 'train')], verbose_eval=False,
|
||||
evals_result=result)
|
||||
booster = xgb.train(
|
||||
param,
|
||||
dmat,
|
||||
num_rounds,
|
||||
[(dmat, "train")],
|
||||
verbose_eval=False,
|
||||
evals_result=result,
|
||||
)
|
||||
assert booster.num_features() == dmat.num_col()
|
||||
assert booster.num_boosted_rounds() == num_rounds
|
||||
assert booster.feature_names == dmat.feature_names
|
||||
assert booster.feature_types == dmat.feature_types
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class TestTreeMethodMulti:
|
||||
@given(
|
||||
exact_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
|
||||
)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_exact(self, param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
|
||||
if dataset.name.endswith("-l1"):
|
||||
return
|
||||
param["tree_method"] = "exact"
|
||||
param = dataset.set_params(param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
@given(
|
||||
exact_parameter_strategy,
|
||||
hist_parameter_strategy,
|
||||
strategies.integers(1, 20),
|
||||
tm.multi_dataset_strategy,
|
||||
)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_approx(self, param, hist_param, num_rounds, dataset):
|
||||
param["tree_method"] = "approx"
|
||||
param = dataset.set_params(param)
|
||||
param.update(hist_param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
note(result)
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
@given(
|
||||
exact_parameter_strategy,
|
||||
hist_multi_parameter_strategy,
|
||||
strategies.integers(1, 20),
|
||||
tm.multi_dataset_strategy,
|
||||
)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_hist(
|
||||
self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset
|
||||
) -> None:
|
||||
if dataset.name.endswith("-l1"):
|
||||
return
|
||||
param["tree_method"] = "hist"
|
||||
param = dataset.set_params(param)
|
||||
param.update(hist_param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
note(result)
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
|
||||
class TestTreeMethod:
|
||||
USE_ONEHOT = np.iinfo(np.int32).max
|
||||
USE_PART = 1
|
||||
@@ -77,10 +137,14 @@ class TestTreeMethod:
|
||||
# Second prune should not change the tree
|
||||
assert after_prune == second_prune
|
||||
|
||||
@given(exact_parameter_strategy, hist_parameter_strategy, strategies.integers(1, 20),
|
||||
tm.dataset_strategy)
|
||||
@given(
|
||||
exact_parameter_strategy,
|
||||
hist_parameter_strategy,
|
||||
strategies.integers(1, 20),
|
||||
tm.dataset_strategy
|
||||
)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_hist(self, param, hist_param, num_rounds, dataset):
|
||||
def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None:
|
||||
param['tree_method'] = 'hist'
|
||||
param = dataset.set_params(param)
|
||||
param.update(hist_param)
|
||||
@@ -88,23 +152,6 @@ class TestTreeMethod:
|
||||
note(result)
|
||||
assert tm.non_increasing(result['train'][dataset.metric])
|
||||
|
||||
@given(tm.sparse_datasets_strategy)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_sparse(self, dataset):
|
||||
param = {"tree_method": "hist", "max_bin": 64}
|
||||
hist_result = train_result(param, dataset.get_dmat(), 16)
|
||||
note(hist_result)
|
||||
assert tm.non_increasing(hist_result['train'][dataset.metric])
|
||||
|
||||
param = {"tree_method": "approx", "max_bin": 64}
|
||||
approx_result = train_result(param, dataset.get_dmat(), 16)
|
||||
note(approx_result)
|
||||
assert tm.non_increasing(approx_result['train'][dataset.metric])
|
||||
|
||||
np.testing.assert_allclose(
|
||||
hist_result["train"]["rmse"], approx_result["train"]["rmse"]
|
||||
)
|
||||
|
||||
def test_hist_categorical(self):
|
||||
# hist must be same as exact on all-categorial data
|
||||
dpath = 'demo/data/'
|
||||
@@ -143,6 +190,23 @@ class TestTreeMethod:
|
||||
w = [0, 0, 1, 0]
|
||||
model.fit(X, y, sample_weight=w)
|
||||
|
||||
@given(tm.sparse_datasets_strategy)
|
||||
@settings(deadline=None, print_blob=True)
|
||||
def test_sparse(self, dataset):
|
||||
param = {"tree_method": "hist", "max_bin": 64}
|
||||
hist_result = train_result(param, dataset.get_dmat(), 16)
|
||||
note(hist_result)
|
||||
assert tm.non_increasing(hist_result['train'][dataset.metric])
|
||||
|
||||
param = {"tree_method": "approx", "max_bin": 64}
|
||||
approx_result = train_result(param, dataset.get_dmat(), 16)
|
||||
note(approx_result)
|
||||
assert tm.non_increasing(approx_result['train'][dataset.metric])
|
||||
|
||||
np.testing.assert_allclose(
|
||||
hist_result["train"]["rmse"], approx_result["train"]["rmse"]
|
||||
)
|
||||
|
||||
def run_invalid_category(self, tree_method: str) -> None:
|
||||
rng = np.random.default_rng()
|
||||
# too large
|
||||
@@ -365,7 +429,7 @@ class TestTreeMethod:
|
||||
) -> None:
|
||||
cat_parameters.update(hist_parameters)
|
||||
dataset = tm.TestDataset(
|
||||
"ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse"
|
||||
"ames_housing", tm.data.get_ames_housing, "reg:squarederror", "rmse"
|
||||
)
|
||||
cat_parameters["tree_method"] = tree_method
|
||||
results = train_result(cat_parameters, dataset.get_dmat(), 16)
|
||||
|
||||
@@ -1168,7 +1168,7 @@ def test_dask_aft_survival() -> None:
|
||||
|
||||
def test_dask_ranking(client: "Client") -> None:
|
||||
dpath = "demo/rank/"
|
||||
mq2008 = tm.get_mq2008(dpath)
|
||||
mq2008 = tm.data.get_mq2008(dpath)
|
||||
data = []
|
||||
for d in mq2008:
|
||||
if isinstance(d, scipy.sparse.csr_matrix):
|
||||
|
||||
Reference in New Issue
Block a user