Rewrite approx (#7214)
This PR rewrites the approx tree method to share the codebase with hist for better performance and code sharing. The rewrite has many benefits: - Support for both `max_leaves` and `max_depth`. - Support for `grow_policy`. - Support for monotone constraints. - Support for feature weights. - Support for easier bin configuration (`max_bin`). - Support for categorical data. - Faster performance for most datasets (many times faster). - Support for the prediction cache. - Significantly better performance for external memory. - Unifies the code base between approx and hist.
This commit is contained in:
@@ -63,7 +63,6 @@ training_dset = xgb.DMatrix(x, label=y)
|
||||
|
||||
|
||||
class TestMonotoneConstraints:
|
||||
|
||||
def test_monotone_constraints_for_exact_tree_method(self):
|
||||
|
||||
# first check monotonicity for the 'exact' tree method
|
||||
@@ -76,32 +75,23 @@ class TestMonotoneConstraints:
|
||||
)
|
||||
assert is_correctly_constrained(constrained_exact_method)
|
||||
|
||||
def test_monotone_constraints_for_depthwise_hist_tree_method(self):
|
||||
|
||||
# next check monotonicity for the 'hist' tree method
|
||||
params_for_constrained_hist_method = {
|
||||
'tree_method': 'hist', 'verbosity': 1,
|
||||
'monotone_constraints': '(1, -1)'
|
||||
@pytest.mark.parametrize(
|
||||
"tree_method,policy",
|
||||
[
|
||||
("hist", "depthwise"),
|
||||
("approx", "depthwise"),
|
||||
("hist", "lossguide"),
|
||||
("approx", "lossguide"),
|
||||
],
|
||||
)
|
||||
def test_monotone_constraints(self, tree_method: str, policy: str) -> None:
|
||||
params_for_constrained = {
|
||||
"tree_method": tree_method,
|
||||
"grow_policy": policy,
|
||||
"monotone_constraints": "(1, -1)",
|
||||
}
|
||||
constrained_hist_method = xgb.train(
|
||||
params_for_constrained_hist_method, training_dset
|
||||
)
|
||||
|
||||
assert is_correctly_constrained(constrained_hist_method)
|
||||
|
||||
def test_monotone_constraints_for_lossguide_hist_tree_method(self):
|
||||
|
||||
# next check monotonicity for the 'hist' tree method
|
||||
params_for_constrained_hist_method = {
|
||||
'tree_method': 'hist', 'verbosity': 1,
|
||||
'grow_policy': 'lossguide',
|
||||
'monotone_constraints': '(1, -1)'
|
||||
}
|
||||
constrained_hist_method = xgb.train(
|
||||
params_for_constrained_hist_method, training_dset
|
||||
)
|
||||
|
||||
assert is_correctly_constrained(constrained_hist_method)
|
||||
constrained = xgb.train(params_for_constrained, training_dset)
|
||||
assert is_correctly_constrained(constrained)
|
||||
|
||||
@pytest.mark.parametrize('format', [dict, list])
|
||||
def test_monotone_constraints_feature_names(self, format):
|
||||
|
||||
@@ -45,14 +45,20 @@ class TestTreeMethod:
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
assert tm.non_increasing(result['train'][dataset.metric])
|
||||
|
||||
@given(exact_parameter_strategy, strategies.integers(1, 20),
|
||||
tm.dataset_strategy)
|
||||
@given(
|
||||
exact_parameter_strategy,
|
||||
hist_parameter_strategy,
|
||||
strategies.integers(1, 20),
|
||||
tm.dataset_strategy,
|
||||
)
|
||||
@settings(deadline=None)
|
||||
def test_approx(self, param, num_rounds, dataset):
|
||||
param['tree_method'] = 'approx'
|
||||
def test_approx(self, param, hist_param, num_rounds, dataset):
|
||||
param["tree_method"] = "approx"
|
||||
param = dataset.set_params(param)
|
||||
param.update(hist_param)
|
||||
result = train_result(param, dataset.get_dmat(), num_rounds)
|
||||
assert tm.non_increasing(result['train'][dataset.metric], 1e-3)
|
||||
note(result)
|
||||
assert tm.non_increasing(result["train"][dataset.metric])
|
||||
|
||||
@pytest.mark.skipif(**tm.no_sklearn())
|
||||
def test_pruner(self):
|
||||
@@ -126,3 +132,53 @@ class TestTreeMethod:
|
||||
y = [1000000., 0., 0., 500000.]
|
||||
w = [0, 0, 1, 0]
|
||||
model.fit(X, y, sample_weight=w)
|
||||
|
||||
def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
|
||||
onehot, label = tm.make_categorical(rows, cols, cats, True)
|
||||
cat, _ = tm.make_categorical(rows, cols, cats, False)
|
||||
|
||||
by_etl_results = {}
|
||||
by_builtin_results = {}
|
||||
|
||||
predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
|
||||
# Use one-hot exclusively
|
||||
parameters = {
|
||||
"tree_method": tree_method, "predictor": predictor, "max_cat_to_onehot": 9999
|
||||
}
|
||||
|
||||
m = xgb.DMatrix(onehot, label, enable_categorical=False)
|
||||
xgb.train(
|
||||
parameters,
|
||||
m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, "Train")],
|
||||
evals_result=by_etl_results,
|
||||
)
|
||||
|
||||
m = xgb.DMatrix(cat, label, enable_categorical=True)
|
||||
xgb.train(
|
||||
parameters,
|
||||
m,
|
||||
num_boost_round=rounds,
|
||||
evals=[(m, "Train")],
|
||||
evals_result=by_builtin_results,
|
||||
)
|
||||
|
||||
# There are guidelines on how to specify tolerance based on considering output as
|
||||
# random variables. But in here the tree construction is extremely sensitive to
|
||||
# floating point errors. A 1e-5 error in a histogram bin can lead to an entirely
|
||||
# different tree. So even though the test is quite lenient, hypothesis can still
|
||||
# pick up falsifying examples from time to time.
|
||||
np.testing.assert_allclose(
|
||||
np.array(by_etl_results["Train"]["rmse"]),
|
||||
np.array(by_builtin_results["Train"]["rmse"]),
|
||||
rtol=1e-3,
|
||||
)
|
||||
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
|
||||
|
||||
@given(strategies.integers(10, 400), strategies.integers(3, 8),
|
||||
strategies.integers(1, 2), strategies.integers(4, 7))
|
||||
@settings(deadline=None)
|
||||
@pytest.mark.skipif(**tm.no_pandas())
|
||||
def test_categorical(self, rows, cols, rounds, cats):
|
||||
self.run_categorical_basic(rows, cols, rounds, cats, "approx")
|
||||
|
||||
@@ -1184,9 +1184,13 @@ class TestWithDask:
|
||||
for arg in rabit_args:
|
||||
if arg.decode('utf-8').startswith('DMLC_TRACKER_PORT'):
|
||||
port_env = arg.decode('utf-8')
|
||||
if arg.decode("utf-8").startswith("DMLC_TRACKER_URI"):
|
||||
uri_env = arg.decode("utf-8")
|
||||
port = port_env.split('=')
|
||||
env = os.environ.copy()
|
||||
env[port[0]] = port[1]
|
||||
uri = uri_env.split("=")
|
||||
env["DMLC_TRACKER_URI"] = uri[1]
|
||||
return subprocess.run([str(exe), test], env=env, capture_output=True)
|
||||
|
||||
with LocalCluster(n_workers=4) as cluster:
|
||||
@@ -1210,11 +1214,13 @@ class TestWithDask:
|
||||
@pytest.mark.gtest
|
||||
def test_quantile_basic(self) -> None:
|
||||
self.run_quantile('DistributedBasic')
|
||||
self.run_quantile('SortedDistributedBasic')
|
||||
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
@pytest.mark.gtest
|
||||
def test_quantile(self) -> None:
|
||||
self.run_quantile('Distributed')
|
||||
self.run_quantile('SortedDistributed')
|
||||
|
||||
@pytest.mark.skipif(**tm.no_dask())
|
||||
@pytest.mark.gtest
|
||||
@@ -1252,13 +1258,17 @@ class TestWithDask:
|
||||
for i in range(kCols):
|
||||
fw[i] *= float(i)
|
||||
fw = da.from_array(fw)
|
||||
poly_increasing = run_feature_weights(X, y, fw, model=xgb.dask.DaskXGBRegressor)
|
||||
poly_increasing = run_feature_weights(
|
||||
X, y, fw, "approx", model=xgb.dask.DaskXGBRegressor
|
||||
)
|
||||
|
||||
fw = np.ones(shape=(kCols,))
|
||||
for i in range(kCols):
|
||||
fw[i] *= float(kCols - i)
|
||||
fw = da.from_array(fw)
|
||||
poly_decreasing = run_feature_weights(X, y, fw, model=xgb.dask.DaskXGBRegressor)
|
||||
poly_decreasing = run_feature_weights(
|
||||
X, y, fw, "approx", model=xgb.dask.DaskXGBRegressor
|
||||
)
|
||||
|
||||
# Approximated test; this is dependent on the implementation of random
|
||||
# number generator in std library.
|
||||
|
||||
@@ -1031,10 +1031,10 @@ def test_pandas_input():
|
||||
np.array([0, 1]))
|
||||
|
||||
|
||||
def run_feature_weights(X, y, fw, model=xgb.XGBRegressor):
|
||||
def run_feature_weights(X, y, fw, tree_method, model=xgb.XGBRegressor):
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
colsample_bynode = 0.5
|
||||
reg = model(tree_method='hist', colsample_bynode=colsample_bynode)
|
||||
reg = model(tree_method=tree_method, colsample_bynode=colsample_bynode)
|
||||
|
||||
reg.fit(X, y, feature_weights=fw)
|
||||
model_path = os.path.join(tmpdir, 'model.json')
|
||||
@@ -1069,7 +1069,8 @@ def run_feature_weights(X, y, fw, model=xgb.XGBRegressor):
|
||||
return w
|
||||
|
||||
|
||||
def test_feature_weights():
|
||||
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
|
||||
def test_feature_weights(tree_method):
|
||||
kRows = 512
|
||||
kCols = 64
|
||||
X = rng.randn(kRows, kCols)
|
||||
@@ -1078,12 +1079,12 @@ def test_feature_weights():
|
||||
fw = np.ones(shape=(kCols,))
|
||||
for i in range(kCols):
|
||||
fw[i] *= float(i)
|
||||
poly_increasing = run_feature_weights(X, y, fw, xgb.XGBRegressor)
|
||||
poly_increasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
|
||||
|
||||
fw = np.ones(shape=(kCols,))
|
||||
for i in range(kCols):
|
||||
fw[i] *= float(kCols - i)
|
||||
poly_decreasing = run_feature_weights(X, y, fw, xgb.XGBRegressor)
|
||||
poly_decreasing = run_feature_weights(X, y, fw, tree_method, xgb.XGBRegressor)
|
||||
|
||||
# Approximated test; this is dependent on the implementation of random
|
||||
# number generator in std library.
|
||||
|
||||
Reference in New Issue
Block a user