Initial support for multioutput regression. (#7514)

* Add num target model parameter, which is configured from input labels.
* Change elementwise metric and indexing for weights.
* Add demo.
* Add tests.
This commit is contained in:
Jiaming Yuan
2021-12-18 09:28:38 +08:00
committed by GitHub
parent 9ab73f737e
commit 58a6723eb1
22 changed files with 306 additions and 67 deletions

View File

@@ -92,6 +92,7 @@ TEST(CAPI, ConfigIO) {
labels[i] = i;
}
p_dmat->Info().labels.Data()->HostVector() = labels;
p_dmat->Info().labels.Reshape(kRows);
std::shared_ptr<Learner> learner { Learner::Create(mat) };
@@ -126,6 +127,7 @@ TEST(CAPI, JsonModelIO) {
labels[i] = i;
}
p_dmat->Info().labels.Data()->HostVector() = labels;
p_dmat->Info().labels.Reshape(kRows);
std::shared_ptr<Learner> learner { Learner::Create(mat) };

View File

@@ -9,8 +9,9 @@
#include <xgboost/linalg.h>
#include <numeric>
#include "../../../src/data/array_interface.h"
#include "../../../src/common/linalg_op.h"
#include "../../../src/data/array_interface.h"
namespace xgboost {
inline void TestMetaInfoStridedData(int32_t device) {

View File

@@ -144,15 +144,26 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
}
xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
// Convenience wrapper for labels supplied as a flat vector: the labels are
// packed into a linalg::Tensor with a one-dimensional shape of labels.size()
// and the evaluation is delegated to GetMultiMetricEval.
xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
                                 xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
                                 std::vector<xgboost::bst_float> labels,
                                 std::vector<xgboost::bst_float> weights,
                                 std::vector<xgboost::bst_uint> groups) {
  // -1 is passed as the device argument (presumably selecting host memory --
  // TODO confirm against the Tensor constructor).
  xgboost::linalg::Tensor<float, 2> const label_tensor{
      labels.begin(), labels.end(), {labels.size()}, -1};
  // The double returned by GetMultiMetricEval is narrowed to bst_float here.
  return GetMultiMetricEval(metric, preds, label_tensor, weights, groups);
}
double GetMultiMetricEval(xgboost::Metric* metric,
xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
xgboost::linalg::Tensor<float, 2> const& labels,
std::vector<xgboost::bst_float> weights,
std::vector<xgboost::bst_uint> groups) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels =
xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1};
info.num_row_ = labels.Shape(0);
info.labels.Reshape(labels.Shape()[0], labels.Shape()[1]);
info.labels.Data()->Copy(*labels.Data());
info.weights_.HostVector() = weights;
info.group_ptr_ = groups;
@@ -344,13 +355,14 @@ RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label,
RandomDataGenerator gen(rows_, 1, 0);
if (!float_label) {
gen.Lower(0).Upper(classes).GenerateDense(out->Info().labels.Data());
out->Info().labels.Reshape(out->Info().labels.Size());
out->Info().labels.Reshape(this->rows_);
auto& h_labels = out->Info().labels.Data()->HostVector();
for (auto& v : h_labels) {
v = static_cast<float>(static_cast<uint32_t>(v));
}
} else {
gen.GenerateDense(out->Info().labels.Data());
out->Info().labels.Reshape(this->rows_);
}
}
if (device_ >= 0) {

View File

@@ -91,6 +91,12 @@ xgboost::bst_float GetMetricEval(
std::vector<xgboost::bst_float> weights = std::vector<xgboost::bst_float>(),
std::vector<xgboost::bst_uint> groups = std::vector<xgboost::bst_uint>());
// Test helper: counterpart of GetMetricEval that takes the labels as a
// two-dimensional linalg::Tensor instead of a flat vector.
//
// metric  - metric under test.
// preds   - predictions passed to the metric.
// labels  - label tensor; the definition reads its Shape(0) as the row count.
// weights - optional weights, empty by default.
// groups  - optional group pointers, empty by default.
//
// Returns the metric value as a double.
double GetMultiMetricEval(xgboost::Metric* metric,
xgboost::HostDeviceVector<xgboost::bst_float> const& preds,
xgboost::linalg::Tensor<float, 2> const& labels,
std::vector<xgboost::bst_float> weights = {},
std::vector<xgboost::bst_uint> groups = {});
namespace xgboost {
bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
std::vector<xgboost::bst_float>::const_iterator _end1,

View File

@@ -40,6 +40,9 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device)
} // anonymous namespace
} // namespace xgboost
namespace xgboost {
namespace metric {
TEST(Metric, DeclareUnifiedTest(RMSE)) {
auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
@@ -276,3 +279,27 @@ TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
xgboost::CheckDeterministicMetricElementWise(xgboost::StringView{"mphe"}, GPUIDX);
}
// Checks the "rmse" metric against a 2-dim (n_samples x n_targets) label
// tensor, with and without unit weights, by comparing it to a reference RMSE
// computed directly from the labels.
TEST(Metric, DeclareUnifiedTest(MultiRMSE)) {
  size_t n_samples = 32, n_targets = 8;
  linalg::Tensor<float, 2> y{{n_samples, n_targets}, GPUIDX};
  auto &h_y = y.Data()->HostVector();
  // Labels are 0, 1, 2, ... in storage order.
  std::iota(h_y.begin(), h_y.end(), 0);

  // Constructed with (size, 0); assuming the second argument is the fill
  // value, every residual equals the label itself.
  HostDeviceVector<float> predt(n_samples * n_targets, 0);

  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
  std::unique_ptr<Metric> metric{Metric::Create("rmse", &lparam)};
  metric->Configure({});

  auto loss = GetMultiMetricEval(metric.get(), predt, y);
  // One weight of 1 per sample: the weighted result must match the unweighted one.
  std::vector<float> weights(n_samples, 1);
  auto loss_w = GetMultiMetricEval(metric.get(), predt, y, weights);

  // Reference value: sqrt(mean(y^2)).  The labels are squared in place, which
  // is safe only because both metric evaluations have already run.
  std::transform(h_y.cbegin(), h_y.cend(), h_y.begin(), [](auto const &v) { return v * v; });
  // The sum must start from 0.0; the 0.0 literal also keeps the accumulation
  // in double.  A non-zero seed skews the reference and is only hidden by the
  // tolerance of ASSERT_FLOAT_EQ.
  auto ret = std::sqrt(std::accumulate(h_y.cbegin(), h_y.cend(), 0.0, std::plus<>{}) / h_y.size());
  ASSERT_FLOAT_EQ(ret, loss);
  ASSERT_FLOAT_EQ(ret, loss_w);
}
} // namespace metric
} // namespace xgboost

View File

@@ -12,9 +12,9 @@
#include "xgboost/json.h"
#include "../../src/common/io.h"
#include "../../src/common/random.h"
#include "../../src/common/linalg_op.h"
namespace xgboost {
TEST(Learner, Basic) {
using Arg = std::pair<std::string, std::string>;
auto args = {Arg("tree_method", "exact")};
@@ -278,6 +278,7 @@ TEST(Learner, GPUConfiguration) {
labels[i] = i;
}
p_dmat->Info().labels.Data()->HostVector() = labels;
p_dmat->Info().labels.Reshape(kRows);
{
std::unique_ptr<Learner> learner {Learner::Create(mat)};
learner->SetParams({Arg{"booster", "gblinear"},
@@ -424,4 +425,28 @@ TEST(Learner, FeatureInfo) {
ASSERT_TRUE(std::equal(out_types.begin(), out_types.end(), types.begin()));
}
}
// Verifies that a learner configured on a DMatrix with (kRows x kTargets)
// labels saves "num_target" equal to kTargets, and that configuring the
// "multi:softprob" objective on the same data throws dmlc::Error.
TEST(Learner, MultiTarget) {
  size_t constexpr kRows{128}, kCols{10}, kTargets{3};
  auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();

  // Give the labels a second dimension and fill them with the value the
  // kernel's first callback argument provides.
  auto &labels = p_fmat->Info().labels;
  labels.Reshape(kRows, kTargets);
  linalg::ElementWiseKernelHost(labels.HostView(), omp_get_max_threads(),
                                [](auto idx, auto) { return idx; });

  {
    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
    learner->Configure();

    Json saved{Object()};
    learner->SaveModel(&saved);
    // The parameter is stored as a string in the JSON model.
    auto const &num_target = get<String>(saved["learner"]["learner_model_param"]["num_target"]);
    ASSERT_EQ(num_target, std::to_string(kTargets));
  }

  {
    std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
    learner->SetParam("objective", "multi:softprob");
    // unsupported objective.
    EXPECT_THROW({ learner->Configure(); }, dmlc::Error);
  }
}
} // namespace xgboost

View File

@@ -60,8 +60,9 @@ def _test_from_cudf(DMatrixT):
assert dtrain.feature_names == ['x']
assert dtrain.feature_types == ['int']
with pytest.raises(Exception):
with pytest.raises(ValueError, match=r".*multi.*"):
dtrain = DMatrixT(cd, label=cd)
xgb.train({"tree_method": "gpu_hist", "objective": "multi:softprob"}, dtrain)
# Test when number of elements is less than 8
X = cudf.DataFrame({'x': cudf.Series([0, 1, 2, np.NAN, 4],

View File

@@ -50,9 +50,10 @@ def _test_from_cupy(DMatrixT):
dmatrix_from_cupy(np.int32, DMatrixT, -2)
dmatrix_from_cupy(np.int64, DMatrixT, -3)
with pytest.raises(Exception):
with pytest.raises(ValueError):
X = cp.random.randn(2, 2, dtype="float32")
DMatrixT(X, label=X)
y = cp.random.randn(2, 2, 3, dtype="float32")
DMatrixT(X, label=y)
def _test_cupy_training(DMatrixT):

View File

@@ -277,7 +277,9 @@ def run_gpu_hist(
X = to_cp(dataset.X, DMatrixT)
X = da.from_array(X, chunks=(chunk, dataset.X.shape[1]))
y = to_cp(dataset.y, DMatrixT)
y = da.from_array(y, chunks=(chunk,))
y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk, dataset.y.shape[1])
y = da.from_array(y, chunks=y_chunk)
if dataset.w is not None:
w = to_cp(dataset.w, DMatrixT)
w = da.from_array(w, chunks=(chunk,))

View File

@@ -52,8 +52,12 @@ def test_boost_from_prediction_gpu_hist():
X, y = load_digits(return_X_y=True)
X, y = cp.array(X), cp.array(y)
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame)
twskl.run_boost_from_prediction_multi_clasas(
xgb.XGBClassifier, tree_method, X, y, None
)
twskl.run_boost_from_prediction_multi_clasas(
xgb.XGBClassifier, tree_method, X, y, cudf.DataFrame
)
def test_num_parallel_tree():

View File

@@ -127,6 +127,14 @@ def test_continuation_demo():
subprocess.check_call(cmd)
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.skipif(**tm.no_matplotlib())
def test_multioutput_reg() -> None:
    """Run the multi-output regression demo script in a subprocess.

    The script is invoked with ``--plot=0`` (presumably to disable plotting;
    confirm against the demo's argument parser).  A non-zero exit status
    raises ``subprocess.CalledProcessError`` and fails the test.
    """
    demo_path = os.path.join(PYTHON_DEMO_DIR, "multioutput_regression.py")
    subprocess.check_call(['python', demo_path, "--plot=0"])
# gpu_acceleration is not tested because the covertype dataset is too large.
# gamma regression is not tested as it requires running an R script first.
# aft viz is not tested because plotting is not controlled.

View File

@@ -1114,9 +1114,9 @@ class TestWithDask:
return
chunk = 128
X = da.from_array(dataset.X,
chunks=(chunk, dataset.X.shape[1]))
y = da.from_array(dataset.y, chunks=(chunk,))
y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk, dataset.y.shape[1])
X = da.from_array(dataset.X, chunks=(chunk, dataset.X.shape[1]))
y = da.from_array(dataset.y, chunks=y_chunk)
if dataset.w is not None:
w = da.from_array(dataset.w, chunks=(chunk,))
else:

View File

@@ -1118,10 +1118,10 @@ def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Calla
def run_boost_from_prediction_multi_clasas(
tree_method, X, y, as_frame: Optional[Callable]
estimator, tree_method, X, y, as_frame: Optional[Callable]
):
# Multi-class
model_0 = xgb.XGBClassifier(
model_0 = estimator(
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
)
model_0.fit(X=X, y=y)
@@ -1129,7 +1129,7 @@ def run_boost_from_prediction_multi_clasas(
if as_frame is not None:
margin = as_frame(margin)
model_1 = xgb.XGBClassifier(
model_1 = estimator(
learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method
)
model_1.fit(X=X, y=y, base_margin=margin)
@@ -1137,7 +1137,7 @@ def run_boost_from_prediction_multi_clasas(
xgb.DMatrix(X, base_margin=margin), output_margin=True
)
model_2 = xgb.XGBClassifier(
model_2 = estimator(
learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method
)
model_2.fit(X=X, y=y)
@@ -1152,8 +1152,9 @@ def run_boost_from_prediction_multi_clasas(
@pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"])
def test_boost_from_prediction(tree_method):
from sklearn.datasets import load_breast_cancer, load_digits
from sklearn.datasets import load_breast_cancer, load_digits, make_regression
import pandas as pd
X, y = load_breast_cancer(return_X_y=True)
run_boost_from_prediction_binary(tree_method, X, y, None)
@@ -1161,8 +1162,13 @@ def test_boost_from_prediction(tree_method):
X, y = load_digits(return_X_y=True)
run_boost_from_prediction_multi_clasas(tree_method, X, y, None)
run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame)
run_boost_from_prediction_multi_clasas(xgb.XGBClassifier, tree_method, X, y, None)
run_boost_from_prediction_multi_clasas(
xgb.XGBClassifier, tree_method, X, y, pd.DataFrame
)
X, y = make_regression(n_samples=100, n_targets=4)
run_boost_from_prediction_multi_clasas(xgb.XGBRegressor, tree_method, X, y, None)
def test_estimator_type():

View File

@@ -305,26 +305,48 @@ def make_categorical(
_unweighted_datasets_strategy = strategies.sampled_from(
[TestDataset('boston', get_boston, 'reg:squarederror', 'rmse'),
TestDataset('digits', get_digits, 'multi:softmax', 'mlogloss'),
TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
TestDataset
("sparse", get_sparse, "reg:squarederror", "rmse"),
TestDataset("empty", lambda: (np.empty((0, 100)), np.empty(0)), "reg:squarederror",
"rmse")])
[
TestDataset("boston", get_boston, "reg:squarederror", "rmse"),
TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
TestDataset(
"mtreg",
lambda: datasets.make_regression(n_samples=128, n_targets=3),
"reg:squarederror",
"rmse",
),
TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
TestDataset(
"empty",
lambda: (np.empty((0, 100)), np.empty(0)),
"reg:squarederror",
"rmse",
),
]
)
@strategies.composite
def _dataset_weight_margin(draw):
data: TestDataset = draw(_unweighted_datasets_strategy)
if draw(strategies.booleans()):
data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0)))
data.w = draw(
arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))
)
if draw(strategies.booleans()):
num_class = 1
if data.objective == "multi:softmax":
num_class = int(np.max(data.y) + 1)
elif data.name == "mtreg":
num_class = data.y.shape[1]
data.margin = draw(
arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0)))
arrays(
np.float64,
(data.y.shape[0] * num_class),
elements=strategies.floats(0.5, 1.0),
)
)
if num_class != 1:
data.margin = data.margin.reshape(data.y.shape[0], num_class)