sync Jun 5

This commit is contained in:
amdsc21
2023-06-07 02:43:21 +02:00
56 changed files with 531 additions and 2106 deletions

View File

@@ -24,7 +24,7 @@ set -x
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
RAPIDS_VERSION=23.04
SPARK_VERSION=3.4.0
JDK_VERSION=8

View File

@@ -0,0 +1,10 @@
#!/bin/bash
# Rewrite the RAPIDS_VERSION=MAJOR.MINOR line in the conftest.sh that sits next
# to this script so it names the latest rapidsai/cuml release, as reported by
# `gh api`.
set -euo pipefail

# Reduce a release name of the form vMAJOR.MINOR<anything> to MAJOR.MINOR.
LATEST_RAPIDS_VERSION=$(gh api repos/rapidsai/cuml/releases/latest --jq '.name' | sed -e 's/^v\([[:digit:]]\+\.[[:digit:]]\+\).*/\1/')

# sed passes a non-matching name through unchanged; refuse to substitute
# anything that is not exactly MAJOR.MINOR into conftest.sh.
version_pattern='^[[:digit:]]+\.[[:digit:]]+$'
if [[ ! "$LATEST_RAPIDS_VERSION" =~ $version_pattern ]]; then
  echo "Unexpected RAPIDS release name: '$LATEST_RAPIDS_VERSION'" >&2
  exit 1
fi
echo "LATEST_RAPIDS_VERSION = $LATEST_RAPIDS_VERSION"

# Directory containing this script. `&&` makes a failed cd yield an error
# instead of silently reporting the caller's working directory.
PARENT_PATH=$( cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P )

# Quote the target so a checkout path containing spaces still works.
sed -i "s/^RAPIDS_VERSION=[[:digit:]]\+\.[[:digit:]]\+/RAPIDS_VERSION=${LATEST_RAPIDS_VERSION}/" "$PARENT_PATH/conftest.sh"

View File

@@ -18,8 +18,17 @@ rm -rf $(find . -name target)
rm -rf ../build/
# Re-build package without Mock Rabit
# Maven profiles:
# `default` includes modules: xgboost4j, xgboost4j-spark, xgboost4j-flink, xgboost4j-example
# `gpu` includes modules: xgboost4j-gpu, xgboost4j-spark-gpu, sets `use.cuda = ON`
# `scala-2.13` sets the Scala binary version to 2.13
# `release-to-s3` sets maven deployment targets
# Deploy to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests
mvn --no-transfer-progress package deploy -P default,gpu,release-to-s3 -Dspark.version=${spark_version} -DskipTests
# Deploy scala 2.13 to S3 bucket xgboost-maven-repo
mvn --no-transfer-progress package deploy -P release-to-s3,default,scala-2.13 -Dspark.version=${spark_version} -DskipTests
set +x
set +e

View File

@@ -90,7 +90,7 @@ def check_cmd_print_failure_assistance(cmd: List[str]) -> bool:
subprocess.run([cmd[0], "--version"])
msg = """
Please run the following command on your machine to address the formatting error:
Please run the following command on your machine to address the error:
"""
msg += " ".join(cmd)

View File

@@ -17,34 +17,30 @@
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/json.h" // for Json, String, Object
namespace xgboost {
namespace metric {
namespace xgboost::metric {
inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) {
// When the limit for precision is not given, it takes the limit at
// std::numeric_limits<unsigned>::max(); hence all values are very small
// NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
auto ctx = xgboost::CreateEmptyGenericParam(GPUIDX);
xgboost::Metric * metric = xgboost::Metric::Create("pre", &ctx);
std::unique_ptr<xgboost::Metric> metric{Metric::Create("pre", &ctx)};
ASSERT_STREQ(metric->Name(), "pre");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0, 1e-7);
EXPECT_NEAR(GetMetricEval(metric,
{0.1f, 0.9f, 0.1f, 0.9f},
{ 0, 0, 1, 1}, {}, {}, data_split_mode),
0, 1e-7);
EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5, 1e-7);
EXPECT_NEAR(
GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
0.5, 1e-7);
delete metric;
metric = xgboost::Metric::Create("pre@2", &ctx);
metric.reset(xgboost::Metric::Create("pre@2", &ctx));
ASSERT_STREQ(metric->Name(), "pre@2");
EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
EXPECT_NEAR(GetMetricEval(metric,
{0.1f, 0.9f, 0.1f, 0.9f},
{ 0, 0, 1, 1}, {}, {}, data_split_mode),
0.5f, 0.001f);
EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5f, 1e-7);
EXPECT_NEAR(
GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f, 0.9f}, {0, 0, 1, 1}, {}, {}, data_split_mode),
0.5f, 0.001f);
EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode));
EXPECT_ANY_THROW(GetMetricEval(metric.get(), {0, 1}, {}, {}, {}, data_split_mode));
delete metric;
metric.reset(xgboost::Metric::Create("pre@4", &ctx));
EXPECT_NEAR(GetMetricEval(metric.get(), {0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f},
{0.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f}, {}, {}, data_split_mode),
0.5f, 1e-7);
}
inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) {
@@ -187,5 +183,4 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo
ndcg = metric->Evaluate(predt, p_fmat);
ASSERT_NEAR(ndcg, 1.0, kRtEps);
}
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric

View File

@@ -17,13 +17,15 @@
#include "test_predictor.h"
namespace xgboost {
TEST(CpuPredictor, Basic) {
namespace {
void TestBasic(DMatrix* dmat) {
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
size_t constexpr kRows = 5;
size_t constexpr kCols = 5;
size_t const kRows = dmat->Info().num_row_;
size_t const kCols = dmat->Info().num_col_;
LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
@@ -31,12 +33,10 @@ TEST(CpuPredictor, Basic) {
ctx.UpdateAllowUnknown(Args{});
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
// Test predict batch
PredictionCacheEntry out_predictions;
cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
cpu_predictor->PredictBatch(dmat, &out_predictions, model, 0);
std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
@@ -44,26 +44,32 @@ TEST(CpuPredictor, Basic) {
}
// Test predict instance
auto const &batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
auto const& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
auto page = batch.GetView();
for (size_t i = 0; i < batch.Size(); i++) {
std::vector<float> instance_out_predictions;
cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model);
cpu_predictor->PredictInstance(page[i], &instance_out_predictions, model, 0,
dmat->Info().IsColumnSplit());
ASSERT_EQ(instance_out_predictions[0], 1.5);
}
// Test predict leaf
HostDeviceVector<float> leaf_out_predictions;
cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
cpu_predictor->PredictLeaf(dmat, &leaf_out_predictions, model);
auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
for (auto v : h_leaf_out_predictions) {
ASSERT_EQ(v, 0);
}
if (dmat->Info().IsColumnSplit()) {
// Predict contribution is not supported for column split.
return;
}
// Test predict contribution
HostDeviceVector<float> out_contribution_hdv;
auto& out_contribution = out_contribution_hdv.HostVector();
cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model);
ASSERT_EQ(out_contribution.size(), kRows * (kCols + 1));
for (size_t i = 0; i < out_contribution.size(); ++i) {
auto const& contri = out_contribution[i];
@@ -76,8 +82,7 @@ TEST(CpuPredictor, Basic) {
}
}
// Test predict contribution (approximate method)
cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model,
0, nullptr, true);
cpu_predictor->PredictContribution(dmat, &out_contribution_hdv, model, 0, nullptr, true);
for (size_t i = 0; i < out_contribution.size(); ++i) {
auto const& contri = out_contribution[i];
// shift 1 for bias, as test tree is a decision stump, only global bias is
@@ -89,41 +94,32 @@ TEST(CpuPredictor, Basic) {
}
}
}
} // anonymous namespace
namespace {
void TestColumnSplitPredictBatch() {
TEST(CpuPredictor, Basic) {
size_t constexpr kRows = 5;
size_t constexpr kCols = 5;
auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
TestBasic(dmat.get());
}
namespace {
void TestColumnSplit() {
size_t constexpr kRows = 5;
size_t constexpr kCols = 5;
auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
auto const world_size = collective::GetWorldSize();
auto const rank = collective::GetRank();
dmat = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
LearnerModelParam mparam{MakeMP(kCols, .0, 1)};
Context ctx;
ctx.UpdateAllowUnknown(Args{});
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
// Test predict batch
PredictionCacheEntry out_predictions;
cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
auto sliced = std::unique_ptr<DMatrix>{dmat->SliceCol(world_size, rank)};
cpu_predictor->PredictBatch(sliced.get(), &out_predictions, model, 0);
std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
ASSERT_EQ(out_predictions_h[i], 1.5);
}
TestBasic(dmat.get());
}
} // anonymous namespace
TEST(CpuPredictor, ColumnSplit) {
TEST(CpuPredictor, ColumnSplitBasic) {
auto constexpr kWorldSize = 2;
RunWithInMemoryCommunicator(kWorldSize, TestColumnSplitPredictBatch);
RunWithInMemoryCommunicator(kWorldSize, TestColumnSplit);
}
TEST(CpuPredictor, IterationRange) {
@@ -133,69 +129,8 @@ TEST(CpuPredictor, IterationRange) {
TEST(CpuPredictor, ExternalMemory) {
size_t constexpr kPageSize = 64, kEntriesPerCol = 3;
size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2;
std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(kEntries);
auto lparam = CreateEmptyGenericParam(GPUIDX);
std::unique_ptr<Predictor> cpu_predictor =
std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
LearnerModelParam mparam{MakeMP(dmat->Info().num_col_, .0, 1)};
Context ctx;
ctx.UpdateAllowUnknown(Args{});
gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);
// Test predict batch
PredictionCacheEntry out_predictions;
cpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
ASSERT_EQ(out_predictions.predictions.Size(), dmat->Info().num_row_);
for (const auto& v : out_predictions_h) {
ASSERT_EQ(v, 1.5);
}
// Test predict leaf
HostDeviceVector<float> leaf_out_predictions;
cpu_predictor->PredictLeaf(dmat.get(), &leaf_out_predictions, model);
auto const& h_leaf_out_predictions = leaf_out_predictions.ConstHostVector();
ASSERT_EQ(h_leaf_out_predictions.size(), dmat->Info().num_row_);
for (const auto& v : h_leaf_out_predictions) {
ASSERT_EQ(v, 0);
}
// Test predict contribution
HostDeviceVector<float> out_contribution_hdv;
auto& out_contribution = out_contribution_hdv.HostVector();
cpu_predictor->PredictContribution(dmat.get(), &out_contribution_hdv, model);
ASSERT_EQ(out_contribution.size(), dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
for (size_t i = 0; i < out_contribution.size(); ++i) {
auto const& contri = out_contribution[i];
// shift 1 for bias, as test tree is a decision stump, only global bias is filled with LeafValue().
if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
ASSERT_EQ(out_contribution.back(), 1.5f);
} else {
ASSERT_EQ(contri, 0);
}
}
// Test predict contribution (approximate method)
HostDeviceVector<float> out_contribution_approximate_hdv;
auto& out_contribution_approximate = out_contribution_approximate_hdv.HostVector();
cpu_predictor->PredictContribution(
dmat.get(), &out_contribution_approximate_hdv, model, 0, nullptr, true);
ASSERT_EQ(out_contribution_approximate.size(),
dmat->Info().num_row_ * (dmat->Info().num_col_ + 1));
for (size_t i = 0; i < out_contribution.size(); ++i) {
auto const& contri = out_contribution[i];
// shift 1 for bias, as test tree is a decision stump, only global bias is filled with LeafValue().
if ((i + 1) % (dmat->Info().num_col_ + 1) == 0) {
ASSERT_EQ(out_contribution.back(), 1.5f);
} else {
ASSERT_EQ(contri, 0);
}
}
TestBasic(dmat.get());
}
TEST(CpuPredictor, InplacePredict) {

View File

@@ -5,7 +5,7 @@ import pytest
import xgboost
from xgboost import testing as tm
from xgboost.testing.metrics import check_quantile_error
from xgboost.testing.metrics import check_precision_score, check_quantile_error
sys.path.append("tests/python")
import test_eval_metrics as test_em # noqa
@@ -59,6 +59,9 @@ class TestGPUEvalMetrics:
def test_pr_auc_ltr(self):
self.cpu_test.run_pr_auc_ltr("gpu_hist")
def test_precision_score(self):
check_precision_score("gpu_hist")
@pytest.mark.skipif(**tm.no_sklearn())
def test_quantile_error(self) -> None:
check_quantile_error("gpu_hist")

View File

@@ -3,7 +3,7 @@ import pytest
import xgboost as xgb
from xgboost import testing as tm
from xgboost.testing.metrics import check_quantile_error
from xgboost.testing.metrics import check_precision_score, check_quantile_error
rng = np.random.RandomState(1337)
@@ -315,6 +315,9 @@ class TestEvalMetrics:
def test_pr_auc_ltr(self):
self.run_pr_auc_ltr("hist")
def test_precision_score(self):
check_precision_score("hist")
@pytest.mark.skipif(**tm.no_sklearn())
def test_quantile_error(self) -> None:
check_quantile_error("hist")

View File

@@ -55,6 +55,38 @@ class TestQuantileDMatrix:
r = np.arange(1.0, n_samples)
np.testing.assert_allclose(Xy.get_data().toarray()[1:, 0], r)
def test_error(self):
from sklearn.model_selection import train_test_split
rng = np.random.default_rng(1994)
X, y = make_categorical(
n_samples=128, n_features=2, n_categories=3, onehot=False
)
reg = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)
w = rng.uniform(0, 1, size=y.shape[0])
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
X, y, w, random_state=1994
)
with pytest.raises(ValueError, match="sample weight"):
reg.fit(
X,
y,
sample_weight=w_train,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[w_test],
)
with pytest.raises(ValueError, match="sample weight"):
reg.fit(
X_train,
y_train,
sample_weight=w,
eval_set=[(X_test, y_test)],
sample_weight_eval_set=[w_test],
)
@pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.8, 0.9])
def test_with_iterator(self, sparsity: float) -> None:
n_samples_per_batch = 317