[breaking] Add prediction function for DMatrix and use inplace predict for dask. (#6668)

* Add a new API function for predicting on `DMatrix`.  This function aligns
with rest of the `XGBoosterPredictFrom*` functions on semantic of function
arguments.
* Purge `ntree_limit` from libxgboost, use iteration instead.
* [dask] Use `inplace_predict` by default for dask sklearn models.
* [dask] Run prediction shape inference on worker instead of client.

The breaking change is in the Python sklearn `apply` function, I made it to be
consistent with other prediction functions where `best_iteration` is used by
default.
This commit is contained in:
Jiaming Yuan
2021-02-08 18:26:32 +08:00
committed by GitHub
parent dbb5208a0a
commit 4656b09d5d
29 changed files with 1134 additions and 604 deletions

View File

@@ -51,6 +51,53 @@ TEST(GBTree, SelectTreeMethod) {
#endif // XGBOOST_USE_CUDA
}
// Verifies GBTree's prediction-cache versioning: PredictBatch(dmat, out, training,
// begin, end) with the range [0, 0) means "use all trees" and keeps the cache
// version in sync with the number of boosted rounds, while any restricted tree
// range invalidates the cache (version reset to 0).
TEST(GBTree, PredictionCache) {
size_t constexpr kRows = 100, kCols = 10;
GenericParameter generic_param;
generic_param.UpdateAllowUnknown(Args{});
LearnerModelParam mparam;
mparam.base_score = 0.5;
mparam.num_feature = kCols;
mparam.num_output_group = 1;
std::unique_ptr<GradientBooster> p_gbm {
GradientBooster::Create("gbtree", &generic_param, &mparam)};
auto& gbtree = dynamic_cast<gbm::GBTree&> (*p_gbm);
gbtree.Configure({{"tree_method", "hist"}});
auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
auto gpair = GenerateRandomGradients(kRows);
PredictionCacheEntry out_predictions;
// One boosting round followed by a full-range predict: cache version becomes 1.
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
ASSERT_EQ(1, out_predictions.version);
// Keep the 1-round predictions around for the final restart check below.
std::vector<float> first_iter = out_predictions.predictions.HostVector();
// Add 1 more boosted round
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
ASSERT_EQ(2, out_predictions.version);
// Update the cache for all rounds
// Forcing version back to 0 and predicting with all trees re-fills the cache
// up to the current number of rounds (2).
out_predictions.version = 0;
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0);
ASSERT_EQ(2, out_predictions.version);
gbtree.DoBoost(p_m.get(), &gpair, &out_predictions);
// drop the cache.
// Predicting a restricted tree range [1, 2) cannot use the cache: version 0.
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 1, 2);
ASSERT_EQ(0, out_predictions.version);
// half open set [1, 3)
// Any range not starting at tree 0 also bypasses the cache.
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 1, 3);
ASSERT_EQ(0, out_predictions.version);
// iteration end
// A range starting at 0 with an explicit end ([0, 2)) is cacheable again.
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 2);
ASSERT_EQ(2, out_predictions.version);
// restart the cache when end iteration is smaller than cache version
// [0, 1) rewinds the cache to version 1 and reproduces the 1-round output.
gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 1);
ASSERT_EQ(1, out_predictions.version);
ASSERT_EQ(out_predictions.predictions.HostVector(), first_iter);
}
TEST(GBTree, WrongUpdater) {
size_t constexpr kRows = 17;
size_t constexpr kCols = 15;

View File

@@ -32,7 +32,7 @@ TEST(CpuPredictor, Basic) {
// Test predict batch
PredictionCacheEntry out_predictions;
cpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
ASSERT_EQ(model.trees.size(), out_predictions.version);
std::vector<float>& out_predictions_h = out_predictions.predictions.HostVector();
for (size_t i = 0; i < out_predictions.predictions.Size(); i++) {
ASSERT_EQ(out_predictions_h[i], 1.5);
@@ -215,7 +215,7 @@ TEST(CpuPredictor, UpdatePredictionCache) {
PredictionCacheEntry out_predictions;
// perform fair prediction on the same input data, should be equal to cached result
gbm->PredictBatch(dmat.get(), &out_predictions, false, 0);
gbm->PredictBatch(dmat.get(), &out_predictions, false, 0, 0);
std::vector<float> &out_predictions_h = out_predictions.predictions.HostVector();
std::vector<float> &predtion_cache_from_train = predtion_cache.predictions.HostVector();

View File

@@ -45,7 +45,6 @@ TEST(GPUPredictor, Basic) {
PredictionCacheEntry cpu_out_predictions;
gpu_predictor->PredictBatch(dmat.get(), &gpu_out_predictions, model, 0);
ASSERT_EQ(model.trees.size(), gpu_out_predictions.version);
cpu_predictor->PredictBatch(dmat.get(), &cpu_out_predictions, model, 0);
std::vector<float>& gpu_out_predictions_h = gpu_out_predictions.predictions.HostVector();

View File

@@ -64,10 +64,10 @@ void TestTrainingPrediction(size_t rows, size_t bins,
}
HostDeviceVector<float> from_full;
learner->Predict(p_full, false, &from_full);
learner->Predict(p_full, false, &from_full, 0, 0);
HostDeviceVector<float> from_hist;
learner->Predict(p_hist, false, &from_hist);
learner->Predict(p_hist, false, &from_hist, 0, 0);
for (size_t i = 0; i < rows; ++i) {
EXPECT_NEAR(from_hist.ConstHostVector()[i],
@@ -157,20 +157,20 @@ void TestPredictionWithLesserFeatures(std::string predictor_name) {
learner->SaveConfig(&config);
ASSERT_EQ(get<String>(config["learner"]["gradient_booster"]["gbtree_train_param"]["predictor"]), predictor_name);
learner->Predict(m_test, false, &prediction);
learner->Predict(m_test, false, &prediction, 0, 0);
ASSERT_EQ(prediction.Size(), kRows);
auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);
ASSERT_THROW({learner->Predict(m_invalid, false, &prediction);}, dmlc::Error);
ASSERT_THROW({learner->Predict(m_invalid, false, &prediction, 0, 0);}, dmlc::Error);
#if defined(XGBOOST_USE_CUDA)
HostDeviceVector<float> from_cpu;
learner->SetParam("predictor", "cpu_predictor");
learner->Predict(m_test, false, &from_cpu);
learner->Predict(m_test, false, &from_cpu, 0, 0);
HostDeviceVector<float> from_cuda;
learner->SetParam("predictor", "gpu_predictor");
learner->Predict(m_test, false, &from_cuda);
learner->Predict(m_test, false, &from_cuda, 0, 0);
auto const& h_cpu = from_cpu.ConstHostVector();
auto const& h_gpu = from_cuda.ConstHostVector();

View File

@@ -221,9 +221,10 @@ TEST(Learner, MultiThreadedPredict) {
auto &entry = learner->GetThreadLocal().prediction_entry;
HostDeviceVector<float> predictions;
for (size_t iter = 0; iter < kIters; ++iter) {
learner->Predict(p_data, false, &entry.predictions);
learner->Predict(p_data, false, &predictions, 0, true); // leaf
learner->Predict(p_data, false, &predictions, 0, false, true); // contribs
learner->Predict(p_data, false, &entry.predictions, 0, 0);
learner->Predict(p_data, false, &predictions, 0, 0, false, true); // leaf
learner->Predict(p_data, false, &predictions, 0, 0, false, false, true); // contribs
}
});
}