[breaking] Add prediction function for DMatrix and use inplace predict for dask. (#6668)

* Add a new API function for predicting on `DMatrix`.  This function aligns
with the rest of the `XGBoosterPredictFrom*` functions in the semantics of its
arguments.
* Purge `ntree_limit` from libxgboost, use iteration instead.
* [dask] Use `inplace_predict` by default for dask sklearn models.
* [dask] Run prediction shape inference on worker instead of client.

The breaking change is in the Python sklearn `apply` function: it is now
consistent with the other prediction functions, where `best_iteration` is used
by default.
This commit is contained in:
Jiaming Yuan
2021-02-08 18:26:32 +08:00
committed by GitHub
parent dbb5208a0a
commit 4656b09d5d
29 changed files with 1134 additions and 604 deletions

View File

@@ -47,6 +47,12 @@ struct GBLinearTrainParam : public XGBoostParameter<GBLinearTrainParam> {
.describe("Maximum rows per batch.");
}
};
/*!
 * \brief Validate the layer range passed to a linear booster prediction call.
 *
 * Both bounds are required to be 0; each one is verified with CHECK_EQ, the
 * begin bound first, and a failed check carries the message below.
 *
 * \param layer_begin Beginning of the requested layer range, expected to be 0.
 * \param layer_end   End of the requested layer range, expected to be 0.
 */
void LinearCheckLayer(unsigned layer_begin, unsigned layer_end) {
  // Single copy of the diagnostic shared by both checks.
  char const* const no_range_msg =
      "Linear booster does not support prediction range.";
  CHECK_EQ(layer_begin, 0) << no_range_msg;
  CHECK_EQ(layer_end, 0) << no_range_msg;
}
/*!
* \brief gradient boosted linear model
*/
@@ -130,20 +136,19 @@ class GBLinear : public GradientBooster {
monitor_.Stop("DoBoost");
}
void PredictBatch(DMatrix *p_fmat,
PredictionCacheEntry *predts,
bool, unsigned ntree_limit) override {
void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *predts,
bool training, unsigned layer_begin, unsigned layer_end) override {
monitor_.Start("PredictBatch");
LinearCheckLayer(layer_begin, layer_end);
auto* out_preds = &predts->predictions;
CHECK_EQ(ntree_limit, 0U)
<< "GBLinear::Predict ntrees is only valid for gbtree predictor";
this->PredictBatchInternal(p_fmat, &out_preds->HostVector());
monitor_.Stop("PredictBatch");
}
// add base margin
void PredictInstance(const SparsePage::Inst &inst,
std::vector<bst_float> *out_preds,
unsigned) override {
unsigned layer_begin, unsigned layer_end) override {
LinearCheckLayer(layer_begin, layer_end);
const int ngroup = model_.learner_model_param->num_output_group;
for (int gid = 0; gid < ngroup; ++gid) {
this->Pred(inst, dmlc::BeginPtr(*out_preds), gid,
@@ -151,16 +156,15 @@ class GBLinear : public GradientBooster {
}
}
void PredictLeaf(DMatrix *, HostDeviceVector<bst_float> *, unsigned) override {
void PredictLeaf(DMatrix *, HostDeviceVector<bst_float> *, unsigned, unsigned) override {
LOG(FATAL) << "gblinear does not support prediction of leaf index";
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned ntree_limit, bool, int, unsigned) override {
unsigned layer_begin, unsigned layer_end, bool, int, unsigned) override {
model_.LazyInitModel();
CHECK_EQ(ntree_limit, 0U)
<< "GBLinear::PredictContribution: ntrees is only valid for gbtree predictor";
LinearCheckLayer(layer_begin, layer_end);
const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector();
const int ngroup = model_.learner_model_param->num_output_group;
const size_t ncolumns = model_.learner_model_param->num_feature + 1;
@@ -197,7 +201,8 @@ class GBLinear : public GradientBooster {
void PredictInteractionContributions(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned, bool) override {
unsigned layer_begin, unsigned layer_end, bool) override {
LinearCheckLayer(layer_begin, layer_end);
std::vector<bst_float>& contribs = out_contribs->HostVector();
// linear models have no interaction effects

View File

@@ -414,7 +414,7 @@ void GBTree::Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
auto layer_trees = this->LayerTrees();
layer_end = layer_end == 0 ? model_.trees.size() / layer_trees : layer_end;
CHECK_GE(layer_end, layer_begin);
CHECK_GT(layer_end, layer_begin);
CHECK_GE(step, 1);
int32_t n_layers = (layer_end - layer_begin) / step;
std::vector<std::unique_ptr<RegTree>> &out_trees = out_model.trees;
@@ -438,10 +438,35 @@ void GBTree::Slice(int32_t layer_begin, int32_t layer_end, int32_t step,
void GBTree::PredictBatch(DMatrix* p_fmat,
PredictionCacheEntry* out_preds,
bool,
unsigned ntree_limit) {
unsigned layer_begin,
unsigned layer_end) {
CHECK(configured_);
if (layer_end == 0) {
layer_end = this->BoostedRounds();
}
if (layer_begin != 0 || layer_end < out_preds->version) {
// cache is dropped.
out_preds->version = 0;
}
bool reset = false;
if (layer_begin == 0) {
layer_begin = out_preds->version;
} else {
// When begin layer is not 0, the cache is not useful.
reset = true;
}
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) =
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
GetPredictor(&out_preds->predictions, p_fmat)
->PredictBatch(p_fmat, out_preds, model_, 0, ntree_limit);
->PredictBatch(p_fmat, out_preds, model_, tree_begin, tree_end);
if (reset) {
out_preds->version = 0;
} else {
uint32_t delta = layer_end - out_preds->version;
out_preds->Update(delta);
}
}
std::unique_ptr<Predictor> const &
@@ -603,13 +628,14 @@ class Dart : public GBTree {
void PredictBatch(DMatrix* p_fmat,
PredictionCacheEntry* p_out_preds,
bool training,
unsigned ntree_limit) override {
unsigned layer_begin,
unsigned layer_end) override {
DropTrees(training);
int num_group = model_.learner_model_param->num_output_group;
ntree_limit *= num_group;
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
ntree_limit = static_cast<unsigned>(model_.trees.size());
}
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) =
detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
size_t n = num_group * p_fmat->Info().num_row_;
const auto &base_margin = p_fmat->Info().base_margin_.ConstHostVector();
auto& out_preds = p_out_preds->predictions.HostVector();
@@ -623,26 +649,24 @@ class Dart : public GBTree {
}
const int nthread = omp_get_max_threads();
InitThreadTemp(nthread);
PredLoopSpecalize(p_fmat, &out_preds, num_group, 0, ntree_limit);
PredLoopSpecalize(p_fmat, &out_preds, num_group, tree_begin, tree_end);
}
void PredictInstance(const SparsePage::Inst &inst,
std::vector<bst_float> *out_preds,
unsigned ntree_limit) override {
unsigned layer_begin, unsigned layer_end) override {
DropTrees(false);
if (thread_temp_.size() == 0) {
thread_temp_.resize(1, RegTree::FVec());
thread_temp_[0].Init(model_.learner_model_param->num_feature);
}
out_preds->resize(model_.learner_model_param->num_output_group);
ntree_limit *= model_.learner_model_param->num_output_group;
if (ntree_limit == 0 || ntree_limit > model_.trees.size()) {
ntree_limit = static_cast<unsigned>(model_.trees.size());
}
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
// loop over output groups
for (uint32_t gid = 0; gid < model_.learner_model_param->num_output_group; ++gid) {
(*out_preds)[gid] =
PredValue(inst, gid, &thread_temp_[0], 0, ntree_limit) +
PredValue(inst, gid, &thread_temp_[0], 0, tree_end) +
model_.learner_model_param->base_score;
}
}
@@ -653,22 +677,25 @@ class Dart : public GBTree {
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate, int,
unsigned layer_begin, unsigned layer_end, bool approximate, int,
unsigned) override {
CHECK(configured_);
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_,
ntree_limit, &weight_drop_, approximate);
tree_end, &weight_drop_, approximate);
}
void PredictInteractionContributions(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate) override {
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate) override {
CHECK(configured_);
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_,
ntree_limit, &weight_drop_, approximate);
tree_end, &weight_drop_, approximate);
}
protected:
inline void PredLoopSpecalize(
DMatrix* p_fmat,

View File

@@ -164,7 +164,9 @@ inline std::pair<uint32_t, uint32_t> LayerToTree(gbm::GBTreeModel const &model,
if (tree_end == 0) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
CHECK_LT(tree_begin, tree_end);
if (model.trees.size() != 0) {
CHECK_LE(tree_begin, tree_end);
}
return {tree_begin, tree_end};
}
@@ -260,10 +262,8 @@ class GBTree : public GradientBooster {
return model_.trees.size() / this->LayerTrees();
}
void PredictBatch(DMatrix* p_fmat,
PredictionCacheEntry* out_preds,
bool training,
unsigned ntree_limit) override;
void PredictBatch(DMatrix *p_fmat, PredictionCacheEntry *out_preds,
bool training, unsigned layer_begin, unsigned layer_end) override;
void InplacePredict(dmlc::any const &x, std::shared_ptr<DMatrix> p_m,
float missing, PredictionCacheEntry *out_preds,
@@ -297,33 +297,49 @@ class GBTree : public GradientBooster {
void PredictInstance(const SparsePage::Inst& inst,
std::vector<bst_float>* out_preds,
unsigned ntree_limit) override {
uint32_t layer_begin, uint32_t layer_end) override {
CHECK(configured_);
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
cpu_predictor_->PredictInstance(inst, out_preds, model_,
ntree_limit);
tree_end);
}
void PredictLeaf(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_preds,
unsigned ntree_limit) override {
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, ntree_limit);
uint32_t layer_begin, uint32_t layer_end) override {
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
"n_iteration), use model slicing instead.";
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate,
uint32_t layer_begin, uint32_t layer_end, bool approximate,
int, unsigned) override {
CHECK(configured_);
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictContribution(
p_fmat, out_contribs, model_, ntree_limit, nullptr, approximate);
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
}
void PredictInteractionContributions(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned ntree_limit, bool approximate) override {
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
CHECK(configured_);
this->GetPredictor()->PredictInteractionContributions(p_fmat, out_contribs, model_,
ntree_limit, nullptr, approximate);
uint32_t tree_begin, tree_end;
std::tie(tree_begin, tree_end) = detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictInteractionContributions(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
}
std::vector<std::string> DumpModel(const FeatureMap& fmap,