Serialise booster after training to reset state (#5484)
* Serialise booster after training to reset state
* Prevent process_type being set on load
* Check for correct updater sequence
parent 4a0c8ef237
commit 093e2227e3
@@ -109,7 +109,9 @@ def _train_internal(params, dtrain,
     else:
         bst.best_iteration = nboost - 1
     bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
-    return bst
+    # Copy to serialise and unserialise booster to reset state and free
+    # training memory
+    return bst.copy()


 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
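For context on the Python hunk above, a minimal sketch (not part of the diff) of what the copy does: Booster.copy() round-trips the model through its raw serialised form, dropping training-time state while preserving the learned trees. The equivalence below assumes the Python API of this era, where save_raw() returns a buffer that Booster(model_file=...) accepts.

import numpy as np
import xgboost as xgb

X = np.random.rand(64, 4)
y = np.random.randint(2, size=64)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=4)

# bst.copy() is effectively this serialise/unserialise round trip:
restored = xgb.Booster(model_file=bst.save_raw())
np.testing.assert_allclose(bst.predict(dtrain), restored.predict(dtrain))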
@@ -267,6 +267,11 @@ void GBTree::BoostNewTrees(HostDeviceVector<GradientPair>* gpair,
   // create the trees
   for (int i = 0; i < tparam_.num_parallel_tree; ++i) {
     if (tparam_.process_type == TreeProcessType::kDefault) {
+      CHECK(!updaters_.front()->CanModifyTree())
+          << "Updater: `" << updaters_.front()->Name() << "` "
+          << "can not be used to create new trees. "
+          << "Set `process_type` to `update` if you want to update existing "
+             "trees.";
       // create new tree
       std::unique_ptr<RegTree> ptr(new RegTree());
       ptr->param.UpdateAllowUnknown(this->cfg_);
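Seen from the Python side, the new CHECK should surface roughly as below. This is a hedged sketch: using the generic updater parameter with "prune" (rather than the spelling in the C++ test further down) is my assumption about an equivalent trigger, and the quoted error text comes from the CHECK above.

import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(32, 4),
                     label=np.random.randint(2, size=32))
try:
    # "prune" can only modify existing trees, and process_type defaults
    # to "default", so creating new trees is refused.
    xgb.train({"updater": "prune"}, dtrain, num_boost_round=1)
except xgb.core.XGBoostError as err:
    print(err)  # ... can not be used to create new trees ...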
@@ -319,6 +324,10 @@ void GBTree::CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& ne
 void GBTree::LoadConfig(Json const& in) {
   CHECK_EQ(get<String>(in["name"]), "gbtree");
   FromJson(in["gbtree_train_param"], &tparam_);
+  // Process type cannot be kUpdate from loaded model
+  // This would cause all trees to be pushed to trees_to_update
+  // e.g. updating a model, then saving and loading it would result in an empty model
+  tparam_.process_type = TreeProcessType::kDefault;
   int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
   if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
     LOG(WARNING)
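The scenario this reset guards against, sketched in Python with hypothetical file names (none of this code is in the commit): update an existing model in place, persist it, then load it back. Per the comment above, the loaded booster would previously re-enter kUpdate mode and push every tree into trees_to_update, leaving an effectively empty model.

# "base.model" and "updated.model" are made-up names for illustration.
bst = xgb.train({"process_type": "update", "updater": "refresh"},
                dtrain, num_boost_round=4, xgb_model="base.model")
bst.save_model("updated.model")
loaded = xgb.Booster(model_file="updated.model")
# After this patch the loaded configuration starts from "default" again.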
@@ -348,6 +357,13 @@ void GBTree::SaveConfig(Json* p_out) const {
   auto& out = *p_out;
   out["name"] = String("gbtree");
   out["gbtree_train_param"] = ToJson(tparam_);
+
+  // Process type cannot be kUpdate from loaded model
+  // This would cause all trees to be pushed to trees_to_update
+  // e.g. updating a model, then saving and loading it would result in an empty
+  // model
+  out["gbtree_train_param"]["process_type"] = String("default");
+
   out["updater"] = Object();

   auto& j_updaters = out["updater"];
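One way to observe the effect of both guards (a sketch, assuming the config layout emitted by Booster.save_config()): the saved configuration should now always report process_type as "default", regardless of what the booster was trained with.

import json

conf = json.loads(bst.save_config())
gbtree_param = conf["learner"]["gradient_booster"]["gbtree_train_param"]
print(gbtree_param["process_type"])  # "default"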
@@ -63,6 +63,10 @@ TEST(GBTree, WrongUpdater) {
   // Hist can not be used for updating tree.
   learner->SetParams(Args{{"tree_method", "hist"}, {"process_type", "update"}});
   ASSERT_THROW(learner->UpdateOneIter(0, p_dmat), dmlc::Error);
+  // Prune can not be used for learning new tree.
+  learner->SetParams(
+      Args{{"tree_method", "prune"}, {"process_type", "default"}});
+  ASSERT_THROW(learner->UpdateOneIter(0, p_dmat), dmlc::Error);
 }

 #ifdef XGBOOST_USE_CUDA
@@ -20,9 +20,8 @@ num_round = 20
 bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=2)

-# Save the model, only ask process 0 to save the model.
-if xgb.rabit.get_rank() == 0:
-    bst.save_model("test.model")
-    xgb.rabit.tracker_print("Finished training\n")
+bst.save_model("test.model{}".format(xgb.rabit.get_rank()))
+xgb.rabit.tracker_print("Finished training\n")

 # Notify the tracker all training has been successful
 # This is only needed in distributed training.
@@ -70,9 +70,8 @@ watchlist = [(dtrain,'train')]
 num_round = 2
 bst = xgb.train(param, dtrain, num_round, watchlist)

-if xgb.rabit.get_rank() == 0:
-    bst.save_model("test_issue3402.model")
-    xgb.rabit.tracker_print("Finished training\n")
+bst.save_model("test_issue3402.model{}".format(xgb.rabit.get_rank()))
+xgb.rabit.tracker_print("Finished training\n")

 # Notify the tracker all training has been successful
 # This is only needed in distributed training.
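On the two distributed-test hunks above: every rank now writes its own rank-suffixed model file instead of only rank 0. A plausible reading, which is an assumption rather than something stated in the diff, is that once the booster is serialised after training, all workers should hold identical models, and per-rank files let a harness check that:

# Hypothetical post-run check; this comparison is not part of the commit.
import glob

blobs = [open(path, "rb").read() for path in sorted(glob.glob("test.model*"))]
assert all(blob == blobs[0] for blob in blobs), "worker models diverged"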