[CORE] The update process for a tree model, and its application to feature importance (#1670)

* [CORE] allow updating trees in an existing model
* [CORE] in refresh updater, allow keeping old leaf values and updating stats only
* [R-package] xgb.train modification to allow updating trees in an existing model
* [R-package] added check for nrounds when is_update
* [CORE] merge parameter declaration changes; unify their code style
* [CORE] move the update-process trees initialization to Configure; rename default process_type to 'default'; fix the trees and trees_to_update sizes comparison check
* [R-package] unit tests for the update process type
* [DOC] documentation for process_type parameter; improved docs for updater, Gamma and Tweedie; added some parameter aliases; fixed metrics indentation and documented some previously undocumented metrics
* fix my sloppy merge conflict resolutions
* [CORE] add a TreeProcessType enum
* whitespace fix
2016-12-04 11:33:52 -06:00
parent 4398fbbe4a
commit a44032d095
6 changed files with 221 additions and 60 deletions
--- a/R-package/R/xgb.train.R
+++ b/R-package/R/xgb.train.R
@@ -284,7 +284,9 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  # Sort the callbacks into categories
  cb <- categorize.callbacks(callbacks)

-  
+  # The tree updating process would need slightly different handling
+  is_update <- NVL(params[['process_type']], '.') == 'update'
+
  # Construct a booster (either a new one or load from xgb_model)
  handle <- xgb.Booster(params, append(watchlist, dtrain), xgb_model)
  bst <- xgb.handleToBooster(handle)
@@ -294,17 +296,20 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
  num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1)

  # When the 'xgb_model' was set, find out how many boosting iterations it has
-  niter_skip <- 0
+  niter_init <- 0
  if (!is.null(xgb_model)) {
-    niter_skip <- as.numeric(xgb.attr(bst, 'niter')) + 1
-    if (length(niter_skip) == 0) {
-      niter_skip <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
+    niter_init <- as.numeric(xgb.attr(bst, 'niter')) + 1
+    if (length(niter_init) == 0) {
+      niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class)
    }
  }
+  if(is_update && nrounds > niter_init)
+    stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)")

  # TODO: distributed code
  rank <- 0
  
+  niter_skip <- ifelse(is_update, 0, niter_init)
  begin_iteration <- niter_skip + 1
  end_iteration <- niter_skip + nrounds
  
@@ -337,6 +342,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(),
      nrow(evaluation_log) > 0) {
    # include the previous compatible history when available
    if (class(xgb_model) == 'xgb.Booster' &&
+        !is_update &&
        !is.null(xgb_model$evaluation_log) &&
        all.equal(colnames(evaluation_log),
                  colnames(xgb_model$evaluation_log))) {
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -0,0 +1,76 @@
+require(xgboost)
+
+context("update trees in an existing model")
+
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+test_that("updating the model works", {
+  watchlist = list(train = dtrain, test = dtest)
+  cb = list(cb.evaluation.log()) # to run silent, but store eval. log
+  
+  # no-subsampling
+  p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
+  set.seed(11)
+  bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+  tr1 <- xgb.model.dt.tree(model = bst1)
+  
+  # with subsampling
+  p2 <- modifyList(p1, list(subsample = 0.1))
+  set.seed(11)
+  bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+  tr2 <- xgb.model.dt.tree(model = bst2)
+  
+  # the same no-subsampling boosting with an extra 'refresh' updater:
+  p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
+  set.seed(11)
+  bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+  tr1r <- xgb.model.dt.tree(model = bst1r)
+  # all should be the same when no subsampling
+  expect_equal(bst1$evaluation_log, bst1r$evaluation_log)
+  expect_equal(tr1, tr1r, tolerance = 0.00001, check.attributes = FALSE)
+
+  # the same boosting with subsampling with an extra 'refresh' updater:
+  p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE))
+  set.seed(11)
+  bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb)
+  tr2r <- xgb.model.dt.tree(model = bst2r)
+  # should be the same evaluation but different gains and larger cover
+  expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
+  expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2r[Feature != 'Leaf']$Quality)), 100)
+  expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)
+
+  # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
+  p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE))
+  bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1)
+  tr1u <- xgb.model.dt.tree(model = bst1u)
+  # all should be the same when no subsampling
+  expect_equal(bst1$evaluation_log, bst1u$evaluation_log)
+  expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE)
+  
+  # process type 'update' for model with subsampling, refreshing only the tree stats from training data:
+  p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
+  bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst2)
+  tr2u <- xgb.model.dt.tree(model = bst2u)
+  # should be the same evaluation but different gains and larger cover
+  expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
+  expect_equal(tr2[Feature == 'Leaf']$Quality, tr2u[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2u[Feature != 'Leaf']$Quality)), 100)
+  expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
+  # the results should be the same as for the model with an extra 'refresh' updater
+  expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
+  expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE)
+  
+  # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data:
+  p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE))
+  bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, callbacks = cb, xgb_model = bst1)
+  tr1ut <- xgb.model.dt.tree(model = bst1ut)
+  # should be the same evaluations but different gains and smaller cover (test data is smaller)
+  expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
+  expect_equal(tr1[Feature == 'Leaf']$Quality, tr1ut[Feature == 'Leaf']$Quality)
+  expect_gt(sum(abs(tr1[Feature != 'Leaf']$Quality - tr1ut[Feature != 'Leaf']$Quality)), 100)
+  expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5)
+})