From ace7fd328b805deae7e29e14b892515eab599547 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 26 Jul 2020 00:06:49 -0700 Subject: [PATCH] [R] Add a compatibility layer to load Booster object from an old RDS file (#5940) * [R] Add a compatibility layer to load Booster from an old RDS * Modify QuantileHistMaker::LoadConfig() to be backward compatible with 1.1.x * Add a big warning about compatibility in QuantileHistMaker::LoadConfig() * Add testing suite * Discourage use of saveRDS() in CRAN doc --- R-package/R/utils.R | 14 +++ R-package/R/xgb.unserialize.R | 21 ++++- .../man/a-compatibility-note-for-saveRDS.Rd | 15 +++ R-package/src/xgboost_R.cc | 6 +- R-package/tests/generate_models.R | 94 +++++++++++++++++++ R-package/tests/generate_models_params.R | 10 ++ R-package/tests/testthat.R | 2 +- R-package/tests/testthat/test_lint.R | 2 +- .../tests/testthat/test_model_compatibility.R | 77 +++++++++++++++ src/tree/updater_quantile_hist.h | 17 +++- 10 files changed, 251 insertions(+), 7 deletions(-) create mode 100644 R-package/man/a-compatibility-note-for-saveRDS.Rd create mode 100644 R-package/tests/generate_models.R create mode 100644 R-package/tests/generate_models_params.R create mode 100644 R-package/tests/testthat/test_model_compatibility.R diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 894e22031..820a28048 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -308,6 +308,20 @@ xgb.createFolds <- function(y, k = 10) #' @name xgboost-deprecated NULL +#' Do not use saveRDS() for long-term archival of models. Use xgb.save() instead. +#' +#' It is a common practice to use the built-in \code{saveRDS()} function to persist R objects to +#' the disk. While \code{xgb.Booster} objects can be persisted with \code{saveRDS()} as well, it +#' is not advisable to use it if the model is to be accessed in the future. 
If you train a model +#' with the current version of XGBoost and persist it with \code{saveRDS()}, the model is not +#' guaranteed to be accessible in later releases of XGBoost. To ensure that your model can be +#' accessed in future releases of XGBoost, use \code{xgb.save()} instead. For more details and +#' explanation, consult the page +#' \url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. +#' +#' @name a-compatibility-note-for-saveRDS +NULL + # Lookup table for the deprecated parameters bookkeeping depr_par_lut <- matrix(c( 'print.every.n', 'print_every_n', diff --git a/R-package/R/xgb.unserialize.R b/R-package/R/xgb.unserialize.R index 1a62e4cde..411225f89 100644 --- a/R-package/R/xgb.unserialize.R +++ b/R-package/R/xgb.unserialize.R @@ -6,7 +6,26 @@ xgb.unserialize <- function(buffer) { cachelist <- list() handle <- .Call(XGBoosterCreate_R, cachelist) - .Call(XGBoosterUnserializeFromBuffer_R, handle, buffer) + tryCatch( + .Call(XGBoosterUnserializeFromBuffer_R, handle, buffer), + error = function(e) { + error_msg <- conditionMessage(e) + m <- regexec("(src[\\\\/]learner.cc:[0-9]+): Check failed: (header == serialisation_header_)", + error_msg, perl = TRUE) + groups <- regmatches(error_msg, m)[[1]] + if (length(groups) == 3) { + warning(paste("The model had been generated by XGBoost version 1.0.0 or earlier and was ", + "loaded from a RDS file. We strongly ADVISE AGAINST using saveRDS() ", + "function, to ensure that your model can be read in current and upcoming ", + "XGBoost releases. Please use xgb.save() instead to preserve models for the ", + "long term. 
For more details and explanation, see ", + "https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html", + sep = "")) + .Call(XGBoosterLoadModelFromRaw_R, handle, buffer) + } else { + stop(e) + } + }) class(handle) <- "xgb.Booster.handle" return (handle) } diff --git a/R-package/man/a-compatibility-note-for-saveRDS.Rd b/R-package/man/a-compatibility-note-for-saveRDS.Rd new file mode 100644 index 000000000..2c5c4a1b4 --- /dev/null +++ b/R-package/man/a-compatibility-note-for-saveRDS.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{a-compatibility-note-for-saveRDS} +\alias{a-compatibility-note-for-saveRDS} +\title{Do not use saveRDS() for long-term archival of models. Use xgb.save() instead.} +\description{ +It is a common practice to use the built-in \code{saveRDS()} function to persist R objects to +the disk. While \code{xgb.Booster} objects can be persisted with \code{saveRDS()} as well, it +is not advisable to use it if the model is to be accessed in the future. If you train a model +with the current version of XGBoost and persist it with \code{saveRDS()}, the model is not +guaranteed to be accessible in later releases of XGBoost. To ensure that your model can be +accessed in future releases of XGBoost, use \code{xgb.save()} instead. For more details and +explanation, consult the page +\url{https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html}. 
+} diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index cb86ef469..2e315028f 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -375,7 +375,7 @@ SEXP XGBoosterSaveJsonConfig_R(SEXP handle) { SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value) { R_API_BEGIN(); - XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value))); + CHECK_CALL(XGBoosterLoadJsonConfig(R_ExternalPtrAddr(handle), CHAR(asChar(value)))); R_API_END(); return R_NilValue; } @@ -397,9 +397,9 @@ SEXP XGBoosterSerializeToBuffer_R(SEXP handle) { SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) { R_API_BEGIN(); - XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle), + CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle), RAW(raw), - length(raw)); + length(raw))); R_API_END(); return R_NilValue; } diff --git a/R-package/tests/generate_models.R b/R-package/tests/generate_models.R new file mode 100644 index 000000000..854b7944e --- /dev/null +++ b/R-package/tests/generate_models.R @@ -0,0 +1,94 @@ +# Script to generate reference models. The reference models are used to test backward compatibility +# of saved model files from XGBoost version 0.90 and 1.0.x. 
+library(xgboost) +library(Matrix) +source('./generate_models_params.R') + +set.seed(0) +metadata <- model_generator_metadata() +X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows, + ncol = metadata$kCols, sparse = TRUE) +w <- runif(metadata$kRows) + +version <- packageVersion('xgboost') +target_dir <- 'models' + +save_booster <- function (booster, model_name) { + booster_bin <- function (model_name) { + return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = ''))) + } + booster_json <- function (model_name) { + return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = ''))) + } + booster_rds <- function (model_name) { + return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = ''))) + } + xgb.save(booster, booster_bin(model_name)) + saveRDS(booster, booster_rds(model_name)) + if (version >= '1.0.0') { + xgb.save(booster, booster_json(model_name)) + } +} + +generate_regression_model <- function () { + print('Regression') + y <- rnorm(metadata$kRows) + + data <- xgb.DMatrix(X, label = y) + params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests, + max_depth = metadata$kMaxDepth) + booster <- xgb.train(params, data, nrounds = metadata$kRounds) + save_booster(booster, 'reg') +} + +generate_logistic_model <- function () { + print('Binary classification with logistic loss') + y <- sample(0:1, size = metadata$kRows, replace = TRUE) + stopifnot(max(y) == 1, min(y) == 0) + + data <- xgb.DMatrix(X, label = y, weight = w) + params <- list(tree_method = 'hist', num_parallel_tree = metadata$kForests, + max_depth = metadata$kMaxDepth, objective = 'binary:logistic') + booster <- xgb.train(params, data, nrounds = metadata$kRounds) + save_booster(booster, 'logit') +} + +generate_classification_model <- function () { + print('Multi-class classification') + y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE) + 
stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0) + + data <- xgb.DMatrix(X, label = y, weight = w) + params <- list(num_class = metadata$kClasses, tree_method = 'hist', + num_parallel_tree = metadata$kForests, max_depth = metadata$kMaxDepth, + objective = 'multi:softmax') + booster <- xgb.train(params, data, nrounds = metadata$kRounds) + save_booster(booster, 'cls') +} + +generate_ranking_model <- function () { + print('Learning to rank') + y <- sample(0:4, size = metadata$kRows, replace = TRUE) + stopifnot(max(y) == 4, min(y) == 0) + kGroups <- 20 + w <- runif(kGroups) + g <- rep(50, times = kGroups) + + data <- xgb.DMatrix(X, label = y, group = g) + # setinfo(data, 'weight', w) + # ^^^ does not work in version <= 1.1.0; see https://github.com/dmlc/xgboost/issues/5942 + # So call low-level function XGDMatrixSetInfo_R directly. Since this function is not an exported + # symbol, use the triple-colon operator. + .Call(xgboost:::XGDMatrixSetInfo_R, data, 'weight', as.numeric(w)) + params <- list(objective = 'rank:ndcg', num_parallel_tree = metadata$kForests, + tree_method = 'hist', max_depth = metadata$kMaxDepth) + booster <- xgb.train(params, data, nrounds = metadata$kRounds) + save_booster(booster, 'ltr') +} + +dir.create(target_dir) + +invisible(generate_regression_model()) +invisible(generate_logistic_model()) +invisible(generate_classification_model()) +invisible(generate_ranking_model()) diff --git a/R-package/tests/generate_models_params.R b/R-package/tests/generate_models_params.R new file mode 100644 index 000000000..0f71ab22c --- /dev/null +++ b/R-package/tests/generate_models_params.R @@ -0,0 +1,10 @@ +model_generator_metadata <- function() { + return (list( + kRounds = 2, + kRows = 1000, + kCols = 4, + kForests = 2, + kMaxDepth = 2, + kClasses = 3 + )) +} diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R index 53cc6caba..3bb229e70 100644 --- a/R-package/tests/testthat.R +++ b/R-package/tests/testthat.R @@ -1,4 +1,4 @@ 
library(testthat) library(xgboost) -test_check("xgboost") +test_check("xgboost", reporter = ProgressReporter) diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R index 5d32c85d6..77fd264ef 100644 --- a/R-package/tests/testthat/test_lint.R +++ b/R-package/tests/testthat/test_lint.R @@ -2,7 +2,7 @@ context("Code is of high quality and lint free") test_that("Code Lint", { skip_on_cran() my_linters <- list( - absolute_paths_linter = lintr::absolute_paths_linter, + absolute_path_linter = lintr::absolute_path_linter, assignment_linter = lintr::assignment_linter, closed_curly_linter = lintr::closed_curly_linter, commas_linter = lintr::commas_linter, diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R new file mode 100644 index 000000000..d56207e3b --- /dev/null +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -0,0 +1,77 @@ +require(xgboost) +require(jsonlite) +source('../generate_models_params.R') + +context("Models from previous versions of XGBoost can be loaded") + +metadata <- model_generator_metadata() + +run_model_param_check <- function (config) { + expect_equal(config$learner$learner_model_param$num_feature, '4') + expect_equal(config$learner$learner_train_param$booster, 'gbtree') +} + +get_num_tree <- function (booster) { + dump <- xgb.dump(booster) + m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE) + m <- regmatches(dump, m) + num_tree <- Reduce('+', lapply(m, length)) + return (num_tree) +} + +run_booster_check <- function (booster, name) { + # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config(). 
+ if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) { + booster <- xgb.Booster.complete(booster) + } + config <- jsonlite::fromJSON(xgb.config(booster)) + run_model_param_check(config) + if (name == 'cls') { + expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds * metadata$kClasses) + expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) + expect_equal(config$learner$learner_train_param$objective, 'multi:softmax') + expect_equal(as.numeric(config$learner$learner_model_param$num_class), metadata$kClasses) + } else if (name == 'logit') { + expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + expect_equal(as.numeric(config$learner$learner_model_param$num_class), 0) + expect_equal(config$learner$learner_train_param$objective, 'binary:logistic') + } else if (name == 'ltr') { + expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + expect_equal(config$learner$learner_train_param$objective, 'rank:ndcg') + } else { + expect_equal(name, 'reg') + expect_equal(get_num_tree(booster), metadata$kForests * metadata$kRounds) + expect_equal(as.numeric(config$learner$learner_model_param$base_score), 0.5) + expect_equal(config$learner$learner_train_param$objective, 'reg:squarederror') + } +} + +test_that("Models from previous versions of XGBoost can be loaded", { + bucket <- 'xgboost-ci-jenkins-artifacts' + region <- 'us-west-2' + file_name <- 'xgboost_r_model_compatibility_test.zip' + zipfile <- file.path(getwd(), file_name) + model_dir <- file.path(getwd(), 'models') + download.file(paste('https://', bucket, '.s3-', region, '.amazonaws.com/', file_name, sep = ''), + destfile = zipfile, mode = 'wb') + unzip(zipfile, overwrite = TRUE) + + pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4)) + + lapply(list.files(model_dir), function (x) { + model_file <- file.path(model_dir, x) + m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl 
= TRUE)
+    m <- regmatches(model_file, m)[[1]]
+    model_xgb_ver <- m[2]
+    name <- m[3]
+
+    if (endsWith(model_file, '.rds')) {
+      booster <- readRDS(model_file)
+    } else {
+      booster <- xgb.load(model_file)
+    }
+    predict(booster, newdata = pred_data)
+    run_booster_check(booster, name)
+  })
+  expect_true(TRUE)
+})
diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h
index d74c16f72..a32c6617d 100644
--- a/src/tree/updater_quantile_hist.h
+++ b/src/tree/updater_quantile_hist.h
@@ -125,7 +125,22 @@ class QuantileHistMaker: public TreeUpdater {
   void LoadConfig(Json const& in) override {
     auto const& config = get<Object const>(in);
     FromJson(config.at("train_param"), &this->param_);
-    FromJson(config.at("cpu_hist_train_param"), &this->hist_maker_param_);
+    try {
+      FromJson(config.at("cpu_hist_train_param"), &this->hist_maker_param_);
+    } catch (std::out_of_range& e) {
+      // XGBoost model is from 1.1.x, so 'cpu_hist_train_param' is missing.
+      // We add this compatibility check because it's just recently that we (developers) began to
+      // persuade R users away from using saveRDS() for model serialization. Hopefully, one day,
+      // everyone will be using xgb.save().
+      LOG(WARNING) << "Attempted to load internal configuration for a model file that was generated "
+        << "by a previous version of XGBoost. A likely cause for this warning is that the model "
+        << "was saved with saveRDS() in R or pickle.dump() in Python. We strongly ADVISE AGAINST "
+        << "using saveRDS() or pickle.dump() so that the model remains accessible in current and "
+        << "upcoming XGBoost releases. Please use xgb.save() instead to preserve models for the "
+        << "long term. For more details and explanation, see "
+        << "https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html";
+      this->hist_maker_param_.UpdateAllowUnknown(Args{});
+    }
   }
   void SaveConfig(Json* p_out) const override {
     auto& out = *p_out;