From b21e658a0251d692d4e6ff3b614893c54e7d0c7c Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 11 Dec 2016 12:48:39 -0600 Subject: [PATCH] [R-package] JSON dump format and a couple of bugfixes (#1855) * [R-package] JSON tree dump interface * [R-package] precision bugfix in xgb.attributes * [R-package] bugfix for cb.early.stop called from xgb.cv * [R-package] a bit more clarity on labels checking in xgb.cv * [R-package] test JSON dump for gblinear as well * whitespace lint --- R-package/R/callbacks.R | 2 +- R-package/R/xgb.Booster.R | 2 +- R-package/R/xgb.cv.R | 11 +++++---- R-package/R/xgb.dump.R | 17 ++++++++++---- R-package/man/xgb.cv.Rd | 4 ++-- R-package/man/xgb.dump.Rd | 11 +++++++-- R-package/src/xgboost_R.cc | 27 ++++++++++++++++++----- R-package/src/xgboost_R.h | 3 ++- R-package/tests/testthat/test_callbacks.R | 5 +++++ R-package/tests/testthat/test_helpers.R | 12 ++++++++++ 10 files changed, 72 insertions(+), 22 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 4d7916d38..f35565273 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -229,7 +229,7 @@ cb.reset.parameters <- function(new_params) { xgb.parameters(env$bst$handle) <- pars } else { for (fd in env$bst_folds) - xgb.parameters(fd$bst$handle) <- pars + xgb.parameters(fd$bst) <- pars } } attr(callback, 'is_pre_iteration') <- TRUE diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 587fc1b90..362165c6a 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -339,7 +339,7 @@ xgb.attributes <- function(object) { # Q: should we warn a user about non-scalar elements? a <- lapply(a, function(x) { if (is.null(x)) return(NULL) - if (is.numeric(value[1])) { + if (is.numeric(x[1])) { format(x[1], digits = 17) } else { as.character(x[1]) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index d9b4e429b..8325f5976 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -16,10 +16,10 @@ #' #' See \code{\link{xgb.train}} for further details. #' See also demo/ for walkthrough example in R. -#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input. +#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input. #' @param nrounds the max number of iterations #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. -#' @param label vector of response values. Should be provided only when data is \code{DMatrix}. +#' @param label vector of response values. Should be provided only when data is an R-matrix. #' @param missing is only used when input is a dense matrix. By default is set to NA, which means #' that NA values should be considered as 'missing' by the algorithm. #' Sometimes, 0 or other extreme value might be used to represent missing values. @@ -129,10 +129,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = #if (is.null(params[['eval_metric']]) && is.null(feval)) # stop("Either 'eval_metric' or 'feval' must be provided for CV") - # Labels - if (class(data) == 'xgb.DMatrix') - labels <- getinfo(data, 'label') - if (is.null(labels)) + # Check the labels + if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) || + (class(data) != 'xgb.DMatrix' && is.null(label))) stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") # CV folds diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 48f4bdf5b..9bd9a2135 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -14,6 +14,7 @@ #' When this option is on, the model dump comes with two additional statistics: #' gain is the approximate loss function gain we get in each split; #' cover is the sum of second order gradient in each node. +#' @param dump_fomat either 'text' or 'json' format could be specified. #' @param ... currently not used #' #' @return @@ -30,10 +31,15 @@ #' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE) #' #' # print the model without saving it to a file -#' print(xgb.dump(bst)) +#' print(xgb.dump(bst, with_stats = TRUE)) +#' +#' # print in JSON format: +#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) +#' #' @export -xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) { +xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) { check.deprecation(...) + dump_format <- match.arg(dump_format) if (class(model) != "xgb.Booster") stop("model: argument must be of type xgb.Booster") if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) @@ -42,12 +48,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, .. stop("fmap: argument must be of type character (when provided)") model <- xgb.Booster.check(model) - model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost") + model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), + as.character(dump_format), PACKAGE = "xgboost") if (is.null(fname)) model_dump <- stri_replace_all_regex(model_dump, '\t', '') - model_dump <- unlist(stri_split_regex(model_dump, '\n')) + if (dump_format == "text") + model_dump <- unlist(stri_split_regex(model_dump, '\n')) + model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE) if (is.null(fname)) { diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index bb93f40f9..e027df4a7 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA, See \code{\link{xgb.train}} for further details. See also demo/ for walkthrough example in R.} -\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.} +\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.} \item{nrounds}{the max number of iterations} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} -\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.} +\item{label}{vector of response values. Should be provided only when data is an R-matrix.} \item{missing}{is only used when input is a dense matrix. By default is set to NA, which means that NA values should be considered as 'missing' by the algorithm. diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index efbf8b629..206e32022 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -4,7 +4,8 @@ \alias{xgb.dump} \title{Save xgboost model to text file} \usage{ -xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...) +xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, + dump_format = c("text", "json"), ...) } \arguments{ \item{model}{the model object.} @@ -24,6 +25,8 @@ gain is the approximate loss function gain we get in each split; cover is the sum of second order gradient in each node.} \item{...}{currently not used} + +\item{dump_fomat}{either 'text' or 'json' format could be specified.} } \value{ if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. @@ -42,6 +45,10 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2, xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE) # print the model without saving it to a file -print(xgb.dump(bst)) +print(xgb.dump(bst, with_stats = TRUE)) + +# print in JSON format: +cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) + } diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 7d6ec26ee..3dfafdd00 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -350,20 +350,37 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) { return ret; } -SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { +SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) { SEXP out; R_API_BEGIN(); bst_ulong olen; const char **res; - CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle), + const char *fmt = CHAR(asChar(dump_format)); + CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle), CHAR(asChar(fmap)), asInteger(with_stats), + fmt, &olen, &res)); out = PROTECT(allocVector(STRSXP, olen)); - for (size_t i = 0; i < olen; ++i) { + if (!strcmp("json", fmt)) { std::stringstream stream; - stream << "booster[" << i <<"]\n" << res[i]; - SET_STRING_ELT(out, i, mkChar(stream.str().c_str())); + stream << "[\n"; + for (size_t i = 0; i < olen; ++i) { + stream << res[i]; + if (i < olen - 1) { + stream << ",\n"; + } else { + stream << "\n"; + } + } + stream << "]"; + SET_STRING_ELT(out, 0, mkChar(stream.str().c_str())); + } else { + for (size_t i = 0; i < olen; ++i) { + std::stringstream stream; + stream << "booster[" << i <<"]\n" << res[i]; + SET_STRING_ELT(out, i, mkChar(stream.str().c_str())); + } } R_API_END(); UNPROTECT(1); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 24c9b78f1..a1e9604a1 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -185,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle); * \param handle handle * \param fmap name to fmap can be empty string * \param with_stats whether dump statistics of splits + * \param dump_format the format to dump the model in */ -XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats); +XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format); /*! * \brief get learner attribute value diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index 26253bf62..a95d10797 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -147,6 +147,11 @@ test_that("cb.reset.parameters works as expected", { bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, callbacks = list(cb.reset.parameters(my_par))) , NA) # NA = no error + # CV works as well + expect_error( + bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, + callbacks = list(cb.reset.parameters(my_par))) + , NA) # NA = no error # expect no learning with 0 learning rate my_par <- list(eta = c(0., 0.)) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 1d3ef3437..cd25c1dbb 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -27,6 +27,11 @@ test_that("xgb.dump works", { expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T)) expect_true(file.exists('xgb.model.dump')) expect_gt(file.size('xgb.model.dump'), 8000) + + # JSON format + dmp <- xgb.dump(bst.Tree, dump_format = "json") + expect_length(dmp, 1) + expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162) }) test_that("xgb.dump works for gblinear", { @@ -38,6 +43,11 @@ test_that("xgb.dump works for gblinear", { d.sp <- xgb.dump(bst.GLM.sp) expect_length(d.sp, 14) expect_gt(sum(d.sp == "0"), 0) + + # JSON format + dmp <- xgb.dump(bst.GLM.sp, dump_format = "json") + expect_length(dmp, 1) + expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11) }) test_that("xgb-attribute functionality", { @@ -83,6 +93,8 @@ test_that("xgb-attribute numeric precision", { for (x in X) { xgb.attr(bst.Tree, "x") <- x expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x) + xgb.attributes(bst.Tree) <- list(a = "A", b = x) + expect_identical(as.numeric(xgb.attr(bst.Tree, "b")), x) } })