[R-package] JSON dump format and a couple of bugfixes (#1855)

* [R-package] JSON tree dump interface * [R-package] precision bugfix in xgb.attributes * [R-package] bugfix for cb.early.stop called from xgb.cv * [R-package] a bit more clarity on labels checking in xgb.cv * [R-package] test JSON dump for gblinear as well * whitespace lint
2016-12-11 12:48:39 -06:00
parent 0268dedeea
commit b21e658a02
10 changed files with 72 additions and 22 deletions
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -229,7 +229,7 @@ cb.reset.parameters <- function(new_params) {
      xgb.parameters(env$bst$handle) <- pars
    } else {
      for (fd in env$bst_folds)
-        xgb.parameters(fd$bst$handle) <- pars
+        xgb.parameters(fd$bst) <- pars
    }
  }
  attr(callback, 'is_pre_iteration') <- TRUE
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -339,7 +339,7 @@ xgb.attributes <- function(object) {
  # Q: should we warn a user about non-scalar elements?
  a <- lapply(a, function(x) {
    if (is.null(x)) return(NULL)
-    if (is.numeric(value[1])) {
+    if (is.numeric(x[1])) {
      format(x[1], digits = 17)
    } else {
      as.character(x[1])
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -16,10 +16,10 @@
 #'
 #'   See \code{\link{xgb.train}} for further details.
 #'   See also demo/ for walkthrough example in R.
-#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
+#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
 #' @param nrounds the max number of iterations
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label vector of response values. Should be provided only when data is \code{DMatrix}.
+#' @param label vector of response values. Should be provided only when data is an R-matrix.
 #' @param missing is only used when input is a dense matrix. By default is set to NA, which means 
 #'        that NA values should be considered as 'missing' by the algorithm. 
 #'        Sometimes, 0 or other extreme value might be used to represent missing values.
@@ -129,10 +129,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
  #if (is.null(params[['eval_metric']]) && is.null(feval))
  #  stop("Either 'eval_metric' or 'feval' must be provided for CV")
  
-  # Labels
-  if (class(data) == 'xgb.DMatrix')
-    labels <- getinfo(data, 'label')
-  if (is.null(labels))
+  # Check the labels
+  if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) ||
+       (class(data) != 'xgb.DMatrix' && is.null(label)))
    stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
  
  # CV folds
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -14,6 +14,7 @@
 #'        When this option is on, the model dump comes with two additional statistics:
 #'        gain is the approximate loss function gain we get in each split;
 #'        cover is the sum of second order gradient in each node.
+#' @param dump_fomat either 'text' or 'json' format could be specified.
 #' @param ... currently not used
 #'
 #' @return
@@ -30,10 +31,15 @@
 #' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
 #' 
 #' # print the model without saving it to a file
-#' print(xgb.dump(bst))
+#' print(xgb.dump(bst, with_stats = TRUE))
+#' 
+#' # print in JSON format:
+#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+#' 
 #' @export
-xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) {
+xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) {
  check.deprecation(...)
+  dump_format <- match.arg(dump_format)
  if (class(model) != "xgb.Booster")
    stop("model: argument must be of type xgb.Booster")
  if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
@@ -42,12 +48,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
    stop("fmap: argument must be of type character (when provided)")
  
  model <- xgb.Booster.check(model)
-  model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
+  model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
+                      as.character(dump_format), PACKAGE = "xgboost")

  if (is.null(fname)) 
    model_dump <- stri_replace_all_regex(model_dump, '\t', '')
  
-  model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  if (dump_format == "text")
+    model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  
  model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
  
  if (is.null(fname)) {
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
  See \code{\link{xgb.train}} for further details.
  See also demo/ for walkthrough example in R.}

-\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
+\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}

 \item{nrounds}{the max number of iterations}

 \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}

-\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.}
+\item{label}{vector of response values. Should be provided only when data is an R-matrix.}

 \item{missing}{is only used when input is a dense matrix. By default is set to NA, which means 
 that NA values should be considered as 'missing' by the algorithm. 
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -4,7 +4,8 @@
 \alias{xgb.dump}
 \title{Save xgboost model to text file}
 \usage{
-xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...)
+xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
+  dump_format = c("text", "json"), ...)
 }
 \arguments{
 \item{model}{the model object.}
@@ -24,6 +25,8 @@ gain is the approximate loss function gain we get in each split;
 cover is the sum of second order gradient in each node.}

 \item{...}{currently not used}
+
+\item{dump_fomat}{either 'text' or 'json' format could be specified.}
 }
 \value{
 if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
@@ -42,6 +45,10 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)

 # print the model without saving it to a file
-print(xgb.dump(bst))
+print(xgb.dump(bst, with_stats = TRUE))
+
+# print in JSON format:
+cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+
 }

--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -350,20 +350,37 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
  return ret;
 }

-SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
+SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
  SEXP out;
  R_API_BEGIN();
  bst_ulong olen;
  const char **res;
-  CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+  const char *fmt = CHAR(asChar(dump_format));
+  CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
                                CHAR(asChar(fmap)),
                                asInteger(with_stats),
+                                fmt,
                                &olen, &res));
  out = PROTECT(allocVector(STRSXP, olen));
-  for (size_t i = 0; i < olen; ++i) {
+  if (!strcmp("json", fmt)) {
    std::stringstream stream;
-    stream <<  "booster[" << i <<"]\n" << res[i];
-    SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    stream <<  "[\n";
+    for (size_t i = 0; i < olen; ++i) {
+      stream << res[i];
+      if (i < olen - 1) {
+        stream << ",\n";
+      } else {
+        stream << "\n";
+      }
+    }
+    stream <<  "]";
+    SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
+  } else {
+    for (size_t i = 0; i < olen; ++i) {
+      std::stringstream stream;
+      stream <<  "booster[" << i <<"]\n" << res[i];
+      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    }
  }
  R_API_END();
  UNPROTECT(1);
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -185,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
 * \param handle handle
 * \param fmap  name to fmap can be empty string
 * \param with_stats whether dump statistics of splits
+ * \param dump_format the format to dump the model in
 */
-XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
+XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format);

 /*!
 * \brief get learner attribute value
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -147,6 +147,11 @@ test_that("cb.reset.parameters works as expected", {
    bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                      callbacks = list(cb.reset.parameters(my_par)))
  , NA) # NA = no error
+  # CV works as well
+  expect_error(
+    bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2,
+                   callbacks = list(cb.reset.parameters(my_par)))
+  , NA) # NA = no error

  # expect no learning with 0 learning rate
  my_par <- list(eta = c(0., 0.))
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -27,6 +27,11 @@ test_that("xgb.dump works", {
  expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
  expect_true(file.exists('xgb.model.dump'))
  expect_gt(file.size('xgb.model.dump'), 8000)
+
+  # JSON format
+  dmp <- xgb.dump(bst.Tree, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
 })

 test_that("xgb.dump works for gblinear", {
@@ -38,6 +43,11 @@ test_that("xgb.dump works for gblinear", {
  d.sp <- xgb.dump(bst.GLM.sp)
  expect_length(d.sp, 14)
  expect_gt(sum(d.sp == "0"), 0)
+
+  # JSON format
+  dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
 })

 test_that("xgb-attribute functionality", {
@@ -83,6 +93,8 @@ test_that("xgb-attribute numeric precision", {
  for (x in X) {
    xgb.attr(bst.Tree, "x") <- x
    expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x)
+    xgb.attributes(bst.Tree) <- list(a = "A", b = x)
+    expect_identical(as.numeric(xgb.attr(bst.Tree, "b")), x)
  }
 })