From b21e658a0251d692d4e6ff3b614893c54e7d0c7c Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich <khotilovich@gmail.com>
Date: Sun, 11 Dec 2016 12:48:39 -0600
Subject: [PATCH] [R-package] JSON dump format and a couple of bugfixes (#1855)

* [R-package] JSON tree dump interface

* [R-package] precision bugfix in xgb.attributes

* [R-package] bugfix for cb.early.stop called from xgb.cv

* [R-package] a bit more clarity on labels checking in xgb.cv

* [R-package] test JSON dump for gblinear as well

* whitespace lint
---
 R-package/R/callbacks.R                   |  2 +-
 R-package/R/xgb.Booster.R                 |  2 +-
 R-package/R/xgb.cv.R                      | 11 +++++----
 R-package/R/xgb.dump.R                    | 17 ++++++++++----
 R-package/man/xgb.cv.Rd                   |  4 ++--
 R-package/man/xgb.dump.Rd                 | 11 +++++++--
 R-package/src/xgboost_R.cc                | 27 ++++++++++++++++++-----
 R-package/src/xgboost_R.h                 |  3 ++-
 R-package/tests/testthat/test_callbacks.R |  5 +++++
 R-package/tests/testthat/test_helpers.R   | 12 ++++++++++
 10 files changed, 72 insertions(+), 22 deletions(-)

diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R
index 4d7916d38..f35565273 100644
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@@ -229,7 +229,7 @@ cb.reset.parameters <- function(new_params) {
       xgb.parameters(env$bst$handle) <- pars
     } else {
       for (fd in env$bst_folds)
-        xgb.parameters(fd$bst$handle) <- pars
+        xgb.parameters(fd$bst) <- pars
     }
   }
   attr(callback, 'is_pre_iteration') <- TRUE
diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 587fc1b90..362165c6a 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -339,7 +339,7 @@ xgb.attributes <- function(object) {
   # Q: should we warn a user about non-scalar elements?
   a <- lapply(a, function(x) {
     if (is.null(x)) return(NULL)
-    if (is.numeric(value[1])) {
+    if (is.numeric(x[1])) {
       format(x[1], digits = 17)
     } else {
       as.character(x[1])
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index d9b4e429b..8325f5976 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -16,10 +16,10 @@
 #'
 #'   See \code{\link{xgb.train}} for further details.
 #'   See also demo/ for walkthrough example in R.
-#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
+#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
 #' @param nrounds the max number of iterations
 #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. 
-#' @param label vector of response values. Should be provided only when data is \code{DMatrix}.
+#' @param label vector of response values. Should be provided only when data is an R-matrix.
 #' @param missing is only used when input is a dense matrix. By default is set to NA, which means 
 #'        that NA values should be considered as 'missing' by the algorithm. 
 #'        Sometimes, 0 or other extreme value might be used to represent missing values.
@@ -129,10 +129,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
   #if (is.null(params[['eval_metric']]) && is.null(feval))
   #  stop("Either 'eval_metric' or 'feval' must be provided for CV")
   
-  # Labels
-  if (class(data) == 'xgb.DMatrix')
-    labels <- getinfo(data, 'label')
-  if (is.null(labels))
+  # Check the labels
+  if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) ||
+       (class(data) != 'xgb.DMatrix' && is.null(label)))
     stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
   
   # CV folds
diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R
index 48f4bdf5b..9bd9a2135 100644
--- a/R-package/R/xgb.dump.R
+++ b/R-package/R/xgb.dump.R
@@ -14,6 +14,7 @@
 #'        When this option is on, the model dump comes with two additional statistics:
 #'        gain is the approximate loss function gain we get in each split;
 #'        cover is the sum of second order gradient in each node.
+#' @param dump_fomat either 'text' or 'json' format could be specified.
 #' @param ... currently not used
 #'
 #' @return
@@ -30,10 +31,15 @@
 #' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
 #' 
 #' # print the model without saving it to a file
-#' print(xgb.dump(bst))
+#' print(xgb.dump(bst, with_stats = TRUE))
+#' 
+#' # print in JSON format:
+#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+#' 
 #' @export
-xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) {
+xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) {
   check.deprecation(...)
+  dump_format <- match.arg(dump_format)
   if (class(model) != "xgb.Booster")
     stop("model: argument must be of type xgb.Booster")
   if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
@@ -42,12 +48,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
     stop("fmap: argument must be of type character (when provided)")
   
   model <- xgb.Booster.check(model)
-  model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
+  model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
+                      as.character(dump_format), PACKAGE = "xgboost")
 
   if (is.null(fname)) 
     model_dump <- stri_replace_all_regex(model_dump, '\t', '')
   
-  model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  if (dump_format == "text")
+    model_dump <- unlist(stri_split_regex(model_dump, '\n'))
+  
   model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
   
   if (is.null(fname)) {
diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd
index bb93f40f9..e027df4a7 100644
--- a/R-package/man/xgb.cv.Rd
+++ b/R-package/man/xgb.cv.Rd
@@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
   See \code{\link{xgb.train}} for further details.
   See also demo/ for walkthrough example in R.}
 
-\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
+\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}
 
 \item{nrounds}{the max number of iterations}
 
 \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
 
-\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.}
+\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
 
 \item{missing}{is only used when input is a dense matrix. By default is set to NA, which means 
 that NA values should be considered as 'missing' by the algorithm. 
diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd
index efbf8b629..206e32022 100644
--- a/R-package/man/xgb.dump.Rd
+++ b/R-package/man/xgb.dump.Rd
@@ -4,7 +4,8 @@
 \alias{xgb.dump}
 \title{Save xgboost model to text file}
 \usage{
-xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...)
+xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
+  dump_format = c("text", "json"), ...)
 }
 \arguments{
 \item{model}{the model object.}
@@ -24,6 +25,8 @@ gain is the approximate loss function gain we get in each split;
 cover is the sum of second order gradient in each node.}
 
 \item{...}{currently not used}
+
+\item{dump_fomat}{either 'text' or 'json' format could be specified.}
 }
 \value{
 if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
@@ -42,6 +45,10 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
 xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
 
 # print the model without saving it to a file
-print(xgb.dump(bst))
+print(xgb.dump(bst, with_stats = TRUE))
+
+# print in JSON format:
+cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
+
 }
 
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 7d6ec26ee..3dfafdd00 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -350,20 +350,37 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
   return ret;
 }
 
-SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
+SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
   SEXP out;
   R_API_BEGIN();
   bst_ulong olen;
   const char **res;
-  CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
+  const char *fmt = CHAR(asChar(dump_format));
+  CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
                                 CHAR(asChar(fmap)),
                                 asInteger(with_stats),
+                                fmt,
                                 &olen, &res));
   out = PROTECT(allocVector(STRSXP, olen));
-  for (size_t i = 0; i < olen; ++i) {
+  if (!strcmp("json", fmt)) {
     std::stringstream stream;
-    stream <<  "booster[" << i <<"]\n" << res[i];
-    SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    stream <<  "[\n";
+    for (size_t i = 0; i < olen; ++i) {
+      stream << res[i];
+      if (i < olen - 1) {
+        stream << ",\n";
+      } else {
+        stream << "\n";
+      }
+    }
+    stream <<  "]";
+    SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
+  } else {
+    for (size_t i = 0; i < olen; ++i) {
+      std::stringstream stream;
+      stream <<  "booster[" << i <<"]\n" << res[i];
+      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
+    }
   }
   R_API_END();
   UNPROTECT(1);
diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
index 24c9b78f1..a1e9604a1 100644
--- a/R-package/src/xgboost_R.h
+++ b/R-package/src/xgboost_R.h
@@ -185,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
  * \param handle handle
  * \param fmap  name to fmap can be empty string
  * \param with_stats whether dump statistics of splits
+ * \param dump_format the format to dump the model in
  */
-XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
+XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format);
 
 /*!
  * \brief get learner attribute value
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index 26253bf62..a95d10797 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -147,6 +147,11 @@ test_that("cb.reset.parameters works as expected", {
     bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                       callbacks = list(cb.reset.parameters(my_par)))
   , NA) # NA = no error
+  # CV works as well
+  expect_error(
+    bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2,
+                   callbacks = list(cb.reset.parameters(my_par)))
+  , NA) # NA = no error
 
   # expect no learning with 0 learning rate
   my_par <- list(eta = c(0., 0.))
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 1d3ef3437..cd25c1dbb 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -27,6 +27,11 @@ test_that("xgb.dump works", {
   expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
   expect_true(file.exists('xgb.model.dump'))
   expect_gt(file.size('xgb.model.dump'), 8000)
+
+  # JSON format
+  dmp <- xgb.dump(bst.Tree, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
 })
 
 test_that("xgb.dump works for gblinear", {
@@ -38,6 +43,11 @@ test_that("xgb.dump works for gblinear", {
   d.sp <- xgb.dump(bst.GLM.sp)
   expect_length(d.sp, 14)
   expect_gt(sum(d.sp == "0"), 0)
+
+  # JSON format
+  dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
+  expect_length(dmp, 1)
+  expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
 })
 
 test_that("xgb-attribute functionality", {
@@ -83,6 +93,8 @@ test_that("xgb-attribute numeric precision", {
   for (x in X) {
     xgb.attr(bst.Tree, "x") <- x
     expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x)
+    xgb.attributes(bst.Tree) <- list(a = "A", b = x)
+    expect_identical(as.numeric(xgb.attr(bst.Tree, "b")), x)
   }
 })