[R-package] JSON dump format and a couple of bugfixes (#1855)

* [R-package] JSON tree dump interface

* [R-package] precision bugfix in xgb.attributes

* [R-package] bugfix for cb.early.stop called from xgb.cv

* [R-package] a bit more clarity on labels checking in xgb.cv

* [R-package] test JSON dump for gblinear as well

* whitespace lint
This commit is contained in:
Vadim Khotilovich 2016-12-11 12:48:39 -06:00 committed by Tianqi Chen
parent 0268dedeea
commit b21e658a02
10 changed files with 72 additions and 22 deletions

View File

@ -229,7 +229,7 @@ cb.reset.parameters <- function(new_params) {
xgb.parameters(env$bst$handle) <- pars xgb.parameters(env$bst$handle) <- pars
} else { } else {
for (fd in env$bst_folds) for (fd in env$bst_folds)
xgb.parameters(fd$bst$handle) <- pars xgb.parameters(fd$bst) <- pars
} }
} }
attr(callback, 'is_pre_iteration') <- TRUE attr(callback, 'is_pre_iteration') <- TRUE

View File

@ -339,7 +339,7 @@ xgb.attributes <- function(object) {
# Q: should we warn a user about non-scalar elements? # Q: should we warn a user about non-scalar elements?
a <- lapply(a, function(x) { a <- lapply(a, function(x) {
if (is.null(x)) return(NULL) if (is.null(x)) return(NULL)
if (is.numeric(value[1])) { if (is.numeric(x[1])) {
format(x[1], digits = 17) format(x[1], digits = 17)
} else { } else {
as.character(x[1]) as.character(x[1])

View File

@ -16,10 +16,10 @@
#' #'
#' See \code{\link{xgb.train}} for further details. #' See \code{\link{xgb.train}} for further details.
#' See also demo/ for walkthrough example in R. #' See also demo/ for walkthrough example in R.
#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input. #' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
#' @param nrounds the max number of iterations #' @param nrounds the max number of iterations
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is \code{DMatrix}. #' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means #' @param missing is only used when input is a dense matrix. By default is set to NA, which means
#' that NA values should be considered as 'missing' by the algorithm. #' that NA values should be considered as 'missing' by the algorithm.
#' Sometimes, 0 or other extreme value might be used to represent missing values. #' Sometimes, 0 or other extreme value might be used to represent missing values.
@ -129,10 +129,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
#if (is.null(params[['eval_metric']]) && is.null(feval)) #if (is.null(params[['eval_metric']]) && is.null(feval))
# stop("Either 'eval_metric' or 'feval' must be provided for CV") # stop("Either 'eval_metric' or 'feval' must be provided for CV")
# Labels # Check the labels
if (class(data) == 'xgb.DMatrix') if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) ||
labels <- getinfo(data, 'label') (class(data) != 'xgb.DMatrix' && is.null(label)))
if (is.null(labels))
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
# CV folds # CV folds

View File

@ -14,6 +14,7 @@
#' When this option is on, the model dump comes with two additional statistics: #' When this option is on, the model dump comes with two additional statistics:
#' gain is the approximate loss function gain we get in each split; #' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node. #' cover is the sum of second order gradient in each node.
#' @param dump_fomat either 'text' or 'json' format could be specified.
#' @param ... currently not used #' @param ... currently not used
#' #'
#' @return #' @return
@ -30,10 +31,15 @@
#' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE) #' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
#' #'
#' # print the model without saving it to a file #' # print the model without saving it to a file
#' print(xgb.dump(bst)) #' print(xgb.dump(bst, with_stats = TRUE))
#'
#' # print in JSON format:
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
#'
#' @export #' @export
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) { xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) {
check.deprecation(...) check.deprecation(...)
dump_format <- match.arg(dump_format)
if (class(model) != "xgb.Booster") if (class(model) != "xgb.Booster")
stop("model: argument must be of type xgb.Booster") stop("model: argument must be of type xgb.Booster")
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1)) if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
@ -42,12 +48,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
stop("fmap: argument must be of type character (when provided)") stop("fmap: argument must be of type character (when provided)")
model <- xgb.Booster.check(model) model <- xgb.Booster.check(model)
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost") model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
as.character(dump_format), PACKAGE = "xgboost")
if (is.null(fname)) if (is.null(fname))
model_dump <- stri_replace_all_regex(model_dump, '\t', '') model_dump <- stri_replace_all_regex(model_dump, '\t', '')
model_dump <- unlist(stri_split_regex(model_dump, '\n')) if (dump_format == "text")
model_dump <- unlist(stri_split_regex(model_dump, '\n'))
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE) model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
if (is.null(fname)) { if (is.null(fname)) {

View File

@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
See \code{\link{xgb.train}} for further details. See \code{\link{xgb.train}} for further details.
See also demo/ for walkthrough example in R.} See also demo/ for walkthrough example in R.}
\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.} \item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}
\item{nrounds}{the max number of iterations} \item{nrounds}{the max number of iterations}
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.} \item{label}{vector of response values. Should be provided only when data is an R-matrix.}
\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means \item{missing}{is only used when input is a dense matrix. By default is set to NA, which means
that NA values should be considered as 'missing' by the algorithm. that NA values should be considered as 'missing' by the algorithm.

View File

@ -4,7 +4,8 @@
\alias{xgb.dump} \alias{xgb.dump}
\title{Save xgboost model to text file} \title{Save xgboost model to text file}
\usage{ \usage{
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...) xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
dump_format = c("text", "json"), ...)
} }
\arguments{ \arguments{
\item{model}{the model object.} \item{model}{the model object.}
@ -24,6 +25,8 @@ gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.} cover is the sum of second order gradient in each node.}
\item{...}{currently not used} \item{...}{currently not used}
\item{dump_fomat}{either 'text' or 'json' format could be specified.}
} }
\value{ \value{
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
@ -42,6 +45,10 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE) xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
# print the model without saving it to a file # print the model without saving it to a file
print(xgb.dump(bst)) print(xgb.dump(bst, with_stats = TRUE))
# print in JSON format:
cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
} }

View File

@ -350,20 +350,37 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
return ret; return ret;
} }
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) { SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
SEXP out; SEXP out;
R_API_BEGIN(); R_API_BEGIN();
bst_ulong olen; bst_ulong olen;
const char **res; const char **res;
CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle), const char *fmt = CHAR(asChar(dump_format));
CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)), CHAR(asChar(fmap)),
asInteger(with_stats), asInteger(with_stats),
fmt,
&olen, &res)); &olen, &res));
out = PROTECT(allocVector(STRSXP, olen)); out = PROTECT(allocVector(STRSXP, olen));
for (size_t i = 0; i < olen; ++i) { if (!strcmp("json", fmt)) {
std::stringstream stream; std::stringstream stream;
stream << "booster[" << i <<"]\n" << res[i]; stream << "[\n";
SET_STRING_ELT(out, i, mkChar(stream.str().c_str())); for (size_t i = 0; i < olen; ++i) {
stream << res[i];
if (i < olen - 1) {
stream << ",\n";
} else {
stream << "\n";
}
}
stream << "]";
SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
} else {
for (size_t i = 0; i < olen; ++i) {
std::stringstream stream;
stream << "booster[" << i <<"]\n" << res[i];
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
}
} }
R_API_END(); R_API_END();
UNPROTECT(1); UNPROTECT(1);

View File

@ -185,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
* \param handle handle * \param handle handle
* \param fmap name to fmap can be empty string * \param fmap name to fmap can be empty string
* \param with_stats whether dump statistics of splits * \param with_stats whether dump statistics of splits
* \param dump_format the format to dump the model in
*/ */
XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats); XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format);
/*! /*!
* \brief get learner attribute value * \brief get learner attribute value

View File

@ -147,6 +147,11 @@ test_that("cb.reset.parameters works as expected", {
bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset.parameters(my_par))) callbacks = list(cb.reset.parameters(my_par)))
, NA) # NA = no error , NA) # NA = no error
# CV works as well
expect_error(
bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2,
callbacks = list(cb.reset.parameters(my_par)))
, NA) # NA = no error
# expect no learning with 0 learning rate # expect no learning with 0 learning rate
my_par <- list(eta = c(0., 0.)) my_par <- list(eta = c(0., 0.))

View File

@ -27,6 +27,11 @@ test_that("xgb.dump works", {
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T)) expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
expect_true(file.exists('xgb.model.dump')) expect_true(file.exists('xgb.model.dump'))
expect_gt(file.size('xgb.model.dump'), 8000) expect_gt(file.size('xgb.model.dump'), 8000)
# JSON format
dmp <- xgb.dump(bst.Tree, dump_format = "json")
expect_length(dmp, 1)
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
}) })
test_that("xgb.dump works for gblinear", { test_that("xgb.dump works for gblinear", {
@ -38,6 +43,11 @@ test_that("xgb.dump works for gblinear", {
d.sp <- xgb.dump(bst.GLM.sp) d.sp <- xgb.dump(bst.GLM.sp)
expect_length(d.sp, 14) expect_length(d.sp, 14)
expect_gt(sum(d.sp == "0"), 0) expect_gt(sum(d.sp == "0"), 0)
# JSON format
dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
expect_length(dmp, 1)
expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
}) })
test_that("xgb-attribute functionality", { test_that("xgb-attribute functionality", {
@ -83,6 +93,8 @@ test_that("xgb-attribute numeric precision", {
for (x in X) { for (x in X) {
xgb.attr(bst.Tree, "x") <- x xgb.attr(bst.Tree, "x") <- x
expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x) expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x)
xgb.attributes(bst.Tree) <- list(a = "A", b = x)
expect_identical(as.numeric(xgb.attr(bst.Tree, "b")), x)
} }
}) })