[R-package] JSON dump format and a couple of bugfixes (#1855)
* [R-package] JSON tree dump interface * [R-package] precision bugfix in xgb.attributes * [R-package] bugfix for cb.early.stop called from xgb.cv * [R-package] a bit more clarity on labels checking in xgb.cv * [R-package] test JSON dump for gblinear as well * whitespace lint
This commit is contained in:
parent
0268dedeea
commit
b21e658a02
@ -229,7 +229,7 @@ cb.reset.parameters <- function(new_params) {
|
|||||||
xgb.parameters(env$bst$handle) <- pars
|
xgb.parameters(env$bst$handle) <- pars
|
||||||
} else {
|
} else {
|
||||||
for (fd in env$bst_folds)
|
for (fd in env$bst_folds)
|
||||||
xgb.parameters(fd$bst$handle) <- pars
|
xgb.parameters(fd$bst) <- pars
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
attr(callback, 'is_pre_iteration') <- TRUE
|
attr(callback, 'is_pre_iteration') <- TRUE
|
||||||
|
|||||||
@ -339,7 +339,7 @@ xgb.attributes <- function(object) {
|
|||||||
# Q: should we warn a user about non-scalar elements?
|
# Q: should we warn a user about non-scalar elements?
|
||||||
a <- lapply(a, function(x) {
|
a <- lapply(a, function(x) {
|
||||||
if (is.null(x)) return(NULL)
|
if (is.null(x)) return(NULL)
|
||||||
if (is.numeric(value[1])) {
|
if (is.numeric(x[1])) {
|
||||||
format(x[1], digits = 17)
|
format(x[1], digits = 17)
|
||||||
} else {
|
} else {
|
||||||
as.character(x[1])
|
as.character(x[1])
|
||||||
|
|||||||
@ -16,10 +16,10 @@
|
|||||||
#'
|
#'
|
||||||
#' See \code{\link{xgb.train}} for further details.
|
#' See \code{\link{xgb.train}} for further details.
|
||||||
#' See also demo/ for walkthrough example in R.
|
#' See also demo/ for walkthrough example in R.
|
||||||
#' @param data takes an \code{xgb.DMatrix} or \code{Matrix} as the input.
|
#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.
|
||||||
#' @param nrounds the max number of iterations
|
#' @param nrounds the max number of iterations
|
||||||
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
|
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
|
||||||
#' @param label vector of response values. Should be provided only when data is \code{DMatrix}.
|
#' @param label vector of response values. Should be provided only when data is an R-matrix.
|
||||||
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
|
#' @param missing is only used when input is a dense matrix. By default is set to NA, which means
|
||||||
#' that NA values should be considered as 'missing' by the algorithm.
|
#' that NA values should be considered as 'missing' by the algorithm.
|
||||||
#' Sometimes, 0 or other extreme value might be used to represent missing values.
|
#' Sometimes, 0 or other extreme value might be used to represent missing values.
|
||||||
@ -129,10 +129,9 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
|
|||||||
#if (is.null(params[['eval_metric']]) && is.null(feval))
|
#if (is.null(params[['eval_metric']]) && is.null(feval))
|
||||||
# stop("Either 'eval_metric' or 'feval' must be provided for CV")
|
# stop("Either 'eval_metric' or 'feval' must be provided for CV")
|
||||||
|
|
||||||
# Labels
|
# Check the labels
|
||||||
if (class(data) == 'xgb.DMatrix')
|
if ( (class(data) == 'xgb.DMatrix' && is.null(getinfo(data, 'label'))) ||
|
||||||
labels <- getinfo(data, 'label')
|
(class(data) != 'xgb.DMatrix' && is.null(label)))
|
||||||
if (is.null(labels))
|
|
||||||
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
|
stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix")
|
||||||
|
|
||||||
# CV folds
|
# CV folds
|
||||||
|
|||||||
@ -14,6 +14,7 @@
|
|||||||
#' When this option is on, the model dump comes with two additional statistics:
|
#' When this option is on, the model dump comes with two additional statistics:
|
||||||
#' gain is the approximate loss function gain we get in each split;
|
#' gain is the approximate loss function gain we get in each split;
|
||||||
#' cover is the sum of second order gradient in each node.
|
#' cover is the sum of second order gradient in each node.
|
||||||
|
#' @param dump_fomat either 'text' or 'json' format could be specified.
|
||||||
#' @param ... currently not used
|
#' @param ... currently not used
|
||||||
#'
|
#'
|
||||||
#' @return
|
#' @return
|
||||||
@ -30,10 +31,15 @@
|
|||||||
#' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
|
#' xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
|
||||||
#'
|
#'
|
||||||
#' # print the model without saving it to a file
|
#' # print the model without saving it to a file
|
||||||
#' print(xgb.dump(bst))
|
#' print(xgb.dump(bst, with_stats = TRUE))
|
||||||
|
#'
|
||||||
|
#' # print in JSON format:
|
||||||
|
#' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
|
||||||
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ...) {
|
xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) {
|
||||||
check.deprecation(...)
|
check.deprecation(...)
|
||||||
|
dump_format <- match.arg(dump_format)
|
||||||
if (class(model) != "xgb.Booster")
|
if (class(model) != "xgb.Booster")
|
||||||
stop("model: argument must be of type xgb.Booster")
|
stop("model: argument must be of type xgb.Booster")
|
||||||
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
|
if (!(class(fname) %in% c("character", "NULL") && length(fname) <= 1))
|
||||||
@ -42,12 +48,15 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with_stats=FALSE, ..
|
|||||||
stop("fmap: argument must be of type character (when provided)")
|
stop("fmap: argument must be of type character (when provided)")
|
||||||
|
|
||||||
model <- xgb.Booster.check(model)
|
model <- xgb.Booster.check(model)
|
||||||
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats), PACKAGE = "xgboost")
|
model_dump <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with_stats),
|
||||||
|
as.character(dump_format), PACKAGE = "xgboost")
|
||||||
|
|
||||||
if (is.null(fname))
|
if (is.null(fname))
|
||||||
model_dump <- stri_replace_all_regex(model_dump, '\t', '')
|
model_dump <- stri_replace_all_regex(model_dump, '\t', '')
|
||||||
|
|
||||||
model_dump <- unlist(stri_split_regex(model_dump, '\n'))
|
if (dump_format == "text")
|
||||||
|
model_dump <- unlist(stri_split_regex(model_dump, '\n'))
|
||||||
|
|
||||||
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
|
model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE)
|
||||||
|
|
||||||
if (is.null(fname)) {
|
if (is.null(fname)) {
|
||||||
|
|||||||
@ -26,13 +26,13 @@ xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA,
|
|||||||
See \code{\link{xgb.train}} for further details.
|
See \code{\link{xgb.train}} for further details.
|
||||||
See also demo/ for walkthrough example in R.}
|
See also demo/ for walkthrough example in R.}
|
||||||
|
|
||||||
\item{data}{takes an \code{xgb.DMatrix} or \code{Matrix} as the input.}
|
\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.}
|
||||||
|
|
||||||
\item{nrounds}{the max number of iterations}
|
\item{nrounds}{the max number of iterations}
|
||||||
|
|
||||||
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
|
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
|
||||||
|
|
||||||
\item{label}{vector of response values. Should be provided only when data is \code{DMatrix}.}
|
\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
|
||||||
|
|
||||||
\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means
|
\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means
|
||||||
that NA values should be considered as 'missing' by the algorithm.
|
that NA values should be considered as 'missing' by the algorithm.
|
||||||
|
|||||||
@ -4,7 +4,8 @@
|
|||||||
\alias{xgb.dump}
|
\alias{xgb.dump}
|
||||||
\title{Save xgboost model to text file}
|
\title{Save xgboost model to text file}
|
||||||
\usage{
|
\usage{
|
||||||
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE, ...)
|
xgb.dump(model = NULL, fname = NULL, fmap = "", with_stats = FALSE,
|
||||||
|
dump_format = c("text", "json"), ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{model}{the model object.}
|
\item{model}{the model object.}
|
||||||
@ -24,6 +25,8 @@ gain is the approximate loss function gain we get in each split;
|
|||||||
cover is the sum of second order gradient in each node.}
|
cover is the sum of second order gradient in each node.}
|
||||||
|
|
||||||
\item{...}{currently not used}
|
\item{...}{currently not used}
|
||||||
|
|
||||||
|
\item{dump_fomat}{either 'text' or 'json' format could be specified.}
|
||||||
}
|
}
|
||||||
\value{
|
\value{
|
||||||
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
|
if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}.
|
||||||
@ -42,6 +45,10 @@ bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
|
|||||||
xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
|
xgb.dump(bst, 'xgb.model.dump', with_stats = TRUE)
|
||||||
|
|
||||||
# print the model without saving it to a file
|
# print the model without saving it to a file
|
||||||
print(xgb.dump(bst))
|
print(xgb.dump(bst, with_stats = TRUE))
|
||||||
|
|
||||||
|
# print in JSON format:
|
||||||
|
cat(xgb.dump(bst, with_stats = TRUE, dump_format='json'))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -350,20 +350,37 @@ SEXP XGBoosterModelToRaw_R(SEXP handle) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats) {
|
SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
|
||||||
SEXP out;
|
SEXP out;
|
||||||
R_API_BEGIN();
|
R_API_BEGIN();
|
||||||
bst_ulong olen;
|
bst_ulong olen;
|
||||||
const char **res;
|
const char **res;
|
||||||
CHECK_CALL(XGBoosterDumpModel(R_ExternalPtrAddr(handle),
|
const char *fmt = CHAR(asChar(dump_format));
|
||||||
|
CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
|
||||||
CHAR(asChar(fmap)),
|
CHAR(asChar(fmap)),
|
||||||
asInteger(with_stats),
|
asInteger(with_stats),
|
||||||
|
fmt,
|
||||||
&olen, &res));
|
&olen, &res));
|
||||||
out = PROTECT(allocVector(STRSXP, olen));
|
out = PROTECT(allocVector(STRSXP, olen));
|
||||||
for (size_t i = 0; i < olen; ++i) {
|
if (!strcmp("json", fmt)) {
|
||||||
std::stringstream stream;
|
std::stringstream stream;
|
||||||
stream << "booster[" << i <<"]\n" << res[i];
|
stream << "[\n";
|
||||||
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
|
for (size_t i = 0; i < olen; ++i) {
|
||||||
|
stream << res[i];
|
||||||
|
if (i < olen - 1) {
|
||||||
|
stream << ",\n";
|
||||||
|
} else {
|
||||||
|
stream << "\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stream << "]";
|
||||||
|
SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < olen; ++i) {
|
||||||
|
std::stringstream stream;
|
||||||
|
stream << "booster[" << i <<"]\n" << res[i];
|
||||||
|
SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
R_API_END();
|
R_API_END();
|
||||||
UNPROTECT(1);
|
UNPROTECT(1);
|
||||||
|
|||||||
@ -185,8 +185,9 @@ XGB_DLL SEXP XGBoosterModelToRaw_R(SEXP handle);
|
|||||||
* \param handle handle
|
* \param handle handle
|
||||||
* \param fmap name to fmap can be empty string
|
* \param fmap name to fmap can be empty string
|
||||||
* \param with_stats whether dump statistics of splits
|
* \param with_stats whether dump statistics of splits
|
||||||
|
* \param dump_format the format to dump the model in
|
||||||
*/
|
*/
|
||||||
XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats);
|
XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format);
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \brief get learner attribute value
|
* \brief get learner attribute value
|
||||||
|
|||||||
@ -147,6 +147,11 @@ test_that("cb.reset.parameters works as expected", {
|
|||||||
bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
|
bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
|
||||||
callbacks = list(cb.reset.parameters(my_par)))
|
callbacks = list(cb.reset.parameters(my_par)))
|
||||||
, NA) # NA = no error
|
, NA) # NA = no error
|
||||||
|
# CV works as well
|
||||||
|
expect_error(
|
||||||
|
bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2,
|
||||||
|
callbacks = list(cb.reset.parameters(my_par)))
|
||||||
|
, NA) # NA = no error
|
||||||
|
|
||||||
# expect no learning with 0 learning rate
|
# expect no learning with 0 learning rate
|
||||||
my_par <- list(eta = c(0., 0.))
|
my_par <- list(eta = c(0., 0.))
|
||||||
|
|||||||
@ -27,6 +27,11 @@ test_that("xgb.dump works", {
|
|||||||
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
|
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
|
||||||
expect_true(file.exists('xgb.model.dump'))
|
expect_true(file.exists('xgb.model.dump'))
|
||||||
expect_gt(file.size('xgb.model.dump'), 8000)
|
expect_gt(file.size('xgb.model.dump'), 8000)
|
||||||
|
|
||||||
|
# JSON format
|
||||||
|
dmp <- xgb.dump(bst.Tree, dump_format = "json")
|
||||||
|
expect_length(dmp, 1)
|
||||||
|
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("xgb.dump works for gblinear", {
|
test_that("xgb.dump works for gblinear", {
|
||||||
@ -38,6 +43,11 @@ test_that("xgb.dump works for gblinear", {
|
|||||||
d.sp <- xgb.dump(bst.GLM.sp)
|
d.sp <- xgb.dump(bst.GLM.sp)
|
||||||
expect_length(d.sp, 14)
|
expect_length(d.sp, 14)
|
||||||
expect_gt(sum(d.sp == "0"), 0)
|
expect_gt(sum(d.sp == "0"), 0)
|
||||||
|
|
||||||
|
# JSON format
|
||||||
|
dmp <- xgb.dump(bst.GLM.sp, dump_format = "json")
|
||||||
|
expect_length(dmp, 1)
|
||||||
|
expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
|
||||||
})
|
})
|
||||||
|
|
||||||
test_that("xgb-attribute functionality", {
|
test_that("xgb-attribute functionality", {
|
||||||
@ -83,6 +93,8 @@ test_that("xgb-attribute numeric precision", {
|
|||||||
for (x in X) {
|
for (x in X) {
|
||||||
xgb.attr(bst.Tree, "x") <- x
|
xgb.attr(bst.Tree, "x") <- x
|
||||||
expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x)
|
expect_identical(as.numeric(xgb.attr(bst.Tree, "x")), x)
|
||||||
|
xgb.attributes(bst.Tree) <- list(a = "A", b = x)
|
||||||
|
expect_identical(as.numeric(xgb.attr(bst.Tree, "b")), x)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user