refactor dump function to adapt to the new possibilities of exporting a String

This commit is contained in:
El Potaeto 2015-01-09 00:14:01 +01:00
parent 6fd8bbe71a
commit 3e1eea0eea
4 changed files with 21 additions and 9 deletions

View File

@ -32,5 +32,6 @@ importFrom(stringr,str_extract)
importFrom(stringr,str_extract_all) importFrom(stringr,str_extract_all)
importFrom(stringr,str_match) importFrom(stringr,str_match)
importFrom(stringr,str_replace) importFrom(stringr,str_replace)
importFrom(stringr,str_replace_all)
importFrom(stringr,str_split) importFrom(stringr,str_split)
importFrom(stringr,str_trim) importFrom(stringr,str_trim)

View File

@ -2,8 +2,11 @@
#' #'
#' Save a xgboost model to text file. Could be parsed later. #' Save a xgboost model to text file. Could be parsed later.
#' #'
#' @importFrom magrittr %>%
#' @importFrom stringr str_split
#' @importFrom stringr str_replace_all
#' @param model the model object. #' @param model the model object.
#' @param fname the name of the binary file. #' @param fname the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.
#' @param fmap feature map file representing the type of feature. #' @param fmap feature map file representing the type of feature.
#' Detailed description could be found at #' Detailed description could be found at
#' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}. #' \url{https://github.com/tqchen/xgboost/wiki/Binary-Classification#dump-model}.
@ -15,6 +18,9 @@
#' gain is the approximate loss function gain we get in each split; #' gain is the approximate loss function gain we get in each split;
#' cover is the sum of second order gradient in each node. #' cover is the sum of second order gradient in each node.
#' #'
#' @return
#' If \code{fname} is not provided or is set to \code{NULL}, the function returns the model as a \code{character} vector. Otherwise it returns \code{TRUE}.
#'
#' @examples #' @examples
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
@ -25,15 +31,17 @@
#' xgb.dump(bst, 'xgb.model.dump') #' xgb.dump(bst, 'xgb.model.dump')
#' @export #' @export
#' #'
xgb.dump <- function(model, fname, fmap = "", with.stats=FALSE) { xgb.dump <- function(model, fname = NULL, fmap = "", with.stats=FALSE) {
if (class(model) != "xgb.Booster") { if (class(model) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster") stop("xgb.dump: first argument must be type xgb.Booster")
} }
if (typeof(fname) != "character") { if (!class(fname) %in% c("character", "NULL")) {
stop("xgb.dump: second argument must be type character") stop("xgb.dump: second argument must be type character if provided")
} }
result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost") result <- .Call("XGBoosterDumpModel_R", model, fmap, as.integer(with.stats), PACKAGE = "xgboost")
if(is.null(fname)) return(str_split(result, "\n") %>% unlist %>% str_replace_all("\t"," ") %>% Filter(function(x) x != "", .))
writeLines(result, fname) writeLines(result, fname)
#unlist(str_split(a, "\n"))=="" TRUE
return(TRUE)
} }

View File

@ -4,12 +4,12 @@
\alias{xgb.dump} \alias{xgb.dump}
\title{Save xgboost model to text file} \title{Save xgboost model to text file}
\usage{ \usage{
xgb.dump(model, fname, fmap = "", with.stats = FALSE) xgb.dump(model, fname = NULL, fmap = "", with.stats = FALSE)
} }
\arguments{ \arguments{
\item{model}{the model object.} \item{model}{the model object.}
\item{fname}{the name of the binary file.} \item{fname}{the name of the text file where to save the model. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.}
\item{fmap}{feature map file representing the type of feature. \item{fmap}{feature map file representing the type of feature.
Detailed description could be found at Detailed description could be found at
@ -23,6 +23,9 @@ for example Format.}
gain is the approximate loss function gain we get in each split; gain is the approximate loss function gain we get in each split;
cover is the sum of second order gradient in each node.} cover is the sum of second order gradient in each node.}
} }
\value{
If \code{fname} is not provided or is set to \code{NULL}, the function returns the model as a \code{character} vector. Otherwise it returns \code{TRUE}.
}
\description{ \description{
Save a xgboost model to text file. Could be parsed later. Save a xgboost model to text file. Could be parsed later.
} }

View File

@ -27,7 +27,7 @@ Results are returned for both linear and tree models.
There are 3 columns : There are 3 columns :
\itemize{ \itemize{
\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means most important feature regarding the \code{label} used for the training ; \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
\item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
\item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning. \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. \code{Gain} should be preferred to search the most important feature. For boosted linear model, this column has no meaning.
} }