convert S4 to S3; add some extra methods to DMatrix

This commit is contained in:
Vadim Khotilovich
2016-03-27 19:22:22 -05:00
parent d27bfb61b0
commit 71f402ac16
13 changed files with 675 additions and 149 deletions

View File

@@ -1,4 +1,4 @@
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @import methods
# depends on matrix
@@ -9,131 +9,10 @@
library.dynam.unload("xgboost", libpath)
}
# set information into dmatrix, this mutate dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix")
}
if (name == "label") {
if (length(info) != xgb.numrow(dmat))
stop("The length of labels must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "weight") {
if (length(info) != xgb.numrow(dmat))
stop("The length of weights must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "base_margin") {
# if (length(info)!=xgb.numrow(dmat))
# stop("The length of base margin must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "group") {
if (sum(info) != xgb.numrow(dmat))
stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info),
PACKAGE = "xgboost")
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
PACKAGE = "xgboost")
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else {
stop("xgb.Booster: modelfile must be character or raw vector")
}
}
return(structure(handle, class = "xgb.Booster.handle"))
}
# convert xgb.Booster.handle to xgb.Booster
xgb.handleToBooster <- function(handle, raw = NULL)
{
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
# Check whether an xgb.Booster object is complete
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
isnull <- is.null(bst$handle)
if (!isnull) {
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
}
if (isnull) {
bst$handle <- xgb.Booster(modelfile = bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)
}
return(bst)
}
## ----the following are low level iteratively function, not needed if
## ----the following are low level iterative functions, not needed if
## you do not want to use them ---------------------------------------
# get dmatrix from data, label
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
if (!is.null(weight)){
xgb.setinfo(dtrain, "weight", weight)
}
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (inClass == "character") {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
dtrain <- data
} else if (inClass == "data.frame") {
stop("xgboost only support numerical matrix input,
use 'data.matrix' to transform the data.")
} else {
stop("xgboost: Invalid input of data")
}
}
return (dtrain)
}
xgb.numrow <- function(dmat) {
nrow <- .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost")
return(nrow)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster.handle") {
@@ -227,7 +106,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) {
"\tConsider providing pre-computed CV-folds through the folds parameter.")
}
y <- getinfo(dall, 'label')
randidx <- sample(1 : xgb.numrow(dall))
randidx <- sample(1 : nrow(dall))
if (stratified & length(y) == length(randidx)) {
y <- y[randidx]
#

144
R-package/R/xgb.Booster.R Normal file
View File

@@ -0,0 +1,144 @@
# Construct a Booster from cachelist
# internal utility function
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
PACKAGE = "xgboost")
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) == "character") {
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost")
} else if (typeof(modelfile) == "raw") {
.Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost")
} else {
stop("xgb.Booster: modelfile must be character or raw vector")
}
}
return(structure(handle, class = "xgb.Booster.handle"))
}
# Convert xgb.Booster.handle to xgb.Booster
# internal utility function
xgb.handleToBooster <- function(handle, raw = NULL)
{
bst <- list(handle = handle, raw = raw)
class(bst) <- "xgb.Booster"
return(bst)
}
# Check whether an xgb.Booster object is complete
# internal utility function
xgb.Booster.check <- function(bst, saveraw = TRUE)
{
isnull <- is.null(bst$handle)
if (!isnull) {
isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost")
}
if (isnull) {
bst$handle <- xgb.Booster(modelfile = bst$raw)
} else {
if (is.null(bst$raw) && saveraw)
bst$raw <- xgb.save.raw(bst$handle)
}
return(bst)
}
#' Predict method for eXtreme Gradient Boosting model
#'
#' Predicted values based on either xgboost model or model handle object.
#'
#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}
#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or
#' \code{xgb.DMatrix}.
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
#' @param outputmargin whether the prediction should be shown in the original
#' value of sum of functions, when outputmargin=TRUE, the prediction is
#' untransformed margin value. In logistic regression, outputmargin=T will
#' output value before logistic transformation.
#' @param ntreelimit limit number of trees used in prediction, this parameter is
#' only valid for gbtree, but not for gblinear. set it to be value bigger
#' than 0. It will use all trees by default.
#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
#' @param ... Parameters pass to \code{predict.xgb.Booster}
#'
#' @details
#' The option \code{ntreelimit} purpose is to let the user train a model with lots
#' of trees but use only the first trees for prediction to avoid overfitting
#' (without having to train a new model with less trees).
#'
#' The option \code{predleaf} purpose is inspired from §3.1 of the paper
#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
#' The idea is to use the model as a generator of new features which capture non linear link
#' from original features.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic")
#' pred <- predict(bst, test$data)
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster <- function(object, newdata, missing = NA,
outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) {
if (class(object) != "xgb.Booster"){
stop("predict: model in prediction must be of class xgb.Booster")
} else {
object <- xgb.Booster.check(object, saveraw = FALSE)
}
if (class(newdata) != "xgb.DMatrix") {
newdata <- xgb.DMatrix(newdata, missing = missing)
}
if (is.null(ntreelimit)) {
ntreelimit <- 0
} else {
if (ntreelimit < 1){
stop("predict: ntreelimit must be equal to or greater than 1")
}
}
option <- 0
if (outputmargin) {
option <- option + 1
}
if (predleaf) {
option <- option + 2
}
ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option),
as.integer(ntreelimit), PACKAGE = "xgboost")
if (predleaf){
len <- getinfo(newdata, "nrow")
if (length(ret) == len){
ret <- matrix(ret,ncol = 1)
} else {
ret <- matrix(ret, ncol = len)
ret <- t(ret)
}
}
return(ret)
}
#' @rdname predict.xgb.Booster
#' @export
predict.xgb.Booster.handle <- function(object, ...) {
bst <- xgb.handleToBooster(object)
ret <- predict(bst, ...)
return(ret)
}

View File

@@ -1,9 +1,9 @@
#' Contruct xgb.DMatrix object
#'
#' Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file.
#' Contruct xgb.DMatrix object from dense matrix, sparse matrix
#' or local file (that was created previously by saving an \code{xgb.DMatrix}).
#'
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character
#' indicating the data file.
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param info a list of information of the xgb.DMatrix object
#' @param missing Missing is only used when input is dense matrix, pick a float
#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.
@@ -18,27 +18,339 @@
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, ...) {
cnames <- NULL
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE),
PACKAGE = "xgboost")
} else if (is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing,
PACKAGE = "xgboost")
cnames <- colnames(data)
} else if (class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x,
PACKAGE = "xgboost")
cnames <- colnames(data)
} else {
stop(paste("xgb.DMatrix: does not support to construct from ",
typeof(data)))
}
dmat <- structure(handle, class = "xgb.DMatrix")
dmat <- handle
attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix")
#dmat <- list(handle = handle, colnames = cnames)
#attr(dmat, 'class') <- "xgb.DMatrix"
info <- append(info, list(...))
if (length(info) == 0)
return(dmat)
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
setinfo(dmat, names(p), p[[1]])
}
return(dmat)
}
# get dmatrix from data, label
# internal helper method
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) {
inClass <- class(data)
if (inClass == "dgCMatrix" || inClass == "matrix") {
if (is.null(label)) {
stop("xgboost: need label when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label, missing = missing)
if (!is.null(weight)){
setinfo(dtrain, "weight", weight)
}
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (inClass == "character") {
dtrain <- xgb.DMatrix(data)
} else if (inClass == "xgb.DMatrix") {
dtrain <- data
} else if (inClass == "data.frame") {
stop("xgboost only support numerical matrix input,
use 'data.matrix' to transform the data.")
} else {
stop("xgboost: Invalid input of data")
}
}
return (dtrain)
}
#' Dimensions of xgb.DMatrix
#'
#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{xgb.DMatrix} object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#'
#' @export
dim.xgb.DMatrix <- function(x) {
c(.Call("XGDMatrixNumRow_R", x, PACKAGE="xgboost"),
.Call("XGDMatrixNumCol_R", x, PACKAGE="xgboost"))
}
#' Handling of column names of \code{xgb.DMatrix}
#'
#' Only column names are supported for \code{xgb.DMatrix}, thus setting of
#' row names would have no effect and returnten row names would be NULL.
#'
#' @param x object of class \code{xgb.DMatrix}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose=TRUE)
#'
#' @rdname dimnames.xgb.DMatrix
#' @export
dimnames.xgb.DMatrix <- function(x) {
attr(x, '.Dimnames')
}
#' @rdname dimnames.xgb.DMatrix
#' @export
`dimnames<-.xgb.DMatrix` <- function(x, value) {
if (!is.list(value) || length(value) != 2L)
stop("invalid 'dimnames' given: must be a list of two elements")
if (!is.null(value[[1L]]))
stop("xgb.DMatrix does not have rownames")
if (is.null(value[[2]])) {
attr(x, '.Dimnames') <- NULL
return(x)
}
if (ncol(x) != length(value[[2]]))
stop("can't assign ", length(value[[2]]), " colnames to a ",
ncol(x), " column xgb.DMatrix")
attr(x, '.Dimnames') <- value
x
}
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#' @param object Object of class \code{xgb.DMatrix}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#'
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, ...) UseMethod("getinfo")
#' @rdname getinfo
#' @export
getinfo.xgb.DMatrix <- function(object, name) {
if (typeof(name) != "character") {
stop("getinfo: name must be character")
}
if (name != "label" && name != "weight" &&
name != "base_margin" && name != "nrow") {
stop(paste("getinfo: unknown info name", name))
}
if (name != "nrow"){
ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost")
} else {
ret <- nrow(object)
}
return(ret)
}
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param info the specific field of information to set
#' @param ... other parameters
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label Xgboost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ;
#' \item \code{group}.
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, ...) UseMethod("setinfo")
#' @rdname setinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info) {
if (name == "label") {
if (length(info) != nrow(object))
stop("The length of labels must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "weight") {
if (length(info) != nrow(object))
stop("The length of weights must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "base_margin") {
# if (length(info)!=nrow(object))
# stop("The length of base margin must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", object, name, as.numeric(info),
PACKAGE = "xgboost")
return(TRUE)
}
if (name == "group") {
if (sum(info) != nrow(object))
stop("The sum of groups must equal to the number of rows in the input data")
.Call("XGDMatrixSetInfo_R", object, name, as.integer(info),
PACKAGE = "xgboost")
return(TRUE)
}
stop(paste("setinfo: unknown info name", name))
return(FALSE)
}
#' Get a new DMatrix containing the specified rows of
#' orginal xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' orginal xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param idxset a integer vector of indices of rows needed
#' @param colset currently not used (columns subsetting is not available)
#' @param ... other parameters (currently not used)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#'
#' dsub <- slice(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' all.equal(labels1, labels2)
#'
#' @rdname slice.xgb.DMatrix
#' @export
slice <- function(object, ...) UseMethod("slice")
#' @rdname slice.xgb.DMatrix
#' @export
slice.xgb.DMatrix <- function(object, idxset, ...) {
if (class(object) != "xgb.DMatrix") {
stop("slice: first argument dtrain must be xgb.DMatrix")
}
ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost")
attr_list <- attributes(object)
nr <- nrow(object)
len <- sapply(attr_list, length)
ind <- which(len == nr)
if (length(ind) > 0) {
nms <- names(attr_list)[ind]
for (i in 1:length(ind)) {
attr(ret, nms[i]) <- attr(object, nms[i])[idxset]
}
}
return(structure(ret, class = "xgb.DMatrix"))
}
#' @rdname slice.xgb.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset=NULL) {
slice(object, idxset)
}
#' Print xgb.DMatrix
#'
#' Print information about an xgb.DMatrix.
#' Currently is would display
#' @param x an xgb.DMatrix object
#' @param verbose whether to print colnames (when present)
#'
#' @export
print.xgb.DMatrix <- function(x, verbose=FALSE, ...) {
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- c()
if(length(getinfo(x, 'label')) > 0) infos <- 'label'
if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
if (length(infos) == 0) infos <- 'NA'
cat(infos)
cnames <- colnames(x)
cat(' colnames:')
if (verbose & !is.null(cnames)) {
cat("\n'")
cat(cnames, sep="','")
cat("'")
} else {
if (is.null(cnames)) cat(' no')
else cat(' yes')
}
cat("\n")
invisible(x)
}

View File

@@ -169,11 +169,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
num_class <- params[['num_class']]
if (is.null(num_class))
stop('must set num_class to use softmax')
predictValues <- matrix(0,xgb.numrow(dtrain),num_class)
predictValues <- matrix(0, nrow(dtrain), num_class)
mat_pred <- TRUE
}
else
predictValues <- rep(0,xgb.numrow(dtrain))
predictValues <- rep(0, nrow(dtrain))
history <- c()
print.every.n <- max(as.integer(print.every.n), 1L)
for (i in 1:nrounds) {