From 71f402ac16ea408321c41473609a1a537570f4f4 Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Sun, 27 Mar 2016 19:22:22 -0500 Subject: [PATCH] convert S4 to S3; add some extra methods to DMatrix --- R-package/NAMESPACE | 12 +- R-package/R/utils.R | 129 +---------- R-package/R/xgb.Booster.R | 144 ++++++++++++ R-package/R/xgb.DMatrix.R | 322 +++++++++++++++++++++++++- R-package/R/xgb.cv.R | 4 +- R-package/man/dim.xgb.DMatrix.Rd | 29 +++ R-package/man/dimnames.xgb.DMatrix.Rd | 36 +++ R-package/man/getinfo.Rd | 13 +- R-package/man/predict.xgb.Booster.Rd | 58 +++++ R-package/man/print.xgb.DMatrix.Rd | 18 ++ R-package/man/setinfo.Rd | 12 +- R-package/man/slice.xgb.DMatrix.Rd | 41 ++++ R-package/man/xgb.DMatrix.Rd | 6 +- 13 files changed, 675 insertions(+), 149 deletions(-) create mode 100644 R-package/R/xgb.Booster.R create mode 100644 R-package/man/dim.xgb.DMatrix.Rd create mode 100644 R-package/man/dimnames.xgb.DMatrix.Rd create mode 100644 R-package/man/predict.xgb.Booster.Rd create mode 100644 R-package/man/print.xgb.DMatrix.Rd create mode 100644 R-package/man/slice.xgb.DMatrix.Rd diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 3cd80d5c2..dd788ca2f 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,6 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method("[",xgb.DMatrix) +S3method("dimnames<-",xgb.DMatrix) +S3method(dim,xgb.DMatrix) +S3method(dimnames,xgb.DMatrix) +S3method(getinfo,xgb.DMatrix) +S3method(predict,xgb.Booster) +S3method(predict,xgb.Booster.handle) +S3method(setinfo,xgb.DMatrix) +S3method(slice,xgb.DMatrix) export(getinfo) +export(print.xgb.DMatrix) export(setinfo) export(slice) export(xgb.DMatrix) @@ -19,8 +29,6 @@ export(xgb.save) export(xgb.save.raw) export(xgb.train) export(xgboost) -exportMethods(nrow) -exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 926e82994..514826b3c 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,4 +1,4 @@ - #' @importClassesFrom Matrix dgCMatrix dgeMatrix +#' @importClassesFrom Matrix dgCMatrix dgeMatrix #' @import methods # depends on matrix @@ -9,131 +9,10 @@ library.dynam.unload("xgboost", libpath) } -# set information into dmatrix, this mutate dmatrix -xgb.setinfo <- function(dmat, name, info) { - if (class(dmat) != "xgb.DMatrix") { - stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") - } - if (name == "label") { - if (length(info) != xgb.numrow(dmat)) - stop("The length of labels must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "weight") { - if (length(info) != xgb.numrow(dmat)) - stop("The length of weights must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "base_margin") { - # if (length(info)!=xgb.numrow(dmat)) - # stop("The length of base margin must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), - PACKAGE = "xgboost") - return(TRUE) - } - if (name == "group") { - if (sum(info) != xgb.numrow(dmat)) - stop("The sum of groups must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), - PACKAGE = "xgboost") - return(TRUE) - } - stop(paste("xgb.setinfo: unknown info name", name)) - return(FALSE) -} -# construct a Booster from cachelist -xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { - if (typeof(cachelist) != "list") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - for (dm in cachelist) { - if (class(dm) != "xgb.DMatrix") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - } - handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") - if (length(params) != 0) { - for (i in 1:length(params)) { - p <- params[i] - .Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p), - PACKAGE = "xgboost") - } - } - if (!is.null(modelfile)) { - if (typeof(modelfile) == "character") { - .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") - } else if (typeof(modelfile) == "raw") { - .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") - } else { - stop("xgb.Booster: modelfile must be character or raw vector") - } - } - return(structure(handle, class = "xgb.Booster.handle")) -} - -# convert xgb.Booster.handle to xgb.Booster -xgb.handleToBooster <- function(handle, raw = NULL) -{ - bst <- list(handle = handle, raw = raw) - class(bst) <- "xgb.Booster" - return(bst) -} - -# Check whether an xgb.Booster object is complete -xgb.Booster.check <- function(bst, saveraw = TRUE) -{ - isnull <- is.null(bst$handle) - if (!isnull) { - isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost") - } - if (isnull) { - bst$handle <- xgb.Booster(modelfile = bst$raw) - } else { - if (is.null(bst$raw) && saveraw) - bst$raw <- xgb.save.raw(bst$handle) - } - return(bst) -} - -## ----the following are low level iteratively function, not needed if +## ----the following are low level iterative functions, not needed if ## you do not want to use them --------------------------------------- -# get dmatrix from data, label -xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { - inClass <- class(data) - if (inClass == "dgCMatrix" || inClass == "matrix") { - if (is.null(label)) { - stop("xgboost: need label when data is a matrix") - } - dtrain <- xgb.DMatrix(data, label = label, missing = missing) - if (!is.null(weight)){ - xgb.setinfo(dtrain, "weight", weight) - } - } else { - if (!is.null(label)) { - warning("xgboost: label will be ignored.") - } - if (inClass == "character") { - dtrain <- xgb.DMatrix(data) - } else if (inClass == "xgb.DMatrix") { - dtrain <- data - } else if (inClass == "data.frame") { - stop("xgboost only support numerical matrix input, - use 'data.matrix' to transform the data.") - } else { - stop("xgboost: Invalid input of data") - } - } - return (dtrain) -} -xgb.numrow <- function(dmat) { - nrow <- .Call("XGDMatrixNumRow_R", dmat, PACKAGE="xgboost") - return(nrow) -} + # iteratively update booster with customized statistics xgb.iter.boost <- function(booster, dtrain, gpair) { if (class(booster) != "xgb.Booster.handle") { @@ -227,7 +106,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { "\tConsider providing pre-computed CV-folds through the folds parameter.") } y <- getinfo(dall, 'label') - randidx <- sample(1 : xgb.numrow(dall)) + randidx <- sample(1 : nrow(dall)) if (stratified & length(y) == length(randidx)) { y <- y[randidx] # diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R new file mode 100644 index 000000000..97f5ea37f --- /dev/null +++ b/R-package/R/xgb.Booster.R @@ -0,0 +1,144 @@ +# Construct a Booster from cachelist +# internal utility function +xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { + if (typeof(cachelist) != "list") { + stop("xgb.Booster: only accepts list of DMatrix as cachelist") + } + for (dm in cachelist) { + if (class(dm) != "xgb.DMatrix") { + stop("xgb.Booster: only accepts list of DMatrix as cachelist") + } + } + handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE = "xgboost") + if (length(params) != 0) { + for (i in 1:length(params)) { + p <- params[i] + .Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p), + PACKAGE = "xgboost") + } + } + if (!is.null(modelfile)) { + if (typeof(modelfile) == "character") { + .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") + } else if (typeof(modelfile) == "raw") { + .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") + } else { + stop("xgb.Booster: modelfile must be character or raw vector") + } + } + return(structure(handle, class = "xgb.Booster.handle")) +} + +# Convert xgb.Booster.handle to xgb.Booster +# internal utility function +xgb.handleToBooster <- function(handle, raw = NULL) +{ + bst <- list(handle = handle, raw = raw) + class(bst) <- "xgb.Booster" + return(bst) +} + +# Check whether an xgb.Booster object is complete +# internal utility function +xgb.Booster.check <- function(bst, saveraw = TRUE) +{ + isnull <- is.null(bst$handle) + if (!isnull) { + isnull <- .Call("XGCheckNullPtr_R", bst$handle, PACKAGE="xgboost") + } + if (isnull) { + bst$handle <- xgb.Booster(modelfile = bst$raw) + } else { + if (is.null(bst$raw) && saveraw) + bst$raw <- xgb.save.raw(bst$handle) + } + return(bst) +} + +#' Predict method for eXtreme Gradient Boosting model +#' +#' Predicted values based on either xgboost model or model handle object. +#' +#' @param object Object of class \code{xgb.Booster} or \code{xgb.Booster.handle} +#' @param newdata takes \code{matrix}, \code{dgCMatrix}, local data file or +#' \code{xgb.DMatrix}. +#' @param missing Missing is only used when input is dense matrix, pick a float +#' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. +#' @param outputmargin whether the prediction should be shown in the original +#' value of sum of functions, when outputmargin=TRUE, the prediction is +#' untransformed margin value. In logistic regression, outputmargin=T will +#' output value before logistic transformation. +#' @param ntreelimit limit number of trees used in prediction, this parameter is +#' only valid for gbtree, but not for gblinear. set it to be value bigger +#' than 0. It will use all trees by default. +#' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object. +#' @param ... Parameters pass to \code{predict.xgb.Booster} +#' +#' @details +#' The option \code{ntreelimit} purpose is to let the user train a model with lots +#' of trees but use only the first trees for prediction to avoid overfitting +#' (without having to train a new model with less trees). +#' +#' The option \code{predleaf} purpose is inspired from §3.1 of the paper +#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +#' The idea is to use the model as a generator of new features which capture non linear link +#' from original features. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' data(agaricus.test, package='xgboost') +#' train <- agaricus.train +#' test <- agaricus.test +#' +#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +#' pred <- predict(bst, test$data) +#' @rdname predict.xgb.Booster +#' @export +predict.xgb.Booster <- function(object, newdata, missing = NA, + outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { + if (class(object) != "xgb.Booster"){ + stop("predict: model in prediction must be of class xgb.Booster") + } else { + object <- xgb.Booster.check(object, saveraw = FALSE) + } + if (class(newdata) != "xgb.DMatrix") { + newdata <- xgb.DMatrix(newdata, missing = missing) + } + if (is.null(ntreelimit)) { + ntreelimit <- 0 + } else { + if (ntreelimit < 1){ + stop("predict: ntreelimit must be equal to or greater than 1") + } + } + option <- 0 + if (outputmargin) { + option <- option + 1 + } + if (predleaf) { + option <- option + 2 + } + ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), + as.integer(ntreelimit), PACKAGE = "xgboost") + if (predleaf){ + len <- getinfo(newdata, "nrow") + if (length(ret) == len){ + ret <- matrix(ret,ncol = 1) + } else { + ret <- matrix(ret, ncol = len) + ret <- t(ret) + } + } + return(ret) +} + +#' @rdname predict.xgb.Booster +#' @export +predict.xgb.Booster.handle <- function(object, ...) { + + bst <- xgb.handleToBooster(object) + + ret <- predict(bst, ...) + return(ret) +} diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index c34c65d95..744c2a05c 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -1,9 +1,9 @@ #' Contruct xgb.DMatrix object #' -#' Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file. +#' Contruct xgb.DMatrix object from dense matrix, sparse matrix +#' or local file (that was created previously by saving an \code{xgb.DMatrix}). #' -#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character -#' indicating the data file. +#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename #' @param info a list of information of the xgb.DMatrix object #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values. @@ -18,27 +18,339 @@ #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { + cnames <- NULL if (typeof(data) == "character") { handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE = "xgboost") } else if (is.matrix(data)) { handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE = "xgboost") + cnames <- colnames(data) } else if (class(data) == "dgCMatrix") { handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE = "xgboost") + cnames <- colnames(data) } else { stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data))) } - dmat <- structure(handle, class = "xgb.DMatrix") + dmat <- handle + attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix") + #dmat <- list(handle = handle, colnames = cnames) + #attr(dmat, 'class') <- "xgb.DMatrix" info <- append(info, list(...)) if (length(info) == 0) return(dmat) for (i in 1:length(info)) { p <- info[i] - xgb.setinfo(dmat, names(p), p[[1]]) + setinfo(dmat, names(p), p[[1]]) } return(dmat) } + + +# get dmatrix from data, label +# internal helper method +xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { + inClass <- class(data) + if (inClass == "dgCMatrix" || inClass == "matrix") { + if (is.null(label)) { + stop("xgboost: need label when data is a matrix") + } + dtrain <- xgb.DMatrix(data, label = label, missing = missing) + if (!is.null(weight)){ + setinfo(dtrain, "weight", weight) + } + } else { + if (!is.null(label)) { + warning("xgboost: label will be ignored.") + } + if (inClass == "character") { + dtrain <- xgb.DMatrix(data) + } else if (inClass == "xgb.DMatrix") { + dtrain <- data + } else if (inClass == "data.frame") { + stop("xgboost only support numerical matrix input, + use 'data.matrix' to transform the data.") + } else { + stop("xgboost: Invalid input of data") + } + } + return (dtrain) +} + + +#' Dimensions of xgb.DMatrix +#' +#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. +#' @param x Object of class \code{xgb.DMatrix} +#' +#' @details +#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also +#' be directly used with an \code{xgb.DMatrix} object. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' stopifnot(nrow(dtrain) == nrow(train$data)) +#' stopifnot(ncol(dtrain) == ncol(train$data)) +#' stopifnot(all(dim(dtrain) == dim(train$data))) +#' +#' @export +dim.xgb.DMatrix <- function(x) { + c(.Call("XGDMatrixNumRow_R", x, PACKAGE="xgboost"), + .Call("XGDMatrixNumCol_R", x, PACKAGE="xgboost")) +} + + +#' Handling of column names of \code{xgb.DMatrix} +#' +#' Only column names are supported for \code{xgb.DMatrix}, thus setting of +#' row names would have no effect and returnten row names would be NULL. +#' +#' @param x object of class \code{xgb.DMatrix} +#' @param value a list of two elements: the first one is ignored +#' and the second one is column names +#' +#' @details +#' Generic \code{dimnames} methods are used by \code{colnames}. +#' Since row names are irrelevant, it is recommended to use \code{colnames} directly. +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' dimnames(dtrain) +#' colnames(dtrain) +#' colnames(dtrain) <- make.names(1:ncol(train$data)) +#' print(dtrain, verbose=TRUE) +#' +#' @rdname dimnames.xgb.DMatrix +#' @export +dimnames.xgb.DMatrix <- function(x) { + attr(x, '.Dimnames') +} + +#' @rdname dimnames.xgb.DMatrix +#' @export +`dimnames<-.xgb.DMatrix` <- function(x, value) { + if (!is.list(value) || length(value) != 2L) + stop("invalid 'dimnames' given: must be a list of two elements") + if (!is.null(value[[1L]])) + stop("xgb.DMatrix does not have rownames") + if (is.null(value[[2]])) { + attr(x, '.Dimnames') <- NULL + return(x) + } + if (ncol(x) != length(value[[2]])) + stop("can't assign ", length(value[[2]]), " colnames to a ", + ncol(x), " column xgb.DMatrix") + attr(x, '.Dimnames') <- value + x +} + + +#' Get information of an xgb.DMatrix object +#' +#' Get information of an xgb.DMatrix object +#' @param object Object of class \code{xgb.DMatrix} +#' @param name the name of the information field to get (see details) +#' @param ... other parameters +#' +#' @details +#' The \code{name} field can be one of the following: +#' +#' \itemize{ +#' \item \code{label}: label Xgboost learn from ; +#' \item \code{weight}: to do a weight rescale ; +#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; +#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}. +#' } +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' labels <- getinfo(dtrain, 'label') +#' setinfo(dtrain, 'label', 1-labels) +#' +#' labels2 <- getinfo(dtrain, 'label') +#' stopifnot(all(labels2 == 1-labels)) +#' @rdname getinfo +#' @export +getinfo <- function(object, ...) UseMethod("getinfo") + +#' @rdname getinfo +#' @export +getinfo.xgb.DMatrix <- function(object, name) { + if (typeof(name) != "character") { + stop("getinfo: name must be character") + } + if (name != "label" && name != "weight" && + name != "base_margin" && name != "nrow") { + stop(paste("getinfo: unknown info name", name)) + } + if (name != "nrow"){ + ret <- .Call("XGDMatrixGetInfo_R", object, name, PACKAGE = "xgboost") + } else { + ret <- nrow(object) + } + return(ret) +} + + +#' Set information of an xgb.DMatrix object +#' +#' Set information of an xgb.DMatrix object +#' +#' @param object Object of class "xgb.DMatrix" +#' @param name the name of the field to get +#' @param info the specific field of information to set +#' @param ... other parameters +#' +#' @details +#' The \code{name} field can be one of the following: +#' +#' \itemize{ +#' \item \code{label}: label Xgboost learn from ; +#' \item \code{weight}: to do a weight rescale ; +#' \item \code{base_margin}: base margin is the base prediction Xgboost will boost from ; +#' \item \code{group}. +#' } +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' labels <- getinfo(dtrain, 'label') +#' setinfo(dtrain, 'label', 1-labels) +#' labels2 <- getinfo(dtrain, 'label') +#' stopifnot(all.equal(labels2, 1-labels)) +#' @rdname setinfo +#' @export +setinfo <- function(object, ...) UseMethod("setinfo") + +#' @rdname setinfo +#' @export +setinfo.xgb.DMatrix <- function(object, name, info) { + if (name == "label") { + if (length(info) != nrow(object)) + stop("The length of labels must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "weight") { + if (length(info) != nrow(object)) + stop("The length of weights must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "base_margin") { + # if (length(info)!=nrow(object)) + # stop("The length of base margin must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.numeric(info), + PACKAGE = "xgboost") + return(TRUE) + } + if (name == "group") { + if (sum(info) != nrow(object)) + stop("The sum of groups must equal to the number of rows in the input data") + .Call("XGDMatrixSetInfo_R", object, name, as.integer(info), + PACKAGE = "xgboost") + return(TRUE) + } + stop(paste("setinfo: unknown info name", name)) + return(FALSE) +} + + +#' Get a new DMatrix containing the specified rows of +#' orginal xgb.DMatrix object +#' +#' Get a new DMatrix containing the specified rows of +#' orginal xgb.DMatrix object +#' +#' @param object Object of class "xgb.DMatrix" +#' @param idxset a integer vector of indices of rows needed +#' @param colset currently not used (columns subsetting is not available) +#' @param ... other parameters (currently not used) +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' train <- agaricus.train +#' dtrain <- xgb.DMatrix(train$data, label=train$label) +#' +#' dsub <- slice(dtrain, 1:42) +#' labels1 <- getinfo(dsub, 'label') +#' dsub <- dtrain[1:42, ] +#' labels2 <- getinfo(dsub, 'label') +#' all.equal(labels1, labels2) +#' +#' @rdname slice.xgb.DMatrix +#' @export +slice <- function(object, ...) UseMethod("slice") + +#' @rdname slice.xgb.DMatrix +#' @export +slice.xgb.DMatrix <- function(object, idxset, ...) { + if (class(object) != "xgb.DMatrix") { + stop("slice: first argument dtrain must be xgb.DMatrix") + } + ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") + + attr_list <- attributes(object) + nr <- nrow(object) + len <- sapply(attr_list, length) + ind <- which(len == nr) + if (length(ind) > 0) { + nms <- names(attr_list)[ind] + for (i in 1:length(ind)) { + attr(ret, nms[i]) <- attr(object, nms[i])[idxset] + } + } + return(structure(ret, class = "xgb.DMatrix")) +} + +#' @rdname slice.xgb.DMatrix +#' @export +`[.xgb.DMatrix` <- function(object, idxset, colset=NULL) { + slice(object, idxset) +} + + +#' Print xgb.DMatrix +#' +#' Print information about an xgb.DMatrix. +#' Currently is would display +#' @param x an xgb.DMatrix object +#' @param verbose whether to print colnames (when present) +#' +#' @export +print.xgb.DMatrix <- function(x, verbose=FALSE, ...) { + cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') + infos <- c() + if(length(getinfo(x, 'label')) > 0) infos <- 'label' + if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') + if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') + if (length(infos) == 0) infos <- 'NA' + cat(infos) + cnames <- colnames(x) + cat(' colnames:') + if (verbose & !is.null(cnames)) { + cat("\n'") + cat(cnames, sep="','") + cat("'") + } else { + if (is.null(cnames)) cat(' no') + else cat(' yes') + } + cat("\n") + invisible(x) +} diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 7ad2e5c41..e3faf33a0 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -169,11 +169,11 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = num_class <- params[['num_class']] if (is.null(num_class)) stop('must set num_class to use softmax') - predictValues <- matrix(0,xgb.numrow(dtrain),num_class) + predictValues <- matrix(0, nrow(dtrain), num_class) mat_pred <- TRUE } else - predictValues <- rep(0,xgb.numrow(dtrain)) + predictValues <- rep(0, nrow(dtrain)) history <- c() print.every.n <- max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { diff --git a/R-package/man/dim.xgb.DMatrix.Rd b/R-package/man/dim.xgb.DMatrix.Rd new file mode 100644 index 000000000..168782dec --- /dev/null +++ b/R-package/man/dim.xgb.DMatrix.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{dim.xgb.DMatrix} +\alias{dim.xgb.DMatrix} +\title{Dimensions of xgb.DMatrix} +\usage{ +\method{dim}{xgb.DMatrix}(x) +} +\arguments{ +\item{x}{Object of class \code{xgb.DMatrix}} +} +\description{ +Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. +} +\details{ +Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also +be directly used with an \code{xgb.DMatrix} object. +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) + +stopifnot(nrow(dtrain) == nrow(train$data)) +stopifnot(ncol(dtrain) == ncol(train$data)) +stopifnot(all(dim(dtrain) == dim(train$data))) + +} + diff --git a/R-package/man/dimnames.xgb.DMatrix.Rd b/R-package/man/dimnames.xgb.DMatrix.Rd new file mode 100644 index 000000000..0877f294b --- /dev/null +++ b/R-package/man/dimnames.xgb.DMatrix.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{dimnames.xgb.DMatrix} +\alias{dimnames.xgb.DMatrix} +\alias{dimnames<-.xgb.DMatrix} +\title{Handling of column names of \code{xgb.DMatrix}} +\usage{ +\method{dimnames}{xgb.DMatrix}(x) + +\method{dimnames}{xgb.DMatrix}(x) <- value +} +\arguments{ +\item{x}{object of class \code{xgb.DMatrix}} + +\item{value}{a list of two elements: the first one is ignored +and the second one is column names} +} +\description{ +Only column names are supported for \code{xgb.DMatrix}, thus setting of +row names would have no effect and returnten row names would be NULL. +} +\details{ +Generic \code{dimnames} methods are used by \code{colnames}. +Since row names are irrelevant, it is recommended to use \code{colnames} directly. +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) +dimnames(dtrain) +colnames(dtrain) +colnames(dtrain) <- make.names(1:ncol(train$data)) +print(dtrain, verbose=TRUE) + +} + diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index f8b4f6b99..108d7edf2 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,27 +1,26 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getinfo.xgb.DMatrix.R -\docType{methods} +% Please edit documentation in R/xgb.DMatrix.R \name{getinfo} \alias{getinfo} -\alias{getinfo,xgb.DMatrix-method} +\alias{getinfo.xgb.DMatrix} \title{Get information of an xgb.DMatrix object} \usage{ getinfo(object, ...) -\S4method{getinfo}{xgb.DMatrix}(object, name) +\method{getinfo}{xgb.DMatrix}(object, name) } \arguments{ \item{object}{Object of class \code{xgb.DMatrix}} \item{...}{other parameters} -\item{name}{the name of the field to get} +\item{name}{the name of the information field to get (see details)} } \description{ Get information of an xgb.DMatrix object } \details{ -The information can be one of the following: +The \code{name} field can be one of the following: \itemize{ \item \code{label}: label Xgboost learn from ; @@ -34,8 +33,10 @@ The information can be one of the following: data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) + labels <- getinfo(dtrain, 'label') setinfo(dtrain, 'label', 1-labels) + labels2 <- getinfo(dtrain, 'label') stopifnot(all(labels2 == 1-labels)) } diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd new file mode 100644 index 000000000..504037937 --- /dev/null +++ b/R-package/man/predict.xgb.Booster.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.Booster.R +\name{predict.xgb.Booster} +\alias{predict.xgb.Booster} +\alias{predict.xgb.Booster.handle} +\title{Predict method for eXtreme Gradient Boosting model} +\usage{ +\method{predict}{xgb.Booster}(object, newdata, missing = NA, + outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) + +\method{predict}{xgb.Booster.handle}(object, ...) +} +\arguments{ +\item{object}{Object of class \code{xgb.Booster} or \code{xgb.Booster.handle}} + +\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\code{xgb.DMatrix}.} + +\item{missing}{Missing is only used when input is dense matrix, pick a float +value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} + +\item{outputmargin}{whether the prediction should be shown in the original +value of sum of functions, when outputmargin=TRUE, the prediction is +untransformed margin value. In logistic regression, outputmargin=T will +output value before logistic transformation.} + +\item{ntreelimit}{limit number of trees used in prediction, this parameter is +only valid for gbtree, but not for gblinear. set it to be value bigger +than 0. It will use all trees by default.} + +\item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.} + +\item{...}{Parameters pass to \code{predict.xgb.Booster}} +} +\description{ +Predicted values based on either xgboost model or model handle object. +} +\details{ +The option \code{ntreelimit} purpose is to let the user train a model with lots +of trees but use only the first trees for prediction to avoid overfitting +(without having to train a new model with less trees). + +The option \code{predleaf} purpose is inspired from §3.1 of the paper +\code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +The idea is to use the model as a generator of new features which capture non linear link +from original features. +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +train <- agaricus.train +test <- agaricus.test + +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +pred <- predict(bst, test$data) +} + diff --git a/R-package/man/print.xgb.DMatrix.Rd b/R-package/man/print.xgb.DMatrix.Rd new file mode 100644 index 000000000..92c87eec8 --- /dev/null +++ b/R-package/man/print.xgb.DMatrix.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{print.xgb.DMatrix} +\alias{print.xgb.DMatrix} +\title{Print xgb.DMatrix} +\usage{ +print.xgb.DMatrix(x, verbose = FALSE, ...) +} +\arguments{ +\item{x}{an xgb.DMatrix object} + +\item{verbose}{whether to print colnames (when present)} +} +\description{ +Print information about an xgb.DMatrix. +Currently is would display +} + diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index cb939721e..b182a9042 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,14 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/setinfo.xgb.DMatrix.R -\docType{methods} +% Please edit documentation in R/xgb.DMatrix.R \name{setinfo} \alias{setinfo} -\alias{setinfo,xgb.DMatrix-method} +\alias{setinfo.xgb.DMatrix} \title{Set information of an xgb.DMatrix object} \usage{ setinfo(object, ...) -\S4method{setinfo}{xgb.DMatrix}(object, name, info) +\method{setinfo}{xgb.DMatrix}(object, name, info) } \arguments{ \item{object}{Object of class "xgb.DMatrix"} @@ -23,7 +22,7 @@ setinfo(object, ...) Set information of an xgb.DMatrix object } \details{ -It can be one of the following: +The \code{name} field can be one of the following: \itemize{ \item \code{label}: label Xgboost learn from ; @@ -36,9 +35,10 @@ It can be one of the following: data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) + labels <- getinfo(dtrain, 'label') setinfo(dtrain, 'label', 1-labels) labels2 <- getinfo(dtrain, 'label') -stopifnot(all(labels2 == 1-labels)) +stopifnot(all.equal(labels2, 1-labels)) } diff --git a/R-package/man/slice.xgb.DMatrix.Rd b/R-package/man/slice.xgb.DMatrix.Rd new file mode 100644 index 000000000..c352203fb --- /dev/null +++ b/R-package/man/slice.xgb.DMatrix.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.DMatrix.R +\name{slice} +\alias{[.xgb.DMatrix} +\alias{slice} +\alias{slice.xgb.DMatrix} +\title{Get a new DMatrix containing the specified rows of +orginal xgb.DMatrix object} +\usage{ +slice(object, ...) + +\method{slice}{xgb.DMatrix}(object, idxset, ...) + +\method{[}{xgb.DMatrix}(object, idxset, colset = NULL) +} +\arguments{ +\item{object}{Object of class "xgb.DMatrix"} + +\item{...}{other parameters (currently not used)} + +\item{idxset}{a integer vector of indices of rows needed} + +\item{colset}{currently not used (columns subsetting is not available)} +} +\description{ +Get a new DMatrix containing the specified rows of +orginal xgb.DMatrix object +} +\examples{ +data(agaricus.train, package='xgboost') +train <- agaricus.train +dtrain <- xgb.DMatrix(train$data, label=train$label) + +dsub <- slice(dtrain, 1:42) +labels1 <- getinfo(dsub, 'label') +dsub <- dtrain[1:42, ] +labels2 <- getinfo(dsub, 'label') +all.equal(labels1, labels2) + +} + diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 2e892cc6d..a960d585a 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -7,8 +7,7 @@ xgb.DMatrix(data, info = list(), missing = NA, ...) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character -indicating the data file.} +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename} \item{info}{a list of information of the xgb.DMatrix object} @@ -18,7 +17,8 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item{...}{other information to pass to \code{info}.} } \description{ -Contruct xgb.DMatrix object from dense matrix, sparse matrix or local file. +Contruct xgb.DMatrix object from dense matrix, sparse matrix +or local file (that is a previously saved \code{xgb.DMatrix}). } \examples{ data(agaricus.train, package='xgboost')