diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index 7cb36f9dc..ce2d7d901 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -1,12 +1,18 @@
 importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
+
+export(xgboost)
 export(xgb.DMatrix)
 export(xgb.getinfo)
 export(xgb.setinfo)
-export(xgb.Booster)
-export(xgb.train)
-export(xgb.save)
-export(xgb.predict)
-export(xgb.dump)
+
+# exportClasses(xgb.Booster)
+exportMethods(predict)
+
+# export(xgb.Booster)
+# export(xgb.train)
+# export(xgb.save)
+# export(xgb.predict)
+# export(xgb.dump)
diff --git a/R-package/R/predict.xgboost.R b/R-package/R/predict.xgboost.R
new file mode 100644
index 000000000..25cb5d6aa
--- /dev/null
+++ b/R-package/R/predict.xgboost.R
@@ -0,0 +1,16 @@
+#' @export
+setClass("xgb.Booster")
+
+#' @export
+setMethod("predict",
+          signature = "xgb.Booster",
+          definition = function(object, newdata, outputmargin = FALSE)
+          {
+            if (class(newdata) != "xgb.DMatrix") {
+              newdata = xgb.DMatrix(newdata)
+            }
+            ret <- .Call("XGBoosterPredict_R", object, newdata,
+                         as.integer(outputmargin), PACKAGE="xgboost")
+            return(ret)
+          })
+
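The S4 `predict` method above dispatches on `xgb.Booster` and wraps any non-DMatrix `newdata` (a file path, a dense matrix, or a `dgCMatrix`) in `xgb.DMatrix` before calling into the C side. A rough usage sketch, assuming the agaricus demo files shipped in `inst/examples` and the helpers defined in `xgb.Utils.R` below:

```r
require(xgboost)
# build training data and fit a small model via the low-level API
dtrain <- xgb.DMatrix("agaricus.txt.train")
param <- list("bst:max_depth"=2, "bst:eta"=1, "silent"=1,
              "objective"="binary:logistic")
bst <- xgb.train(param, dtrain, nrounds=2)
# predict() dispatches on xgb.Booster; the file path is converted to an
# xgb.DMatrix internally before prediction
preds <- predict(bst, "agaricus.txt.test")
```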
diff --git a/R-package/R/xgb.Utils.R b/R-package/R/xgb.Utils.R
new file mode 100644
index 000000000..d9710121b
--- /dev/null
+++ b/R-package/R/xgb.Utils.R
@@ -0,0 +1,222 @@
+# depends on Matrix
+.onLoad <- function(libname, pkgname) {
+  library.dynam("xgboost", pkgname, libname);
+}
+.onUnload <- function(libpath) {
+  library.dynam.unload("xgboost", libpath);
+}
+
+# construct a DMatrix from a file, a dense matrix, or a dgCMatrix
+xgb.DMatrix <- function(data, info=list(), missing=0.0) {
+  if (typeof(data) == "character") {
+    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
+  } else if (is.matrix(data)) {
+    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
+  } else if (class(data) == "dgCMatrix") {
+    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
+  } else {
+    stop(paste("xgb.DMatrix: cannot construct DMatrix from", typeof(data)))
+  }
+  dmat <- structure(handle, class="xgb.DMatrix")
+  if (length(info) != 0) {
+    for (i in 1:length(info)) {
+      p <- info[i]
+      xgb.setinfo(dmat, names(p), p[[1]])
+    }
+  }
+  return(dmat)
+}
+# get information from a DMatrix
+xgb.getinfo <- function(dmat, name) {
+  if (typeof(name) != "character") {
+    stop("xgb.getinfo: name must be character")
+  }
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.getinfo: first argument dmat must be xgb.DMatrix");
+  }
+  if (name != "label" &&
+      name != "weight" &&
+      name != "base_margin") {
+    stop(paste("xgb.getinfo: unknown info name", name))
+  }
+  ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
+  return(ret)
+}
+# set information into a DMatrix; this mutates the DMatrix in place
+xgb.setinfo <- function(dmat, name, info) {
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.setinfo: first argument dmat must be xgb.DMatrix");
+  }
+  if (name == "label") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  if (name == "weight") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  if (name == "base_margin") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  if (name == "group") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  stop(paste("xgb.setinfo: unknown info name", name))
+  return(FALSE)
+}
+# construct a Booster from cachelist
+xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
+  if (typeof(cachelist) != "list") {
+    stop("xgb.Booster: only accepts list of DMatrix as cachelist")
+  }
+  for (dm in cachelist) {
+    if (class(dm) != "xgb.DMatrix") {
+      stop("xgb.Booster: only accepts list of DMatrix as cachelist")
+    }
+  }
+  handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE="xgboost")
+  .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
+  if (length(params) != 0) {
+    for (i in 1:length(params)) {
+      p <- params[i]
+      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
+    }
+  }
+  if (!is.null(modelfile)) {
+    if (typeof(modelfile) != "character") {
+      stop("xgb.Booster: modelfile must be character");
+    }
+    .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE="xgboost")
+  }
+  return(structure(handle, class="xgb.Booster"))
+}
+# train a model using given parameters
+xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
+  if (typeof(params) != "list") {
+    stop("xgb.train: first argument params must be list");
+  }
+  if (class(dtrain) != "xgb.DMatrix") {
+    stop("xgb.train: second argument dtrain must be xgb.DMatrix");
+  }
+  bst <- xgb.Booster(params, append(watchlist, dtrain))
+  for (i in 1:nrounds) {
+    if (is.null(obj)) {
+      succ <- xgb.iter.update(bst, dtrain, i-1)
+    } else {
+      pred <- xgb.predict(bst, dtrain)
+      gpair <- obj(pred, dtrain)
+      succ <- xgb.iter.boost(bst, dtrain, gpair)
+    }
+    if (length(watchlist) != 0) {
+      if (is.null(feval)) {
+        msg <- xgb.iter.eval(bst, watchlist, i-1)
+        cat(msg); cat("\n")
+      } else {
+        cat("["); cat(i); cat("]");
+        for (j in 1:length(watchlist)) {
+          w <- watchlist[j]
+          if (length(names(w)) == 0) {
+            stop("xgb.train: a name tag must be present for every element in watchlist")
+          }
+          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
+          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
+          cat(":"); cat(ret$value)
+        }
+        cat("\n")
+      }
+    }
+  }
+  return(bst)
+}
+# save a model or a DMatrix to file
+xgb.save <- function(handle, fname) {
+  if (typeof(fname) != "character") {
+    stop("xgb.save: fname must be character")
+  }
+  if (class(handle) == "xgb.Booster") {
+    .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
+    return(TRUE)
+  }
+  if (class(handle) == "xgb.DMatrix") {
+    .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
+    return(TRUE)
+  }
+  stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
+  return(FALSE)
+}
+# predict
+xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
+  if (class(booster) != "xgb.Booster") {
+    stop("xgb.predict: first argument must be type xgb.Booster")
+  }
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.predict: second argument must be type xgb.DMatrix")
+  }
+  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
+  return(ret)
+}
+# dump model
+xgb.dump <- function(booster, fname, fmap = "") {
+  if (class(booster) != "xgb.Booster") {
+    stop("xgb.dump: first argument must be type xgb.Booster")
+  }
+  if (typeof(fname) != "character") {
+    stop("xgb.dump: second argument must be type character")
+  }
+  .Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
+  return(TRUE)
+}
+##----------------------------------------------------------------------
+# the following are low-level iteration functions; they are only needed
+# when you want fine-grained control over the boosting process
+##----------------------------------------------------------------------
+# run one boosting iteration of booster on dtrain
+xgb.iter.update <- function(booster, dtrain, iter) {
+  if (class(booster) != "xgb.Booster") {
+    stop("xgb.iter.update: first argument must be type xgb.Booster")
+  }
+  if (class(dtrain) != "xgb.DMatrix") {
+    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
+  }
+  .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
+  return(TRUE)
+}
+# run one boosting iteration with customized gradient statistics
+xgb.iter.boost <- function(booster, dtrain, gpair) {
+  if (class(booster) != "xgb.Booster") {
+    stop("xgb.iter.boost: first argument must be type xgb.Booster")
+  }
+  if (class(dtrain) != "xgb.DMatrix") {
+    stop("xgb.iter.boost: second argument must be type xgb.DMatrix")
+  }
+  .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
+  return(TRUE)
+}
+# evaluate booster on a watchlist for one iteration
+xgb.iter.eval <- function(booster, watchlist, iter) {
+  if (class(booster) != "xgb.Booster") {
+    stop("xgb.iter.eval: first argument must be type xgb.Booster")
+  }
+  if (typeof(watchlist) != "list") {
+    stop("xgb.iter.eval: only accepts list of DMatrix as watchlist")
+  }
+  for (w in watchlist) {
+    if (class(w) != "xgb.DMatrix") {
+      stop("xgb.iter.eval: watchlist can only contain xgb.DMatrix")
+    }
+  }
+  evnames <- list()
+  if (length(watchlist) != 0) {
+    for (i in 1:length(watchlist)) {
+      w <- watchlist[i]
+      if (length(names(w)) == 0) {
+        stop("xgb.iter.eval: a name tag must be present for every element in watchlist")
+      }
+      evnames <- append(evnames, names(w))
+    }
+  }
+  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
+  return(msg)
+}
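The `xgb.iter.*` functions expose one boosting round at a time; `xgb.train` itself is just a loop over them. A sketch of a hand-rolled training loop under the same agaricus setup as the demos (note that `xgb.Booster` is not exported in this NAMESPACE revision, so from outside the package it would need `xgboost:::xgb.Booster`):

```r
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest  <- xgb.DMatrix("agaricus.txt.test")
param  <- list("bst:max_depth"=2, "bst:eta"=1, "silent"=1,
               "objective"="binary:logistic")
# cache both matrices in the booster, as xgb.train does with its watchlist
bst <- xgb.Booster(param, list(dtrain, dtest))
for (i in 1:2) {
  xgb.iter.update(bst, dtrain, i - 1)   # one boosting round
  # evaluate against a named watchlist, mirroring what xgb.train prints
  cat(xgb.iter.eval(bst, list(train=dtrain, eval=dtest), i - 1), "\n")
}
```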
"xgb.Booster") { + stop("xgb.iter.update: first argument must be type xgb.Booster") + } + if (class(dtrain) != "xgb.DMatrix") { + stop("xgb.iter.update: second argument must be type xgb.DMatrix") + } + .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost") + return(TRUE) +} +# iteratively update booster with customized statistics +xgb.iter.boost <- function(booster, dtrain, gpair) { + if (class(booster) != "xgb.Booster") { + stop("xgb.iter.update: first argument must be type xgb.Booster") + } + if (class(dtrain) != "xgb.DMatrix") { + stop("xgb.iter.update: second argument must be type xgb.DMatrix") + } + .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost") + return(TRUE) +} +# iteratively evaluate one iteration +xgb.iter.eval <- function(booster, watchlist, iter) { + if (class(booster) != "xgb.Booster") { + stop("xgb.eval: first argument must be type xgb.Booster") + } + if (typeof(watchlist) != "list") { + stop("xgb.eval: only accepts list of DMatrix as watchlist") + } + for (w in watchlist) { + if (class(w) != "xgb.DMatrix") { + stop("xgb.eval: watch list can only contain xgb.DMatrix") + } + } + evnames <- list() + if (length(watchlist) != 0) { + for (i in 1:length(watchlist)) { + w <- watchlist[i] + if (length(names(w)) == 0) { + stop("xgb.eval: name tag must be presented for every elements in watchlist") + } + evnames <- append(evnames, names(w)) + } + } + msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost") + return(msg) +} diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index d564accd0..d67fb5880 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1,222 +1,24 @@ -# depends on matrix -.onLoad <- function(libname, pkgname) { - library.dynam("xgboost", pkgname, libname); -} -.onUnload <- function(libpath) { - library.dynam.unload("xgboost", libpath); +# Main function for xgboost-package + +xgboost = function(x=NULL,y=NULL,file=NULL,nrounds=10,params,watchlist=list(), + obj=NULL, feval=NULL, margin=NULL) +{ + if (is.null(x) && is.null(y)) + { + if (is.null(file)) + stop('xgboost need input data, either R objects or local files.') + dtrain = xgb.DMatrix(file) + } + else + dtrain = xgb.DMatrix(x, info=list(label=y)) + if (!is.null(margin)) + { + succ <- xgb.setinfo(dtrain, "base_margin", margin) + if (!succ) + warning('Attemp to use margin failed.') + } + bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval) + return(bst) } -# constructing DMatrix -xgb.DMatrix <- function(data, info=list(), missing=0.0) { - if (typeof(data) == "character") { - handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost") - } else if(is.matrix(data)) { - handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost") - } else if(class(data) == "dgCMatrix") { - handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost") - } else { - stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data))) - } - dmat <- structure(handle, class="xgb.DMatrix") - if (length(info) != 0) { - for (i in 1:length(info)) { - p <- info[i] - xgb.setinfo(dmat, names(p), p[[1]]) - } - } - return(dmat) -} -# get information from dmatrix -xgb.getinfo <- function(dmat, name) { - if (typeof(name) != "character") { - stop("xgb.getinfo: name must be character") - } - if (class(dmat) != "xgb.DMatrix") { - stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix"); - } - if (name != "label" && - 
name != "weight" && - name != "base_margin" ) { - stop(paste("xgb.getinfo: unknown info name", name)) - } - ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost") - return(ret) -} -# set information into dmatrix, this mutate dmatrix -xgb.setinfo <- function(dmat, name, info) { - if (class(dmat) != "xgb.DMatrix") { - stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix"); - } - if (name == "label") { - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost") - return(TRUE) - } - if (name == "weight") { - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost") - return(TRUE) - } - if (name == "base_margin") { - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost") - return(TRUE) - } - if (name == "group") { - .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost") - return(TRUE) - } - stop(pase("xgb.setinfo: unknown info name", name)) - return(FALSE) -} -# construct a Booster from cachelist -xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { - if (typeof(cachelist) != "list") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - for (dm in cachelist) { - if (class(dm) != "xgb.DMatrix") { - stop("xgb.Booster: only accepts list of DMatrix as cachelist") - } - } - handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE="xgboost") - .Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost") - if (length(params) != 0) { - for (i in 1:length(params)) { - p <- params[i] - .Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost") - } - } - if (!is.null(modelfile)) { - if (typeof(modelfile) != "character"){ - stop("xgb.Booster: modelfile must be character"); - } - .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE="xgboost") - } - return(structure(handle, class="xgb.Booster")) -} -# train a model using given parameters -xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) { - if (typeof(params) != "list") { - stop("xgb.train: first argument params must be list"); - } - if (class(dtrain) != "xgb.DMatrix") { - stop("xgb.train: second argument dtrain must be xgb.DMatrix"); - } - bst <- xgb.Booster(params, append(watchlist,dtrain)) - for (i in 1:nrounds) { - if (is.null(obj)) { - succ <- xgb.iter.update(bst, dtrain, i-1) - } else { - pred <- xgb.predict(bst, dtrain) - gpair <- obj(pred, dtrain) - succ <- xgb.iter.boost(bst, dtrain, gpair) - } - if (length(watchlist) != 0) { - if (is.null(feval)) { - msg <- xgb.iter.eval(bst, watchlist, i-1) - cat(msg); cat("\n") - } else { - cat("["); cat(i); cat("]"); - for (j in 1:length(watchlist)) { - w <- watchlist[j] - if (length(names(w)) == 0) { - stop("xgb.eval: name tag must be presented for every elements in watchlist") - } - ret <- feval(xgb.predict(bst, w[[1]]), w[[1]]) - cat("\t"); cat(names(w)); cat("-"); cat(ret$metric); - cat(":"); cat(ret$value) - } - cat("\n") - } - } - } - return(bst) -} -# save model or DMatrix to file -xgb.save <- function(handle, fname) { - if (typeof(fname) != "character") { - stop("xgb.save: fname must be character") - } - if (class(handle) == "xgb.Booster") { - .Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost") - return(TRUE) - } - if (class(handle) == "xgb.DMatrix") { - .Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost") - return(TRUE) - } - stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster") - return(FALSE) -} -# predict -xgb.predict <- 
diff --git a/R-package/inst/examples/demo-old.R b/R-package/inst/examples/demo-old.R
new file mode 100644
index 000000000..6332ba37d
--- /dev/null
+++ b/R-package/inst/examples/demo-old.R
@@ -0,0 +1,127 @@
+# load xgboost library
+require(xgboost)
+require(methods)
+
+# helper function to read libsvm format
+# this is very badly written: it loads in dense form and converts to sparse
+# use this only for demo purposes
+# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
+read.libsvm <- function(fname, maxcol) {
+  content <- readLines(fname)
+  nline <- length(content)
+  label <- numeric(nline)
+  mat <- matrix(0, nline, maxcol+1)
+  for (i in 1:nline) {
+    arr <- as.vector(strsplit(content[i], " ")[[1]])
+    label[i] <- as.numeric(arr[[1]])
+    for (j in 2:length(arr)) {
+      kv <- strsplit(arr[j], ":")[[1]]
+      # shift by one to avoid 0 index
+      findex <- as.integer(kv[1]) + 1
+      fvalue <- as.numeric(kv[2])
+      mat[i,findex] <- fvalue
+    }
+  }
+  mat <- as(mat, "sparseMatrix")
+  return(list(label=label, data=mat))
+}
+
+# test code here
+dtrain <- xgb.DMatrix("agaricus.txt.train")
+dtest <- xgb.DMatrix("agaricus.txt.test")
+param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
+watchlist <- list("eval"=dtest, "train"=dtrain)
+# train an xgboost model
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+# make prediction
+preds <- xgb.predict(bst, dtest)
+labels <- xgb.getinfo(dtest, "label")
+err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
+# print error rate
+print(paste("error=", err))
+
+# dump model
+xgb.dump(bst, "dump.raw.txt")
+# dump model with feature map
+xgb.dump(bst, "dump.nice.txt", "featmap.txt")
+
+# save dmatrix into binary buffer
+succ <- xgb.save(dtest, "dtest.buffer")
+# save model into file
+succ <- xgb.save(bst, "xgb.model")
+# load model and data back in
+bst2 <- xgb.Booster(modelfile="xgb.model")
+dtest2 <- xgb.DMatrix("dtest.buffer")
+preds2 <- xgb.predict(bst2, dtest2)
+# assert they are the same
+stopifnot(sum(abs(preds2 - preds)) == 0)
+
+###
+# build dmatrix from sparseMatrix
+###
+print('start running example of building DMatrix from R sparseMatrix')
+csc <- read.libsvm("agaricus.txt.train", 126)
+label <- csc$label
+data <- csc$data
+dtrain <- xgb.DMatrix(data, info=list(label=label))
+watchlist <- list("eval"=dtest, "train"=dtrain)
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+
+###
+# build dmatrix from dense matrix
+###
+print('start running example of building DMatrix from R matrix')
+mat = as.matrix(data)
+dtrain <- xgb.DMatrix(mat, info=list(label=label))
+watchlist <- list("eval"=dtest, "train"=dtrain)
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+
+###
+# advanced: customized loss function
+#
+print("start running example of using a customized objective function")
+# note: for a customized objective function, we leave objective as default
+# note: what we get in prediction is the margin value
+# you must know what you are doing
+param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" = 1)
+# user-defined objective function: given predictions, return gradient and second-order gradient
+# this is loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  preds <- 1.0 / (1.0 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1.0 - preds)
+  return(list(grad=grad, hess=hess))
+}
+# user-defined evaluation function: return a list(metric="metric-name", value="metric-value")
+# NOTE: with a customized loss function, the default prediction value is the margin
+# this may make built-in evaluation metrics not function properly
+# for example, with logistic loss the prediction is the score before the logistic transformation,
+# while the built-in evaluation error assumes input is after the logistic transformation
+# keep this in mind when you use the customization; you may need to write a customized evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
+  return(list(metric="error", value=err))
+}
+
+# training with a customized objective; we can also do step-by-step training
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
+
+###
+# advanced: start from an initial base prediction
+#
+print("start running example of starting from an initial prediction")
+# specify parameters via list; definitions are the same as in the C++ version
+param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
+# train xgboost for 1 round
+bst <- xgb.train(param, dtrain, 1, watchlist)
+# Note: we need the margin value instead of the transformed prediction in base_margin
+# predicting with outputmargin=TRUE always gives margin values before the logistic transformation
+ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
+ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
+succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
+succ <- xgb.setinfo(dtest, "base_margin", ptest)
+print("this is the result of running from an initial prediction")
+bst <- xgb.train(param, dtrain, 1, watchlist)
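`logregobj` above returns the first and second derivatives of the negative log-likelihood with respect to the margin m: with p = 1/(1+exp(-m)), the gradient is p - y and the hessian is p(1-p). A quick finite-difference check of that derivation (standalone R, nothing package-specific; the test point m = 0.3 is arbitrary):

```r
y <- 1; m <- 0.3
p <- 1 / (1 + exp(-m))
# negative log-likelihood as a function of the margin
loss <- function(m) { p <- 1 / (1 + exp(-m)); -(y * log(p) + (1 - y) * log(1 - p)) }
h <- 1e-6
grad_numeric <- (loss(m + h) - loss(m - h)) / (2 * h)
hess_numeric <- (loss(m + h) - 2 * loss(m) + loss(m - h)) / h^2
print(c(grad_analytic = p - y, grad_numeric = grad_numeric))
print(c(hess_analytic = p * (1 - p), hess_numeric = hess_numeric))
```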
diff --git a/R-package/inst/examples/demo.R b/R-package/inst/examples/demo.R
index 6332ba37d..8b85c4cc0 100644
--- a/R-package/inst/examples/demo.R
+++ b/R-package/inst/examples/demo.R
@@ -1,97 +1,83 @@
-# load xgboost library
 require(xgboost)
-require(methods)
 
 # helper function to read libsvm format
 # this is very badly written, load in dense, and convert to sparse
 # use this only for demo purpose
 # adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
-read.libsvm <- function(fname, maxcol) {
-  content <- readLines(fname)
-  nline <- length(content)
-  label <- numeric(nline)
-  mat <- matrix(0, nline, maxcol+1)
-  for (i in 1:nline) {
-    arr <- as.vector(strsplit(content[i], " ")[[1]])
-    label[i] <- as.numeric(arr[[1]])
-    for (j in 2:length(arr)) {
-      kv <- strsplit(arr[j], ":")[[1]]
-      # to avoid 0 index
-      findex <- as.integer(kv[1]) + 1
-      fvalue <- as.numeric(kv[2])
-      mat[i,findex] <- fvalue
-    }
-  }
-  mat <- as(mat, "sparseMatrix")
-  return(list(label=label, data=mat))
+read.libsvm = function(fname, maxcol) {
+  content = readLines(fname)
+  nline = length(content)
+  label = numeric(nline)
+  mat = matrix(0, nline, maxcol+1)
+  for (i in 1:nline) {
+    arr = as.vector(strsplit(content[i], " ")[[1]])
+    label[i] = as.numeric(arr[[1]])
+    for (j in 2:length(arr)) {
+      kv = strsplit(arr[j], ":")[[1]]
+      # to avoid 0 index
+      findex = as.integer(kv[1]) + 1
+      fvalue = as.numeric(kv[2])
+      mat[i,findex] = fvalue
+    }
+  }
+  mat = as(mat, "sparseMatrix")
+  return(list(label=label, data=mat))
 }
 
-# test code here
+# Parameter setting
 dtrain <- xgb.DMatrix("agaricus.txt.train")
 dtest <- xgb.DMatrix("agaricus.txt.test")
 param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-watchlist <- list("eval"=dtest,"train"=dtrain)
-# training xgboost model
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
-# make prediction
-preds <- xgb.predict(bst, dtest)
-labels <- xgb.getinfo(dtest, "label")
-err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
-# print error rate
+watchlist = list("eval"=dtest, "train"=dtrain)
+
+###########################
+# Train from local file
+###########################
+
+# Training
+bst = xgboost(file='agaricus.txt.train', params=param, watchlist=watchlist)
+# Prediction
+pred = predict(bst, 'agaricus.txt.test')
+# Performance
+labels = xgb.getinfo(dtest, "label")
+err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
 print(paste("error=",err))
 
-# dump model
-xgb.dump(bst, "dump.raw.txt")
-# dump model with feature map
-xgb.dump(bst, "dump.nice.txt", "featmap.txt")
+###########################
+# Train from R object
+###########################
 
-# save dmatrix into binary buffer
-succ <- xgb.save(dtest, "dtest.buffer")
-# save model into file
-succ <- xgb.save(bst, "xgb.model")
-# load model and data in
-bst2 <- xgb.Booster(modelfile="xgb.model")
-dtest2 <- xgb.DMatrix("dtest.buffer")
-preds2 <- xgb.predict(bst2, dtest2)
-# assert they are the same
-stopifnot(sum(abs(preds2-preds)) == 0)
+csc = read.libsvm("agaricus.txt.train", 126)
+y = csc$label
+x = csc$data
+# x as Sparse Matrix
+class(x)
 
-###
-# build dmatrix from sparseMatrix
-###
-print ('start running example of build DMatrix from R.sparseMatrix')
-csc <- read.libsvm("agaricus.txt.train", 126)
-label <- csc$label
-data <- csc$data
-dtrain <- xgb.DMatrix(data, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+# Training
+bst = xgboost(x, y, params=param, watchlist=watchlist)
+# Prediction
+pred = predict(bst, 'agaricus.txt.test')
+# Performance
+labels = xgb.getinfo(dtest, "label")
+err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
+print(paste("error=",err))
 
-###
-# build dmatrix from dense matrix
-###
-print ('start running example of build DMatrix from R.Matrix')
-mat = as.matrix(data)
-dtrain <- xgb.DMatrix(mat, info=list(label=label) )
-watchlist <- list("eval"=dtest,"train"=dtrain)
-bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+# Training with dense matrix
+x = as.matrix(x)
+bst = xgboost(x, y, params=param, watchlist=watchlist)
+
+###########################
+# Train with customization
+###########################
list("eval"=dtest,"train"=dtrain) -# training xgboost model -bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) -# make prediction -preds <- xgb.predict(bst, dtest) -labels <- xgb.getinfo(dtest, "label") -err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels) -# print error rate +watchlist = list("eval"=dtest,"train"=dtrain) + +########################### +# Train from local file +########################### + +# Training +bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist) +# Prediction +pred = predict(bst, 'agaricus.txt.test') +# Performance +labels = xgb.getinfo(dtest, "label") +err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels) print(paste("error=",err)) -# dump model -xgb.dump(bst, "dump.raw.txt") -# dump model with feature map -xgb.dump(bst, "dump.nice.txt", "featmap.txt") +########################### +# Train from R object +########################### -# save dmatrix into binary buffer -succ <- xgb.save(dtest, "dtest.buffer") -# save model into file -succ <- xgb.save(bst, "xgb.model") -# load model and data in -bst2 <- xgb.Booster(modelfile="xgb.model") -dtest2 <- xgb.DMatrix("dtest.buffer") -preds2 <- xgb.predict(bst2, dtest2) -# assert they are the same -stopifnot(sum(abs(preds2-preds)) == 0) +csc = read.libsvm("agaricus.txt.train", 126) +y = csc$label +x = csc$data +# x as Sparse Matrix +class(x) -### -# build dmatrix from sparseMatrix -### -print ('start running example of build DMatrix from R.sparseMatrix') -csc <- read.libsvm("agaricus.txt.train", 126) -label <- csc$label -data <- csc$data -dtrain <- xgb.DMatrix(data, info=list(label=label) ) -watchlist <- list("eval"=dtest,"train"=dtrain) -bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) +# Training +bst = xgboost(x,y,params=param,watchlist=watchlist) +# Prediction +pred = predict(bst, 'agaricus.txt.test') +# Performance +labels = xgb.getinfo(dtest, "label") +err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels) +print(paste("error=",err)) -### -# build dmatrix from dense matrix -### -print ('start running example of build DMatrix from R.Matrix') -mat = as.matrix(data) -dtrain <- xgb.DMatrix(mat, info=list(label=label) ) -watchlist <- list("eval"=dtest,"train"=dtrain) -bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) +# Training with dense matrix +x = as.matrix(x) +bst = xgboost(x,y,params=param,watchlist=watchlist) + +########################### +# Train with customization +########################### -### -# advanced: cutomsized loss function -# -print("start running example to used cutomized objective function") -# note: for customized objective function, we leave objective as default -# note: what we are getting is margin value in prediction -# you must know what you are doing -param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1) # user define objective function, given prediction, return gradient and second order gradient # this is loglikelihood loss -logregobj <- function(preds, dtrain) { - labels <- xgb.getinfo(dtrain, "label") - preds <- 1.0 / (1.0 + exp(-preds)) - grad <- preds - labels - hess <- preds * (1.0-preds) - return(list(grad=grad, hess=hess)) +logregobj = function(preds, dtrain) { + labels = xgb.getinfo(dtrain, "label") + preds = 1.0 / (1.0 + exp(-preds)) + grad = preds - labels + hess = preds * (1.0-preds) + return(list(grad=grad, hess=hess)) } # user defined evaluation function, return a list(metric="metric-name", value="metric-value") # NOTE: when you do customized loss 
@@ -99,29 +85,18 @@ logregobj <- function(preds, dtrain) {
 # for example, we are doing logistic loss, the prediction is score before logistic transformation
 # the buildin evaluation error assumes input is after logistic transformation
 # Take this in mind when you use the customization, and maybe you need write customized evaluation function
-evalerror <- function(preds, dtrain) {
-  labels <- xgb.getinfo(dtrain, "label")
-  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
-  return(list(metric="error", value=err))
+evalerror = function(preds, dtrain) {
+  labels = xgb.getinfo(dtrain, "label")
+  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
+  return(list(metric="error", value=err))
 }
 
-# training with customized objective, we can also do step by step training
-# simply look at xgboost.py"s implementation of train
-bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
+bst = xgboost(x, y, params=param, watchlist=watchlist, obj=logregobj, feval=evalerror)
 
-###
-# advanced: start from a initial base prediction
-#
-print ("start running example to start from a initial prediction")
-# specify parameters via map, definition are same as c++ version
-param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
-# train xgboost for 1 round
-bst <- xgb.train( param, dtrain, 1, watchlist )
-# Note: we need the margin value instead of transformed prediction in set_base_margin
-# do predict with output_margin=True, will always give you margin values before logistic transformation
-ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
-ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
-succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
-succ <- xgb.setinfo(dtest, "base_margin", ptest)
-print ("this is result of running from initial prediction")
-bst <- xgb.train( param, dtrain, 1, watchlist )
+############################
+# Train with previous result
+############################
+
+bst = xgboost(x, y, params=param, watchlist=watchlist)
+pred = predict(bst, 'agaricus.txt.train', outputmargin=TRUE)
+bst2 = xgboost(x, y, params=param, watchlist=watchlist, margin=pred)
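The last section above continues training from a previous result: predicting with `outputmargin=TRUE` yields the untransformed scores, and passing them back through the wrapper's `margin` argument sets them as `base_margin` on the new training matrix. A condensed sketch of the same two-stage pattern, assuming `x`, `y`, and `param` as defined in the demo:

```r
# stage 1: one boosting round from a zero margin
bst1 <- xgboost(x, y, params=param, nrounds=1)
# margin scores on the training data (before the logistic transformation)
m <- predict(bst1, x, outputmargin=TRUE)
# stage 2: one more round, starting from stage 1's scores instead of zero
bst2 <- xgboost(x, y, params=param, nrounds=1, margin=m)
```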