diff --git a/wrapper/R-example/demo.R b/wrapper/R-example/demo.R
index 115637390..076dc79a9 100644
--- a/wrapper/R-example/demo.R
+++ b/wrapper/R-example/demo.R
@@ -6,9 +6,72 @@
 dtrain <- xgb.DMatrix("agaricus.txt.train")
 dtest <- xgb.DMatrix("agaricus.txt.test")
 param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
 watchlist <- list('train'=dtrain,'test'=dtest)
-bst <- xgb.train(param, dtrain, watchlist=watchlist, nround=3)
+# train the xgboost model
+bst <- xgb.train(param, dtrain, nrounds=3, watchlist=watchlist)
+# make prediction
+preds <- xgb.predict(bst, dtest)
+labels <- xgb.getinfo(dtest, "label")
+err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
+# print the error rate
+print(err)
-succ <- xgb.save(bst, "iter.model")
-print('finsih save model')
-bst2 <- xgb.Booster(modelfile="iter.model")
-pred = xgb.predict(bst2, dtest)
+# save dmatrix into a binary buffer
+succ <- xgb.save(dtest, "dtest.buffer")
+# save model to file
+succ <- xgb.save(bst, "xgb.model")
+# load the model back in
+bst2 <- xgb.Booster(modelfile="xgb.model")
+dtest2 <- xgb.DMatrix("dtest.buffer")
+preds2 <- xgb.predict(bst2, dtest2)
+# print the difference
+print(sum(abs(preds2-preds)))
+
+###
+# advanced: customized loss function
+#
+print("start running example of using customized objective function")
+# note: with a customized objective function, we leave objective as default
+# note: what we get from prediction is then the margin value
+# you must know what you are doing
+param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' = 1)
+# user-defined objective function: given prediction, return gradient and second-order gradient
+# this is log-likelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  preds <- 1.0 / (1.0 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1.0-preds)
+  return(list(grad=grad, hess=hess))
+}
+# user-defined evaluation function: returns list(metric="metric-name", value="metric-value")
+# NOTE: with a customized loss function, the default prediction value is the margin,
+# which may keep the built-in evaluation metrics from working properly
+# for example, with logistic loss the prediction is the score before the logistic transformation,
+# while the built-in evaluation error assumes input after the logistic transformation
+# keep this in mind when customizing; you may need to write a customized evaluation function too
+evalerror <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
+  return(list(metric="error", value=err))
+}
+
+# training with a customized objective; we can also do step-by-step training
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, nrounds=2, watchlist, logregobj, evalerror)
+
+###
+# advanced: start from an initial base prediction
+#
+print('start running example to start from an initial prediction')
+# specify parameters via list; definitions are the same as in the C++ version
+param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
+# train xgboost for 1 round
+bst <- xgb.train(param, dtrain, 1, watchlist)
+# Note: we need the margin value, not the transformed prediction, when setting base_margin
+# predicting with outputmargin=TRUE always gives margin values before the logistic transformation
+ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
+ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
+succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
+succ <- xgb.setinfo(dtest, "base_margin", ptest)
+print('this is the result of running from the initial prediction')
+bst <- xgb.train(param, dtrain, 1, watchlist)
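The gradient and hessian returned by logregobj in the demo above are the first and second derivatives of the logistic log-loss with respect to the margin: with p = 1/(1+exp(-m)), dl/dm = p - y and d2l/dm2 = p(1-p). A standalone sanity check against finite differences (plain R, no xgboost required; logloss and eps are helpers introduced only for this sketch):

```r
# check logregobj's analytic derivatives against finite differences
logloss <- function(m, y) {
  p <- 1.0 / (1.0 + exp(-m))
  -(y * log(p) + (1 - y) * log(1 - p))
}
m <- c(-2, -0.5, 0.3, 1.7)   # margin scores
y <- c(0, 1, 1, 0)           # labels
eps <- 1e-4
p <- 1.0 / (1.0 + exp(-m))
grad <- p - y                # what logregobj returns
hess <- p * (1 - p)
num.grad <- (logloss(m + eps, y) - logloss(m - eps, y)) / (2 * eps)
num.hess <- (logloss(m + eps, y) - 2 * logloss(m, y) + logloss(m - eps, y)) / eps^2
print(max(abs(grad - num.grad)))  # both prints should be on the order of 1e-8
print(max(abs(hess - num.hess)))
```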
diff --git a/wrapper/python-example/demo.py b/wrapper/python-example/demo.py
index 6dde2be72..2b66e92a7 100755
--- a/wrapper/python-example/demo.py
+++ b/wrapper/python-example/demo.py
@@ -58,7 +58,7 @@
 evallist = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )
 
 ###
-# advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
+# advanced: customized loss function
 #
 print ('start running example to used cutomized objective function')
diff --git a/wrapper/xgboost.R b/wrapper/xgboost.R
index 79bcedc9e..7ad27b527 100644
--- a/wrapper/xgboost.R
+++ b/wrapper/xgboost.R
@@ -2,13 +2,60 @@
 dyn.load("./libxgboostR.so")
 
 # constructing DMatrix
-xgb.DMatrix <- function(data) {
+xgb.DMatrix <- function(data, info=list()) {
   if (typeof(data) == "character") {
     handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
   }else {
-    stop("xgb.DMatrix cannot recognize data type")
+    stop(paste("xgb.DMatrix: does not support constructing from", typeof(data)))
   }
-  return(structure(handle, class="xgb.DMatrix"))
+  dmat = structure(handle, class="xgb.DMatrix")
+  if (length(info) != 0) {
+    for (i in 1:length(info)) {
+      p = info[[i]]
+      xgb.setinfo(dmat, names(info)[i], p)
+    }
+  }
+  return(dmat)
+}
+# get information from dmatrix
+xgb.getinfo <- function(dmat, name) {
+  if (typeof(name) != "character") {
+    stop("xgb.getinfo: name must be character")
+  }
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.getinfo: first argument dmat must be xgb.DMatrix");
+  }
+  if (name != "label" &&
+      name != "weight" &&
+      name != "base_margin" ) {
+    stop(paste("xgb.getinfo: unknown info name", name))
+  }
+  ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
+  return(ret)
+}
+# set information into dmatrix; this mutates the dmatrix
+xgb.setinfo <- function(dmat, name, info) {
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.setinfo: first argument dmat must be xgb.DMatrix");
+  }
+  if (name == "label") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
+    return(TRUE)
+  }
+  if (name == "weight") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
+    return(TRUE)
+  }
+  if (name == "base_margin") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
+    return(TRUE)
+  }
+  if (name == "group") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
+    return(TRUE)
+  }
+  stop(paste("xgb.setinfo: unknown info name", name))
+  return(FALSE)
 }
 # construct a Booster from cachelist
 xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
@@ -21,9 +68,11 @@
     }
   }
   handle <- .Call("XGBoosterCreate_R", cachelist)
-  for (i in 1:length(params)) {
-    p = params[i]
-    .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
+  if (length(params) != 0) {
+    for (i in 1:length(params)) {
+      p = params[i]
+      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
+    }
   }
   if (!is.null(modelfile)) {
     if (typeof(modelfile) != "character"){
@@ -34,7 +83,7 @@
   return(structure(handle, class="xgb.Booster"))
 }
 # train a model using given parameters
-xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL) {
+xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list");
   }
@@ -49,10 +98,24 @@
       pred = xgb.predict(bst, dtrain)
       gpair = obj(pred, dtrain)
       succ <- xgb.iter.boost(bst, dtrain, gpair)
-    }
+    }
     if (length(watchlist) != 0) {
-      msg <- xgb.iter.eval(bst, watchlist, i-1)
+      if (is.null(feval)) {
+        msg <- xgb.iter.eval(bst, watchlist, i-1)
       cat(msg); cat("\n")
+      } else {
+        cat("["); cat(i); cat("]");
+        for (j in 1:length(watchlist)) {
+          w <- watchlist[j]
+          if (length(names(w)) == 0) {
+            stop("xgb.eval: a name tag must be present for every element in the watchlist")
+          }
+          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
+          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
+          cat(":"); cat(ret$value)
+        }
+        cat("\n")
+      }
     }
   }
   return(bst)
@@ -124,12 +187,14 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
     }
   }
   evnames <- list()
-  for (i in 1:length(watchlist)) {
-    w <- watchlist[i]
-    if (length(names(w)) == 0) {
-      stop("xgb.eval: name tag must be presented for every elements in watchlist")
+  if (length(watchlist) != 0) {
+    for (i in 1:length(watchlist)) {
+      w <- watchlist[i]
+      if (length(names(w)) == 0) {
+        stop("xgb.eval: a name tag must be present for every element in the watchlist")
+      }
+      evnames <- append(evnames, names(w))
     }
-    evnames <- append(evnames, names(w))
   }
   msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
   return(msg)
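A short usage sketch for the new info plumbing in xgboost.R. This assumes libxgboostR.so has been built and the agaricus files from wrapper/R-example are in the working directory; the weight vector is purely illustrative:

```r
# sketch: attaching extra info to a DMatrix through the new API
source("xgboost.R")
dtrain <- xgb.DMatrix("agaricus.txt.train")
labels <- xgb.getinfo(dtrain, "label")
# per-instance weights travel through the same .Call path as labels;
# here we up-weight the positive class, only as an illustration
weights <- ifelse(labels > 0.5, 2.0, 1.0)
succ <- xgb.setinfo(dtrain, "weight", weights)
# info can also be supplied at construction time via the info list
dtrain2 <- xgb.DMatrix("agaricus.txt.train", info=list("weight"=weights))
# 1.0 and 2.0 are exactly representable as float, so this prints 0
print(max(abs(xgb.getinfo(dtrain2, "weight") - weights)))
```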
diff --git a/wrapper/xgboost_R.cpp b/wrapper/xgboost_R.cpp
index 339dd2027..7b7b232fe 100644
--- a/wrapper/xgboost_R.cpp
+++ b/wrapper/xgboost_R.cpp
@@ -1,5 +1,6 @@
 #include <vector>
 #include <Rinternals.h>
+#include <cstring>
 #include "xgboost_wrapper.h"
 #include "xgboost_R.h"
 #include "../src/utils/utils.h"
@@ -24,7 +25,40 @@
     XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                         CHAR(asChar(fname)), asInteger(silent));
   }
-
+  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
+    int len = length(array);
+    const char *name = CHAR(asChar(field));
+    if (!strcmp("group", name)) {
+      std::vector<unsigned> vec(len);
+      #pragma omp parallel for schedule(static)
+      for (int i = 0; i < len; ++i) {
+        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
+      }
+      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
+      return;
+    }
+    {
+      std::vector<float> vec(len);
+      #pragma omp parallel for schedule(static)
+      for (int i = 0; i < len; ++i) {
+        vec[i] = REAL(array)[i];
+      }
+      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
+                            CHAR(asChar(field)),
+                            &vec[0], len);
+    }
+  }
+  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
+    size_t olen;
+    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
+                                             CHAR(asChar(field)), &olen);
+    SEXP ret = PROTECT(allocVector(REALSXP, olen));
+    for (size_t i = 0; i < olen; ++i) {
+      REAL(ret)[i] = res[i];
+    }
+    UNPROTECT(1);
+    return ret;
+  }
   // functions related to booster
   void _BoosterFinalizer(SEXP ext) {
     if (R_ExternalPtrAddr(ext) == NULL) return;
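One consequence of this C glue worth knowing: R numeric vectors hold doubles, but XGDMatrixSetInfo_R narrows them into a std::vector<float> before calling XGDMatrixSetFloatInfo, and XGDMatrixGetInfo_R widens the stored floats back to doubles. Round trips therefore come back at float precision. A sketch, under the same file assumptions as above:

```r
# round-trip sketch: info set from R comes back at float precision,
# because XGDMatrixSetInfo_R narrows REAL() doubles to float
dmat <- xgb.DMatrix("agaricus.txt.test")
margin.in <- runif(length(xgb.getinfo(dmat, "label")))
succ <- xgb.setinfo(dmat, "base_margin", margin.in)
margin.out <- xgb.getinfo(dmat, "base_margin")
# differences are float rounding, around 1e-8, not exact zeros
print(max(abs(margin.out - margin.in)))
```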
diff --git a/wrapper/xgboost_R.h b/wrapper/xgboost_R.h
index d41117fd1..c572b03ca 100644
--- a/wrapper/xgboost_R.h
+++ b/wrapper/xgboost_R.h
@@ -24,6 +24,20 @@
    * \param silent print statistics when saving
    */
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
+  /*!
+   * \brief set information into dmatrix
+   * \param handle an instance of data matrix
+   * \param field field name, can be label, weight, base_margin or group
+   * \param array numeric vector to set
+   */
+  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
+  /*!
+   * \brief get info vector from matrix
+   * \param handle an instance of data matrix
+   * \param field field name
+   * \return info vector
+   */
+  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
   /*!
    * \brief create xgboost learner
    * \param dmats a list of dmatrix handles that will be cached
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index 702425329..3772cfd95 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -81,8 +81,8 @@
   /*!
    * \brief get float info vector from matrix
    * \param handle a instance of data matrix
-   * \param len used to set result length
    * \param field field name
+   * \param out_len used to set result length
    * \return pointer to the label
    */
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);
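Finally, the one field that bypasses the float path is "group": xgb.setinfo coerces it with as.integer() and routes it to XGDMatrixSetGroup, and since xgb.getinfo only accepts label, weight and base_margin, group is effectively set-only from R. A minimal sketch for ranking data; "rank.train" is a hypothetical LibSVM-style file, and as usual in xgboost the group sizes must sum to the number of rows:

```r
# group is set-only in this wrapper: it goes through as.integer()
# to XGDMatrixSetGroup, and xgb.getinfo does not list it
dtrain <- xgb.DMatrix("rank.train")  # hypothetical ranking dataset
# three queries with 10, 20 and 30 documents, in row order
succ <- xgb.setinfo(dtrain, "group", c(10, 20, 30))
```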