complete R example

tqchen 2014-08-23 15:26:08 -07:00
parent 8bf758c63b
commit de83ac72ea
6 changed files with 198 additions and 22 deletions

View File

@@ -6,9 +6,72 @@ dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
watchlist <- list('train'=dtrain,'test'=dtest)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(err)
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# print difference
print(sum(abs(preds2-preds)))
###
# advanced: customized loss function
#
print("start running example to use customized objective function")
# note: for a customized objective function, we leave 'objective' at its default
# note: what we get in prediction is the margin value
# you must know what you are doing
param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1)
# user-defined objective function: given the prediction, return gradient and second order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function, returns list(metric="metric-name", value="metric-value")
# NOTE: when you use a customized objective function, the default prediction value is the margin
# this may make the built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes input after the logistic transformation
# keep this in mind when you customize, and you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as in the C++ version
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction when setting base_margin
# predicting with outputmargin=TRUE will always give you margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ('this is the result of running from the initial prediction')
bst <- xgb.train( param, dtrain, 1, watchlist )
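The base_margin calls above use the setinfo/getinfo pair that this commit adds to the R package; the same pair handles instance weights. A minimal sketch against the dtrain object created earlier (the uniform weight vector is invented purely for illustration):
# illustration only: attach per-instance weights through the new info API
labels <- xgb.getinfo(dtrain, "label")
succ <- xgb.setinfo(dtrain, "weight", rep(1.0, length(labels)))
weights <- xgb.getinfo(dtrain, "weight")
print(length(weights))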

View File

@@ -58,7 +58,7 @@ evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# advanced: customized loss function
#
print ('start running example to use customized objective function')

View File

@@ -2,13 +2,60 @@
dyn.load("./libxgboostR.so")
# constructing DMatrix
xgb.DMatrix <- function(data, info=list()) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
  } else {
    stop(paste("xgb.DMatrix: does not support constructing from", typeof(data)))
  }
  dmat = structure(handle, class="xgb.DMatrix")
  if (length(info) != 0) {
    for (i in 1:length(info)) {
      p = info[i]
      xgb.setinfo(dmat, names(p), p[[1]])
    }
  }
  return(dmat)
}
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
  if (typeof(name) != "character") {
    stop("xgb.getinfo: name must be character")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.getinfo: first argument dmat must be xgb.DMatrix");
  }
  if (name != "label" &&
      name != "weight" &&
      name != "base_margin" ) {
    stop(paste("xgb.getinfo: unknown info name", name))
  }
  ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
  return(ret)
}
# set information into dmatrix; this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dmat must be xgb.DMatrix");
  }
  if (name == "label") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "weight") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "base_margin") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "group") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
    return(TRUE)
  }
  stop(paste("xgb.setinfo: unknown info name", name))
  return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
@@ -21,9 +68,11 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist)
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p = params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
    }
  }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character"){
@@ -34,7 +83,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list");
  }
@@ -49,10 +98,24 @@
      pred = xgb.predict(bst, dtrain)
      gpair = obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
    if (length(watchlist) != 0) {
      if (is.null(feval)) {
        msg <- xgb.iter.eval(bst, watchlist, i-1)
        cat(msg); cat("\n")
      } else {
        cat("["); cat(i); cat("]");
        for (j in 1:length(watchlist)) {
          w <- watchlist[j]
          if (length(names(w)) == 0) {
            stop("xgb.eval: name tag must be present for every element in watchlist")
          }
          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
          cat(":"); cat(ret$value)
        }
        cat("\n")
      }
    }
  }
  return(bst)
@@ -124,12 +187,14 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
    }
  }
  evnames <- list()
  if (length(watchlist) != 0) {
    for (i in 1:length(watchlist)) {
      w <- watchlist[i]
      if (length(names(w)) == 0) {
        stop("xgb.eval: name tag must be present for every element in watchlist")
      }
      evnames <- append(evnames, names(w))
    }
  }
  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
  return(msg)
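xgb.setinfo also accepts a "group" field, which is routed to XGDMatrixSetGroup on the C++ side for ranking tasks. A hedged sketch of how that could look (the "rank.train" file and the group sizes are made up for illustration; in practice the sizes must sum to the number of rows in the matrix):
# hypothetical ranking setup: file name and group sizes are invented
dranking <- xgb.DMatrix("rank.train")
succ <- xgb.setinfo(dranking, "group", c(10, 20, 30))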

View File

@ -1,5 +1,6 @@
#include <vector>
#include <string>
#include <cstring>
#include "xgboost_wrapper.h"
#include "xgboost_R.h"
#include "../src/utils/utils.h"
@@ -24,7 +25,40 @@ extern "C" {
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
  }
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
    int len = length(array);
    const char *name = CHAR(asChar(field));
    if (!strcmp("group", name)) {
      std::vector<unsigned> vec(len);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < len; ++i) {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
      }
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
      return;
    }
    {
      std::vector<float> vec(len);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < len; ++i) {
        vec[i] = REAL(array)[i];
      }
      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                            CHAR(asChar(field)),
                            &vec[0], len);
    }
  }
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
    size_t olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    return ret;
  }
  // functions related to booster
  void _BoosterFinalizer(SEXP ext) {
    if (R_ExternalPtrAddr(ext) == NULL) return;

View File

@@ -24,6 +24,20 @@ extern "C" {
   * \param silent print statistics when saving
   */
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
  /*!
   * \brief set information into dmatrix
   * \param handle an instance of data matrix
   * \param field field name, can be label, weight
   * \param array pointer to float vector
   */
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
  /*!
   * \brief get info vector from matrix
   * \param handle an instance of data matrix
   * \param field field name
   * \return info vector
   */
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
  /*!
   * \brief create xgboost learner
   * \param dmats a list of dmatrix handles that will be cached

View File

@@ -81,8 +81,8 @@ extern "C" {
  /*!
   * \brief get float info vector from matrix
   * \param handle an instance of data matrix
   * \param field field name
   * \param out_len used to set result length
   * \return pointer to the label
   */
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);
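From R, the effect of this wrapper pair is simply that xgb.getinfo returns a numeric vector whose length matches the out_len reported by the C function. A quick sanity check sketch, assuming the dtrain and bst objects from the demo script above are still in the session:
# sanity check (assumes dtrain and bst from the demo script)
labels <- xgb.getinfo(dtrain, "label")
preds <- xgb.predict(bst, dtrain)
stopifnot(length(labels) == length(preds))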