complete R example

tqchen 2014-08-23 15:26:08 -07:00
parent 8bf758c63b
commit de83ac72ea
6 changed files with 198 additions and 22 deletions

View File

@@ -6,9 +6,72 @@ dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
watchlist <- list('train'=dtrain,'test'=dtest)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(err)
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# print difference
print(sum(abs(preds2-preds)))
###
# advanced: customized loss function
#
print("start running example to use customized objective function")
# note: for a customized objective function, we leave 'objective' at its default
# note: what we get in prediction is the margin value
# you must know what you are doing
param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1)
# user-defined objective function: given the prediction, return gradient and second order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function, returns list(metric="metric-name", value="metric-value")
# NOTE: when you use a customized objective function, the default prediction value is the margin
# this may make the built-in evaluation metrics not function properly
# for example, with logistic loss the prediction is the score before the logistic transformation
# while the built-in evaluation error assumes input after the logistic transformation
# keep this in mind when you customize, and you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print ('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as in the C++ version
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of the transformed prediction when setting base_margin
# predicting with outputmargin=TRUE will always give you margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ('this is the result of running from the initial prediction')
bst <- xgb.train( param, dtrain, 1, watchlist )
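The base_margin calls above use the setinfo/getinfo pair that this commit adds to the R package; the same pair handles instance weights. A minimal sketch against the dtrain object created earlier (the uniform weight vector is invented purely for illustration):
# illustration only: attach per-instance weights through the new info API
labels <- xgb.getinfo(dtrain, "label")
succ <- xgb.setinfo(dtrain, "weight", rep(1.0, length(labels)))
weights <- xgb.getinfo(dtrain, "weight")
print(length(weights))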

View File

@@ -58,7 +58,7 @@ evallist = [(dtest,'eval'), (dtrain,'train')]
bst = xgb.train( param, dtrain, num_round, evallist )
###
# advanced: customized loss function
#
print ('start running example to use customized objective function')

View File

@@ -2,13 +2,60 @@
dyn.load("./libxgboostR.so")
# constructing DMatrix
xgb.DMatrix <- function(data, info=list()) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
  } else {
    stop(paste("xgb.DMatrix: does not support constructing from", typeof(data)))
  }
  dmat = structure(handle, class="xgb.DMatrix")
  if (length(info) != 0) {
    for (i in 1:length(info)) {
      p = info[i]
      xgb.setinfo(dmat, names(p), p[[1]])
    }
  }
  return(dmat)
}
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
  if (typeof(name) != "character") {
    stop("xgb.getinfo: name must be character")
  }
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.getinfo: first argument dmat must be xgb.DMatrix");
  }
  if (name != "label" &&
      name != "weight" &&
      name != "base_margin" ) {
    stop(paste("xgb.getinfo: unknown info name", name))
  }
  ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
  return(ret)
}
# set information into dmatrix; this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
  if (class(dmat) != "xgb.DMatrix") {
    stop("xgb.setinfo: first argument dmat must be xgb.DMatrix");
  }
  if (name == "label") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "weight") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "base_margin") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
    return(TRUE)
  }
  if (name == "group") {
    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
    return(TRUE)
  }
  stop(paste("xgb.setinfo: unknown info name", name))
  return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
@@ -21,9 +68,11 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist)
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p = params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
    }
  }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character"){
@@ -34,7 +83,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list");
  }
@@ -49,10 +98,24 @@
      pred = xgb.predict(bst, dtrain)
      gpair = obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
    if (length(watchlist) != 0) {
      if (is.null(feval)) {
        msg <- xgb.iter.eval(bst, watchlist, i-1)
        cat(msg); cat("\n")
      } else {
        cat("["); cat(i); cat("]");
        for (j in 1:length(watchlist)) {
          w <- watchlist[j]
          if (length(names(w)) == 0) {
            stop("xgb.eval: name tag must be present for every element in watchlist")
          }
          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
          cat(":"); cat(ret$value)
        }
        cat("\n")
      }
    }
  }
  return(bst)
@@ -124,12 +187,14 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
    }
  }
  evnames <- list()
  if (length(watchlist) != 0) {
    for (i in 1:length(watchlist)) {
      w <- watchlist[i]
      if (length(names(w)) == 0) {
        stop("xgb.eval: name tag must be present for every element in watchlist")
      }
      evnames <- append(evnames, names(w))
    }
  }
  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
  return(msg)
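xgb.setinfo also accepts a "group" field, which is routed to XGDMatrixSetGroup on the C++ side for ranking tasks. A hedged sketch of how that could look (the "rank.train" file and the group sizes are made up for illustration; in practice the sizes must sum to the number of rows in the matrix):
# hypothetical ranking setup: file name and group sizes are invented
dranking <- xgb.DMatrix("rank.train")
succ <- xgb.setinfo(dranking, "group", c(10, 20, 30))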

View File

@ -1,5 +1,6 @@
#include <vector>
#include <string>
#include <cstring>
#include "xgboost_wrapper.h"
#include "xgboost_R.h"
#include "../src/utils/utils.h"
@@ -24,7 +25,40 @@ extern "C" {
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
  }
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
    int len = length(array);
    const char *name = CHAR(asChar(field));
    if (!strcmp("group", name)) {
      std::vector<unsigned> vec(len);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < len; ++i) {
        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
      }
      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
      return;
    }
    {
      std::vector<float> vec(len);
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < len; ++i) {
        vec[i] = REAL(array)[i];
      }
      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
                            CHAR(asChar(field)),
                            &vec[0], len);
    }
  }
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
    size_t olen;
    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
                                             CHAR(asChar(field)), &olen);
    SEXP ret = PROTECT(allocVector(REALSXP, olen));
    for (size_t i = 0; i < olen; ++i) {
      REAL(ret)[i] = res[i];
    }
    UNPROTECT(1);
    return ret;
  }
  // functions related to booster
  void _BoosterFinalizer(SEXP ext) {
    if (R_ExternalPtrAddr(ext) == NULL) return;

View File

@@ -24,6 +24,20 @@ extern "C" {
   * \param silent print statistics when saving
   */
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
  /*!
   * \brief set information into dmatrix
   * \param handle an instance of data matrix
   * \param field field name, can be label, weight
   * \param array pointer to float vector
   */
  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
  /*!
   * \brief get info vector from matrix
   * \param handle an instance of data matrix
   * \param field field name
   * \return info vector
   */
  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
  /*!
   * \brief create xgboost learner
   * \param dmats a list of dmatrix handles that will be cached

View File

@@ -81,8 +81,8 @@ extern "C" {
  /*!
   * \brief get float info vector from matrix
   * \param handle an instance of data matrix
   * \param field field name
   * \param out_len used to set result length
   * \return pointer to the label
   */
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);
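From R, the effect of this wrapper pair is simply that xgb.getinfo returns a numeric vector whose length matches the out_len reported by the C function. A quick sanity check sketch, assuming the dtrain and bst objects from the demo script above are still in the session:
# sanity check (assumes dtrain and bst from the demo script)
labels <- xgb.getinfo(dtrain, "label")
preds <- xgb.predict(bst, dtrain)
stopifnot(length(labels) == length(preds))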