complete R example

2014-08-23 15:26:08 -07:00
parent 8bf758c63b
commit de83ac72ea
6 changed files with 198 additions and 22 deletions
--- a/wrapper/R-example/demo.R
+++ b/wrapper/R-example/demo.R
@@ -6,9 +6,72 @@ dtrain <- xgb.DMatrix("agaricus.txt.train")
 dtest <- xgb.DMatrix("agaricus.txt.test")
 param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
 watchlist <- list('train'=dtrain,'test'=dtest)
-bst <- xgb.train(param, dtrain, watchlist=watchlist, nround=3)
+# training xgboost model
+bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist)
+# make prediction
+preds <- xgb.predict(bst, dtest)
+labels <- xgb.getinfo(dtest, "label")
+err <- as.real(sum(as.integer(preds > 0.5) != labels)) / length(labels)
+# print error rate
+print(err)

-succ <- xgb.save(bst, "iter.model")
-print('finsih save model')
-bst2 <- xgb.Booster(modelfile="iter.model")
-pred = xgb.predict(bst2, dtest)
+# save dmatrix into binary buffer
+succ <- xgb.save(dtest, "dtest.buffer")
+# save model into file
+succ <- xgb.save(bst, "xgb.model")
+# load model in
+bst2 <- xgb.Booster(modelfile="xgb.model")
+dtest2 <- xgb.DMatrix("dtest.buffer")
+preds2 <- xgb.predict(bst2, dtest2)
+# print difference
+print(sum(abs(preds2-preds)))
+
+###
+# advanced: cutomsized loss function
+# 
+print("start running example to used cutomized objective function")
+# note: for customized objective function, we leave objective as default
+# note: what we are getting is margin value in prediction
+# you must know what you are doing
+param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1)
+# user define objective function, given prediction, return gradient and second order gradient
+# this is loglikelihood loss
+logregobj <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  preds <- 1.0 / (1.0 + exp(-preds))
+  grad <- preds - labels
+  hess <- preds * (1.0-preds)
+  return(list(grad=grad, hess=hess))
+}
+# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
+# NOTE: when you do customized loss function, the default prediction value is margin
+# this may make buildin evalution metric not function properly
+# for example, we are doing logistic loss, the prediction is score before logistic transformation
+# the buildin evaluation error assumes input is after logistic transformation
+# Take this in mind when you use the customization, and maybe you need write customized evaluation function
+evalerror <- function(preds, dtrain) {
+  labels <- xgb.getinfo(dtrain, "label")
+  err <- as.real(sum(labels != (preds > 0.0))) / length(labels)
+  return(list(metric="error", value=err))
+}
+
+# training with customized objective, we can also do step by step training
+# simply look at xgboost.py's implementation of train
+bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
+
+###
+# advanced: start from a initial base prediction
+#
+print ('start running example to start from a initial prediction')
+# specify parameters via map, definition are same as c++ version
+param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
+# train xgboost for 1 round
+bst <- xgb.train( param, dtrain, 1, watchlist )
+# Note: we need the margin value instead of transformed prediction in set_base_margin
+# do predict with output_margin=True, will always give you margin values before logistic transformation
+ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
+ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
+succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
+succ <- xgb.setinfo(dtest, "base_margin", ptest)
+print ('this is result of running from initial prediction')
+bst <- xgb.train( param, dtrain, 1, watchlist )
--- a/wrapper/python-example/demo.py
+++ b/wrapper/python-example/demo.py
@@ -58,7 +58,7 @@ evallist  = [(dtest,'eval'), (dtrain,'train')]
 bst = xgb.train( param, dtrain, num_round, evallist )

 ###
-# advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
+# advanced: cutomsized loss function
 # 
 print ('start running example to used cutomized objective function')

--- a/wrapper/xgboost.R
+++ b/wrapper/xgboost.R
@@ -2,13 +2,60 @@
 dyn.load("./libxgboostR.so")

 # constructing DMatrix
-xgb.DMatrix <- function(data) {
+xgb.DMatrix <- function(data, info=list()) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
  }else {
-    stop("xgb.DMatrix cannot recognize data type")
+    stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
  }
-  return(structure(handle, class="xgb.DMatrix"))
+  dmat = structure(handle, class="xgb.DMatrix")
+  if (length(info) != 0) {
+    for (i in 1:length(info)) {
+      p = info[i]
+      xgb.setinfo(dmat, names(p), p)
+    }
+  }
+  return(dmat)
+}
+# get information from dmatrix
+xgb.getinfo <- function(dmat, name) {
+  if (typeof(name) != "character") {
+    stop("xgb.getinfo: name must be character")
+  }
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
+  }
+  if (name != "label" &&
+      name != "weight" &&
+      name != "base_margin" ) {
+    stop(paste("xgb.getinfo: unknown info name", name))
+  }
+  ret <- .Call("XGDMatrixGetInfo_R", dmat, name)
+  return(ret)
+}
+# set information into dmatrix, this mutate dmatrix
+xgb.setinfo <- function(dmat, name, info) {
+  if (class(dmat) != "xgb.DMatrix") {
+    stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
+  }
+  if (name == "label") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
+    return(TRUE)
+  }
+  if (name == "weight") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
+    return(TRUE)
+  }
+  if (name == "base_margin") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
+    return(TRUE)
+  }
+  if (name == "group") {
+    .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info))
+    return(TRUE)
+  }
+  stop(pase("xgb.setinfo: unknown info name", name))
+  return(FALSE)
 }
 # construct a Booster from cachelist
 xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
@@ -21,9 +68,11 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist)
-  for (i in 1:length(params)) {
-    p = params[i]    
-    .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
+  if (length(params) != 0) {
+    for (i in 1:length(params)) {
+      p = params[i]
+      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
+    }
  }
  if (!is.null(modelfile)) {
    if (typeof(modelfile) != "character"){
@@ -34,7 +83,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  return(structure(handle, class="xgb.Booster"))
 }
 # train a model using given parameters
-xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL) {
+xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
  if (typeof(params) != "list") {
    stop("xgb.train: first argument params must be list");
  }
@@ -49,10 +98,24 @@ xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL) {
      pred = xgb.predict(bst, dtrain)
      gpair = obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
-    }    
+    }
    if (length(watchlist) != 0) {
-      msg <- xgb.iter.eval(bst, watchlist, i-1)
+      if (is.null(feval)) {      
+        msg <- xgb.iter.eval(bst, watchlist, i-1)
        cat(msg); cat("\n")
+      } else {
+        cat("["); cat(i); cat("]");
+        for (j in 1:length(watchlist)) {
+          w <- watchlist[j]
+          if (length(names(w)) == 0) {
+            stop("xgb.eval: name tag must be presented for every elements in watchlist")
+          }
+          ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
+          cat("\t"); cat(names(w)); cat("-"); cat(ret$metric); 
+          cat(":"); cat(ret$value)
+        }
+        cat("\n")        
+      }
    }
  }
  return(bst)
@@ -124,12 +187,14 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
    }
  }
  evnames <- list()
-  for (i in 1:length(watchlist)) {
-    w <- watchlist[i]
-    if (length(names(w)) == 0) {
-      stop("xgb.eval: name tag must be presented for every elements in watchlist")
+  if (length(watchlist) != 0) {
+    for (i in 1:length(watchlist)) {
+      w <- watchlist[i]
+      if (length(names(w)) == 0) {
+        stop("xgb.eval: name tag must be presented for every elements in watchlist")
+      }
+      evnames <- append(evnames, names(w))
    }
-    evnames <- append(evnames, names(w))
  }
  msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
  return(msg)
--- a/wrapper/xgboost_R.cpp
+++ b/wrapper/xgboost_R.cpp
@@ -1,5 +1,6 @@
 #include <vector>
 #include <string>
+#include <cstring>
 #include "xgboost_wrapper.h"
 #include "xgboost_R.h"
 #include "../src/utils/utils.h"
@@ -24,7 +25,40 @@ extern "C" {
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
  }
-
+  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
+    int len = length(array);
+    const char *name = CHAR(asChar(field));
+    if (!strcmp("group", name)) {
+      std::vector<unsigned> vec(len);
+      #pragma omp parallel for schedule(static)      
+      for (int i = 0; i < len; ++i) {
+        vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
+      }
+      XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
+      return;
+    }
+    {
+      std::vector<float> vec(len);
+      #pragma omp parallel for schedule(static)
+      for (int i = 0; i < len; ++i) {
+        vec[i] = REAL(array)[i];
+      }
+      XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), 
+                            CHAR(asChar(field)),
+                            &vec[0], len);
+    }
+  }
+  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
+    size_t olen;
+    const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
+                                             CHAR(asChar(field)), &olen);
+    SEXP ret = PROTECT(allocVector(REALSXP, olen));
+    for (size_t i = 0; i < olen; ++i) {
+      REAL(ret)[i] = res[i];
+    }
+    UNPROTECT(1);
+    return ret;
+  }
  // functions related to booster
  void _BoosterFinalizer(SEXP ext) {    
    if (R_ExternalPtrAddr(ext) == NULL) return;
--- a/wrapper/xgboost_R.h
+++ b/wrapper/xgboost_R.h
@@ -24,6 +24,20 @@ extern "C" {
   * \param silent print statistics when saving
   */
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
+  /*!
+   * \brief set information to dmatrix
+   * \param handle a instance of data matrix
+   * \param field field name, can be label, weight
+   * \param array pointer to float vector
+   */
+  void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
+  /*!
+   * \brief get info vector from matrix
+   * \param handle a instance of data matrix
+   * \param field field name
+   * \return info vector
+   */  
+  SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
  /*! 
   * \brief create xgboost learner 
   * \param dmats a list of dmatrix handles that will be cached
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -81,8 +81,8 @@ extern "C" {
  /*!
   * \brief get float info vector from matrix
   * \param handle a instance of data matrix
-   * \param len used to set result length
   * \param field field name
+   * \param out_len used to set result length
   * \return pointer to the label
   */
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);