seems ok

2014-08-23 18:38:39 -07:00
parent de83ac72ea
commit 3b12ff51b9
8 changed files with 187 additions and 31 deletions
--- a/wrapper/R-example/demo.R
+++ b/wrapper/R-example/demo.R
@@ -1,30 +1,78 @@
 # include xgboost library, must set chdir=TRURE
-source('../xgboost.R', chdir=TRUE)
+source("../xgboost.R", chdir=TRUE)
+
+# helper function to read libsvm format ("label index:value ..." per line); returns list(label=, data=sparseMatrix)
+# feature indices are shifted by 1 to avoid a 0 index, so maxcol must be >= the largest feature index in the file
+# this is very badly written, load in dense, and convert to sparse; use this only for demo purpose
+read.libsvm <- function(fname, maxcol) {
+  content <- readLines(fname)
+  nline <- length(content)
+  label <- numeric(nline)
+  mat <- matrix(0, nline, maxcol+1)
+  # seq_len: an empty file gives zero iterations (1:0 would iterate over c(1, 0))
+  for (i in seq_len(nline)) {
+    arr <- as.vector(strsplit(content[i], " ")[[1]])
+    label[i] <- as.numeric(arr[[1]])
+    # skip the label; a label-only line has no features (2:1 would count down)
+    for (j in seq_along(arr)[-1]) {
+      kv <- strsplit(arr[j], ":")[[1]]
+      findex <- as.integer(kv[1]) + 1
+      mat[i,findex] <- as.numeric(kv[2])
+    }
+  }
+  mat <- as(mat, "sparseMatrix")
+  return(list(label=label, data=mat))
+}

 # test code here
 dtrain <- xgb.DMatrix("agaricus.txt.train")
 dtest <- xgb.DMatrix("agaricus.txt.test")
-param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
-watchlist <- list('train'=dtrain,'test'=dtest)
+param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
+watchlist <- list("eval"=dtest,"train"=dtrain)
 # training xgboost model
-bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist)
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
 # make prediction
 preds <- xgb.predict(bst, dtest)
 labels <- xgb.getinfo(dtest, "label")
 err <- as.real(sum(as.integer(preds > 0.5) != labels)) / length(labels)
 # print error rate
-print(err)
+print(paste("error=",err))
+
+# dump model
+xgb.dump(bst, "dump.raw.txt")
+# dump model with feature map
+xgb.dump(bst, "dump.nice.txt", "featmap.txt")

 # save dmatrix into binary buffer
 succ <- xgb.save(dtest, "dtest.buffer")
 # save model into file
 succ <- xgb.save(bst, "xgb.model")
-# load model in
+# load model and data in 
 bst2 <- xgb.Booster(modelfile="xgb.model")
 dtest2 <- xgb.DMatrix("dtest.buffer")
 preds2 <- xgb.predict(bst2, dtest2)
-# print difference
-print(sum(abs(preds2-preds)))
+# assert they are the same
+stopifnot(sum(abs(preds2-preds)) == 0)
+
+###
+# build dmatrix from sparseMatrix
+###
+print ('start running example of build DMatrix from R.sparseMatrix')
+csc <- read.libsvm("agaricus.txt.train", 126)
+label <- csc$label
+data <- csc$data
+dtrain <- xgb.DMatrix(data, info=list(label=label) )
+watchlist <- list("eval"=dtest,"train"=dtrain)
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
+
+###
+# build dmatrix from dense matrix
+###
+print ('start running example of build DMatrix from R.Matrix')
+mat = as.matrix(data)
+dtrain <- xgb.DMatrix(mat, info=list(label=label) )
+watchlist <- list("eval"=dtest,"train"=dtrain)
+bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)

 ###
 # advanced: cutomsized loss function
@@ -33,7 +81,7 @@ print("start running example to used cutomized objective function")
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1)
+param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
 # user define objective function, given prediction, return gradient and second order gradient
 # this is loglikelihood loss
 logregobj <- function(preds, dtrain) {
@@ -56,15 +104,15 @@ evalerror <- function(preds, dtrain) {
 }

 # training with customized objective, we can also do step by step training
-# simply look at xgboost.py's implementation of train
+# simply look at xgboost.py's implementation of train
 bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)

 ###
 # advanced: start from a initial base prediction
 #
-print ('start running example to start from a initial prediction')
+print ("start running example to start from a initial prediction")
 # specify parameters via map, definition are same as c++ version
-param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
+param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
 # train xgboost for 1 round
 bst <- xgb.train( param, dtrain, 1, watchlist )
 # Note: we need the margin value instead of transformed prediction in set_base_margin
@@ -73,5 +121,5 @@ ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
 ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
 succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
 succ <- xgb.setinfo(dtest, "base_margin", ptest)
-print ('this is result of running from initial prediction')
+print ("this is result of running from initial prediction")
 bst <- xgb.train( param, dtrain, 1, watchlist )
--- a/wrapper/README.md
+++ b/wrapper/README.md
@@ -6,8 +6,10 @@ This folder provides wrapper of xgboost to other languages
 Python
 =====
 * To make the python module, type ```make``` in the root directory of project
+* Refer to the walk through example in [python-example/demo.py](python-example/demo.py)

 R 
 =====
 * To make the R wrapper, type ```make R``` in the root directory of project
 * R module need Rinternals.h, find the path in your system and add it to CPLUS_INCLUDE_PATH in Makefile
+* Refer to the walk through example in [R-example/demo.R](R-example/demo.R)
--- a/wrapper/python-example/demo.py
+++ b/wrapper/python-example/demo.py
@@ -30,6 +30,16 @@ bst.dump_model('dump.raw.txt')
 # dump model with feature map
 bst.dump_model('dump.nice.txt','featmap.txt')

+# save dmatrix into binary buffer
+dtest.save_binary('dtest.buffer')
+bst.save_model('xgb.model')
+# load model and data in 
+bst2 = xgb.Booster(model_file='xgb.model')
+dtest2 = xgb.DMatrix('dtest.buffer')
+preds2 = bst2.predict(dtest2)
+# assert they are the same
+assert np.sum(np.abs(preds2-preds)) == 0
+
 ###
 # build dmatrix from scipy.sparse
 print ('start running example of build DMatrix from scipy.sparse')
@@ -92,7 +102,6 @@ def evalerror(preds, dtrain):
 # simply look at xgboost.py's implementation of train
 bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)

-
 ###
 # advanced: start from a initial base prediction
 #
--- a/wrapper/xgboost.R
+++ b/wrapper/xgboost.R
@@ -1,18 +1,27 @@
+# depends on matrix
+succ <- require("Matrix")
+if (!succ) {
+  stop("xgboost depends on Matrix library")
+}
 # load in library
 dyn.load("./libxgboostR.so")

 # constructing DMatrix
-xgb.DMatrix <- function(data, info=list()) {
+xgb.DMatrix <- function(data, info=list(), missing=0.0) {
  if (typeof(data) == "character") {
    handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
-  }else {
+  } else if(is.matrix(data)) {
+    handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
+  } else if(class(data) == "dgCMatrix") {
+    handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
+  } else {
    stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
  }
-  dmat = structure(handle, class="xgb.DMatrix")
+  dmat <- structure(handle, class="xgb.DMatrix")
  if (length(info) != 0) {
    for (i in 1:length(info)) {
-      p = info[i]
-      xgb.setinfo(dmat, names(p), p)
+      p <- info[i]
+      xgb.setinfo(dmat, names(p), p[[1]])
    }
  }
  return(dmat)
@@ -68,9 +77,10 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
    }
  }
  handle <- .Call("XGBoosterCreate_R", cachelist)
+  .Call("XGBoosterSetParam_R", handle, "seed", "0")
  if (length(params) != 0) {
    for (i in 1:length(params)) {
-      p = params[i]
+      p <- params[i]
      .Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
    }
  }
@@ -95,8 +105,8 @@ xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, fe
    if (is.null(obj)) {
      succ <- xgb.iter.update(bst, dtrain, i-1)
    } else {
-      pred = xgb.predict(bst, dtrain)
-      gpair = obj(pred, dtrain)
+      pred <- xgb.predict(bst, dtrain)
+      gpair <- obj(pred, dtrain)
      succ <- xgb.iter.boost(bst, dtrain, gpair)
    }
    if (length(watchlist) != 0) {
@@ -139,14 +149,25 @@ xgb.save <- function(handle, fname) {
 # predict 
 xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
  if (class(booster) != "xgb.Booster") {
-    stop("xgb.iter.update: first argument must be type xgb.Booster")
+    stop("xgb.predict: first argument must be type xgb.Booster")
  }
  if (class(dmat) != "xgb.DMatrix") {
-    stop("xgb.iter.update: second argument must be type xgb.DMatrix")
+    stop("xgb.predict: second argument must be type xgb.DMatrix")
  }
-  ret = .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
+  ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
  return(ret)
 }
+# dump model: hands booster, the output file name fname and fmap to the native XGBoosterDumpModel_R; returns TRUE
+xgb.dump <- function(booster, fname, fmap = "") {
+  if (!inherits(booster, "xgb.Booster")) {  # inherits() stays a single TRUE/FALSE for multi-class objects
+    stop("xgb.dump: first argument must be type xgb.Booster")
+  }
+  if (!is.character(fname)) {
+    stop("xgb.dump: second argument must be type character")
+  }
+  .Call("XGBoosterDumpModel_R", booster, fname, fmap)
+  return(TRUE)
+}
 ##--------------------------------------
 # the following are low level iteratively function, not needed
 # if you do not want to use them
--- a/wrapper/xgboost.py
+++ b/wrapper/xgboost.py
@@ -127,7 +127,7 @@ class DMatrix:

 class Booster:
    """learner class """
-    def __init__(self, params={}, cache=[], model_name = None):
+    def __init__(self, params={}, cache=[], model_file = None):
        """ constructor, param: """
        for d in cache:
            assert isinstance(d, DMatrix)
@@ -135,8 +135,8 @@ class Booster:
        self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
        self.set_param({'seed':0})
        self.set_param(params)
-        if model_name != None:
-            self.load_model(model_name)
+        if model_file != None:
+            self.load_model(model_file)
    def __del__(self):
        xglib.XGBoosterFree(self.handle)
    def set_param(self, params, pv=None):
--- a/wrapper/xgboost_R.cpp
+++ b/wrapper/xgboost_R.cpp
@@ -1,10 +1,12 @@
 #include <vector>
 #include <string>
+#include <utility>
 #include <cstring>
 #include "xgboost_wrapper.h"
 #include "xgboost_R.h"
 #include "../src/utils/utils.h"
 #include "../src/utils/omp.h"
+#include "../src/utils/matrix_csr.h"

 using namespace xgboost;

@@ -21,6 +23,63 @@ extern "C" {
    UNPROTECT(1);
    return ret;
  }
+  // build a DMatrix from a dense R matrix: the column-major doubles are copied into a row-major float buffer
+  SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing) {
+    SEXP dim = getAttrib(mat, R_DimSymbol);
+    int nrow = INTEGER(dim)[0];
+    int ncol = INTEGER(dim)[1];
+    double *din = REAL(mat);  // NOTE(review): assumes mat holds doubles; an integer matrix must be coerced by the caller
+    std::vector<float> data(static_cast<size_t>(nrow) * ncol);  // size_t: nrow * ncol can overflow int
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < nrow; ++i) {
+      for (int j = 0; j < ncol; ++j) {
+        data[static_cast<size_t>(i) * ncol + j] = din[i + static_cast<size_t>(nrow) * j];
+      }
+    }
+    void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
+    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);  // finalizer is attached to the returned external pointer
+    UNPROTECT(1);
+    return ret;
+  }
+  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
+                                SEXP indices,
+                                SEXP data) {  // builds a DMatrix from CSC arrays by regrouping the entries per row
+    const int *col_ptr = INTEGER(indptr);  // entries of column i occupy positions [col_ptr[i], col_ptr[i+1])
+    const int *row_index = INTEGER(indices);  // row index of each stored entry
+    const double *col_data = REAL(data);  // value of each stored entry
+    int ncol = length(indptr) - 1;
+    int ndata = length(data);
+    // transform into CSR format
+    std::vector<size_t> row_ptr;
+    std::vector< std::pair<unsigned, float> > csr_data;  // (column index, value) pairs written by the builder
+    utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
+    builder.InitBudget();
+    for (int i = 0; i < ncol; ++i) {  // pass 1: report the row of every entry to the builder
+      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
+        builder.AddBudget(row_index[j]);  // presumably counts entries per row -- confirm in matrix_csr.h
+      }
+    }
+    builder.InitStorage();
+    for (int i = 0; i < ncol; ++i) {  // pass 2: same traversal, now storing (column, value) under each row
+      for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
+        builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
+      }
+    }
+    utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
+    std::vector<float> row_data(ndata);
+    std::vector<unsigned> col_index(ndata);
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < ndata; ++i) {  // split the pairs into the separate index / value arrays passed below
+      col_index[i] = csr_data[i].first;
+      row_data[i] = csr_data[i].second;      
+    }
+    void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );  // NOTE(review): no row count is passed in from R; presumably trailing all-zero rows are lost -- confirm
+    SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+    R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);  // finalizer is attached to the returned external pointer
+    UNPROTECT(1);
+    return ret;
+  }
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
    XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
                        CHAR(asChar(fname)), asInteger(silent));
@@ -142,7 +201,7 @@ extern "C" {
    FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
    for (size_t i = 0; i < olen; ++i) {
      fprintf(fo, "booster[%lu]:\n", i);
-      fprintf(fo, "%s\n", res[i]);
+      fprintf(fo, "%s", res[i]);
    }
    fclose(fo);
  }
--- a/wrapper/xgboost_R.h
+++ b/wrapper/xgboost_R.h
@@ -17,6 +17,25 @@ extern "C" {
   * \return a loaded data matrix
   */
  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
+  /*!
+   * \brief create matrix content from dense matrix
+   * This assumes the matrix is stored in column major format
+   * \param mat R Matrix object
+   * \param missing which value to represent missing value
+   * \return created dmatrix
+   */
+  SEXP XGDMatrixCreateFromMat_R(SEXP mat, 
+                                SEXP missing);
+  /*! 
+   * \brief create a matrix content from CSC format
+   * \param indptr pointer to column headers
+   * \param indices row indices
+   * \param data content of the data
+   * \return created dmatrix
+   */
+  SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
+                                SEXP indices,
+                                SEXP data);
  /*!
   * \brief load a data matrix into binary file
   * \param handle a instance of data matrix
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -16,7 +16,6 @@ extern "C" {
  void* XGDMatrixCreateFromFile(const char *fname, int silent);
  /*! 
   * \brief create a matrix content from csr format
-   * \param handle a instance of data matrix
   * \param indptr pointer to row headers
   * \param indices findex
   * \param data fvalue
@@ -31,7 +30,6 @@ extern "C" {
                               size_t nelem);
  /*!
   * \brief create matrix content from dense matrix
-   * \param handle a instance of data matrix
   * \param data pointer to the data space
   * \param nrow number of rows
   * \param ncol number columns