diff --git a/wrapper/R-example/demo.R b/wrapper/R-example/demo.R index 076dc79a9..7922b768a 100644 --- a/wrapper/R-example/demo.R +++ b/wrapper/R-example/demo.R @@ -1,30 +1,78 @@ # include xgboost library, must set chdir=TRURE -source('../xgboost.R', chdir=TRUE) +source("../xgboost.R", chdir=TRUE) + +# helper function to read libsvm format +# this is very badly written, load in dense, and convert to sparse +# use this only for demo purpose +read.libsvm <- function(fname, maxcol) { + content <- readLines(fname) + nline <- length(content) + label <- numeric(nline) + mat <- matrix(0, nline, maxcol+1) + for (i in 1:nline) { + arr <- as.vector(strsplit(content[i], " ")[[1]]) + label[i] <- as.numeric(arr[[1]]) + for (j in 2:length(arr)) { + kv <- strsplit(arr[j], ":")[[1]] + # to avoid 0 index + findex <- as.integer(kv[1]) + 1 + fvalue <- as.numeric(kv[2]) + mat[i,findex] <- fvalue + } + } + mat <- as(mat, "sparseMatrix") + return(list(label=label, data=mat)) +} # test code here dtrain <- xgb.DMatrix("agaricus.txt.train") dtest <- xgb.DMatrix("agaricus.txt.test") -param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic') -watchlist <- list('train'=dtrain,'test'=dtest) +param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic") +watchlist <- list("eval"=dtest,"train"=dtrain) # training xgboost model -bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist) +bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) # make prediction preds <- xgb.predict(bst, dtest) labels <- xgb.getinfo(dtest, "label") err <- as.real(sum(as.integer(preds > 0.5) != labels)) / length(labels) # print error rate -print(err) +print(paste("error=",err)) + +# dump model +xgb.dump(bst, "dump.raw.txt") +# dump model with feature map +xgb.dump(bst, "dump.nice.txt", "featmap.txt") # save dmatrix into binary buffer succ <- xgb.save(dtest, "dtest.buffer") # save model into file succ <- xgb.save(bst, "xgb.model") -# load model in +# 
load model and data in bst2 <- xgb.Booster(modelfile="xgb.model") dtest2 <- xgb.DMatrix("dtest.buffer") preds2 <- xgb.predict(bst2, dtest2) -# print difference -print(sum(abs(preds2-preds))) +# assert they are the same +stopifnot(sum(abs(preds2-preds)) == 0) + +### +# build dmatrix from sparseMatrix +### +print ('start running example of build DMatrix from R.sparseMatrix') +csc <- read.libsvm("agaricus.txt.train", 126) +label <- csc$label +data <- csc$data +dtrain <- xgb.DMatrix(data, info=list(label=label) ) +watchlist <- list("eval"=dtest,"train"=dtrain) +bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) + +### +# build dmatrix from dense matrix +### +print ('start running example of build DMatrix from R.Matrix') +mat = as.matrix(data) +dtrain <- xgb.DMatrix(mat, info=list(label=label) ) +watchlist <- list("eval"=dtest,"train"=dtrain) +bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist) ### # advanced: cutomsized loss function @@ -33,7 +81,7 @@ print("start running example to used cutomized objective function") # note: for customized objective function, we leave objective as default # note: what we are getting is margin value in prediction # you must know what you are doing -param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1) +param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1) # user define objective function, given prediction, return gradient and second order gradient # this is loglikelihood loss logregobj <- function(preds, dtrain) { @@ -56,15 +104,15 @@ evalerror <- function(preds, dtrain) { } # training with customized objective, we can also do step by step training -# simply look at xgboost.py's implementation of train +# simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror) ### # advanced: start from a initial base prediction # -print ('start running example to start from a initial prediction') +print ("start running example to start from 
a initial prediction") # specify parameters via map, definition are same as c++ version -param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic') +param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic") # train xgboost for 1 round bst <- xgb.train( param, dtrain, 1, watchlist ) # Note: we need the margin value instead of transformed prediction in set_base_margin @@ -73,5 +121,5 @@ ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE) ptest <- xgb.predict(bst, dtest, outputmargin=TRUE) succ <- xgb.setinfo(dtrain, "base_margin", ptrain) succ <- xgb.setinfo(dtest, "base_margin", ptest) -print ('this is result of running from initial prediction') +print ("this is result of running from initial prediction") bst <- xgb.train( param, dtrain, 1, watchlist ) diff --git a/wrapper/README.md b/wrapper/README.md index 2d62fa686..d6caa7c6f 100644 --- a/wrapper/README.md +++ b/wrapper/README.md @@ -6,8 +6,10 @@ This folder provides wrapper of xgboost to other languages Python ===== * To make the python module, type ```make``` in the root directory of project +* Refer to the walk through example in [python-example/demo.py](python-example/demo.py) R ===== * To make the R wrapper, type ```make R``` in the root directory of project * R module need Rinternals.h, find the path in your system and add it to CPLUS_INCLUDE_PATH in Makefile +* Refer to the walk through example in [R-example/demo.R](R-example/demo.R) diff --git a/wrapper/python-example/demo.py b/wrapper/python-example/demo.py index 2b66e92a7..52d565456 100755 --- a/wrapper/python-example/demo.py +++ b/wrapper/python-example/demo.py @@ -30,6 +30,16 @@ bst.dump_model('dump.raw.txt') # dump model with feature map bst.dump_model('dump.nice.txt','featmap.txt') +# save dmatrix into binary buffer +dtest.save_binary('dtest.buffer') +bst.save_model('xgb.model') +# load model and data in +bst2 = xgb.Booster(model_file='xgb.model') +dtest2 = xgb.DMatrix('dtest.buffer') 
+preds2 = bst2.predict(dtest2) +# assert they are the same +assert np.sum(np.abs(preds2-preds)) == 0 + ### # build dmatrix from scipy.sparse print ('start running example of build DMatrix from scipy.sparse') @@ -92,7 +102,6 @@ def evalerror(preds, dtrain): # simply look at xgboost.py's implementation of train bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror) - ### # advanced: start from a initial base prediction # diff --git a/wrapper/xgboost.R b/wrapper/xgboost.R index 7ad27b527..962376027 100644 --- a/wrapper/xgboost.R +++ b/wrapper/xgboost.R @@ -1,18 +1,27 @@ +# depends on matrix +succ <- require("Matrix") +if (!succ) { + stop("xgboost depends on Matrix library") +} # load in library dyn.load("./libxgboostR.so") # constructing DMatrix -xgb.DMatrix <- function(data, info=list()) { +xgb.DMatrix <- function(data, info=list(), missing=0.0) { if (typeof(data) == "character") { handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE)) - }else { + } else if(is.matrix(data)) { + handle <- .Call("XGDMatrixCreateFromMat_R", data, missing) + } else if(class(data) == "dgCMatrix") { + handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x) + } else { stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data))) } - dmat = structure(handle, class="xgb.DMatrix") + dmat <- structure(handle, class="xgb.DMatrix") if (length(info) != 0) { for (i in 1:length(info)) { - p = info[i] - xgb.setinfo(dmat, names(p), p) + p <- info[i] + xgb.setinfo(dmat, names(p), p[[1]]) } } return(dmat) @@ -68,9 +77,10 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { } } handle <- .Call("XGBoosterCreate_R", cachelist) + .Call("XGBoosterSetParam_R", handle, "seed", "0") if (length(params) != 0) { for (i in 1:length(params)) { - p = params[i] + p <- params[i] .Call("XGBoosterSetParam_R", handle, names(p), as.character(p)) } } @@ -95,8 +105,8 @@ xgb.train <- function(params, dtrain, nrounds=10, 
watchlist=list(), obj=NULL, fe if (is.null(obj)) { succ <- xgb.iter.update(bst, dtrain, i-1) } else { - pred = xgb.predict(bst, dtrain) - gpair = obj(pred, dtrain) + pred <- xgb.predict(bst, dtrain) + gpair <- obj(pred, dtrain) succ <- xgb.iter.boost(bst, dtrain, gpair) } if (length(watchlist) != 0) { @@ -139,14 +149,25 @@ xgb.save <- function(handle, fname) { # predict xgb.predict <- function(booster, dmat, outputmargin = FALSE) { if (class(booster) != "xgb.Booster") { - stop("xgb.iter.update: first argument must be type xgb.Booster") + stop("xgb.predict: first argument must be type xgb.Booster") } if (class(dmat) != "xgb.DMatrix") { - stop("xgb.iter.update: second argument must be type xgb.DMatrix") + stop("xgb.predict: second argument must be type xgb.DMatrix") } - ret = .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin)) + ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin)) return(ret) } +# dump model +xgb.dump <- function(booster, fname, fmap = "") { + if (class(booster) != "xgb.Booster") { + stop("xgb.dump: first argument must be type xgb.Booster") + } + if (typeof(fname) != "character"){ + stop("xgb.dump: second argument must be type character") + } + .Call("XGBoosterDumpModel_R", booster, fname, fmap) + return(TRUE) +} ##-------------------------------------- # the following are low level iteratively function, not needed # if you do not want to use them diff --git a/wrapper/xgboost.py b/wrapper/xgboost.py index f6c26eac9..40f34471e 100644 --- a/wrapper/xgboost.py +++ b/wrapper/xgboost.py @@ -127,7 +127,7 @@ class DMatrix: class Booster: """learner class """ - def __init__(self, params={}, cache=[], model_name = None): + def __init__(self, params={}, cache=[], model_file = None): """ constructor, param: """ for d in cache: assert isinstance(d, DMatrix) @@ -135,8 +135,8 @@ class Booster: self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache))) self.set_param({'seed':0}) self.set_param(params) - if 
model_name != None: - self.load_model(model_name) + if model_file != None: + self.load_model(model_file) def __del__(self): xglib.XGBoosterFree(self.handle) def set_param(self, params, pv=None): diff --git a/wrapper/xgboost_R.cpp b/wrapper/xgboost_R.cpp index 7b7b232fe..98f31d7c2 100644 --- a/wrapper/xgboost_R.cpp +++ b/wrapper/xgboost_R.cpp @@ -1,10 +1,12 @@ #include #include +#include #include #include "xgboost_wrapper.h" #include "xgboost_R.h" #include "../src/utils/utils.h" #include "../src/utils/omp.h" +#include "../src/utils/matrix_csr.h" using namespace xgboost; @@ -21,6 +23,63 @@ extern "C" { UNPROTECT(1); return ret; } + SEXP XGDMatrixCreateFromMat_R(SEXP mat, + SEXP missing) { + SEXP dim = getAttrib(mat, R_DimSymbol); + int nrow = INTEGER(dim)[0]; + int ncol = INTEGER(dim)[1]; + double *din = REAL(mat); + std::vector data(nrow * ncol); + #pragma omp parallel for schedule(static) + for (int i = 0; i < nrow; ++i) { + for (int j = 0; j < ncol; ++j) { + data[i * ncol +j] = din[i + nrow * j]; + } + } + void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing)); + SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); + UNPROTECT(1); + return ret; + } + SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, + SEXP indices, + SEXP data) { + const int *col_ptr = INTEGER(indptr); + const int *row_index = INTEGER(indices); + const double *col_data = REAL(data); + int ncol = length(indptr) - 1; + int ndata = length(data); + // transform into CSR format + std::vector row_ptr; + std::vector< std::pair > csr_data; + utils::SparseCSRMBuilder< std::pair > builder(row_ptr, csr_data); + builder.InitBudget(); + for (int i = 0; i < ncol; ++i) { + for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.AddBudget(row_index[j]); + } + } + builder.InitStorage(); + for (int i = 0; i < ncol; ++i) { + for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) { + builder.PushElem(row_index[j], std::make_pair(i, 
col_data[j])); + } + } + utils::Assert(csr_data.size() == static_cast(ndata), "BUG CreateFromCSC"); + std::vector row_data(ndata); + std::vector col_index(ndata); + #pragma omp parallel for schedule(static) + for (int i = 0; i < ndata; ++i) { + col_index[i] = csr_data[i].first; + row_data[i] = csr_data[i].second; + } + void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata ); + SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); + UNPROTECT(1); + return ret; + } void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) { XGDMatrixSaveBinary(R_ExternalPtrAddr(handle), CHAR(asChar(fname)), asInteger(silent)); @@ -142,7 +201,7 @@ extern "C" { FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w"); for (size_t i = 0; i < olen; ++i) { fprintf(fo, "booster[%lu]:\n", i); - fprintf(fo, "%s\n", res[i]); + fprintf(fo, "%s", res[i]); } fclose(fo); } diff --git a/wrapper/xgboost_R.h b/wrapper/xgboost_R.h index c572b03ca..8e8b2728b 100644 --- a/wrapper/xgboost_R.h +++ b/wrapper/xgboost_R.h @@ -17,6 +17,25 @@ extern "C" { * \return a loaded data matrix */ SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent); + /*! + * \brief create matrix content from dense matrix + * This assumes the matrix is stored in column major format + * \param mat R Matrix object + * \param missing which value to represent missing value + * \return created dmatrix + */ + SEXP XGDMatrixCreateFromMat_R(SEXP mat, + SEXP missing); + /*! + * \brief create a matrix content from CSC format + * \param indptr pointer to column headers + * \param indices row indices + * \param data content of the data + * \return created dmatrix + */ + SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, + SEXP indices, + SEXP data); /*! 
* \brief load a data matrix into binary file * \param handle a instance of data matrix diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 3772cfd95..6b3e669dd 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -16,7 +16,6 @@ extern "C" { void* XGDMatrixCreateFromFile(const char *fname, int silent); /*! * \brief create a matrix content from csr format - * \param handle a instance of data matrix * \param indptr pointer to row headers * \param indices findex * \param data fvalue @@ -31,7 +30,6 @@ extern "C" { size_t nelem); /*! * \brief create matrix content from dense matrix - * \param handle a instance of data matrix * \param data pointer to the data space * \param nrow number of rows * \param ncol number columns