seems ok
This commit is contained in:
parent
de83ac72ea
commit
3b12ff51b9
@ -1,30 +1,78 @@
|
||||
# include xgboost library, must set chdir=TRURE
|
||||
source('../xgboost.R', chdir=TRUE)
|
||||
source("../xgboost.R", chdir=TRUE)
|
||||
|
||||
# helper function to read libsvm format
|
||||
# this is very badly written, load in dense, and convert to sparse
|
||||
# use this only for demo purpose
|
||||
read.libsvm <- function(fname, maxcol) {
|
||||
content <- readLines(fname)
|
||||
nline <- length(content)
|
||||
label <- numeric(nline)
|
||||
mat <- matrix(0, nline, maxcol+1)
|
||||
for (i in 1:nline) {
|
||||
arr <- as.vector(strsplit(content[i], " ")[[1]])
|
||||
label[i] <- as.numeric(arr[[1]])
|
||||
for (j in 2:length(arr)) {
|
||||
kv <- strsplit(arr[j], ":")[[1]]
|
||||
# to avoid 0 index
|
||||
findex <- as.integer(kv[1]) + 1
|
||||
fvalue <- as.numeric(kv[2])
|
||||
mat[i,findex] <- fvalue
|
||||
}
|
||||
}
|
||||
mat <- as(mat, "sparseMatrix")
|
||||
return(list(label=label, data=mat))
|
||||
}
|
||||
|
||||
# test code here
|
||||
dtrain <- xgb.DMatrix("agaricus.txt.train")
|
||||
dtest <- xgb.DMatrix("agaricus.txt.test")
|
||||
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
|
||||
watchlist <- list('train'=dtrain,'test'=dtest)
|
||||
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
|
||||
watchlist <- list("eval"=dtest,"train"=dtrain)
|
||||
# training xgboost model
|
||||
bst <- xgb.train(param, dtrain, nround=3, watchlist=watchlist)
|
||||
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
|
||||
# make prediction
|
||||
preds <- xgb.predict(bst, dtest)
|
||||
labels <- xgb.getinfo(dtest, "label")
|
||||
err <- as.real(sum(as.integer(preds > 0.5) != labels)) / length(labels)
|
||||
# print error rate
|
||||
print(err)
|
||||
print(paste("error=",err))
|
||||
|
||||
# dump model
|
||||
xgb.dump(bst, "dump.raw.txt")
|
||||
# dump model with feature map
|
||||
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
|
||||
|
||||
# save dmatrix into binary buffer
|
||||
succ <- xgb.save(dtest, "dtest.buffer")
|
||||
# save model into file
|
||||
succ <- xgb.save(bst, "xgb.model")
|
||||
# load model in
|
||||
# load model and data in
|
||||
bst2 <- xgb.Booster(modelfile="xgb.model")
|
||||
dtest2 <- xgb.DMatrix("dtest.buffer")
|
||||
preds2 <- xgb.predict(bst2, dtest2)
|
||||
# print difference
|
||||
print(sum(abs(preds2-preds)))
|
||||
# assert they are the same
|
||||
stopifnot(sum(abs(preds2-preds)) == 0)
|
||||
|
||||
###
|
||||
# build dmatrix from sparseMatrix
|
||||
###
|
||||
print ('start running example of build DMatrix from R.sparseMatrix')
|
||||
csc <- read.libsvm("agaricus.txt.train", 126)
|
||||
label <- csc$label
|
||||
data <- csc$data
|
||||
dtrain <- xgb.DMatrix(data, info=list(label=label) )
|
||||
watchlist <- list("eval"=dtest,"train"=dtrain)
|
||||
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
|
||||
|
||||
###
|
||||
# build dmatrix from dense matrix
|
||||
###
|
||||
print ('start running example of build DMatrix from R.Matrix')
|
||||
mat = as.matrix(data)
|
||||
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
|
||||
watchlist <- list("eval"=dtest,"train"=dtrain)
|
||||
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
|
||||
|
||||
###
|
||||
# advanced: cutomsized loss function
|
||||
@ -33,7 +81,7 @@ print("start running example to used cutomized objective function")
|
||||
# note: for customized objective function, we leave objective as default
|
||||
# note: what we are getting is margin value in prediction
|
||||
# you must know what you are doing
|
||||
param <- list('bst:max_depth' = 2, 'bst:eta' = 1, 'silent' =1)
|
||||
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
|
||||
# user define objective function, given prediction, return gradient and second order gradient
|
||||
# this is loglikelihood loss
|
||||
logregobj <- function(preds, dtrain) {
|
||||
@ -56,15 +104,15 @@ evalerror <- function(preds, dtrain) {
|
||||
}
|
||||
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
# simply look at xgboost.py"s implementation of train
|
||||
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
|
||||
|
||||
###
|
||||
# advanced: start from a initial base prediction
|
||||
#
|
||||
print ('start running example to start from a initial prediction')
|
||||
print ("start running example to start from a initial prediction")
|
||||
# specify parameters via map, definition are same as c++ version
|
||||
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
|
||||
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
|
||||
# train xgboost for 1 round
|
||||
bst <- xgb.train( param, dtrain, 1, watchlist )
|
||||
# Note: we need the margin value instead of transformed prediction in set_base_margin
|
||||
@ -73,5 +121,5 @@ ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
|
||||
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
|
||||
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
|
||||
succ <- xgb.setinfo(dtest, "base_margin", ptest)
|
||||
print ('this is result of running from initial prediction')
|
||||
print ("this is result of running from initial prediction")
|
||||
bst <- xgb.train( param, dtrain, 1, watchlist )
|
||||
|
||||
@ -6,8 +6,10 @@ This folder provides wrapper of xgboost to other languages
|
||||
Python
|
||||
=====
|
||||
* To make the python module, type ```make``` in the root directory of project
|
||||
* Refer to the walk through example in [python-example/demo.py](python-example/demo.py)
|
||||
|
||||
R
|
||||
=====
|
||||
* To make the R wrapper, type ```make R``` in the root directory of project
|
||||
* R module need Rinternals.h, find the path in your system and add it to CPLUS_INCLUDE_PATH in Makefile
|
||||
* Refer to the walk through example in [R-example/demo.R](R-example/demo.R)
|
||||
|
||||
@ -30,6 +30,16 @@ bst.dump_model('dump.raw.txt')
|
||||
# dump model with feature map
|
||||
bst.dump_model('dump.nice.txt','featmap.txt')
|
||||
|
||||
# save dmatrix into binary buffer
|
||||
dtest.save_binary('dtest.buffer')
|
||||
bst.save_model('xgb.model')
|
||||
# load model and data in
|
||||
bst2 = xgb.Booster(model_file='xgb.model')
|
||||
dtest2 = xgb.DMatrix('dtest.buffer')
|
||||
preds2 = bst2.predict(dtest2)
|
||||
# assert they are the same
|
||||
assert np.sum(np.abs(preds2-preds)) == 0
|
||||
|
||||
###
|
||||
# build dmatrix from scipy.sparse
|
||||
print ('start running example of build DMatrix from scipy.sparse')
|
||||
@ -92,7 +102,6 @@ def evalerror(preds, dtrain):
|
||||
# simply look at xgboost.py's implementation of train
|
||||
bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
|
||||
|
||||
|
||||
###
|
||||
# advanced: start from a initial base prediction
|
||||
#
|
||||
|
||||
@ -1,18 +1,27 @@
|
||||
# depends on matrix
|
||||
succ <- require("Matrix")
|
||||
if (!succ) {
|
||||
stop("xgboost depends on Matrix library")
|
||||
}
|
||||
# load in library
|
||||
dyn.load("./libxgboostR.so")
|
||||
|
||||
# constructing DMatrix
|
||||
xgb.DMatrix <- function(data, info=list()) {
|
||||
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
|
||||
if (typeof(data) == "character") {
|
||||
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
|
||||
}else {
|
||||
} else if(is.matrix(data)) {
|
||||
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing)
|
||||
} else if(class(data) == "dgCMatrix") {
|
||||
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x)
|
||||
} else {
|
||||
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
|
||||
}
|
||||
dmat = structure(handle, class="xgb.DMatrix")
|
||||
dmat <- structure(handle, class="xgb.DMatrix")
|
||||
if (length(info) != 0) {
|
||||
for (i in 1:length(info)) {
|
||||
p = info[i]
|
||||
xgb.setinfo(dmat, names(p), p)
|
||||
p <- info[i]
|
||||
xgb.setinfo(dmat, names(p), p[[1]])
|
||||
}
|
||||
}
|
||||
return(dmat)
|
||||
@ -68,9 +77,10 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
|
||||
}
|
||||
}
|
||||
handle <- .Call("XGBoosterCreate_R", cachelist)
|
||||
.Call("XGBoosterSetParam_R", handle, "seed", "0")
|
||||
if (length(params) != 0) {
|
||||
for (i in 1:length(params)) {
|
||||
p = params[i]
|
||||
p <- params[i]
|
||||
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
|
||||
}
|
||||
}
|
||||
@ -95,8 +105,8 @@ xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, fe
|
||||
if (is.null(obj)) {
|
||||
succ <- xgb.iter.update(bst, dtrain, i-1)
|
||||
} else {
|
||||
pred = xgb.predict(bst, dtrain)
|
||||
gpair = obj(pred, dtrain)
|
||||
pred <- xgb.predict(bst, dtrain)
|
||||
gpair <- obj(pred, dtrain)
|
||||
succ <- xgb.iter.boost(bst, dtrain, gpair)
|
||||
}
|
||||
if (length(watchlist) != 0) {
|
||||
@ -139,14 +149,25 @@ xgb.save <- function(handle, fname) {
|
||||
# predict
|
||||
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.iter.update: first argument must be type xgb.Booster")
|
||||
stop("xgb.predict: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (class(dmat) != "xgb.DMatrix") {
|
||||
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
|
||||
stop("xgb.predict: second argument must be type xgb.DMatrix")
|
||||
}
|
||||
ret = .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
|
||||
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
|
||||
return(ret)
|
||||
}
|
||||
# dump model
|
||||
xgb.dump <- function(booster, fname, fmap = "") {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.dump: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (typeof(fname) != "character"){
|
||||
stop("xgb.dump: second argument must be type character")
|
||||
}
|
||||
.Call("XGBoosterDumpModel_R", booster, fname, fmap)
|
||||
return(TRUE)
|
||||
}
|
||||
##--------------------------------------
|
||||
# the following are low level iteratively function, not needed
|
||||
# if you do not want to use them
|
||||
|
||||
@ -127,7 +127,7 @@ class DMatrix:
|
||||
|
||||
class Booster:
|
||||
"""learner class """
|
||||
def __init__(self, params={}, cache=[], model_name = None):
|
||||
def __init__(self, params={}, cache=[], model_file = None):
|
||||
""" constructor, param: """
|
||||
for d in cache:
|
||||
assert isinstance(d, DMatrix)
|
||||
@ -135,8 +135,8 @@ class Booster:
|
||||
self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
|
||||
self.set_param({'seed':0})
|
||||
self.set_param(params)
|
||||
if model_name != None:
|
||||
self.load_model(model_name)
|
||||
if model_file != None:
|
||||
self.load_model(model_file)
|
||||
def __del__(self):
|
||||
xglib.XGBoosterFree(self.handle)
|
||||
def set_param(self, params, pv=None):
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <cstring>
|
||||
#include "xgboost_wrapper.h"
|
||||
#include "xgboost_R.h"
|
||||
#include "../src/utils/utils.h"
|
||||
#include "../src/utils/omp.h"
|
||||
#include "../src/utils/matrix_csr.h"
|
||||
|
||||
using namespace xgboost;
|
||||
|
||||
@ -21,6 +23,63 @@ extern "C" {
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
SEXP missing) {
|
||||
SEXP dim = getAttrib(mat, R_DimSymbol);
|
||||
int nrow = INTEGER(dim)[0];
|
||||
int ncol = INTEGER(dim)[1];
|
||||
double *din = REAL(mat);
|
||||
std::vector<float> data(nrow * ncol);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < nrow; ++i) {
|
||||
for (int j = 0; j < ncol; ++j) {
|
||||
data[i * ncol +j] = din[i + nrow * j];
|
||||
}
|
||||
}
|
||||
void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
|
||||
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data) {
|
||||
const int *col_ptr = INTEGER(indptr);
|
||||
const int *row_index = INTEGER(indices);
|
||||
const double *col_data = REAL(data);
|
||||
int ncol = length(indptr) - 1;
|
||||
int ndata = length(data);
|
||||
// transform into CSR format
|
||||
std::vector<size_t> row_ptr;
|
||||
std::vector< std::pair<unsigned, float> > csr_data;
|
||||
utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
|
||||
builder.InitBudget();
|
||||
for (int i = 0; i < ncol; ++i) {
|
||||
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.AddBudget(row_index[j]);
|
||||
}
|
||||
}
|
||||
builder.InitStorage();
|
||||
for (int i = 0; i < ncol; ++i) {
|
||||
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
|
||||
builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
|
||||
}
|
||||
}
|
||||
utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
|
||||
std::vector<float> row_data(ndata);
|
||||
std::vector<unsigned> col_index(ndata);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = 0; i < ndata; ++i) {
|
||||
col_index[i] = csr_data[i].first;
|
||||
row_data[i] = csr_data[i].second;
|
||||
}
|
||||
void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
|
||||
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
|
||||
XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(fname)), asInteger(silent));
|
||||
@ -142,7 +201,7 @@ extern "C" {
|
||||
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
|
||||
for (size_t i = 0; i < olen; ++i) {
|
||||
fprintf(fo, "booster[%lu]:\n", i);
|
||||
fprintf(fo, "%s\n", res[i]);
|
||||
fprintf(fo, "%s", res[i]);
|
||||
}
|
||||
fclose(fo);
|
||||
}
|
||||
|
||||
@ -17,6 +17,25 @@ extern "C" {
|
||||
* \return a loaded data matrix
|
||||
*/
|
||||
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
|
||||
/*!
|
||||
* \brief create matrix content from dense matrix
|
||||
* This assumes the matrix is stored in column major format
|
||||
* \param data R Matrix object
|
||||
* \param missing which value to represent missing value
|
||||
* \return created dmatrix
|
||||
*/
|
||||
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
|
||||
SEXP missing);
|
||||
/*!
|
||||
* \brief create a matrix content from CSC format
|
||||
* \param indptr pointer to column headers
|
||||
* \param indices row indices
|
||||
* \param data content of the data
|
||||
* \return created dmatrix
|
||||
*/
|
||||
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
|
||||
SEXP indices,
|
||||
SEXP data);
|
||||
/*!
|
||||
* \brief load a data matrix into binary file
|
||||
* \param handle a instance of data matrix
|
||||
|
||||
@ -16,7 +16,6 @@ extern "C" {
|
||||
void* XGDMatrixCreateFromFile(const char *fname, int silent);
|
||||
/*!
|
||||
* \brief create a matrix content from csr format
|
||||
* \param handle a instance of data matrix
|
||||
* \param indptr pointer to row headers
|
||||
* \param indices findex
|
||||
* \param data fvalue
|
||||
@ -31,7 +30,6 @@ extern "C" {
|
||||
size_t nelem);
|
||||
/*!
|
||||
* \brief create matrix content from dense matrix
|
||||
* \param handle a instance of data matrix
|
||||
* \param data pointer to the data space
|
||||
* \param nrow number of rows
|
||||
* \param ncol number columns
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user