Merge pull request #1 from tqchen/master

updating fork to current master
giuliohome · 2014-08-27 20:17:44 +02:00 · commit ce1803a40c
60 changed files with 10097 additions and 352 deletions

.gitignore

@ -16,7 +16,6 @@
*conf
*buffer
*model
xgboost
*pyc
*train
*test
@ -24,3 +23,20 @@ xgboost
*rar
*vali
*data
*sdf
Release
*exe*
*exp
ipch
*.filters
*.user
*log
Debug
*suo
*test*
.Rhistory
*.dll
*i386
*x64
*dump
*save


@ -3,18 +3,21 @@ export CXX = g++
export LDFLAGS= -pthread -lm
# note for R module
# add include path to Rinternals.h here
export CPLUS_INCLUDE_PATH=/usr/share/R/include
ifeq ($(no_omp),1)
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -DDISABLE_OPENMP
else
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fopenmp
endif
# expose these flags to R CMD SHLIB
export PKG_CPPFLAGS = $(CFLAGS) -DXGBOOST_CUSTOMIZE_ERROR_
# specify tensor path
BIN = xgboost
OBJ =
SLIB = wrapper/libxgboostwrapper.so wrapper/libxgboostR.so
SLIB = wrapper/libxgboostwrapper.so
RLIB = wrapper/libxgboostR.so
.PHONY: clean all R
all: $(BIN) wrapper/libxgboostwrapper.so
@ -31,6 +34,9 @@ $(BIN) :
$(SLIB) :
$(CXX) $(CFLAGS) -fPIC $(LDFLAGS) -shared -o $@ $(filter %.cpp %.o %.c, $^)
$(RLIB) :
R CMD SHLIB -c -o $@ $(filter %.cpp %.o %.c, $^)
$(OBJ) :
$(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
@ -38,4 +44,4 @@ install:
cp -f -r $(BIN) $(INSTALL_PATH)
clean:
$(RM) $(OBJ) $(BIN) $(SLIB) *~ */*~ */*/*~
$(RM) $(OBJ) $(BIN) $(SLIB) $(RLIB) *~ */*~ */*/*~

R-package/DESCRIPTION

@ -0,0 +1,16 @@
Package: xgboost
Type: Package
Title: R wrapper of xgboost
Version: 0.3-0
Date: 2014-08-23
Author: Tianqi Chen
Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>
Description: xgboost
License: See LICENSE file
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
Depends:
R (>= 2.0.2)
Imports:
Matrix (>= 1.1-0),
methods

R-package/LICENSE

@ -0,0 +1,13 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

R-package/NAMESPACE

@ -0,0 +1,10 @@
importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
export(xgboost)
export(xgb.DMatrix)
export(xgb.getinfo)
exportMethods(predict)
export(xgb.train)
export(xgb.save)
export(xgb.load)
export(xgb.dump)


@ -0,0 +1,16 @@
#' @export
setClass("xgb.Booster")
#' @export
setMethod("predict",
signature = "xgb.Booster",
definition = function(object, newdata, outputmargin = FALSE)
{
if (class(newdata) != "xgb.DMatrix") {
newdata = xgb.DMatrix(newdata)
}
ret <- .Call("XGBoosterPredict_R", object, newdata,
as.integer(outputmargin), PACKAGE="xgboost")
return(ret)
})
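For reference, a minimal usage sketch of the predict method above (not part of this commit); it assumes the agaricus demo files from this repository are in the working directory:

```r
# predict accepts an xgb.DMatrix directly, or anything xgb.DMatrix() can wrap
require(xgboost)
bst   <- xgboost(file = 'agaricus.txt.train', max_depth = 2, eta = 1,
                 silent = 1, objective = 'binary:logistic')
dtest <- xgb.DMatrix('agaricus.txt.test')
prob   <- predict(bst, dtest)                      # transformed prediction
margin <- predict(bst, dtest, outputmargin = TRUE) # raw margin before the logistic transform
```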

R-package/R/utils.R

@ -0,0 +1,128 @@
# depends on matrix
.onLoad <- function(libname, pkgname) {
library.dynam("xgboost", pkgname, libname);
}
.onUnload <- function(libpath) {
library.dynam.unload("xgboost", libpath);
}
# set information into dmatrix; this mutates the dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name == "label") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "weight") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "group") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
return(TRUE)
}
stop(paste("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE="xgboost")
.Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character"){
stop("xgb.Booster: modelfile must be character");
}
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE="xgboost")
}
return(structure(handle, class="xgb.Booster"))
}
# predict, deprecated
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
return(ret)
}
##--------------------------------------
# the following are low-level functions for iterative training;
# they are not needed unless you want step-by-step control
#---------------------------------------
# update booster with dtrain for one iteration
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
return(TRUE)
}
# update booster for one iteration with customized gradient statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
return(TRUE)
}
# evaluate the booster for one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
evnames <- list()
if (length(watchlist) != 0) {
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
return(msg)
}
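For reference, a minimal sketch (not part of this commit) of driving the low-level iteration helpers above directly. They are not exported in the NAMESPACE, so this assumes the package's R sources have been loaded; file paths refer to the agaricus demo data:

```r
# step-by-step training: create a booster, then update and evaluate per round
dtrain <- xgb.DMatrix('agaricus.txt.train')
dtest  <- xgb.DMatrix('agaricus.txt.test')
bst <- xgb.Booster(params = list(max_depth = 2, eta = 1, objective = 'binary:logistic'),
                   cachelist = list(dtrain, dtest))
for (i in 1:2) {
  xgb.iter.update(bst, dtrain, i - 1)  # one boosting round
  cat(xgb.iter.eval(bst, list(train = dtrain, eval = dtest), i - 1), '\n')
}
```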

R-package/R/xgb.DMatrix.R

@ -0,0 +1,22 @@
# constructing DMatrix
xgb.DMatrix <- function(data, missing=0.0, ...) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
} else if(is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
} else if(class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
} else {
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
}
dmat <- structure(handle, class="xgb.DMatrix")
info = list(...)
if (length(info)==0)
return(dmat)
for (i in 1:length(info)) {
p = info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
return(dmat)
}
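A minimal sketch (not part of this commit) exercising the three construction paths above; extra named arguments such as label are forwarded to xgb.setinfo:

```r
require(xgboost)
require(Matrix)
x  <- matrix(rnorm(20), 10, 2)
d1 <- xgb.DMatrix(x, label = rbinom(10, 1, 0.5))  # dense matrix, label set via ...
d2 <- xgb.DMatrix(as(x, "dgCMatrix"))             # sparse CSC matrix
d3 <- xgb.DMatrix('agaricus.txt.train')           # local file (assumes the demo data exists)
```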

R-package/R/xgb.dump.R

@ -0,0 +1,11 @@
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
if (class(booster) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
}
if (typeof(fname) != "character"){
stop("xgb.dump: second argument must be type character")
}
.Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
return(TRUE)
}

R-package/R/xgb.getinfo.R

@ -0,0 +1,16 @@
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
return(ret)
}

R-package/R/xgb.load.R

@ -0,0 +1,5 @@
xgb.load <- function(modelfile) {
if (is.null(modelfile))
stop('xgb.load: modelfile cannot be NULL')
xgb.Booster(modelfile=modelfile)
}

R-package/R/xgb.save.R

@ -0,0 +1,16 @@
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character")
}
if (class(handle) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
return(TRUE)
}
if (class(handle) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
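A minimal sketch (not part of this commit) of the persistence helpers, assuming bst is a trained xgb.Booster and dtrain an xgb.DMatrix as in the demos below:

```r
xgb.save(bst, 'xgb.model')         # serialize the booster
xgb.save(dtrain, 'dtrain.buffer')  # the same helper saves a DMatrix as a binary buffer
bst2 <- xgb.load('xgb.model')      # reload the model
xgb.dump(bst2, 'dump.raw.txt')     # write a human-readable text dump of the trees
```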

R-package/R/xgb.train.R

@ -0,0 +1,38 @@
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list");
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
}
bst <- xgb.Booster(params, append(watchlist,dtrain))
for (i in 1:nrounds) {
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i-1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i-1)
cat(msg); cat("\n")
} else {
cat("["); cat(i); cat("]");
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
cat(":"); cat(ret$value)
}
cat("\n")
}
}
}
return(bst)
}
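A minimal sketch (not part of this commit) of calling xgb.train with a named watchlist, which prints one evaluation line per round; file paths refer to the agaricus demo data:

```r
require(xgboost)
dtrain <- xgb.DMatrix('agaricus.txt.train')
dtest  <- xgb.DMatrix('agaricus.txt.test')
param  <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
bst <- xgb.train(param, dtrain, nrounds = 2,
                 watchlist = list(train = dtrain, eval = dtest))
```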

R-package/R/xgboost.R

@ -0,0 +1,49 @@
# Main function for xgboost-package
xgboost = function(x=NULL,y=NULL,DMatrix=NULL, file=NULL, validation=NULL,
nrounds=10, obj=NULL, feval=NULL, margin=NULL, verbose = T, ...)
{
if (!is.null(DMatrix))
dtrain = DMatrix
else
{
if (is.null(x) && is.null(y))
{
if (is.null(file))
stop('xgboost needs input data: an R object, a local file, or a DMatrix object.')
dtrain = xgb.DMatrix(file)
}
else
dtrain = xgb.DMatrix(x, label=y)
if (!is.null(margin))
{
succ <- xgb.setinfo(dtrain, "base_margin", margin)
if (!succ)
warning('Attempt to use margin failed.')
}
}
params = list(...)
watchlist=list()
if (verbose)
{
if (!is.null(validation))
{
if (class(validation)!='xgb.DMatrix')
dtest = xgb.DMatrix(validation)
else
dtest = validation
watchlist = list(eval=dtest,train=dtrain)
}
else
watchlist = list(train=dtrain)
}
bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)
return(bst)
}

R-package/README.md

@ -0,0 +1,10 @@
This subfolder contains an experimental version of the R package.
It is not yet ready.
Installation:
```r
require(devtools)
install_github('xgboost','tqchen',subdir='R-package')
```
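After installation, a minimal quick-start sketch (assuming the agaricus demo files that ship with this repository are in the working directory):

```r
require(xgboost)
bst  <- xgboost(file = 'agaricus.txt.train', max_depth = 2, eta = 1,
                silent = 1, objective = 'binary:logistic')
pred <- predict(bst, 'agaricus.txt.test')
```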

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -0,0 +1,133 @@
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written: it loads in dense form, then converts to sparse
# use this only for demo purposes
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
content = readLines(fname)
nline = length(content)
label = numeric(nline)
mat = matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr = as.vector(strsplit(content[i], " ")[[1]])
label[i] = as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv = strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex = as.integer(kv[1]) + 1
fvalue = as.numeric(kv[2])
mat[i,findex] = fvalue
}
}
mat = as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
############################
# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
############################
# Directly read in local file
dtrain = xgb.DMatrix('agaricus.txt.train')
class(dtrain)
# read file in R
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x as Sparse Matrix
class(x)
dtrain = xgb.DMatrix(x, label=y)
# x as dense matrix
dense.x = as.matrix(x)
dtrain = xgb.DMatrix(dense.x, label=y)
############################
# Test xgboost with local file, sparse matrix and dense matrix in R.
############################
# Test with DMatrix object
bst = xgboost(DMatrix=dtrain, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with local file
bst = xgboost(file='agaricus.txt.train', max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with Sparse Matrix
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with dense Matrix
bst = xgboost(x = dense.x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic')
# Test with validation set
bst = xgboost(file='agaricus.txt.train', validation='agaricus.txt.test',
max_depth=2, eta=1, silent=1, objective='binary:logistic')
############################
# Test predict
############################
# Prediction with DMatrix object
dtest = xgb.DMatrix('agaricus.txt.test')
pred = predict(bst, dtest)
# Prediction with local test file
pred = predict(bst, 'agaricus.txt.test')
# Prediction with Sparse Matrix
csc = read.libsvm("agaricus.txt.test", 126)
test.y = csc$label
test.x = csc$data
pred = predict(bst, test.x)
# Extract labels with xgb.getinfo
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
############################
# Save and load model to hard disk
############################
# save model to binary local file
xgb.save(bst, 'model.save')
# load binary model to R
bst = xgb.load('model.save')
pred = predict(bst, test.x)
# save model to text file
xgb.dump(bst, 'model.dump')
############################
# Customized objective and evaluation function
############################
# user-defined objective function: given predictions, return gradient and second-order gradient
# this is log-likelihood loss
logregobj = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
preds = 1.0 / (1.0 + exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function: returns list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may make the built-in evaluation metrics misbehave:
# for logistic loss, for example, the prediction is the score before the logistic transformation,
# while the built-in error metric assumes the input has already been transformed
# keep this in mind when customizing; you may also need to write a customized evaluation function
evalerror = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
bst = xgboost(x = x, y = y, max_depth=2, eta=1, silent=1, objective='binary:logistic',
obj=logregobj, feval=evalerror)


@ -0,0 +1,127 @@
# load xgboost library
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written: it loads in dense form, then converts to sparse
# use this only for demo purposes
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)
###
# build dmatrix from sparseMatrix
###
print('start running example of building DMatrix from R sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, label=label)
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# build dmatrix from dense matrix
###
print('start running example of building DMatrix from a dense R matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, label=label)
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# advanced: customized loss function
#
print("start running example of using a customized objective function")
# note: for a customized objective function, we leave objective at its default
# note: what we get back as the prediction is the margin value
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user-defined objective function: given predictions, return gradient and second-order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function: returns list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may make the built-in evaluation metrics misbehave:
# for logistic loss, for example, the prediction is the score before the logistic transformation,
# while the built-in error metric assumes the input has already been transformed
# keep this in mind when customizing; you may also need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with a customized objective; we can also do step-by-step training
# simply look at xgboost.py's implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from an initial base prediction
#
print("start running example of starting from an initial prediction")
# specify parameters via list; definitions are the same as in the C++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: base_margin needs the margin value, not the transformed prediction
# predicting with outputmargin=TRUE always gives margin values before the logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )


@ -0,0 +1,103 @@
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written: it loads in dense form, then converts to sparse
# use this only for demo purposes
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
content = readLines(fname)
nline = length(content)
label = numeric(nline)
mat = matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr = as.vector(strsplit(content[i], " ")[[1]])
label[i] = as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv = strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex = as.integer(kv[1]) + 1
fvalue = as.numeric(kv[2])
mat[i,findex] = fvalue
}
}
mat = as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# Parameter setting
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist = list("eval"=dtest,"train"=dtrain)
###########################
# Train from local file
###########################
# Training
bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
###########################
# Train from R object
###########################
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x as Sparse Matrix
class(x)
# Training
bst = xgboost(x,y,params=param,watchlist=watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
# Training with dense matrix
x = as.matrix(x)
bst = xgboost(x,y,params=param,watchlist=watchlist)
###########################
# Train with customization
###########################
# user-defined objective function: given predictions, return gradient and second-order gradient
# this is log-likelihood loss
logregobj = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
preds = 1.0 / (1.0 + exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user-defined evaluation function: returns list(metric="metric-name", value="metric-value")
# NOTE: with a customized loss function, the default prediction value is the margin,
# which may make the built-in evaluation metrics misbehave:
# for logistic loss, for example, the prediction is the score before the logistic transformation,
# while the built-in error metric assumes the input has already been transformed
# keep this in mind when customizing; you may also need to write a customized evaluation function
evalerror = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
bst = xgboost(x,y,params=param,watchlist=watchlist,obj=logregobj, feval=evalerror)
############################
# Train with previous result
############################
bst = xgboost(x,y,params=param,watchlist=watchlist)
pred = predict(bst, 'agaricus.txt.train', outputmargin=TRUE)
bst2 = xgboost(x,y,params=param,watchlist=watchlist,margin=pred)


@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

R-package/src/Makevars

@ -0,0 +1,28 @@
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
ifeq ($(no_omp),1)
PKG_CPPFLAGS += -DDISABLE_OPENMP
endif
CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)
.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)
xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp
$(CXXOBJ) :
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:
rm -rf *.so *.o *~ *.dll


@ -0,0 +1,32 @@
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fopenmp -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
# add flag to build native code even in cross compiler
ifeq "$(WIN)" "64"
PKG_CPPFLAGS += -m64
endif
ifeq ($(no_omp),1)
PKG_CPPFLAGS += -DDISABLE_OPENMP
endif
CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)
.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)
xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp
$(CXXOBJ) :
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:
rm -rf *.so *.o *~ *.dll

R-package/src/xgboost_R.cpp

@ -0,0 +1,221 @@
#include <vector>
#include <string>
#include <utility>
#include <cstring>
#include "xgboost_R.h"
#include "../../wrapper/xgboost_wrapper.h"
#include "../../src/utils/utils.h"
#include "../../src/utils/omp.h"
#include "../../src/utils/matrix_csr.h"
using namespace xgboost;
// implements error handling
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
error("%s", msg);
}
void HandleCheckError(const char *msg) {
error("%s", msg);
}
} // namespace utils
} // namespace xgboost
extern "C" {
void _DMatrixFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing) {
SEXP dim = getAttrib(mat, R_DimSymbol);
int nrow = INTEGER(dim)[0];
int ncol = INTEGER(dim)[1];
double *din = REAL(mat);
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
}
}
void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
const int *col_ptr = INTEGER(indptr);
const int *row_index = INTEGER(indices);
const double *col_data = REAL(data);
int ncol = length(indptr) - 1;
int ndata = length(data);
// transform into CSR format
std::vector<bst_ulong> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(row_index[j]);
}
}
builder.InitStorage();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
}
}
utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
std::vector<float> row_data(ndata);
std::vector<unsigned> col_index(ndata);
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
col_index[i] = csr_data[i].first;
row_data[i] = csr_data[i].second;
}
void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
CHAR(asChar(fname)), asInteger(silent));
}
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
int len = length(array);
const char *name = CHAR(asChar(field));
if (!strcmp("group", name)) {
std::vector<unsigned> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
return;
}
{
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = REAL(array)[i];
}
XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)),
&vec[0], len);
}
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGBoosterFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGBoosterCreate_R(SEXP dmats) {
int len = length(dmats);
std::vector<void*> dvec;
for (int i = 0; i < len; ++i){
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
void *handle = XGBoosterCreate(&dvec[0], dvec.size());
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGBoosterSetParam(R_ExternalPtrAddr(handle),
CHAR(asChar(name)),
CHAR(asChar(val)));
}
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain));
}
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
int len = length(grad);
std::vector<float> tgrad(len), thess(len);
#pragma omp parallel for schedule(static)
for (int j = 0; j < len; ++j) {
tgrad[j] = REAL(grad)[j];
thess[j] = REAL(hess)[j];
}
XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dtrain),
&tgrad[0], &thess[0], len);
}
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
utils::Check(length(dmats) == length(evnames), "dmats and evnames must have same length");
int len = length(dmats);
std::vector<void*> vec_dmats;
std::vector<std::string> vec_names;
std::vector<const char*> vec_sptr;
for (int i = 0; i < len; ++i) {
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
}
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
&vec_dmats[0], &vec_sptr[0], len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
bst_ulong olen;
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
&olen);
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
for (size_t i = 0; i < olen; ++i) {
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
fprintf(fo, "%s", res[i]);
}
fclose(fo);
}
}

R-package/src/xgboost_R.h

@ -0,0 +1,124 @@
#ifndef XGBOOST_WRAPPER_R_H_
#define XGBOOST_WRAPPER_R_H_
/*!
* \file xgboost_R.h
* \author Tianqi Chen
* \brief R wrapper of xgboost
*/
extern "C" {
#include <Rinternals.h>
}
extern "C" {
/*!
* \brief load a data matrix
* \param fname name of the file to load
* \param silent whether to print messages
* \return a loaded data matrix
*/
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
/*!
* \brief create matrix content from dense matrix
* This assumes the matrix is stored in column major format
* \param data R Matrix object
* \param missing which value to represent missing value
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
/*!
* \brief save a data matrix into a binary file
* \param handle an instance of data matrix
* \param fname file name
* \param silent whether to print statistics when saving
*/
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
/*!
* \brief set information to dmatrix
* \param handle an instance of data matrix
* \param field field name, can be label, weight
* \param array pointer to float vector
*/
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
/*!
* \brief get info vector from matrix
* \param handle an instance of data matrix
* \param field field name
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
*/
SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
*/
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
* \param iter current iteration rounds
* \param dmats list of handles to dmatrices
* \param evnames names of the evaluation data sets
* \return the string containing evaluation statistics
*/
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
/*!
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param output_margin whether only output raw margin value
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
/*!
* \brief dump model into text file
* \param handle handle
* \param fname file name to dump the model into
* \param fmap feature map file name, can be an empty string
*/
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
};
#endif // XGBOOST_WRAPPER_R_H_


@ -6,13 +6,13 @@ objective = binary:logistic
# Tree Booster Parameters
# step size shrinkage
bst:eta = 1.0
eta = 1.0
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
max_depth = 3
# Task Parameters
# the number of rounds to do boosting


@ -42,8 +42,8 @@ param = {}
param['objective'] = 'binary:logitraw'
# scale weight of positive examples
param['scale_pos_weight'] = sum_wneg/sum_wpos
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eta'] = 0.1
param['max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 16


@ -25,8 +25,8 @@ param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 6


@ -5,13 +5,13 @@ objective="rank:pairwise"
# Tree Booster Parameters
# step size shrinkage
bst:eta = 0.1
eta = 0.1
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 0.1
min_child_weight = 0.1
# maximum depth of a tree
bst:max_depth = 6
max_depth = 6
# Task parameters
# the number of rounds to do boosting


@ -7,13 +7,13 @@ objective = reg:linear
# Tree Booster Parameters
# step size shrinkage
bst:eta = 1.0
eta = 1.0
# minimum loss reduction required to make a further partition
bst:gamma = 1.0
gamma = 1.0
# minimum sum of instance weight(hessian) needed in a child
bst:min_child_weight = 1
min_child_weight = 1
# maximum depth of a tree
bst:max_depth = 3
max_depth = 3
# Task parameters
# the number of rounds to do boosting


@ -12,6 +12,7 @@
#include <cstring>
#include <algorithm>
#include "utils/io.h"
#include "utils/omp.h"
#include "utils/utils.h"
#include "utils/iterator.h"
#include "utils/random.h"
@ -44,6 +45,10 @@ struct bst_gpair {
* this information is not necessarily present, and can be empty
*/
struct BoosterInfo {
/*! \brief number of rows in the data */
size_t num_row;
/*! \brief number of columns in the data */
size_t num_col;
/*!
* \brief specified root index of each instance,
* can be used for multi task setting
@ -51,6 +56,9 @@ struct BoosterInfo {
std::vector<unsigned> root_index;
/*! \brief set fold indicator */
std::vector<unsigned> fold_index;
/*! \brief default constructor: zero rows and columns */
BoosterInfo(void) : num_row(0), num_col(0) {
}
/*! \brief get root of ith instance */
inline unsigned GetRoot(size_t i) const {
return root_index.size() == 0 ? 0 : root_index[i];
@ -96,7 +104,7 @@ struct SparseBatch {
const Entry *data_ptr;
/*! \brief get i-th row from the batch */
inline Inst operator[](size_t i) const {
return Inst(data_ptr + row_ptr[i], row_ptr[i+1] - row_ptr[i]);
return Inst(data_ptr + row_ptr[i], static_cast<bst_uint>(row_ptr[i+1] - row_ptr[i]));
}
};
@ -334,7 +342,7 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
const SparseBatch &batch = iter_->Value();
for (size_t i = 0; i < batch.size; ++i) {
if (pkeep == 1.0f || random::SampleBinary(pkeep)) {
buffered_rowset_.push_back(batch.base_rowid+i);
buffered_rowset_.push_back(static_cast<bst_uint>(batch.base_rowid+i));
SparseBatch::Inst inst = batch[i];
for (bst_uint j = 0; j < inst.length; ++j) {
builder.AddBudget(inst[j].findex);
@ -363,11 +371,11 @@ class FMatrixS : public FMatrixInterface<FMatrixS>{
}
// sort columns
unsigned ncol = static_cast<unsigned>(this->NumCol());
bst_omp_uint ncol = static_cast<bst_omp_uint>(this->NumCol());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ncol; ++i) {
std::sort(&col_data_[col_ptr_[i]],
&col_data_[col_ptr_[i + 1]], Entry::CmpValue);
for (bst_omp_uint i = 0; i < ncol; ++i) {
std::sort(&col_data_[0] + col_ptr_[i],
&col_data_[0] + col_ptr_[i + 1], Entry::CmpValue);
}
}


@ -51,20 +51,21 @@ class GBLinear : public IGradBooster<FMatrix> {
// for all the output group
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
const unsigned ndata = static_cast<unsigned>(rowset.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static) reduction(+: sum_grad, sum_hess)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.hess >= 0.0f) {
sum_grad += p.grad; sum_hess += p.hess;
}
}
// remove bias effect
double dw = param.learning_rate * param.CalcDeltaBias(sum_grad, sum_hess, model.bias()[gid]);
bst_float dw = static_cast<bst_float>(
param.learning_rate * param.CalcDeltaBias(sum_grad, sum_hess, model.bias()[gid]));
model.bias()[gid] += dw;
// update grad value
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
bst_gpair &p = gpair[rowset[i] * ngroup + gid];
if (p.hess >= 0.0f) {
p.grad += p.hess * dw;
@ -72,9 +73,9 @@ class GBLinear : public IGradBooster<FMatrix> {
}
}
// number of features
const unsigned nfeat = static_cast<unsigned>(feat_index.size());
const bst_omp_uint nfeat = static_cast<bst_omp_uint>(feat_index.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nfeat; ++i) {
for (bst_omp_uint i = 0; i < nfeat; ++i) {
const bst_uint fid = feat_index[i];
for (int gid = 0; gid < ngroup; ++gid) {
double sum_grad = 0.0, sum_hess = 0.0;
@ -86,7 +87,7 @@ class GBLinear : public IGradBooster<FMatrix> {
sum_hess += p.hess * v * v;
}
float &w = model[fid][gid];
double dw = param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w);
bst_float dw = static_cast<bst_float>(param.learning_rate * param.CalcDelta(sum_grad, sum_hess, w));
w += dw;
// update grad value
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
@ -116,9 +117,9 @@ class GBLinear : public IGradBooster<FMatrix> {
// k is number of group
preds.resize(preds.size() + batch.size * ngroup);
// parallel over local batch
const unsigned nsize = static_cast<unsigned>(batch.size);
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nsize; ++i) {
for (bst_omp_uint i = 0; i < nsize; ++i) {
const size_t ridx = batch.base_rowid + i;
// loop over output groups
for (int gid = 0; gid < ngroup; ++gid) {


@ -94,8 +94,9 @@ class GBTree : public IGradBooster<FMatrix> {
"must have exactly ngroup*nrow gpairs");
std::vector<bst_gpair> tmp(gpair.size()/ngroup);
for (int gid = 0; gid < ngroup; ++gid) {
bst_omp_uint nsize = static_cast<bst_omp_uint>(tmp.size());
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < tmp.size(); ++i) {
for (bst_omp_uint i = 0; i < nsize; ++i) {
tmp[i] = gpair[i * ngroup + gid];
}
this->BoostNewTrees(tmp, fmat, info, gid);
@ -129,13 +130,13 @@ class GBTree : public IGradBooster<FMatrix> {
// k is number of group
preds.resize(preds.size() + batch.size * mparam.num_output_group);
// parallel over local batch
const unsigned nsize = static_cast<unsigned>(batch.size);
const bst_omp_uint nsize = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nsize; ++i) {
for (bst_omp_uint i = 0; i < nsize; ++i) {
const int tid = omp_get_thread_num();
tree::RegTree::FVec &feats = thread_temp[tid];
const size_t ridx = batch.base_rowid + i;
const unsigned root_idx = info.GetRoot(i);
int64_t ridx = static_cast<int64_t>(batch.base_rowid + i);
const unsigned root_idx = info.GetRoot(ridx);
// loop over output groups
for (int gid = 0; gid < mparam.num_output_group; ++gid) {
preds[ridx * mparam.num_output_group + gid] =
@ -172,15 +173,15 @@ class GBTree : public IGradBooster<FMatrix> {
}
updaters.clear();
std::string tval = tparam.updater_seq;
char *saveptr, *pstr;
pstr = strtok_r(&tval[0], ",", &saveptr);
char *pstr;
pstr = strtok(&tval[0], ",");
while (pstr != NULL) {
updaters.push_back(tree::CreateUpdater<FMatrix>(pstr));
for (size_t j = 0; j < cfg.size(); ++j) {
// set parameters
updaters.back()->SetParam(cfg[j].first.c_str(), cfg[j].second.c_str());
}
pstr = strtok_r(NULL, ",", &saveptr);
pstr = strtok(NULL, ",");
}
tparam.updater_initialized = 1;
}
@ -218,7 +219,7 @@ class GBTree : public IGradBooster<FMatrix> {
tree::RegTree::FVec *p_feats) {
size_t itop = 0;
float psum = 0.0f;
const int bid = mparam.BufferOffset(buffer_index, bst_group);
const int64_t bid = mparam.BufferOffset(buffer_index, bst_group);
// load buffered results if any
if (bid >= 0) {
itop = pred_counter[bid];
@ -320,7 +321,7 @@ class GBTree : public IGradBooster<FMatrix> {
* \brief get the buffer offset given a buffer index and group id
* \return calculated buffer offset
*/
inline size_t BufferOffset(int64_t buffer_index, int bst_group) const {
inline int64_t BufferOffset(int64_t buffer_index, int bst_group) const {
if (buffer_index < 0) return -1;
utils::Check(buffer_index < num_pbuffer, "buffer_index exceed num_pbuffer");
return buffer_index + num_pbuffer * bst_group;


@ -2,6 +2,7 @@
#define _CRT_SECURE_NO_DEPRECATE
#include <string>
#include "./io.h"
#include "../utils/io.h"
#include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp"
// implements data loads using dmatrix simple for now
@ -9,6 +10,19 @@
namespace xgboost {
namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
int magic;
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
fs.Seek(0);
if (magic == DMatrixSimple::kMagic) {
DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadBinary(fs, silent, fname);
fs.Close();
return dmat;
}
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple();
dmat->CacheLoad(fname, silent, savebuffer);
return dmat;


@ -62,10 +62,10 @@ class DMatrixSimple : public DataMatrix {
inline size_t AddRow(const std::vector<SparseBatch::Entry> &feats) {
for (size_t i = 0; i < feats.size(); ++i) {
row_data_.push_back(feats[i]);
info.num_col = std::max(info.num_col, static_cast<size_t>(feats[i].findex+1));
info.info.num_col = std::max(info.info.num_col, static_cast<size_t>(feats[i].findex+1));
}
row_ptr_.push_back(row_ptr_.back() + feats.size());
info.num_row += 1;
info.info.num_row += 1;
return row_ptr_.size() - 2;
}
/*!
@ -99,19 +99,19 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded from %s\n",
info.num_row, info.num_col, row_data_.size(), fname);
info.num_row(), info.num_col(), row_data_.size(), fname);
}
fclose(file);
// try to load in additional file
std::string name = fname;
std::string gname = name + ".group";
if (info.TryLoadGroup(gname.c_str(), silent)) {
utils::Check(info.group_ptr.back() == info.num_row,
utils::Check(info.group_ptr.back() == info.num_row(),
"DMatrix: group data does not match the number of rows in features");
}
std::string wname = name + ".weight";
if (info.TryLoadFloatInfo("weight", wname.c_str(), silent)) {
utils::Check(info.weights.size() == info.num_row,
utils::Check(info.weights.size() == info.num_row(),
"DMatrix: weight data does not match the number of rows in features");
}
std::string mname = name + ".base_margin";
@ -128,6 +128,17 @@ class DMatrixSimple : public DataMatrix {
FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false;
utils::FileStream fs(fp);
this->LoadBinary(fs, silent, fname);
fs.Close();
return true;
}
/*!
* \brief load from binary stream
* \param fs input file stream
* \param silent whether print information during loading
* \param fname file name, used to print message
*/
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int magic;
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
utils::Check(magic == kMagic, "invalid format,magic number mismatch");
@ -135,16 +146,19 @@ class DMatrixSimple : public DataMatrix {
info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
fmat.LoadColAccess(fs);
fs.Close();
if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded from %s\n",
info.num_row, info.num_col, row_data_.size(), fname);
printf("%lux%lu matrix with %lu entries is loaded",
info.num_row(), info.num_col(), row_data_.size());
if (fname != NULL) {
printf(" from %s\n", fname);
} else {
printf("\n");
}
if (info.group_ptr.size() != 0) {
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
}
}
return true;
}
/*!
* \brief save to binary file
@ -163,7 +177,7 @@ class DMatrixSimple : public DataMatrix {
if (!silent) {
printf("%lux%lu matrix with %lu entries is saved to %s\n",
info.num_row, info.num_col, row_data_.size(), fname);
info.num_row(), info.num_col(), row_data_.size(), fname);
if (info.group_ptr.size() != 0) {
printf("data contains %lu groups\n", info.group_ptr.size()-1);
}
@ -179,7 +193,7 @@ class DMatrixSimple : public DataMatrix {
* \param savebuffer whether do save binary buffer if it is text
*/
inline void CacheLoad(const char *fname, bool silent = false, bool savebuffer = true) {
int len = strlen(fname);
size_t len = strlen(fname);
if (len > 8 && !strcmp(fname + len - 7, ".buffer")) {
if (!this->LoadBinary(fname, silent)) {
utils::Error("can not open file \"%s\"", fname);


@ -15,10 +15,12 @@ namespace learner {
* \brief meta information needed in training, including label, weight
*/
struct MetaInfo {
/*! \brief number of rows in the data */
size_t num_row;
/*! \brief number of columns in the data */
size_t num_col;
/*!
* \brief information needed by booster
* BoosterInfo does not implement save and load,
* all serialization is done in MetaInfo
*/
BoosterInfo info;
/*! \brief label of each instance */
std::vector<float> labels;
/*!
@ -28,8 +30,6 @@ struct MetaInfo {
std::vector<bst_uint> group_ptr;
/*! \brief weights of each instance, optional */
std::vector<float> weights;
/*! \brief information needed by booster */
BoosterInfo info;
/*!
* \brief initialized margins,
* if specified, xgboost will start from this init margin
@ -39,7 +39,15 @@ struct MetaInfo {
/*! \brief version flag, used to check version of this info */
static const int kVersion = 0;
// constructor
MetaInfo(void) : num_row(0), num_col(0) {}
MetaInfo(void) {}
/*! \return number of rows in dataset */
inline size_t num_row(void) const {
return info.num_row;
}
/*! \return number of columns in dataset */
inline size_t num_col(void) const {
return info.num_col;
}
/*! \brief clear all the information */
inline void Clear(void) {
labels.clear();
@ -47,7 +55,7 @@ struct MetaInfo {
weights.clear();
info.root_index.clear();
base_margin.clear();
num_row = num_col = 0;
info.num_row = info.num_col = 0;
}
/*! \brief get weight of each instances */
inline float GetWeight(size_t i) const {
@ -60,8 +68,8 @@ struct MetaInfo {
inline void SaveBinary(utils::IStream &fo) const {
int version = kVersion;
fo.Write(&version, sizeof(version));
fo.Write(&num_row, sizeof(num_row));
fo.Write(&num_col, sizeof(num_col));
fo.Write(&info.num_row, sizeof(info.num_row));
fo.Write(&info.num_col, sizeof(info.num_col));
fo.Write(labels);
fo.Write(group_ptr);
fo.Write(weights);
@ -70,9 +78,9 @@ struct MetaInfo {
}
inline void LoadBinary(utils::IStream &fi) {
int version;
utils::Check(fi.Read(&version, sizeof(version)), "MetaInfo: invalid format");
utils::Check(fi.Read(&num_row, sizeof(num_row)), "MetaInfo: invalid format");
utils::Check(fi.Read(&num_col, sizeof(num_col)), "MetaInfo: invalid format");
utils::Check(fi.Read(&version, sizeof(version)) != 0, "MetaInfo: invalid format");
utils::Check(fi.Read(&info.num_row, sizeof(info.num_row)) != 0, "MetaInfo: invalid format");
utils::Check(fi.Read(&info.num_col, sizeof(info.num_col)) != 0, "MetaInfo: invalid format");
utils::Check(fi.Read(&labels), "MetaInfo: invalid format");
utils::Check(fi.Read(&group_ptr), "MetaInfo: invalid format");
utils::Check(fi.Read(&weights), "MetaInfo: invalid format");
@ -94,19 +102,28 @@ struct MetaInfo {
fclose(fi);
return true;
}
inline std::vector<float>& GetInfo(const char *field) {
inline std::vector<float>& GetFloatInfo(const char *field) {
if (!strcmp(field, "label")) return labels;
if (!strcmp(field, "weight")) return weights;
if (!strcmp(field, "base_margin")) return base_margin;
utils::Error("unknown field %s", field);
return labels;
}
inline const std::vector<float>& GetInfo(const char *field) const {
return ((MetaInfo*)this)->GetInfo(field);
inline const std::vector<float>& GetFloatInfo(const char *field) const {
return ((MetaInfo*)this)->GetFloatInfo(field);
}
inline std::vector<unsigned> &GetUIntInfo(const char *field) {
if (!strcmp(field, "root_index")) return info.root_index;
if (!strcmp(field, "fold_index")) return info.fold_index;
utils::Error("unknown field %s", field);
return info.root_index;
}
inline const std::vector<unsigned> &GetUIntInfo(const char *field) const {
return ((MetaInfo*)this)->GetUIntInfo(field);
}
// try to load float field information from file, if it exists
inline bool TryLoadFloatInfo(const char *field, const char* fname, bool silent = false) {
std::vector<float> &weights = this->GetInfo(field);
std::vector<float> &weights = this->GetFloatInfo(field);
FILE *fi = fopen64(fname, "r");
if (fi == NULL) return false;
float wt;
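
The net effect of the MetaInfo changes above: the row/column counts now live on the embedded BoosterInfo and are reached through accessors, and the typed getters are split by element type. A minimal sketch of the new call sites, assuming only the declarations shown above (the info variable is hypothetical):

learner::MetaInfo info;
size_t nrow = info.num_row();                                   // was the field info.num_row
std::vector<float> &labels = info.GetFloatInfo("label");        // was GetInfo("label")
std::vector<unsigned> &roots = info.GetUIntInfo("root_index");  // new: unsigned-typed fields
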


@ -26,10 +26,10 @@ struct EvalEWiseBase : public IEvaluator {
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(),
"label and prediction size not match");
const unsigned ndata = static_cast<unsigned>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
float sum = 0.0, wsum = 0.0;
#pragma omp parallel for reduction(+: sum, wsum) schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
const float wt = info.GetWeight(i);
sum += Derived::EvalRow(info.labels[i], preds[i]) * wt;
wsum += wt;
@ -109,12 +109,12 @@ struct EvalAMS : public IEvaluator {
}
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
const unsigned ndata = static_cast<unsigned>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
utils::Check(info.weights.size() == ndata, "we need weight to evaluate ams");
std::vector< std::pair<float, unsigned> > rec(ndata);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
rec[i] = std::make_pair(preds[i], i);
}
std::sort(rec.begin(), rec.end(), CmpFirst);
@ -123,7 +123,7 @@ struct EvalAMS : public IEvaluator {
const double br = 10.0;
unsigned thresindex = 0;
double s_tp = 0.0, b_fp = 0.0, tams = 0.0;
for (unsigned i = 0; i < ndata-1 && i < ntop; ++i) {
for (unsigned i = 0; i < static_cast<unsigned>(ndata-1) && i < ntop; ++i) {
const unsigned ridx = rec[i].second;
const float wt = info.weights[ridx];
if (info.labels[ridx] > 0.5f) {
@ -132,7 +132,7 @@ struct EvalAMS : public IEvaluator {
b_fp += wt;
}
if (rec[i].first != rec[i+1].first) {
double ams = sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
double ams = sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
if (tams < ams) {
thresindex = i;
tams = ams;
@ -141,9 +141,9 @@ struct EvalAMS : public IEvaluator {
}
if (ntop == ndata) {
fprintf(stderr, "\tams-ratio=%g", static_cast<float>(thresindex) / ndata);
return tams;
return static_cast<float>(tams);
} else {
return sqrtf(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp));
return static_cast<float>(sqrt(2*((s_tp+b_fp+br) * log(1.0 + s_tp/(b_fp+br)) - s_tp)));
}
}
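
For reference, with s the weighted sum of true positives (s_tp above), b the weighted sum of false positives (b_fp), and the regularization term b_r = 10, the quantity this loop maximizes over thresholds is the approximate median significance:

\mathrm{AMS} = \sqrt{2\left((s + b + b_r)\,\ln\!\left(1 + \frac{s}{b + b_r}\right) - s\right)}
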
virtual const char *Name(void) const {
@ -171,7 +171,7 @@ struct EvalPrecisionRatio : public IEvaluator{
utils::Assert(preds.size() == info.labels.size(), "label size predict size not match");
std::vector< std::pair<float, unsigned> > rec;
for (size_t j = 0; j < preds.size(); ++j) {
rec.push_back(std::make_pair(preds[j], j));
rec.push_back(std::make_pair(preds[j], static_cast<unsigned>(j)));
}
std::sort(rec.begin(), rec.end(), CmpFirst);
double pratio = CalcPRatio(rec, info);
@ -207,11 +207,11 @@ struct EvalAuc : public IEvaluator {
virtual float Eval(const std::vector<float> &preds,
const MetaInfo &info) const {
utils::Check(preds.size() == info.labels.size(), "label size predict size not match");
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.back() == preds.size(),
"EvalAuc: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics
double sum_auc = 0.0f;
#pragma omp parallel reduction(+:sum_auc)
@ -219,7 +219,7 @@ struct EvalAuc : public IEvaluator {
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
for (bst_omp_uint k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], j));
@ -264,12 +264,12 @@ struct EvalRankList : public IEvaluator {
utils::Check(preds.size() == info.labels.size(),
"label size predict size not match");
// quick consistency when group is not available
std::vector<unsigned> tgptr(2, 0); tgptr[1] = preds.size();
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(preds.size());
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Assert(gptr.size() != 0, "must specify group when constructing rank file");
utils::Assert(gptr.back() == preds.size(),
"EvalRanklist: group structure must match number of prediction");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
// sum statistics
double sum_metric = 0.0f;
#pragma omp parallel reduction(+:sum_metric)
@ -277,7 +277,7 @@ struct EvalRankList : public IEvaluator {
// each thread takes a local rec
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
for (bst_omp_uint k = 0; k < ngroup; ++k) {
rec.clear();
for (unsigned j = gptr[k]; j < gptr[k + 1]; ++j) {
rec.push_back(std::make_pair(preds[j], static_cast<int>(info.labels[j])));
@ -339,7 +339,7 @@ struct EvalNDCG : public EvalRankList{
for (size_t i = 0; i < rec.size() && i < this->topn_; ++i) {
const unsigned rel = rec[i].second;
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / logf(i + 2);
sumdcg += ((1 << rel) - 1) / log(i + 2.0);
}
}
return static_cast<float>(sumdcg);
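
The switch from logf(i + 2) to log(i + 2.0) above only changes the precision of the discount; the value computed is still the exponential-gain DCG with a natural-log discount, where rel_i is the integer relevance at 0-based position i:

\mathrm{DCG@n} = \sum_{i=0}^{n-1} \frac{2^{rel_i} - 1}{\ln(i + 2)}
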


@ -7,6 +7,7 @@
*/
#include <string>
#include <vector>
#include <cstdio>
#include "../utils/utils.h"
#include "./dmatrix.h"


@ -58,9 +58,9 @@ class BoostLearner {
if (dupilicate) continue;
// set mats[i]'s cache learner pointer to this
mats[i]->cache_learner_ptr_ = this;
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row));
buffer_size += mats[i]->info.num_row;
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col));
cache_.push_back(CacheEntry(mats[i], buffer_size, mats[i]->info.num_row()));
buffer_size += mats[i]->info.num_row();
num_feature = std::max(num_feature, static_cast<unsigned>(mats[i]->info.num_col()));
}
char str_temp[25];
if (num_feature > mparam.num_feature) {
@ -79,6 +79,11 @@ class BoostLearner {
* \param val value of the parameter
*/
inline void SetParam(const char *name, const char *val) {
// in this version, bst: prefix is no longer required
if (strncmp(name, "bst:", 4) != 0) {
std::string n = "bst:"; n += name;
this->SetParam(n.c_str(), val);
}
if (!strcmp(name, "silent")) silent = atoi(val);
if (!strcmp(name, "prob_buffer_row")) prob_buffer_row = static_cast<float>(atof(val));
if (!strcmp(name, "eval_metric")) evaluator_.AddEval(val);
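
So the bst: prefix becomes optional: an unprefixed name is re-dispatched once with the prefix attached, and both spellings reach the same parameter. A sketch, assuming a learner instantiated as in the wrapper code later in this diff:

learner::BoostLearner<FMatrixS> bst;
bst.SetParam("bst:eta", "0.1");  // old spelling, still accepted
bst.SetParam("eta", "0.1");      // new spelling, forwarded internally as "bst:eta"
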
@ -91,7 +96,7 @@ class BoostLearner {
if (!strcmp(name, "objective")) name_obj_ = val;
if (!strcmp(name, "booster")) name_gbm_ = val;
mparam.SetParam(name, val);
}
}
if (gbm_ != NULL) gbm_->SetParam(name, val);
if (obj_ != NULL) obj_->SetParam(name, val);
if (gbm_ == NULL || obj_ == NULL) {
@ -248,17 +253,17 @@ class BoostLearner {
data.info.info, out_preds);
// add base margin
std::vector<float> &preds = *out_preds;
const unsigned ndata = static_cast<unsigned>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
if (data.info.base_margin.size() != 0) {
utils::Check(preds.size() == data.info.base_margin.size(),
"base_margin.size does not match with prediction size");
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
preds[j] += data.info.base_margin[j];
}
} else {
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
preds[j] += mparam.base_score;
}
}
@ -329,8 +334,8 @@ class BoostLearner {
inline int64_t FindBufferOffset(const DMatrix<FMatrix> &mat) const {
for (size_t i = 0; i < cache_.size(); ++i) {
if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) {
if (cache_[i].num_row_ == mat.info.num_row) {
return cache_[i].buffer_offset_;
if (cache_[i].num_row_ == mat.info.num_row()) {
return static_cast<int64_t>(cache_[i].buffer_offset_);
}
}
}


@ -116,9 +116,9 @@ class RegLossObj : public IObjFunction{
gpair.resize(preds.size());
// start calculating gradient
const unsigned nstep = static_cast<unsigned>(info.labels.size());
const unsigned ndata = static_cast<unsigned>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
const unsigned j = i % nstep;
float p = loss.PredTransform(preds[i]);
float w = info.GetWeight(j);
@ -132,9 +132,9 @@ class RegLossObj : public IObjFunction{
}
virtual void PredTransform(std::vector<float> *io_preds) {
std::vector<float> &preds = *io_preds;
const unsigned ndata = static_cast<unsigned>(preds.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size());
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
preds[j] = loss.PredTransform(preds[j]);
}
}
@ -169,12 +169,12 @@ class SoftmaxMultiClassObj : public IObjFunction {
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
const unsigned nstep = static_cast<unsigned>(info.labels.size() * nclass);
const unsigned ndata = static_cast<unsigned>(preds.size() / nclass);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size() / nclass);
#pragma omp parallel
{
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
for (int k = 0; k < nclass; ++k) {
rec[k] = preds[i * nclass + k];
}
@ -210,18 +210,18 @@ class SoftmaxMultiClassObj : public IObjFunction {
utils::Check(nclass != 0, "must set num_class to use softmax");
std::vector<float> &preds = *io_preds;
std::vector<float> tmp;
const unsigned ndata = static_cast<unsigned>(preds.size()/nclass);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(preds.size()/nclass);
if (prob == 0) tmp.resize(ndata);
#pragma omp parallel
{
std::vector<float> rec(nclass);
#pragma omp for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
for (int k = 0; k < nclass; ++k) {
rec[k] = preds[j * nclass + k];
}
if (prob == 0) {
tmp[j] = FindMaxIndex(rec);
tmp[j] = static_cast<float>(FindMaxIndex(rec));
} else {
Softmax(&rec);
for (int k = 0; k < nclass; ++k) {
@ -259,11 +259,11 @@ class LambdaRankObj : public IObjFunction {
std::vector<bst_gpair> &gpair = *out_gpair;
gpair.resize(preds.size());
// quick consistency when group is not available
std::vector<unsigned> tgptr(2, 0); tgptr[1] = info.labels.size();
std::vector<unsigned> tgptr(2, 0); tgptr[1] = static_cast<unsigned>(info.labels.size());
const std::vector<unsigned> &gptr = info.group_ptr.size() == 0 ? tgptr : info.group_ptr;
utils::Check(gptr.size() != 0 && gptr.back() == info.labels.size(),
"group structure not consistent with #rows");
const unsigned ngroup = static_cast<unsigned>(gptr.size() - 1);
const bst_omp_uint ngroup = static_cast<bst_omp_uint>(gptr.size() - 1);
#pragma omp parallel
{
// parallel construction: declare the random number generator here, so that each
@ -273,7 +273,7 @@ class LambdaRankObj : public IObjFunction {
std::vector<ListEntry> lst;
std::vector< std::pair<float, unsigned> > rec;
#pragma omp for schedule(static)
for (unsigned k = 0; k < ngroup; ++k) {
for (bst_omp_uint k = 0; k < ngroup; ++k) {
lst.clear(); pairs.clear();
for (unsigned j = gptr[k]; j < gptr[k+1]; ++j) {
lst.push_back(ListEntry(preds[j], info.labels[j], j));
@ -290,7 +290,7 @@ class LambdaRankObj : public IObjFunction {
unsigned j = i + 1;
while (j < rec.size() && rec[j].first == rec[i].first) ++j;
// bucket in [i,j), get a sample outside bucket
unsigned nleft = i, nright = rec.size() - j;
unsigned nleft = i, nright = static_cast<unsigned>(rec.size() - j);
if (nleft + nright != 0) {
int nsample = num_pairsample;
while (nsample --) {
@ -436,9 +436,9 @@ class LambdaRankObjNDCG : public LambdaRankObj {
inline static float CalcDCG(const std::vector<float> &labels) {
double sumdcg = 0.0;
for (size_t i = 0; i < labels.size(); ++i) {
const unsigned rel = labels[i];
const unsigned rel = static_cast<unsigned>(labels[i]);
if (rel != 0) {
sumdcg += ((1 << rel) - 1) / logf(i + 2);
sumdcg += ((1 << rel) - 1) / logf(static_cast<float>(i + 2));
}
}
return static_cast<float>(sumdcg);


@ -42,11 +42,17 @@ class TreeModel {
int max_depth;
/*! \brief number of features used for tree construction */
int num_feature;
/*!
* \brief leaf vector size, used for vector tree
* used to store more than one-dimensional information in the tree
*/
int size_leaf_vector;
/*! \brief reserved part */
int reserved[32];
int reserved[31];
/*! \brief constructor */
Param(void) {
max_depth = 0;
size_leaf_vector = 0;
memset(reserved, 0, sizeof(reserved));
}
/*!
@ -57,6 +63,7 @@ class TreeModel {
inline void SetParam(const char *name, const char *val) {
if (!strcmp("num_roots", name)) num_roots = atoi(val);
if (!strcmp("num_feature", name)) num_feature = atoi(val);
if (!strcmp("size_leaf_vector", name)) size_leaf_vector = atoi(val);
}
};
/*! \brief tree node */
@ -166,10 +173,12 @@ class TreeModel {
protected:
// vector of nodes
std::vector<Node> nodes;
// stats of nodes
std::vector<TNodeStat> stats;
// free node space, used during training process
std::vector<int> deleted_nodes;
// stats of nodes
std::vector<TNodeStat> stats;
// leaf vector, that is used to store additional information
std::vector<bst_float> leaf_vector;
// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
inline int AllocNode(void) {
@ -184,6 +193,7 @@ class TreeModel {
"number of nodes in the tree exceed 2^31");
nodes.resize(param.num_nodes);
stats.resize(param.num_nodes);
leaf_vector.resize(param.num_nodes * param.size_leaf_vector);
return nd;
}
// delete a tree node
@ -247,6 +257,16 @@ class TreeModel {
inline NodeStat &stat(int nid) {
return stats[nid];
}
/*! \brief get leaf vector given nid */
inline bst_float* leafvec(int nid) {
if (leaf_vector.size() == 0) return NULL;
return &leaf_vector[nid * param.size_leaf_vector];
}
/*! \brief get leaf vector given nid */
inline const bst_float* leafvec(int nid) const{
if (leaf_vector.size() == 0) return NULL;
return &leaf_vector[nid * param.size_leaf_vector];
}
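
leaf_vector is kept as one flat array of num_nodes * size_leaf_vector values, so leafvec(nid) is simply a pointer to node nid's slice, or NULL when the feature is disabled (size_leaf_vector == 0 leaves the array empty). A sketch of reading it, with tree and nid hypothetical:

const bst_float *v = tree.leafvec(nid);
if (v != NULL) {
  for (int k = 0; k < tree.param.size_leaf_vector; ++k) {
    // v[k] is the k-th extra value stored at node nid
  }
}
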
/*! \brief initialize the model */
inline void InitModel(void) {
param.num_nodes = param.num_roots;


@ -11,45 +11,6 @@
namespace xgboost {
namespace tree {
/*! \brief core statistics used for tree construction */
struct GradStats {
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief constructor */
GradStats(void) {
this->Clear();
}
/*! \brief clear the statistics */
inline void Clear(void) {
sum_grad = sum_hess = 0.0f;
}
/*! \brief add statistics to the data */
inline void Add(double grad, double hess) {
sum_grad += grad; sum_hess += hess;
}
/*! \brief add statistics to the data */
inline void Add(const bst_gpair& b) {
this->Add(b.grad, b.hess);
}
/*! \brief add statistics to the data */
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
/*! \brief substract the statistics by b */
inline GradStats Substract(const GradStats &b) const {
GradStats res;
res.sum_grad = this->sum_grad - b.sum_grad;
res.sum_hess = this->sum_hess - b.sum_hess;
return res;
}
/*! \return whether the statistics is not used yet */
inline bool Empty(void) const {
return sum_hess == 0.0;
}
};
/*! \brief training parameters for regression tree */
struct TrainParam{
// learning step size for a time
@ -106,7 +67,7 @@ struct TrainParam{
if (!strcmp(name, "min_child_weight")) min_child_weight = static_cast<float>(atof(val));
if (!strcmp(name, "min_split_loss")) min_split_loss = static_cast<float>(atof(val));
if (!strcmp(name, "reg_lambda")) reg_lambda = static_cast<float>(atof(val));
if (!strcmp(name, "reg_method")) reg_method = static_cast<float>(atof(val));
if (!strcmp(name, "reg_method")) reg_method = atoi(val);
if (!strcmp(name, "subsample")) subsample = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bylevel")) colsample_bylevel = static_cast<float>(atof(val));
if (!strcmp(name, "colsample_bytree")) colsample_bytree = static_cast<float>(atof(val));
@ -165,13 +126,6 @@ struct TrainParam{
inline bool cannot_split(double sum_hess, int depth) const {
return sum_hess < this->min_child_weight * 2.0;
}
// code support for template data
inline double CalcWeight(const GradStats &d) const {
return this->CalcWeight(d.sum_grad, d.sum_hess);
}
inline double CalcGain(const GradStats &d) const {
return this->CalcGain(d.sum_grad, d.sum_hess);
}
protected:
// functions for L1 cost
@ -185,6 +139,63 @@ struct TrainParam{
}
};
/*! \brief core statistics used for tree construction */
struct GradStats {
/*! \brief sum gradient statistics */
double sum_grad;
/*! \brief sum hessian statistics */
double sum_hess;
/*! \brief constructor, the object must be cleared during construction */
explicit GradStats(const TrainParam &param) {
this->Clear();
}
/*! \brief clear the statistics */
inline void Clear(void) {
sum_grad = sum_hess = 0.0f;
}
/*!
* \brief accumulate statistics
* \param gpair the vector storing the gradient statistics
* \param info the additional information
* \param ridx index of the instance to accumulate
*/
inline void Add(const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
bst_uint ridx) {
const bst_gpair &b = gpair[ridx];
this->Add(b.grad, b.hess);
}
/*! \brief calculate leaf weight */
inline double CalcWeight(const TrainParam &param) const {
return param.CalcWeight(sum_grad, sum_hess);
}
/*! \brief calculate gain of the solution */
inline double CalcGain(const TrainParam &param) const {
return param.CalcGain(sum_grad, sum_hess);
}
/*! \brief add statistics to the data */
inline void Add(const GradStats &b) {
this->Add(b.sum_grad, b.sum_hess);
}
/*! \brief set current value to a - b */
inline void SetSubstract(const GradStats &a, const GradStats &b) {
sum_grad = a.sum_grad - b.sum_grad;
sum_hess = a.sum_hess - b.sum_hess;
}
/*! \return whether the statistics is not used yet */
inline bool Empty(void) const {
return sum_hess == 0.0;
}
/*! \brief set leaf vector value based on statistics */
inline void SetLeafVec(const TrainParam &param, bst_float *vec) const{
}
protected:
/*! \brief add statistics to the data */
inline void Add(double grad, double hess) {
sum_grad += grad; sum_hess += hess;
}
};
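
After this move, GradStats spells out the interface the templated updaters expect of any statistics type: construct from the parameters, accumulate by (gpair, info, ridx), and score against the parameters. A minimal sketch under the definitions above, where gpair, info and ridx are hypothetical variables of the documented types:

TrainParam param;
GradStats parent(param), left(param), right(param);
parent.Add(gpair, info, ridx);             // accumulate one instance into the node
right.SetSubstract(parent, left);          // right = parent - left, without a temporary
double weight = parent.CalcWeight(param);  // leaf weight under the regularization
double gain = parent.CalcGain(param);      // gain used in split scoring
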
/*!
* \brief statistics that are helpful to store
* and represent a split solution for the tree


@ -60,7 +60,7 @@ namespace tree {
template<typename FMatrix>
inline IUpdater<FMatrix>* CreateUpdater(const char *name) {
if (!strcmp(name, "prune")) return new TreePruner<FMatrix>();
if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix>();
if (!strcmp(name, "refresh")) return new TreeRefresher<FMatrix, GradStats>();
if (!strcmp(name, "grow_colmaker")) return new ColMaker<FMatrix, GradStats>();
utils::Error("unknown updater:%s", name);
return NULL;


@ -51,8 +51,8 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief current best solution */
SplitEntry best;
// constructor
ThreadEntry(void) {
stats.Clear();
explicit ThreadEntry(const TrainParam &param)
: stats(param) {
}
};
struct NodeEntry {
@ -65,8 +65,8 @@ class ColMaker: public IUpdater<FMatrix> {
/*! \brief current best solution */
SplitEntry best;
// constructor
NodeEntry(void) : root_gain(0.0f), weight(0.0f){
stats.Clear();
explicit NodeEntry(const TrainParam &param)
: stats(param), root_gain(0.0f), weight(0.0f){
}
};
// actual builder that runs the algorithm
@ -80,13 +80,13 @@ class ColMaker: public IUpdater<FMatrix> {
const BoosterInfo &info,
RegTree *p_tree) {
this->InitData(gpair, fmat, info.root_index, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
for (int depth = 0; depth < param.max_depth; ++depth) {
this->FindSplit(depth, this->qexpand, gpair, fmat, p_tree);
this->FindSplit(depth, this->qexpand, gpair, fmat, info, p_tree);
this->ResetPosition(this->qexpand, fmat, *p_tree);
this->UpdateQueueExpand(*p_tree, &this->qexpand);
this->InitNewNode(qexpand, gpair, fmat, *p_tree);
this->InitNewNode(qexpand, gpair, fmat, info, *p_tree);
// if nothing is left to expand, break
if (qexpand.size() == 0) break;
}
@ -100,6 +100,7 @@ class ColMaker: public IUpdater<FMatrix> {
p_tree->stat(nid).loss_chg = snode[nid].best.loss_chg;
p_tree->stat(nid).base_weight = snode[nid].weight;
p_tree->stat(nid).sum_hess = static_cast<float>(snode[nid].stats.sum_hess);
snode[nid].stats.SetLeafVec(param, p_tree->leafvec(nid));
}
}
@ -175,34 +176,35 @@ class ColMaker: public IUpdater<FMatrix> {
inline void InitNewNode(const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
const BoosterInfo &info,
const RegTree &tree) {
{// setup statistics space for each tree node
for (size_t i = 0; i < stemp.size(); ++i) {
stemp[i].resize(tree.param.num_nodes, ThreadEntry());
stemp[i].resize(tree.param.num_nodes, ThreadEntry(param));
}
snode.resize(tree.param.num_nodes, NodeEntry());
snode.resize(tree.param.num_nodes, NodeEntry(param));
}
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// setup position
const unsigned ndata = static_cast<unsigned>(rowset.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int tid = omp_get_thread_num();
if (position[ridx] < 0) continue;
stemp[tid][position[ridx]].stats.Add(gpair[ridx]);
stemp[tid][position[ridx]].stats.Add(gpair, info, ridx);
}
// sum the per thread statistics together
for (size_t j = 0; j < qexpand.size(); ++j) {
const int nid = qexpand[j];
TStats stats; stats.Clear();
TStats stats(param);
for (size_t tid = 0; tid < stemp.size(); ++tid) {
stats.Add(stemp[tid][nid].stats);
}
// update node statistics
snode[nid].stats = stats;
snode[nid].root_gain = param.CalcGain(stats);
snode[nid].weight = param.CalcWeight(stats);
snode[nid].root_gain = static_cast<float>(stats.CalcGain(param));
snode[nid].weight = static_cast<float>(stats.CalcWeight(param));
}
}
/*! \brief update the queue of nodes to expand, adding in new leaves */
@ -223,12 +225,15 @@ class ColMaker: public IUpdater<FMatrix> {
template<typename Iter>
inline void EnumerateSplit(Iter it, unsigned fid,
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
std::vector<ThreadEntry> &temp,
bool is_forward_search) {
// clear all the temp statistics
for (size_t j = 0; j < qexpand.size(); ++j) {
temp[qexpand[j]].stats.Clear();
}
// statistics of the complement side of the split (parent minus scanned)
TStats c(param);
while (it.Next()) {
const bst_uint ridx = it.rindex();
const int nid = position[ridx];
@ -239,19 +244,19 @@ class ColMaker: public IUpdater<FMatrix> {
ThreadEntry &e = temp[nid];
// test if first hit, this is fine, because we set 0 during init
if (e.stats.Empty()) {
e.stats.Add(gpair[ridx]);
e.stats.Add(gpair, info, ridx);
e.last_fvalue = fvalue;
} else {
// try to find a split
if (fabsf(fvalue - e.last_fvalue) > rt_2eps && e.stats.sum_hess >= param.min_child_weight) {
TStats c = snode[nid].stats.Substract(e.stats);
c.SetSubstract(snode[nid].stats, e.stats);
if (c.sum_hess >= param.min_child_weight) {
double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
e.best.Update(loss_chg, fid, (fvalue + e.last_fvalue) * 0.5f, !is_forward_search);
}
}
// update the statistics
e.stats.Add(gpair[ridx]);
e.stats.Add(gpair, info, ridx);
e.last_fvalue = fvalue;
}
}
@ -259,9 +264,9 @@ class ColMaker: public IUpdater<FMatrix> {
for (size_t i = 0; i < qexpand.size(); ++i) {
const int nid = qexpand[i];
ThreadEntry &e = temp[nid];
TStats c = snode[nid].stats.Substract(e.stats);
c.SetSubstract(snode[nid].stats, e.stats);
if (e.stats.sum_hess >= param.min_child_weight && c.sum_hess >= param.min_child_weight) {
const double loss_chg = param.CalcGain(e.stats) + param.CalcGain(c) - snode[nid].root_gain;
bst_float loss_chg = static_cast<bst_float>(e.stats.CalcGain(param) + c.CalcGain(param) - snode[nid].root_gain);
const float delta = is_forward_search ? rt_eps : -rt_eps;
e.best.Update(loss_chg, fid, e.last_fvalue + delta, !is_forward_search);
}
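
In both branches the score of a candidate split is CalcGain(left) + CalcGain(right) - root_gain. CalcGain itself is not shown in this hunk; assuming the usual L2-penalized form (reg_lambda = lambda, with G and H a node's gradient and hessian sums), the loss change being maximized is:

\mathit{loss\_chg} = \frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}
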
@ -269,7 +274,9 @@ class ColMaker: public IUpdater<FMatrix> {
}
// find splits at current level, do split per level
inline void FindSplit(int depth, const std::vector<int> &qexpand,
const std::vector<bst_gpair> &gpair, const FMatrix &fmat,
const std::vector<bst_gpair> &gpair,
const FMatrix &fmat,
const BoosterInfo &info,
RegTree *p_tree) {
std::vector<unsigned> feat_set = feat_index;
if (param.colsample_bylevel != 1.0f) {
@ -279,19 +286,19 @@ class ColMaker: public IUpdater<FMatrix> {
feat_set.resize(n);
}
// start enumeration
const unsigned nsize = static_cast<unsigned>(feat_set.size());
const bst_omp_uint nsize = static_cast<bst_omp_uint>(feat_set.size());
#if defined(_OPENMP)
const int batch_size = std::max(static_cast<int>(nsize / this->nthread / 32), 1);
#endif
#pragma omp parallel for schedule(dynamic, batch_size)
for (unsigned i = 0; i < nsize; ++i) {
for (bst_omp_uint i = 0; i < nsize; ++i) {
const unsigned fid = feat_set[i];
const int tid = omp_get_thread_num();
if (param.need_forward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, stemp[tid], true);
this->EnumerateSplit(fmat.GetSortedCol(fid), fid, gpair, info, stemp[tid], true);
}
if (param.need_backward_search(fmat.GetColDensity(fid))) {
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, stemp[tid], false);
this->EnumerateSplit(fmat.GetReverseSortedCol(fid), fid, gpair, info, stemp[tid], false);
}
}
// after this each thread's stemp will get the best candidates, aggregate results
@ -314,9 +321,9 @@ class ColMaker: public IUpdater<FMatrix> {
inline void ResetPosition(const std::vector<int> &qexpand, const FMatrix &fmat, const RegTree &tree) {
const std::vector<bst_uint> &rowset = fmat.buffered_rowset();
// step 1: set nodes with a default direction to that default, and leaf nodes to -1
const unsigned ndata = static_cast<unsigned>(rowset.size());
const bst_omp_uint ndata = static_cast<bst_omp_uint>(rowset.size());
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < ndata; ++i) {
for (bst_omp_uint i = 0; i < ndata; ++i) {
const bst_uint ridx = rowset[i];
const int nid = position[ridx];
if (nid >= 0) {
@ -337,9 +344,9 @@ class ColMaker: public IUpdater<FMatrix> {
std::sort(fsplits.begin(), fsplits.end());
fsplits.resize(std::unique(fsplits.begin(), fsplits.end()) - fsplits.begin());
// start putting things into the right place
const unsigned nfeats = static_cast<unsigned>(fsplits.size());
const bst_omp_uint nfeats = static_cast<bst_omp_uint>(fsplits.size());
#pragma omp parallel for schedule(dynamic, 1)
for (unsigned i = 0; i < nfeats; ++i) {
for (bst_omp_uint i = 0; i < nfeats; ++i) {
const unsigned fid = fsplits[i];
for (typename FMatrix::ColIter it = fmat.GetSortedCol(fid); it.Next();) {
const bst_uint ridx = it.rindex();


@ -13,7 +13,7 @@
namespace xgboost {
namespace tree {
/*! \brief refresher that refreshes the statistics in a tree after it has been grown */
template<typename FMatrix>
template<typename FMatrix, typename TStats>
class TreeRefresher: public IUpdater<FMatrix> {
public:
virtual ~TreeRefresher(void) {}
@ -30,7 +30,7 @@ class TreeRefresher: public IUpdater<FMatrix> {
// number of threads
int nthread;
// per-thread temporary space
std::vector< std::vector<GradStats> > stemp;
std::vector< std::vector<TStats> > stemp;
std::vector<RegTree::FVec> fvec_temp;
// setup temp space for each thread
#pragma omp parallel
@ -38,14 +38,14 @@ class TreeRefresher: public IUpdater<FMatrix> {
nthread = omp_get_num_threads();
}
fvec_temp.resize(nthread, RegTree::FVec());
stemp.resize(trees.size() * nthread, std::vector<GradStats>());
stemp.resize(trees.size() * nthread, std::vector<TStats>());
#pragma omp parallel
{
int tid = omp_get_thread_num();
for (size_t i = 0; i < trees.size(); ++i) {
std::vector<GradStats> &vec = stemp[tid * trees.size() + i];
vec.resize(trees[i]->param.num_nodes);
std::fill(vec.begin(), vec.end(), GradStats());
std::vector<TStats> &vec = stemp[tid * trees.size() + i];
vec.resize(trees[i]->param.num_nodes, TStats(param));
std::fill(vec.begin(), vec.end(), TStats(param));
}
fvec_temp[tid].Init(trees[0]->param.num_feature);
}
@ -56,17 +56,16 @@ class TreeRefresher: public IUpdater<FMatrix> {
const SparseBatch &batch = iter->Value();
utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
"too large batch size ");
const unsigned nbatch = static_cast<unsigned>(batch.size);
const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
#pragma omp parallel for schedule(static)
for (unsigned i = 0; i < nbatch; ++i) {
for (bst_omp_uint i = 0; i < nbatch; ++i) {
SparseBatch::Inst inst = batch[i];
const int tid = omp_get_thread_num();
const size_t ridx = batch.base_rowid + i;
const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
RegTree::FVec &feats = fvec_temp[tid];
feats.Fill(inst);
for (size_t j = 0; j < trees.size(); ++j) {
AddStats(*trees[j], feats, gpair[ridx],
info.GetRoot(j),
AddStats(*trees[j], feats, gpair, info, ridx,
&stemp[tid * trees.size() + j]);
}
feats.Drop(inst);
@ -95,31 +94,34 @@ class TreeRefresher: public IUpdater<FMatrix> {
private:
inline static void AddStats(const RegTree &tree,
const RegTree::FVec &feat,
const bst_gpair &gpair, unsigned root_id,
std::vector<GradStats> *p_gstats) {
std::vector<GradStats> &gstats = *p_gstats;
const std::vector<bst_gpair> &gpair,
const BoosterInfo &info,
const bst_uint ridx,
std::vector<TStats> *p_gstats) {
std::vector<TStats> &gstats = *p_gstats;
// start from the root group that this instance belongs to
int pid = static_cast<int>(root_id);
gstats[pid].Add(gpair);
int pid = static_cast<int>(info.GetRoot(ridx));
gstats[pid].Add(gpair, info, ridx);
// traverse the tree
while (!tree[pid].is_leaf()) {
unsigned split_index = tree[pid].split_index();
pid = tree.GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index));
gstats[pid].Add(gpair);
gstats[pid].Add(gpair, info, ridx);
}
}
inline void Refresh(const std::vector<GradStats> &gstats,
inline void Refresh(const std::vector<TStats> &gstats,
int nid, RegTree *p_tree) {
RegTree &tree = *p_tree;
tree.stat(nid).base_weight = param.CalcWeight(gstats[nid]);
tree.stat(nid).base_weight = static_cast<float>(gstats[nid].CalcWeight(param));
tree.stat(nid).sum_hess = static_cast<float>(gstats[nid].sum_hess);
gstats[nid].SetLeafVec(param, tree.leafvec(nid));
if (tree[nid].is_leaf()) {
tree[nid].set_leaf(tree.stat(nid).base_weight * param.learning_rate);
} else {
tree.stat(nid).loss_chg =
param.CalcGain(gstats[tree[nid].cleft()]) +
param.CalcGain(gstats[tree[nid].cright()]) -
param.CalcGain(gstats[nid]);
tree.stat(nid).loss_chg = static_cast<float>(
gstats[tree[nid].cleft()].CalcGain(param) +
gstats[tree[nid].cright()].CalcGain(param) -
gstats[nid].CalcGain(param));
this->Refresh(gstats, tree[nid].cleft(), p_tree);
this->Refresh(gstats, tree[nid].cright(), p_tree);
}


@ -40,7 +40,7 @@ class IStream {
*/
template<typename T>
inline void Write(const std::vector<T> &vec) {
uint64_t sz = vec.size();
uint64_t sz = static_cast<uint64_t>(vec.size());
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&vec[0], sizeof(T) * sz);
@ -66,7 +66,7 @@ class IStream {
* \param str the string to be serialized
*/
inline void Write(const std::string &str) {
uint64_t sz = str.length();
uint64_t sz = static_cast<uint64_t>(str.length());
this->Write(&sz, sizeof(sz));
if (sz != 0) {
this->Write(&str[0], sizeof(char) * sz);
@ -102,6 +102,9 @@ class FileStream : public IStream {
virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp);
}
inline void Seek(size_t pos) {
fseek(fp, static_cast<long>(pos), SEEK_SET);
}
inline void Close(void) {
fclose(fp);
}
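
Both Write overloads above emit the same framing: a uint64 element count followed by the raw payload bytes, and any reader has to mirror it. A sketch of the matching read side, assuming an IStream fi whose Read(void*, size_t) returns nonzero on success, as MetaInfo::LoadBinary above relies on:

uint64_t sz;
utils::Check(fi.Read(&sz, sizeof(sz)) != 0, "bad stream");
std::vector<float> vec(static_cast<size_t>(sz));
if (sz != 0) {
  utils::Check(fi.Read(&vec[0], sizeof(float) * vec.size()) != 0, "bad stream");
}
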


@ -17,26 +17,26 @@ namespace utils {
* \tparam IndexType type of index used to store the index position, usually unsigned or size_t
* \tparam UseAcList whether to enable the usage of aclist; this option must be enabled manually
* \tparam SizeType type used to store the row pointers, default size_t
*/
template<typename IndexType, bool UseAcList = false>
template<typename IndexType, bool UseAcList = false, typename SizeType = size_t>
struct SparseCSRMBuilder {
private:
/*! \brief dummy variable used in the indicator matrix construction */
std::vector<size_t> dummy_aclist;
/*! \brief pointer to each of the row */
std::vector<size_t> &rptr;
std::vector<SizeType> &rptr;
/*! \brief index of nonzero entries in each row */
std::vector<IndexType> &findex;
/*! \brief a list of active rows, used when many rows are empty */
std::vector<size_t> &aclist;
public:
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
std::vector<IndexType> &p_findex)
:rptr(p_rptr), findex(p_findex), aclist(dummy_aclist) {
Assert(!UseAcList, "enabling bug");
}
/*! \brief use with caution! rptr must be cleaned before use */
SparseCSRMBuilder(std::vector<size_t> &p_rptr,
SparseCSRMBuilder(std::vector<SizeType> &p_rptr,
std::vector<IndexType> &p_findex,
std::vector<size_t> &p_aclist)
:rptr(p_rptr), findex(p_findex), aclist(p_aclist) {
@ -62,7 +62,7 @@ struct SparseCSRMBuilder {
* \param row_id the id of the row
* \param nelem number of elements of budget to add to this row
*/
inline void AddBudget(size_t row_id, size_t nelem = 1) {
inline void AddBudget(size_t row_id, SizeType nelem = 1) {
if (rptr.size() < row_id + 2) {
rptr.resize(row_id + 2, 0);
}
@ -101,7 +101,7 @@ struct SparseCSRMBuilder {
* element to each row; the number of calls shall be exactly the same as AddBudget
*/
inline void PushElem(size_t row_id, IndexType col_id) {
size_t &rp = rptr[row_id + 1];
SizeType &rp = rptr[row_id + 1];
findex[rp++] = col_id;
}
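
The builder makes two passes over the same (row, col) entries: first count a budget per row, then push exactly that many elements. A sketch of the protocol, modeled on the R wrapper usage later in this diff; the InitStorage() call between the passes and the bst_ulong typedef are assumed from headers not shown in this hunk:

const size_t rows[] = {0, 0, 2};           // toy COO input: three entries
const unsigned cols[] = {1, 3, 0};
std::vector<bst_ulong> rptr;               // row pointers, SizeType = bst_ulong
std::vector<unsigned> findex;              // column indices
utils::SparseCSRMBuilder<unsigned, false, bst_ulong> builder(rptr, findex);
builder.InitBudget();
for (int i = 0; i < 3; ++i) builder.AddBudget(rows[i]);          // pass 1: count per row
builder.InitStorage();                     // assumed: turn budgets into row offsets
for (int i = 0; i < 3; ++i) builder.PushElem(rows[i], cols[i]);  // pass 2: fill
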
/*!


@ -9,10 +9,26 @@
#include <omp.h>
#else
#ifndef DISABLE_OPENMP
#warning "OpenMP is not available, compile to single thread code"
#ifndef _MSC_VER
#warning "OpenMP is not available, compile to single thread code."\
"You may want to ungrade your compiler to enable OpenMP support,"\
"to get benefit of multi-threading."
#else
// TODO add warning for msvc
#endif
#endif
inline int omp_get_thread_num() { return 0; }
inline int omp_get_num_threads() { return 1; }
inline void omp_set_num_threads(int nthread) {}
#endif
// loop variable used in openmp
namespace xgboost {
#ifdef _MSC_VER
typedef int bst_omp_uint;
#else
typedef unsigned bst_omp_uint;
#endif
} // namespace xgboost
#endif // XGBOOST_UTILS_OMP_H_
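
The reason for the typedef: MSVC implements only OpenMP 2.0, whose parallel for requires a signed loop index, while gcc and clang accept unsigned. Routing every parallel loop through bst_omp_uint keeps one spelling for both, which is what all the unsigned-to-bst_omp_uint changes in this diff are doing. A sketch (the function and data are hypothetical):

#include <vector>
#include "utils/omp.h"  // path assumed relative to src/

void ScalePreds(std::vector<float> *io_preds) {
  std::vector<float> &preds = *io_preds;
  const xgboost::bst_omp_uint ndata = static_cast<xgboost::bst_omp_uint>(preds.size());
  #pragma omp parallel for schedule(static)
  for (xgboost::bst_omp_uint j = 0; j < ndata; ++j) {  // int on MSVC, unsigned elsewhere
    preds[j] *= 0.5f;
  }
}
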


@ -88,11 +88,21 @@ inline void Shuffle(std::vector<T> &data) {
struct Random{
/*! \brief set random number seed */
inline void Seed(unsigned sd) {
this->rseed = sd;
this->rseed = sd;
#if defined(_MSC_VER)||defined(_WIN32)
srand(rseed);
#endif
}
/*! \brief return a real number uniform in [0,1) */
inline double RandDouble(void) {
return static_cast<double>( rand_r( &rseed ) ) / (static_cast<double>( RAND_MAX )+1.0);
// use rand instead of rand_r on Windows; for MSVC this is fine since rand is thread-safe there
// For cygwin and mingw this can slow down parallelism, but rand_r is only used in objective-inl.hpp, so it won't affect speed in general
// TODO: replace with another PRNG
#if defined(_MSC_VER)||defined(_WIN32)
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
#else
return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
#endif
}
// random number seed
unsigned rseed;


@ -6,8 +6,15 @@
* \author Tianqi Chen
*/
#define _CRT_SECURE_NO_WARNINGS
#include <cstdio>
#include <cstdarg>
#include <cstdlib>
#ifdef _MSC_VER
#define fopen64 fopen
// NOTE: sprintf_s is not equivalent to snprintf,
// but they behave the same on success, which is sufficient for our case
#define snprintf sprintf_s
#define vsnprintf vsprintf_s
#else
#ifdef _FILE_OFFSET_BITS
#if _FILE_OFFSET_BITS == 32
@ -36,49 +43,68 @@ typedef long int64_t;
#include <inttypes.h>
#endif
#include <cstdio>
#include <cstdarg>
#include <cstdlib>
namespace xgboost {
/*! \brief namespace for helper utils of the project */
namespace utils {
/*! \brief error message buffer length */
const int kErrorBuffer = 1 << 12;
#ifndef XGBOOST_CUSTOMIZE_ERROR_
/*!
* \brief handling of Assert error, caused by inappropriate input
* \param msg error message
*/
inline void HandleAssertError(const char *msg) {
fprintf(stderr, "AssertError:%s\n", msg);
exit(-1);
}
/*!
* \brief handling of Check error, caused by inappropriate input
* \param msg error message
*/
inline void HandleCheckError(const char *msg) {
fprintf(stderr, "%s\n", msg);
exit(-1);
}
#else
// declarations only; someone must implement these
void HandleAssertError(const char *msg);
void HandleCheckError(const char *msg);
#endif
/*! \brief assert a condition is true; use this to handle debug information */
inline void Assert(bool exp, const char *fmt, ...) {
if (!exp) {
std::string msg(kErrorBuffer, '\0');
va_list args;
va_start(args, fmt);
fprintf(stderr, "AssertError:");
vfprintf(stderr, fmt, args);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
HandleAssertError(msg.c_str());
}
}
/*! \brief same as assert, but intended to produce a message for the user */
inline void Check(bool exp, const char *fmt, ...) {
if (!exp) {
std::string msg(kErrorBuffer, '\0');
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
HandleCheckError(msg.c_str());
}
}
/*! \brief report error message, same as check */
inline void Error(const char *fmt, ...) {
{
std::string msg(kErrorBuffer, '\0');
va_list args;
va_start(args, fmt);
vfprintf(stderr, fmt, args);
vsnprintf(&msg[0], kErrorBuffer, fmt, args);
va_end(args);
fprintf(stderr, "\n");
exit(-1);
HandleCheckError(msg.c_str());
}
}
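
With XGBOOST_CUSTOMIZE_ERROR_ defined, the message is now formatted into a buffer and handed to handlers the embedding application must supply, instead of being printed before exit(-1); the R wrapper later in this diff provides exactly such handlers via R's error(). A stand-alone sketch of one possible custom pair, compiled with -DXGBOOST_CUSTOMIZE_ERROR_:

#include <cstdio>
#include <cstdlib>
#include <stdexcept>
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
  fprintf(stderr, "AssertError:%s\n", msg);
  std::abort();
}
void HandleCheckError(const char *msg) {
  throw std::runtime_error(msg);  // e.g. surface as a host-language exception
}
}  // namespace utils
}  // namespace xgboost
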

windows/README.md Normal file

@ -0,0 +1 @@
This is a test of the minimal set of files needed for the Windows version.

windows/xgboost.sln Normal file

@ -0,0 +1,26 @@

Microsoft Visual Studio Solution File, Format Version 11.00
# Visual Studio 2010
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.vcxproj", "{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Debug|x64 = Debug|x64
Release|Win32 = Release|Win32
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.ActiveCfg = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|Win32.Build.0 = Debug|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.ActiveCfg = Debug|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Debug|x64.Build.0 = Debug|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.ActiveCfg = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|Win32.Build.0 = Release|Win32
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.ActiveCfg = Release|x64
{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -0,0 +1,117 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{1D6A56A5-5557-4D20-9D50-3DE4C30BE00C}</ProjectGuid>
<RootNamespace>xgboost</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup />
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\..\src\io\io.cpp" />
<ClCompile Include="..\..\src\xgboost_main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>


@ -35,7 +35,7 @@ bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.real(sum(as.integer(preds > 0.5) != labels)) / length(labels)
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
@ -100,7 +100,7 @@ logregobj <- function(preds, dtrain) {
# Keep this in mind when you use the customization; you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.real(sum(labels != (preds > 0.0))) / length(labels)
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}


@ -13,7 +13,7 @@ dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map; definitions are the same as in the C++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# specify validation sets to watch performance
evallist = [(dtest,'eval'), (dtrain,'train')]
@ -75,7 +75,7 @@ print ('start running example to use customized objective function')
# note: for a customized objective function, we leave the objective as default
# note: what we are getting is the margin value of the prediction
# you must know what you are doing
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 }
param = {'max_depth':2, 'eta':1, 'silent':1 }
# user-defined objective function: given prediction, return gradient and second-order gradient
# this is log-likelihood loss
@ -107,7 +107,7 @@ bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
#
print ('start running example to start from an initial prediction')
# specify parameters via map; definitions are the same as in the C++ version
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }
# train xgboost for 1 round
bst = xgb.train( param, dtrain, 1, evallist )
# Note: we need the margin value instead of transformed prediction in set_base_margin


@ -48,15 +48,15 @@ xgb.setinfo <- function(dmat, name, info) {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name == "label") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "weight") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.real(info))
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info))
return(TRUE)
}
if (name == "group") {
@ -214,7 +214,7 @@ xgb.iter.eval <- function(booster, watchlist, iter) {
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)


@ -19,6 +19,7 @@ xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p
xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p
xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p
xglib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float)
xglib.XGDMatrixGetUIntInfo.restype = ctypes.POINTER(ctypes.c_uint)
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
xglib.XGBoosterCreate.restype = ctypes.c_void_p
@ -27,10 +28,10 @@ xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
def ctypes2numpy(cptr, length):
def ctypes2numpy(cptr, length, dtype):
# convert a ctypes pointer array to numpy
assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
res = numpy.zeros(length, dtype='float32')
res = numpy.zeros(length, dtype=dtype)
assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
return res
@ -44,7 +45,7 @@ class DMatrix:
return
if isinstance(data, str):
self.handle = ctypes.c_void_p(
xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 1))
xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0))
elif isinstance(data, scp.csr_matrix):
self.__init_from_csr(data)
elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
@ -76,23 +77,31 @@ class DMatrix:
# destructor
def __del__(self):
xglib.XGDMatrixFree(self.handle)
def __get_float_info(self, field):
def get_float_info(self, field):
length = ctypes.c_ulong()
ret = xglib.XGDMatrixGetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
ctypes.byref(length))
return ctypes2numpy(ret, length.value)
def __set_float_info(self, field, data):
xglib.XGDMatrixSetFloatInfo(self.handle,ctypes.c_char_p(field.encode('utf-8')),
return ctypes2numpy(ret, length.value, 'float32')
def get_uint_info(self, field):
length = ctypes.c_ulong()
ret = xglib.XGDMatrixGetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
ctypes.byref(length))
return ctypes2numpy(ret, length.value, 'uint32')
def set_float_info(self, field, data):
xglib.XGDMatrixSetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
(ctypes.c_float*len(data))(*data), len(data))
def set_uint_info(self, field, data):
xglib.XGDMatrixSetUIntInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
(ctypes.c_uint*len(data))(*data), len(data))
# save the DMatrix into a binary file
def save_binary(self, fname, silent=True):
xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
# set label of dmatrix
def set_label(self, label):
self.__set_float_info('label', label)
self.set_float_info('label', label)
# set weight of each instance
def set_weight(self, weight):
self.__set_float_info('weight', weight)
self.set_float_info('weight', weight)
# set initialized margin prediction
def set_base_margin(self, margin):
"""
@ -103,19 +112,19 @@ class DMatrix:
e.g. for logistic regression: need to put in value before logistic transformation
see also example/demo.py
"""
self.__set_float_info('base_margin', margin)
self.set_float_info('base_margin', margin)
# set group size of dmatrix, used for rank
def set_group(self, group):
xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
# get label from dmatrix
def get_label(self):
return self.__get_float_info('label')
return self.get_float_info('label')
# get weight from dmatrix
def get_weight(self):
return self.__get_float_info('weight')
return self.get_float_info('weight')
# get base_margin from dmatrix
def get_base_margin(self):
return self.__get_float_info('base_margin')
return self.get_float_info('base_margin')
def num_row(self):
return xglib.XGDMatrixNumRow(self.handle)
# slice the DMatrix to return a new DMatrix that only contains rindex
@ -189,7 +198,7 @@ class Booster:
length = ctypes.c_ulong()
preds = xglib.XGBoosterPredict(self.handle, data.handle,
int(output_margin), ctypes.byref(length))
return ctypes2numpy(preds, length.value)
return ctypes2numpy(preds, length.value, 'float32')
def save_model(self, fname):
""" save model to file """
xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))


@ -2,13 +2,24 @@
#include <string>
#include <utility>
#include <cstring>
#include "xgboost_wrapper.h"
#include "xgboost_R.h"
#include "xgboost_wrapper.h"
#include "../src/utils/utils.h"
#include "../src/utils/omp.h"
#include "../src/utils/matrix_csr.h"
using namespace xgboost;
// implements error handling
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
error("%s", msg);
}
void HandleCheckError(const char *msg) {
error("%s", msg);
}
} // namespace utils
} // namespace xgboost
extern "C" {
void _DMatrixFinalizer(SEXP ext) {
@ -51,9 +62,9 @@ extern "C" {
int ncol = length(indptr) - 1;
int ndata = length(data);
// transform into CSR format
std::vector<size_t> row_ptr;
std::vector<bst_ulong> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
utils::SparseCSRMBuilder<std::pair<unsigned,float>, false, bst_ulong> builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
@ -108,7 +119,7 @@ extern "C" {
}
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
size_t olen;
bst_ulong olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
@ -165,17 +176,19 @@ extern "C" {
std::vector<void*> vec_dmats;
std::vector<std::string> vec_names;
std::vector<const char*> vec_sptr;
for (int i = 0; i < len; ++i){
for (int i = 0; i < len; ++i) {
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
vec_sptr.push_back(vec_names.back().c_str());
}
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
&vec_dmats[0], &vec_sptr[0], len));
&vec_dmats[0], &vec_sptr[0], len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
size_t olen;
bst_ulong olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
@ -194,13 +207,13 @@ extern "C" {
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
size_t olen;
bst_ulong olen;
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
&olen);
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
for (size_t i = 0; i < olen; ++i) {
fprintf(fo, "booster[%lu]:\n", i);
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
fprintf(fo, "%s", res[i]);
}
fclose(fo);


@ -23,18 +23,18 @@ class Booster: public learner::BoostLearner<FMatrixS> {
this->init_model = false;
this->SetCacheData(mats);
}
const float *Pred(const DataMatrix &dmat, int output_margin, size_t *len) {
const float *Pred(const DataMatrix &dmat, int output_margin, bst_ulong *len) {
this->CheckInitModel();
this->Predict(dmat, output_margin, &this->preds_);
*len = this->preds_.size();
return &this->preds_[0];
}
inline void BoostOneIter(const DataMatrix &train,
float *grad, float *hess, size_t len) {
float *grad, float *hess, bst_ulong len) {
this->gpair_.resize(len);
const unsigned ndata = static_cast<unsigned>(len);
const bst_omp_uint ndata = static_cast<bst_omp_uint>(len);
#pragma omp parallel for schedule(static)
for (unsigned j = 0; j < ndata; ++j) {
for (bst_omp_uint j = 0; j < ndata; ++j) {
gpair_[j] = bst_gpair(grad[j], hess[j]);
}
gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
@ -48,7 +48,7 @@ class Booster: public learner::BoostLearner<FMatrixS> {
learner::BoostLearner<FMatrixS>::LoadModel(fname);
this->init_model = true;
}
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, size_t *len) {
inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, bst_ulong *len) {
model_dump = this->DumpModel(fmap, with_stats);
model_dump_cptr.resize(model_dump.size());
for (size_t i = 0; i < model_dump.size(); ++i) {
@ -76,35 +76,37 @@ extern "C"{
void* XGDMatrixCreateFromFile(const char *fname, int silent) {
return LoadDataMatrix(fname, silent, false);
}
void* XGDMatrixCreateFromCSR(const size_t *indptr,
void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
size_t nindptr,
size_t nelem) {
bst_ulong nindptr,
bst_ulong nelem) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.row_ptr_.resize(nindptr);
memcpy(&mat.row_ptr_[0], indptr, sizeof(size_t)*nindptr);
mat.row_data_.resize(nelem);
for (size_t i = 0; i < nelem; ++i) {
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
mat.info.num_col = std::max(mat.info.num_col,
static_cast<size_t>(indices[i]+1));
for (bst_ulong i = 0; i < nindptr; ++i) {
mat.row_ptr_[i] = static_cast<size_t>(indptr[i]);
}
mat.info.num_row = nindptr - 1;
mat.row_data_.resize(nelem);
for (bst_ulong i = 0; i < nelem; ++i) {
mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
mat.info.info.num_col = std::max(mat.info.info.num_col,
static_cast<size_t>(indices[i]+1));
}
mat.info.info.num_row = nindptr - 1;
return p_mat;
}
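To make the CSR layout concrete, here is an illustrative call (matrix values made up) for the 2x3 matrix {{1,0,2},{0,3,0}}:

bst_ulong indptr[]  = {0, 2, 3};        // row i occupies [indptr[i], indptr[i+1])
unsigned  indices[] = {0, 2, 1};        // column index of each nonzero
float     data[]    = {1.f, 2.f, 3.f};  // value of each nonzero
void *dmat = XGDMatrixCreateFromCSR(indptr, indices, data, 3, 3);
// num_row = nindptr - 1 = 2; num_col is inferred as max(index) + 1 = 3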
void* XGDMatrixCreateFromMat(const float *data,
size_t nrow,
size_t ncol,
bst_ulong nrow,
bst_ulong ncol,
float missing) {
DMatrixSimple *p_mat = new DMatrixSimple();
DMatrixSimple &mat = *p_mat;
mat.info.num_row = nrow;
mat.info.num_col = ncol;
for (size_t i = 0; i < nrow; ++i, data += ncol) {
size_t nelem = 0;
for (size_t j = 0; j < ncol; ++j) {
mat.info.info.num_row = nrow;
mat.info.info.num_col = ncol;
for (bst_ulong i = 0; i < nrow; ++i, data += ncol) {
bst_ulong nelem = 0;
for (bst_ulong j = 0; j < ncol; ++j) {
if (data[j] != missing) {
mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
++nelem;
@@ -116,7 +118,7 @@ extern "C"{
}
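Note that the missing-value check above is a plain data[j] != missing comparison, so the sentinel must be a finite value: NaN compares unequal to itself and would never be treated as missing here. An illustrative dense call with a finite sentinel (values made up):

const float kMiss = -999.0f;            // finite sentinel; matched with !=
float data[] = {1.0f, kMiss, 2.0f,
                kMiss, 3.0f, kMiss};    // 2x3, row-major
void *dmat = XGDMatrixCreateFromMat(data, 2, 3, kMiss);
// only the three non-sentinel entries become stored sparse entries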
void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
size_t len) {
bst_ulong len) {
DMatrixSimple tmp;
DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
if (dsrc.magic != DMatrixSimple::kMagic) {
@@ -130,17 +132,17 @@ extern "C"{
utils::Check(src.info.group_ptr.size() == 0,
"slice does not support group structure");
ret.Clear();
ret.info.num_row = len;
ret.info.num_col = src.info.num_col;
ret.info.info.num_row = len;
ret.info.info.num_col = src.info.num_col();
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
iter->BeforeFirst();
utils::Assert(iter->Next(), "slice");
const SparseBatch &batch = iter->Value();
for (size_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
const int ridx = idxset[i];
SparseBatch::Inst inst = batch[ridx];
utils::Check(static_cast<size_t>(ridx) < batch.size, "slice index exceed number of rows");
utils::Check(static_cast<bst_ulong>(ridx) < batch.size, "slice index exceed number of rows");
ret.row_data_.resize(ret.row_data_.size() + inst.length);
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
sizeof(SparseBatch::Entry) * inst.length);
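Usage sketch (dmat being a handle created by one of the constructors above): selecting rows 0 and 2 into a new two-row matrix:

int idxset[] = {0, 2};
void *sub = XGDMatrixSliceDMatrix(dmat, idxset, 2);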
@@ -163,34 +165,46 @@ extern "C"{
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent);
}
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, size_t len) {
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, bst_ulong len) {
std::vector<float> &vec =
static_cast<DataMatrix*>(handle)->info.GetInfo(field);
static_cast<DataMatrix*>(handle)->info.GetFloatInfo(field);
vec.resize(len);
memcpy(&vec[0], info, sizeof(float) * len);
}
void XGDMatrixSetGroup(void *handle, const unsigned *group, size_t len) {
void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *info, bst_ulong len) {
std::vector<unsigned> &vec =
static_cast<DataMatrix*>(handle)->info.GetUIntInfo(field);
vec.resize(len);
memcpy(&vec[0], info, sizeof(unsigned) * len);
}
void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len) {
DataMatrix *pmat = static_cast<DataMatrix*>(handle);
pmat->info.group_ptr.resize(len + 1);
pmat->info.group_ptr[0] = 0;
for (size_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i]+group[i];
}
}
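The loop turns per-group sizes into cumulative row offsets. For example, for ranking data with two query groups covering 3 and 2 rows (dmat is a placeholder handle):

unsigned group[] = {3, 2};
XGDMatrixSetGroup(dmat, group, 2);
// group_ptr becomes {0, 3, 5}: rows [0,3) form group 0, rows [3,5) form group 1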
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* len) {
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<float> &vec =
static_cast<const DataMatrix*>(handle)->info.GetInfo(field);
static_cast<const DataMatrix*>(handle)->info.GetFloatInfo(field);
*len = vec.size();
return &vec[0];
}
size_t XGDMatrixNumRow(const void *handle) {
return static_cast<const DataMatrix*>(handle)->info.num_row;
const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* len) {
const std::vector<unsigned> &vec =
static_cast<const DataMatrix*>(handle)->info.GetUIntInfo(field);
*len = vec.size();
return &vec[0];
}
bst_ulong XGDMatrixNumRow(const void *handle) {
return static_cast<const DataMatrix*>(handle)->info.num_row();
}
// xgboost implementation
void *XGBoosterCreate(void *dmats[], size_t len) {
void *XGBoosterCreate(void *dmats[], bst_ulong len) {
std::vector<DataMatrix*> mats;
for (size_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
mats.push_back(dtr);
}
@@ -210,7 +224,7 @@ extern "C"{
bst->UpdateOneIter(iter, *dtr);
}
void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, size_t len) {
float *grad, float *hess, bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
bst->CheckInitModel();
@@ -218,11 +232,11 @@ extern "C"{
bst->BoostOneIter(*dtr, grad, hess, len);
}
const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], size_t len) {
const char *evnames[], bst_ulong len) {
Booster *bst = static_cast<Booster*>(handle);
std::vector<std::string> names;
std::vector<const DataMatrix*> mats;
for (size_t i = 0; i < len; ++i) {
for (bst_ulong i = 0; i < len; ++i) {
mats.push_back(static_cast<DataMatrix*>(dmats[i]));
names.push_back(std::string(evnames[i]));
}
@@ -230,7 +244,7 @@ extern "C"{
bst->eval_str = bst->EvalOneIter(iter, mats, names);
return bst->eval_str.c_str();
}
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, size_t *len) {
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len) {
return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, len);
}
void XGBoosterLoadModel(void *handle, const char *fname) {
@@ -239,7 +253,7 @@ extern "C"{
void XGBoosterSaveModel(const void *handle, const char *fname) {
static_cast<const Booster*>(handle)->SaveModel(fname);
}
const char** XGBoosterDumpModel(void *handle, const char *fmap, size_t *len){
const char** XGBoosterDumpModel(void *handle, const char *fmap, bst_ulong *len){
utils::FeatMap featmap;
if (strlen(fmap) != 0) {
featmap.LoadText(fmap);

wrapper/xgboost_wrapper.h
View File

@@ -7,13 +7,16 @@
* can be used to create wrappers for other languages
*/
#include <cstdio>
#define XGB_DLL
// manually define unsigned long
typedef unsigned long bst_ulong;
extern "C" {
/*!
* \brief load a data matrix
* \return a loaded data matrix
*/
void* XGDMatrixCreateFromFile(const char *fname, int silent);
XGB_DLL void* XGDMatrixCreateFromFile(const char *fname, int silent);
/*!
* \brief create a matrix content from csr format
* \param indptr pointer to row headers
@@ -23,11 +26,11 @@ extern "C" {
* \param nelem number of nonzero elements in the matrix
* \return created dmatrix
*/
void* XGDMatrixCreateFromCSR(const size_t *indptr,
const unsigned *indices,
const float *data,
size_t nindptr,
size_t nelem);
XGB_DLL void* XGDMatrixCreateFromCSR(const bst_ulong *indptr,
const unsigned *indices,
const float *data,
bst_ulong nindptr,
bst_ulong nelem);
/*!
* \brief create matrix content from dense matrix
* \param data pointer to the data space
@@ -36,10 +39,10 @@ extern "C" {
* \param missing the value that represents a missing entry
* \return created dmatrix
*/
void* XGDMatrixCreateFromMat(const float *data,
size_t nrow,
size_t ncol,
float missing);
XGB_DLL void* XGDMatrixCreateFromMat(const float *data,
bst_ulong nrow,
bst_ulong ncol,
float missing);
/*!
* \brief create a new dmatrix from sliced content of existing matrix
* \param handle instance of data matrix to be sliced
@@ -47,20 +50,20 @@ extern "C" {
* \param len length of index set
* \return a sliced new matrix
*/
void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
size_t len);
XGB_DLL void* XGDMatrixSliceDMatrix(void *handle,
const int *idxset,
bst_ulong len);
/*!
* \brief free space in data matrix
*/
void XGDMatrixFree(void *handle);
XGB_DLL void XGDMatrixFree(void *handle);
/*!
* \brief save a data matrix into a binary file
* \param handle an instance of data matrix
* \param fname file name
* \param silent whether to suppress printing of statistics when saving
*/
void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
XGB_DLL void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
/*!
* \brief set a float vector as an information field of the matrix
* \param handle an instance of data matrix
@@ -68,52 +71,68 @@ extern "C" {
* \param array pointer to float vector
* \param len length of array
*/
void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, size_t len);
XGB_DLL void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, bst_ulong len);
/*!
* \brief set a uint32 vector as an information field of the matrix
* \param handle an instance of data matrix
* \param field field name
* \param array pointer to unsigned int vector
* \param len length of array
*/
XGB_DLL void XGDMatrixSetUIntInfo(void *handle, const char *field, const unsigned *array, bst_ulong len);
/*!
* \brief set the group boundaries of the training matrix (used for ranking)
* \param handle an instance of data matrix
* \param group pointer to the size of each group
* \param len length of array
*/
void XGDMatrixSetGroup(void *handle, const unsigned *group, size_t len);
XGB_DLL void XGDMatrixSetGroup(void *handle, const unsigned *group, bst_ulong len);
/*!
* \brief get float info vector from matrix
* \param handle an instance of data matrix
* \param field field name
* \param out_len used to set result length
* \return pointer to the label
* \return pointer to the result
*/
const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);
XGB_DLL const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, bst_ulong* out_len);
/*!
* \brief get uint32 info vector from matrix
* \param handle an instance of data matrix
* \param field field name
* \param out_len used to set result length
* \return pointer to the result
*/
XGB_DLL const unsigned* XGDMatrixGetUIntInfo(const void *handle, const char *field, bst_ulong* out_len);
/*!
* \brief return number of rows
*/
size_t XGDMatrixNumRow(const void *handle);
XGB_DLL bst_ulong XGDMatrixNumRow(const void *handle);
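Putting the DMatrix half of the API together, a typical caller sequence looks like this (a sketch; "train.txt" and the label values are placeholders):

void *dtrain = XGDMatrixCreateFromFile("train.txt", 1);
float labels[] = {0.0f, 1.0f, 0.0f};
XGDMatrixSetFloatInfo(dtrain, "label", labels, 3);
bst_ulong nrow = XGDMatrixNumRow(dtrain);  // query the row count back
XGDMatrixFree(dtrain);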
// --- start XGBoost class
/*!
* \brief create xgboost learner
* \param dmats matrices that are set to be cached
* \param len length of dmats
*/
void *XGBoosterCreate(void* dmats[], size_t len);
XGB_DLL void *XGBoosterCreate(void* dmats[], bst_ulong len);
/*!
* \brief free the object in handle
* \param handle handle to be freed
*/
void XGBoosterFree(void* handle);
XGB_DLL void XGBoosterFree(void* handle);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param value value of the parameter
*/
void XGBoosterSetParam(void *handle, const char *name, const char *value);
XGB_DLL void XGBoosterSetParam(void *handle, const char *name, const char *value);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration number
* \param dtrain training data
*/
void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
XGB_DLL void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
/*!
* \brief update the model by directly specifying the gradient and second-order gradient;
* this can be used in place of UpdateOneIter, to support customized loss functions
@@ -123,8 +142,8 @@ extern "C" {
* \param hess second order gradient statistics
* \param len length of grad/hess array
*/
void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, size_t len);
XGB_DLL void XGBoosterBoostOneIter(void *handle, void *dtrain,
float *grad, float *hess, bst_ulong len);
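A hedged end-to-end sketch of the customized-objective path this function enables, again using squared error (dtrain, labels, and the round count are placeholders):

void *cache[] = {dtrain};
void *booster = XGBoosterCreate(cache, 1);
XGBoosterSetParam(booster, "eta", "0.1");
std::vector<float> grad, hess;
for (int iter = 0; iter < 10; ++iter) {
  bst_ulong n;
  const float *pred = XGBoosterPredict(booster, dtrain, 1, &n);  // raw margin
  grad.resize(n); hess.resize(n);
  for (bst_ulong i = 0; i < n; ++i) {
    grad[i] = pred[i] - labels[i];  // squared-error gradient
    hess[i] = 1.0f;                 // squared-error hessian
  }
  XGBoosterBoostOneIter(booster, dtrain, &grad[0], &hess[0], n);
}
XGBoosterFree(booster);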
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
@@ -134,8 +153,8 @@ extern "C" {
* \param len length of dmats
* \return the string containing the evaluation statistics
*/
const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], size_t len);
XGB_DLL const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
const char *evnames[], bst_ulong len);
/*!
* \brief make prediction based on dmat
* \param handle handle
@@ -143,19 +162,19 @@ extern "C" {
* \param output_margin whether to output only the raw margin value
* \param len used to store the length of the returned result
*/
const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, size_t *len);
XGB_DLL const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, bst_ulong *len);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel(void *handle, const char *fname);
XGB_DLL void XGBoosterLoadModel(void *handle, const char *fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel(const void *handle, const char *fname);
XGB_DLL void XGBoosterSaveModel(const void *handle, const char *fname);
/*!
* \brief dump model, return array of strings representing model dump
* \param handle handle
@@ -163,7 +182,7 @@ extern "C" {
* \param out_len length of output array
* \return char *data[], representing dump of each model
*/
const char **XGBoosterDumpModel(void *handle, const char *fmap,
size_t *out_len);
XGB_DLL const char **XGBoosterDumpModel(void *handle, const char *fmap,
bst_ulong *out_len);
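Usage sketch for the dump call, mirroring what the R wrapper above does when it writes its text dump (booster is a placeholder handle; the empty string means no feature map):

bst_ulong n;
const char **dump = XGBoosterDumpModel(booster, "", &n);
for (bst_ulong i = 0; i < n; ++i) {
  printf("booster[%u]:\n%s", static_cast<unsigned>(i), dump[i]);
}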
};
#endif // XGBOOST_WRAPPER_H_