Merge pull request #46 from tqchen/master

merge master into unity
This commit is contained in:
Tianqi Chen 2014-08-26 12:18:26 -07:00
commit 636ffaf23b
23 changed files with 9252 additions and 11 deletions

4
.gitignore vendored
View File

@ -33,4 +33,6 @@ ipch
*log *log
Debug Debug
*suo *suo
*test*
.Rhistory
*.dll

16
R-package/DESCRIPTION Normal file
View File

@ -0,0 +1,16 @@
Package: xgboost
Type: Package
Title: R wrapper of xgboost
Version: 0.3-0
Date: 2014-08-23
Author: Tianqi Chen
Maintainer: Tianqi Chen <tianqi.tchen@gmail.com>
Description: xgboost
License: See LICENSE file
URL: https://github.com/tqchen/xgboost
BugReports: https://github.com/tqchen/xgboost/issues
Depends:
R (>= 2.0.2)
Imports:
Matrix (>= 1.1-0),
methods

13
R-package/LICENSE Normal file
View File

@ -0,0 +1,13 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

18
R-package/NAMESPACE Normal file
View File

@ -0,0 +1,18 @@
importClassesFrom("Matrix", dgCMatrix, dgeMatrix)
export(xgboost)
export(xgb.DMatrix)
export(xgb.getinfo)
export(xgb.setinfo)
# exportClasses(xgb.Boost)
exportMethods(predict)
# export(xgb.Booster)
# export(xgb.train)
# export(xgb.save)
# export(xgb.predict)
# export(xgb.dump)

View File

@ -0,0 +1,16 @@
#' @export
setClass("xgb.Booster")
#' @export
setMethod("predict",
signature = "xgb.Booster",
definition = function(object, newdata, outputmargin = FALSE)
{
if (class(newdata) != "xgb.DMatrix") {
newdata = xgb.DMatrix(newdata)
}
ret <- .Call("XGBoosterPredict_R", object, newdata,
as.integer(outputmargin), PACKAGE="xgboost")
return(ret)
})

222
R-package/R/xgb.Utils.R Normal file
View File

@ -0,0 +1,222 @@
# depends on matrix
.onLoad <- function(libname, pkgname) {
library.dynam("xgboost", pkgname, libname);
}
.onUnload <- function(libpath) {
library.dynam.unload("xgboost", libpath);
}
# constructing DMatrix
xgb.DMatrix <- function(data, info=list(), missing=0.0) {
if (typeof(data) == "character") {
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE="xgboost")
} else if(is.matrix(data)) {
handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE="xgboost")
} else if(class(data) == "dgCMatrix") {
handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE="xgboost")
} else {
stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data)))
}
dmat <- structure(handle, class="xgb.DMatrix")
if (length(info) != 0) {
for (i in 1:length(info)) {
p <- info[i]
xgb.setinfo(dmat, names(p), p[[1]])
}
}
return(dmat)
}
# get information from dmatrix
xgb.getinfo <- function(dmat, name) {
if (typeof(name) != "character") {
stop("xgb.getinfo: name must be character")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name != "label" &&
name != "weight" &&
name != "base_margin" ) {
stop(paste("xgb.getinfo: unknown info name", name))
}
ret <- .Call("XGDMatrixGetInfo_R", dmat, name, PACKAGE="xgboost")
return(ret)
}
# set information into dmatrix, this mutate dmatrix
xgb.setinfo <- function(dmat, name, info) {
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix");
}
if (name == "label") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "weight") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "base_margin") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE="xgboost")
return(TRUE)
}
if (name == "group") {
.Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE="xgboost")
return(TRUE)
}
stop(pase("xgb.setinfo: unknown info name", name))
return(FALSE)
}
# construct a Booster from cachelist
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (typeof(cachelist) != "list") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
for (dm in cachelist) {
if (class(dm) != "xgb.DMatrix") {
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
}
}
handle <- .Call("XGBoosterCreate_R", cachelist, PACKAGE="xgboost")
.Call("XGBoosterSetParam_R", handle, "seed", "0", PACKAGE="xgboost")
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p), PACKAGE="xgboost")
}
}
if (!is.null(modelfile)) {
if (typeof(modelfile) != "character"){
stop("xgb.Booster: modelfile must be character");
}
.Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE="xgboost")
}
return(structure(handle, class="xgb.Booster"))
}
# train a model using given parameters
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL, feval=NULL) {
if (typeof(params) != "list") {
stop("xgb.train: first argument params must be list");
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
}
bst <- xgb.Booster(params, append(watchlist,dtrain))
for (i in 1:nrounds) {
if (is.null(obj)) {
succ <- xgb.iter.update(bst, dtrain, i-1)
} else {
pred <- xgb.predict(bst, dtrain)
gpair <- obj(pred, dtrain)
succ <- xgb.iter.boost(bst, dtrain, gpair)
}
if (length(watchlist) != 0) {
if (is.null(feval)) {
msg <- xgb.iter.eval(bst, watchlist, i-1)
cat(msg); cat("\n")
} else {
cat("["); cat(i); cat("]");
for (j in 1:length(watchlist)) {
w <- watchlist[j]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
ret <- feval(xgb.predict(bst, w[[1]]), w[[1]])
cat("\t"); cat(names(w)); cat("-"); cat(ret$metric);
cat(":"); cat(ret$value)
}
cat("\n")
}
}
}
return(bst)
}
# save model or DMatrix to file
xgb.save <- function(handle, fname) {
if (typeof(fname) != "character") {
stop("xgb.save: fname must be character")
}
if (class(handle) == "xgb.Booster") {
.Call("XGBoosterSaveModel_R", handle, fname, PACKAGE="xgboost")
return(TRUE)
}
if (class(handle) == "xgb.DMatrix") {
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE), PACKAGE="xgboost")
return(TRUE)
}
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
return(FALSE)
}
# predict
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
if (class(booster) != "xgb.Booster") {
stop("xgb.predict: first argument must be type xgb.Booster")
}
if (class(dmat) != "xgb.DMatrix") {
stop("xgb.predict: second argument must be type xgb.DMatrix")
}
ret <- .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin), PACKAGE="xgboost")
return(ret)
}
# dump model
xgb.dump <- function(booster, fname, fmap = "") {
if (class(booster) != "xgb.Booster") {
stop("xgb.dump: first argument must be type xgb.Booster")
}
if (typeof(fname) != "character"){
stop("xgb.dump: second argument must be type character")
}
.Call("XGBoosterDumpModel_R", booster, fname, fmap, PACKAGE="xgboost")
return(TRUE)
}
##--------------------------------------
# the following are low level iteratively function, not needed
# if you do not want to use them
#---------------------------------------
# iteratively update booster with dtrain
xgb.iter.update <- function(booster, dtrain, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE="xgboost")
return(TRUE)
}
# iteratively update booster with customized statistics
xgb.iter.boost <- function(booster, dtrain, gpair) {
if (class(booster) != "xgb.Booster") {
stop("xgb.iter.update: first argument must be type xgb.Booster")
}
if (class(dtrain) != "xgb.DMatrix") {
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
}
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE="xgboost")
return(TRUE)
}
# iteratively evaluate one iteration
xgb.iter.eval <- function(booster, watchlist, iter) {
if (class(booster) != "xgb.Booster") {
stop("xgb.eval: first argument must be type xgb.Booster")
}
if (typeof(watchlist) != "list") {
stop("xgb.eval: only accepts list of DMatrix as watchlist")
}
for (w in watchlist) {
if (class(w) != "xgb.DMatrix") {
stop("xgb.eval: watch list can only contain xgb.DMatrix")
}
}
evnames <- list()
if (length(watchlist) != 0) {
for (i in 1:length(watchlist)) {
w <- watchlist[i]
if (length(names(w)) == 0) {
stop("xgb.eval: name tag must be presented for every elements in watchlist")
}
evnames <- append(evnames, names(w))
}
}
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE="xgboost")
return(msg)
}

24
R-package/R/xgboost.R Normal file
View File

@ -0,0 +1,24 @@
# Main function for xgboost-package
xgboost = function(x=NULL,y=NULL,file=NULL,nrounds=10,params,watchlist=list(),
obj=NULL, feval=NULL, margin=NULL)
{
if (is.null(x) && is.null(y))
{
if (is.null(file))
stop('xgboost need input data, either R objects or local files.')
dtrain = xgb.DMatrix(file)
}
else
dtrain = xgb.DMatrix(x, info=list(label=y))
if (!is.null(margin))
{
succ <- xgb.setinfo(dtrain, "base_margin", margin)
if (!succ)
warning('Attemp to use margin failed.')
}
bst <- xgb.train(params, dtrain, nrounds, watchlist, obj, feval)
return(bst)
}

10
R-package/README.md Normal file
View File

@ -0,0 +1,10 @@
This is subfolder for experimental version of R package.
Not yet ready.
Installation:
```r
require(devtools)
install_github('xgboost','tqchen',subdir='R-package')
```

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,127 @@
# load xgboost library
require(xgboost)
require(methods)
# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
content <- readLines(fname)
nline <- length(content)
label <- numeric(nline)
mat <- matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr <- as.vector(strsplit(content[i], " ")[[1]])
label[i] <- as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv <- strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex <- as.integer(kv[1]) + 1
fvalue <- as.numeric(kv[2])
mat[i,findex] <- fvalue
}
}
mat <- as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# test code here
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist <- list("eval"=dtest,"train"=dtrain)
# training xgboost model
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
# make prediction
preds <- xgb.predict(bst, dtest)
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(preds > 0.5) != labels)) / length(labels)
# print error rate
print(paste("error=",err))
# dump model
xgb.dump(bst, "dump.raw.txt")
# dump model with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")
# save dmatrix into binary buffer
succ <- xgb.save(dtest, "dtest.buffer")
# save model into file
succ <- xgb.save(bst, "xgb.model")
# load model and data in
bst2 <- xgb.Booster(modelfile="xgb.model")
dtest2 <- xgb.DMatrix("dtest.buffer")
preds2 <- xgb.predict(bst2, dtest2)
# assert they are the same
stopifnot(sum(abs(preds2-preds)) == 0)
###
# build dmatrix from sparseMatrix
###
print ('start running example of build DMatrix from R.sparseMatrix')
csc <- read.libsvm("agaricus.txt.train", 126)
label <- csc$label
data <- csc$data
dtrain <- xgb.DMatrix(data, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# build dmatrix from dense matrix
###
print ('start running example of build DMatrix from R.Matrix')
mat = as.matrix(data)
dtrain <- xgb.DMatrix(mat, info=list(label=label) )
watchlist <- list("eval"=dtest,"train"=dtrain)
bst <- xgb.train(param, dtrain, nround=2, watchlist=watchlist)
###
# advanced: cutomsized loss function
#
print("start running example to used cutomized objective function")
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list("bst:max_depth" = 2, "bst:eta" = 1, "silent" =1)
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- xgb.getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
# training with customized objective, we can also do step by step training
# simply look at xgboost.py"s implementation of train
bst <- xgb.train(param, dtrain, nround=2, watchlist, logregobj, evalerror)
###
# advanced: start from a initial base prediction
#
print ("start running example to start from a initial prediction")
# specify parameters via map, definition are same as c++ version
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
# train xgboost for 1 round
bst <- xgb.train( param, dtrain, 1, watchlist )
# Note: we need the margin value instead of transformed prediction in set_base_margin
# do predict with output_margin=True, will always give you margin values before logistic transformation
ptrain <- xgb.predict(bst, dtrain, outputmargin=TRUE)
ptest <- xgb.predict(bst, dtest, outputmargin=TRUE)
succ <- xgb.setinfo(dtrain, "base_margin", ptrain)
succ <- xgb.setinfo(dtest, "base_margin", ptest)
print ("this is result of running from initial prediction")
bst <- xgb.train( param, dtrain, 1, watchlist )

View File

@ -0,0 +1,102 @@
require(xgboost)
# helper function to read libsvm format
# this is very badly written, load in dense, and convert to sparse
# use this only for demo purpose
# adopted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
content = readLines(fname)
nline = length(content)
label = numeric(nline)
mat = matrix(0, nline, maxcol+1)
for (i in 1:nline) {
arr = as.vector(strsplit(content[i], " ")[[1]])
label[i] = as.numeric(arr[[1]])
for (j in 2:length(arr)) {
kv = strsplit(arr[j], ":")[[1]]
# to avoid 0 index
findex = as.integer(kv[1]) + 1
fvalue = as.numeric(kv[2])
mat[i,findex] = fvalue
}
}
mat = as(mat, "sparseMatrix")
return(list(label=label, data=mat))
}
# Parameter setting
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth"=2, "bst:eta"=1, "silent"=1, "objective"="binary:logistic")
watchlist = list("eval"=dtest,"train"=dtrain)
###########################
# Train from local file
###########################
# Training
bst = xgboost(file='agaricus.txt.train',params=param,watchlist=watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
###########################
# Train from R object
###########################
csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x as Sparse Matrix
class(x)
# Training
bst = xgboost(x,y,params=param,watchlist=watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=",err))
# Training with dense matrix
x = as.matrix(x)
bst = xgboost(x,y,params=param,watchlist=watchlist)
###########################
# Train with customization
###########################
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
preds = 1.0 / (1.0 + exp(-preds))
grad = preds - labels
hess = preds * (1.0-preds)
return(list(grad=grad, hess=hess))
}
# user defined evaluation function, return a list(metric="metric-name", value="metric-value")
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror = function(preds, dtrain) {
labels = xgb.getinfo(dtrain, "label")
err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(metric="error", value=err))
}
bst = xgboost(x,y,params=param,watchlist=watchlist,obj=logregobj, feval=evalerror)
############################
# Train with previous result
############################
bst = xgboost(x,y,params=param,watchlist=watchlist)
pred = predict(bst, 'agaricus.txt.train', outputmargin=TRUE)
bst2 = xgboost(x,y,params=param,watchlist=watchlist,margin=pred)

View File

@ -0,0 +1,126 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i

28
R-package/src/Makevars Normal file
View File

@ -0,0 +1,28 @@
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
ifeq ($(no_omp),1)
PKG_CPPFLAGS += -DDISABLE_OPENMP
endif
CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)
.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)
xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp
$(CXXOBJ) :
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:
rm -rf *.so *.o *~ *.dll

View File

@ -0,0 +1,32 @@
# _*_ mode: Makefile; _*_
export CC = gcc
export CXX = g++
# expose these flags to R CMD SHLIB
PKG_CPPFLAGS = -O3 -Wno-unknown-pragmas -DXGBOOST_CUSTOMIZE_ERROR_ -fopenmp -fPIC $(SHLIB_OPENMP_CFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CFLAGS)
# add flag to build native code even in cross compiler
ifeq "$(WIN)" "64"
PKG_CPPFLAGS += -m64
endif
ifeq ($(no_omp),1)
PKG_CPPFLAGS += -DDISABLE_OPENMP
endif
CXXOBJ= xgboost_wrapper.o xgboost_io.o
OBJECTS= xgboost_R.o $(CXXOBJ)
.PHONY: all clean
all: $(SHLIB)
$(SHLIB): $(OBJECTS)
xgboost_wrapper.o: ../../wrapper/xgboost_wrapper.cpp
xgboost_io.o: ../../src/io/io.cpp
$(CXXOBJ) :
$(CXX) -c $(PKG_CPPFLAGS) -o $@ $(firstword $(filter %.cpp %.c, $^) )
clean:
rm -rf *.so *.o *~ *.dll

221
R-package/src/xgboost_R.cpp Normal file
View File

@ -0,0 +1,221 @@
#include <vector>
#include <string>
#include <utility>
#include <cstring>
#include "xgboost_R.h"
#include "../../wrapper/xgboost_wrapper.h"
#include "../../src/utils/utils.h"
#include "../../src/utils/omp.h"
#include "../../src/utils/matrix_csr.h"
using namespace xgboost;
// implements error handling
namespace xgboost {
namespace utils {
void HandleAssertError(const char *msg) {
error("%s", msg);
}
void HandleCheckError(const char *msg) {
error("%s", msg);
}
} // namespace utils
} // namespace xgboost
extern "C" {
void _DMatrixFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGDMatrixFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing) {
SEXP dim = getAttrib(mat, R_DimSymbol);
int nrow = INTEGER(dim)[0];
int ncol = INTEGER(dim)[1];
double *din = REAL(mat);
std::vector<float> data(nrow * ncol);
#pragma omp parallel for schedule(static)
for (int i = 0; i < nrow; ++i) {
for (int j = 0; j < ncol; ++j) {
data[i * ncol +j] = din[i + nrow * j];
}
}
void *handle = XGDMatrixCreateFromMat(&data[0], nrow, ncol, asReal(missing));
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data) {
const int *col_ptr = INTEGER(indptr);
const int *row_index = INTEGER(indices);
const double *col_data = REAL(data);
int ncol = length(indptr) - 1;
int ndata = length(data);
// transform into CSR format
std::vector<size_t> row_ptr;
std::vector< std::pair<unsigned, float> > csr_data;
utils::SparseCSRMBuilder< std::pair<unsigned,float> > builder(row_ptr, csr_data);
builder.InitBudget();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.AddBudget(row_index[j]);
}
}
builder.InitStorage();
for (int i = 0; i < ncol; ++i) {
for (int j = col_ptr[i]; j < col_ptr[i+1]; ++j) {
builder.PushElem(row_index[j], std::make_pair(i, col_data[j]));
}
}
utils::Assert(csr_data.size() == static_cast<size_t>(ndata), "BUG CreateFromCSC");
std::vector<float> row_data(ndata);
std::vector<unsigned> col_index(ndata);
#pragma omp parallel for schedule(static)
for (int i = 0; i < ndata; ++i) {
col_index[i] = csr_data[i].first;
row_data[i] = csr_data[i].second;
}
void *handle = XGDMatrixCreateFromCSR(&row_ptr[0], &col_index[0], &row_data[0], row_ptr.size(), ndata );
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
CHAR(asChar(fname)), asInteger(silent));
}
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
int len = length(array);
const char *name = CHAR(asChar(field));
if (!strcmp("group", name)) {
std::vector<unsigned> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
}
XGDMatrixSetGroup(R_ExternalPtrAddr(handle), &vec[0], len);
return;
}
{
std::vector<float> vec(len);
#pragma omp parallel for schedule(static)
for (int i = 0; i < len; ++i) {
vec[i] = REAL(array)[i];
}
XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)),
&vec[0], len);
}
}
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
size_t olen;
const float *res = XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle),
CHAR(asChar(field)), &olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
// functions related to booster
void _BoosterFinalizer(SEXP ext) {
if (R_ExternalPtrAddr(ext) == NULL) return;
XGBoosterFree(R_ExternalPtrAddr(ext));
R_ClearExternalPtr(ext);
}
SEXP XGBoosterCreate_R(SEXP dmats) {
int len = length(dmats);
std::vector<void*> dvec;
for (int i = 0; i < len; ++i){
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
}
void *handle = XGBoosterCreate(&dvec[0], dvec.size());
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(1);
return ret;
}
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
XGBoosterSetParam(R_ExternalPtrAddr(handle),
CHAR(asChar(name)),
CHAR(asChar(val)));
}
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
R_ExternalPtrAddr(dtrain));
}
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
int len = length(grad);
std::vector<float> tgrad(len), thess(len);
#pragma omp parallel for schedule(static)
for (int j = 0; j < len; ++j) {
tgrad[j] = REAL(grad)[j];
thess[j] = REAL(hess)[j];
}
XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dtrain),
&tgrad[0], &thess[0], len);
}
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
int len = length(dmats);
std::vector<void*> vec_dmats;
std::vector<std::string> vec_names;
std::vector<const char*> vec_sptr;
for (int i = 0; i < len; ++i) {
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
}
for (int i = 0; i < len; ++i) {
vec_sptr.push_back(vec_names[i].c_str());
}
return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
asInteger(iter),
&vec_dmats[0], &vec_sptr[0], len));
}
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
size_t olen;
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
R_ExternalPtrAddr(dmat),
asInteger(output_margin),
&olen);
SEXP ret = PROTECT(allocVector(REALSXP, olen));
for (size_t i = 0; i < olen; ++i) {
REAL(ret)[i] = res[i];
}
UNPROTECT(1);
return ret;
}
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
}
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
size_t olen;
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
CHAR(asChar(fmap)),
&olen);
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
for (size_t i = 0; i < olen; ++i) {
fprintf(fo, "booster[%u]:\n", static_cast<unsigned>(i));
fprintf(fo, "%s", res[i]);
}
fclose(fo);
}
}

124
R-package/src/xgboost_R.h Normal file
View File

@ -0,0 +1,124 @@
#ifndef XGBOOST_WRAPPER_R_H_
#define XGBOOST_WRAPPER_R_H_
/*!
* \file xgboost_wrapper_R.h
* \author Tianqi Chen
* \brief R wrapper of xgboost
*/
extern "C" {
#include <Rinternals.h>
}
extern "C" {
/*!
* \brief load a data matrix
* \param fname name of the content
* \param silent whether print messages
* \return a loaded data matrix
*/
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
/*!
* \brief create matrix content from dense matrix
* This assumes the matrix is stored in column major format
* \param data R Matrix object
* \param missing which value to represent missing value
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing);
/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
* \param indices row indices
* \param data content of the data
* \return created dmatrix
*/
SEXP XGDMatrixCreateFromCSC_R(SEXP indptr,
SEXP indices,
SEXP data);
/*!
* \brief load a data matrix into binary file
* \param handle a instance of data matrix
* \param fname file name
* \param silent print statistics when saving
*/
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
/*!
* \brief set information to dmatrix
* \param handle a instance of data matrix
* \param field field name, can be label, weight
* \param array pointer to float vector
*/
void XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array);
/*!
* \brief get info vector from matrix
* \param handle a instance of data matrix
* \param field field name
* \return info vector
*/
SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field);
/*!
* \brief create xgboost learner
* \param dmats a list of dmatrix handles that will be cached
*/
SEXP XGBoosterCreate_R(SEXP dmats);
/*!
* \brief set parameters
* \param handle handle
* \param name parameter name
* \param val value of parameter
*/
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
/*!
* \brief update the model in one round using dtrain
* \param handle handle
* \param iter current iteration rounds
* \param dtrain training data
*/
void XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain);
/*!
* \brief update the model, by directly specify gradient and second order gradient,
* this can be used to replace UpdateOneIter, to support customized loss function
* \param handle handle
* \param dtrain training data
* \param grad gradient statistics
* \param hess second order gradient statistics
*/
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
/*!
* \brief get evaluation statistics for xgboost
* \param handle handle
* \param iter current iteration rounds
* \param dmats list of handles to dmatrices
* \param evname name of evaluation
* \return the string containing evaluation stati
*/
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
/*!
* \brief make prediction based on dmat
* \param handle handle
* \param dmat data matrix
* \param output_margin whether only output raw margin value
*/
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
/*!
* \brief load model from existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
/*!
* \brief save model into existing file
* \param handle handle
* \param fname file name
*/
void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
/*!
* \brief dump model into text file
* \param handle handle
* \param fname file name of model that can be dumped into
* \param fmap name to fmap can be empty string
*/
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
};
#endif // XGBOOST_WRAPPER_R_H_

View File

@ -2,6 +2,7 @@
#define _CRT_SECURE_NO_DEPRECATE #define _CRT_SECURE_NO_DEPRECATE
#include <string> #include <string>
#include "./io.h" #include "./io.h"
#include "../utils/io.h"
#include "../utils/utils.h" #include "../utils/utils.h"
#include "simple_dmatrix-inl.hpp" #include "simple_dmatrix-inl.hpp"
// implements data loads using dmatrix simple for now // implements data loads using dmatrix simple for now
@ -9,6 +10,19 @@
namespace xgboost { namespace xgboost {
namespace io { namespace io {
DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) { DataMatrix* LoadDataMatrix(const char *fname, bool silent, bool savebuffer) {
int magic;
utils::FileStream fs(utils::FopenCheck(fname, "rb"));
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
fs.Seek(0);
if (magic == DMatrixSimple::kMagic) {
DMatrixSimple *dmat = new DMatrixSimple();
dmat->LoadBinary(fs, silent, fname);
fs.Close();
return dmat;
}
fs.Close();
DMatrixSimple *dmat = new DMatrixSimple(); DMatrixSimple *dmat = new DMatrixSimple();
dmat->CacheLoad(fname, silent, savebuffer); dmat->CacheLoad(fname, silent, savebuffer);
return dmat; return dmat;

View File

@ -128,6 +128,17 @@ class DMatrixSimple : public DataMatrix {
FILE *fp = fopen64(fname, "rb"); FILE *fp = fopen64(fname, "rb");
if (fp == NULL) return false; if (fp == NULL) return false;
utils::FileStream fs(fp); utils::FileStream fs(fp);
this->LoadBinary(fs, silent, fname);
fs.Close();
return true;
}
/*!
* \brief load from binary stream
* \param fs input file stream
* \param silent whether print information during loading
* \param fname file name, used to print message
*/
inline void LoadBinary(utils::IStream &fs, bool silent = false, const char *fname = NULL) {
int magic; int magic;
utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format"); utils::Check(fs.Read(&magic, sizeof(magic)) != 0, "invalid input file format");
utils::Check(magic == kMagic, "invalid format,magic number mismatch"); utils::Check(magic == kMagic, "invalid format,magic number mismatch");
@ -135,16 +146,19 @@ class DMatrixSimple : public DataMatrix {
info.LoadBinary(fs); info.LoadBinary(fs);
FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_); FMatrixS::LoadBinary(fs, &row_ptr_, &row_data_);
fmat.LoadColAccess(fs); fmat.LoadColAccess(fs);
fs.Close();
if (!silent) { if (!silent) {
printf("%lux%lu matrix with %lu entries is loaded from %s\n", printf("%lux%lu matrix with %lu entries is loaded",
info.num_row(), info.num_col(), row_data_.size(), fname); info.num_row(), info.num_col(), row_data_.size());
if (fname != NULL) {
printf(" from %s\n", fname);
} else {
printf("\n");
}
if (info.group_ptr.size() != 0) { if (info.group_ptr.size() != 0) {
printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1); printf("data contains %u groups\n", (unsigned)info.group_ptr.size()-1);
} }
} }
return true;
} }
/*! /*!
* \brief save to binary file * \brief save to binary file

View File

@ -102,6 +102,9 @@ class FileStream : public IStream {
virtual void Write(const void *ptr, size_t size) { virtual void Write(const void *ptr, size_t size) {
fwrite(ptr, size, 1, fp); fwrite(ptr, size, 1, fp);
} }
inline void Seek(size_t pos) {
fseek(fp, 0, SEEK_SET);
}
inline void Close(void) { inline void Close(void) {
fclose(fp); fclose(fp);
} }

View File

@ -10,7 +10,9 @@
#else #else
#ifndef DISABLE_OPENMP #ifndef DISABLE_OPENMP
#ifndef _MSC_VER #ifndef _MSC_VER
#warning "OpenMP is not available, compile to single thread code" #warning "OpenMP is not available, compile to single thread code."\
"You may want to ungrade your compiler to enable OpenMP support,"\
"to get benefit of multi-threading."
#else #else
// TODO add warning for msvc // TODO add warning for msvc
#endif #endif

View File

@ -89,16 +89,19 @@ struct Random{
/*! \brief set random number seed */ /*! \brief set random number seed */
inline void Seed(unsigned sd) { inline void Seed(unsigned sd) {
this->rseed = sd; this->rseed = sd;
#ifdef _MSC_VER #if defined(_MSC_VER)||defined(_WIN32)
srand(rseed); srand(rseed);
#endif #endif
} }
/*! \brief return a real number uniform in [0,1) */ /*! \brief return a real number uniform in [0,1) */
inline double RandDouble(void) { inline double RandDouble(void) {
#ifndef _MSC_VER // use rand instead of rand_r in windows, for MSVC it is fine since rand is threadsafe
return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0); // For cygwin and mingw, this can slows down parallelism, but rand_r is only used in objective-inl.hpp, won't affect speed in general
// todo, replace with another PRNG
#if defined(_MSC_VER)||defined(_WIN32)
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0);
#else #else
return static_cast<double>(rand()) / (static_cast<double>(RAND_MAX) + 1.0); return static_cast<double>(rand_r(&rseed)) / (static_cast<double>(RAND_MAX) + 1.0);
#endif #endif
} }
// random number seed // random number seed

View File

@ -45,7 +45,7 @@ class DMatrix:
return return
if isinstance(data, str): if isinstance(data, str):
self.handle = ctypes.c_void_p( self.handle = ctypes.c_void_p(
xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 1)) xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 0))
elif isinstance(data, scp.csr_matrix): elif isinstance(data, scp.csr_matrix):
self.__init_from_csr(data) self.__init_from_csr(data)
elif isinstance(data, numpy.ndarray) and len(data.shape) == 2: elif isinstance(data, numpy.ndarray) and len(data.shape) == 2: