chg
This commit is contained in:
parent
3ba7995754
commit
08a6b92216
1611
wrapper/R-example/agaricus.txt.test
Normal file
1611
wrapper/R-example/agaricus.txt.test
Normal file
File diff suppressed because it is too large
Load Diff
6513
wrapper/R-example/agaricus.txt.train
Normal file
6513
wrapper/R-example/agaricus.txt.train
Normal file
File diff suppressed because it is too large
Load Diff
14
wrapper/R-example/demo.R
Normal file
14
wrapper/R-example/demo.R
Normal file
@ -0,0 +1,14 @@
|
||||
# include xgboost library, must set chdir=TRURE
|
||||
source('../xgboost.R', chdir=TRUE)
|
||||
|
||||
# test code here
|
||||
dtrain <- xgb.DMatrix("agaricus.txt.train")
|
||||
dtest <- xgb.DMatrix("agaricus.txt.test")
|
||||
param = list('bst:max_depth'=2, 'bst:eta'=1, 'silent'=1, 'objective'='binary:logistic')
|
||||
watchlist <- list('train'=dtrain,'test'=dtest)
|
||||
bst <- xgb.train(param, dtrain, watchlist=watchlist, nround=3)
|
||||
|
||||
succ <- xgb.save(bst, "iter.model")
|
||||
print('finsih save model')
|
||||
bst2 <- xgb.Booster(modelfile="iter.model")
|
||||
pred = xgb.predict(bst2, dtest)
|
||||
126
wrapper/R-example/featmap.txt
Normal file
126
wrapper/R-example/featmap.txt
Normal file
@ -0,0 +1,126 @@
|
||||
0 cap-shape=bell i
|
||||
1 cap-shape=conical i
|
||||
2 cap-shape=convex i
|
||||
3 cap-shape=flat i
|
||||
4 cap-shape=knobbed i
|
||||
5 cap-shape=sunken i
|
||||
6 cap-surface=fibrous i
|
||||
7 cap-surface=grooves i
|
||||
8 cap-surface=scaly i
|
||||
9 cap-surface=smooth i
|
||||
10 cap-color=brown i
|
||||
11 cap-color=buff i
|
||||
12 cap-color=cinnamon i
|
||||
13 cap-color=gray i
|
||||
14 cap-color=green i
|
||||
15 cap-color=pink i
|
||||
16 cap-color=purple i
|
||||
17 cap-color=red i
|
||||
18 cap-color=white i
|
||||
19 cap-color=yellow i
|
||||
20 bruises?=bruises i
|
||||
21 bruises?=no i
|
||||
22 odor=almond i
|
||||
23 odor=anise i
|
||||
24 odor=creosote i
|
||||
25 odor=fishy i
|
||||
26 odor=foul i
|
||||
27 odor=musty i
|
||||
28 odor=none i
|
||||
29 odor=pungent i
|
||||
30 odor=spicy i
|
||||
31 gill-attachment=attached i
|
||||
32 gill-attachment=descending i
|
||||
33 gill-attachment=free i
|
||||
34 gill-attachment=notched i
|
||||
35 gill-spacing=close i
|
||||
36 gill-spacing=crowded i
|
||||
37 gill-spacing=distant i
|
||||
38 gill-size=broad i
|
||||
39 gill-size=narrow i
|
||||
40 gill-color=black i
|
||||
41 gill-color=brown i
|
||||
42 gill-color=buff i
|
||||
43 gill-color=chocolate i
|
||||
44 gill-color=gray i
|
||||
45 gill-color=green i
|
||||
46 gill-color=orange i
|
||||
47 gill-color=pink i
|
||||
48 gill-color=purple i
|
||||
49 gill-color=red i
|
||||
50 gill-color=white i
|
||||
51 gill-color=yellow i
|
||||
52 stalk-shape=enlarging i
|
||||
53 stalk-shape=tapering i
|
||||
54 stalk-root=bulbous i
|
||||
55 stalk-root=club i
|
||||
56 stalk-root=cup i
|
||||
57 stalk-root=equal i
|
||||
58 stalk-root=rhizomorphs i
|
||||
59 stalk-root=rooted i
|
||||
60 stalk-root=missing i
|
||||
61 stalk-surface-above-ring=fibrous i
|
||||
62 stalk-surface-above-ring=scaly i
|
||||
63 stalk-surface-above-ring=silky i
|
||||
64 stalk-surface-above-ring=smooth i
|
||||
65 stalk-surface-below-ring=fibrous i
|
||||
66 stalk-surface-below-ring=scaly i
|
||||
67 stalk-surface-below-ring=silky i
|
||||
68 stalk-surface-below-ring=smooth i
|
||||
69 stalk-color-above-ring=brown i
|
||||
70 stalk-color-above-ring=buff i
|
||||
71 stalk-color-above-ring=cinnamon i
|
||||
72 stalk-color-above-ring=gray i
|
||||
73 stalk-color-above-ring=orange i
|
||||
74 stalk-color-above-ring=pink i
|
||||
75 stalk-color-above-ring=red i
|
||||
76 stalk-color-above-ring=white i
|
||||
77 stalk-color-above-ring=yellow i
|
||||
78 stalk-color-below-ring=brown i
|
||||
79 stalk-color-below-ring=buff i
|
||||
80 stalk-color-below-ring=cinnamon i
|
||||
81 stalk-color-below-ring=gray i
|
||||
82 stalk-color-below-ring=orange i
|
||||
83 stalk-color-below-ring=pink i
|
||||
84 stalk-color-below-ring=red i
|
||||
85 stalk-color-below-ring=white i
|
||||
86 stalk-color-below-ring=yellow i
|
||||
87 veil-type=partial i
|
||||
88 veil-type=universal i
|
||||
89 veil-color=brown i
|
||||
90 veil-color=orange i
|
||||
91 veil-color=white i
|
||||
92 veil-color=yellow i
|
||||
93 ring-number=none i
|
||||
94 ring-number=one i
|
||||
95 ring-number=two i
|
||||
96 ring-type=cobwebby i
|
||||
97 ring-type=evanescent i
|
||||
98 ring-type=flaring i
|
||||
99 ring-type=large i
|
||||
100 ring-type=none i
|
||||
101 ring-type=pendant i
|
||||
102 ring-type=sheathing i
|
||||
103 ring-type=zone i
|
||||
104 spore-print-color=black i
|
||||
105 spore-print-color=brown i
|
||||
106 spore-print-color=buff i
|
||||
107 spore-print-color=chocolate i
|
||||
108 spore-print-color=green i
|
||||
109 spore-print-color=orange i
|
||||
110 spore-print-color=purple i
|
||||
111 spore-print-color=white i
|
||||
112 spore-print-color=yellow i
|
||||
113 population=abundant i
|
||||
114 population=clustered i
|
||||
115 population=numerous i
|
||||
116 population=scattered i
|
||||
117 population=several i
|
||||
118 population=solitary i
|
||||
119 habitat=grasses i
|
||||
120 habitat=leaves i
|
||||
121 habitat=meadows i
|
||||
122 habitat=paths i
|
||||
123 habitat=urban i
|
||||
124 habitat=waste i
|
||||
125 habitat=woods i
|
||||
12
wrapper/README.md
Normal file
12
wrapper/README.md
Normal file
@ -0,0 +1,12 @@
|
||||
Wrapper of XGBoost
|
||||
=====
|
||||
This folder provides wrapper of xgboost to other languages
|
||||
|
||||
|
||||
Python
|
||||
=====
|
||||
To make the python module, type ```make``` in the root directory of project
|
||||
|
||||
R
|
||||
=====
|
||||
To make the R wrapper, type ```make R``` in the root directory of project
|
||||
3
wrapper/python-example/README.md
Normal file
3
wrapper/python-example/README.md
Normal file
@ -0,0 +1,3 @@
|
||||
example to use python xgboost, the data is generated from demo/binary_classification, in libsvm format
|
||||
|
||||
for usage: see demo.py and comments in demo.py
|
||||
1611
wrapper/python-example/agaricus.txt.test
Normal file
1611
wrapper/python-example/agaricus.txt.test
Normal file
File diff suppressed because it is too large
Load Diff
6513
wrapper/python-example/agaricus.txt.train
Normal file
6513
wrapper/python-example/agaricus.txt.train
Normal file
File diff suppressed because it is too large
Load Diff
112
wrapper/python-example/demo.py
Executable file
112
wrapper/python-example/demo.py
Executable file
@ -0,0 +1,112 @@
|
||||
#!/usr/bin/python
|
||||
import sys
|
||||
import numpy as np
|
||||
import scipy.sparse
|
||||
# append the path to xgboost, you may need to change the following line
|
||||
# alternatively, you can add the path to PYTHONPATH environment variable
|
||||
sys.path.append('../')
|
||||
import xgboost as xgb
|
||||
|
||||
### simple example
|
||||
# load file from text file, also binary buffer generated by xgboost
|
||||
dtrain = xgb.DMatrix('agaricus.txt.train')
|
||||
dtest = xgb.DMatrix('agaricus.txt.test')
|
||||
|
||||
# specify parameters via map, definition are same as c++ version
|
||||
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
|
||||
|
||||
# specify validations set to watch performance
|
||||
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||
num_round = 2
|
||||
bst = xgb.train(param, dtrain, num_round, evallist)
|
||||
|
||||
# this is prediction
|
||||
preds = bst.predict(dtest)
|
||||
labels = dtest.get_label()
|
||||
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
|
||||
bst.save_model('0001.model')
|
||||
# dump model
|
||||
bst.dump_model('dump.raw.txt')
|
||||
# dump model with feature map
|
||||
bst.dump_model('dump.nice.txt','featmap.txt')
|
||||
|
||||
###
|
||||
# build dmatrix from scipy.sparse
|
||||
print ('start running example of build DMatrix from scipy.sparse')
|
||||
labels = []
|
||||
row = []; col = []; dat = []
|
||||
i = 0
|
||||
for l in open('agaricus.txt.train'):
|
||||
arr = l.split()
|
||||
labels.append( int(arr[0]))
|
||||
for it in arr[1:]:
|
||||
k,v = it.split(':')
|
||||
row.append(i); col.append(int(k)); dat.append(float(v))
|
||||
i += 1
|
||||
csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
|
||||
dtrain = xgb.DMatrix( csr )
|
||||
dtrain.set_label(labels)
|
||||
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||
|
||||
print ('start running example of build DMatrix from numpy array')
|
||||
# NOTE: npymat is numpy array, we will convert it into scipy.sparse.csr_matrix in internal implementation,then convert to DMatrix
|
||||
npymat = csr.todense()
|
||||
dtrain = xgb.DMatrix( npymat)
|
||||
dtrain.set_label(labels)
|
||||
evallist = [(dtest,'eval'), (dtrain,'train')]
|
||||
bst = xgb.train( param, dtrain, num_round, evallist )
|
||||
|
||||
###
|
||||
# advanced: cutomsized loss function, set loss_type to 0, so that predict get untransformed score
|
||||
#
|
||||
print ('start running example to used cutomized objective function')
|
||||
|
||||
# note: for customized objective function, we leave objective as default
|
||||
# note: what we are getting is margin value in prediction
|
||||
# you must know what you are doing
|
||||
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1 }
|
||||
|
||||
# user define objective function, given prediction, return gradient and second order gradient
|
||||
# this is loglikelihood loss
|
||||
def logregobj(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
preds = 1.0 / (1.0 + np.exp(-preds))
|
||||
grad = preds - labels
|
||||
hess = preds * (1.0-preds)
|
||||
return grad, hess
|
||||
|
||||
# user defined evaluation function, return a pair metric_name, result
|
||||
# NOTE: when you do customized loss function, the default prediction value is margin
|
||||
# this may make buildin evalution metric not function properly
|
||||
# for example, we are doing logistic loss, the prediction is score before logistic transformation
|
||||
# the buildin evaluation error assumes input is after logistic transformation
|
||||
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
|
||||
def evalerror(preds, dtrain):
|
||||
labels = dtrain.get_label()
|
||||
# return a pair metric_name, result
|
||||
# since preds are margin(before logistic transformation, cutoff at 0)
|
||||
return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
|
||||
|
||||
# training with customized objective, we can also do step by step training
|
||||
# simply look at xgboost.py's implementation of train
|
||||
bst = xgb.train(param, dtrain, num_round, evallist, logregobj, evalerror)
|
||||
|
||||
|
||||
###
|
||||
# advanced: start from a initial base prediction
|
||||
#
|
||||
print ('start running example to start from a initial prediction')
|
||||
# specify parameters via map, definition are same as c++ version
|
||||
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
|
||||
# train xgboost for 1 round
|
||||
bst = xgb.train( param, dtrain, 1, evallist )
|
||||
# Note: we need the margin value instead of transformed prediction in set_base_margin
|
||||
# do predict with output_margin=True, will always give you margin values before logistic transformation
|
||||
ptrain = bst.predict(dtrain, output_margin=True)
|
||||
ptest = bst.predict(dtest, output_margin=True)
|
||||
dtrain.set_base_margin(ptrain)
|
||||
dtest.set_base_margin(ptest)
|
||||
|
||||
print ('this is result of running from initial prediction')
|
||||
bst = xgb.train( param, dtrain, 1, evallist )
|
||||
126
wrapper/python-example/featmap.txt
Normal file
126
wrapper/python-example/featmap.txt
Normal file
@ -0,0 +1,126 @@
|
||||
0 cap-shape=bell i
|
||||
1 cap-shape=conical i
|
||||
2 cap-shape=convex i
|
||||
3 cap-shape=flat i
|
||||
4 cap-shape=knobbed i
|
||||
5 cap-shape=sunken i
|
||||
6 cap-surface=fibrous i
|
||||
7 cap-surface=grooves i
|
||||
8 cap-surface=scaly i
|
||||
9 cap-surface=smooth i
|
||||
10 cap-color=brown i
|
||||
11 cap-color=buff i
|
||||
12 cap-color=cinnamon i
|
||||
13 cap-color=gray i
|
||||
14 cap-color=green i
|
||||
15 cap-color=pink i
|
||||
16 cap-color=purple i
|
||||
17 cap-color=red i
|
||||
18 cap-color=white i
|
||||
19 cap-color=yellow i
|
||||
20 bruises?=bruises i
|
||||
21 bruises?=no i
|
||||
22 odor=almond i
|
||||
23 odor=anise i
|
||||
24 odor=creosote i
|
||||
25 odor=fishy i
|
||||
26 odor=foul i
|
||||
27 odor=musty i
|
||||
28 odor=none i
|
||||
29 odor=pungent i
|
||||
30 odor=spicy i
|
||||
31 gill-attachment=attached i
|
||||
32 gill-attachment=descending i
|
||||
33 gill-attachment=free i
|
||||
34 gill-attachment=notched i
|
||||
35 gill-spacing=close i
|
||||
36 gill-spacing=crowded i
|
||||
37 gill-spacing=distant i
|
||||
38 gill-size=broad i
|
||||
39 gill-size=narrow i
|
||||
40 gill-color=black i
|
||||
41 gill-color=brown i
|
||||
42 gill-color=buff i
|
||||
43 gill-color=chocolate i
|
||||
44 gill-color=gray i
|
||||
45 gill-color=green i
|
||||
46 gill-color=orange i
|
||||
47 gill-color=pink i
|
||||
48 gill-color=purple i
|
||||
49 gill-color=red i
|
||||
50 gill-color=white i
|
||||
51 gill-color=yellow i
|
||||
52 stalk-shape=enlarging i
|
||||
53 stalk-shape=tapering i
|
||||
54 stalk-root=bulbous i
|
||||
55 stalk-root=club i
|
||||
56 stalk-root=cup i
|
||||
57 stalk-root=equal i
|
||||
58 stalk-root=rhizomorphs i
|
||||
59 stalk-root=rooted i
|
||||
60 stalk-root=missing i
|
||||
61 stalk-surface-above-ring=fibrous i
|
||||
62 stalk-surface-above-ring=scaly i
|
||||
63 stalk-surface-above-ring=silky i
|
||||
64 stalk-surface-above-ring=smooth i
|
||||
65 stalk-surface-below-ring=fibrous i
|
||||
66 stalk-surface-below-ring=scaly i
|
||||
67 stalk-surface-below-ring=silky i
|
||||
68 stalk-surface-below-ring=smooth i
|
||||
69 stalk-color-above-ring=brown i
|
||||
70 stalk-color-above-ring=buff i
|
||||
71 stalk-color-above-ring=cinnamon i
|
||||
72 stalk-color-above-ring=gray i
|
||||
73 stalk-color-above-ring=orange i
|
||||
74 stalk-color-above-ring=pink i
|
||||
75 stalk-color-above-ring=red i
|
||||
76 stalk-color-above-ring=white i
|
||||
77 stalk-color-above-ring=yellow i
|
||||
78 stalk-color-below-ring=brown i
|
||||
79 stalk-color-below-ring=buff i
|
||||
80 stalk-color-below-ring=cinnamon i
|
||||
81 stalk-color-below-ring=gray i
|
||||
82 stalk-color-below-ring=orange i
|
||||
83 stalk-color-below-ring=pink i
|
||||
84 stalk-color-below-ring=red i
|
||||
85 stalk-color-below-ring=white i
|
||||
86 stalk-color-below-ring=yellow i
|
||||
87 veil-type=partial i
|
||||
88 veil-type=universal i
|
||||
89 veil-color=brown i
|
||||
90 veil-color=orange i
|
||||
91 veil-color=white i
|
||||
92 veil-color=yellow i
|
||||
93 ring-number=none i
|
||||
94 ring-number=one i
|
||||
95 ring-number=two i
|
||||
96 ring-type=cobwebby i
|
||||
97 ring-type=evanescent i
|
||||
98 ring-type=flaring i
|
||||
99 ring-type=large i
|
||||
100 ring-type=none i
|
||||
101 ring-type=pendant i
|
||||
102 ring-type=sheathing i
|
||||
103 ring-type=zone i
|
||||
104 spore-print-color=black i
|
||||
105 spore-print-color=brown i
|
||||
106 spore-print-color=buff i
|
||||
107 spore-print-color=chocolate i
|
||||
108 spore-print-color=green i
|
||||
109 spore-print-color=orange i
|
||||
110 spore-print-color=purple i
|
||||
111 spore-print-color=white i
|
||||
112 spore-print-color=yellow i
|
||||
113 population=abundant i
|
||||
114 population=clustered i
|
||||
115 population=numerous i
|
||||
116 population=scattered i
|
||||
117 population=several i
|
||||
118 population=solitary i
|
||||
119 habitat=grasses i
|
||||
120 habitat=leaves i
|
||||
121 habitat=meadows i
|
||||
122 habitat=paths i
|
||||
123 habitat=urban i
|
||||
124 habitat=waste i
|
||||
125 habitat=woods i
|
||||
136
wrapper/xgboost.R
Normal file
136
wrapper/xgboost.R
Normal file
@ -0,0 +1,136 @@
|
||||
# load in library
|
||||
dyn.load("./libxgboostR.so")
|
||||
|
||||
# constructing DMatrix
|
||||
xgb.DMatrix <- function(data) {
|
||||
if (typeof(data) == "character") {
|
||||
handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE))
|
||||
}else {
|
||||
stop("xgb.DMatrix cannot recognize data type")
|
||||
}
|
||||
return(structure(handle, class="xgb.DMatrix"))
|
||||
}
|
||||
# construct a Booster from cachelist
|
||||
xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
|
||||
if (typeof(cachelist) != "list") {
|
||||
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
|
||||
}
|
||||
for (dm in cachelist) {
|
||||
if (class(dm) != "xgb.DMatrix") {
|
||||
stop("xgb.Booster: only accepts list of DMatrix as cachelist")
|
||||
}
|
||||
}
|
||||
handle <- .Call("XGBoosterCreate_R", cachelist)
|
||||
for (i in 1:length(params)) {
|
||||
p = params[i]
|
||||
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p))
|
||||
}
|
||||
if (!is.null(modelfile)) {
|
||||
if (typeof(modelfile) != "character"){
|
||||
stop("xgb.Booster: modelfile must be character");
|
||||
}
|
||||
.Call("XGBoosterLoadModel_R", handle, modelfile)
|
||||
}
|
||||
return(structure(handle, class="xgb.Booster"))
|
||||
}
|
||||
# train a model using given parameters
|
||||
xgb.train <- function(params, dtrain, nrounds=10, watchlist=list(), obj=NULL) {
|
||||
if (typeof(params) != "list") {
|
||||
stop("xgb.train: first argument params must be list");
|
||||
}
|
||||
if (class(dtrain) != "xgb.DMatrix") {
|
||||
stop("xgb.train: second argument dtrain must be xgb.DMatrix");
|
||||
}
|
||||
bst <- xgb.Booster(params, append(watchlist,dtrain))
|
||||
for (i in 1:nrounds) {
|
||||
if (is.null(obj)) {
|
||||
succ <- xgb.iter.update(bst, dtrain, i-1)
|
||||
} else {
|
||||
pred = xgb.predict(bst, dtrain)
|
||||
gpair = obj(pred, dtrain)
|
||||
succ <- xgb.iter.boost(bst, dtrain, gpair)
|
||||
}
|
||||
if (length(watchlist) != 0) {
|
||||
msg <- xgb.iter.eval(bst, watchlist, i-1)
|
||||
cat(msg); cat("\n")
|
||||
}
|
||||
}
|
||||
return(bst)
|
||||
}
|
||||
# save model or DMatrix to file
|
||||
xgb.save <- function(handle, fname) {
|
||||
if (typeof(fname) != "character") {
|
||||
stop("xgb.save: fname must be character");
|
||||
}
|
||||
if (class(handle) == "xgb.Booster") {
|
||||
.Call("XGBoosterSaveModel_R", handle, fname);
|
||||
return(TRUE)
|
||||
}
|
||||
if (class(handle) == "xgb.DMatrix") {
|
||||
.Call("XGDMatrixSaveBinary_R", handle, fname, as.integer(FALSE))
|
||||
return(TRUE)
|
||||
}
|
||||
stop("xgb.save: the input must be either xgb.DMatrix or xgb.Booster")
|
||||
return(FALSE)
|
||||
}
|
||||
# predict
|
||||
xgb.predict <- function(booster, dmat, outputmargin = FALSE) {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.iter.update: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (class(dmat) != "xgb.DMatrix") {
|
||||
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
|
||||
}
|
||||
ret = .Call("XGBoosterPredict_R", booster, dmat, as.integer(outputmargin))
|
||||
return(ret)
|
||||
}
|
||||
##--------------------------------------
|
||||
# the following are low level iteratively function, not needed
|
||||
# if you do not want to use them
|
||||
#---------------------------------------
|
||||
# iteratively update booster with dtrain
|
||||
xgb.iter.update <- function(booster, dtrain, iter) {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.iter.update: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (class(dtrain) != "xgb.DMatrix") {
|
||||
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
|
||||
}
|
||||
.Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain)
|
||||
return(TRUE)
|
||||
}
|
||||
# iteratively update booster with customized statistics
|
||||
xgb.iter.boost <- function(booster, dtrain, gpair) {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.iter.update: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (class(dtrain) != "xgb.DMatrix") {
|
||||
stop("xgb.iter.update: second argument must be type xgb.DMatrix")
|
||||
}
|
||||
.Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess)
|
||||
return(TRUE)
|
||||
}
|
||||
# iteratively evaluate one iteration
|
||||
xgb.iter.eval <- function(booster, watchlist, iter) {
|
||||
if (class(booster) != "xgb.Booster") {
|
||||
stop("xgb.eval: first argument must be type xgb.Booster")
|
||||
}
|
||||
if (typeof(watchlist) != "list") {
|
||||
stop("xgb.eval: only accepts list of DMatrix as watchlist")
|
||||
}
|
||||
for (w in watchlist) {
|
||||
if (class(w) != "xgb.DMatrix") {
|
||||
stop("xgb.eval: watch list can only contain xgb.DMatrix")
|
||||
}
|
||||
}
|
||||
evnames <- list()
|
||||
for (i in 1:length(watchlist)) {
|
||||
w <- watchlist[i]
|
||||
if (length(names(w)) == 0) {
|
||||
stop("xgb.eval: name tag must be presented for every elements in watchlist")
|
||||
}
|
||||
evnames <- append(evnames, names(w))
|
||||
}
|
||||
msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames)
|
||||
return(msg)
|
||||
}
|
||||
266
wrapper/xgboost.py
Normal file
266
wrapper/xgboost.py
Normal file
@ -0,0 +1,266 @@
|
||||
# Author: Tianqi Chen, Bing Xu
|
||||
# module for xgboost
|
||||
import ctypes
|
||||
import os
|
||||
# optinally have scipy sparse, though not necessary
|
||||
import numpy
|
||||
import sys
|
||||
import numpy.ctypeslib
|
||||
import scipy.sparse as scp
|
||||
|
||||
# set this line correctly
|
||||
XGBOOST_PATH = os.path.dirname(__file__)+'/libxgboostwrapper.so'
|
||||
|
||||
# load in xgboost library
|
||||
xglib = ctypes.cdll.LoadLibrary(XGBOOST_PATH)
|
||||
|
||||
xglib.XGDMatrixCreateFromFile.restype = ctypes.c_void_p
|
||||
xglib.XGDMatrixCreateFromCSR.restype = ctypes.c_void_p
|
||||
xglib.XGDMatrixCreateFromMat.restype = ctypes.c_void_p
|
||||
xglib.XGDMatrixSliceDMatrix.restype = ctypes.c_void_p
|
||||
xglib.XGDMatrixGetFloatInfo.restype = ctypes.POINTER(ctypes.c_float)
|
||||
xglib.XGDMatrixNumRow.restype = ctypes.c_ulong
|
||||
|
||||
xglib.XGBoosterCreate.restype = ctypes.c_void_p
|
||||
xglib.XGBoosterPredict.restype = ctypes.POINTER(ctypes.c_float)
|
||||
xglib.XGBoosterEvalOneIter.restype = ctypes.c_char_p
|
||||
xglib.XGBoosterDumpModel.restype = ctypes.POINTER(ctypes.c_char_p)
|
||||
|
||||
|
||||
def ctypes2numpy(cptr, length):
|
||||
# convert a ctypes pointer array to numpy
|
||||
assert isinstance(cptr, ctypes.POINTER(ctypes.c_float))
|
||||
res = numpy.zeros(length, dtype='float32')
|
||||
assert ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0])
|
||||
return res
|
||||
|
||||
# data matrix used in xgboost
|
||||
class DMatrix:
|
||||
# constructor
|
||||
def __init__(self, data, label=None, missing=0.0, weight = None):
|
||||
# force into void_p, mac need to pass things in as void_p
|
||||
if data == None:
|
||||
self.handle = None
|
||||
return
|
||||
if isinstance(data, str):
|
||||
self.handle = ctypes.c_void_p(
|
||||
xglib.XGDMatrixCreateFromFile(ctypes.c_char_p(data.encode('utf-8')), 1))
|
||||
elif isinstance(data, scp.csr_matrix):
|
||||
self.__init_from_csr(data)
|
||||
elif isinstance(data, numpy.ndarray) and len(data.shape) == 2:
|
||||
self.__init_from_npy2d(data, missing)
|
||||
else:
|
||||
try:
|
||||
csr = scp.csr_matrix(data)
|
||||
self.__init_from_csr(csr)
|
||||
except:
|
||||
raise Exception("can not intialize DMatrix from"+str(type(data)))
|
||||
if label != None:
|
||||
self.set_label(label)
|
||||
if weight !=None:
|
||||
self.set_weight(weight)
|
||||
# convert data from csr matrix
|
||||
def __init_from_csr(self, csr):
|
||||
assert len(csr.indices) == len(csr.data)
|
||||
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromCSR(
|
||||
(ctypes.c_ulong * len(csr.indptr))(*csr.indptr),
|
||||
(ctypes.c_uint * len(csr.indices))(*csr.indices),
|
||||
(ctypes.c_float * len(csr.data))(*csr.data),
|
||||
len(csr.indptr), len(csr.data)))
|
||||
# convert data from numpy matrix
|
||||
def __init_from_npy2d(self,mat,missing):
|
||||
data = numpy.array(mat.reshape(mat.size), dtype='float32')
|
||||
self.handle = ctypes.c_void_p(xglib.XGDMatrixCreateFromMat(
|
||||
data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
||||
mat.shape[0], mat.shape[1], ctypes.c_float(missing)))
|
||||
# destructor
|
||||
def __del__(self):
|
||||
xglib.XGDMatrixFree(self.handle)
|
||||
def __get_float_info(self, field):
|
||||
length = ctypes.c_ulong()
|
||||
ret = xglib.XGDMatrixGetFloatInfo(self.handle, ctypes.c_char_p(field.encode('utf-8')),
|
||||
ctypes.byref(length))
|
||||
return ctypes2numpy(ret, length.value)
|
||||
def __set_float_info(self, field, data):
|
||||
xglib.XGDMatrixSetFloatInfo(self.handle,ctypes.c_char_p(field.encode('utf-8')),
|
||||
(ctypes.c_float*len(data))(*data), len(data))
|
||||
# load data from file
|
||||
def save_binary(self, fname, silent=True):
|
||||
xglib.XGDMatrixSaveBinary(self.handle, ctypes.c_char_p(fname.encode('utf-8')), int(silent))
|
||||
# set label of dmatrix
|
||||
def set_label(self, label):
|
||||
self.__set_float_info('label', label)
|
||||
# set weight of each instances
|
||||
def set_weight(self, weight):
|
||||
self.__set_float_info('weight', weight)
|
||||
# set initialized margin prediction
|
||||
def set_base_margin(self, margin):
|
||||
"""
|
||||
set base margin of booster to start from
|
||||
this can be used to specify a prediction value of
|
||||
existing model to be base_margin
|
||||
However, remember margin is needed, instead of transformed prediction
|
||||
e.g. for logistic regression: need to put in value before logistic transformation
|
||||
see also example/demo.py
|
||||
"""
|
||||
self.__set_float_info('base_margin', margin)
|
||||
# set group size of dmatrix, used for rank
|
||||
def set_group(self, group):
|
||||
xglib.XGDMatrixSetGroup(self.handle, (ctypes.c_uint*len(group))(*group), len(group))
|
||||
# get label from dmatrix
|
||||
def get_label(self):
|
||||
return self.__get_float_info('label')
|
||||
# get weight from dmatrix
|
||||
def get_weight(self):
|
||||
return self.__get_float_info('weight')
|
||||
# get base_margin from dmatrix
|
||||
def get_base_margin(self):
|
||||
return self.__get_float_info('base_margin')
|
||||
def num_row(self):
|
||||
return xglib.XGDMatrixNumRow(self.handle)
|
||||
# slice the DMatrix to return a new DMatrix that only contains rindex
|
||||
def slice(self, rindex):
|
||||
res = DMatrix(None)
|
||||
res.handle = ctypes.c_void_p(xglib.XGDMatrixSliceDMatrix(
|
||||
self.handle, (ctypes.c_int*len(rindex))(*rindex), len(rindex)))
|
||||
return res
|
||||
|
||||
class Booster:
|
||||
"""learner class """
|
||||
def __init__(self, params={}, cache=[], model_name = None):
|
||||
""" constructor, param: """
|
||||
for d in cache:
|
||||
assert isinstance(d, DMatrix)
|
||||
dmats = (ctypes.c_void_p * len(cache))(*[ d.handle for d in cache])
|
||||
self.handle = ctypes.c_void_p(xglib.XGBoosterCreate(dmats, len(cache)))
|
||||
self.set_param({'seed':0})
|
||||
self.set_param(params)
|
||||
if model_name != None:
|
||||
self.load_model(model_name)
|
||||
def __del__(self):
|
||||
xglib.XGBoosterFree(self.handle)
|
||||
def set_param(self, params, pv=None):
|
||||
if isinstance(params, dict):
|
||||
for k, v in params.items():
|
||||
xglib.XGBoosterSetParam(
|
||||
self.handle, ctypes.c_char_p(k.encode('utf-8')),
|
||||
ctypes.c_char_p(str(v).encode('utf-8')))
|
||||
elif isinstance(params,str) and pv != None:
|
||||
xglib.XGBoosterSetParam(
|
||||
self.handle, ctypes.c_char_p(params.encode('utf-8')),
|
||||
ctypes.c_char_p(str(pv).encode('utf-8')))
|
||||
else:
|
||||
for k, v in params:
|
||||
xglib.XGBoosterSetParam(
|
||||
self.handle, ctypes.c_char_p(k.encode('utf-8')),
|
||||
ctypes.c_char_p(str(v).encode('utf-8')))
|
||||
def update(self, dtrain, it):
|
||||
"""
|
||||
update
|
||||
dtrain: the training DMatrix
|
||||
it: current iteration number
|
||||
"""
|
||||
assert isinstance(dtrain, DMatrix)
|
||||
xglib.XGBoosterUpdateOneIter(self.handle, it, dtrain.handle)
|
||||
def boost(self, dtrain, grad, hess):
|
||||
""" update """
|
||||
assert len(grad) == len(hess)
|
||||
assert isinstance(dtrain, DMatrix)
|
||||
xglib.XGBoosterBoostOneIter(self.handle, dtrain.handle,
|
||||
(ctypes.c_float*len(grad))(*grad),
|
||||
(ctypes.c_float*len(hess))(*hess),
|
||||
len(grad))
|
||||
def eval_set(self, evals, it = 0):
|
||||
for d in evals:
|
||||
assert isinstance(d[0], DMatrix)
|
||||
assert isinstance(d[1], str)
|
||||
dmats = (ctypes.c_void_p * len(evals) )(*[ d[0].handle for d in evals])
|
||||
evnames = (ctypes.c_char_p * len(evals))(
|
||||
* [ctypes.c_char_p(d[1].encode('utf-8')) for d in evals])
|
||||
return xglib.XGBoosterEvalOneIter(self.handle, it, dmats, evnames, len(evals))
|
||||
def eval(self, mat, name = 'eval', it = 0):
|
||||
return self.eval_set( [(mat,name)], it)
|
||||
def predict(self, data, output_margin=False):
|
||||
"""
|
||||
predict with data
|
||||
data: the dmatrix storing the input
|
||||
output_margin: whether output raw margin value that is untransformed
|
||||
"""
|
||||
length = ctypes.c_ulong()
|
||||
preds = xglib.XGBoosterPredict(self.handle, data.handle,
|
||||
int(output_margin), ctypes.byref(length))
|
||||
return ctypes2numpy(preds, length.value)
|
||||
def save_model(self, fname):
|
||||
""" save model to file """
|
||||
xglib.XGBoosterSaveModel(self.handle, ctypes.c_char_p(fname.encode('utf-8')))
|
||||
def load_model(self, fname):
|
||||
"""load model from file"""
|
||||
xglib.XGBoosterLoadModel( self.handle, ctypes.c_char_p(fname.encode('utf-8')) )
|
||||
def dump_model(self, fo, fmap=''):
|
||||
"""dump model into text file"""
|
||||
if isinstance(fo,str):
|
||||
fo = open(fo,'w')
|
||||
need_close = True
|
||||
else:
|
||||
need_close = False
|
||||
ret = self.get_dump(fmap)
|
||||
for i in range(len(ret)):
|
||||
fo.write('booster[%d]:\n' %i)
|
||||
fo.write( ret[i] )
|
||||
if need_close:
|
||||
fo.close()
|
||||
def get_dump(self, fmap=''):
|
||||
"""get dump of model as list of strings """
|
||||
length = ctypes.c_ulong()
|
||||
sarr = xglib.XGBoosterDumpModel(self.handle, ctypes.c_char_p(fmap.encode('utf-8')), ctypes.byref(length))
|
||||
res = []
|
||||
for i in range(length.value):
|
||||
res.append( str(sarr[i]) )
|
||||
return res
|
||||
def get_fscore(self, fmap=''):
|
||||
""" get feature importance of each feature """
|
||||
trees = self.get_dump(fmap)
|
||||
fmap = {}
|
||||
for tree in trees:
|
||||
print tree
|
||||
for l in tree.split('\n'):
|
||||
arr = l.split('[')
|
||||
if len(arr) == 1:
|
||||
continue
|
||||
fid = arr[1].split(']')[0]
|
||||
fid = fid.split('<')[0]
|
||||
if fid not in fmap:
|
||||
fmap[fid] = 1
|
||||
else:
|
||||
fmap[fid]+= 1
|
||||
return fmap
|
||||
|
||||
def evaluate(bst, evals, it, feval = None):
|
||||
"""evaluation on eval set"""
|
||||
if feval != None:
|
||||
res = '[%d]' % it
|
||||
for dm, evname in evals:
|
||||
name, val = feval(bst.predict(dm), dm)
|
||||
res += '\t%s-%s:%f' % (evname, name, val)
|
||||
else:
|
||||
res = bst.eval_set(evals, it)
|
||||
|
||||
return res
|
||||
|
||||
def train(params, dtrain, num_boost_round = 10, evals = [], obj=None, feval=None):
|
||||
""" train a booster with given paramaters """
|
||||
bst = Booster(params, [dtrain]+[ d[0] for d in evals ] )
|
||||
if obj == None:
|
||||
for i in range(num_boost_round):
|
||||
bst.update( dtrain, i )
|
||||
if len(evals) != 0:
|
||||
sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
|
||||
else:
|
||||
# try customized objective function
|
||||
for i in range(num_boost_round):
|
||||
pred = bst.predict( dtrain )
|
||||
grad, hess = obj( pred, dtrain )
|
||||
bst.boost( dtrain, grad, hess )
|
||||
if len(evals) != 0:
|
||||
sys.stderr.write(evaluate(bst, evals, i, feval)+'\n')
|
||||
return bst
|
||||
115
wrapper/xgboost_R.cpp
Normal file
115
wrapper/xgboost_R.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include "xgboost_wrapper.h"
|
||||
#include "xgboost_R.h"
|
||||
#include "../src/utils/utils.h"
|
||||
#include "../src/utils/omp.h"
|
||||
|
||||
using namespace xgboost;
|
||||
|
||||
extern "C" {
|
||||
void _DMatrixFinalizer(SEXP ext) {
|
||||
if (R_ExternalPtrAddr(ext) == NULL) return;
|
||||
XGDMatrixFree(R_ExternalPtrAddr(ext));
|
||||
R_ClearExternalPtr(ext);
|
||||
}
|
||||
SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
|
||||
void *handle = XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent));
|
||||
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {
|
||||
XGDMatrixSaveBinary(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(fname)), asInteger(silent));
|
||||
}
|
||||
|
||||
// functions related to booster
|
||||
void _BoosterFinalizer(SEXP ext) {
|
||||
if (R_ExternalPtrAddr(ext) == NULL) return;
|
||||
XGBoosterFree(R_ExternalPtrAddr(ext));
|
||||
R_ClearExternalPtr(ext);
|
||||
}
|
||||
SEXP XGBoosterCreate_R(SEXP dmats) {
|
||||
int len = length(dmats);
|
||||
std::vector<void*> dvec;
|
||||
for (int i = 0; i < len; ++i){
|
||||
dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
|
||||
}
|
||||
void *handle = XGBoosterCreate(&dvec[0], dvec.size());
|
||||
SEXP ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
|
||||
XGBoosterSetParam(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(name)),
|
||||
CHAR(asChar(val)));
|
||||
}
|
||||
void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {
|
||||
XGBoosterUpdateOneIter(R_ExternalPtrAddr(handle),
|
||||
asInteger(iter),
|
||||
R_ExternalPtrAddr(dtrain));
|
||||
}
|
||||
void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) {
|
||||
utils::Check(length(grad) == length(hess), "gradient and hess must have same length");
|
||||
int len = length(grad);
|
||||
std::vector<float> tgrad(len), thess(len);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int j = 0; j < len; ++j) {
|
||||
tgrad[j] = REAL(grad)[j];
|
||||
thess[j] = REAL(hess)[j];
|
||||
}
|
||||
XGBoosterBoostOneIter(R_ExternalPtrAddr(handle),
|
||||
R_ExternalPtrAddr(dtrain),
|
||||
&tgrad[0], &thess[0], len);
|
||||
}
|
||||
SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
|
||||
utils::Check(length(dmats) == length(evnames), "dmats and evnams must have same length");
|
||||
int len = length(dmats);
|
||||
std::vector<void*> vec_dmats;
|
||||
std::vector<std::string> vec_names;
|
||||
std::vector<const char*> vec_sptr;
|
||||
for (int i = 0; i < len; ++i){
|
||||
vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
|
||||
vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i)))));
|
||||
vec_sptr.push_back(vec_names.back().c_str());
|
||||
}
|
||||
return mkString(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
|
||||
asInteger(iter),
|
||||
&vec_dmats[0], &vec_sptr[0], len));
|
||||
}
|
||||
SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin) {
|
||||
size_t olen;
|
||||
const float *res = XGBoosterPredict(R_ExternalPtrAddr(handle),
|
||||
R_ExternalPtrAddr(dmat),
|
||||
asInteger(output_margin),
|
||||
&olen);
|
||||
SEXP ret = PROTECT(allocVector(REALSXP, olen));
|
||||
for (size_t i = 0; i < olen; ++i) {
|
||||
REAL(ret)[i] = res[i];
|
||||
}
|
||||
UNPROTECT(1);
|
||||
return ret;
|
||||
}
|
||||
void XGBoosterLoadModel_R(SEXP handle, SEXP fname) {
|
||||
XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
|
||||
}
|
||||
void XGBoosterSaveModel_R(SEXP handle, SEXP fname) {
|
||||
XGBoosterSaveModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)));
|
||||
}
|
||||
void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap) {
|
||||
size_t olen;
|
||||
const char **res = XGBoosterDumpModel(R_ExternalPtrAddr(handle),
|
||||
CHAR(asChar(fmap)),
|
||||
&olen);
|
||||
FILE *fo = utils::FopenCheck(CHAR(asChar(fname)), "w");
|
||||
for (size_t i = 0; i < olen; ++i) {
|
||||
fprintf(fo, "booster[%lu]:\n", i);
|
||||
fprintf(fo, "%s\n", res[i]);
|
||||
}
|
||||
fclose(fo);
|
||||
}
|
||||
}
|
||||
91
wrapper/xgboost_R.h
Normal file
91
wrapper/xgboost_R.h
Normal file
@ -0,0 +1,91 @@
|
||||
#ifndef XGBOOST_WRAPPER_R_H_
#define XGBOOST_WRAPPER_R_H_
/*!
 * \file xgboost_R.h
 * \author Tianqi Chen
 * \brief R wrapper of xgboost
 */
extern "C" {
#include <Rinternals.h>
}

extern "C" {
  /*!
   * \brief load a data matrix
   * \param fname name of the content
   * \param silent whether to print messages
   * \return a loaded data matrix
   */
  SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
  /*!
   * \brief save a data matrix into a binary file
   * \param handle an instance of data matrix
   * \param fname file name
   * \param silent print statistics when saving
   */
  void XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent);
  /*!
   * \brief create xgboost learner
   * \param dmats a list of dmatrix handles that will be cached
   */
  SEXP XGBoosterCreate_R(SEXP dmats);
  /*!
   * \brief set parameters
   * \param handle handle
   * \param name parameter name
   * \param val value of parameter
   */
  void XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val);
  /*!
   * \brief update the model in one round using dtrain
   * \param handle handle
   * \param iter current iteration rounds
   * \param dtrain training data
   */
  void XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain);
  /*!
   * \brief update the model, by directly specifying gradient and second order gradient;
   *        this can be used to replace UpdateOneIter, to support customized loss functions
   * \param handle handle
   * \param dtrain training data
   * \param grad gradient statistics
   * \param hess second order gradient statistics
   */
  void XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess);
  /*!
   * \brief get evaluation statistics for xgboost
   * \param handle handle
   * \param iter current iteration rounds
   * \param dmats list of handles to dmatrices
   * \param evnames names of each evaluation set
   * \return the string containing evaluation statistics
   */
  SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames);
  /*!
   * \brief make prediction based on dmat
   * \param handle handle
   * \param dmat data matrix
   * \param output_margin whether to only output raw margin values
   */
  SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP output_margin);
  /*!
   * \brief load model from existing file
   * \param handle handle
   * \param fname file name
   */
  void XGBoosterLoadModel_R(SEXP handle, SEXP fname);
  /*!
   * \brief save model into existing file
   * \param handle handle
   * \param fname file name
   */
  void XGBoosterSaveModel_R(SEXP handle, SEXP fname);
  /*!
   * \brief dump model into a text file
   * \param handle handle
   * \param fname file name the model is dumped into
   * \param fmap path to the feature map; may be an empty string
   */
  void XGBoosterDumpModel_R(SEXP handle, SEXP fname, SEXP fmap);
};
#endif  // XGBOOST_WRAPPER_R_H_
|
||||
249
wrapper/xgboost_wrapper.cpp
Normal file
249
wrapper/xgboost_wrapper.cpp
Normal file
@ -0,0 +1,249 @@
|
||||
// implementations in ctypes
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include "./xgboost_wrapper.h"
|
||||
#include "../src/data.h"
|
||||
#include "../src/learner/learner-inl.hpp"
|
||||
#include "../src/io/io.h"
|
||||
#include "../src/io/simple_dmatrix-inl.hpp"
|
||||
|
||||
using namespace xgboost;
|
||||
using namespace xgboost::io;
|
||||
|
||||
namespace xgboost {
namespace wrapper {
// booster wrapper class
// Thin adapter over BoostLearner that adds lazy model initialization and
// owns scratch buffers whose lifetime must outlive a single C API call.
class Booster: public learner::BoostLearner<FMatrixS> {
 public:
  // mats: data matrices registered as prediction/training cache
  explicit Booster(const std::vector<DataMatrix*>& mats) {
    this->silent = 1;
    this->init_model = false;
    this->SetCacheData(mats);
  }
  // Predict on dmat; result points into this->preds_, valid until the next
  // Pred call on this booster. *len receives the number of predictions.
  const float *Pred(const DataMatrix &dmat, int output_margin, size_t *len) {
    this->CheckInitModel();
    this->Predict(dmat, output_margin, &this->preds_);
    *len = this->preds_.size();
    return &this->preds_[0];
  }
  // Boost one iteration from caller-supplied gradient/hessian arrays of
  // length len (one pair per training instance).
  inline void BoostOneIter(const DataMatrix &train,
                           float *grad, float *hess, size_t len) {
    this->gpair_.resize(len);
    const unsigned ndata = static_cast<unsigned>(len);
    // independent element-wise packing; safe to parallelize
    #pragma omp parallel for schedule(static)
    for (unsigned j = 0; j < ndata; ++j) {
      gpair_[j] = bst_gpair(grad[j], hess[j]);
    }
    gbm_->DoBoost(train.fmat, train.info.info, &gpair_);
  }
  // Lazily initialize the model on first use (idempotent).
  inline void CheckInitModel(void) {
    if (!init_model) {
      this->InitModel(); init_model = true;
    }
  }
  // Loading a model counts as initialization.
  inline void LoadModel(const char *fname) {
    learner::BoostLearner<FMatrixS>::LoadModel(fname);
    this->init_model = true;
  }
  // Dump the model; returned pointers alias model_dump and stay valid until
  // the next GetModelDump call on this booster.
  inline const char** GetModelDump(const utils::FeatMap& fmap, bool with_stats, size_t *len) {
    model_dump = this->DumpModel(fmap, with_stats);
    model_dump_cptr.resize(model_dump.size());
    for (size_t i = 0; i < model_dump.size(); ++i) {
      model_dump_cptr[i] = model_dump[i].c_str();
    }
    *len = model_dump.size();
    return &model_dump_cptr[0];
  }
  // temporal fields
  // temporal data to save evaluation dump (kept alive for the C caller)
  std::string eval_str;
  // temporal space to save model dump (backing storage for model_dump_cptr)
  std::vector<std::string> model_dump;
  std::vector<const char*> model_dump_cptr;

 private:
  // whether InitModel/LoadModel has run yet
  bool init_model;
};
}  // namespace wrapper
}  // namespace xgboost
|
||||
|
||||
using namespace xgboost::wrapper;
|
||||
|
||||
extern "C"{
|
||||
  // Load a data matrix from file; caller owns the returned handle
  // (free with XGDMatrixFree).
  void* XGDMatrixCreateFromFile(const char *fname, int silent) {
    return LoadDataMatrix(fname, silent, false);
  }
  // Build a DMatrixSimple from CSR arrays; num_col is inferred as the
  // maximum column index + 1 seen in `indices`.
  void* XGDMatrixCreateFromCSR(const size_t *indptr,
                               const unsigned *indices,
                               const float *data,
                               size_t nindptr,
                               size_t nelem) {
    DMatrixSimple *p_mat = new DMatrixSimple();
    DMatrixSimple &mat = *p_mat;
    mat.row_ptr_.resize(nindptr);
    memcpy(&mat.row_ptr_[0], indptr, sizeof(size_t)*nindptr);
    mat.row_data_.resize(nelem);
    for (size_t i = 0; i < nelem; ++i) {
      mat.row_data_[i] = SparseBatch::Entry(indices[i], data[i]);
      mat.info.num_col = std::max(mat.info.num_col,
                                  static_cast<size_t>(indices[i]+1));
    }
    // CSR header has one more entry than there are rows
    mat.info.num_row = nindptr - 1;
    return p_mat;
  }
  // Build a DMatrixSimple from a dense row-major nrow x ncol matrix,
  // dropping cells whose value equals `missing`.
  void* XGDMatrixCreateFromMat(const float *data,
                               size_t nrow,
                               size_t ncol,
                               float missing) {
    DMatrixSimple *p_mat = new DMatrixSimple();
    DMatrixSimple &mat = *p_mat;
    mat.info.num_row = nrow;
    mat.info.num_col = ncol;
    for (size_t i = 0; i < nrow; ++i, data += ncol) {
      size_t nelem = 0;
      for (size_t j = 0; j < ncol; ++j) {
        if (data[j] != missing) {
          mat.row_data_.push_back(SparseBatch::Entry(j, data[j]));
          ++nelem;
        }
      }
      // NOTE(review): relies on DMatrixSimple's constructor seeding row_ptr_
      // with a leading 0 -- confirm, otherwise back() on an empty vector is UB
      mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
    }
    return p_mat;
  }
|
||||
void* XGDMatrixSliceDMatrix(void *handle,
|
||||
const int *idxset,
|
||||
size_t len) {
|
||||
DMatrixSimple tmp;
|
||||
DataMatrix &dsrc = *static_cast<DataMatrix*>(handle);
|
||||
if (dsrc.magic != DMatrixSimple::kMagic) {
|
||||
tmp.CopyFrom(dsrc);
|
||||
}
|
||||
DataMatrix &src = (dsrc.magic == DMatrixSimple::kMagic ?
|
||||
*static_cast<DMatrixSimple*>(handle): tmp);
|
||||
DMatrixSimple *p_ret = new DMatrixSimple();
|
||||
DMatrixSimple &ret = *p_ret;
|
||||
|
||||
utils::Check(src.info.group_ptr.size() == 0,
|
||||
"slice does not support group structure");
|
||||
ret.Clear();
|
||||
ret.info.num_row = len;
|
||||
ret.info.num_col = src.info.num_col;
|
||||
|
||||
utils::IIterator<SparseBatch> *iter = src.fmat.RowIterator();
|
||||
iter->BeforeFirst();
|
||||
utils::Assert(iter->Next(), "slice");
|
||||
const SparseBatch &batch = iter->Value();
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
const int ridx = idxset[i];
|
||||
SparseBatch::Inst inst = batch[ridx];
|
||||
utils::Check(static_cast<size_t>(ridx) < batch.size, "slice index exceed number of rows");
|
||||
ret.row_data_.resize(ret.row_data_.size() + inst.length);
|
||||
memcpy(&ret.row_data_[ret.row_ptr_.back()], inst.data,
|
||||
sizeof(SparseBatch::Entry) * inst.length);
|
||||
ret.row_ptr_.push_back(ret.row_ptr_.back() + inst.length);
|
||||
if (src.info.labels.size() != 0) {
|
||||
ret.info.labels.push_back(src.info.labels[ridx]);
|
||||
}
|
||||
if (src.info.weights.size() != 0) {
|
||||
ret.info.weights.push_back(src.info.weights[ridx]);
|
||||
}
|
||||
if (src.info.info.root_index.size() != 0) {
|
||||
ret.info.info.root_index.push_back(src.info.info.root_index[ridx]);
|
||||
}
|
||||
}
|
||||
return p_ret;
|
||||
}
|
||||
  void XGDMatrixFree(void *handle) {
    delete static_cast<DataMatrix*>(handle);
  }
  void XGDMatrixSaveBinary(void *handle, const char *fname, int silent) {
    SaveDataMatrix(*static_cast<DataMatrix*>(handle), fname, silent);
  }
  // Overwrite the named float meta-info field (e.g. label, weight) with
  // `len` values copied from `info`.
  void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *info, size_t len) {
    std::vector<float> &vec =
        static_cast<DataMatrix*>(handle)->info.GetInfo(field);
    vec.resize(len);
    memcpy(&vec[0], info, sizeof(float) * len);
  }
  // Set ranking group sizes; stored as a cumulative-offset array of len+1
  // entries starting at 0.
  void XGDMatrixSetGroup(void *handle, const unsigned *group, size_t len) {
    DataMatrix *pmat = static_cast<DataMatrix*>(handle);
    pmat->info.group_ptr.resize(len + 1);
    pmat->info.group_ptr[0] = 0;
    for (size_t i = 0; i < len; ++i) {
      pmat->info.group_ptr[i+1] = pmat->info.group_ptr[i]+group[i];
    }
  }
  // Return a pointer into the matrix's own info storage; valid while the
  // matrix lives and the field is not resized.
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* len) {
    const std::vector<float> &vec =
        static_cast<const DataMatrix*>(handle)->info.GetInfo(field);
    *len = vec.size();
    // NOTE(review): &vec[0] on an empty field is UB -- callers appear to
    // check *len first; confirm
    return &vec[0];
  }
  size_t XGDMatrixNumRow(const void *handle) {
    return static_cast<const DataMatrix*>(handle)->info.num_row;
  }

  // xgboost implementation
  // Create a booster caching the given matrices; caller owns the handle
  // (free with XGBoosterFree).
  void *XGBoosterCreate(void *dmats[], size_t len) {
    std::vector<DataMatrix*> mats;
    for (size_t i = 0; i < len; ++i) {
      DataMatrix *dtr = static_cast<DataMatrix*>(dmats[i]);
      mats.push_back(dtr);
    }
    return new Booster(mats);
  }
  void XGBoosterFree(void *handle) {
    delete static_cast<Booster*>(handle);
  }
  void XGBoosterSetParam(void *handle, const char *name, const char *value) {
    static_cast<Booster*>(handle)->SetParam(name, value);
  }
  // One boosting round using the built-in objective.
  void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain) {
    Booster *bst = static_cast<Booster*>(handle);
    DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
    bst->CheckInitModel();
    bst->CheckInit(dtr);
    bst->UpdateOneIter(iter, *dtr);
  }
  // One boosting round from caller-supplied gradients (custom objectives).
  void XGBoosterBoostOneIter(void *handle, void *dtrain,
                             float *grad, float *hess, size_t len) {
    Booster *bst = static_cast<Booster*>(handle);
    DataMatrix *dtr = static_cast<DataMatrix*>(dtrain);
    bst->CheckInitModel();
    bst->CheckInit(dtr);
    bst->BoostOneIter(*dtr, grad, hess, len);
  }
  // Evaluate the matrices; returned string is owned by the booster and valid
  // until the next XGBoosterEvalOneIter call on the same handle.
  const char* XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
                                   const char *evnames[], size_t len) {
    Booster *bst = static_cast<Booster*>(handle);
    std::vector<std::string> names;
    std::vector<const DataMatrix*> mats;
    for (size_t i = 0; i < len; ++i) {
      mats.push_back(static_cast<DataMatrix*>(dmats[i]));
      names.push_back(std::string(evnames[i]));
    }
    bst->CheckInitModel();
    bst->eval_str = bst->EvalOneIter(iter, mats, names);
    return bst->eval_str.c_str();
  }
  // Result aliases the booster's prediction buffer; valid until the next
  // predict call on the same handle.
  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, size_t *len) {
    return static_cast<Booster*>(handle)->Pred(*static_cast<DataMatrix*>(dmat), output_margin, len);
  }
  void XGBoosterLoadModel(void *handle, const char *fname) {
    static_cast<Booster*>(handle)->LoadModel(fname);
  }
  void XGBoosterSaveModel(const void *handle, const char *fname) {
    static_cast<const Booster*>(handle)->SaveModel(fname);
  }
  // Dump model as text; an empty fmap string means no feature map. Returned
  // pointers are owned by the booster (see Booster::GetModelDump).
  const char** XGBoosterDumpModel(void *handle, const char *fmap, size_t *len){
    utils::FeatMap featmap;
    if (strlen(fmap) != 0) {
      featmap.LoadText(fmap);
    }
    return static_cast<Booster*>(handle)->GetModelDump(featmap, false, len);
  }
|
||||
};
|
||||
171
wrapper/xgboost_wrapper.h
Normal file
171
wrapper/xgboost_wrapper.h
Normal file
@ -0,0 +1,171 @@
|
||||
#ifndef XGBOOST_WRAPPER_H_
#define XGBOOST_WRAPPER_H_
/*!
 * \file xgboost_wrapper.h
 * \author Tianqi Chen
 * \brief a C style wrapper of xgboost
 *  can be used to create wrapper of other languages
 */
#include <stdio.h>

/* BUGFIX: guard extern "C" so this header is usable from plain C, as the
 * file-level comment promises; C++ translation units still get C linkage. */
#ifdef __cplusplus
extern "C" {
#endif
  /*!
   * \brief load a data matrix
   * \param fname name of the content
   * \param silent whether to print messages
   * \return a loaded data matrix
   */
  void* XGDMatrixCreateFromFile(const char *fname, int silent);
  /*!
   * \brief create a matrix content from csr format
   * \param indptr pointer to row headers
   * \param indices findex
   * \param data fvalue
   * \param nindptr number of rows in the matrix + 1
   * \param nelem number of nonzero elements in the matrix
   * \return created dmatrix
   */
  void* XGDMatrixCreateFromCSR(const size_t *indptr,
                               const unsigned *indices,
                               const float *data,
                               size_t nindptr,
                               size_t nelem);
  /*!
   * \brief create matrix content from dense matrix
   * \param data pointer to the data space
   * \param nrow number of rows
   * \param ncol number of columns
   * \param missing which value to represent missing value
   * \return created dmatrix
   */
  void* XGDMatrixCreateFromMat(const float *data,
                               size_t nrow,
                               size_t ncol,
                               float missing);
  /*!
   * \brief create a new dmatrix from sliced content of existing matrix
   * \param handle instance of data matrix to be sliced
   * \param idxset index set
   * \param len length of index set
   * \return a sliced new matrix
   */
  void* XGDMatrixSliceDMatrix(void *handle,
                              const int *idxset,
                              size_t len);
  /*!
   * \brief free space in data matrix
   */
  void XGDMatrixFree(void *handle);
  /*!
   * \brief save a data matrix into a binary file
   * \param handle an instance of data matrix
   * \param fname file name
   * \param silent print statistics when saving
   */
  void XGDMatrixSaveBinary(void *handle, const char *fname, int silent);
  /*!
   * \brief set float vector to a content in info
   * \param handle an instance of data matrix
   * \param field field name, can be label, weight
   * \param array pointer to float vector
   * \param len length of array
   */
  void XGDMatrixSetFloatInfo(void *handle, const char *field, const float *array, size_t len);
  /*!
   * \brief set group sizes of the training matrix (for ranking)
   * \param handle an instance of data matrix
   * \param group pointer to group sizes
   * \param len length of array
   */
  void XGDMatrixSetGroup(void *handle, const unsigned *group, size_t len);
  /*!
   * \brief get float info vector from matrix
   * \param handle an instance of data matrix
   * \param field field name
   * \param out_len used to set result length
   * \return pointer to the requested info vector
   */
  const float* XGDMatrixGetFloatInfo(const void *handle, const char *field, size_t* out_len);
  /*!
   * \brief return number of rows
   */
  size_t XGDMatrixNumRow(const void *handle);
  // --- start XGBoost class
  /*!
   * \brief create xgboost learner
   * \param dmats matrices that are set to be cached
   * \param len length of dmats
   */
  void *XGBoosterCreate(void* dmats[], size_t len);
  /*!
   * \brief free obj in handle
   * \param handle handle to be freed
   */
  void XGBoosterFree(void* handle);
  /*!
   * \brief set parameters
   * \param handle handle
   * \param name parameter name
   * \param value value of parameter
   */
  void XGBoosterSetParam(void *handle, const char *name, const char *value);
  /*!
   * \brief update the model in one round using dtrain
   * \param handle handle
   * \param iter current iteration rounds
   * \param dtrain training data
   */
  void XGBoosterUpdateOneIter(void *handle, int iter, void *dtrain);
  /*!
   * \brief update the model, by directly specifying gradient and second order gradient;
   *        this can be used to replace UpdateOneIter, to support customized loss functions
   * \param handle handle
   * \param dtrain training data
   * \param grad gradient statistics
   * \param hess second order gradient statistics
   * \param len length of grad/hess array
   */
  void XGBoosterBoostOneIter(void *handle, void *dtrain,
                             float *grad, float *hess, size_t len);
  /*!
   * \brief get evaluation statistics for xgboost
   * \param handle handle
   * \param iter current iteration rounds
   * \param dmats pointers to data to be evaluated
   * \param evnames pointers to names of each data
   * \param len length of dmats
   * \return the string containing evaluation statistics
   */
  const char *XGBoosterEvalOneIter(void *handle, int iter, void *dmats[],
                                   const char *evnames[], size_t len);
  /*!
   * \brief make prediction based on dmat
   * \param handle handle
   * \param dmat data matrix
   * \param output_margin whether to only output raw margin values
   * \param len used to store length of returning result
   */
  const float *XGBoosterPredict(void *handle, void *dmat, int output_margin, size_t *len);
  /*!
   * \brief load model from existing file
   * \param handle handle
   * \param fname file name
   */
  void XGBoosterLoadModel(void *handle, const char *fname);
  /*!
   * \brief save model into existing file
   * \param handle handle
   * \param fname file name
   */
  void XGBoosterSaveModel(const void *handle, const char *fname);
  /*!
   * \brief dump model, return array of strings representing model dump
   * \param handle handle
   * \param fmap name to fmap, can be an empty string
   * \param out_len length of output array
   * \return char *data[], representing dump of each model
   */
  const char **XGBoosterDumpModel(void *handle, const char *fmap,
                                  size_t *out_len);
#ifdef __cplusplus
}
#endif
#endif  // XGBOOST_WRAPPER_H_
|
||||
Loading…
x
Reference in New Issue
Block a user