remove inst/, improve vignette
This commit is contained in:
parent 50d77c72eb
commit cd35d88a03
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -1,10 +0,0 @@
require(xgboost)
require(methods)

# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")

history <- xgb.cv(data = dtrain, nround = 3, nfold = 5, metrics = list("rmse", "auc"),
                  max_depth = 3, eta = 1,
                  objective = "binary:logistic")
@@ -1,153 +0,0 @@
require(xgboost)
require(methods)

# Helper function to read the libsvm format. It is badly written: the data is
# loaded as dense and then converted to sparse, so use it for demo purposes
# only. Adapted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
  content <- readLines(fname)
  nline <- length(content)
  label <- numeric(nline)
  mat <- matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr <- as.vector(strsplit(content[i], " ")[[1]])
    label[i] <- as.numeric(arr[[1]])
    for (j in 2:length(arr)) {
      kv <- strsplit(arr[j], ":")[[1]]
      # shift by one to avoid the 0 index
      findex <- as.integer(kv[1]) + 1
      fvalue <- as.numeric(kv[2])
      mat[i, findex] <- fvalue
    }
  }
  mat <- as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
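The helper above materializes a dense nline-by-(maxcol + 1) matrix before converting it to sparse. A sketch of a triplet-based variant that skips the dense intermediate; the helper name read.libsvm.sparse is hypothetical:

require(Matrix)

read.libsvm.sparse <- function(fname, maxcol) {
  content <- readLines(fname)
  tokens <- strsplit(content, " ")
  # the first token on each line is the label
  label <- as.numeric(sapply(tokens, function(tok) tok[1]))
  rows <- integer(0); cols <- integer(0); vals <- numeric(0)
  for (i in seq_along(tokens)) {
    kv <- strsplit(tokens[[i]][-1], ":")
    if (length(kv) == 0) next
    # collect (row, column, value) triplets; shift columns to avoid the 0 index
    rows <- c(rows, rep.int(i, length(kv)))
    cols <- c(cols, sapply(kv, function(p) as.integer(p[1]) + 1L))
    vals <- c(vals, sapply(kv, function(p) as.numeric(p[2])))
  }
  mat <- sparseMatrix(i = rows, j = cols, x = vals,
                      dims = c(length(content), maxcol + 1))
  return(list(label = label, data = mat))
}

It is called exactly like the original, e.g. csc <- read.libsvm.sparse("agaricus.txt.train", 126).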

############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.

# Directly read in local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)

# Read the file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data

# x as sparse matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)

# x as dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)

############################ Test xgboost with local file, sparse matrix and dense matrix in R.

# Test with DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# verbose = 0, 1, 2
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)

# Test with local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# Test with dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

############################ Test predict

# Prediction with DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)

# Prediction with local test file
pred <- predict(bst, "agaricus.txt.test")

# Prediction with sparse matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)

# Extract labels with getinfo
labels <- getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels))/length(labels)
print(paste("error=", err))

############################ Save and load model to hard disk

# save model to binary local file
xgb.save(bst, "xgboost.model")

# load binary model back into R
bst <- xgb.load("xgboost.model")
pred <- predict(bst, test.x)

# save model to text file
xgb.dump(bst, "dump.raw.txt")
# save model to text file, with feature map
xgb.dump(bst, "dump.nice.txt", "featmap.txt")

# save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.buffer")

# load a DMatrix object back into R
dtrain <- xgb.DMatrix("dtrain.buffer")

############################ More flexible training function xgb.train

param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)

# train the xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)

############################ Customized loss function

param <- list(max_depth = 2, eta = 1, silent = 1)

# note: for a customized objective function, we leave objective as default
# note: what we get in prediction is the margin value; you must know what you are doing

# User-defined objective function: given the predictions, return the gradient
# and second-order gradient. This is the log-likelihood loss.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1/(1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

# User-defined evaluation function: return list(metric = 'metric-name',
# value = 'metric-value'). NOTE: with a customized loss function, the default
# prediction value is the margin, which may keep the built-in evaluation
# metrics from working properly. For example, when doing logistic loss, the
# prediction is the score before the logistic transformation, while the
# built-in evaluation error assumes the input comes after it. Keep this in
# mind when you use the customization, and write a customized evaluation
# function if necessary.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
  return(list(metric = "error", value = err))
}

# Training with a customized objective; we can also do step-by-step training.
# Simply look at xgboost.py's implementation of train.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
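Because logregobj trains on the margin, predictions from this booster are margins rather than probabilities; a short sketch of recovering probabilities by hand, using the dtest object defined above:

# margin predictions from the custom-objective model
margin <- predict(bst, dtest)
# apply the logistic transformation manually to get probabilities
prob <- 1/(1 + exp(-margin))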
@@ -1,126 +0,0 @@
0 cap-shape=bell i
1 cap-shape=conical i
2 cap-shape=convex i
3 cap-shape=flat i
4 cap-shape=knobbed i
5 cap-shape=sunken i
6 cap-surface=fibrous i
7 cap-surface=grooves i
8 cap-surface=scaly i
9 cap-surface=smooth i
10 cap-color=brown i
11 cap-color=buff i
12 cap-color=cinnamon i
13 cap-color=gray i
14 cap-color=green i
15 cap-color=pink i
16 cap-color=purple i
17 cap-color=red i
18 cap-color=white i
19 cap-color=yellow i
20 bruises?=bruises i
21 bruises?=no i
22 odor=almond i
23 odor=anise i
24 odor=creosote i
25 odor=fishy i
26 odor=foul i
27 odor=musty i
28 odor=none i
29 odor=pungent i
30 odor=spicy i
31 gill-attachment=attached i
32 gill-attachment=descending i
33 gill-attachment=free i
34 gill-attachment=notched i
35 gill-spacing=close i
36 gill-spacing=crowded i
37 gill-spacing=distant i
38 gill-size=broad i
39 gill-size=narrow i
40 gill-color=black i
41 gill-color=brown i
42 gill-color=buff i
43 gill-color=chocolate i
44 gill-color=gray i
45 gill-color=green i
46 gill-color=orange i
47 gill-color=pink i
48 gill-color=purple i
49 gill-color=red i
50 gill-color=white i
51 gill-color=yellow i
52 stalk-shape=enlarging i
53 stalk-shape=tapering i
54 stalk-root=bulbous i
55 stalk-root=club i
56 stalk-root=cup i
57 stalk-root=equal i
58 stalk-root=rhizomorphs i
59 stalk-root=rooted i
60 stalk-root=missing i
61 stalk-surface-above-ring=fibrous i
62 stalk-surface-above-ring=scaly i
63 stalk-surface-above-ring=silky i
64 stalk-surface-above-ring=smooth i
65 stalk-surface-below-ring=fibrous i
66 stalk-surface-below-ring=scaly i
67 stalk-surface-below-ring=silky i
68 stalk-surface-below-ring=smooth i
69 stalk-color-above-ring=brown i
70 stalk-color-above-ring=buff i
71 stalk-color-above-ring=cinnamon i
72 stalk-color-above-ring=gray i
73 stalk-color-above-ring=orange i
74 stalk-color-above-ring=pink i
75 stalk-color-above-ring=red i
76 stalk-color-above-ring=white i
77 stalk-color-above-ring=yellow i
78 stalk-color-below-ring=brown i
79 stalk-color-below-ring=buff i
80 stalk-color-below-ring=cinnamon i
81 stalk-color-below-ring=gray i
82 stalk-color-below-ring=orange i
83 stalk-color-below-ring=pink i
84 stalk-color-below-ring=red i
85 stalk-color-below-ring=white i
86 stalk-color-below-ring=yellow i
87 veil-type=partial i
88 veil-type=universal i
89 veil-color=brown i
90 veil-color=orange i
91 veil-color=white i
92 veil-color=yellow i
93 ring-number=none i
94 ring-number=one i
95 ring-number=two i
96 ring-type=cobwebby i
97 ring-type=evanescent i
98 ring-type=flaring i
99 ring-type=large i
100 ring-type=none i
101 ring-type=pendant i
102 ring-type=sheathing i
103 ring-type=zone i
104 spore-print-color=black i
105 spore-print-color=brown i
106 spore-print-color=buff i
107 spore-print-color=chocolate i
108 spore-print-color=green i
109 spore-print-color=orange i
110 spore-print-color=purple i
111 spore-print-color=white i
112 spore-print-color=yellow i
113 population=abundant i
114 population=clustered i
115 population=numerous i
116 population=scattered i
117 population=several i
118 population=solitary i
119 habitat=grasses i
120 habitat=leaves i
121 habitat=meadows i
122 habitat=paths i
123 habitat=urban i
124 habitat=waste i
125 habitat=woods i
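Each row of this feature map ties a tree-dump feature index fN to a name and a type (i appears to mark indicator features). A sketch of putting the map to use, assuming the bst model and featmap.txt from the removed script above:

# dump the trees with indices resolved through the feature map:
# a split on f28 then reads as odor=none instead of a bare index
xgb.dump(bst, "dump.nice.txt", "featmap.txt")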
@@ -80,12 +80,15 @@ Mushroom data is cited from UCI Machine Learning Repository. \citep{Bache+Lichma

 <<Training and prediction with iris>>=
 library(xgboost)
-data(iris)
-bst <- xgboost(as.matrix(iris[,1:4]),as.numeric(iris[,5]=='setosa'),
-              nrounds = 5)
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
+train <- agaricus.train
+test <- agaricus.test
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
+               nround = 2, objective = "binary:logistic")
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
-pred <- predict(bst, as.matrix(iris[,1:4]))
+pred <- predict(bst, test$data)
 @

 \verb@xgboost@ is the main function to train a \verb@Booster@, i.e. a model.
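With the binary:logistic objective, the pred vector in the new chunk holds probabilities; a one-line sketch of checking the test error, assuming the test$label field from the chunk above:

# threshold the predicted probabilities and compare against the labels
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error =", err))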
@@ -102,17 +105,19 @@ The output looks like

 \begin{verbatim}
 booster[0]:
-0:[f2<2.45] yes=1,no=2,missing=1
-1:leaf=0.147059
-2:[f3<1.65] yes=3,no=4,missing=3
-3:leaf=0.464151
-4:leaf=0.722449
+0:[f28<1.00001] yes=1,no=2,missing=2
+1:[f108<1.00001] yes=3,no=4,missing=4
+3:leaf=1.85965
+4:leaf=-1.94071
+2:[f55<1.00001] yes=5,no=6,missing=6
+5:leaf=-1.70044
+6:leaf=1.71218
 booster[1]:
-0:[f2<2.45] yes=1,no=2,missing=1
-1:leaf=0.103806
-2:[f2<4.85] yes=3,no=4,missing=3
-3:leaf=0.316341
-4:leaf=0.510365
+0:[f59<1.00001] yes=1,no=2,missing=2
+1:leaf=-6.23624
+2:[f28<1.00001] yes=3,no=4,missing=4
+3:leaf=-0.96853
+4:leaf=0.784718
 \end{verbatim}

 It is important to know \verb@xgboost@'s own data type: \verb@xgb.DMatrix@.
@@ -121,18 +126,16 @@ training from initial prediction value, weighted training instance.

 We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object:
 <<xgb.DMatrix>>=
-iris.mat <- as.matrix(iris[,1:4])
-iris.label <- as.numeric(iris[,5]=='setosa')
-diris <- xgb.DMatrix(iris.mat, label = iris.label)
-class(diris)
-getinfo(diris,'label')
+dtrain <- xgb.DMatrix(train$data, label = train$label)
+class(dtrain)
+head(getinfo(dtrain,'label'))
 @

 We can also save the matrix to a binary file. Then load it simply with
 \verb@xgb.DMatrix@
 <<save model>>=
-xgb.DMatrix.save(diris, 'iris.xgb.DMatrix')
-diris = xgb.DMatrix('iris.xgb.DMatrix')
+xgb.DMatrix.save(dtrain, 'xgb.DMatrix')
+dtrain = xgb.DMatrix('xgb.DMatrix')
 @

 \section{Advanced Examples}
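The hunk's context line mentions training from an initial prediction value and weighted training instances; a minimal sketch of those advanced DMatrix features, assuming the setinfo counterpart to the getinfo call used above:

# attach per-instance weights and an initial margin to the DMatrix
n <- length(getinfo(dtrain, 'label'))
setinfo(dtrain, 'weight', rep(1, n))
setinfo(dtrain, 'base_margin', rep(0, n))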
@@ -157,11 +160,11 @@ evalerror <- function(preds, dtrain) {
 return(list(metric = "MSE", value = err))
 }

-dtest <- slice(diris,1:100)
-watchlist <- list(eval = dtest, train = diris)
+dtest <- xgb.DMatrix(test$data, label = test$label)
+watchlist <- list(eval = dtest, train = dtrain)
 param <- list(max_depth = 2, eta = 1, silent = 1)

-bst <- xgb.train(param, diris, nround = 2, watchlist, logregobj, evalerror)
+bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
 @

 The gradient and second order gradient is required for the output of customized