require(xgboost)
require(Matrix)  # needed for the sparse matrix conversion in read.libsvm

# helper function to read the libsvm format
# this is inefficiently written: it loads the data as a dense matrix,
# then converts it to sparse; use it for demo purposes only
# adapted from https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm = function(fname, maxcol) {
  content = readLines(fname)
  nline = length(content)
  label = numeric(nline)
  mat = matrix(0, nline, maxcol + 1)
  for (i in 1:nline) {
    arr = strsplit(content[i], " ")[[1]]
    label[i] = as.numeric(arr[1])
    # each remaining token is a featureindex:value pair
    # (guard against label-only lines, where 2:length(arr) would run backwards)
    if (length(arr) > 1) {
      for (j in 2:length(arr)) {
        kv = strsplit(arr[j], ":")[[1]]
        # shift by one to avoid a 0 index in R
        findex = as.integer(kv[1]) + 1
        fvalue = as.numeric(kv[2])
        mat[i, findex] = fvalue
      }
    }
  }
  mat = as(mat, "sparseMatrix")
  return(list(label = label, data = mat))
}
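
# For reference, each line of a libsvm file looks like
#   1 3:1 10:1 12:1
# i.e. a label followed by featureindex:value pairs; the indices may start at 0,
# hence the +1 shift above. (This example line is illustrative, not taken from
# the actual agaricus files.)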

# Parameter setting
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param = list("bst:max_depth" = 2, "bst:eta" = 1, "silent" = 1, "objective" = "binary:logistic")
# datasets in the watchlist have their evaluation metric printed every round
watchlist = list("eval" = dtest, "train" = dtrain)

###########################
# Train from local file
###########################

# Training
bst = xgboost(file = 'agaricus.txt.train', params = param, watchlist = watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance: classification error at the 0.5 probability threshold
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))

###########################
# Train from R object
###########################

csc = read.libsvm("agaricus.txt.train", 126)
y = csc$label
x = csc$data
# x is stored as a sparse matrix
class(x)

# Training
bst = xgboost(x, y, params = param, watchlist = watchlist)
# Prediction
pred = predict(bst, 'agaricus.txt.test')
# Performance
labels = xgb.getinfo(dtest, "label")
err = as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))

# Training with a dense matrix works the same way
x = as.matrix(x)
bst = xgboost(x, y, params = param, watchlist = watchlist)

###########################
# Train with customization
###########################

# user-defined objective function: given the predictions, return the gradient
# and the second-order gradient; this implements the logistic log-likelihood loss
logregobj = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  # preds are raw margins here; apply the logistic transformation first
  preds = 1.0 / (1.0 + exp(-preds))
  grad = preds - labels
  hess = preds * (1.0 - preds)
  return(list(grad = grad, hess = hess))
}
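
# Where grad and hess come from (a standard derivation, added for clarity):
# for logistic loss L = -[y*log(p) + (1-y)*log(1-p)] with p = 1/(1+exp(-m)),
# the first derivative w.r.t. the margin m is dL/dm = p - y, and the second
# derivative is d2L/dm2 = p*(1-p), which is exactly what logregobj returns.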

# user-defined evaluation function: returns list(metric = "metric-name", value = metric-value)
# NOTE: when you use a customized objective, the predictions passed in are raw margins.
# This can make the built-in evaluation metrics misbehave: here we are doing logistic
# loss, so the predictions are scores before the logistic transformation, while the
# built-in error metric assumes inputs after the transformation. Keep this in mind
# when customizing; you may also need to write a customized evaluation function.
evalerror = function(preds, dtrain) {
  labels = xgb.getinfo(dtrain, "label")
  # preds are margins, so classify at the threshold 0 instead of 0.5
  err = as.numeric(sum(labels != (preds > 0.0))) / length(labels)
  return(list(metric = "error", value = err))
}

bst = xgboost(x, y, params = param, watchlist = watchlist, obj = logregobj, feval = evalerror)
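
# To illustrate the margin point above (a sketch, not part of the original demo):
# margins can be mapped back to probabilities with the logistic function, after
# which the usual 0.5 threshold applies again.
margin = predict(bst, 'agaricus.txt.test', outputmargin = TRUE)
prob = 1.0 / (1.0 + exp(-margin))
print(paste("error=", sum(as.integer(prob > 0.5) != labels) / length(labels)))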

############################
# Train with previous result
############################

bst = xgboost(x, y, params = param, watchlist = watchlist)
# export the margin predictions of the fitted model
pred = predict(bst, 'agaricus.txt.train', outputmargin = TRUE)
# continue training from those predictions by supplying them as the base margin
bst2 = xgboost(x, y, params = param, watchlist = watchlist, margin = pred)