tidy code by formatR

This commit is contained in:
unknown 2014-08-27 13:15:28 -07:00
parent 4dcc7d7303
commit 247e0d5d78
2 changed files with 152 additions and 171 deletions


@@ -1,161 +1,151 @@
require(xgboost)
require(methods)
# Helper function to read the libsvm format. This is very badly written: it
# loads the data as dense and then converts to sparse. Use it for demo
# purposes only. Adapted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
    content <- readLines(fname)
    nline <- length(content)
    label <- numeric(nline)
    mat <- matrix(0, nline, maxcol + 1)
    for (i in 1:nline) {
        arr <- as.vector(strsplit(content[i], " ")[[1]])
        label[i] <- as.numeric(arr[[1]])
        for (j in 2:length(arr)) {
            kv <- strsplit(arr[j], ":")[[1]]
            # add 1 to the feature index to avoid the 0 index
            findex <- as.integer(kv[1]) + 1
            fvalue <- as.numeric(kv[2])
            mat[i, findex] <- fvalue
        }
    }
    mat <- as(mat, "sparseMatrix")
    return(list(label = label, data = mat))
}
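# Caveat (an edge case not handled by the original demo): for a line that
# holds only a label, length(arr) is 1 and 2:length(arr) counts down as
# c(2, 1), so the inner loop would still run with bad indices. A safe
# replacement is sketched below.
arr <- c("1")                  # a hypothetical label-only libsvm line
2:length(arr)                  # c(2, 1): the loop body would execute anyway
seq_len(length(arr) - 1) + 1   # integer(0): skips the loop entirely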
############################
# Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
############################
# Directly read in a local file
dtrain <- xgb.DMatrix("agaricus.txt.train")
class(dtrain)
# Read the file in R
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
# x as a sparse matrix
class(x)
dtrain <- xgb.DMatrix(x, label = y)
# x as a dense matrix
dense.x <- as.matrix(x)
dtrain <- xgb.DMatrix(dense.x, label = y)
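# Quick sanity check (a sketch added for this demo, not part of the original):
# every construction above should carry the same labels.
stopifnot(all(xgb.getinfo(dtrain, "label") == y))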
############################
# Test xgboost with local file, sparse matrix and dense matrix in R.
############################
# Test with a DMatrix object
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1,
               objective = "binary:logistic")
# verbose = 0 (no message), 1 (print evaluation), 2 (also print tree info)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1,
               objective = "binary:logistic", verbose = 0)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1,
               objective = "binary:logistic", verbose = 1)
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1,
               objective = "binary:logistic", verbose = 2)
# Test with a local file
bst <- xgboost(data = "agaricus.txt.train", max_depth = 2, eta = 1,
               objective = "binary:logistic")
# Test with a sparse matrix
bst <- xgboost(data = x, label = y, max_depth = 2, eta = 1,
               objective = "binary:logistic")
# Test with a dense matrix
bst <- xgboost(data = dense.x, label = y, max_depth = 2, eta = 1,
               objective = "binary:logistic")
############################
# Test predict
############################
# Prediction with a DMatrix object
dtest <- xgb.DMatrix("agaricus.txt.test")
pred <- predict(bst, dtest)
# Prediction with a local test file
pred <- predict(bst, "agaricus.txt.test")
# Prediction with a sparse matrix
csc <- read.libsvm("agaricus.txt.test", 126)
test.y <- csc$label
test.x <- csc$data
pred <- predict(bst, test.x)
# Extract the labels with xgb.getinfo
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))
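# Optional sketch (not in the original demo): a confusion matrix gives a
# fuller picture than the single error number above.
print(table(predicted = as.integer(pred > 0.5), actual = labels))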
############################
# Save and load model to hard disk
############################
# Save the model to a binary local file
xgb.save(bst, "model.save")
# Load the binary model back into R
bst <- xgb.load("model.save")
pred <- predict(bst, test.x)
# Dump the model to a text file
xgb.dump(bst, "model.dump")
# Save a DMatrix object to hard disk
xgb.DMatrix.save(dtrain, "dtrain.save")
# Load a DMatrix object back into R
dtrain <- xgb.DMatrix("dtrain.save")
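# Sketch of a save/load round-trip check (the names pred.before and bst.loaded
# are introduced here for illustration only): predictions from a reloaded
# model should match the original ones.
pred.before <- predict(bst, test.x)
xgb.save(bst, "model.save")
bst.loaded <- xgb.load("model.save")
stopifnot(all(abs(pred.before - predict(bst.loaded, test.x)) < 1e-06))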
############################
# More flexible training function xgb.train
############################
param <- list(max_depth = 2, eta = 1, silent = 1, objective = "binary:logistic")
# The watchlist makes xgb.train report the evaluation metric on both dtest
# and dtrain after every boosting round.
watchlist <- list(eval = dtest, train = dtrain)
# Train the xgboost model
bst <- xgb.train(param, dtrain, nround = 2, watchlist = watchlist)
############################
# Customized loss function
############################
param <- list(max_depth = 2, eta = 1, silent = 1)
# Note: for a customized objective function, we leave objective at its
# default. What we get in prediction is then the margin value; you must know
# what you are doing.
# User-defined objective function: given the predictions, return the gradient
# and the second-order gradient. This is the log-likelihood loss.
logregobj <- function(preds, dtrain) {
    labels <- xgb.getinfo(dtrain, "label")
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
}
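# Where the gradient and hessian above come from (standard log-loss algebra):
# with p = 1 / (1 + exp(-x)) and L = -(y * log(p) + (1 - y) * log(1 - p)),
# dL/dx = p - y and d^2L/dx^2 = p * (1 - p), exactly what logregobj returns.
# A tiny finite-difference check of the gradient (illustrative names):
loss <- function(x, y) {
    p <- 1 / (1 + exp(-x))
    -(y * log(p) + (1 - y) * log(1 - p))
}
x0 <- 0.3; y0 <- 1; eps <- 1e-06
analytic <- 1 / (1 + exp(-x0)) - y0
num.grad <- (loss(x0 + eps, y0) - loss(x0 - eps, y0)) / (2 * eps)
stopifnot(abs(analytic - num.grad) < 1e-04)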
# User-defined evaluation function: return a
# list(metric = "metric-name", value = "metric-value").
# NOTE: with a customized loss function, the default prediction value is the
# margin, which may keep the built-in evaluation metrics from working
# properly. For example, here we are doing logistic loss: the prediction is
# the score before the logistic transformation, while the built-in evaluation
# error assumes the input comes after it. Keep this in mind when you use the
# customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
    labels <- xgb.getinfo(dtrain, "label")
    err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
    return(list(metric = "error", value = err))
}
# Training with a customized objective; we can also do step-by-step training.
# Simply look at xgboost.py's implementation of train.
bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)


@@ -1,103 +1,94 @@
require(xgboost)
require(methods)
# Helper function to read the libsvm format. This is very badly written: it
# loads the data as dense and then converts to sparse. Use it for demo
# purposes only. Adapted from
# https://github.com/zygmuntz/r-libsvm-format-read-write/blob/master/f_read.libsvm.r
read.libsvm <- function(fname, maxcol) {
    content <- readLines(fname)
    nline <- length(content)
    label <- numeric(nline)
    mat <- matrix(0, nline, maxcol + 1)
    for (i in 1:nline) {
        arr <- as.vector(strsplit(content[i], " ")[[1]])
        label[i] <- as.numeric(arr[[1]])
        for (j in 2:length(arr)) {
            kv <- strsplit(arr[j], ":")[[1]]
            # add 1 to the feature index to avoid the 0 index
            findex <- as.integer(kv[1]) + 1
            fvalue <- as.numeric(kv[2])
            mat[i, findex] <- fvalue
        }
    }
    mat <- as(mat, "sparseMatrix")
    return(list(label = label, data = mat))
}
# Parameter setting
dtrain <- xgb.DMatrix("agaricus.txt.train")
dtest <- xgb.DMatrix("agaricus.txt.test")
param <- list(`bst:max_depth` = 2, `bst:eta` = 1, silent = 1,
              objective = "binary:logistic")
watchlist <- list(eval = dtest, train = dtrain)
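# Note on the backticks above (an added explanation): list names containing
# ":" are not valid bare R identifiers, so formatR quotes them with backticks;
# the stored names themselves are unchanged.
names(param)  # "bst:max_depth" "bst:eta" "silent" "objective"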
###########################
# Train from local file
###########################
# Training
bst <- xgboost(file = "agaricus.txt.train", params = param,
               watchlist = watchlist)
# Prediction
pred <- predict(bst, "agaricus.txt.test")
# Performance
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))
###########################
# Train from R object
###########################
csc <- read.libsvm("agaricus.txt.train", 126)
y <- csc$label
x <- csc$data
# x as a sparse matrix
class(x)
# Training
bst <- xgboost(x, y, params = param, watchlist = watchlist)
# Prediction
pred <- predict(bst, "agaricus.txt.test")
# Performance
labels <- xgb.getinfo(dtest, "label")
err <- as.numeric(sum(as.integer(pred > 0.5) != labels)) / length(labels)
print(paste("error=", err))
# Training with a dense matrix
x <- as.matrix(x)
bst <- xgboost(x, y, params = param, watchlist = watchlist)
###########################
# Train with customization
###########################
# User-defined objective function: given the predictions, return the gradient
# and the second-order gradient. This is the log-likelihood loss.
logregobj <- function(preds, dtrain) {
    labels <- xgb.getinfo(dtrain, "label")
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
}
# User-defined evaluation function: return a
# list(metric = "metric-name", value = "metric-value").
# NOTE: with a customized loss function, the default prediction value is the
# margin, which may keep the built-in evaluation metrics from working
# properly. For example, here we are doing logistic loss: the prediction is
# the score before the logistic transformation, while the built-in evaluation
# error assumes the input comes after it. Keep this in mind when you use the
# customization; you may need to write a customized evaluation function.
evalerror <- function(preds, dtrain) {
    labels <- xgb.getinfo(dtrain, "label")
    err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
    return(list(metric = "error", value = err))
}
bst <- xgboost(x, y, params = param, watchlist = watchlist, obj = logregobj,
               feval = evalerror)
############################
# Train with previous result
############################
bst <- xgboost(x, y, params = param, watchlist = watchlist)
pred <- predict(bst, "agaricus.txt.train", outputmargin = TRUE)
bst2 <- xgboost(x, y, params = param, watchlist = watchlist, margin = pred)
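# Sketch of why this works (assuming `margin` seeds the base score, as the
# demo implies): bst2 starts boosting from bst's raw predictions rather than
# from zero, so the two runs together should behave like one model trained
# for more rounds.
pred2 <- predict(bst2, "agaricus.txt.test")
err2 <- as.numeric(sum(as.integer(pred2 > 0.5) != labels)) / length(labels)
print(paste("error after continued training=", err2))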