hetong 2014-09-06 11:17:43 -07:00
commit c3cef7e2c7
17 changed files with 134 additions and 283 deletions


@ -52,7 +52,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
  if (length(params) != 0) {
    for (i in 1:length(params)) {
      p <- params[i]
-     .Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
+     .Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
            PACKAGE = "xgboost")
    }
  }
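The new gsub call translates R-style dotted parameter names into the underscore names the native library expects, which is what lets the updated demos below write max.depth and eval.metric. A minimal standalone sketch of that mapping, with illustrative parameter values:

params <- list(max.depth = 2, eval.metric = "error")
for (i in 1:length(params)) {
  p <- params[i]
  # names(p) is "max.depth" / "eval.metric"; the dots become underscores
  cat(gsub("\\.", "_", names(p)), "=", as.character(p), "\n")
}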


@ -20,7 +20,7 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label option field, when data is Matrix
-#' @param showd boolean, whether show standard deviation of cross validation
+#' @param showsd boolean, whether show standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in corss validation,
#'   when it is not specified, the evaluation metric is chosen according to objective function.
#'   Possible options are:
@ -77,5 +77,5 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL,
    history <- append(history, ret)
    cat(paste(ret, "\n", sep=""))
  }
- return (history)
+ return (TRUE)
}


@ -17,5 +17,5 @@ install.packages('xgboost')
## Examples
-* Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for walk throughe example.
+* Please visit [walk through example](https://github.com/tqchen/xgboost/blob/master/R-package/demo).
* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.

R-package/demo/00Index (new file)

@ -0,0 +1,6 @@
basic_walkthrough Basic feature walkthrough
custom_objective Customize loss function and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
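00Index is the index file that R's demo() facility reads, so these scripts can be listed and launched by name once the package is installed. A small usage sketch, assuming an installed xgboost that ships these demos:

demo(package = 'xgboost')                     # list the demos registered in 00Index
demo(basic_walkthrough, package = 'xgboost')  # run one of them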


@ -3,6 +3,6 @@ XGBoost R Feature Walkthrough
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
-* [Predicting using first n trees](predict_first_ntree.py)
+* [Predicting using first n trees](predict_first_ntree.R)
-* [Generalized Linear Model](generalized_linear_model.py)
+* [Generalized Linear Model](generalized_linear_model.R)
-* [Cross validation](cross_validation.py)
+* [Cross validation](cross_validation.R)


@ -4,49 +4,49 @@ require(methods)
# In this example, we are aiming to predict whether a mushroom can be eated
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
-dtrain <- agaricus.train
+train <- agaricus.train
-dtest <- agaricus.test
+test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
-class(dtrain$label)
+class(train$label)
-class(dtrain$data)
+class(train$data)
#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost you can put matrix in data field
# note: we are puting in sparse matrix here, xgboost naturally handles sparse input
# use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
print("training xgboost with sparseMatrix")
-bst <- xgboost(data = dtrain$data, label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# alternatively, you can put in dense matrix, i.e. basic R-matrix
print("training xgboost with Matrix")
-bst <- xgboost(data = as.matrix(dtrain$data), label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
# you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
print("training xgboost with xgb.DMatrix")
-dmat <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
+dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
# Verbose = 0,1,2
print ('train xgboost with verbose 0, no message')
-bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 0)
print ('train xgboost with verbose 1, print evaluation metric')
-bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 1)
print ('train xgboost with verbose 2, also print information about tree')
-bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic", verbose = 2)
# you can also specify data as file path to a LibSVM format input
# since we do not have this file with us, the following line is just for illustration
-# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
+# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
-pred <- predict(bst, dtest$data)
+pred <- predict(bst, test$data)
-err <- as.numeric(sum(as.integer(pred > 0.5) != dtest$label))/length(dtest$label)
+err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#-------------------save and load models-------------------------
@ -54,33 +54,33 @@ print(paste("test-error=", err))
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
-pred2 <- predict(bst2, dtest$data)
+pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
-dtrain <- xgb.DMatrix(data = dtrain$data, label=dtrain$label)
+dtrain <- xgb.DMatrix(data = train$data, label=train$label)
-dtest <- xgb.DMatrix(data = dtest$data, label=dtest$label)
+dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them tagged with name
watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list
print ('train xgboost using xgb.train with watchlist')
-bst <- xgb.train(data=dtrain, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
+bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                 objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print ('train xgboost using xgb.train with watchlist, watch logloss and error')
-bst <- xgb.train(data=dtrain, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
-                "eval_metric" = "error", "eval_metric" = "logloss",
+bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
+                eval.metric = "error", eval.metric = "logloss",
                 objective = "binary:logistic")
# xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
-bst <- xgb.train(data=dtrain2, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
+bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
                 objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo
label = getinfo(dtest, "label")


@ -0,0 +1,47 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
nround <- 2
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})
cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5,
       metrics={'error'}, showsd = FALSE)
###
# you can also do cross validation with customized loss function
# See custom_objective.R
##
print ('running cross validation, with customized loss function')
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max_depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
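For reference, the grad and hess returned by logregobj are the first and second derivatives of the logistic loss with respect to the raw margin prediction z, with label y and p = 1 / (1 + exp(-z)):

$$L(y, z) = -\bigl[\, y \log p + (1 - y) \log(1 - p) \,\bigr]$$
$$\frac{\partial L}{\partial z} = p - y, \qquad \frac{\partial^2 L}{\partial z^2} = p\,(1 - p)$$

which is exactly preds - labels and preds * (1 - preds) once preds has been mapped through the sigmoid, as the function does on its first line.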


@ -0,0 +1,34 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
# you can fit a linear regression, or logistic regression model
##
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
alpha = 0.0001, lambda = 1)
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
##
# the rest of settings are the same
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')
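The note above about eta can be tried directly; a minimal sketch, assuming the same param, dtrain, num_round and watchlist objects from this script, that refits the linear booster with a smaller step size:

# illustrative only: shrink the step size as the comment above suggests
param2 <- c(param, list(eta = 0.5))
bst2 <- xgb.train(param2, dtrain, num_round, watchlist)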


@ -1,27 +1,23 @@
require(xgboost)
+# load in the agaricus dataset
-data(agaricus.train)
-data(agaricus.test)
+data(agaricus.train, package='xgboost')
+data(agaricus.test, package='xgboost')
-trainX = agaricus.train$data
-trainY = agaricus.train$label
-testX = agaricus.test$data
-testY = agaricus.test$label
-dtrain <- xgb.DMatrix(trainX, label=trainY)
-dtest <- xgb.DMatrix(testX, label=testY)
+dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
-num_round = 2
-bst = xgb.train(param, dtrain, num_round, watchlist)
+nround = 2
+# training the model for two rounds
+bst = xgb.train(param, dtrain, nround, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')
+### predict using first 1 tree
ypred1 = predict(bst, dtest, ntreelimit=1)
+# by default, we predict using all the trees
ypred2 = predict(bst, dtest)
cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')

R-package/demo/runall.R (new file)

@ -0,0 +1,8 @@
# running all scripts in demo folder
source('basic_walkthrough.R')
source('custom_objective.R')
source('boost_from_prediction.R')
source('predict_first_ntree.R')
source('generalized_linear_model.R')
source('cross_validation.R')


@ -1,3 +0,0 @@
XGBoost R Feature Walkthrough
====
To be finished


@ -1,47 +0,0 @@
require(xgboost)
require(methods)
data(agaricus.train)
data(agaricus.test)
# we use agaricus data as example dataset
# we will show how to use xgboost to do binary classification here
trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
#-------------------------------------
# this is the basic usage of xgboost
# you can put sparse matrix in data field. this is helpful when your data is sparse
# for example, when you use one-hot encoding for feature vectors
bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")
# alternatively, you can put dense matrix
denseX <- as(trainX, "matrix")
bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")
# you can also specify data as file path to a LibSVM format input
# since we do not have libsvm format file for iris, next line is only for illustration
# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
preds <- predict(bst, dtest)
labels <- getinfo(dtest,'label')
cat('error=', mean(as.numeric(preds>0.5)!=labels),'\n')
xgb.save(bst, 'xgb.model')
xgb.dump(bst, 'dump.raw.txt')
xgb.dump(bst, 'dump.nuce.txt','../data/featmap.txt')
bst2 <- xgb.load('xgb.model')
preds2 <- predict(bst2,dtest)
stopifnot(sum((preds-preds2)^2)==0)
############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.


@ -1,31 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
watchlist <- list(eval = dtest, train = dtrain)
print('start running example to start from a initial prediction\n')
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )
ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# dtrain.set_base_margin(ptrain)
# dtest.set_base_margin(ptest)
cat('this is result of running from initial prediction\n')
bst <- xgb.train( param, dtrain, 1, watchlist )


@ -1,75 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
num_round <- 2
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0)
cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0, show_stdv = False)
cat('running cross validation, with preprocessing function\n')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
fpreproc <- function(dtrain, dtest, param){
label <- getinfo(dtrain, 'label')
ratio <- mean(label==0)
param <- append(param, list(scale_pos_weight = ratio))
return(list(dtrain=dtrain, dtest= dtest, param = param))
}
# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed = 0, fpreproc = fpreproc)
###
# you can also do cross validation with cutomized loss function
# See custom_objective.py
##
print ('running cross validation, with cutomsized loss function')
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
param <- list(max_depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror)


@ -1,47 +0,0 @@
require(xgboost)
data(agaricus.train)
data(agaricus.test)
trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)
# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max_depth=2,eta=1,silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}
# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)


@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1 }
# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
# param['eta'] = 1
##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))


@ -1,5 +0,0 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R