Merge branch 'master' of https://github.com/tqchen/xgboost
commit c3cef7e2c7
@@ -52,7 +52,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) {
if (length(params) != 0) {
for (i in 1:length(params)) {
p <- params[i]
.Call("XGBoosterSetParam_R", handle, names(p), as.character(p),
.Call("XGBoosterSetParam_R", handle, gsub("\\.", "_", names(p)), as.character(p),
PACKAGE = "xgboost")
}
}
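Note (editorial, not part of the diff): the replacement line above makes the R wrapper rewrite dotted parameter names (the R style used in the new demos, e.g. max.depth) into the underscore form the underlying booster expects (max_depth). A minimal standalone sketch of that conversion loop, with an illustrative parameter list:

params <- list(max.depth = 2, eta = 1, objective = "binary:logistic")
for (i in 1:length(params)) {
  p <- params[i]
  # names(p) is e.g. "max.depth"; gsub turns it into "max_depth"
  cat(gsub("\\.", "_", names(p)), "=", as.character(p), "\n")
}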
@@ -20,7 +20,7 @@
#' @param nrounds the max number of iterations
#' @param nfold number of folds used
#' @param label optional field, used when data is a Matrix
#' @param showd boolean, whether show standard deviation of cross validation
#' @param showsd boolean, whether to show the standard deviation of cross validation
#' @param metrics, list of evaluation metrics to be used in cross validation,
#' when it is not specified, the evaluation metric is chosen according to the objective function.
#' Possible options are:
@@ -77,5 +77,5 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL,
history <- append(history, ret)
cat(paste(ret, "\n", sep=""))
}
return (history)
return (TRUE)
}
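For orientation (editorial, not part of the diff), a minimal hedged sketch of calling xgb.cv with the parameters documented above; the values are illustrative, and the full version lives in R-package/demo/cross_validation.R further down in this diff:

require(xgboost)
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
param <- list(max_depth = 2, eta = 1, silent = 1, objective = 'binary:logistic')
# 2 rounds of 5-fold cross validation on the error metric;
# showsd = FALSE suppresses the standard-deviation column in the printout
xgb.cv(param, dtrain, nrounds = 2, nfold = 5, metrics = 'error', showsd = FALSE)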
@@ -17,5 +17,5 @@ install.packages('xgboost')

## Examples

* Please visit [demo](https://github.com/tqchen/xgboost/blob/master/R-package/inst/examples/demo.R) for walk throughe example.
* Please visit [walk through example](https://github.com/tqchen/xgboost/blob/master/R-package/demo).
* See also the [example scripts](https://github.com/tqchen/xgboost/tree/master/demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](https://github.com/tqchen/xgboost/blob/master/demo/kaggle-higgs/speedtest.R) on this dataset.
R-package/demo/00Index (new file, 6 lines changed)
@@ -0,0 +1,6 @@
basic_walkthrough Basic feature walkthrough
custom_objective Customize loss function, and evaluation metric
boost_from_prediction Boosting from existing prediction
predict_first_ntree Predicting using first n trees
generalized_linear_model Generalized Linear Model
cross_validation Cross validation
@@ -3,6 +3,6 @@ XGBoost R Feature Walkthrough
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Customize loss function, and evaluation metric](custom_objective.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Predicting using first n trees](predict_first_ntree.py)
* [Generalized Linear Model](generalized_linear_model.py)
* [Cross validation](cross_validation.py)
* [Predicting using first n trees](predict_first_ntree.R)
* [Generalized Linear Model](generalized_linear_model.R)
* [Cross validation](cross_validation.R)
@@ -4,49 +4,49 @@ require(methods)
# In this example, we are aiming to predict whether a mushroom can be eaten
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- agaricus.train
dtest <- agaricus.test
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(dtrain$label)
class(dtrain$data)
class(train$label)
class(train$data)

#-------------Basic Training using XGBoost-----------------
# this is the basic usage of xgboost: you can put a matrix in the data field
# note: we are putting in a sparse matrix here, xgboost naturally handles sparse input
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
print("training xgboost with sparseMatrix")
bst <- xgboost(data = dtrain$data, label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")
# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("training xgboost with Matrix")
bst <- xgboost(data = as.matrix(dtrain$data), label = dtrain$label, max_depth = 2, eta = 1, nround = 2,
bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic")

# you can also put in an xgb.DMatrix object, which stores label, data and other metadata needed for advanced features
print("training xgboost with xgb.DMatrix")
dmat <- xgb.DMatrix(data = dtrain$data, label = dtrain$label)
bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

# Verbose = 0,1,2
print ('train xgboost with verbose 0, no message')
bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 0)
print ('train xgboost with verbose 1, print evaluation metric')
bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 1)
print ('train xgboost with verbose 2, also print information about tree')
bst <- xgboost(data = dmat, max_depth = 2, eta = 1, nround = 2,
bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
objective = "binary:logistic", verbose = 2)

# you can also specify data as a file path to a LibSVM format input
# since we do not have this file with us, the following line is just for illustration
# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")

#--------------------basic prediction using xgboost--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or xgb.DMatrix
pred <- predict(bst, dtest$data)
err <- as.numeric(sum(as.integer(pred > 0.5) != dtest$label))/length(dtest$label)
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))

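As an aside (editorial, not part of the diff): the new mean() expression above computes the same error rate as the old sum()/length() form; a tiny check with made-up values:

pred  <- c(0.9, 0.2, 0.7, 0.4)   # illustrative predicted probabilities
label <- c(1, 0, 0, 1)           # illustrative true labels
err_old <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
err_new <- mean(as.numeric(pred > 0.5) != label)
stopifnot(err_old == err_new)    # both give 0.5 here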
#-------------------save and load models-------------------------
@@ -54,33 +54,33 @@ print(paste("test-error=", err))
xgb.save(bst, "xgboost.model")
# load binary model to R
bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, dtest$data)
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))

#----------------Advanced features --------------
# to use advanced features, we need to put data in xgb.DMatrix
dtrain <- xgb.DMatrix(data = dtrain$data, label=dtrain$label)
dtest <- xgb.DMatrix(data = dtest$data, label=dtest$label)
dtrain <- xgb.DMatrix(data = train$data, label=train$label)
dtest <- xgb.DMatrix(data = test$data, label=test$label)
#---------------Using watchlist----------------
# watchlist is a list of xgb.DMatrix, each of them tagged with name
watchlist <- list(train=dtrain, test=dtest)
# to train with watchlist, use xgb.train, which contains more advanced features
# watchlist allows us to monitor the evaluation result on all data in the list
print ('train xgboost using xgb.train with watchlist')
bst <- xgb.train(data=dtrain, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
objective = "binary:logistic")
# we can change evaluation metrics, or use multiple evaluation metrics
print ('train xgboost using xgb.train with watchlist, watch logloss and error')
bst <- xgb.train(data=dtrain, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
"eval_metric" = "error", "eval_metric" = "logloss",
bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
eval.metric = "error", eval.metric = "logloss",
objective = "binary:logistic")

# xgb.DMatrix can also be saved using xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
# to load it in, simply call xgb.DMatrix
dtrain2 <- xgb.DMatrix("dtrain.buffer")
bst <- xgb.train(data=dtrain2, "max_depth"=2, eta=1, nround=2, watchlist=watchlist,
bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
objective = "binary:logistic")
# information can be extracted from xgb.DMatrix using getinfo
label = getinfo(dtest, "label")
R-package/demo/cross_validation.R (new file, 47 lines changed)
@@ -0,0 +1,47 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

nround <- 2
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')

cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5, metrics={'error'})

cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, nround, nfold=5,
metrics={'error'}, showsd = FALSE)

###
# you can also do cross validation with customized loss function
# See custom_objective.R
##
print ('running cross validation, with customized loss function')

logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}

param <- list(max_depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, nround, nfold = 5,
obj = logregobj, feval=evalerror)
R-package/demo/generalized_linear_model.R (new file, 34 lines changed)
@@ -0,0 +1,34 @@
require(xgboost)
# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
##
# this script demonstrates how to fit a generalized linear model in xgboost
# basically, we are using a linear model, instead of trees, for our boosters
# you can fit a linear regression, or logistic regression model
##

# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias, which is the L2 regularizer on the bias term
param <- list(objective = "binary:logistic", booster = "gblinear",
alpha = 0.0001, lambda = 1)

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# so parallelization can affect convergence in certain cases
# setting eta to a smaller value, e.g. 0.5, can make the optimization more stable

##
# the rest of the settings are the same
##
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
ypred <- predict(bst, dtest)
labels <- getinfo(dtest, 'label')
cat('error of preds=', mean(as.numeric(ypred>0.5)!=labels),'\n')
demo/guide-R/predict_first_ntree.R → R-package/demo/predict_first_ntree.R (Executable file → Normal file, 26 lines changed)
@@ -1,27 +1,23 @@
require(xgboost)

data(agaricus.train)
data(agaricus.test)

trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label

dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)

# load in the agaricus dataset
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)
nround = 2

# training the model for two rounds
bst = xgb.train(param, dtrain, nround, watchlist)
cat('start testing prediction from first n trees\n')
labels <- getinfo(dtest,'label')

### predict using first 1 tree
ypred1 = predict(bst, dtest, ntreelimit=1)
# by default, we predict using all the trees
ypred2 = predict(bst, dtest)

cat('error of ypred1=', mean(as.numeric(ypred1>0.5)!=labels),'\n')
cat('error of ypred2=', mean(as.numeric(ypred2>0.5)!=labels),'\n')
R-package/demo/runall.R (new file, 8 lines changed)
@@ -0,0 +1,8 @@
# running all scripts in demo folder
source('basic_walkthrough.R')
source('custom_objective.R')
source('boost_from_prediction.R')
source('predict_first_ntree.R')
source('generalized_linear_model.R')
source('cross_validation.R')
@@ -1,3 +0,0 @@
XGBoost R Feature Walkthrough
====
To be finished
@@ -1,47 +0,0 @@
require(xgboost)
require(methods)
data(agaricus.train)
data(agaricus.test)

# we use agaricus data as example dataset
# we will show how to use xgboost to do binary classification here

trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label
#-------------------------------------
# this is the basic usage of xgboost
# you can put sparse matrix in data field. this is helpful when your data is sparse
# for example, when you use one-hot encoding for feature vectors
bst <- xgboost(data = trainX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")
# alternatively, you can put dense matrix
denseX <- as(trainX, "matrix")
bst <- xgboost(data = denseX, label = trainY, max_depth = 1, eta = 1, nround = 2,
objective = "binary:logistic")

# you can also specify data as file path to a LibSVM format input
# since we do not have libsvm format file for iris, next line is only for illustration
# bst <- xgboost(data = 'iris.svm', max_depth = 2, eta = 1, nround = 2, objective = "binary:logistic")

dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)


param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
bst <- xgb.train(param, dtrain, num_round, watchlist)
preds <- predict(bst, dtest)
labels <- getinfo(dtest,'label')
cat('error=', mean(as.numeric(preds>0.5)!=labels),'\n')
xgb.save(bst, 'xgb.model')
xgb.dump(bst, 'dump.raw.txt')
xgb.dump(bst, 'dump.nuce.txt','../data/featmap.txt')

bst2 <- xgb.load('xgb.model')
preds2 <- predict(bst2,dtest)
stopifnot(sum((preds-preds2)^2)==0)

############################ Test xgb.DMatrix with local file, sparse matrix and dense matrix in R.
@@ -1,31 +0,0 @@
require(xgboost)

data(agaricus.train)
data(agaricus.test)

trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label

dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)


watchlist <- list(eval = dtest, train = dtrain)
print('start running example to start from a initial prediction\n')
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')
bst <- xgb.train( param, dtrain, 1, watchlist )

ptrain <- predict(bst, dtrain, outputmargin=TRUE)
ptest <- predict(bst, dtest, outputmargin=TRUE)
# dtrain.set_base_margin(ptrain)
# dtest.set_base_margin(ptest)


cat('this is result of running from initial prediction\n')
bst <- xgb.train( param, dtrain, 1, watchlist )
@@ -1,75 +0,0 @@
require(xgboost)

data(agaricus.train)
data(agaricus.test)

trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label

dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)

num_round <- 2
param <- list(max_depth=2,eta=1,silent=1,objective='binary:logistic')

cat('running cross validation\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0)

cat('running cross validation, disable standard deviation display\n')
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'error'}, seed = 0, show_stdv = False)

cat('running cross validation, with preprocessing function\n')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
fpreproc <- function(dtrain, dtest, param){
label <- getinfo(dtrain, 'label')
ratio <- mean(label==0)
param <- append(param, list(scale_pos_weight = ratio))
return(list(dtrain=dtrain, dtest= dtest, param = param))
}


# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
xgb.cv(param, dtrain, num_round, nfold=5,
metrics={'auc'}, seed = 0, fpreproc = fpreproc)

###
# you can also do cross validation with cutomized loss function
# See custom_objective.py
##
print ('running cross validation, with cutomsized loss function')

logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}

evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}

param <- list(max_depth=2,eta=1,silent=1)
# train with customized objective
xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
obj = logregobj, feval=evalerror)
@@ -1,47 +0,0 @@
require(xgboost)

data(agaricus.train)
data(agaricus.test)

trainX = agaricus.train$data
trainY = agaricus.train$label
testX = agaricus.test$data
testY = agaricus.test$label

dtrain <- xgb.DMatrix(trainX, label=trainY)
dtest <- xgb.DMatrix(testX, label=testY)

# note: for customized objective function, we leave objective as default
# note: what we are getting is margin value in prediction
# you must know what you are doing
param <- list(max_depth=2,eta=1,silent=1)
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2


# user define objective function, given prediction, return gradient and second order gradient
# this is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}

# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make buildin evalution metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the buildin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(metric = "error", value = err))
}


# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
@@ -1,32 +0,0 @@
#!/usr/bin/python
import sys
sys.path.append('../../wrapper')
import xgboost as xgb
##
# this script demonstrate how to fit generalized linear model in xgboost
# basically, we are using linear model, instead of tree for our boosters
##
dtrain = xgb.DMatrix('../data/agaricus.txt.train')
dtest = xgb.DMatrix('../data/agaricus.txt.test')
# change booster to gblinear, so that we are fitting a linear model
# alpha is the L1 regularizer
# lambda is the L2 regularizer
# you can also set lambda_bias which is L2 regularizer on the bias term
param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear',
'alpha': 0.0001, 'lambda': 1 }

# normally, you do not need to set eta (step_size)
# XGBoost uses a parallel coordinate descent algorithm (shotgun),
# there could be affection on convergence with parallelization on certain cases
# setting eta to be smaller value, e.g 0.5 can make the optimization more stable
# param['eta'] = 1

##
# the rest of settings are the same
##
watchlist = [(dtest,'eval'), (dtrain,'train')]
num_round = 4
bst = xgb.train(param, dtrain, num_round, watchlist)
preds = bst.predict(dtest)
labels = dtest.get_label()
print ('error=%f' % ( sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
@@ -1,5 +0,0 @@
#!/bin/bash
# todo
Rscript basic_walkthrough.R
Rscript custom_objective.R
Rscript boost_from_prediction.R