[R] adopt demos and vignettes to a more consistent parameter style

Vadim Khotilovich 2016-06-27 02:00:39 -05:00
parent a0aa305268
commit 3b6b344561
11 changed files with 59 additions and 59 deletions
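The pattern behind the changes is uniform: dotted parameter names become underscored ones (`max.depth` → `max_depth`, `eval.metric` → `eval_metric`, `with.stats` → `with_stats`, `early.stop.round` → `early_stopping_rounds`), `nround` becomes `nrounds`, and `sparse_matrix@Dimnames[[2]]` gives way to `colnames(sparse_matrix)`. A minimal before/after sketch of the style change, using the agaricus data that the demos below rely on:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
train <- agaricus.train

# old style (before this commit): dotted names, `nround`
# bst <- xgboost(data = train$data, label = train$label,
#                max.depth = 2, eta = 1, nround = 2,
#                objective = "binary:logistic")

# new style (after this commit): underscored names, `nrounds`
bst <- xgboost(data = train$data, label = train$label,
               max_depth = 2, eta = 1, nrounds = 2,
               objective = "binary:logistic")
```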

View File

@@ -1,7 +1,8 @@
 require(xgboost)
 require(methods)
 # we load in the agaricus dataset
-# In this example, we are aiming to predict whether a mushroom can be eaten
+# In this example, we are aiming to predict whether a mushroom is edible
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
@@ -15,33 +16,33 @@ class(train$data)
 # note: we are putting in sparse matrix here, xgboost naturally handles sparse input
 # use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
 print("Training xgboost with sparseMatrix")
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2,
                nthread = 2, objective = "binary:logistic")
 # alternatively, you can put in dense matrix, i.e. basic R-matrix
 print("Training xgboost with Matrix")
-bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2,
                nthread = 2, objective = "binary:logistic")
 # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features
 print("Training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2,
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2,
                objective = "binary:logistic")
 # Verbose = 0,1,2
 print("Train xgboost with verbose 0, no message")
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
                nthread = 2, objective = "binary:logistic", verbose = 0)
 print("Train xgboost with verbose 1, print evaluation metric")
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
                nthread = 2, objective = "binary:logistic", verbose = 1)
 print("Train xgboost with verbose 2, also print information about tree")
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2,
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2,
                nthread = 2, objective = "binary:logistic", verbose = 2)
 # you can also specify data as file path to a LibSVM format input
 # since we do not have this file with us, the following line is just for illustration
-# bst <- xgboost(data = 'agaricus.train.svm', max.depth = 2, eta = 1, nround = 2,objective = "binary:logistic")
+# bst <- xgboost(data = 'agaricus.train.svm', max_depth = 2, eta = 1, nrounds = 2,objective = "binary:logistic")
 #--------------------basic prediction using xgboost--------------
 # you can do prediction using the following line
@@ -77,19 +78,19 @@ watchlist <- list(train=dtrain, test=dtest)
 # to train with watchlist, use xgb.train, which contains more advanced features
 # watchlist allows us to monitor the evaluation result on all data in the list
 print("Train xgboost using xgb.train with watchlist")
-bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
                  nthread = 2, objective = "binary:logistic")
 # we can change evaluation metrics, or use multiple evaluation metrics
 print("train xgboost using xgb.train with watchlist, watch logloss and error")
-bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist,
-                 eval.metric = "error", eval.metric = "logloss",
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
+                 eval_metric = "error", eval_metric = "logloss",
                  nthread = 2, objective = "binary:logistic")
 # xgb.DMatrix can also be saved using xgb.DMatrix.save
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
-bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nround=2, watchlist=watchlist,
+bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nrounds=2, watchlist=watchlist,
                  nthread = 2, objective = "binary:logistic")
 # information can be extracted from xgb.DMatrix using getinfo
 label = getinfo(dtest, "label")
@@ -98,11 +99,11 @@ err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
 # You can dump the tree you learned using xgb.dump into a text file
-xgb.dump(bst, "dump.raw.txt", with.stats = T)
+xgb.dump(bst, "dump.raw.txt", with_stats = T)
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
+imp_matrix <- xgb.importance(feature_names = colnames(train$data), model = bst)
 print(imp_matrix)
 # Feature importance bar plot by gain
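The trailing comment points at a bar plot of importance by gain; a minimal sketch of that follow-up step, reusing the `imp_matrix` computed just above (older xgboost versions may additionally require the Ckmeans.1d.dp package for this plot):

```r
# bar plot of the features ordered by Gain
xgb.plot.importance(importance_matrix = imp_matrix)
```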

View File

@@ -11,8 +11,8 @@ watchlist <- list(eval = dtest, train = dtrain)
 #
 print('start running example to start from a initial prediction')
 # train xgboost for 1 round
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1,objective='binary:logistic')
-bst <- xgb.train( param, dtrain, 1, watchlist )
+param <- list(max_depth=2, eta=1, nthread = 2, silent=1, objective='binary:logistic')
+bst <- xgb.train(param, dtrain, 1, watchlist)
 # Note: we need the margin value instead of transformed prediction in set_base_margin
 # do predict with output_margin=TRUE, will always give you margin values before logistic transformation
 ptrain <- predict(bst, dtrain, outputmargin=TRUE)
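The two comments above describe the trick this demo builds on: the margin (pre-logistic) predictions are fed back into the training data before boosting continues. A hedged sketch of that next step, assuming the standard `setinfo()` setter and a `ptest` computed from `dtest` the same way as `ptrain`:

```r
# attach the margins as the starting point for the next boosting run
setinfo(dtrain, "base_margin", ptrain)
setinfo(dtest, "base_margin", ptest)
# one more round of training now starts from the previous model's output
bst <- xgb.train(param, dtrain, 1, watchlist)
```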

View File

@@ -65,11 +65,10 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 # Following is the same process as other demo
 cat("Learning...\n")
-bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
-               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
-# sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
-importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
+bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 9,
+               eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
+importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
 print(importance)
 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
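The `sparse_matrix@Dimnames[[2]]` → `colnames(sparse_matrix)` substitution made throughout this commit is purely cosmetic: for the Matrix package's sparse matrices, `colnames()` simply reads the second element of the `Dimnames` slot. A quick standalone check with a toy matrix:

```r
library(Matrix)
m <- sparseMatrix(i = c(1, 2), j = c(1, 2), x = c(1, 1),
                  dimnames = list(NULL, c("f1", "f2")))
identical(colnames(m), m@Dimnames[[2]])  # TRUE
```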

View File

@@ -6,7 +6,7 @@ dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 nround <- 2
-param <- list(max.depth=2,eta=1,silent=1,nthread = 2, objective='binary:logistic')
+param <- list(max_depth=2, eta=1, silent=1, nthread=2, objective='binary:logistic')
 cat('running cross validation\n')
 # do cross validation, this will print result out as
@@ -19,7 +19,7 @@ cat('running cross validation, disable standard deviation display\n')
 # [iteration] metric_name:mean_value+std_value
 # std_value is standard deviation of the metric
 xgb.cv(param, dtrain, nround, nfold=5,
-       metrics={'error'}, showsd = FALSE)
+       metrics='error', showsd = FALSE)
 ###
 # you can also do cross validation with cutomized loss function
@@ -40,12 +40,12 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
-param <- list(max.depth=2,eta=1,silent=1,
+param <- list(max_depth=2, eta=1, silent=1,
               objective = logregobj, eval_metric = evalerror)
 # train with customized objective
 xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)
 # do cross validation with prediction values for each fold
 res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
-res$dt
+res$evaluation_log
 length(res$pred)
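The last rename in this file is in the returned object rather than in a parameter: the per-round CV summary that used to be `res$dt` is now read from `res$evaluation_log`. A short sketch of how downstream code would consume the renamed field, reusing `param`, `dtrain` and `nround` from this demo:

```r
res <- xgb.cv(params = param, data = dtrain, nrounds = nround,
              nfold = 5, prediction = TRUE)
tail(res$evaluation_log, 1)   # last round of the per-fold metric summary (was res$dt)
length(res$pred)              # one out-of-fold prediction per training row
```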

View File

@@ -33,7 +33,7 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
-param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
+param <- list(max_depth=2, eta=1, nthread = 2, silent=1,
               objective=logregobj, eval_metric=evalerror)
 print ('start training with user customized objective')
 # training with customized objective, we can also do step by step training
@@ -57,7 +57,7 @@ logregobjattr <- function(preds, dtrain) {
   hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
-param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
+param <- list(max_depth=2, eta=1, nthread = 2, silent=1,
               objective=logregobjattr, eval_metric=evalerror)
 print ('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
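Both hunks in this file only touch the `param` lists, so the customized objective around them appears here only in fragments. For context, a self-contained sketch of the usual pattern (log-loss gradient and Hessian plus a plain error metric) — an illustration of the technique, not a verbatim copy of the demo:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest)

# objective: return first- and second-order gradients of the log loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))   # margins -> probabilities
  grad <- preds - labels
  hess <- preds * (1 - preds)
  list(grad = grad, hess = hess)
}

# evaluation metric: classification error on the sign of the margin
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  list(metric = "error", value = err)
}

param <- list(max_depth = 2, eta = 1, nthread = 2, silent = 1,
              objective = logregobj, eval_metric = evalerror)
bst <- xgb.train(param, dtrain, 2, watchlist)
```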

View File

@@ -7,7 +7,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
 # note: for customized objective function, we leave objective as default
 # note: what we are getting is margin value in prediction
 # you must know what you are doing
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1)
+param <- list(max_depth=2, eta=1, nthread = 2, silent=1)
 watchlist <- list(eval = dtest)
 num_round <- 20
 # user define objective function, given prediction, return gradient and second order gradient
@@ -34,7 +34,7 @@ print ('start training with early Stopping setting')
 bst <- xgb.train(param, dtrain, num_round, watchlist,
                  objective = logregobj, eval_metric = evalerror, maximize = FALSE,
-                 early.stop.round = 3)
+                 early_stopping_round = 3)
 bst <- xgb.cv(param, dtrain, num_round, nfold = 5,
               objective = logregobj, eval_metric = evalerror,
-              maximize = FALSE, early.stop.round = 3)
+              maximize = FALSE, early_stopping_rounds = 3)

View File

@@ -5,7 +5,7 @@ data(agaricus.test, package='xgboost')
 dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 2

View File

@@ -10,7 +10,7 @@ data(agaricus.test, package='xgboost')
 dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
 dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
-param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
+param <- list(max_depth=2, eta=1, silent=1, objective='binary:logistic')
 nround = 4
 # training the model for two rounds

View File

@@ -168,8 +168,8 @@ Build the model
 The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [Xgboost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)).
 ```{r}
-bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4,
-               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
+bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
+               eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic")
 ```
@@ -179,7 +179,7 @@ A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitti
 > Here you can see the numbers decrease until line 7 and then increase.
 >
-> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nround = 4`. I will let things like that because I don't really care for the purpose of this example :-)
+> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nrounds = 4`. I will let things like that because I don't really care for the purpose of this example :-)
 Feature importance
 ------------------
@@ -189,10 +189,10 @@ Feature importance
 ### Build the feature importance data.table
-In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
+Remember, each binary column corresponds to a single value of one of *categorical* features.
 ```{r}
-importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
 head(importance)
 ```
@@ -215,7 +215,7 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 ```{r}
-importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector)
 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
@@ -328,10 +328,10 @@ train <- agaricus.train
 test <- agaricus.test
 #Random Forest™ - 1000 trees
-bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nround = 1, objective = "binary:logistic")
+bst <- xgboost(data = train$data, label = train$label, max_depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nrounds = 1, objective = "binary:logistic")
 #Boosting - 3 rounds
-bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nround = 3, objective = "binary:logistic")
+bst <- xgboost(data = train$data, label = train$label, max_depth = 4, nrounds = 3, objective = "binary:logistic")
 ```
 > Note that the parameter `round` is set to `1`.

View File

@@ -84,8 +84,8 @@ data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1,
-               nround = 2, objective = "binary:logistic")
+bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
+               nrounds = 2, objective = "binary:logistic")
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
 pred <- predict(bst, test$data)
@@ -162,9 +162,9 @@ evalerror <- function(preds, dtrain) {
 dtest <- xgb.DMatrix(test$data, label = test$label)
 watchlist <- list(eval = dtest, train = dtrain)
-param <- list(max.depth = 2, eta = 1, silent = 1)
-bst <- xgb.train(param, dtrain, nround = 2, watchlist, logregobj, evalerror)
+param <- list(max_depth = 2, eta = 1, silent = 1)
+bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror)
 @
 The gradient and second order gradient is required for the output of customized

View File

@@ -147,12 +147,12 @@ In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore,
 We will train decision tree model using the following parameters:
 * `objective = "binary:logistic"`: we will train a binary classification model ;
-* `max.deph = 2`: the trees won't be deep, because our case is very simple ;
+* `max_depth = 2`: the trees won't be deep, because our case is very simple ;
 * `nthread = 2`: the number of cpu threads we are going to use;
-* `nround = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
+* `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction.
 ```{r trainingSparse, message=F, warning=F}
-bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+bstSparse <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 ```
 > More complex the relationship between your features and your `label` is, more passes you need.
@@ -164,7 +164,7 @@ bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta
 Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix.
 ```{r trainingDense, message=F, warning=F}
-bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 ```
 ##### xgb.DMatrix
@@ -173,7 +173,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
-bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
+bstDMatrix <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
 ```
 ##### Verbose option
@@ -184,17 +184,17 @@ One of the simplest way to see the training progress is to set the `verbose` opt
 ```{r trainingVerbose0, message=T, warning=F}
 # verbose = 0, no message
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 0)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
 ```
 ```{r trainingVerbose1, message=T, warning=F}
 # verbose = 1, print evaluation metric
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 1)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 1)
 ```
 ```{r trainingVerbose2, message=T, warning=F}
 # verbose = 2, also print information about tree
-bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
+bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 2)
 ```
 ## Basic prediction using XGBoost
@@ -287,10 +287,10 @@ For the purpose of this example, we use `watchlist` parameter. It is a list of `
 ```{r watchlist, message=F, warning=F}
 watchlist <- list(train=dtrain, test=dtest)
-bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic")
 ```
-**XGBoost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
+**XGBoost** has computed at each round the same average error metric than seen above (we set `nrounds` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
@@ -299,10 +299,10 @@ If with your own dataset you have not such results, you should think about how y
 For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
 ```{r watchlist2, message=F, warning=F}
-bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
+bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, eval_metric = "error", eval_metric = "logloss", objective = "binary:logistic")
 ```
-> `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
+> `eval_metric` allows us to monitor two new metrics for each round, `logloss` and `error`.
 ### Linear boosting
@@ -310,7 +310,7 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
 ```{r linearBoosting, message=F, warning=F}
-bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
+bst <- xgb.train(data=dtrain, booster = "gblinear", max_depth=2, nthread = 2, nrounds=2, watchlist=watchlist, eval_metric = "error", eval_metric = "logloss", objective = "binary:logistic")
 ```
 In this specific case, *linear boosting* gets sligtly better performance metrics than decision trees based algorithm.
@@ -328,7 +328,7 @@ Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome)
 xgb.DMatrix.save(dtrain, "dtrain.buffer")
 # to load it in, simply call xgb.DMatrix
 dtrain2 <- xgb.DMatrix("dtrain.buffer")
-bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
+bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic")
 ```
 ```{r DMatrixDel, include=FALSE}
@@ -363,7 +363,7 @@ xgb.plot.importance(importance_matrix = importance_matrix)
 You can dump the tree you learned using `xgb.dump` into a text file.
 ```{r dump, message=T, warning=F}
-xgb.dump(bst, with.stats = T)
+xgb.dump(bst, with_stats = T)
 ```
 You can plot the trees from your model using ```xgb.plot.tree``
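The vignette's last line mentions `xgb.plot.tree`; a minimal sketch of that call (assuming the `bst` and `train` objects from earlier in the vignette, and noting that the plot depends on the DiagrammeR package):

```r
# render the boosted trees with readable split labels
xgb.plot.tree(feature_names = colnames(train$data), model = bst)
```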