From 947afd7eace40c55384359b232051bd82fb733dc Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Sun, 17 May 2015 15:16:28 +0200 Subject: [PATCH 01/11] =?UTF-8?q?multi=20=1Btrees?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R-package/R/xgb.model.dt.tree.R | 5 ++-- R-package/R/xgb.plot.multi.tree.R | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 R-package/R/xgb.plot.multi.tree.R diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 7eea3dfcd..d68dbf5cd 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -96,13 +96,14 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model allTrees <- data.table() - anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + for(i in 1:n_round){ tree <- text[(position[i]+1):(position[i+1]-1)] # avoid tree made of a leaf only (no split) - if(length(tree) <2) next + if(length(tree) < 2) next treeID <- i-1 diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R new file mode 100644 index 000000000..314e2157a --- /dev/null +++ b/R-package/R/xgb.plot.multi.tree.R @@ -0,0 +1,42 @@ +library(stringr) +library(data.table) + + + +data(agaricus.train, package='xgboost') + +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). +#Each column of the sparse Matrix is a feature in one hot encoding format. +train <- agaricus.train + +bst <- xgboost(data = train$data, label = train$label, max.depth = 5, + eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") + +#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. +tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) + + +# first number of the path represents the tree, then the following numbers are related to the path to follow + +# root init +root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] + +precedent.nodes <- root.nodes + +while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { + yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] + no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)] + yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("-0") + no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("-1") + + tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos] + tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos] + precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) +} + +tree.matrix + + + From 1ea7f6f03355bac95dc999830923592e97f15c3b Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Sun, 17 May 2015 20:37:15 +0200 Subject: [PATCH 02/11] fix bug --- R-package/R/xgb.plot.multi.tree.R | 33 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R index 314e2157a..f61540dae 100644 --- a/R-package/R/xgb.plot.multi.tree.R +++ b/R-package/R/xgb.plot.multi.tree.R @@ -1,6 +1,6 @@ library(stringr) library(data.table) - +library(xgboost) data(agaricus.train, package='xgboost') @@ -10,8 +10,8 @@ data(agaricus.train, package='xgboost') #Each column of the sparse Matrix is a feature in one hot encoding format. 
train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 5, - eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +bst <- xgboost(data = train$data, label = train$label, max.depth = 3, + eta = 1, nthread = 2, nround = 4,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) @@ -21,22 +21,39 @@ tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) # root init root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] -tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes %>% str_replace("-", "_")] -precedent.nodes <- root.nodes +precedent.nodes <- root.nodes %>% str_replace("-", "_") while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)] - yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("-0") - no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("-1") + yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0") + no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1") tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos] tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos] precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) } -tree.matrix +tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")] +tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")] +tree.matrix[,ID:= Abs.Position] +tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] +tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] + +CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + + +yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") + +no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") + +path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n") +DiagrammeR::mermaid(path) + +# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf" +setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) From 936190c17c798e7365e63886ce79e77c3403342d Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 16 Jun 2015 21:38:14 +0200 Subject: [PATCH 03/11] slight update in documentation --- R-package/R/xgb.train.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index d75659737..23accef3a 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -43,7 +43,7 @@ #' \item \code{binary:logistic} logistic regression for binary classification. Output probability. #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. #' \item \code{num_class} set the number of classes. To use only with multiclass objectives. -#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}. +#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}. #' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } @@ -82,6 +82,7 @@ #' \itemize{ #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} +#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss} #' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. 
From ad2e93f6c5cf051eb5133f3c9f6564eae4c6505a Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 16 Jun 2015 21:39:31 +0200 Subject: [PATCH 04/11] multi tree update --- R-package/R/xgb.plot.multi.tree.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R index f61540dae..feb7e667e 100644 --- a/R-package/R/xgb.plot.multi.tree.R +++ b/R-package/R/xgb.plot.multi.tree.R @@ -10,8 +10,8 @@ data(agaricus.train, package='xgboost') #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 3, - eta = 1, nthread = 2, nround = 4,objective = "binary:logistic") +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 4, objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) @@ -21,9 +21,9 @@ tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) # root init root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] -tree.matrix[ID == root.nodes, Abs.Position:=root.nodes %>% str_replace("-", "_")] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] -precedent.nodes <- root.nodes %>% str_replace("-", "_") +precedent.nodes <- root.nodes while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] @@ -40,11 +40,16 @@ tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")] tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")] tree.matrix[,ID:= Abs.Position] +tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))] +keepN <- 3 +tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position] tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] +tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")] + CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" @@ -56,4 +61,4 @@ path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% pas DiagrammeR::mermaid(path) # path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf" -setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) +# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) From 0dfc44325291b7f2de73a936016c7c90ab787667 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 15 Jul 2015 15:59:36 +0200 Subject: [PATCH 05/11] New projection of all trees on one --- .../understandingXGBoostModel.html | 338 ++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 demo/kaggle-otto/understandingXGBoostModel.html diff --git a/demo/kaggle-otto/understandingXGBoostModel.html b/demo/kaggle-otto/understandingXGBoostModel.html new file mode 100644 index 000000000..abbfdb55b --- /dev/null +++ b/demo/kaggle-otto/understandingXGBoostModel.html @@ -0,0 +1,338 @@ + + + + + + + + + + + + + +Understanding XGBoost Model on Otto Dataset + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

1 Introduction

+

XGBoost is an implementation of the famous gradient boosting algorithm. This model is often described as a black box, meaning it works well but it is not trivial to understand how. Indeed, the model is made of hundreds (thousands?) of decision trees. You may wonder how a human could possibly get a general view of such a model.

+

While XGBoost is known for its speed and accurate predictive power, it also comes with various functions to help you understand the model. The purpose of this RMarkdown document is to demonstrate how easily we can leverage the functions already implemented in the XGBoost R package. Of course, everything shown below can be applied to the dataset you may have to manipulate at work or anywhere else!

+

First we will prepare the Otto dataset and train a model, then we will generate two visualisations to get a clue of what is important to the model, and finally we will see how we can leverage this information.

+
+
+

2 Preparation of the data

+

This part is based on the R tutorial example by Tong He

+

First, let’s load the packages and the dataset.

+
require(xgboost)
+
## Loading required package: xgboost
+
require(methods)
+require(data.table)
+
## Loading required package: data.table
+
require(magrittr)
+
## Loading required package: magrittr
+
train <- fread('data/train.csv', header = T, stringsAsFactors = F)
+test <- fread('data/test.csv', header=TRUE, stringsAsFactors = F)
+
+

magrittr and data.table are here to make the code cleaner and faster.

+
+

Let’s explore the dataset.

+
# Train dataset dimensions
+dim(train)
+
## [1] 61878    95
+
# Training content
+train[1:6,1:5, with =F]
+
##    id feat_1 feat_2 feat_3 feat_4
+## 1:  1      1      0      0      0
+## 2:  2      0      0      0      0
+## 3:  3      0      0      0      0
+## 4:  4      1      0      0      1
+## 5:  5      0      0      0      0
+## 6:  6      2      1      0      0
+
# Test dataset dimensions
+dim(train)
+
## [1] 61878    95
+
# Test content
+test[1:6,1:5, with =F]
+
##    id feat_1 feat_2 feat_3 feat_4
+## 1:  1      0      0      0      0
+## 2:  2      2      2     14     16
+## 3:  3      0      1     12      1
+## 4:  4      0      0      0      1
+## 5:  5      1      0      0      1
+## 6:  6      0      0      0      0
+
+

We only display the first 6 rows and first 5 columns for convenience.

+
+

Each column represents a feature measured by an integer. Each row is an Otto product.

+

Obviously the first column (ID) doesn’t contain any useful information.

+

To let the algorithm focus on real stuff, we will delete it.

+
# Delete ID column in training dataset
+train[, id := NULL]
+
+# Delete ID column in testing dataset
+test[, id := NULL]
+

According to its description, the Otto challenge is a multiclass classification challenge. We need to extract the labels (here the names of the different classes) from the dataset. Since we only have two files (test and training), it seems logical that the training file contains the classes we are looking for. Usually the labels are in the first or the last column. We already know what is in the first column, so let's check the content of the last one.

+
# Check the content of the last column
+train[1:6, ncol(train), with  = F]
+
##     target
+## 1: Class_1
+## 2: Class_1
+## 3: Class_1
+## 4: Class_1
+## 5: Class_1
+## 6: Class_1
+
# Save the name of the last column
+nameLastCol <- names(train)[ncol(train)]
+

The classes are provided as character strings in the 94th column, called target. As you may know, XGBoost doesn't support anything other than numbers. So we will convert the classes to integers. Moreover, according to the documentation, they should start at 0.

+

For that purpose, we will:

+
    +
  • extract the target column
  • +
  • remove Class_ from each class name
  • +
  • convert to integer
  • +
  • subtract 1 from the new value
  • +
+
# Convert from classes to numbers
+y <- train[, nameLastCol, with = F][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
+
+# Display the first 5 levels
+y[1:5]
+
## [1] 0 0 0 0 0
+

We remove the label column from the training dataset, otherwise XGBoost would use it to guess the labels!

+
train[, nameLastCol:=NULL, with = F]
+

data.table is an awesome implementation of data.frame; unfortunately, it is not a format natively supported by XGBoost. We need to convert both datasets (training and test) to numeric matrix format.

+
trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
+testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+
+
+

3 Model training

+

Before training, we will use cross-validation to evaluate our error rate.

+

Basically XGBoost will divide the training data into nfold parts, then retain the first part as test data and train on the remaining parts. It will then reintegrate the first part, retain the second part, train again, and so on…

+

You can look at the function documentation for more information.

+
numberOfClasses <- max(y) + 1
+
+param <- list("objective" = "multi:softprob",
+              "eval_metric" = "mlogloss",
+              "num_class" = numberOfClasses)
+
+cv.nround <- 5
+cv.nfold <- 3
+
+bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, 
+                nfold = cv.nfold, nrounds = cv.nround)
+
## [0]  train-mlogloss:1.540431+0.002213    test-mlogloss:1.554751+0.001620
+## [1]  train-mlogloss:1.282145+0.002432    test-mlogloss:1.305809+0.000891
+## [2]  train-mlogloss:1.112233+0.003468    test-mlogloss:1.143170+0.001239
+## [3]  train-mlogloss:0.990676+0.003071    test-mlogloss:1.027884+0.002789
+## [4]  train-mlogloss:0.898998+0.003624    test-mlogloss:0.941951+0.002773
+
+

As we can see, the error rate is low on the test dataset (for a model trained in about 5 minutes).

+
+

Finally, we are ready to train the real model!!!

+
nround = 50
+bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nround)
+
## [0]  train-mlogloss:1.539812
+## [1]  train-mlogloss:1.284372
+## [2]  train-mlogloss:1.116199
+## [3]  train-mlogloss:0.997417
+## [4]  train-mlogloss:0.908790
+## [5]  train-mlogloss:0.837503
+## [6]  train-mlogloss:0.780621
+## [7]  train-mlogloss:0.735461
+## [8]  train-mlogloss:0.696942
+## [9]  train-mlogloss:0.666732
+## [10] train-mlogloss:0.641023
+## [11] train-mlogloss:0.618737
+## [12] train-mlogloss:0.599404
+## [13] train-mlogloss:0.583204
+## [14] train-mlogloss:0.568396
+## [15] train-mlogloss:0.555462
+## [16] train-mlogloss:0.543350
+## [17] train-mlogloss:0.532383
+## [18] train-mlogloss:0.522704
+## [19] train-mlogloss:0.513795
+## [20] train-mlogloss:0.506245
+## [21] train-mlogloss:0.497973
+## [22] train-mlogloss:0.491395
+## [23] train-mlogloss:0.484097
+## [24] train-mlogloss:0.477012
+## [25] train-mlogloss:0.470934
+## [26] train-mlogloss:0.466095
+## [27] train-mlogloss:0.461394
+## [28] train-mlogloss:0.456613
+## [29] train-mlogloss:0.450938
+## [30] train-mlogloss:0.446367
+## [31] train-mlogloss:0.442480
+## [32] train-mlogloss:0.437640
+## [33] train-mlogloss:0.433672
+## [34] train-mlogloss:0.428959
+## [35] train-mlogloss:0.424677
+## [36] train-mlogloss:0.421388
+## [37] train-mlogloss:0.418912
+## [38] train-mlogloss:0.415505
+## [39] train-mlogloss:0.411825
+## [40] train-mlogloss:0.407472
+## [41] train-mlogloss:0.404232
+## [42] train-mlogloss:0.401184
+## [43] train-mlogloss:0.397714
+## [44] train-mlogloss:0.394451
+## [45] train-mlogloss:0.392290
+## [46] train-mlogloss:0.389948
+## [47] train-mlogloss:0.387899
+## [48] train-mlogloss:0.385107
+## [49] train-mlogloss:0.382828
+
+
+

4 Model understanding

+
+

4.1 Feature importance

+

So far, we have built a model trained over 50 boosting rounds.

+

To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, each observation is an Otto product described by its features).

+

Each division operation is called a split.

+

Each group at each division level is called a branch and the deepest level is called a leaf.

+

In the final model, these leaves are supposed to be as pure as possible for each tree, meaning in our case that each leaf should contain only one class of Otto product (of course it is not entirely true, but that's what we try to achieve with a minimum number of splits).

+

Not all splits are equally important. Basically the first split of a tree will have more impact on the purity than, for instance, the deepest split. Intuitively, we understand that the first split does most of the work, and the following splits focus on smaller parts of the dataset which have been misclassified by the earlier splits.

+

In the same way, in boosting we try to optimize the misclassification at each round (it is called the loss). So the first tree will do most of the work and the following trees will focus on the remaining parts, those not correctly learned by the previous trees.

+

The improvement brought by each split can be measured; it is called the gain.
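For reference, the gain of a candidate split can be written following the notation of the XGBoost paper, with $G_L, G_R$ (resp. $H_L, H_R$) the sums of first-order (resp. second-order) gradients in the left and right children, and $\lambda$, $\gamma$ the regularisation parameters:

$$Gain = \frac{1}{2}\left[\frac{G_L^2}{H_L+\lambda} + \frac{G_R^2}{H_R+\lambda} - \frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma$$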

+

Each split is done on one feature only at one value.

+

Let’s see what the model looks like.

+
model <- xgb.dump(bst, with.stats = T)
+model[1:10]
+
##  [1] "booster[0]"                                                         
+##  [2] "0:[f16<1.5] yes=1,no=2,missing=1,gain=309.719,cover=12222.8"        
+##  [3] "1:[f29<26.5] yes=3,no=4,missing=3,gain=161.964,cover=11424"         
+##  [4] "3:[f77<2.5] yes=7,no=8,missing=7,gain=106.092,cover=11416.3"        
+##  [5] "7:[f52<12.5] yes=13,no=14,missing=13,gain=43.1389,cover=11211.9"    
+##  [6] "13:[f76<1.5] yes=25,no=26,missing=25,gain=37.407,cover=11143.5"     
+##  [7] "25:[f16<2.00001] yes=49,no=50,missing=50,gain=36.3329,cover=10952.1"
+##  [8] "49:leaf=-0.0905567,cover=1090.77"                                   
+##  [9] "50:leaf=-0.148413,cover=9861.33"                                    
+## [10] "26:[f83<26] yes=51,no=52,missing=52,gain=167.766,cover=191.407"
+
+

For convenience, we are displaying the first 10 lines of the model only.

+
+

Clearly, it is not easy to understand what it means.

+

Basically each line represents a node: its ID within the tree, the feature used, the value where it splits, and information regarding the next branches (left, right, and which branch to take when the value of this feature is missing).
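As a quick illustration of these fields, such a line can be split by hand; the regular expression below is only a sketch matching the dump format shown above (the xgb.model.dt.tree function does this parsing for you):

line <- "0:[f16<1.5] yes=1,no=2,missing=1,gain=309.719,cover=12222.8"
fields <- stringr::str_match(line,
  "^(\\d+):\\[(f\\d+)<([^\\]]+)\\] yes=(\\d+),no=(\\d+),missing=(\\d+),gain=([^,]+),cover=(.+)$")
# fields[2:9]: node id, feature, split value, yes branch, no branch, missing branch, gain, cover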

+

Fortunately, XGBoost offers a better representation: feature importance.

+

Feature importance is computed by averaging the gain of each feature over all splits and all trees.

+

Then we can use the function xgb.plot.importance.

+
# Get the feature real names
+names <- dimnames(trainMatrix)[[2]]
+
+# Compute feature importance matrix
+importance_matrix <- xgb.importance(names, model = bst)
+
+# Nice graph
+xgb.plot.importance(importance_matrix[1:10,])
+

+
+

To make it understandable, we first extract the column names from the matrix.

+
+
+
+

4.2 Interpretation

+

In the feature importance plot above, we can see the 10 most important features.

+

This function gives a color to each bar. These colors represent groups of features: basically, a k-means clustering is applied to group the features by importance.
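As a rough sketch of the idea only (the exact clustering routine used by xgb.plot.importance may differ; we assume here that the importance table exposes a Gain column, as returned by xgb.importance):

# group the features into 3 clusters by gain; the number of clusters is an arbitrary choice
clusters <- kmeans(importance_matrix$Gain, centers = 3)
importance_matrix[, group := clusters$cluster]
importance_matrix[1:10, .(Feature, Gain, group)]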

+

From here you can take several actions. For instance, you can remove the less important features (feature selection process, sketched just below), or go deeper into the interaction between the most important features and the labels.
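A minimal sketch of such a feature selection step (keeping 10 features is an arbitrary choice made for illustration, not a recommendation):

# keep only the 10 most important features and retrain on the reduced matrix
top_features <- importance_matrix[1:10, Feature]
trainMatrixSmall <- trainMatrix[, top_features]
bst_small <- xgboost(param = param, data = trainMatrixSmall, label = y, nrounds = nround)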

+

Or you can simply reason about why these features are so important (in the Otto challenge we can't go this way because there is not enough information).

+
+
+

4.3 Tree graph

+

Feature importance gives you feature weight information but not interactions between features.

+

The XGBoost R package has another useful function for that.

+

Please scroll to the right to see the tree.

+
xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
+

+

+

We are just displaying the first two trees here.

+

On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the interaction between features is complicated. Besides, XGBoost generates k trees at each round for a k-class classification problem. Therefore the two trees illustrated here are trying to classify data into different classes.
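A small sketch to make this point concrete; it assumes (stated here as an assumption) that with multi:softprob each boosting round produces one tree per class, stored consecutively:

tree_info <- xgb.model.dt.tree(names, model = bst)
tree_ids <- as.numeric(unique(tree_info$Tree))
# close to nround * numberOfClasses (trees reduced to a single leaf may be skipped)
length(tree_ids)
# under the ordering assumption, tree i (0-based) contributes to class i %% numberOfClasses
table(tree_ids %% numberOfClasses)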

+
+
+
+

5 Going deeper

+

There are 4 documents you may also be interested in:

+ +
+ + + + + + + + From 951ba267cf0c04cfa0ff275573ee7aa5c310fddd Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 22 Jul 2015 23:50:54 +0200 Subject: [PATCH 06/11] move plot file --- R-package/{R => demo}/xgb.plot.multi.tree.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R-package/{R => demo}/xgb.plot.multi.tree.R (100%) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/demo/xgb.plot.multi.tree.R similarity index 100% rename from R-package/R/xgb.plot.multi.tree.R rename to R-package/demo/xgb.plot.multi.tree.R From 635645c65093edbfd2cf1f96e84c274770cb2d27 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 7 Nov 2015 21:00:02 +0100 Subject: [PATCH 07/11] Rewrite tree plot function Replace Mermaid by GraphViz --- R-package/R/xgb.plot.tree.R | 59 ++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 5e359219a..9977748db 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -9,17 +9,14 @@ #' @importFrom data.table := #' @importFrom data.table copy #' @importFrom magrittr %>% -#' @importFrom magrittr not -#' @importFrom magrittr add -#' @importFrom stringr str_extract -#' @importFrom stringr str_split -#' @importFrom stringr str_extract -#' @importFrom stringr str_trim +#' @importFrom DiagrammeR create_nodes +#' @importFrom DiagrammeR create_edges +#' @importFrom DiagrammeR create_graph +#' @importFrom DiagrammeR render_graph #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. -#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. #' @param width the width of the diagram in pixels. #' @param height the height of the diagram in pixels. #' @@ -36,7 +33,7 @@ #' } #' #' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. +#' It uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose. 
#' #' @examples #' data(agaricus.train, package='xgboost') @@ -53,12 +50,7 @@ #' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -#' -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){ - - if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) { - stop("style: Has to be a character vector of size 1.") - } +xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, width = NULL, height = NULL){ if (!class(model) %in% c("xgb.Booster", "NULL")) { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") @@ -78,19 +70,38 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU allTrees[Feature != "Leaf" ,noPath := paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] - if(is.null(CSSstyle)){ - CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" - } + allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)] + allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] + allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] + + nodes <- create_nodes(nodes = allTrees[,ID], + label = allTrees[,label], + #type = c("lower", "lower", "upper", "upper"), + style = "filled", + color = "DimGray", + fillcolor= allTrees[,filledcolor], + shape = allTrees[,shape], + data = allTrees[,Feature], + fontname = "Helvetica" + ) + + edges <- create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2), + to = allTrees[Feature != "Leaf", c(Yes, No)], + label = allTrees[Feature != "Leaf", paste("<",Split)] %>% c(rep("",nrow(allTrees[Feature != "Leaf"]))), + color = "DimGray", + arrowsize = "1.5", + arrowhead = "vee", + fontname = "Helvetica", + rel = "leading_to") - yes <- allTrees[Feature != "Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") - - no <- allTrees[Feature != "Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") - - path <- allTrees[Feature != "Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") - DiagrammeR::mermaid(path, width, height) + graph <- create_graph(nodes_df = nodes, + edges_df = edges, + graph_attrs = "rankdir = LR") + + render_graph(graph, width = width, height = height) } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... 
-globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", ".")) +globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor")) From 0052b193cf47a2482e209dad8b90c41393b3f85f Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 7 Nov 2015 21:01:28 +0100 Subject: [PATCH 08/11] Update lib version dependencies (for DiagrammeR mainly) Fix @export tag in each R file (for Roxygen 5, otherwise it doesn't work anymore) Regerate Roxygen doc --- R-package/DESCRIPTION | 23 +++++----- R-package/NAMESPACE | 6 ++- R-package/R/getinfo.xgb.DMatrix.R | 1 - R-package/R/predict.xgb.Booster.R | 1 - R-package/R/setinfo.xgb.DMatrix.R | 1 - R-package/R/slice.xgb.DMatrix.R | 1 - R-package/R/xgb.DMatrix.R | 1 - R-package/R/xgb.DMatrix.save.R | 1 - R-package/R/xgb.cv.R | 1 - R-package/R/xgb.dump.R | 1 - R-package/R/xgb.load.R | 1 - R-package/R/xgb.save.R | 1 - R-package/R/xgb.save.raw.R | 1 - R-package/R/xgb.train.R | 1 - R-package/R/xgboost.R | 1 - R-package/man/agaricus.test.Rd | 8 ++-- R-package/man/agaricus.train.Rd | 8 ++-- R-package/man/getinfo.Rd | 2 +- R-package/man/nrow-xgb.DMatrix-method.Rd | 3 +- R-package/man/predict-xgb.Booster-method.Rd | 12 +++--- .../man/predict-xgb.Booster.handle-method.Rd | 2 +- R-package/man/setinfo.Rd | 2 +- R-package/man/slice.Rd | 2 +- R-package/man/xgb.DMatrix.Rd | 4 +- R-package/man/xgb.DMatrix.save.Rd | 2 +- R-package/man/xgb.cv.Rd | 20 ++++----- R-package/man/xgb.dump.Rd | 18 ++++---- R-package/man/xgb.importance.Rd | 13 +++--- R-package/man/xgb.load.Rd | 4 +- R-package/man/xgb.model.dt.tree.Rd | 9 ++-- R-package/man/xgb.plot.importance.Rd | 9 ++-- R-package/man/xgb.plot.tree.Rd | 19 ++++---- R-package/man/xgb.save.Rd | 4 +- R-package/man/xgb.save.raw.Rd | 4 +- R-package/man/xgb.train.Rd | 43 ++++++++++--------- R-package/man/xgboost.Rd | 19 ++++---- 36 files changed, 123 insertions(+), 126 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 59728f3c2..b4201e793 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -3,16 +3,16 @@ Type: Package Title: Extreme Gradient Boosting Version: 0.4-2 Date: 2015-08-01 -Author: Tianqi Chen , Tong He , Michael Benesty +Author: Tianqi Chen , Tong He , + Michael Benesty Maintainer: Tong He -Description: Extreme Gradient Boosting, which is an - efficient implementation of gradient boosting framework. - This package is its R interface. The package includes efficient - linear model solver and tree learning algorithms. The package can automatically - do parallel computation on a single machine which could be more than 10 times faster - than existing gradient boosting packages. It supports various - objective functions, including regression, classification and ranking. The - package is made to be extensible, so that users are also allowed to define +Description: Extreme Gradient Boosting, which is an efficient implementation + of gradient boosting framework. This package is its R interface. The package + includes efficient linear model solver and tree learning algorithms. The package + can automatically do parallel computation on a single machine which could be + more than 10 times faster than existing gradient boosting packages. It supports + various objective functions, including regression, classification and ranking. + The package is made to be extensible, so that users are also allowed to define their own objectives easily. 
License: Apache License (== 2.0) | file LICENSE URL: https://github.com/dmlc/xgboost @@ -21,7 +21,7 @@ VignetteBuilder: knitr Suggests: knitr, ggplot2 (>= 1.0.0), - DiagrammeR (>= 0.6), + DiagrammeR (>= 0.8.1), Ckmeans.1d.dp (>= 3.3.1), vcd (>= 1.3), testthat @@ -30,6 +30,7 @@ Depends: Imports: Matrix (>= 1.1-0), methods, - data.table (>= 1.9.4), + data.table (>= 1.9.6), magrittr (>= 1.5), stringr (>= 0.6.2) +RoxygenNote: 5.0.0 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a4f07799a..f3a7390b7 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(getinfo) export(setinfo) @@ -21,6 +21,10 @@ exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) +importFrom(DiagrammeR,create_edges) +importFrom(DiagrammeR,create_graph) +importFrom(DiagrammeR,create_nodes) +importFrom(DiagrammeR,render_graph) importFrom(Matrix,cBind) importFrom(Matrix,colSums) importFrom(Matrix,sparseVector) diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R index dc734bce1..3000a1e7d 100644 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ b/R-package/R/getinfo.xgb.DMatrix.R @@ -23,7 +23,6 @@ setClass('xgb.DMatrix') #' stopifnot(all(labels2 == 1-labels)) #' @rdname getinfo #' @export -#' getinfo <- function(object, ...){ UseMethod("getinfo") } diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 432581e76..abdb94e75 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -29,7 +29,6 @@ setClass("xgb.Booster", #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' pred <- predict(bst, test$data) #' @export -#' setMethod("predict", signature = "xgb.Booster", definition = function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { diff --git a/R-package/R/setinfo.xgb.DMatrix.R b/R-package/R/setinfo.xgb.DMatrix.R index 4bee161b7..427de08d4 100644 --- a/R-package/R/setinfo.xgb.DMatrix.R +++ b/R-package/R/setinfo.xgb.DMatrix.R @@ -21,7 +21,6 @@ #' stopifnot(all(labels2 == 1-labels)) #' @rdname setinfo #' @export -#' setinfo <- function(object, ...){ UseMethod("setinfo") } diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index 3b025e1dd..4626c2b4d 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -13,7 +13,6 @@ setClass('xgb.DMatrix') #' dsub <- slice(dtrain, 1:3) #' @rdname slice #' @export -#' slice <- function(object, ...){ UseMethod("slice") } diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 20a3276c0..c34c65d95 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -17,7 +17,6 @@ #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export -#' xgb.DMatrix <- function(data, info = list(), missing = NA, ...) 
{ if (typeof(data) == "character") { handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index 7a9ac611d..63a0be691 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -12,7 +12,6 @@ #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export -#' xgb.DMatrix.save <- function(DMatrix, fname) { if (typeof(fname) != "character") { stop("xgb.save: fname must be character") diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 5f964c4f8..89edbeb63 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -90,7 +90,6 @@ #' max.depth =3, eta = 1, objective = "binary:logistic") #' print(history) #' @export -#' xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA, prediction = FALSE, showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L, diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 856ec0888..b39359abd 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -36,7 +36,6 @@ #' # print the model without saving it to a file #' print(xgb.dump(bst)) #' @export -#' xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("model: argument must be type xgb.Booster") diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 2a2598dd8..03d6a4842 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -15,7 +15,6 @@ #' bst <- xgb.load('xgb.model') #' pred <- predict(bst, test$data) #' @export -#' xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index ad3cc8b12..7d595ddc6 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -16,7 +16,6 @@ #' bst <- xgb.load('xgb.model') #' pred <- predict(bst, test$data) #' @export -#' xgb.save <- function(model, fname) { if (typeof(fname) != "character") { stop("xgb.save: fname must be character") diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index e885e6e7e..e61303add 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -16,7 +16,6 @@ #' bst <- xgb.load(raw) #' pred <- predict(bst, test$data) #' @export -#' xgb.save.raw <- function(model) { if (class(model) == "xgb.Booster"){ model <- model$handle diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 07bf74589..ffc94e34f 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -120,7 +120,6 @@ #' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) #' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) #' @export -#' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, early.stop.round = NULL, maximize = NULL, diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 122d2f492..92637bb43 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -58,7 +58,6 @@ #' pred <- predict(bst, test$data) #' #' @export -#' xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index c54e30ba3..52ff08f86 100644 --- 
a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.test} \alias{agaricus.test} \title{Test part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 1611 +\format{A list containing a label vector, and a dgCMatrix object with 1611 rows and 126 variables} \usage{ data(agaricus.test) @@ -24,8 +24,8 @@ This data set includes the following fields: \references{ https://archive.ics.uci.edu/ml/datasets/Mushroom -Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository -[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. } \keyword{datasets} diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 955257148..e27d3ac25 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.train} \alias{agaricus.train} \title{Training part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 6513 +\format{A list containing a label vector, and a dgCMatrix object with 6513 rows and 127 variables} \usage{ data(agaricus.train) @@ -24,8 +24,8 @@ This data set includes the following fields: \references{ https://archive.ics.uci.edu/ml/datasets/Mushroom -Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository -[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 
} \keyword{datasets} diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 87c507566..f8b4f6b99 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/getinfo.xgb.DMatrix.R \docType{methods} \name{getinfo} diff --git a/R-package/man/nrow-xgb.DMatrix-method.Rd b/R-package/man/nrow-xgb.DMatrix-method.Rd index f86709afd..1fd52b9c1 100644 --- a/R-package/man/nrow-xgb.DMatrix-method.Rd +++ b/R-package/man/nrow-xgb.DMatrix-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nrow.xgb.DMatrix.R \docType{methods} \name{nrow,xgb.DMatrix-method} @@ -18,5 +18,6 @@ data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) stopifnot(nrow(dtrain) == nrow(train$data)) + } diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index 682df1f4b..13f37802e 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/predict.xgb.Booster.R \docType{methods} \name{predict,xgb.Booster-method} @@ -11,19 +11,19 @@ \arguments{ \item{object}{Object of class "xgb.Boost"} -\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} -\item{missing}{Missing is only used when input is dense matrix, pick a float +\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} \item{outputmargin}{whether the prediction should be shown in the original -value of sum of functions, when outputmargin=TRUE, the prediction is +value of sum of functions, when outputmargin=TRUE, the prediction is untransformed margin value. In logistic regression, outputmargin=T will output value before logistic transformation.} \item{ntreelimit}{limit number of trees used in prediction, this parameter is -only valid for gbtree, but not for gblinear. set it to be value bigger +only valid for gbtree, but not for gblinear. set it to be value bigger than 0. It will use all trees by default.} \item{predleaf}{whether predict leaf index instead. 
If set to TRUE, the output will be a matrix object.} @@ -36,7 +36,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") pred <- predict(bst, test$data) } diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd index 7eb237a94..34454e555 100644 --- a/R-package/man/predict-xgb.Booster.handle-method.Rd +++ b/R-package/man/predict-xgb.Booster.handle-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/predict.xgb.Booster.handle.R \docType{methods} \name{predict,xgb.Booster.handle-method} diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index edf5284bd..cb939721e 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/setinfo.xgb.DMatrix.R \docType{methods} \name{setinfo} diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 20a78a383..b17722115 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/slice.xgb.DMatrix.R \docType{methods} \name{slice} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 9432ce319..2e892cc6d 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.R \name{xgb.DMatrix} \alias{xgb.DMatrix} @@ -7,7 +7,7 @@ xgb.DMatrix(data, info = list(), missing = NA, ...) } \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character indicating the data file.} \item{info}{a list of information of the xgb.DMatrix object} diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 3ba36f55a..78348c3fa 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.save.R \name{xgb.DMatrix.save} \alias{xgb.DMatrix.save} diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index f918a003c..f3a1fcfd1 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.cv.R \name{xgb.cv} \alias{xgb.cv} @@ -40,7 +40,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item{showsd}{\code{boolean}, whether show standard deviation of cross validation} -\item{metrics,}{list of evaluation metrics to be used in corss validation, +\item{metrics, }{list of evaluation metrics to be used in corss validation, when it is not specified, the evaluation metric is chosen according to objective function. 
Possible options are: \itemize{ @@ -51,11 +51,11 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item \code{merror} Exact matching error, used to evaluate multi-class classification }} -\item{obj}{customized objective function. Returns gradient and second order +\item{obj}{customized objective function. Returns gradient and second order gradient with given prediction and dtrain.} -\item{feval}{custimized evaluation function. Returns -\code{list(metric='metric-name', value='metric-value')} with given +\item{feval}{custimized evaluation function. Returns +\code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} \item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}} @@ -67,12 +67,12 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. - \code{maximize=TRUE} means the larger the evaluation score the better.} +\code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} } @@ -89,9 +89,9 @@ If \code{prediction = FALSE}, just a \code{data.table} with each mean and standa The cross valudation function of xgboost } \details{ -The original sample is randomly partitioned into \code{nfold} equal size subsamples. +The original sample is randomly partitioned into \code{nfold} equal size subsamples. -Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. +Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data. diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index eaf1ca521..cafa8ac14 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} @@ -11,17 +11,17 @@ xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE) \item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} -\item{fmap}{feature map file representing the type of feature. -Detailed description could be found at +\item{fmap}{feature map file representing the type of feature. +Detailed description could be found at \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. 
See demo/ for walkthrough example in R, and -\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} +\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} for example Format.} -\item{with.stats}{whether dump statistics of splits - When this option is on, the model dump comes with two additional statistics: - gain is the approximate loss function gain we get in each split; - cover is the sum of second order gradient in each node.} +\item{with.stats}{whether dump statistics of splits +When this option is on, the model dump comes with two additional statistics: +gain is the approximate loss function gain we get in each split; +cover is the sum of second order gradient in each node.} } \value{ if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. @@ -34,7 +34,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") # save the model in file 'xgb.model.dump' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE) diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 11740e4ac..a1ce89d4f 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.importance.R \name{xgb.importance} \alias{xgb.importance} @@ -24,7 +24,7 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL, A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. } \description{ -Read a xgboost model text dump. +Read a xgboost model text dump. Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). } \details{ @@ -32,7 +32,7 @@ This is the function to understand the model trained (and through your model, yo Results are returned for both linear and tree models. -\code{data.table} is returned by the function. +\code{data.table} is returned by the function. There are 3 columns : \itemize{ \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. @@ -53,12 +53,12 @@ If you need to remember one thing only: until you want to leave us early, don't \examples{ data(agaricus.train, package='xgboost') -# Both dataset are list with two items, a sparse matrix and labels -# (labels = outcome column which will be learned). +# Both dataset are list with two items, a sparse matrix and labels +# (labels = outcome column which will be learned). # Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") # train$data@Dimnames[[2]] represents the column names of the sparse matrix. 
@@ -66,5 +66,6 @@ xgb.importance(train$data@Dimnames[[2]], model = bst) # Same thing with co-occurence computation this time xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label) + } diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1331ff249..92576ad95 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.load.R \name{xgb.load} \alias{xgb.load} @@ -17,7 +17,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index c53ed057f..9a3efc39f 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.model.dt.tree.R \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} @@ -45,15 +45,16 @@ The content of the \code{data.table} is organised that way: \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) + } diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 4147278b9..de70624cb 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.plot.importance.R \name{xgb.plot.importance} \alias{xgb.plot.importance} @@ -25,16 +25,17 @@ In particular you may want to override the title of the graph. To do so, add \co \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #train$data@Dimnames[[2]] represents the column names of the sparse matrix. 
importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst) xgb.plot.importance(importance_matrix) + } diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 4501d87ce..f34e75bf9 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -1,11 +1,11 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.plot.tree.R \name{xgb.plot.tree} \alias{xgb.plot.tree} \title{Plot a boosted tree model} \usage{ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, - n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL) + n_first_tree = NULL, width = NULL, height = NULL) } \arguments{ \item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} @@ -16,8 +16,6 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} -\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} - \item{width}{the width of the diagram in pixels.} \item{height}{the height of the diagram in pixels.} @@ -26,7 +24,7 @@ xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, A \code{DiagrammeR} of the model. } \description{ -Read a tree model text dump. +Read a tree model text dump. Plotting only works for boosted tree model (not linear model). } \details{ @@ -36,23 +34,24 @@ The content of each node is organised that way: \item \code{feature} value ; \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ; \item \code{gain}: metric the importance of the node in the model. -} +} Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. +It uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose. } \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. 
xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) + } diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index eca097fac..db335105c 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.save.R \name{xgb.save} \alias{xgb.save} @@ -19,7 +19,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 79c356c0f..1e9f4a4db 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.save.raw.R \name{xgb.save.raw} \alias{xgb.save.raw} @@ -18,7 +18,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") raw <- xgb.save.raw(bst) bst <- xgb.load(raw) diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 15a0b0ba7..50bfb46d0 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.train.R \name{xgb.train} \alias{xgb.train} @@ -10,7 +10,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, save_name = "xgboost.model", ...) } \arguments{ -\item{params}{the list of parameters. +\item{params}{the list of parameters. 1. General Parameters @@ -18,30 +18,30 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 } - + 2. Booster Parameters 2.1. Parameter for Tree Booster \itemize{ \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 - \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. \item \code{max_depth} maximum depth of a tree. Default: 6 \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. 
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 - \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 } 2.2. Parameter for Linear Booster - + \itemize{ \item \code{lambda} L2 regularization term on weights. Default: 0 \item \code{lambda_bias} L2 regularization term on bias. Default: 0 \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 } -3. Task Parameters +3. Task Parameters \itemize{ \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: @@ -51,7 +51,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{binary:logistic} logistic regression for binary classification. Output probability. \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. \item \code{num_class} set the number of classes. To use only with multiclass objectives. - \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}. + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}. \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. } @@ -64,25 +64,25 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item{nrounds}{the max number of iterations} \item{watchlist}{what information should be printed when \code{verbose=1} or - \code{verbose=2}. Watchlist is used to specify validation set monitoring - during training. For example user can specify - watchlist=list(validation1=mat1, validation2=mat2) to watch - the performance of each round's model on mat1 and mat2} +\code{verbose=2}. Watchlist is used to specify validation set monitoring +during training. 
For example user can specify
+  watchlist=list(validation1=mat1, validation2=mat2) to watch
+  the performance of each round's model on mat1 and mat2}
 
-\item{obj}{customized objective function. Returns gradient and second order 
+\item{obj}{customized objective function. Returns gradient and second order
 gradient with given prediction and dtrain,}
 
-\item{feval}{custimized evaluation function. Returns 
-\code{list(metric='metric-name', value='metric-value')} with given 
+\item{feval}{customized evaluation function. Returns
+\code{list(metric='metric-name', value='metric-value')} with given
 prediction and dtrain,}
 
-\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print 
+\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print
 information of performance. If 2, xgboost will print information of both}
 
 \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.}
 
-\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. 
-If set to an integer \code{k}, training with a validation set will stop if the performance 
+\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered.
+If set to an integer \code{k}, training with a validation set will stop if the performance
 keeps getting worse consecutively for \code{k} rounds.}
 
 \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
@@ -98,24 +98,25 @@ keeps getting worse consecutively for \code{k} rounds.}
 An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface.
 }
 \details{
-This is the training function for \code{xgboost}. 
+This is the training function for \code{xgboost}.
 
 It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}), therefore it is more flexible than \code{\link{xgboost}} function.
 
-Parallelization is automatically enabled if \code{OpenMP} is present. 
+Parallelization is automatically enabled if \code{OpenMP} is present.
 Number of threads can also be manually specified via \code{nthread} parameter.
 
 \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter.
 \itemize{
    \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
    \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
+   \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss}
    \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances.
    \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}.
    \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
    \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
 }
- 
+
 Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}. 
This function only accepts an \code{\link{xgb.DMatrix}} object as the input. diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 79c33007e..e31e5da43 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \name{xgboost} \alias{xgboost} @@ -10,13 +10,13 @@ xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, save_name = "xgboost.model", ...) } \arguments{ -\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} \item{label}{the response variable. User should not set this field, if data is local data file or \code{xgb.DMatrix}.} -\item{missing}{Missing is only used when input is dense matrix, pick a float +\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.} \item{weight}{a vector indicating the weight for each row of the input.} @@ -34,21 +34,21 @@ Commonly used ones are: \item \code{max.depth} maximum depth of the tree \item \code{nthread} number of thread used in training, if not set, all threads are used } - + Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list. - + See also \code{demo/} for walkthrough example in R.} \item{nrounds}{the max number of iterations} -\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print +\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both performance and construction progress information} \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. @@ -75,8 +75,9 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") pred <- predict(bst, test$data) + } From 77ae180d3d6619ac664760d5f41d38e0f58d6b59 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 7 Nov 2015 21:46:08 +0100 Subject: [PATCH 09/11] Remove DiagrammeR dependency to make travis happy... 
--- R-package/NAMESPACE | 4 ---- R-package/R/xgb.plot.tree.R | 4 ---- 2 files changed, 8 deletions(-) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index f3a7390b7..3fb05b7d8 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -21,10 +21,6 @@ exportMethods(predict) import(methods) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) -importFrom(DiagrammeR,create_edges) -importFrom(DiagrammeR,create_graph) -importFrom(DiagrammeR,create_nodes) -importFrom(DiagrammeR,render_graph) importFrom(Matrix,cBind) importFrom(Matrix,colSums) importFrom(Matrix,sparseVector) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 9977748db..475be7231 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -9,10 +9,6 @@ #' @importFrom data.table := #' @importFrom data.table copy #' @importFrom magrittr %>% -#' @importFrom DiagrammeR create_nodes -#' @importFrom DiagrammeR create_edges -#' @importFrom DiagrammeR create_graph -#' @importFrom DiagrammeR render_graph #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. From 996645dc17b7a46470f7c653de093db89467032d Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 7 Nov 2015 22:04:54 +0100 Subject: [PATCH 10/11] Change the way functions are called --- R-package/R/xgb.plot.tree.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 475be7231..10ca42bc7 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -70,7 +70,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] - nodes <- create_nodes(nodes = allTrees[,ID], + nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID], label = allTrees[,label], #type = c("lower", "lower", "upper", "upper"), style = "filled", @@ -81,7 +81,7 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU fontname = "Helvetica" ) - edges <- create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2), + edges <- DiagrammeR::create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2), to = allTrees[Feature != "Leaf", c(Yes, No)], label = allTrees[Feature != "Leaf", paste("<",Split)] %>% c(rep("",nrow(allTrees[Feature != "Leaf"]))), color = "DimGray", @@ -90,11 +90,11 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU fontname = "Helvetica", rel = "leading_to") - graph <- create_graph(nodes_df = nodes, + graph <- DiagrammeR::create_graph(nodes_df = nodes, edges_df = edges, graph_attrs = "rankdir = LR") - render_graph(graph, width = width, height = height) + DiagrammeR::render_graph(graph, width = width, height = height) } # Avoid error messages during CRAN check. 
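The two commits above drop the hard DiagrammeR imports from NAMESPACE and switch xgb.plot.tree() to namespace-qualified DiagrammeR:: calls, so the package no longer has to import DiagrammeR at load time. A minimal sketch of that calling pattern follows; it is illustrative only, assumes DiagrammeR is declared under Suggests, and the explicit availability check is an addition of this sketch, not something these patches introduce:

# fail early with a clear message when the optional dependency is missing
if (!requireNamespace("DiagrammeR", quietly = TRUE)) {
  stop("DiagrammeR is required to plot trees. Please install it first.")
}

# build a two-node graph with namespace-qualified calls, as xgb.plot.tree now does
nodes <- DiagrammeR::create_nodes(nodes = c("0-0", "0-1"), label = c("split", "leaf"))
edges <- DiagrammeR::create_edges(from = "0-0", to = "0-1", rel = "leading_to")
graph <- DiagrammeR::create_graph(nodes_df = nodes, edges_df = edges,
                                  graph_attrs = "rankdir = LR")
DiagrammeR::render_graph(graph)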
From 7cb34e3ad678200d8b2dc47b702d70601b41c6f6 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 7 Nov 2015 22:24:37 +0100 Subject: [PATCH 11/11] Fix some bug + improve display + code clean --- R-package/R/xgb.plot.tree.R | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 10ca42bc7..63bebf6cf 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -4,16 +4,13 @@ #' Plotting only works for boosted tree model (not linear model). #' #' @importFrom data.table data.table -#' @importFrom data.table set -#' @importFrom data.table rbindlist #' @importFrom data.table := -#' @importFrom data.table copy #' @importFrom magrittr %>% #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. -#' @param width the width of the diagram in pixels. +#' @param width the width of the diagram in pixels. #' @param height the height of the diagram in pixels. #' #' @return A \code{DiagrammeR} of the model. @@ -62,22 +59,18 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) } - allTrees[Feature != "Leaf" ,yesPath := paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] - - allTrees[Feature != "Leaf" ,noPath := paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] - allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)] allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] - nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID], - label = allTrees[,label], - #type = c("lower", "lower", "upper", "upper"), + # rev is used to put the first tree on top. + nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID] %>% rev, + label = allTrees[,label] %>% rev, style = "filled", color = "DimGray", - fillcolor= allTrees[,filledcolor], - shape = allTrees[,shape], - data = allTrees[,Feature], + fillcolor= allTrees[,filledcolor] %>% rev, + shape = allTrees[,shape] %>% rev, + data = allTrees[,Feature] %>% rev, fontname = "Helvetica" ) @@ -100,4 +93,4 @@ xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NU # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor")) +globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label"))
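After the full series, the plotting entry point keeps the signature documented above (feature_names, filename_dump, model, n_first_tree, width, height). Below is a minimal usage sketch on the bundled agaricus data, mirroring the examples in the man pages; the width and height values are arbitrary choices and the call assumes the patched package is installed:

library(xgboost)

data(agaricus.train, package='xgboost')
train <- agaricus.train

# train a small model; the sparse matrix column names are used as feature names
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

# render the trees through DiagrammeR; the first tree is drawn on top
xgb.plot.tree(train$data@Dimnames[[2]], model = bst, width = 800, height = 600)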