From 947afd7eace40c55384359b232051bd82fb733dc Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Sun, 17 May 2015 15:16:28 +0200 Subject: [PATCH 1/6] =?UTF-8?q?multi=20=1Btrees?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R-package/R/xgb.model.dt.tree.R | 5 ++-- R-package/R/xgb.plot.multi.tree.R | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 R-package/R/xgb.plot.multi.tree.R diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 7eea3dfcd..d68dbf5cd 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -96,13 +96,14 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model allTrees <- data.table() - anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + for(i in 1:n_round){ tree <- text[(position[i]+1):(position[i+1]-1)] # avoid tree made of a leaf only (no split) - if(length(tree) <2) next + if(length(tree) < 2) next treeID <- i-1 diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R new file mode 100644 index 000000000..314e2157a --- /dev/null +++ b/R-package/R/xgb.plot.multi.tree.R @@ -0,0 +1,42 @@ +library(stringr) +library(data.table) + + + +data(agaricus.train, package='xgboost') + +#Both dataset are list with two items, a sparse matrix and labels +#(labels = outcome column which will be learned). +#Each column of the sparse Matrix is a feature in one hot encoding format. +train <- agaricus.train + +bst <- xgboost(data = train$data, label = train$label, max.depth = 5, + eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") + +#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. +tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) + + +# first number of the path represents the tree, then the following numbers are related to the path to follow + +# root init +root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] + +precedent.nodes <- root.nodes + +while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { + yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] + no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)] + yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("-0") + no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("-1") + + tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos] + tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos] + precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) +} + +tree.matrix + + + From 1ea7f6f03355bac95dc999830923592e97f15c3b Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Sun, 17 May 2015 20:37:15 +0200 Subject: [PATCH 2/6] fix bug --- R-package/R/xgb.plot.multi.tree.R | 33 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R index 314e2157a..f61540dae 100644 --- a/R-package/R/xgb.plot.multi.tree.R +++ b/R-package/R/xgb.plot.multi.tree.R @@ -1,6 +1,6 @@ library(stringr) library(data.table) - +library(xgboost) data(agaricus.train, package='xgboost') @@ -10,8 +10,8 @@ data(agaricus.train, package='xgboost') #Each column of the sparse Matrix is a feature in one hot encoding format. 
train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 5, - eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") +bst <- xgboost(data = train$data, label = train$label, max.depth = 3, + eta = 1, nthread = 2, nround = 4,objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) @@ -21,22 +21,39 @@ tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) # root init root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] -tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes %>% str_replace("-", "_")] -precedent.nodes <- root.nodes +precedent.nodes <- root.nodes %>% str_replace("-", "_") while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] no.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(No)] - yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("-0") - no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("-1") + yes.nodes.abs.pos <- yes.row.nodes[, Abs.Position] %>% paste0("_0") + no.nodes.abs.pos <- no.row.nodes[, Abs.Position] %>% paste0("_1") tree.matrix[ID == yes.row.nodes[, Yes], Abs.Position := yes.nodes.abs.pos] tree.matrix[ID == no.row.nodes[, No], Abs.Position := no.nodes.abs.pos] precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) } -tree.matrix +tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")] +tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")] +tree.matrix[,ID:= Abs.Position] +tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] +tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] + +CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" + + +yes <- tree.matrix[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") + +no <- tree.matrix[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") + +path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = "\n") %>% paste("graph LR", .,collapse = "", sep = "\n") %>% paste(CSSstyle, yes, no, sep = "\n") +DiagrammeR::mermaid(path) + +# path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf" +setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) From 936190c17c798e7365e63886ce79e77c3403342d Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 16 Jun 2015 21:38:14 +0200 Subject: [PATCH 3/6] slight update in documentation --- R-package/R/xgb.train.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index d75659737..23accef3a 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -43,7 +43,7 @@ #' \item \code{binary:logistic} logistic regression for binary classification. Output probability. #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. #' \item \code{num_class} set the number of classes. To use only with multiclass objectives. -#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}. +#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}. #' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } @@ -82,6 +82,7 @@ #' \itemize{ #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} +#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss} #' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. 
From ad2e93f6c5cf051eb5133f3c9f6564eae4c6505a Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Tue, 16 Jun 2015 21:39:31 +0200 Subject: [PATCH 4/6] multi tree update --- R-package/R/xgb.plot.multi.tree.R | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/R/xgb.plot.multi.tree.R index f61540dae..feb7e667e 100644 --- a/R-package/R/xgb.plot.multi.tree.R +++ b/R-package/R/xgb.plot.multi.tree.R @@ -10,8 +10,8 @@ data(agaricus.train, package='xgboost') #Each column of the sparse Matrix is a feature in one hot encoding format. train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 3, - eta = 1, nthread = 2, nround = 4,objective = "binary:logistic") +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 4, objective = "binary:logistic") #agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) @@ -21,9 +21,9 @@ tree.matrix <- xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) # root init root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] -tree.matrix[ID == root.nodes, Abs.Position:=root.nodes %>% str_replace("-", "_")] +tree.matrix[ID == root.nodes, Abs.Position:=root.nodes] -precedent.nodes <- root.nodes %>% str_replace("-", "_") +precedent.nodes <- root.nodes while(tree.matrix[,sum(is.na(Abs.Position))] > 0) { yes.row.nodes <- tree.matrix[Abs.Position %in% precedent.nodes & !is.na(Yes)] @@ -40,11 +40,16 @@ tree.matrix[!is.na(Yes),Yes:= paste0(Abs.Position, "_0")] tree.matrix[!is.na(No),No:= paste0(Abs.Position, "_1")] tree.matrix[,ID:= Abs.Position] +tree.matrix[,Abs.Position:=substr(Abs.Position, nchar(Tree)+2, nchar(Abs.Position))] +keepN <- 3 +tree.matrix <- tree.matrix[,sum(Quality),by = .(Abs.Position, Feature)][order(-V1)][,.(paste0(Feature[1:min(length(Feature), keepN)], " (", V1[1:min(length(V1), keepN)], ")") %>% paste0(collapse = "\n")), by=Abs.Position] tree.matrix[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] tree.matrix[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] +tree.matrix[, Yes:= Abs.Position %>% paste0("_0")][, No:= Abs.Position %>% paste0("_1")] + CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px\nclassDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" @@ -56,4 +61,4 @@ path <- tree.matrix[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% pas DiagrammeR::mermaid(path) # path <- "graph LR;0-0-0(spore-print-color=green)-->|>= 2.00001|0-0-0-1>Leaf" -setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) +# setnames(tree.matrix, old = c("ID", "Yes", "No"), c("nodes", "edge_from", "edge_to")) From 0dfc44325291b7f2de73a936016c7c90ab787667 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 15 Jul 2015 15:59:36 +0200 Subject: [PATCH 5/6] New projection of all trees on one --- .../understandingXGBoostModel.html | 338 ++++++++++++++++++ 1 file changed, 338 insertions(+) create mode 100644 demo/kaggle-otto/understandingXGBoostModel.html diff --git a/demo/kaggle-otto/understandingXGBoostModel.html b/demo/kaggle-otto/understandingXGBoostModel.html new file mode 100644 index 000000000..abbfdb55b --- /dev/null +++ b/demo/kaggle-otto/understandingXGBoostModel.html @@ -0,0 +1,338 @@ + + + + + + + + + + + + + +Understanding XGBoost Model on Otto Dataset + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

1 Introduction

+

XGBoost is an implementation of the famous gradient boosting algorithm. This model is often described as a black box, meaning it works well but it is not trivial to understand how. Indeed, the model is made of hundreds (thousands?) of decision trees. You may wonder how a human could possibly get a general view of such a model.

+

While XGBoost is known for its speed and accurate predictive power, it also comes with various functions to help you understand the model. The purpose of this RMarkdown document is to demonstrate how easily we can leverage the functions already implemented in the XGBoost R package. Of course, everything shown below can be applied to the dataset you may have to work with at your job or anywhere else!

+

First we will prepare the Otto dataset and train a model, then we will generate two visualisations to get a clue of what is important to the model, and finally we will see how we can leverage this information.

+
+
+

2 Preparation of the data

+

This part is based on the R tutorial example by Tong He

+

First, let’s load the packages and the dataset.

+
require(xgboost)
+
## Loading required package: xgboost
+
require(methods)
+require(data.table)
+
## Loading required package: data.table
+
require(magrittr)
+
## Loading required package: magrittr
+
train <- fread('data/train.csv', header = T, stringsAsFactors = F)
+test <- fread('data/test.csv', header=TRUE, stringsAsFactors = F)
+
+

magrittr and data.table are here to make the code cleaner and faster to write.

+
+

Let’s explore the dataset.

+
# Train dataset dimensions
+dim(train)
+
## [1] 61878    95
+
# Training content
+train[1:6,1:5, with =F]
+
##    id feat_1 feat_2 feat_3 feat_4
+## 1:  1      1      0      0      0
+## 2:  2      0      0      0      0
+## 3:  3      0      0      0      0
+## 4:  4      1      0      0      1
+## 5:  5      0      0      0      0
+## 6:  6      2      1      0      0
+
# Test dataset dimensions
+dim(train)
+
## [1] 61878    95
+
# Test content
+test[1:6,1:5, with =F]
+
##    id feat_1 feat_2 feat_3 feat_4
+## 1:  1      0      0      0      0
+## 2:  2      2      2     14     16
+## 3:  3      0      1     12      1
+## 4:  4      0      0      0      1
+## 5:  5      1      0      0      1
+## 6:  6      0      0      0      0
+
+

We only display the first 6 rows and the first 5 columns for convenience.

+
+

Each column represents a feature measured by an integer. Each row is an Otto product.

+

Obviously the first column (ID) doesn’t contain any useful information.

+

To let the algorithm focus on real stuff, we will delete it.

+
# Delete ID column in training dataset
+train[, id := NULL]
+
+# Delete ID column in testing dataset
+test[, id := NULL]
+

According to its description, the Otto challenge is a multiclass classification challenge. We need to extract the labels (here the names of the different classes) from the dataset. We only have two files (test and training); it seems logical that the training file contains the classes we are looking for. Usually the labels are in the first or the last column. We already know what is in the first column, so let's check the content of the last one.

+
# Check the content of the last column
+train[1:6, ncol(train), with  = F]
+
##     target
+## 1: Class_1
+## 2: Class_1
+## 3: Class_1
+## 4: Class_1
+## 5: Class_1
+## 6: Class_1
+
# Save the name of the last column
+nameLastCol <- names(train)[ncol(train)]
+

The classes are provided as character strings in the 94th column, called target. As you may know, XGBoost doesn't support anything other than numbers. So we will convert the classes to integers. Moreover, according to the documentation, they should start at 0.

+

For that purpose, we will:

+
  • extract the target column
  • remove Class_ from each class name
  • convert to integer
  • subtract 1 from the new value
+
# Convert from classes to numbers
+y <- train[, nameLastCol, with = F][[1]] %>% gsub('Class_','',.) %>% {as.integer(.) -1}
+
+# Display the first 5 levels
+y[1:5]
+
## [1] 0 0 0 0 0
+

We remove the label column from the training dataset, otherwise XGBoost would use it to guess the labels!

+
train[, nameLastCol:=NULL, with = F]
+

data.table is an awesome implementation of data.frame; unfortunately it is not a format supported natively by XGBoost. We need to convert both datasets (training and test) to numeric matrix format.

+
trainMatrix <- train[,lapply(.SD,as.numeric)] %>% as.matrix
+testMatrix <- test[,lapply(.SD,as.numeric)] %>% as.matrix
+
+
+

3 Model training

+

Before the actual learning, we will use cross validation to estimate our error rate.

+

Basically XGBoost will divide the training data in nfold parts, retain the first part to use as test data and perform a training on the rest. Then it will reintegrate the first part, retain the second part, do a training, and so on…
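To make the idea concrete, here is a rough sketch of what one such split could look like in plain R (only an illustration of the principle, with made-up variable names; this is not what xgb.cv does internally):

# Made-up illustration: split the row indices into 3 folds and hold one out
set.seed(1)
folds <- split(sample(nrow(trainMatrix)), rep(1:3, length.out = nrow(trainMatrix)))
heldOut  <- folds[[1]]                                     # rows kept aside as test data for this round
learnIdx <- setdiff(seq_len(nrow(trainMatrix)), heldOut)   # rows used for training this round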

+

You can look at the function documentation for more information.

+
numberOfClasses <- max(y) + 1
+
+param <- list("objective" = "multi:softprob",
+              "eval_metric" = "mlogloss",
+              "num_class" = numberOfClasses)
+
+cv.nround <- 5
+cv.nfold <- 3
+
+bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, 
+                nfold = cv.nfold, nrounds = cv.nround)
+
## [0]  train-mlogloss:1.540431+0.002213    test-mlogloss:1.554751+0.001620
+## [1]  train-mlogloss:1.282145+0.002432    test-mlogloss:1.305809+0.000891
+## [2]  train-mlogloss:1.112233+0.003468    test-mlogloss:1.143170+0.001239
+## [3]  train-mlogloss:0.990676+0.003071    test-mlogloss:1.027884+0.002789
+## [4]  train-mlogloss:0.898998+0.003624    test-mlogloss:0.941951+0.002773
+
+

As we can see, the error rate is low on the held-out data (for a model trained in about 5 minutes).

+
+

Finally, we are ready to train the real model!!!

+
nround = 50
+bst = xgboost(param=param, data = trainMatrix, label = y, nrounds=nround)
+
## [0]  train-mlogloss:1.539812
+## [1]  train-mlogloss:1.284372
+## [2]  train-mlogloss:1.116199
+## [3]  train-mlogloss:0.997417
+## [4]  train-mlogloss:0.908790
+## [5]  train-mlogloss:0.837503
+## [6]  train-mlogloss:0.780621
+## [7]  train-mlogloss:0.735461
+## [8]  train-mlogloss:0.696942
+## [9]  train-mlogloss:0.666732
+## [10] train-mlogloss:0.641023
+## [11] train-mlogloss:0.618737
+## [12] train-mlogloss:0.599404
+## [13] train-mlogloss:0.583204
+## [14] train-mlogloss:0.568396
+## [15] train-mlogloss:0.555462
+## [16] train-mlogloss:0.543350
+## [17] train-mlogloss:0.532383
+## [18] train-mlogloss:0.522704
+## [19] train-mlogloss:0.513795
+## [20] train-mlogloss:0.506245
+## [21] train-mlogloss:0.497973
+## [22] train-mlogloss:0.491395
+## [23] train-mlogloss:0.484097
+## [24] train-mlogloss:0.477012
+## [25] train-mlogloss:0.470934
+## [26] train-mlogloss:0.466095
+## [27] train-mlogloss:0.461394
+## [28] train-mlogloss:0.456613
+## [29] train-mlogloss:0.450938
+## [30] train-mlogloss:0.446367
+## [31] train-mlogloss:0.442480
+## [32] train-mlogloss:0.437640
+## [33] train-mlogloss:0.433672
+## [34] train-mlogloss:0.428959
+## [35] train-mlogloss:0.424677
+## [36] train-mlogloss:0.421388
+## [37] train-mlogloss:0.418912
+## [38] train-mlogloss:0.415505
+## [39] train-mlogloss:0.411825
+## [40] train-mlogloss:0.407472
+## [41] train-mlogloss:0.404232
+## [42] train-mlogloss:0.401184
+## [43] train-mlogloss:0.397714
+## [44] train-mlogloss:0.394451
+## [45] train-mlogloss:0.392290
+## [46] train-mlogloss:0.389948
+## [47] train-mlogloss:0.387899
+## [48] train-mlogloss:0.385107
+## [49] train-mlogloss:0.382828
+
+
+

4 Model understanding

+
+

4.1 Feature importance

+

So far, we have built a model made of 50 rounds of boosted trees.

+

To build a tree, the dataset is divided recursively several times. At the end of the process you get groups of observations (here, each observation is an Otto product described by its features).

+

Each division operation is called a split.

+

Each group at each division level is called a branch and the deepest level is called a leaf.

+

In the final model, these leaves are supposed to be as pure as possible for each tree, meaning in our case that each leaf should contain products of one Otto class only (of course it is not entirely true, but that's what we try to achieve in a minimum number of splits).

+

Not all splits are equally important. Basically the first split of a tree will have more impact on the purity than, for instance, the deepest split. Intuitively, we understand that the first split does most of the work, and the following splits focus on smaller parts of the dataset which have been misclassified by the earlier splits.

+

In the same way, in boosting we try to optimize the misclassification at each round (this is called the loss). So the first tree will do most of the work and the following trees will focus on what remains, on the parts not correctly learned by the previous trees.

+

The improvement brought by each split can be measured; it is called the gain.
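For reference, here is a rough sketch of how this gain is computed, written as an R function (based on the regularised objective described in the XGBoost documentation; the function and argument names are made up for illustration, GL/HL and GR/HR standing for the sums of first and second order gradients on the left and right branches):

# Sketch of the split gain of the regularised objective: score of the left child
# plus score of the right child, minus the score of the parent, minus the cost gamma
# of adding one more leaf
split.gain <- function(GL, HL, GR, HR, lambda = 1, gamma = 0) {
  0.5 * (GL^2 / (HL + lambda) + GR^2 / (HR + lambda) -
         (GL + GR)^2 / (HL + HR + lambda)) - gamma
}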

+

Each split is done on one feature only at one value.

+

Let’s see what the model looks like.

+
model <- xgb.dump(bst, with.stats = T)
+model[1:10]
+
##  [1] "booster[0]"                                                         
+##  [2] "0:[f16<1.5] yes=1,no=2,missing=1,gain=309.719,cover=12222.8"        
+##  [3] "1:[f29<26.5] yes=3,no=4,missing=3,gain=161.964,cover=11424"         
+##  [4] "3:[f77<2.5] yes=7,no=8,missing=7,gain=106.092,cover=11416.3"        
+##  [5] "7:[f52<12.5] yes=13,no=14,missing=13,gain=43.1389,cover=11211.9"    
+##  [6] "13:[f76<1.5] yes=25,no=26,missing=25,gain=37.407,cover=11143.5"     
+##  [7] "25:[f16<2.00001] yes=49,no=50,missing=50,gain=36.3329,cover=10952.1"
+##  [8] "49:leaf=-0.0905567,cover=1090.77"                                   
+##  [9] "50:leaf=-0.148413,cover=9861.33"                                    
+## [10] "26:[f83<26] yes=51,no=52,missing=52,gain=167.766,cover=191.407"
+
+

For convenience, we are displaying the first 10 lines of the model only.

+
+

Clearly, it is not easy to understand what it means.

+

Basically each line represents a split node: there is the node ID, the feature and value it splits on, and information regarding the next branches (where to go when the split condition is true, when it is false, and when the value for this feature is N/A), plus the gain and cover statistics.
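As an illustration, the second line of the dump above can be read like this (annotations added as comments):

# "0:[f16<1.5] yes=1,no=2,missing=1,gain=309.719,cover=12222.8"
# 0:           node ID inside the current tree (booster[0])
# [f16<1.5]    split on feature f16 at the value 1.5
# yes=1,no=2   go to node 1 when f16 < 1.5, to node 2 otherwise
# missing=1    go to node 1 when the value of f16 is missing
# gain, cover  improvement brought by the split, and weighted number of observations in the node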

+

Fortunately, XGBoost offers a better representation: feature importance.

+

Feature importance is about averaging the gain of each feature over all the splits and all the trees.
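To see where these numbers come from, here is a rough sketch of the same computation done by hand on the table returned by xgb.model.dt.tree (only an illustration, with a made-up tree.dt variable name; xgb.importance also normalises the values, so the numbers will not match exactly):

tree.dt <- xgb.model.dt.tree(dimnames(trainMatrix)[[2]], model = bst)
# sum the gain (Quality column) of every split, grouped by the feature it uses
tree.dt[Feature != "Leaf", .(TotalGain = sum(Quality)), by = Feature][order(-TotalGain)]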

+

Then we can use the function xgb.plot.importance.

+
# Get the feature real names
+names <- dimnames(trainMatrix)[[2]]
+
+# Compute feature importance matrix
+importance_matrix <- xgb.importance(names, model = bst)
+
+# Nice graph
+xgb.plot.importance(importance_matrix[1:10,])
+

+
+

To make it understandable, we first extract the column names from the matrix.

+
+
+
+

4.2 Interpretation

+

In the feature importance plot above, we can see the 10 most important features.

+

This function gives a color to each bar. These colors represent groups of features. Basically a k-means clustering is applied to group the features by importance.
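A rough sketch of that grouping done by hand (the number of clusters, here 2, is an arbitrary choice for the illustration, and we assume the importance matrix has a Gain column as returned by xgb.importance):

# cluster the features on their Gain value and keep the cluster label next to each feature
clusters <- kmeans(importance_matrix[, Gain], centers = 2)
importance_matrix[, Cluster := clusters$cluster]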

+

From here you can take several actions. For instance you can remove the less important features (feature selection process), or go deeper into the interaction between the most important features and the labels.
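For instance, here is a rough sketch of such a feature selection step, keeping only the most important features before retraining (the cut-off of 20 features and the new variable names are arbitrary choices for the illustration):

# keep the 20 most important features and retrain on the reduced matrix
topFeatures <- importance_matrix[1:20, Feature]
trainMatrixSmall <- trainMatrix[, topFeatures]
bstSmall <- xgboost(param = param, data = trainMatrixSmall, label = y, nrounds = nround)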

+

Or you can just reason about why these features are so important (in the Otto challenge we can't go this way because there is not enough information about them).

+
+
+

4.3 Tree graph

+

Feature importance gives you information about the weight of each feature, but not about the interactions between features.

+

The XGBoost R package has another useful function for that.

+

Please scroll to the right to see the trees.

+
xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
+

+

+

We are just displaying the first two trees here.

+

On simple models, the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the interaction between features is complicated. Besides, XGBoost generates k trees at each round for a k-class classification problem. Therefore the two trees illustrated here each try to recognise a different class.
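A quick way to check this on our model (a small sketch reusing xgb.model.dt.tree; with 9 classes and 50 rounds we expect up to 9 * 50 = 450 trees, a few less if some trees are reduced to a single leaf):

tree.dt <- xgb.model.dt.tree(dimnames(trainMatrix)[[2]], model = bst)
tree.dt[, length(unique(Tree))]   # number of trees stored in the model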

+
+
+
+

5 Going deeper

+

There are 4 documents you may also be interested in:

+ +
+ + + + + + + + From 951ba267cf0c04cfa0ff275573ee7aa5c310fddd Mon Sep 17 00:00:00 2001 From: pommedeterresautee Date: Wed, 22 Jul 2015 23:50:54 +0200 Subject: [PATCH 6/6] move plot file --- R-package/{R => demo}/xgb.plot.multi.tree.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R-package/{R => demo}/xgb.plot.multi.tree.R (100%) diff --git a/R-package/R/xgb.plot.multi.tree.R b/R-package/demo/xgb.plot.multi.tree.R similarity index 100% rename from R-package/R/xgb.plot.multi.tree.R rename to R-package/demo/xgb.plot.multi.tree.R