From 8e3c25ed33b26f57f2e265385b0a87cf7d1a989f Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Mon, 16 Feb 2015 22:35:01 +0100
Subject: [PATCH 01/14] css improvement

---
 R-package/vignettes/vignette.css | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/R-package/vignettes/vignette.css b/R-package/vignettes/vignette.css
index 3d143d299..49be24033 100644
--- a/R-package/vignettes/vignette.css
+++ b/R-package/vignettes/vignette.css
@@ -129,7 +129,7 @@ code {
   font-family: Consolas, Monaco, Andale Mono, monospace;
   line-height: 1.5;
   font-size: 15px;
-  background: #CDCDCD;
+  background: #F8F8F8;
   border-radius: 4px;
   padding: 5px;
   display: inline-block;
@@ -137,10 +137,14 @@ code {
   white-space: pre-wrap;
 }
 
+blockquote code {
+  background: #CDCDCD;
+  color: #606AAA;
+}
+
 code.r, code.cpp {
   display: block;
   word-wrap: break-word;
-  background: #F8F8F8;
   border: 1px solid #606AAA;
 }
 
@@ -159,7 +163,7 @@ blockquote {
   max-width: 500px;
 }
 
-blockquote cite {
+blockquote cite {
   font-size:14px;
   line-height:10px;
   color:#bfbfbf;

From 2e391ed0ee71003d88d9d8e7d420004db7b1ffc8 Mon Sep 17 00:00:00 2001
From: pommedeterresautee
Date: Mon, 16 Feb 2015 22:43:12 +0100
Subject: [PATCH 02/14] text refactor

---
 R-package/vignettes/discoverYourData.Rmd | 9 +++++++--
 R-package/vignettes/vignette.css         | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 016bfb69b..6ac84b8ce 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -39,6 +39,7 @@ Sometimes the dataset we have to work on have *categorical* data.
 A *categorical* variable is one which have a fixed number of different values. By exemple, if for each observation a variable called *Colour* can have only *red*, *blue* or *green* as value, it is a *categorical* variable.
 
 > In *R*, *categorical* variable is called `factor`.
+>
 > Type `?factor` in console for more information.
 
 In this demo we will see how to transform a dense dataframe (dense = few zero in the matrix) with *categorical* variables to a very sparse matrix (sparse = lots of zero in the matrix) of `numeric` features before analyzing these data in **Xgboost**.
@@ -65,8 +66,10 @@ str(df)
 ```
 
 > 2 columns have `factor` type, one has `ordinal` type.
-> `ordinal` variable is a categorical variable with values wich can be ordered
-> Here: `None` > `Some` > `Marked`.
+>
+> An `ordinal` variable can take a limited number of values, and these values can be ordered.
+>
+> `Marked > Some > None`
 
 Let's add some new *categorical* features to see if it helps.
 
@@ -158,6 +161,7 @@ print(importance)
 ```
 
 > The column `Gain` provide the information we are looking for.
+>
 > As you can see, features are classified by `Gain`.
 
 `Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there was some wrongly classified elements, after adding the split on this feature, there are two new branches, and each of these branch is more accurate (one branch saying if your observation is on this branch then it should be classified as 1, and the other branch saying the exact opposite, both new branches being more accurate than the one before the split).
@@ -166,6 +170,7 @@ print(importance)
 
 `Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
+
 Plotting the feature importance
 -------------------------------
 
diff --git a/R-package/vignettes/vignette.css b/R-package/vignettes/vignette.css
index 49be24033..15c1a2057 100644
--- a/R-package/vignettes/vignette.css
+++ b/R-package/vignettes/vignette.css
@@ -126,7 +126,7 @@ pre {
 }
 
 code {
-  font-family: Consolas, Monaco, Andale Mono, monospace;
+  font-family: Consolas, Monaco, Andale Mono, "Courier New", monospace;
   line-height: 1.5;
   font-size: 15px;
   background: #F8F8F8;

From e2b2c21aefe4156e9fe4160aa0a2478ef3b2d1ac Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 22:39:38 +0100
Subject: [PATCH 03/14] better co-occurrence function

---
 R-package/NAMESPACE             |  1 +
 R-package/R/xgb.importance.R    | 18 +++++++++++-------
 R-package/man/xgb.importance.Rd |  4 ++--
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index fce6cceaf..cfa166899 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -22,6 +22,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
 importFrom(DiagrammeR,mermaid)
+importFrom(Matrix,colSums)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
 importFrom(data.table,copy)
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 6d219797c..6adbad512 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -7,6 +7,7 @@
 #' @importFrom data.table setnames
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
+#' @importFrom Matrix colSums
 #'
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #'
@@ -18,7 +19,7 @@
 #'
 #' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
 #'
-#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between a one-hot encoded categorical feature and a binary classification label.The \code{target} function should have only one parameter (will be used to provide each feature vector listed as importance feature). More information in \code{Detail} part. This parameter is optional.
+#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between in a binary classification. The \code{target} function should have only one parameter (will be used to provide each important feature vector after applying the split condition on it). More information in \code{Detail} part. This parameter is optional.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
@@ -40,7 +41,7 @@
 #'
 #' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. But, by itself, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 #'
-#' Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function true. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
+#' Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function \code{TRUE}. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
 #'
 #' If you need to remember one thing of all of this: until you want to leave us early, don't eat a mushroom which has no odor :-)
 #'
@@ -89,20 +90,23 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
     result <- readLines(filename_dump) %>% linearDump(feature_names, .)
     if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.")
   } else {
-    result <- treeDump(feature_names, text = text)
+    result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
 
     # Co-occurence computation
     if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
-      apply(data[, result[,Feature],drop=FALSE], 2, . %>% target %>% sum) -> vec
-      result <- result[Feature == names(vec), "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)]
+      ((data[, result[,Feature],drop=FALSE] != 0) & (data[, result[,Feature],drop=FALSE] < as.numeric(result[,Split]))) %>% apply(., 2, . %>% target %>% sum) -> vec
+
+      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)]
     }
   }
   result
 }
 
-treeDump <- function(feature_names, text){
-  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = Feature][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
+treeDump <- function(feature_names, text, keepDetail){
+  if(keepDetail) groupBy <- c("Feature", "Split") else groupBy <- "Feature"
+
+  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
   result
 }
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 3f11cda72..3290ace1c 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -18,7 +18,7 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
 
 \item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
 
-\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between a one-hot encoded categorical feature and a binary classification label.The \code{target} function should have only one parameter (will be used to provide each feature vector listed as importance feature). More information in \code{Detail} part. This parameter is optional.}
+\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between in a binary classification. The \code{target} function should have only one parameter (will be used to provide each important feature vector after applying the split condition on it). More information in \code{Detail} part. This parameter is optional.}
 }
 \value{
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -45,7 +45,7 @@ Co-occurence count
 
 The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. But, by itself, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 
-Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function true. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
+Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function \code{TRUE}. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
 
 If you need to remember one thing of all of this: until you want to leave us early, don't eat a mushroom which has no odor :-)
 }

From 2ea6fd9931d527a0c4290069afe583b40c0b5938 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 23:01:48 +0100
Subject: [PATCH 04/14] better CSS

---
 R-package/vignettes/vignette.css | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/R-package/vignettes/vignette.css b/R-package/vignettes/vignette.css
index 3d143d299..9447ea2c3 100644
--- a/R-package/vignettes/vignette.css
+++ b/R-package/vignettes/vignette.css
@@ -129,7 +129,7 @@ code {
   font-family: Consolas, Monaco, Andale Mono, monospace;
   line-height: 1.5;
   font-size: 15px;
-  background: #CDCDCD;
+  background: #F8F8F8;
   border-radius: 4px;
   padding: 5px;
   display: inline-block;
@@ -137,10 +137,14 @@ code {
   white-space: pre-wrap;
 }
 
+p code {
+  background: #CDCDCD;
+  color: #606AAA;
+}
+
 code.r, code.cpp {
   display: block;
-  word-wrap: break-word;
-  background: #F8F8F8;
+  word-wrap: break-word;
   border: 1px solid #606AAA;
 }
 
@@ -159,7 +163,7 @@ blockquote {
   max-width: 500px;
 }
 
-blockquote cite {
+blockquote cite {
   font-size:14px;
   line-height:10px;
   color:#bfbfbf;

From d4731e7b295bdb3fee5c5779e44dd7ca9e1a3c82 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 23:06:09 +0100
Subject: [PATCH 05/14] vignette text

---
 R-package/vignettes/discoverYourData.Rmd | 25 ++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 016bfb69b..f719ba9e1 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -166,6 +166,31 @@ print(importance)
 ```
 
 `Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
+We can go deeper in the analysis. In the table above, we have discovered which feature counts to predict if the illness will go or not. But we don't yet know the role of these feature.
+
+One simple way to see this role is to count the co-occurence. For that purpose we will execute the same function but with more arguments.
+
+```{r}
+importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+
+# Removing not important things for better display
+importance <- importance[,`:=`(Cover=NULL, Frequence=NULL)][1:10,]
+
+print(importance)
+```
+
+In the table above we have removed two not needed columns and select only the first 10 lines.
+
+First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature Age is used several times with different split.
+
+How the split is applied to count the co-occurences? It is always `<`. For instance, in the second line, we measure the number of person under 61 years with the illness gone.
+
+The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observation in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the all population that the previous figure represents.
+
+Therefore, according to our findings, getting a Placebo doesn't seem to help but being less than 61 years old may help.
+
+> You may wonder how to interpret the `< 1.00001 ` on the first line. Basically, in a sparse `Matrix`, there is no 0, therefore, looking for categorical observations validating the rule `< 1.00001` is like looking for `1` for this feature.
+
 Plotting the feature importance
 -------------------------------

From 08493c2b3d29ea7369203da6608fa5cb40024597 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 23:27:02 +0100
Subject: [PATCH 06/14] missing feature management

---
 R-package/NAMESPACE          | 1 +
 R-package/R/xgb.importance.R | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index cfa166899..23082bb46 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -22,6 +22,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Ckmeans.1d.dp,Ckmeans.1d.dp)
 importFrom(DiagrammeR,mermaid)
+importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 6adbad512..c53d9e3e8 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -8,6 +8,7 @@
 #' @importFrom data.table :=
 #' @importFrom magrittr %>%
 #' @importFrom Matrix colSums
+#' @importFrom Matrix cBind
 #'
 #' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
 #'
@@ -94,8 +95,12 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
 
     # Co-occurence computation
     if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
+      a <- data[, result[,Feature],drop=FALSE] < as.numeric(result[,Split])
+      b <- data[, result[No == Missing,Feature],drop=FALSE] != 0
+      c <- data[, result[No != Missing,Feature],drop=FALSE]
+      d <- cBind(b,c) %>% .[,result[,Feature]]
 
-      ((data[, result[,Feature],drop=FALSE] != 0) & (data[, result[,Feature],drop=FALSE] < as.numeric(result[,Split]))) %>% apply(., 2, . %>% target %>% sum) -> vec
+      apply(a & d, 2, . %>% target %>% sum) -> vec
 
       result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)]
     }
   }
   result
 }
 
 treeDump <- function(feature_names, text, keepDetail){
-  if(keepDetail) groupBy <- c("Feature", "Split") else groupBy <- "Feature"
+  if(keepDetail) groupBy <- c("Feature", "Split", "No", "Missing") else groupBy <- "Feature"
 
   result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]

From 412a6e10859770d68a3ec831dbffd0fd75cfcdbb Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 23:30:36 +0100
Subject: [PATCH 07/14] Add comments

---
 R-package/R/xgb.importance.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index c53d9e3e8..07fbfded3 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -94,15 +94,19 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
     result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
 
     # Co-occurence computation
-    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
+    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
+      # Apply split
       a <- data[, result[,Feature],drop=FALSE] < as.numeric(result[,Split])
+      # Take care of missing column
       b <- data[, result[No == Missing,Feature],drop=FALSE] != 0
+      # Do nothing if missing should be included in Yes
       c <- data[, result[No != Missing,Feature],drop=FALSE]
+      # Bind the two previous Matrix and reorder columns
       d <- cBind(b,c) %>% .[,result[,Feature]]
 
       apply(a & d, 2, . %>% target %>% sum) -> vec
 
-      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)]
+      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,`:=`(No = NULL, Missing = NULL)]
     }
   }
   result

From 1cfa810edb1e8cb01a976c5a80099a7d09156dbf Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Tue, 17 Feb 2015 23:37:56 +0100
Subject: [PATCH 08/14] refix

---
 R-package/vignettes/discoverYourData.Rmd | 3 ---
 R-package/vignettes/vignette.css         | 9 +--------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index cf4db18a6..6296cb648 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -170,7 +170,6 @@ print(importance)
 
 `Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
-<<<<<<< HEAD
 We can go deeper in the analysis. In the table above, we have discovered which feature counts to predict if the illness will go or not. But we don't yet know the role of these feature.
 
 One simple way to see this role is to count the co-occurence. For that purpose we will execute the same function but with more arguments.
 
 ```{r}
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 
 # Removing not important things for better display
 importance <- importance[,`:=`(Cover=NULL, Frequence=NULL)][1:10,]
 
 print(importance)
 ```
 
 In the table above we have removed two not needed columns and select only the first 10 lines.
 
 First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature Age is used several times with different split.
 
 How the split is applied to count the co-occurences? It is always `<`. For instance, in the second line, we measure the number of person under 61 years with the illness gone.
 
 The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observation in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the all population that the previous figure represents.
 
 Therefore, according to our findings, getting a Placebo doesn't seem to help but being less than 61 years old may help.
 
 > You may wonder how to interpret the `< 1.00001 ` on the first line. Basically, in a sparse `Matrix`, there is no 0, therefore, looking for categorical observations validating the rule `< 1.00001` is like looking for `1` for this feature.
-=======
->>>>>>> origin/master
 
 Plotting the feature importance
 -------------------------------
diff --git a/R-package/vignettes/vignette.css b/R-package/vignettes/vignette.css
index 5f99e08c2..7d370f2f2 100644
--- a/R-package/vignettes/vignette.css
+++ b/R-package/vignettes/vignette.css
@@ -137,22 +137,15 @@ code {
   white-space: pre-wrap;
 }
 
-<<<<<<< HEAD
+
 p code {
-=======
-blockquote code {
->>>>>>> origin/master
   background: #CDCDCD;
   color: #606AAA;
 }
 
 code.r, code.cpp {
   display: block;
-<<<<<<< HEAD
-  word-wrap: break-word;
-=======
   word-wrap: break-word;
->>>>>>> origin/master
   border: 1px solid #606AAA;
 }

From 8fd546ab3ceda8a4d5ac09fab36a387860872062 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 18 Feb 2015 13:13:27 +0100
Subject: [PATCH 09/14] vignette text

---
 R-package/vignettes/discoverYourData.Rmd | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 6296cb648..15381a003 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -170,30 +170,32 @@ print(importance)
 
 `Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it).
 
-We can go deeper in the analysis. In the table above, we have discovered which feature counts to predict if the illness will go or not. But we don't yet know the role of these feature.
+We can go deeper in the analysis. In the table above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we will try to answer will be: does receiving a placebo helps to recover from the illness?
 
-One simple way to see this role is to count the co-occurence. For that purpose we will execute the same function but with more arguments.
+One simple solution is to count the co-occurences of a feature and a class of the classification.
+
+For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 
 ```{r}
 importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 
-# Removing not important things for better display
+# Cleaning for better display
 importance <- importance[,`:=`(Cover=NULL, Frequence=NULL)][1:10,]
 
 print(importance)
 ```
 
-In the table above we have removed two not needed columns and select only the first 10 lines.
+> In the table above we have removed two not needed columns and select only the first 10 lines.
 
-First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature Age is used several times with different split.
+First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
 
-How the split is applied to count the co-occurences? It is always `<`. For instance, in the second line, we measure the number of person under 61 years with the illness gone.
+How the split is applied to count the co-occurences? It is always `<`. For instance, in the second line, we measure the number of persons under 61 years with the illness gone after the treatment.
 
-The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observation in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the all population that the previous figure represents.
+The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
 
-Therefore, according to our findings, getting a Placebo doesn't seem to help but being less than 61 years old may help.
+Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic).
 
-> You may wonder how to interpret the `< 1.00001 ` on the first line. Basically, in a sparse `Matrix`, there is no 0, therefore, looking for categorical observations validating the rule `< 1.00001` is like looking for `1` for this feature.
+> You may wonder how to interpret the `< 1.00001 ` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature.
 
 Plotting the feature importance
 -------------------------------

From f57f0f254336bc4fcf7911d81c1863913cbade26 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 18 Feb 2015 13:19:39 +0100
Subject: [PATCH 10/14] Documentation feature importance

---
 R-package/R/xgb.importance.R    | 9 +++++----
 R-package/man/xgb.importance.Rd | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 07fbfded3..7e55f7316 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -20,7 +20,7 @@
 #'
 #' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.
 #'
-#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between in a binary classification. The \code{target} function should have only one parameter (will be used to provide each important feature vector after applying the split condition on it). More information in \code{Detail} part. This parameter is optional.
+#' @param target a function which returns \code{TRUE} or \code{1} when an observation should be counted as a co-occurrence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurrences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vectors will be made of 0 and 1 only, whatever the information was before. More information in \code{Detail} part. This parameter is optional.
 #'
 #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 #'
@@ -39,12 +39,13 @@
 #' }
 #'
 #' Co-occurence count
+#' ------------------
 #'
-#' The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. But, by itself, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
+#' The gain gives you an indication of how important a feature is in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 #'
-#' Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function \code{TRUE}. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
+#' Co-occurrence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are only 92 cases over the 3140 observations of the train dataset where a mushroom has no odor and can be eaten safely.
 #'
-#' If you need to remember one thing of all of this: until you want to leave us early, don't eat a mushroom which has no odor :-)
+#' If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd
index 3290ace1c..f57251755 100644
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@@ -18,7 +18,7 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL,
 
 \item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
 
-\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurence between in a binary classification. The \code{target} function should have only one parameter (will be used to provide each important feature vector after applying the split condition on it). More information in \code{Detail} part. This parameter is optional.}
+\item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be counted as a co-occurrence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurrences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vectors will be made of 0 and 1 only, whatever the information was before. More information in \code{Detail} part. This parameter is optional.}
 }
 \value{
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
@@ -42,12 +42,13 @@ There are 3 columns :
 }
 
 Co-occurence count
+------------------
 
-The gain gives you indication about the information of how a feature is important in making a branch of a decision tree more pure. But, by itself, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
+The gain gives you an indication of how important a feature is in making a branch of a decision tree more pure. However, with this information only, you can't know if this feature has to be present or not to get a specific classification. In the example code, you may wonder if odor=none should be \code{TRUE} to not eat a mushroom.
 
-Co-occurence computation is here to help in understanding this relation. It will counts how many observations have target function \code{TRUE}. In our example, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
+Co-occurrence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are only 92 cases over the 3140 observations of the train dataset where a mushroom has no odor and can be eaten safely.
 
-If you need to remember one thing of all of this: until you want to leave us early, don't eat a mushroom which has no odor :-)
+If you need to remember one thing only: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 }
 \examples{
 data(agaricus.train, package='xgboost')

From 8523fb9f497e63a25cd0ca418ed630f7b0a851f2 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 18 Feb 2015 13:44:21 +0100
Subject: [PATCH 11/14] avoid error message

---
 R-package/R/xgb.importance.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 7e55f7316..69995d9ef 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -128,4 +128,4 @@ linearDump <- function(feature_names, text){
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(".")
+globalVariables(".", "Feature", "Split", "No", "Missing")

From 83ddbbf03b35e216f0dbfec2055cd33c27afc123 Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Wed, 18 Feb 2015 17:14:08 +0100
Subject: [PATCH 12/14] spelling

---
 R-package/vignettes/discoverYourData.Rmd | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 15381a003..9d1ce1f5e 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -80,7 +80,7 @@ df[,AgeDiscret:= as.factor(round(Age/10,0))][1:10]
 ```
 
 > For the first feature we create groups of age by rounding the real age.
-> Note that we transform it to `factor` so the algorithm treat these age groups as independant values.
+> Note that we transform it to `factor` so the algorithm treats these age groups as independent values.
 > Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation.
 
 Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
@@ -172,7 +172,7 @@ print(importance)
 
 We can go deeper in the analysis. In the table above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we will try to answer will be: does receiving a placebo helps to recover from the illness?
 
-One simple solution is to count the co-occurences of a feature and a class of the classification.
+One simple solution is to count the co-occurrences of a feature and a class of the classification.
 
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
@@ -189,7 +189,7 @@ print(importance)
 
 First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits.
 
-How the split is applied to count the co-occurences? It is always `<`. For instance, in the second line, we measure the number of persons under 61 years with the illness gone after the treatment.
+How is the split applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61 years with the illness gone after the treatment.
 
 The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents.
@@ -253,7 +253,7 @@ In *data science* expression, there is the word *science* :-)
 Conclusion
 ==========
 
-As you can see, in general *destroying information by simplying it won't improve your model*. **Chi2** just demonstrates that.
+As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that.
 
@@ -270,14 +270,14 @@ Linear model may not be that strong in these scenario. Special Note: What about Random forest? ======================================= -As you may know, [Random Forest](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble leanrning](http://en.wikipedia.org/wiki/Ensemble_learning) family. +As you may know, [Random Forest](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family. -Both trains several decision trees for one dataset. The *main* difference is that in Random Forest, trees are independant and in boosting tree N+1 focus its learning on the loss (= what has no been well modeled by tree N). +Both trains several decision trees for one dataset. The *main* difference is that in Random Forest, trees are independent and in boosting tree N+1 focus its learning on the loss (= what has no been well modeled by tree N). This difference have an impact on feature importance analysis: the *correlated features*. Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and random forest). -However, in Random Forest this random choice will be done for each tree, because each tree is independant from the others. Therefore, approximatively, depending of your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the **importance** of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features... +However, in Random Forest this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximatively, depending of your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the **importance** of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features... In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is never that simple). Therefore, all the importance will be on `A` or on `B`. You will know that one feature have an important role in the link between your dataset and the outcome. It is still up to you to search for the correlated features to the one detected as important if you need all of them. From d982f2746ce26758d9e08016d4c174e18aa2a684 Mon Sep 17 00:00:00 2001 From: El Potaeto Date: Wed, 18 Feb 2015 19:41:13 +0100 Subject: [PATCH 13/14] small fixes --- R-package/R/xgb.importance.R | 2 +- R-package/R/xgb.plot.importance.R | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 69995d9ef..72e2a0719 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -128,4 +128,4 @@ linearDump <- function(feature_names, text){ # Avoid error messages during CRAN check. 
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(".", "Feature", "Split", "No", "Missing")
+globalVariables(c(".", "Feature", "Split", "No", "Missing"))
diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R
index 627266a7e..b34a16c8b 100644
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -45,6 +45,9 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
     stop("importance_matrix: Should be a data.table.")
   }
 
+  # To avoid issues in clustering when co-occurrences are used
+  importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature]
+
   clusters <- suppressWarnings(Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
   importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
 
@@ -56,4 +59,4 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
 # Avoid error messages during CRAN check.
 # The reason is that these variables are never declared
 # They are mainly column names inferred by Data.table...
-globalVariables(c("Feature","Gain", "Cluster"))
\ No newline at end of file
+globalVariables(c("Feature", "Gain", "Cluster"))
\ No newline at end of file

From 815789bed6d6f3f870d122622c9790f514c5a30e Mon Sep 17 00:00:00 2001
From: El Potaeto
Date: Thu, 19 Feb 2015 00:16:50 +0100
Subject: [PATCH 14/14] fix

---
 R-package/R/xgb.importance.R | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 72e2a0719..d9f70c510 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -95,28 +95,26 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N
     result <- treeDump(feature_names, text = text, keepDetail = !is.null(data))
 
     # Co-occurence computation
-    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
-      # Apply split
-      a <- data[, result[,Feature],drop=FALSE] < as.numeric(result[,Split])
+    if(!is.null(data) & !is.null(label) & nrow(result) > 0) {
       # Take care of missing column
-      b <- data[, result[No == Missing,Feature],drop=FALSE] != 0
-      # Do nothing if missing should be included in Yes
-      c <- data[, result[No != Missing,Feature],drop=FALSE]
-      # Bind the two previous Matrix and reorder columns
-      d <- cBind(b,c) %>% .[,result[,Feature]]
+      a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0
+      # Bind the two matrices and reorder columns
+      c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]]
+      rm(a)
+      # Apply split
+      d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split])
 
-      apply(a & d, 2, . %>% target %>% sum) -> vec
+      apply(c & d, 2, . %>% target %>% sum) -> vec
 
-      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,`:=`(No = NULL, Missing = NULL)]
+      result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL]
     }
   }
   result
 }
 
 treeDump <- function(feature_names, text, keepDetail){
-  if(keepDetail) groupBy <- c("Feature", "Split", "No", "Missing") else groupBy <- "Feature"
+  if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature"
 
-  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
+  result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)]
   result
 }
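
To try the co-occurrence path end to end after applying the series, here is a minimal sketch assembled from the vignette chunk and the roxygen `@examples` quoted in these patches. It uses the `agaricus` demo data shipped with xgboost (already one-hot encoded, as the default `target` function expects); the training parameters are just the usual demo values, not anything these patches mandate:

```r
library(xgboost)

# Mushroom demo data bundled with the package: a sparse dgCMatrix of
# one-hot encoded features plus a binary label.
data(agaricus.train, package = "xgboost")
train <- agaricus.train

# Small boosted-tree model; parameter values are illustrative only.
bst <- xgboost(data = train$data, label = train$label,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")

# With `data` and `label` supplied, the patched xgb.importance() keeps the
# per-split detail (the Split column) and adds the co-occurrence columns
# RealCover and `RealCover %`.
importance <- xgb.importance(train$data@Dimnames[[2]], model = bst,
                             data = train$data, label = train$label)
print(importance)
```

If the run matches the documentation above, the odor=none row should show the 92-out-of-3140 co-occurrence figure described in the roxygen details.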