Generate new features based on tree leaves
Commit c1b2d9cb86 (parent 115c63bcde)
R-package/NAMESPACE
@@ -5,6 +5,7 @@ export(setinfo)
 export(slice)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
+export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
@@ -25,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
+importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
R-package/R/xgb.create.features.R (new file, 91 lines)
@@ -0,0 +1,91 @@
#' Create new features from a previously learned model
#'
#' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
#'
#' @importFrom magrittr %>%
#' @importFrom Matrix cBind
#' @importFrom Matrix sparse.model.matrix
#'
#' @param model decision tree boosting model learned on the original data
#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
#'
#' @return \code{dgCMatrix} matrix including both the original data and the new features.
#'
#' @details
#' This function is inspired by Section 3.1 of the paper:
#'
#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
#'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quiñonero Candela)}
#'
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
#'
#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
#'
#' Extract explaining the method:
#'
#' "\emph{We found that boosted decision trees are a powerful and very
#' convenient way to implement non-linear and tuple transformations
#' of the kind we just described. We treat each individual
#' tree as a categorical feature that takes as value the
#' index of the leaf an instance ends up falling in. We use
#' 1-of-K coding of this type of features.
#'
#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
#' where the first subtree has 3 leafs and the second 2 leafs. If an
#' instance ends up in leaf 2 in the first subtree and leaf 1 in
#' second subtree, the overall input to the linear classifier will
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
#' correspond to the leaves of the first subtree and last 2 to
#' those of the second subtree.
#'
#' [...]
#'
#' We can understand boosted decision tree
#' based transformation as a supervised feature encoding that
#' converts a real-valued vector into a compact binary-valued
#' vector. A traversal from root node to a leaf node represents
#' a rule on certain features.}"
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
#'
#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
#' nround <- 4
#'
#' bst <- xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy without new features
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Convert previous features to one hot encoding
#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
#'
#' # Learning with new features
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
#' watchlist <- list(train = new.dtrain)
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy with new features
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Here the accuracy was already good and is now perfect.
#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
#'
#' @export
xgb.create.features <- function(model, training.data){
  pred_with_leaf <- predict(model, training.data, predleaf = TRUE)
  cols <- list()
  for(i in 1:ncol(pred_with_leaf)){
    # max is not the real max but it's not important for the purpose of adding features
    leaf.id <- sort(unique(pred_with_leaf[,i]))
    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
  }
  cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
}
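As an aside for readers of this diff: below is a minimal standalone sketch of the transformation the new function performs, assuming only the agaricus data shipped with xgboost (the booster, variable names, and parameter choices are illustrative and not part of the commit). Each tree contributes one column of leaf indices; each column becomes a factor, sparse.model.matrix expands the factors into 1-of-K indicator columns, and those are column-bound to the original features.

# Illustrative sketch only: what xgb.create.features() does with the leaf indices.
library(xgboost)
library(Matrix)

data(agaricus.train, package = "xgboost")
X <- agaricus.train$data                      # original dgCMatrix of features
bst <- xgboost(data = X, label = agaricus.train$label, max.depth = 2, eta = 1,
               nrounds = 4, objective = "binary:logistic", verbose = 0)

leaf_idx <- predict(bst, X, predleaf = TRUE)  # one column of leaf indices per tree

# One factor per tree, with the leaf indices reached in that tree as its levels
leaf_factors <- lapply(seq_len(ncol(leaf_idx)), function(i)
  factor(leaf_idx[, i], levels = sort(unique(leaf_idx[, i]))))
names(leaf_factors) <- paste0("tree", seq_along(leaf_factors))

# 1-of-K indicator columns for every (tree, leaf) pair, bound to the original data
new_features <- sparse.model.matrix(~ . - 1, data = as.data.frame(leaf_factors))
X_new <- cBind(X, new_features)
dim(X_new)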
R-package/R/xgb.importance.R
@@ -1,7 +1,6 @@
 #' Show importance of features in a model
 #'
-#' Read a xgboost model text dump.
-#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+#' Create a \code{data.table} of the most important features of a model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames
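A brief usage note on the re-documented function, continuing from the illustrative sketch above and reusing its bst and agaricus.train objects (the exact columns of the returned table depend on the package version):

# Importance of the original features for the illustrative booster above
importance <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst)
head(importance)  # a data.table with one row per feature used by the model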
@@ -25,7 +25,7 @@ pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
 
 create.new.tree.features <- function(model, original.features){
-  pred_with_leaf = predict(model, original.features, predleaf = TRUE)
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max but it s not important for the purpose of adding features
@@ -49,4 +49,4 @@ bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread =
 accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
 
 # Here the accuracy was already good and is now perfect.
-print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
R-package/man/xgb.create.features.Rd (new file, 88 lines)
@@ -0,0 +1,88 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.create.features.R
\name{xgb.create.features}
\alias{xgb.create.features}
\title{Create new features from a previously learned model}
\usage{
xgb.create.features(model, training.data)
}
\arguments{
\item{model}{decision tree boosting model learned on the original data}

\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
}
\value{
\code{dgCMatrix} matrix including both the original data and the new features.
}
\description{
May improve the learning by adding new features to the training data based on the decision trees from a previously learned model.
}
\details{
This function is inspired by Section 3.1 of the paper:

\strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}

\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
Joaquin Quiñonero Candela)}

International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014

\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.

Extract explaining the method:

"\emph{We found that boosted decision trees are a powerful and very
convenient way to implement non-linear and tuple transformations
of the kind we just described. We treat each individual
tree as a categorical feature that takes as value the
index of the leaf an instance ends up falling in. We use
1-of-K coding of this type of features.

For example, consider the boosted tree model in Figure 1 with 2 subtrees,
where the first subtree has 3 leafs and the second 2 leafs. If an
instance ends up in leaf 2 in the first subtree and leaf 1 in
second subtree, the overall input to the linear classifier will
be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
correspond to the leaves of the first subtree and last 2 to
those of the second subtree.

[...]

We can understand boosted decision tree
based transformation as a supervised feature encoding that
converts a real-valued vector into a compact binary-valued
vector. A traversal from root node to a leaf node represents
a rule on certain features.}"
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
nround <- 4

bst <- xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)

# Model accuracy without new features
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)

# Convert previous features to one hot encoding
new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
new.features.test <- xgb.create.features(model = bst, agaricus.test$data)

# Learning with new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)

# Model accuracy with new features
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)

# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))

}
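To make the quoted [0, 1, 0, 1, 0] example concrete, here is a tiny base-R illustration (not part of the commit; the leaf positions are chosen to match the quoted paragraph):

# Two subtrees: the first has 3 leaves, the second has 2. An instance falls in
# leaf 2 of the first subtree and leaf 1 of the second.
leaf_tree1 <- factor(2, levels = 1:3)
leaf_tree2 <- factor(1, levels = 1:2)
# 1-of-K coding of each leaf index, concatenated into one binary vector
as.integer(c(leaf_tree1 == levels(leaf_tree1), leaf_tree2 == levels(leaf_tree2)))
# [1] 0 1 0 1 0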
R-package/man/xgb.importance.Rd
@@ -22,8 +22,7 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL,
 A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model.
 }
 \description{
-Read a xgboost model text dump.
-Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+Create a \code{data.table} of the most important features of a model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).