From 3b67028ad62a60a8e98b78b6332c3a6a8242449c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Sat, 5 Dec 2015 19:02:05 +0100
Subject: [PATCH 1/2] remove intercept column in sparse Matrix

---
 R-package/demo/predict_leaf_indices.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index 6cde561c2..fcde3438d 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -29,10 +29,10 @@ create.new.tree.features <- function(model, original.features){
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max but it s not important for the purpose of adding features
-    max <- max(pred_with_leaf[,i])
-    cols[[i]] <- factor(x = pred_with_leaf[,i], level = seq(to = max))
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
   }
-  cBind(original.features, sparse.model.matrix( ~ ., as.data.frame(cols)))
+  cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
 }
 
 # Convert previous features to one hot encoding

From c1b2d9cb8650ebc0503c78cc7ae79b78e2ea85fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Mon, 7 Dec 2015 11:30:19 +0100
Subject: [PATCH 2/2] Generate new features based on tree leaves

---
 R-package/NAMESPACE                   |  2 +
 R-package/R/xgb.create.features.R     | 91 +++++++++++++++++++++++++++
 R-package/R/xgb.importance.R          |  3 +-
 R-package/demo/predict_leaf_indices.R |  4 +-
 R-package/man/xgb.create.features.Rd  | 88 ++++++++++++++++++++++++++
 R-package/man/xgb.importance.Rd       |  3 +-
 6 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 R-package/R/xgb.create.features.R
 create mode 100644 R-package/man/xgb.create.features.Rd

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index a9ae672a3..3cd80d5c2 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -5,6 +5,7 @@ export(setinfo)
 export(slice)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
+export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
@@ -25,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
+importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R
new file mode 100644
index 000000000..bde791fcf
--- /dev/null
+++ b/R-package/R/xgb.create.features.R
@@ -0,0 +1,91 @@
+#' Create new features from a previously learned model
+#'
+#' May improve learning by adding new features to the training data based on the decision trees from a previously learned model.
+#'
+#' @importFrom magrittr %>%
+#' @importFrom Matrix cBind
+#' @importFrom Matrix sparse.model.matrix
+#'
+#' @param model decision tree boosting model learned on the original data
+#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
+#'
+#' @return \code{dgCMatrix} matrix including both the original data and the new features.
+#'
+#' @details
+#' This function is inspired by paragraph 3.1 of the paper:
+#'
+#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
+#'
+#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
+#' Joaquin Quiñonero Candela)}
+#'
+#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
+#'
+#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+#'
+#' Extract explaining the method:
+#'
+#' "\emph{We found that boosted decision trees are a powerful and very
+#' convenient way to implement non-linear and tuple transformations
+#' of the kind we just described. We treat each individual
+#' tree as a categorical feature that takes as value the
+#' index of the leaf an instance ends up falling in. We use
+#' 1-of-K coding of this type of features.
+#'
+#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
+#' where the first subtree has 3 leafs and the second 2 leafs. If an
+#' instance ends up in leaf 2 in the first subtree and leaf 1 in
+#' second subtree, the overall input to the linear classifier will
+#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
+#' correspond to the leaves of the first subtree and last 2 to
+#' those of the second subtree.
+#'
+#' [...]
+#'
+#' We can understand boosted decision tree
+#' based transformation as a supervised feature encoding that
+#' converts a real-valued vector into a compact binary-valued
+#' vector. A traversal from root node to a leaf node represents
+#' a rule on certain features.}"
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
+#'
+#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
+#' nround = 4
+#'
+#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+#'
+#' # Model accuracy without new features
+#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Convert previous features to one hot encoding
+#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
+#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
+#'
+#' # learning with new features
+#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+#' watchlist <- list(train = new.dtrain)
+#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
+#'
+#' # Model accuracy with new features
+#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Here the accuracy was already good and is now perfect.
+#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
+#'
+#' @export
+xgb.create.features <- function(model, training.data){
+  pred_with_leaf = predict(model, training.data, predleaf = TRUE)
+  cols <- list()
+  for(i in 1:ncol(pred_with_leaf)){
+    # one column of leaf indices per tree; the observed indices become the factor levels
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
+  }
+  cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
+}
\ No newline at end of file
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index e003277f0..2cd0788cf 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -1,7 +1,6 @@
 #' Show importance of features in a model
 #'
-#' Read a xgboost model text dump.
-#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+#' Create a \code{data.table} of the most important features of a model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index fcde3438d..fc87befb7 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -25,7 +25,7 @@ pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
 
 create.new.tree.features <- function(model, original.features){
-  pred_with_leaf = predict(model, original.features, predleaf = TRUE)
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max but it s not important for the purpose of adding features
@@ -49,4 +49,4 @@ bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread =
 accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
 
 # Here the accuracy was already good and is now perfect.
-print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd
new file mode 100644
index 000000000..1e75cab8d
--- /dev/null
+++ b/R-package/man/xgb.create.features.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.create.features.R
+\name{xgb.create.features}
+\alias{xgb.create.features}
+\title{Create new features from a previously learned model}
+\usage{
+xgb.create.features(model, training.data)
+}
+\arguments{
+\item{model}{decision tree boosting model learned on the original data}
+
+\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
+}
+\value{
+\code{dgCMatrix} matrix including both the original data and the new features.
+}
+\description{
+May improve learning by adding new features to the training data based on the decision trees from a previously learned model.
+} +\details{ +This is the function inspired from the paragraph 3.1 of the paper: + +\strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"} + +\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +Joaquin QuiƱonero Candela)} + +International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 + +\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. + +Extract explaining the method: + +"\emph{We found that boosted decision trees are a powerful and very +convenient way to implement non-linear and tuple transformations +of the kind we just described. We treat each individual +tree as a categorical feature that takes as value the +index of the leaf an instance ends up falling in. We use +1-of-K coding of this type of features. + +For example, consider the boosted tree model in Figure 1 with 2 subtrees, +where the first subtree has 3 leafs and the second 2 leafs. If an +instance ends up in leaf 2 in the first subtree and leaf 1 in +second subtree, the overall input to the linear classifier will +be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +correspond to the leaves of the first subtree and last 2 to +those of the second subtree. + +[...] + +We can understand boosted decision tree +based transformation as a supervised feature encoding that +converts a real-valued vector into a compact binary-valued +vector. A traversal from root node to a leaf node represents +a rule on certain features.}" +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) + +param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic') +nround = 4 + +bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2) + +# Model accuracy without new features +accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Convert previous features to one hot encoding +new.features.train <- xgb.create.features(model = bst, agaricus.train$data) +new.features.test <- xgb.create.features(model = bst, agaricus.test$data) + +# learning with new features +new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) +new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) +watchlist <- list(train = new.dtrain) +bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2) + +# Model accuracy with new features +accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Here the accuracy was already good and is now perfect. +cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n")) + +} + diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 0d59ba556..1f845a1f9 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -22,8 +22,7 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL, A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. } \description{ -Read a xgboost model text dump. 
-Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+Create a \code{data.table} of the most important features of a model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).