Generate new features based on tree leaves

Michaël Benesty 2015-12-07 11:30:19 +01:00
parent 115c63bcde
commit c1b2d9cb86
6 changed files with 185 additions and 6 deletions

View File

@@ -5,6 +5,7 @@ export(setinfo)
export(slice)
export(xgb.DMatrix)
export(xgb.DMatrix.save)
export(xgb.create.features)
export(xgb.cv)
export(xgb.dump)
export(xgb.importance)
@@ -25,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Matrix,cBind)
importFrom(Matrix,colSums)
importFrom(Matrix,sparse.model.matrix)
importFrom(Matrix,sparseVector)
importFrom(data.table,":=")
importFrom(data.table,as.data.table)

View File

@@ -0,0 +1,91 @@
#' Create new features from a previously learned model
#'
#' May improve learning by adding, to the training data, new features derived from the decision trees of a previously learned model.
#'
#' @importFrom magrittr %>%
#' @importFrom Matrix cBind
#' @importFrom Matrix sparse.model.matrix
#'
#' @param model decision tree boosting model learned on the original data
#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
#'
#' @return \code{dgCMatrix} matrix including both the original data and the new features.
#'
#' @details
#' This function is inspired by paragraph 3.1 of the paper:
#'
#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
#'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quiñonero Candela)}
#'
#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
#'
#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
#'
#' Extract explaining the method:
#'
#' "\emph{We found that boosted decision trees are a powerful and very
#' convenient way to implement non-linear and tuple transformations
#' of the kind we just described. We treat each individual
#' tree as a categorical feature that takes as value the
#' index of the leaf an instance ends up falling in. We use
#' 1-of-K coding of this type of features.
#'
#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
#' where the first subtree has 3 leafs and the second 2 leafs. If an
#' instance ends up in leaf 2 in the first subtree and leaf 1 in
#' second subtree, the overall input to the linear classifier will
#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
#' correspond to the leaves of the first subtree and last 2 to
#' those of the second subtree.
#'
#' [...]
#'
#' We can understand boosted decision tree
#' based transformation as a supervised feature encoding that
#' converts a real-valued vector into a compact binary-valued
#' vector. A traversal from root node to a leaf node represents
#' a rule on certain features.}"
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost')
#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
#'
#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
#' nround <- 4
#'
#' bst <- xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy without new features
#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Convert previous features to one hot encoding
#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
#'
#' # learning with new features
#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
#' watchlist <- list(train = new.dtrain)
#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
#'
#' # Model accuracy with new features
#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
#'
#' # Here the accuracy was already good and is now perfect.
#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
#'
#' @export
xgb.create.features <- function(model, training.data){
  pred_with_leaf <- predict(model, training.data, predleaf = TRUE)
  cols <- list()
  for(i in 1:ncol(pred_with_leaf)){
    # each column of pred_with_leaf holds the leaf indices for one tree;
    # turn it into a factor whose levels are the leaf indices observed for that tree
    leaf.id <- sort(unique(pred_with_leaf[,i]))
    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
  }
  cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
}
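
For readers less familiar with the 1-of-K leaf encoding described in the details above, here is a minimal base-R sketch (not part of this commit; the leaf-index matrix is made up) that reproduces the [0, 1, 0, 1, 0] example from the quoted paper:

# Hypothetical output of predict(model, data, predleaf = TRUE) for 3 instances
# (rows) and 2 trees (columns): each cell is the index of the leaf the
# instance falls into; tree 1 has leaves 1..3, tree 2 has leaves 1..2.
pred_with_leaf <- matrix(c(2, 1,
                           1, 2,
                           3, 1), ncol = 2, byrow = TRUE)

# 1-of-K encode each tree: one binary column per (tree, leaf) pair.
one_hot <- do.call(cbind, lapply(seq_len(ncol(pred_with_leaf)), function(i) {
  leaf.id <- sort(unique(pred_with_leaf[, i]))
  outer(pred_with_leaf[, i], leaf.id, "==") * 1L
}))
one_hot
# Row 1 is c(0, 1, 0, 1, 0): leaf 2 of tree 1 and leaf 1 of tree 2,
# exactly as in the paper's example.

These binary columns are what xgb.create.features appends to the original data before a second model is trained on the augmented matrix.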

View File

@@ -1,7 +1,6 @@
#' Show importance of features in a model
#'
#' Read a xgboost model text dump.
#' Can be a tree or a linear model (text dumps of linear models are only supported in the dev version of \code{Xgboost} for now).
#' Create a \code{data.table} of the most important features of a model.
#'
#' @importFrom data.table data.table
#' @importFrom data.table setnames

View File

@@ -25,7 +25,7 @@ pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
head(pred_with_leaf)
create.new.tree.features <- function(model, original.features){
pred_with_leaf = predict(model, original.features, predleaf = TRUE)
pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
cols <- list()
for(i in 1:length(trees)){
# max is not the real max but it's not important for the purpose of adding features
@@ -49,4 +49,4 @@ bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread =
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Here the accuracy was already good and is now perfect.
print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))

View File

@@ -0,0 +1,88 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.create.features.R
\name{xgb.create.features}
\alias{xgb.create.features}
\title{Create new features from a previously learned model}
\usage{
xgb.create.features(model, training.data)
}
\arguments{
\item{model}{decision tree boosting model learned on the original data}
\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
}
\value{
\code{dgCMatrix} matrix including both the original data and the new features.
}
\description{
May improve learning by adding, to the training data, new features derived from the decision trees of a previously learned model.
}
\details{
This function is inspired by paragraph 3.1 of the paper:
\strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
Joaquin Quiñonero Candela)}
International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
Extract explaining the method:
"\emph{We found that boosted decision trees are a powerful and very
convenient way to implement non-linear and tuple transformations
of the kind we just described. We treat each individual
tree as a categorical feature that takes as value the
index of the leaf an instance ends up falling in. We use
1-of-K coding of this type of features.
For example, consider the boosted tree model in Figure 1 with 2 subtrees,
where the first subtree has 3 leafs and the second 2 leafs. If an
instance ends up in leaf 2 in the first subtree and leaf 1 in
second subtree, the overall input to the linear classifier will
be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
correspond to the leaves of the first subtree and last 2 to
those of the second subtree.
[...]
We can understand boosted decision tree
based transformation as a supervised feature encoding that
converts a real-valued vector into a compact binary-valued
vector. A traversal from root node to a leaf node represents
a rule on certain features.}"
}
\examples{
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
nround <- 4
bst <- xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
# Model accuracy without new features
accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Convert previous features to one hot encoding
new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
# learning with new features
new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
watchlist <- list(train = new.dtrain)
bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
# Model accuracy with new features
accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
# Here the accuracy was already good and is now perfect.
cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))
}

View File

@@ -22,8 +22,7 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL,
A \code{data.table} of the features used in the model, with their average gain (and their weight for boosted tree models).
}
\description{
Read a xgboost model text dump.
Can be a tree or a linear model (text dumps of linear models are only supported in the dev version of \code{Xgboost} for now).
Create a \code{data.table} of the most important features of a model.
}
\details{
This is the function to understand the model trained (and through your model, your data).
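
As a quick, hypothetical usage sketch (not part of this commit; it assumes the bst model and agaricus data from the example above), the importance table can be obtained with:

# Rank the features the model actually uses, by average gain.
importance <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst)
head(importance)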