From 3b67028ad62a60a8e98b78b6332c3a6a8242449c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Sat, 5 Dec 2015 19:02:05 +0100
Subject: [PATCH 1/2] remove intercept column in sparse Matrix

---
 R-package/demo/predict_leaf_indices.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index 6cde561c2..fcde3438d 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -29,10 +29,10 @@ create.new.tree.features <- function(model, original.features){
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max but it s not important for the purpose of adding features
-    max <- max(pred_with_leaf[,i])
-    cols[[i]] <- factor(x = pred_with_leaf[,i], level = seq(to = max))
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
   }
-  cBind(original.features, sparse.model.matrix( ~ ., as.data.frame(cols)))
+  cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
 }
 
 # Convert previous features to one hot encoding

From c1b2d9cb8650ebc0503c78cc7ae79b78e2ea85fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Benesty?=
Date: Mon, 7 Dec 2015 11:30:19 +0100
Subject: [PATCH 2/2] Generate new features based on tree leaves

---
 R-package/NAMESPACE                   |  2 +
 R-package/R/xgb.create.features.R     | 91 +++++++++++++++++++++++++++
 R-package/R/xgb.importance.R          |  3 +-
 R-package/demo/predict_leaf_indices.R |  4 +-
 R-package/man/xgb.create.features.Rd  | 88 ++++++++++++++++++++++++++
 R-package/man/xgb.importance.Rd       |  3 +-
 6 files changed, 185 insertions(+), 6 deletions(-)
 create mode 100644 R-package/R/xgb.create.features.R
 create mode 100644 R-package/man/xgb.create.features.Rd

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
index a9ae672a3..3cd80d5c2 100644
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -5,6 +5,7 @@ export(setinfo)
 export(slice)
 export(xgb.DMatrix)
 export(xgb.DMatrix.save)
+export(xgb.create.features)
 export(xgb.cv)
 export(xgb.dump)
 export(xgb.importance)
@@ -25,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(Matrix,dgeMatrix)
 importFrom(Matrix,cBind)
 importFrom(Matrix,colSums)
+importFrom(Matrix,sparse.model.matrix)
 importFrom(Matrix,sparseVector)
 importFrom(data.table,":=")
 importFrom(data.table,as.data.table)
diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R
new file mode 100644
index 000000000..bde791fcf
--- /dev/null
+++ b/R-package/R/xgb.create.features.R
@@ -0,0 +1,91 @@
+#' Create new features from a previously learned model
+#'
+#' May improve learning by adding new features to the training data based on the decision trees from a previously learned model.
+#'
+#' @importFrom magrittr %>%
+#' @importFrom Matrix cBind
+#' @importFrom Matrix sparse.model.matrix
+#'
+#' @param model decision tree boosting model learned on the original data
+#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix)
+#'
+#' @return \code{dgCMatrix} matrix including both the original data and the new features.
+#'
+#' @details
+#' This function is inspired by paragraph 3.1 of the paper:
+#'
+#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
+#'
+#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
+#' Joaquin Quiñonero Candela)}
+#'
+#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014
+#'
+#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}.
+#'
+#' Extract explaining the method:
+#'
+#' "\emph{We found that boosted decision trees are a powerful and very
+#' convenient way to implement non-linear and tuple transformations
+#' of the kind we just described. We treat each individual
+#' tree as a categorical feature that takes as value the
+#' index of the leaf an instance ends up falling in. We use
+#' 1-of-K coding of this type of features.
+#'
+#' For example, consider the boosted tree model in Figure 1 with 2 subtrees,
+#' where the first subtree has 3 leafs and the second 2 leafs. If an
+#' instance ends up in leaf 2 in the first subtree and leaf 1 in
+#' second subtree, the overall input to the linear classifier will
+#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries
+#' correspond to the leaves of the first subtree and last 2 to
+#' those of the second subtree.
+#'
+#' [...]
+#'
+#' We can understand boosted decision tree
+#' based transformation as a supervised feature encoding that
+#' converts a real-valued vector into a compact binary-valued
+#' vector. A traversal from root node to a leaf node represents
+#' a rule on certain features.}"
+#'
+#' @examples
+#' data(agaricus.train, package='xgboost')
+#' data(agaricus.test, package='xgboost')
+#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
+#'
+#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
+#' nround = 4
+#'
+#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
+#'
+#' # Model accuracy without new features
+#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Convert previous features to one hot encoding
+#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
+#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data)
+#'
+#' # learning with new features
+#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label)
+#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label)
+#' watchlist <- list(train = new.dtrain)
+#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
+#'
+#' # Model accuracy with new features
+#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#'
+#' # Here the accuracy was already good and is now perfect.
+#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
+#'
+#' @export
+xgb.create.features <- function(model, training.data){
+  pred_with_leaf = predict(model, training.data, predleaf = TRUE)
+  cols <- list()
+  for(i in 1:ncol(pred_with_leaf)){
+    # one column of leaf indices per tree; the observed indices become the factor levels
+    leaf.id <- sort(unique(pred_with_leaf[,i]))
+    cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id)
+  }
+  cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols)))
+}
\ No newline at end of file
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index e003277f0..2cd0788cf 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -1,7 +1,6 @@
 #' Show importance of features in a model
 #'
-#' Read a xgboost model text dump.
-#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+#' Create a \code{data.table} of the most important features of a model.
 #'
 #' @importFrom data.table data.table
 #' @importFrom data.table setnames
diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R
index fcde3438d..fc87befb7 100644
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@@ -25,7 +25,7 @@ pred_with_leaf = predict(bst, dtest, predleaf = TRUE)
 head(pred_with_leaf)
 
 create.new.tree.features <- function(model, original.features){
-  pred_with_leaf = predict(model, original.features, predleaf = TRUE)
+  pred_with_leaf <- predict(model, original.features, predleaf = TRUE)
   cols <- list()
   for(i in 1:length(trees)){
     # max is not the real max but it s not important for the purpose of adding features
@@ -49,4 +49,4 @@ bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread =
 accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
 
 # Here the accuracy was already good and is now perfect.
-print(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd
new file mode 100644
index 000000000..1e75cab8d
--- /dev/null
+++ b/R-package/man/xgb.create.features.Rd
@@ -0,0 +1,88 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/xgb.create.features.R
+\name{xgb.create.features}
+\alias{xgb.create.features}
+\title{Create new features from a previously learned model}
+\usage{
+xgb.create.features(model, training.data)
+}
+\arguments{
+\item{model}{decision tree boosting model learned on the original data}
+
+\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)}
+}
+\value{
+\code{dgCMatrix} matrix including both the original data and the new features.
+}
+\description{
+May improve learning by adding new features to the training data based on the decision trees from a previously learned model.
+} +\details{ +This is the function inspired from the paragraph 3.1 of the paper: + +\strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"} + +\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +Joaquin QuiƱonero Candela)} + +International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 + +\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. + +Extract explaining the method: + +"\emph{We found that boosted decision trees are a powerful and very +convenient way to implement non-linear and tuple transformations +of the kind we just described. We treat each individual +tree as a categorical feature that takes as value the +index of the leaf an instance ends up falling in. We use +1-of-K coding of this type of features. + +For example, consider the boosted tree model in Figure 1 with 2 subtrees, +where the first subtree has 3 leafs and the second 2 leafs. If an +instance ends up in leaf 2 in the first subtree and leaf 1 in +second subtree, the overall input to the linear classifier will +be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +correspond to the leaves of the first subtree and last 2 to +those of the second subtree. + +[...] + +We can understand boosted decision tree +based transformation as a supervised feature encoding that +converts a real-valued vector into a compact binary-valued +vector. A traversal from root node to a leaf node represents +a rule on certain features.}" +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) + +param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic') +nround = 4 + +bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2) + +# Model accuracy without new features +accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Convert previous features to one hot encoding +new.features.train <- xgb.create.features(model = bst, agaricus.train$data) +new.features.test <- xgb.create.features(model = bst, agaricus.test$data) + +# learning with new features +new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) +new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) +watchlist <- list(train = new.dtrain) +bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2) + +# Model accuracy with new features +accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Here the accuracy was already good and is now perfect. +cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n")) + +} + diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 0d59ba556..1f845a1f9 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -22,8 +22,7 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL, A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. } \description{ -Read a xgboost model text dump. 
-Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now).
+Create a \code{data.table} of the most important features of a model.
 }
 \details{
 This is the function to understand the model trained (and through your model, your data).