Merge pull request #679 from pommedeterresautee/master

Wording of R doc in new functions
This commit is contained in:
Michaël Benesty 2015-12-08 21:45:17 +01:00
commit 2d2f92631c
8 changed files with 42 additions and 54 deletions

View File

@ -14,7 +14,7 @@
#' @details
#' This function is inspired by paragraph 3.1 of the paper:
#'
#' \strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
#'
#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
#' Joaquin Quiñonero Candela)}
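A minimal sketch of the idea from paragraph 3.1 (the index of the leaf reached in each tree becomes a new categorical feature), assuming the standard xgboost R API with predict(..., predleaf = TRUE); it illustrates the technique rather than the exact code of this file:

require(xgboost)
data(agaricus.train, package = 'xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
               eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

# One row per observation, one column per tree; each value is the index of the
# leaf the observation falls into.
leaf.idx <- predict(bst, agaricus.train$data, predleaf = TRUE)

# One-hot encoding these leaf indices and binding them to the original sparse
# matrix gives the additional features described in the paper.
head(leaf.idx)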

View File

@ -21,7 +21,7 @@
#' @details
#' This function helps to understand the trained model (and, through the model, your data).
#'
#' Results are returned for both linear and tree models.
#' This function is for both linear and tree models.
#'
#' A \code{data.table} is returned by the function.
#' The columns are:
@ -32,8 +32,9 @@
#' \item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
#' }
#'
#' If you don't provide name, index of the features are used.
#' They are extracted from the boost dump (made on the C++ side), the index starts at 0 (usual in C++) instead of 1 (usual in R).
#' If you don't provide \code{feature_names}, the index of the features will be used instead.
#'
#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
#'
#' Co-occurrence count
#' -------------------
@ -47,10 +48,6 @@
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' # Both dataset are list with two items, a sparse matrix and labels
#' # (labels = outcome column which will be learned).
#' # Each column of the sparse Matrix is a feature in one hot encoding format.
#'
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
@ -114,8 +111,6 @@ xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, labe
result
}
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table...
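A hedged illustration of the details above, reusing the bst model from the example and assuming colnames() on the sparse training matrix returns the feature names:

# Pass the column names of the sparse matrix as feature_names; without them the
# features are reported by their 0-based index from the C++ model dump.
importance <- xgb.importance(feature_names = colnames(agaricus.train$data), model = bst)

# A data.table with one row per feature and the Gain / Cover / Weight columns
# described above.
print(importance)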

View File

@ -1,6 +1,6 @@
#' Convert tree model dump to data.table
#' Parse boosted tree model text dump
#'
#' Read a tree model text dump and return a data.table.
#' Parse a boosted tree model text dump and return a \code{data.table}.
#'
#' @importFrom data.table data.table
#' @importFrom data.table set
@ -13,17 +13,19 @@
#' @importFrom stringr str_extract
#' @importFrom stringr str_split
#' @importFrom stringr str_trim
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.
#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.
#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).
#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.
#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).
#' @param model object created by the \code{xgb.train} function.
#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).
#' @param n_first_tree limit the parsing to the first \code{n} trees. If set to \code{NULL}, all trees of the model are parsed. Performance can be low depending on the size of the model.
#'
#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing.
#' @return A \code{data.table} of the features used in the model with their gain, cover and a few other details.
#'
#' @details
#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it.
#' General function to convert a text dump of tree model to a \code{data.table}.
#'
#' The content of the \code{data.table} is organised that way:
#' The purpose is to help the user explore the model and get a better understanding of it.
#'
#' The columns of the \code{data.table} are:
#'
#' \itemize{
#' \item \code{ID}: unique identifier of a node;
@ -35,21 +37,16 @@
#' \item \code{Quality}: the gain related to the split in this specific node;
#' \item \code{Cover}: metric measuring the number of observations affected by the split;
#' \item \code{Tree}: ID of the tree. It is included in the main \code{ID};
#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
#' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the nodes pointed to by the \code{Yes} or \code{No} columns;
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#'
#' #Both dataset are list with two items, a sparse matrix and labels
#' #(labels = outcome column which will be learned).
#' #Each column of the sparse Matrix is a feature in one hot encoding format.
#' train <- agaricus.train
#'
#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
#' eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#'
#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix.
#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst)
#'
#' @export
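As a hedged sketch of the text argument documented above (with.stats = TRUE so the dump keeps the per-split statistics), reusing the bst model from the example:

# Dump the model to a character vector, one element per line of the dump.
dump <- xgb.dump(bst, with.stats = TRUE)

# Parse it into a data.table: one row per node, with the columns described in
# the list above (ID, Quality, Cover, Tree, and so on).
dt <- xgb.model.dt.tree(feature_names = colnames(agaricus.train$data), text = dump)
print(dt)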

View File

@ -76,6 +76,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' @details
#' Display both the number of \code{leaf} nodes and the distribution of \code{weighted observations}
#' by tree deepness level.
#'
#' The purpose of this function is to help the user find appropriate values for the
#' \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
#'
@ -88,7 +89,7 @@ get.paths.to.leaf <- function(dt.tree) {
#' \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
#' }
#'
#' This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
#'
#' @examples
#' data(agaricus.train, package='xgboost')
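A hedged sketch of how the deepness plot described above is typically produced (the model argument name is an assumption, check the function signature; a deeper model makes the depth distribution more interesting):

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 6, min_child_weight = 50,
               eta = 1, nthread = 2, nround = 5, objective = "binary:logistic")

# Two stacked plots: the number of leaves per depth level and the normalized
# weighted cover per leaf, as described in the details above.
xgb.plot.deepness(model = bst)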

View File

@ -20,7 +20,7 @@ May improve the learning by adding new features to the training data based on th
\details{
This function is inspired by paragraph 3.1 of the paper:
\strong{"Practical Lessons from Predicting Clicks on Ads at Facebook"}
\strong{Practical Lessons from Predicting Clicks on Ads at Facebook}
\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers,
Joaquin Quiñonero Candela)}

View File

@ -27,7 +27,7 @@ Create a \code{data.table} of the most important features of a model.
\details{
This function helps to understand the trained model (and, through the model, your data).
Results are returned for both linear and tree models.
This function is for both linear and tree models.
A \code{data.table} is returned by the function.
The columns are:
@ -38,8 +38,9 @@ The columns are :
\item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
}
If you don't provide name, index of the features are used.
They are extracted from the boost dump (made on the C++ side), the index starts at 0 (usual in C++) instead of 1 (usual in R).
If you don't provide \code{feature_names}, the index of the features will be used instead.
Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R).
Co-occurrence count
-------------------
@ -53,10 +54,6 @@ If you need to remember one thing only: until you want to leave us early, don't
\examples{
data(agaricus.train, package='xgboost')
# Both dataset are list with two items, a sparse matrix and labels
# (labels = outcome column which will be learned).
# Each column of the sparse Matrix is a feature in one hot encoding format.
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")

View File

@ -2,30 +2,32 @@
% Please edit documentation in R/xgb.model.dt.tree.R
\name{xgb.model.dt.tree}
\alias{xgb.model.dt.tree}
\title{Convert tree model dump to data.table}
\title{Parse boosted tree model text dump}
\usage{
xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL,
n_first_tree = NULL)
}
\arguments{
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.}
\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).}
\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.}
\item{model}{object created by the \code{xgb.train} function.}
\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).}
\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).}
\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.}
\item{n_first_tree}{limit the parsing to the first \code{n} trees. If set to \code{NULL}, all trees of the model are parsed. Performance can be low depending on the size of the model.}
}
\value{
A \code{data.table} of the features used in the model with their gain, cover and few other thing.
A \code{data.table} of the features used in the model with their gain, cover and a few other details.
}
\description{
Read a tree model text dump and return a data.table.
Parse a boosted tree model text dump and return a \code{data.table}.
}
\details{
General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it.
General function to convert a text dump of tree model to a \code{data.table}.
The content of the \code{data.table} is organised that way:
The purpose is to help the user explore the model and get a better understanding of it.
The columns of the \code{data.table} are:
\itemize{
\item \code{ID}: unique identifier of a node;
@ -37,21 +39,16 @@ The content of the \code{data.table} is organised that way:
\item \code{Quality}: the gain related to the split in this specific node;
\item \code{Cover}: metric measuring the number of observations affected by the split;
\item \code{Tree}: ID of the tree. It is included in the main \code{ID};
\item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ;
\item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the nodes pointed to by the \code{Yes} or \code{No} columns;
}
}
\examples{
data(agaricus.train, package='xgboost')
#Both dataset are list with two items, a sparse matrix and labels
#(labels = outcome column which will be learned).
#Each column of the sparse Matrix is a feature in one hot encoding format.
train <- agaricus.train
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix.
# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)
}

View File

@ -18,6 +18,7 @@ Generate a graph to plot the distribution of deepness among trees.
\details{
Display both the number of \code{leaf} nodes and the distribution of \code{weighted observations}
by tree deepness level.
The purpose of this function is to help the user find appropriate values for the
\code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off.
@ -30,7 +31,7 @@ The graph is made of two parts:
\item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
}
This function is inspired by this blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html}
}
\examples{
data(agaricus.train, package='xgboost')