Merge pull request #666 from pommedeterresautee/master
Code cleaning + doc improvement #Rstat
commit 3c260c545d
@@ -20,6 +20,17 @@ setClass("xgb.Booster",
 #' only valid for gbtree, but not for gblinear. set it to be value bigger
 #' than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
+#'
+#' @details
+#' The purpose of the \code{ntreelimit} option is to let the user train a model with many
+#' trees but use only the first ones for prediction, to avoid overfitting
+#' (without having to train a new model with fewer trees).
+#'
+#' The \code{predleaf} option is inspired by §3.1 of the paper
+#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+#' The idea is to use the model as a generator of new features that capture
+#' non-linear relationships between the original features.
+#'
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
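A minimal sketch of how the two documented options are used at prediction time, assuming only the bundled agaricus data and the arguments described in the hunk above (not part of the commit itself):

```r
library(xgboost)

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

# train a small tree booster on the bundled mushroom data
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nthread = 2, nround = 5,
               objective = "binary:logistic")

# use only the first 2 trees for prediction (no need to retrain a smaller model)
pred_2trees <- predict(bst, agaricus.test$data, ntreelimit = 2)

# return the leaf index of each observation in each tree instead of a probability;
# the result is a matrix with one column per tree
leaf_idx <- predict(bst, agaricus.test$data, predleaf = TRUE)
dim(leaf_idx)
```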
@@ -25,14 +25,17 @@
 #' Results are returned for both linear and tree models.
 #'
 #' \code{data.table} is returned by the function.
-#' There are 3 columns :
+#' The columns are:
 #' \itemize{
-#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
+#' \item \code{Features} names of the features, as provided in \code{feature_names} or as present in the model dump;
-#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
+#' \item \code{Gain} contribution of each feature to the model. For boosted tree models, the gain of each feature in each tree is taken into account, then averaged per feature to give a view of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for training (only available for tree models);
-#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
+#' \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
-#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+#' \item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
 #' }
 #'
+#' If you don't provide feature names, feature indices are used instead.
+#' They are extracted from the booster dump (made on the C++ side); indices start at 0 (as usual in C++) instead of 1 (as usual in R).
+#'
 #' Co-occurrence count
 #' ------------------
 #'
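A short sketch of the documented return value, assuming the bundled agaricus data and the `feature_names`/`model` interface shown elsewhere in this diff:

```r
library(xgboost)

data(agaricus.train, package = 'xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nthread = 2, nround = 5,
               objective = "binary:logistic")

# the booster is passed directly through `model`; no dump file is needed anymore
imp <- xgb.importance(feature_names = agaricus.train$data@Dimnames[[2]], model = bst)

# a data.table with one row per feature and the columns described above
print(imp)
```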
@@ -102,9 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)
 
 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")
+imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
 print(imp_matrix)
 
 # Feature importance bar plot by gain
 print("Feature importance Plot : ")
-print(xgb.plot.importance(imp_matrix))
+print(xgb.plot.importance(importance_matrix = imp_matrix))
@@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain)
 setinfo(dtest, "base_margin", ptest)
 
 print('this is result of boost from initial prediction')
-bst <- xgb.train( param, dtrain, 1, watchlist )
+bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)
@@ -67,10 +67,9 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 cat("Learning...\n")
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
                eta = 1, nthread = 2, nround = 10, objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)
 
 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 print(importance)
 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
 
@@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) {
 param <- list(max.depth=2, eta=1, silent=1,
               objective = logregobj, eval_metric = evalerror)
 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5)
+xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)
 
 # do cross validation with prediction values for each fold
-res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
 res$dt
 length(res$pred)
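The hunk references `logregobj` and `evalerror` without showing their bodies; a sketch of what such a custom objective and evaluation metric typically look like (assumed definitions following the usual log-loss gradient/hessian form, not taken from this diff):

```r
# gradient and hessian of the log-loss, used as a user-defined objective
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))   # raw margin -> probability
  grad <- preds - labels
  hess <- preds * (1 - preds)
  list(grad = grad, hess = hess)
}

# classification error computed on the raw margin (threshold at 0)
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  list(metric = "error", value = err)
}
```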
@@ -2,15 +2,15 @@ require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)
 
-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 5
 
 # training the model for two rounds
-bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)
 cat('start testing prediction from first n trees\n')
 
 ### predict using first 2 tree
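The hunk stops at the `### predict using first 2 tree` heading; a sketch of the step that would follow, using the `ntreelimit` argument documented earlier in this diff (the `ypred` name is illustrative):

```r
# prediction restricted to the first 2 trees of the ensemble
ypred <- predict(bst, dtest, ntreelimit = 2)
labels <- getinfo(dtest, "label")
cat('error of ypred (first 2 trees) =', mean(as.numeric(ypred > 0.5) != labels), '\n')
```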
@@ -31,6 +31,16 @@ than 0. It will use all trees by default.}
 \description{
 Predicted values based on xgboost model object.
 }
+\details{
+The purpose of the \code{ntreelimit} option is to let the user train a model with many
+trees but use only the first ones for prediction, to avoid overfitting
+(without having to train a new model with fewer trees).
+
+The \code{predleaf} option is inspired by §3.1 of the paper
+\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+The idea is to use the model as a generator of new features that capture
+non-linear relationships between the original features.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
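A sketch of the "generator of new features" idea from the new \details block: leaf indices returned by `predleaf = TRUE` are one-hot encoded into a new sparse feature matrix (all object names are illustrative; `sparse.model.matrix` comes from the Matrix package, not from xgboost):

```r
library(xgboost)
library(Matrix)

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 4, eta = 1, nthread = 2, nround = 4,
               objective = "binary:logistic")

# one column per tree; each entry is the index of the leaf the observation lands in
leaves <- predict(bst, agaricus.test$data, predleaf = TRUE)

# treat each tree's leaf index as a categorical value and one-hot encode it;
# the resulting sparse matrix can feed a linear model, as in the Facebook paper
leaf_df <- as.data.frame(leaves)
leaf_df[] <- lapply(leaf_df, as.factor)
new_features <- sparse.model.matrix(~ . - 1, data = leaf_df)
dim(new_features)
```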
@@ -31,14 +31,17 @@ This is the function to understand the model trained (and through your model, yo
 Results are returned for both linear and tree models.
 
 \code{data.table} is returned by the function.
-There are 3 columns :
+The columns are:
 \itemize{
-\item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
+\item \code{Features} names of the features, as provided in \code{feature_names} or as present in the model dump;
-\item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
+\item \code{Gain} contribution of each feature to the model. For boosted tree models, the gain of each feature in each tree is taken into account, then averaged per feature to give a view of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for training (only available for tree models);
-\item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
+\item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
-\item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+\item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
 }
 
+If you don't provide feature names, feature indices are used instead.
+They are extracted from the booster dump (made on the C++ side); indices start at 0 (as usual in C++) instead of 1 (as usual in R).
+
 Co-occurrence count
 ------------------
 
@@ -190,7 +190,7 @@ Measure feature importance
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).
 
 ```{r}
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```
 
@@ -213,7 +213,7 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.
 
 ```{r}
-importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
 
 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
@@ -345,7 +345,7 @@ Feature importance is similar to R gbm package's relative influence (rel.inf).
 ```
 importance_matrix <- xgb.importance(model = bst)
 print(importance_matrix)
-xgb.plot.importance(importance_matrix)
+xgb.plot.importance(importance_matrix = importance_matrix)
 ```
 
 View the trees from a model
|
|||||||
Loading…
x
Reference in New Issue
Block a user