Merge pull request #666 from pommedeterresautee/master

Code cleaning + doc improvement #Rstat
2015-12-02 16:11:17 +01:00 · 2015-12-02 16:11:17 +01:00 · 3c260c545d
commit 3c260c545d
parent 88e7c6012b db922e8c88
11 changed files with 50 additions and 24 deletions
--- a/R-package/R/predict.xgb.Booster.R
+++ b/R-package/R/predict.xgb.Booster.R
@ -20,6 +20,17 @@ setClass("xgb.Booster",
 #'  only valid for gbtree, but not for gblinear. set it to be value bigger 
 #'  than 0. It will use all trees by default.
 #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object.
+#' 
+#' @details  
+#' The purpose of the \code{ntreelimit} option is to let the user train a model with lots 
+#' of trees but use only the first trees for prediction to avoid overfitting 
+#' (without having to train a new model with fewer trees).
+#' 
+#' The \code{predleaf} option is inspired by §3.1 of the paper 
+#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+#' The idea is to use the model as a generator of new features that capture non-linear links 
+#' among the original features.
+#' 
 #' @examples
 #' data(agaricus.train, package='xgboost')
 #' data(agaricus.test, package='xgboost')
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@ -25,14 +25,17 @@
 #' Results are returned for both linear and tree models.
 #' 
 #' \code{data.table} is returned by the function. 
-#' There are 3 columns :
+#' The columns are :
 #' \itemize{
-#'   \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-#'   \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-#'   \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-#'   \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+#'   \item \code{Features} names of the features as provided in \code{feature_names} or already present in the model dump;
+#'   \item \code{Gain} contribution of each feature to the model. For a boosted tree model, the gain of each feature in each tree is taken into account, then averaged per feature to give a view of the entire model. The highest percentage indicates the most important feature for predicting the \code{label} used for training (only available for tree models);
+#'   \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+#'   \item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
 #' }
 #' 
+#' If you don't provide feature names, the indices of the features are used instead.
+#' They are extracted from the boost dump (made on the C++ side), so the index starts at 0 (usual in C++) instead of 1 (usual in R).
+#' 
 #' Co-occurence count
 #' ------------------
 #' 
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@ -102,9 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T)

 # Finally, you can check which features are the most important.
 print("Most important features (look at column Gain):")
-imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")
+imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst)
 print(imp_matrix)

 # Feature importance bar plot by gain
 print("Feature importance Plot : ")
-print(xgb.plot.importance(imp_matrix))
+print(xgb.plot.importance(importance_matrix = imp_matrix))
--- a/R-package/demo/boost_from_prediction.R
+++ b/R-package/demo/boost_from_prediction.R
@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain)
 setinfo(dtest, "base_margin", ptest)

 print('this is result of boost from initial prediction')
-bst <- xgb.train( param, dtrain, 1, watchlist )
+bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist)
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@ -67,10 +67,9 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y]
 cat("Learning...\n")
 bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
               eta = 1, nthread = 2, nround = 10,objective = "binary:logistic")
-xgb.dump(bst, 'xgb.model.dump', with.stats = T)

 # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix.
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 print(importance)
 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).

--- a/R-package/demo/cross_validation.R
+++ b/R-package/demo/cross_validation.R
@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) {
 param <- list(max.depth=2,eta=1,silent=1,
              objective = logregobj, eval_metric = evalerror)
 # train with customized objective
-xgb.cv(param, dtrain, nround, nfold = 5)
+xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5)

 # do cross validation with prediction values for each fold
-res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE)
+res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE)
 res$dt
 length(res$pred)
--- a/R-package/demo/predict_leaf_indices.R
+++ b/R-package/demo/predict_leaf_indices.R
@ -2,15 +2,15 @@ require(xgboost)
 # load in the agaricus dataset
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label)

-param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic')
+param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic')
 watchlist <- list(eval = dtest, train = dtrain)
 nround = 5

 # training the model for two rounds
-bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist)
+bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2, watchlist = watchlist)
 cat('start testing prediction from first n trees\n')

 ### predict using first 2 tree
--- a/R-package/man/predict-xgb.Booster-method.Rd
+++ b/R-package/man/predict-xgb.Booster-method.Rd
@ -31,6 +31,16 @@ than 0. It will use all trees by default.}
 \description{
 Predicted values based on xgboost model object.
 }
+\details{
+The purpose of the \code{ntreelimit} option is to let the user train a model with lots 
+of trees but use only the first trees for prediction to avoid overfitting 
+(without having to train a new model with fewer trees).
+
+The \code{predleaf} option is inspired by §3.1 of the paper 
+\code{Practical Lessons from Predicting Clicks on Ads at Facebook}.
+The idea is to use the model as a generator of new features that capture non-linear links 
+among the original features.
+}
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
--- a/R-package/man/xgb.importance.Rd
+++ b/R-package/man/xgb.importance.Rd
@ -31,14 +31,17 @@ This is the function to understand the model trained (and through your model, yo
 Results are returned for both linear and tree models.

 \code{data.table} is returned by the function. 
-There are 3 columns :
+The columns are :
 \itemize{
-  \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump.
-  \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ;
-  \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ;
-  \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning.
+  \item \code{Features} names of the features as provided in \code{feature_names} or already present in the model dump;
+  \item \code{Gain} contribution of each feature to the model. For a boosted tree model, the gain of each feature in each tree is taken into account, then averaged per feature to give a view of the entire model. The highest percentage indicates the most important feature for predicting the \code{label} used for training (only available for tree models);
+  \item \code{Cover} metric of the number of observations related to this feature (only available for tree models);
+  \item \code{Weight} percentage representing the relative number of times a feature has been used in trees.
 }

+If you don't provide feature names, the indices of the features are used instead.
+They are extracted from the boost dump (made on the C++ side), so the index starts at 0 (usual in C++) instead of 1 (usual in R).
+
 Co-occurence count
 ------------------

--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@ -190,7 +190,7 @@ Measure feature importance
 In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature).

 ```{r}
-importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst)
+importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst)
 head(importance)
 ```

@ -213,7 +213,7 @@ One simple solution is to count the co-occurrences of a feature and a class of t
 For that purpose we will execute the same function as above but using two more parameters, `data` and `label`.

 ```{r}
-importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)
+importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector)

 # Cleaning for better display
 importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)]
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@ -345,7 +345,7 @@ Feature importance is similar to R gbm package's relative influence (rel.inf).
 ```
 importance_matrix <- xgb.importance(model = bst)
 print(importance_matrix)
-xgb.plot.importance(importance_matrix)
+xgb.plot.importance(importance_matrix = importance_matrix)
 ```

 View the trees from a model