Fix for CRAN Submission (#1826)
* fix cran check
* change required R version because of utils::globalVariables
* temporary commit, monotone not working
* fix test
* fix doc
* fix doc
* fix cran note and warning
* improve checks
* fix urls
Parent: 27ca50e2c2
Commit: 2f3958a455
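One of the commit-message items above is "change required R version because of utils::globalVariables". For context, a minimal sketch of the usual idiom (not code from this commit; the variable names are illustrative, not the package's actual list): `utils::globalVariables()` only exists from R 2.15.1 onwards, so declaring non-standard-evaluation variables this way is what raises the package's minimum required R version.

```r
# Hypothetical illustration only: register names used in non-standard
# evaluation (e.g. data.table column names) so that "R CMD check" stops
# reporting "no visible binding for global variable".
if (getRversion() >= "2.15.1") {
  utils::globalVariables(c(".", "Feature", "Gain"))  # illustrative names
}
```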
@@ -57,7 +57,8 @@
 #' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
 #'
 #' # Model accuracy without new features
-#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
+#' length(agaricus.test$label)
 #'
 #' # Convert previous features to one hot encoding
 #' new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
@@ -70,10 +71,12 @@
 #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
 #'
 #' # Model accuracy with new features
-#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
+#' length(agaricus.test$label)
 #'
 #' # Here the accuracy was already good and is now perfect.
-#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n"))
+#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
+#' accuracy.after, "!\n"))
 #'
 #' @export
 xgb.create.features <- function(model, data, ...){
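For readers skimming the diff, a rough editorial sketch of the idea behind `xgb.create.features()` (not part of this commit; assumes `bst` and the agaricus data from the documented example): each observation's leaf index in every tree is extracted and then one-hot encoded into new features.

```r
# Editorial sketch only: the per-tree leaf indices that xgb.create.features()
# one-hot encodes can be obtained directly with predleaf = TRUE.
leaves <- predict(bst, agaricus.train$data, predleaf = TRUE)
dim(leaves)  # one row per observation, one column per tree
```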
@@ -44,7 +44,8 @@
 #' xgb.importance(colnames(agaricus.train$data), model = bst)
 #'
 #' # Same thing with co-occurence computation this time
-#' xgb.importance(colnames(agaricus.train$data), model = bst, data = agaricus.train$data, label = agaricus.train$label)
+#' xgb.importance(colnames(agaricus.train$data), model = bst,
+#' data = agaricus.train$data, label = agaricus.train$label)
 #'
 #' @export
 xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){
@@ -46,7 +46,8 @@
 #'
 #' data(agaricus.train, package='xgboost')
 #'
-#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+#' # Change max_depth to a higher number to get a more significant result
+#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
 #' eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
 #' subsample = 0.5, min_child_weight = 2)
 #'
@@ -39,7 +39,8 @@
 #' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
 #' min_child_weight = 50)
 #'
-#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), features_keep = 3)
+#' p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
+#' features_keep = 3)
 #' print(p)
 #'
 #' @export
@@ -1,8 +1,8 @@
 XGBoost R Package for Scalable GBM
 ==================================
 
-[](http://cran.r-project.org/web/packages/xgboost)
-[](http://cran.rstudio.com/web/packages/xgboost/index.html)
+[](https://cran.r-project.org/web/packages/xgboost)
+[](https://cran.rstudio.com/web/packages/xgboost/index.html)
 [](http://xgboost.readthedocs.org/en/latest/R-package/index.html)
 
 Resources
@@ -28,7 +28,7 @@ install.packages("xgboost", repos=c("http://dmlc.ml/drat/", getOption("repos")),
 latest version of R package.
 For up-to-date version, please install from github.
 
-Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first. They also need to download [MinGW-W64](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe) using x86_64 architecture during installation.
+Windows users will need to install [RTools](https://cran.r-project.org/bin/windows/Rtools/) first. They also need to download [MinGW-W64](http://iweb.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/installer/mingw-w64-install.exe) using x86_64 architecture during installation.
 
 Run the following command to add MinGW to PATH in Windows if not already added.
 
@@ -9,3 +9,4 @@ create_sparse_matrix Create Sparse Matrix
 predict_leaf_indices Predicting the corresponding leaves
 early_stopping Early Stop in training
 poisson_regression Poisson Regression on count data
+tweedie_regression Tweedie Regression
@@ -10,3 +10,4 @@ demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
 demo(caret_wrapper)
+demo(tweedie_regression)
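The hunks above only register the new `tweedie_regression` demo. As a hedged illustration of what Tweedie regression with xgboost looks like (not the demo's actual code; the data are simulated here):

```r
library(xgboost)
set.seed(1)
# Simulated non-negative, zero-inflated response, typical Tweedie territory
x <- matrix(rnorm(100 * 5), ncol = 5)
y <- rpois(100, lambda = 1) * runif(100)
# "reg:tweedie" with 1 < tweedie_variance_power < 2 sits between Poisson and gamma
bst <- xgboost(data = x, label = y, nrounds = 20, nthread = 2,
               objective = "reg:tweedie", tweedie_variance_power = 1.5)
```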
@@ -46,8 +46,8 @@ number of columns corresponding to the number of trees.
 Predicted values based on either xgboost model or model handle object.
 }
 \details{
-Note that \code{ntreelimit} is not necesserily equal to the number of boosting iterations
-and it is not necesserily equal to the number of trees in a model.
+Note that \code{ntreelimit} is not necessarily equal to the number of boosting iterations
+and it is not necessarily equal to the number of trees in a model.
 E.g., in a random forest-like model, \code{ntreelimit} would limit the number of trees.
 But for multiclass classification, there are multiple trees per iteration,
 but \code{ntreelimit} limits the number of boosting iterations.
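A small hedged example of the `ntreelimit` behaviour described in the corrected text (reusing the agaricus data; for a binary model iterations and trees coincide, which is exactly the distinction the paragraph draws for multiclass and random-forest-like models):

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max_depth = 2, eta = 1, nrounds = 5, nthread = 2,
               objective = "binary:logistic")
# Use only the first 2 boosting iterations for prediction; in a multiclass
# model this would still be 2 iterations but num_class * 2 trees.
p2 <- predict(bst, agaricus.test$data, ntreelimit = 2)
```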
@@ -45,7 +45,7 @@ stored together with the model's binary representation, and accessed later
 (from R or any other interface).
 In contrast, any R-attribute assigned to an R-object of \code{xgb.Booster} class
 would not be saved by \code{xgb.save} because an xgboost model is an external memory object
-and its serialization is handled extrnally.
+and its serialization is handled externally.
 Also, setting an attribute that has the same name as one of xgboost's parameters wouldn't
 change the value of that parameter for a model.
 Use \code{\link{xgb.parameters<-}} to set or change model parameters.
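A hedged sketch of the distinction the corrected paragraph makes between serialized xgboost attributes and plain R attributes (file path and attribute names are hypothetical; assumes a trained booster `bst`):

```r
fname <- file.path(tempdir(), "xgb.model")          # hypothetical path
xgb.attr(bst, "note") <- "stored inside the model"  # kept by xgb.save
attr(bst, "r_note")   <- "plain R attribute"        # not serialized
xgb.save(bst, fname)
bst2 <- xgb.load(fname)
xgb.attr(bst2, "note")  # still available after reload
attr(bst2, "r_note")    # NULL: lost, as the documentation explains
```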
@@ -68,7 +68,8 @@ nround = 4
 bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2)
 
 # Model accuracy without new features
-accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) /
+length(agaricus.test$label)
 
 # Convert previous features to one hot encoding
 new.features.train <- xgb.create.features(model = bst, agaricus.train$data)
@@ -81,10 +82,12 @@ watchlist <- list(train = new.dtrain)
 bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2)
 
 # Model accuracy with new features
-accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label)
+accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) /
+length(agaricus.test$label)
 
 # Here the accuracy was already good and is now perfect.
-cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n"))
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now",
+accuracy.after, "!\\n"))
 
 }
 
@@ -118,7 +118,7 @@ An object of class \code{xgb.cv.synchronous} with the following elements:
 }
 }
 \description{
-The cross valudation function of xgboost
+The cross validation function of xgboost
 }
 \details{
 The original sample is randomly partitioned into \code{nfold} equal size subsamples.
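A minimal hedged example of the cross-validation function whose description is corrected above (assumes the agaricus data used throughout the package examples):

```r
data(agaricus.train, package = "xgboost")
# nfold = 5 partitions the sample into 5 equal-size subsamples, as the
# \details section describes.
cv <- xgb.cv(params = list(objective = "binary:logistic", max_depth = 2, eta = 1),
             data = agaricus.train$data, label = agaricus.train$label,
             nrounds = 10, nfold = 5, nthread = 2, metrics = "error")
```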
@@ -14,7 +14,7 @@ xgb.importance(feature_names = NULL, model = NULL, data = NULL,
 
 \item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
 
-\item{label}{the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
+\item{label}{the label vector used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.}
 
 \item{target}{a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional.}
 }
@@ -28,7 +28,7 @@ Create a \code{data.table} of the most important features of a model.
 This function is for both linear and tree models.
 
 \code{data.table} is returned by the function.
-The columns are :
+The columns are:
 \itemize{
 \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump;
 \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models);
@@ -47,7 +47,7 @@ The gain gives you indication about the information of how a feature is importan
 
 Co-occurence computation is here to help in understanding this relation between a predictor and a specific class. It will count how many observations are returned as \code{TRUE} by the \code{target} function (see parameters). When you execute the example below, there are 92 times only over the 3140 observations of the train dataset where a mushroom have no odor and can be eaten safely.
 
-If you need to remember one thing only: until you want to leave us early, don't eat a mushroom which has no odor :-)
+If you need to remember only one thing: unless you want to leave us early, don't eat a mushroom which has no odor :-)
 }
 \examples{
 data(agaricus.train, package='xgboost')
@@ -58,7 +58,8 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
 xgb.importance(colnames(agaricus.train$data), model = bst)
 
 # Same thing with co-occurence computation this time
-xgb.importance(colnames(agaricus.train$data), model = bst, data = agaricus.train$data, label = agaricus.train$label)
+xgb.importance(colnames(agaricus.train$data), model = bst,
+data = agaricus.train$data, label = agaricus.train$label)
 
 }
 
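The example above uses the default co-occurence `target`. As a hedged illustration (hypothetical, not in the commit), a custom one-argument `target` following the documented contract could count every observation that satisfies the split condition regardless of its label:

```r
# Hypothetical variant: x is the 0/1 vector obtained after applying the split
# condition, so this target counts all observations meeting the condition.
xgb.importance(colnames(agaricus.train$data), model = bst,
               data = agaricus.train$data, label = agaricus.train$label,
               target = function(x) x == 1)
```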
@@ -56,7 +56,8 @@ This function was inspired by the blog post
 
 data(agaricus.train, package='xgboost')
 
-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+# Change max_depth to a higher number to get a more significant result
+bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
 eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
 subsample = 0.5, min_child_weight = 2)
 
@@ -53,7 +53,8 @@ bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_dep
 eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
 min_child_weight = 50)
 
-p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data), features_keep = 3)
+p <- xgb.plot.multi.trees(model = bst, feature_names = colnames(agaricus.train$data),
+features_keep = 3)
 print(p)
 
 }
@@ -57,7 +57,7 @@ drat:::addRepo("dmlc")
 install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
 ```
 
-> *Windows* user will need to install [Rtools](http://cran.r-project.org/bin/windows/Rtools/) first.
+> *Windows* user will need to install [Rtools](https://cran.r-project.org/bin/windows/Rtools/) first.
 
 ### CRAN version
 
@@ -68,7 +68,7 @@ The version 0.4-2 is on CRAN, and you can install it by:
 install.packages("xgboost")
 ```
 
-Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost)
+Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost)
 
 ## Learning
 
@@ -107,7 +107,7 @@ train <- agaricus.train
 test <- agaricus.test
 ```
 
-> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/splitting.html).
+> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of the purpose of this article, however `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
 
 Each variable is a `list` containing two things, `label` and `data`:
 
@@ -294,7 +294,7 @@ bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nthread = 2, nrounds=2, watchl
 
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
 
-If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
+If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/data-splitting.html).
 
 For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
 