Updates from 1.2.0 cran submission (#6077)

* update for 1.2.0 cran submission

* recover cmakelists

* fix unittest from the shap PR

* trigger CI
Tong He 2020-09-02 20:50:23 +08:00 committed by GitHub
parent 9be969cc7a
commit 3912f3de06
23 changed files with 203 additions and 38 deletions

@@ -170,7 +170,7 @@ jobs:
run: |
cd R-package
R.exe CMD INSTALL .
Rscript.exe tests/run_lint.R
Rscript.exe tests/helper_scripts/run_lint.R
test-with-R:
runs-on: ${{ matrix.config.os }}

@@ -134,14 +134,16 @@ Rpack: clean_all
sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win
rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it
bash R-package/remove_warning_suppression_pragma.sh
bash xgboost/remove_warning_suppression_pragma.sh
rm xgboost/remove_warning_suppression_pragma.sh
rm -rfv xgboost/tests/helper_scripts/
Rbuild: Rpack
R CMD build --no-build-vignettes xgboost
rm -rf xgboost
Rcheck: Rbuild
R CMD check xgboost*.tar.gz
R CMD check --as-cran xgboost*.tar.gz
-include build/*.d
-include build/*/*.d

@@ -2,7 +2,7 @@ Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 1.3.0.1
Date: 2020-02-21
Date: 2020-08-28
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),

@@ -38,6 +38,7 @@ export(xgb.dump)
export(xgb.gblinear.history)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
export(xgb.ggplot.shap.summary)
export(xgb.importance)
export(xgb.load)
export(xgb.load.raw)
@@ -46,6 +47,7 @@ export(xgb.plot.deepness)
export(xgb.plot.importance)
export(xgb.plot.multi.trees)
export(xgb.plot.shap)
export(xgb.plot.shap.summary)
export(xgb.plot.tree)
export(xgb.save)
export(xgb.save.raw)

@@ -349,6 +349,7 @@ NULL
#' # Save as a stand-alone file (JSON); load it with xgb.load()
#' xgb.save(bst, 'xgb.model.json')
#' bst2 <- xgb.load('xgb.model.json')
#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
#'
#' # Save as a raw byte vector; load it with xgb.load.raw()
#' xgb_bytes <- xgb.save.raw(bst)
@@ -364,6 +365,7 @@ NULL
#' obj2 <- readRDS('my_object.rds')
#' # Re-construct xgb.Booster object from the bytes
#' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
#'
#' @name a-compatibility-note-for-saveRDS-save
NULL

@@ -79,7 +79,7 @@
#'
#' All observations are used for both training and validation.
#'
#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
#'
#' @return
#' An object of class \code{xgb.cv.synchronous} with the following elements:
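Since this hunk only touches the xgb.cv help text, a minimal usage sketch may help orient the reader; the dataset and parameter values below are illustrative assumptions, not part of this diff.

```r
# Minimal sketch of k-fold cross-validation with xgb.cv; the agaricus data and
# the parameter values are illustrative assumptions, not part of this diff.
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
cv <- xgb.cv(params = list(objective = "binary:logistic", max_depth = 3, eta = 0.3),
             data = dtrain, nrounds = 10, nfold = 5, metrics = "error")
print(cv)
```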

@@ -200,9 +200,9 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
#' @return A \code{ggplot2} object.
#' @export
#'
#' @examples See \code{\link{xgb.plot.shap}}.
#' @examples # See \code{\link{xgb.plot.shap}}.
#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
#' \code{\url{https://github.com/slundberg/shap}}
#' \url{https://github.com/slundberg/shap}
xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
# Only ggplot implementation is available.

@@ -130,16 +130,16 @@
#' Note that when using a customized metric, only this single metric can be used.
#' The following is the list of built-in metrics for which Xgboost provides optimized implementation:
#' \itemize{
#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
#' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
#' Different threshold (e.g., 0.) could be specified as "error@0."
#' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
#' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
#' }
#'
#' The following callbacks are automatically created when certain parameters are set:
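As a hedged illustration of the eval_metric machinery documented in the list above, one or several built-in metrics can be requested at training time; everything below is an assumption for illustration, not code from this commit.

```r
# Illustrative only: selecting built-in evaluation metrics via eval_metric.
# Dataset and parameter values are assumptions, not part of this diff.
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
params <- list(objective = "binary:logistic",
               eval_metric = "auc",       # several eval_metric entries may be given
               eval_metric = "logloss")
bst <- xgb.train(params, dtrain, nrounds = 5, watchlist = list(train = dtrain))
```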

@@ -43,6 +43,7 @@ bst2 <- xgb.load('xgb.model')
# Save as a stand-alone file (JSON); load it with xgb.load()
xgb.save(bst, 'xgb.model.json')
bst2 <- xgb.load('xgb.model.json')
if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
# Save as a raw byte vector; load it with xgb.load.raw()
xgb_bytes <- xgb.save.raw(bst)
@@ -58,5 +59,6 @@ saveRDS(obj, 'my_object.rds')
obj2 <- readRDS('my_object.rds')
# Re-construct xgb.Booster object from the bytes
bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
if (file.exists('my_object.rds')) file.remove('my_object.rds')
}

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{normalize}
\alias{normalize}
\title{Scale feature value to have mean 0, standard deviation 1}
\usage{
normalize(x)
}
\arguments{
\item{x}{Numeric vector}
}
\value{
Numeric vector with mean 0 and sd 1.
}
\description{
This is used to compare multiple features on the same plot.
Internal utility function
}
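The help entry above fully specifies the contract; a hypothetical one-line equivalent (not the package's actual implementation) is simply a z-score transform:

```r
# Hypothetical equivalent of the documented contract (not the package's code):
# centre to mean 0 and scale to standard deviation 1.
normalize_sketch <- function(x) (x - mean(x)) / sd(x)

normalize_sketch(c(2, 4, 6, 8))  # mean 0, sd 1
```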

@@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{prepare.ggplot.shap.data}
\alias{prepare.ggplot.shap.data}
\title{Combine and melt feature values and SHAP contributions for sample
observations.}
\usage{
prepare.ggplot.shap.data(data_list, normalize = FALSE)
}
\arguments{
\item{data_list}{List containing 'data' and 'shap_contrib' returned by
\code{xgb.shap.data()}.}
\item{normalize}{Whether to standardize feature values to have mean 0 and
standard deviation 1 (useful for comparing multiple features on the same
plot). Default \code{FALSE}.}
}
\value{
A data.table containing the observation ID, the feature name, the
feature value (normalized if specified), and the SHAP contribution value.
}
\description{
Conforms to data format required for ggplot functions.
}
\details{
Internal utility function.
}
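To make the documented output shape concrete, here is a rough data.table sketch of the combine-and-melt step; the helper name and body are assumptions, not the package's internal implementation.

```r
library(data.table)

# Rough sketch only: assumes data_list$data and data_list$shap_contrib are
# matrices with matching dimensions and column names, as documented above.
melt_shap_sketch <- function(data_list) {
  feat <- as.data.table(as.matrix(data_list$data))[, id := .I]
  contrib <- as.data.table(as.matrix(data_list$shap_contrib))[, id := .I]
  long_feat <- melt(feat, id.vars = "id",
                    variable.name = "feature", value.name = "feature_value")
  long_contrib <- melt(contrib, id.vars = "id",
                       variable.name = "feature", value.name = "shap_value")
  merge(long_feat, long_contrib, by = c("id", "feature"))
}
```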

@@ -154,7 +154,7 @@ The cross-validation process is then repeated \code{nrounds} times, with each of
All observations are used for both training and validation.
Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
}
\examples{
data(agaricus.train, package='xgboost')

@@ -131,6 +131,7 @@ bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot
# multiclass example - plots for each class separately:
nclass <- 3
@@ -149,6 +150,7 @@ xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
n_col = 2, col = col, pch = 16, pch_NA = 17)
xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
}
\references{

@@ -0,0 +1,78 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.shap.R
\name{xgb.ggplot.shap.summary}
\alias{xgb.ggplot.shap.summary}
\alias{xgb.plot.shap.summary}
\title{SHAP contribution dependency summary plot}
\usage{
xgb.ggplot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)
xgb.plot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}
\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}
\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}
\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}
\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}
\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}
\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}
\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}
\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
}
\value{
A \code{ggplot2} object.
}
\description{
Compare SHAP contributions of different features.
}
\details{
A point plot (each point representing one sample from \code{data}) is
produced for each feature, with the points plotted on the SHAP value axis.
Each point (observation) is coloured based on its feature value. The plot
hence allows us to see which features have a negative / positive contribution
on the model prediction, and whether the contribution is different for larger
or smaller values of the feature. We effectively try to replicate the
\code{summary_plot} function from https://github.com/slundberg/shap.
}
\examples{
# See \code{\link{xgb.plot.shap}}.
}
\seealso{
\code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
\url{https://github.com/slundberg/shap}
}
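Because the examples section above only cross-references xgb.plot.shap, a concrete call adapted from the agaricus example earlier in this diff may help; the extra training parameters are illustrative assumptions.

```r
# Adapted from the xgb.plot.shap example earlier in this diff; eta, max_depth
# and objective below are illustrative assumptions.
library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
               eta = 0.1, max_depth = 3, objective = "binary:logistic", nthread = 2)
xgb.ggplot.shap.summary(agaricus.test$data, model = bst, top_n = 12)
```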

@@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.shap.data}
\alias{xgb.shap.data}
\title{Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.}
\usage{
xgb.shap.data(
data,
shap_contrib = NULL,
features = NULL,
top_n = 1,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL,
max_observations = 1e+05
)
}
\value{
A list containing: 'data', a matrix containing sample observations
and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
values for these observations.
}
\description{
Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.
}
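For orientation, the documented return value could be consumed as sketched below; since the helper is internal and unexported it is reached with ':::' here, and all argument values are assumptions.

```r
# Orientation only: xgb.shap.data() is an internal, unexported helper.
# 'bst' is assumed to be a trained xgb.Booster, as in the earlier examples.
ret <- xgboost:::xgb.shap.data(data = agaricus.test$data, model = bst, top_n = 4)
str(ret$data)          # sampled observations and their feature values
str(ret$shap_contrib)  # matching SHAP contribution values
```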

@@ -215,16 +215,16 @@ User may set one or several \code{eval_metric} parameters.
Note that when using a customized metric, only this single metric can be used.
The following is the list of built-in metrics for which Xgboost provides optimized implementation:
\itemize{
\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
\item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
Different threshold (e.g., 0.) could be specified as "error@0."
\item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
\item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
\item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
\item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
}
The following callbacks are automatically created when certain parameters are set:

@@ -1,10 +0,0 @@
model_generator_metadata <- function() {
return (list(
kRounds = 2,
kRows = 1000,
kCols = 4,
kForests = 2,
kMaxDepth = 2,
kClasses = 3
))
}

@@ -5,7 +5,14 @@ library(Matrix)
source('./generate_models_params.R')
set.seed(0)
metadata <- model_generator_metadata()
metadata <- list(
kRounds = 2,
kRows = 1000,
kCols = 4,
kForests = 2,
kMaxDepth = 2,
kClasses = 3
)
X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows,
ncol = metadata$kCols, sparse = TRUE)
w <- runif(metadata$kRows)

@@ -1,10 +1,16 @@
require(xgboost)
require(jsonlite)
source('../generate_models_params.R')
context("Models from previous versions of XGBoost can be loaded")
metadata <- model_generator_metadata()
metadata <- list(
kRounds = 2,
kRows = 1000,
kCols = 4,
kForests = 2,
kMaxDepth = 2,
kClasses = 3
)
run_model_param_check <- function (config) {
testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')

@@ -57,7 +57,7 @@ To answer the question above we will convert *categorical* variables to `numeric`
In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot).
The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.
@@ -66,7 +66,7 @@ data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)
```
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
The first thing we want to do is to have a look to the first few lines of the `data.table`:
@@ -137,8 +137,8 @@ levels(df[,Treatment])
#### Encoding categorical features
Next step, we will transform the categorical data to dummy variables.
Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach.
We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach.
We will use the [dummy contrast coding](https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.
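A minimal sketch of that dummy/one-hot encoding step with a sparse model matrix is shown below; the formula and the binarised target are assumptions for illustration, not necessarily the vignette's exact code.

```r
# Sketch of one-hot / dummy encoding into a sparse matrix; the formula and the
# binarised target are illustrative assumptions.
library(Matrix)
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
output_vector <- df[, Improved] == "Marked"
head(sparse_matrix)
```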
@@ -176,7 +176,7 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
A model which fits too well may [overfit](https://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
> Here you can see the numbers decrease until line 7 and then increase.
>
@@ -304,7 +304,7 @@ Linear model may not be that smart in this scenario.
Special Note: What about Random Forests™?
-----------------------------------------
As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
As you may know, [Random Forests™](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family.
Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).

@@ -24,7 +24,7 @@
author = "K. Bache and M. Lichman",
year = "2013",
title = "{UCI} Machine Learning Repository",
url = "http://archive.ics.uci.edu/ml",
url = "http://archive.ics.uci.edu/ml/",
institution = "University of California, Irvine, School of Information and Computer Sciences"
}

@@ -68,7 +68,7 @@ The version 0.4-2 is on CRAN, and you can install it by:
install.packages("xgboost")
```
Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost)
Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost/)
## Learning