Updates from 1.2.0 cran submission (#6077)
* update for 1.2.0 CRAN submission
* recover CMakeLists
* fix unit test from the SHAP PR
* trigger CI

Parent: 9be969cc7a
Commit: 3912f3de06

.github/workflows/main.yml
@@ -170,7 +170,7 @@ jobs:
 run: |
 cd R-package
 R.exe CMD INSTALL .
-Rscript.exe tests/run_lint.R
+Rscript.exe tests/helper_scripts/run_lint.R
 
 test-with-R:
 runs-on: ${{ matrix.config.os }}

Makefile
@@ -134,14 +134,16 @@ Rpack: clean_all
 sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win
 rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it
-bash R-package/remove_warning_suppression_pragma.sh
+bash xgboost/remove_warning_suppression_pragma.sh
+rm xgboost/remove_warning_suppression_pragma.sh
+rm -rfv xgboost/tests/helper_scripts/
 
 Rbuild: Rpack
 R CMD build --no-build-vignettes xgboost
 rm -rf xgboost
 
 Rcheck: Rbuild
-R CMD check xgboost*.tar.gz
+R CMD check --as-cran xgboost*.tar.gz
 
 -include build/*.d
 -include build/*/*.d

R-package/DESCRIPTION

@@ -2,7 +2,7 @@ Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
 Version: 1.3.0.1
-Date: 2020-02-21
+Date: 2020-08-28
 Authors@R: c(
 person("Tianqi", "Chen", role = c("aut"),
 email = "tianqi.tchen@gmail.com"),

R-package/NAMESPACE

@@ -38,6 +38,7 @@ export(xgb.dump)
 export(xgb.gblinear.history)
 export(xgb.ggplot.deepness)
 export(xgb.ggplot.importance)
+export(xgb.ggplot.shap.summary)
 export(xgb.importance)
 export(xgb.load)
 export(xgb.load.raw)
@@ -46,6 +47,7 @@ export(xgb.plot.deepness)
 export(xgb.plot.importance)
 export(xgb.plot.multi.trees)
 export(xgb.plot.shap)
+export(xgb.plot.shap.summary)
 export(xgb.plot.tree)
 export(xgb.save)
 export(xgb.save.raw)

@@ -349,6 +349,7 @@ NULL
 #' # Save as a stand-alone file (JSON); load it with xgb.load()
 #' xgb.save(bst, 'xgb.model.json')
 #' bst2 <- xgb.load('xgb.model.json')
+#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
 #'
 #' # Save as a raw byte vector; load it with xgb.load.raw()
 #' xgb_bytes <- xgb.save.raw(bst)
@@ -364,6 +365,7 @@ NULL
 #' obj2 <- readRDS('my_object.rds')
 #' # Re-construct xgb.Booster object from the bytes
 #' bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
+#' if (file.exists('my_object.rds')) file.remove('my_object.rds')
 #'
 #' @name a-compatibility-note-for-saveRDS-save
 NULL

@@ -79,7 +79,7 @@
 #'
 #' All observations are used for both training and validation.
 #'
-#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
+#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
 #'
 #' @return
 #' An object of class \code{xgb.cv.synchronous} with the following elements:

@@ -200,9 +200,9 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
 #' @return A \code{ggplot2} object.
 #' @export
 #'
-#' @examples See \code{\link{xgb.plot.shap}}.
+#' @examples # See \code{\link{xgb.plot.shap}}.
 #' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
-#' \code{\url{https://github.com/slundberg/shap}}
+#' \url{https://github.com/slundberg/shap}
 xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
 trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
 # Only ggplot implementation is available.

@@ -130,16 +130,16 @@
 #' Note that when using a customized metric, only this single metric can be used.
 #' The following is the list of built-in metrics for which Xgboost provides optimized implementation:
 #' \itemize{
-#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
-#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
-#' \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
+#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
+#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
+#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
 #' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 #' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
 #' Different threshold (e.g., 0.6) could be specified as "error@0.6"
 #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
-#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
+#' \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
 #' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
-#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
+#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
 #' }
 #'
 #' The following callbacks are automatically created when certain parameters are set:

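Note (not part of this diff): as a minimal usage sketch of the eval_metric doc above, the snippet below trains on the bundled agaricus data and requests the custom-threshold error metric; the data set, parameter values, and number of rounds are illustrative only.

library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
# "error@0.6" uses 0.6 instead of the default 0.5 classification threshold.
params <- list(objective = 'binary:logistic', max_depth = 2, eta = 1,
               eval_metric = 'error@0.6')
bst <- xgb.train(params, dtrain, nrounds = 3, watchlist = list(train = dtrain))
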
@@ -43,6 +43,7 @@ bst2 <- xgb.load('xgb.model')
 # Save as a stand-alone file (JSON); load it with xgb.load()
 xgb.save(bst, 'xgb.model.json')
 bst2 <- xgb.load('xgb.model.json')
+if (file.exists('xgb.model.json')) file.remove('xgb.model.json')
 
 # Save as a raw byte vector; load it with xgb.load.raw()
 xgb_bytes <- xgb.save.raw(bst)
@@ -58,5 +59,6 @@ saveRDS(obj, 'my_object.rds')
 obj2 <- readRDS('my_object.rds')
 # Re-construct xgb.Booster object from the bytes
 bst2 <- xgb.load.raw(obj2$xgb_model_bytes)
+if (file.exists('my_object.rds')) file.remove('my_object.rds')
 
 }

R-package/man/normalize.Rd (new file)
@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{normalize}
\alias{normalize}
\title{Scale feature value to have mean 0, standard deviation 1}
\usage{
normalize(x)
}
\arguments{
\item{x}{Numeric vector}
}
\value{
Numeric vector with mean 0 and sd 1.
}
\description{
This is used to compare multiple features on the same plot.
Internal utility function
}

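Aside (not part of this diff): a plausible sketch of the behaviour this man page documents; the real body lives in R/xgb.ggplot.R and may differ, so the helper is named distinctly.

# Hypothetical stand-in: centre and scale a numeric vector to mean 0, sd 1.
normalize_sketch <- function(x) {
  (x - mean(x, na.rm = TRUE)) / stats::sd(x, na.rm = TRUE)
}
normalize_sketch(c(1, 2, 3, 4))  # returns values with mean 0 and sd 1
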
R-package/man/prepare.ggplot.shap.data.Rd (new file)
@@ -0,0 +1,27 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R
\name{prepare.ggplot.shap.data}
\alias{prepare.ggplot.shap.data}
\title{Combine and melt feature values and SHAP contributions for sample
observations.}
\usage{
prepare.ggplot.shap.data(data_list, normalize = FALSE)
}
\arguments{
\item{data_list}{List containing 'data' and 'shap_contrib' returned by
\code{xgb.shap.data()}.}

\item{normalize}{Whether to standardize feature values to have mean 0 and
standard deviation 1 (useful for comparing multiple features on the same
plot). Default \code{FALSE}.}
}
\value{
A data.table containing the observation ID, the feature name, the
feature value (normalized if specified), and the SHAP contribution value.
}
\description{
Conforms to data format required for ggplot functions.
}
\details{
Internal utility function.
}

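Aside (not part of this diff): a hypothetical helper mirroring the long ("melted") layout documented in \value above, not the package's internal function.

library(data.table)
# data_list is assumed to hold 'data' and 'shap_contrib' matrices with column names.
shap_long_sketch <- function(data_list, normalize = FALSE) {
  X <- as.matrix(data_list$data)
  S <- as.matrix(data_list$shap_contrib)
  if (normalize) X <- apply(X, 2, function(col) (col - mean(col)) / sd(col))
  # Column-major flattening pairs each feature value with its SHAP contribution.
  data.table(id = rep(seq_len(nrow(X)), times = ncol(X)),
             feature = rep(colnames(X), each = nrow(X)),
             feature_value = as.vector(X),
             shap_value = as.vector(S))
}
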
@@ -154,7 +154,7 @@ The cross-validation process is then repeated \code{nrounds} times, with each of
 
 All observations are used for both training and validation.
 
-Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation}
+Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29}
 }
 \examples{
 data(agaricus.train, package='xgboost')

@@ -131,6 +131,7 @@ bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
 xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
 xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot
 
 # multiclass example - plots for each class separately:
 nclass <- 3
@@ -149,6 +150,7 @@ xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4,
 n_col = 2, col = col, pch = 16, pch_NA = 17)
 xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
 n_col = 2, col = col, pch = 16, pch_NA = 17)
+xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot
 
 }
 \references{

R-package/man/xgb.plot.shap.summary.Rd (new file)
@@ -0,0 +1,78 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.shap.R
\name{xgb.ggplot.shap.summary}
\alias{xgb.ggplot.shap.summary}
\alias{xgb.plot.shap.summary}
\title{SHAP contribution dependency summary plot}
\usage{
xgb.ggplot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)

xgb.plot.shap.summary(
data,
shap_contrib = NULL,
features = NULL,
top_n = 10,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL
)
}
\arguments{
\item{data}{data as a \code{matrix} or \code{dgCMatrix}.}

\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above
\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.}

\item{features}{a vector of either column indices or of feature names to plot. When it is NULL,
feature importance is calculated, and \code{top_n} high ranked features are taken.}

\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.}

\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib}
or \code{features} is missing.}

\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.}

\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index,
only SHAP contributions for that specific class are used.
If it is not set, SHAP importances are averaged over all classes.}

\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.}

\item{subsample}{a random fraction of data points to use for plotting. When it is NULL,
it is set so that up to 100K data points are used.}
}
\value{
A \code{ggplot2} object.
}
\description{
Compare SHAP contributions of different features.
}
\details{
A point plot (each point representing one sample from \code{data}) is
produced for each feature, with the points plotted on the SHAP value axis.
Each point (observation) is coloured based on its feature value. The plot
hence allows us to see which features have a negative / positive contribution
on the model prediction, and whether the contribution is different for larger
or smaller values of the feature. We effectively try to replicate the
\code{summary_plot} function from https://github.com/slundberg/shap.
}
\examples{
# See \code{\link{xgb.plot.shap}}.
}
\seealso{
\code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
\url{https://github.com/slundberg/shap}
}

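Aside (not part of this diff): a quick usage sketch mirroring the example this commit adds to xgb.plot.shap.Rd, with a smaller nrounds to keep it fast.

library(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 20,
               eta = 0.1, max_depth = 3, subsample = 0.5,
               objective = 'binary:logistic', nthread = 2, verbose = 0)
# Pre-computed SHAP contributions; the plot falls back to computing them if omitted.
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)
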
R-package/man/xgb.shap.data.Rd (new file)
@@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xgb.plot.shap.R
\name{xgb.shap.data}
\alias{xgb.shap.data}
\title{Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.}
\usage{
xgb.shap.data(
data,
shap_contrib = NULL,
features = NULL,
top_n = 1,
model = NULL,
trees = NULL,
target_class = NULL,
approxcontrib = FALSE,
subsample = NULL,
max_observations = 1e+05
)
}
\value{
A list containing: 'data', a matrix containing sample observations
and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
values for these observations.
}
\description{
Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
Internal utility function.
}

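Aside (not part of this diff): a hedged sketch that assembles the same kind of list by hand with predict(..., predcontrib = TRUE); the real internal helper additionally handles feature selection and subsampling.

library(xgboost)
data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 10,
               objective = 'binary:logistic', nthread = 2, verbose = 0)
contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
# Drop the trailing BIAS column so the contribution matrix lines up with the features.
shap_list <- list(data = agaricus.test$data,
                  shap_contrib = contr[, colnames(contr) != 'BIAS', drop = FALSE])
str(shap_list, max.level = 1)
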
@@ -215,16 +215,16 @@ User may set one or several \code{eval_metric} parameters.
 Note that when using a customized metric, only this single metric can be used.
 The following is the list of built-in metrics for which Xgboost provides optimized implementation:
 \itemize{
-\item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error}
-\item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood}
-\item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss}
+\item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error}
+\item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood}
+\item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html}
 \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
 By default, it uses the 0.5 threshold for predicted values to define negative and positive instances.
 Different threshold (e.g., 0.6) could be specified as "error@0.6"
 \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}.
-\item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
+\item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation.
 \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation.
-\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG}
+\item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG}
 }
 
 The following callbacks are automatically created when certain parameters are set:

@@ -1,10 +0,0 @@
-model_generator_metadata <- function() {
-return (list(
-kRounds = 2,
-kRows = 1000,
-kCols = 4,
-kForests = 2,
-kMaxDepth = 2,
-kClasses = 3
-))
-}

@@ -5,7 +5,14 @@ library(Matrix)
-source('./generate_models_params.R')
 
 set.seed(0)
-metadata <- model_generator_metadata()
+metadata <- list(
+kRounds = 2,
+kRows = 1000,
+kCols = 4,
+kForests = 2,
+kMaxDepth = 2,
+kClasses = 3
+)
 X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows,
 ncol = metadata$kCols, sparse = TRUE)
 w <- runif(metadata$kRows)

@@ -1,10 +1,16 @@
 require(xgboost)
 require(jsonlite)
-source('../generate_models_params.R')
 
 context("Models from previous versions of XGBoost can be loaded")
 
-metadata <- model_generator_metadata()
+metadata <- list(
+kRounds = 2,
+kRows = 1000,
+kCols = 4,
+kForests = 2,
+kMaxDepth = 2,
+kClasses = 3
+)
 
 run_model_param_check <- function (config) {
 testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')

@@ -57,7 +57,7 @@ To answer the question above we will convert *categorical* variables to `numeric
 
 In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features.
 
-The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot).
+The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot).
 
 The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package.

@@ -66,7 +66,7 @@ data(Arthritis)
 df <- data.table(Arthritis, keep.rownames = FALSE)
 ```
 
-> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
+> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`.
 
 The first thing we want to do is to have a look to the first few lines of the `data.table`:

@@ -137,8 +137,8 @@ levels(df[,Treatment])
 #### Encoding categorical features
 
 Next step, we will transform the categorical data to dummy variables.
-Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach.
-We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
+Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach.
+We will use the [dummy contrast coding](https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)).
 
 The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`.

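Aside (not part of this diff): a minimal sketch of the dummy-contrast encoding the vignette describes, on the same Arthritis data; the column choice in the formula is illustrative, the vignette itself builds a larger formula.

library(Matrix)
library(data.table)
library(vcd)  # Arthritis data, as used in this vignette
data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)
# Treatment-contrast (dummy) coding: one 0/1 column per non-reference factor level.
sparse_matrix <- sparse.model.matrix(Improved ~ Treatment + Sex + Age, data = df)[, -1]
sparse_matrix[1:5, ]
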
@@ -176,7 +176,7 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4,
 
 You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better.
 
-A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
+A model which fits too well may [overfit](https://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future).
 
 > Here you can see the numbers decrease until line 7 and then increase.
 >

@@ -304,7 +304,7 @@ Linear model may not be that smart in this scenario.
 Special Note: What about Random Forests™?
 -----------------------------------------
 
-As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family.
+As you may know, [Random Forests™](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family.
 
 Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
 

@@ -24,7 +24,7 @@
 author = "K. Bache and M. Lichman",
 year = "2013",
 title = "{UCI} Machine Learning Repository",
-url = "http://archive.ics.uci.edu/ml",
+url = "http://archive.ics.uci.edu/ml/",
 institution = "University of California, Irvine, School of Information and Computer Sciences"
 }

@@ -68,7 +68,7 @@ The version 0.4-2 is on CRAN, and you can install it by:
 install.packages("xgboost")
 ```
 
-Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost)
+Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost/)
 
 ## Learning