Fix R dart prediction. (#5204)

* Fix R dart prediction and add test.
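To make the intended behaviour concrete, here is a minimal R sketch mirroring the new test added to test_basic.R below (the data, seed, and parameter values are illustrative only and are not part of the change): for a dart booster, an ordinary predict() call should use all trees and be deterministic, whether ntreelimit is left at 0 ("use all trees") or set to nrounds, whereas predict(..., training = TRUE) takes the training-time path, where dart's dropout applies, so its values can differ.

library(xgboost)
set.seed(1994)
x <- cbind(x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100))
y <- x[, "x1"] + x[, "x2"]^2 + rnorm(100)

bst <- xgboost(data = x, label = y, booster = "dart",
               max_depth = 2, eta = 1, rate_drop = 0.5, one_drop = TRUE,
               nrounds = 16, objective = "reg:squarederror", verbose = 0)

# Inference-time predictions are deterministic and use every tree.
p_all  <- predict(bst, x, ntreelimit = 0)   # 0 means "no tree limit"
p_full <- predict(bst, x, ntreelimit = 16)
stopifnot(isTRUE(all.equal(p_all, p_full)))

# With training = TRUE, the dart dropout path is taken, so values may differ.
p_drop <- predict(bst, x, training = TRUE)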
Jiaming Yuan, 2020-01-16 12:11:04 +08:00, committed by GitHub
parent 808f61081b
commit 5199b86126
5 changed files with 80 additions and 35 deletions

R-package/R/xgboost.R

@@ -5,8 +5,8 @@
#' @export
xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                    params = list(), nrounds,
                    verbose = 1, print_every_n = 1L,
                    early_stopping_rounds = NULL, maximize = NULL,
                    save_period = NULL, save_name = "xgboost.model",
                    xgb_model = NULL, callbacks = list(), ...) {
@@ -18,16 +18,16 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
                   early_stopping_rounds = early_stopping_rounds, maximize = maximize,
                   save_period = save_period, save_name = save_name,
                   xgb_model = xgb_model, callbacks = callbacks, ...)
-  return(bst)
+  return (bst)
}

#' Training part from Mushroom Data Set
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#'  \item \code{label} the label for each record
#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
@@ -35,16 +35,16 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL,
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.train
#' @usage data(agaricus.train)
#' @format A list containing a label vector, and a dgCMatrix object with 6513
#' rows and 127 variables
NULL
@@ -52,9 +52,9 @@ NULL
#'
#' This data set is originally from the Mushroom data set,
#' UCI Machine Learning Repository.
#'
#' This data set includes the following fields:
#'
#' \itemize{
#'  \item \code{label} the label for each record
#'  \item \code{data} a sparse Matrix of \code{dgCMatrix} class, with 126 columns.
@@ -62,16 +62,16 @@ NULL
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
#' School of Information and Computer Science.
#'
#' @docType data
#' @keywords datasets
#' @name agaricus.test
#' @usage data(agaricus.test)
#' @format A list containing a label vector, and a dgCMatrix object with 1611
#' rows and 126 variables
NULL
@@ -107,7 +107,7 @@ NULL
#' @importFrom graphics par
#' @importFrom graphics title
#' @importFrom grDevices rgb
#'
#' @import methods
#' @useDynLib xgboost, .registration = TRUE
NULL

R-package/src/xgboost_R.cc

@@ -313,7 +313,7 @@ SEXP XGBoosterPredict_R(SEXP handle, SEXP dmat, SEXP option_mask,
                              R_ExternalPtrAddr(dmat),
                              asInteger(option_mask),
                              asInteger(ntree_limit),
-                             0,
+                             asInteger(training),
                              &olen, &res));
  ret = PROTECT(allocVector(REALSXP, olen));
  for (size_t i = 0; i < olen; ++i) {
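The one-line change above is the heart of the R fix: the wrapper used to pass a hard-coded 0 as the training argument of XGBoosterPredict, so the native predictor always behaved as if it were doing inference regardless of what the caller asked for; it now forwards the value supplied from R via asInteger(training). The R-level predict() method already exposes a training argument (it is exercised by the new dart test below), and with this change its value actually reaches the native predictor, where, for a dart booster, it decides whether the training-time dropout path is taken.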

R-package/tests/testthat/test_basic.R

@@ -27,7 +27,7 @@ test_that("train and predict binary classification", {
  pred <- predict(bst, test$data)
  expect_length(pred, 1611)

  pred1 <- predict(bst, train$data, ntreelimit = 1)
  expect_length(pred1, 6513)
  err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label)
@@ -35,6 +35,54 @@ test_that("train and predict binary classification", {
  expect_lt(abs(err_pred1 - err_log), 10e-6)
})

test_that("dart prediction works", {
nrounds = 32
set.seed(1994)
d <- cbind(
x1 = rnorm(100),
x2 = rnorm(100),
x3 = rnorm(100))
y <- d[,"x1"] + d[,"x2"]^2 +
ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) +
rnorm(100)
set.seed(1994)
booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart",
rate_drop = 0.5, one_drop = TRUE,
eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror")
pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0)
pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds)
expect_true(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE)
expect_false(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
set.seed(1994)
dtrain <- xgb.DMatrix(data=d, info = list(label=y))
booster_by_train <- xgb.train( params = list(
booster = "dart",
max_depth = 2,
eta = 1,
rate_drop = 0.5,
one_drop = TRUE,
nthread = 1,
tree_method= "exact",
verbosity = 3,
objective = "reg:squarederror"
),
data = dtrain,
nrounds = nrounds
)
pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0)
pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds)
pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE)
expect_true(all(matrix(pred_by_train_0, byrow=TRUE) == matrix(pred_by_xgboost_0, byrow=TRUE)))
expect_true(all(matrix(pred_by_train_1, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE)))
expect_true(all(matrix(pred_by_train_2, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE)))
})
test_that("train and predict softprob", { test_that("train and predict softprob", {
lb <- as.numeric(iris$Species) - 1 lb <- as.numeric(iris$Species) - 1
set.seed(11) set.seed(11)
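Taken together, the new "dart prediction works" test asserts three things for the same dart model trained through both xgboost() and xgb.train(): inference predictions with ntreelimit = 0 match those with ntreelimit = nrounds (all trees are used and no dropout is applied), predictions with training = TRUE differ from the inference predictions (dropout is applied), and the two training interfaces produce identical predictions under the same seed and parameters.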
@@ -74,7 +122,7 @@ test_that("train and predict softmax", {
  expect_false(is.null(bst$evaluation_log))
  expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
  expect_equal(bst$niter * 3, xgb.ntree(bst))

  pred <- predict(bst, as.matrix(iris[, -5]))
  expect_length(pred, nrow(iris))
  err <- sum(pred != lb)/length(lb)
@@ -90,12 +138,12 @@ test_that("train and predict RF", {
                 num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
  expect_equal(bst$niter, 1)
  expect_equal(xgb.ntree(bst), 20)

  pred <- predict(bst, train$data)
  pred_err <- sum((pred > 0.5) != lb)/length(lb)
  expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
  #expect_lt(pred_err, 0.03)

  pred <- predict(bst, train$data, ntreelimit = 20)
  pred_err_20 <- sum((pred > 0.5) != lb)/length(lb)
  expect_equal(pred_err_20, pred_err)
@@ -211,7 +259,7 @@ test_that("train and predict with non-strict classes", {
  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
  pr0 <- predict(bst, train_dense)

  # dense matrix-like input of non-matrix class
  class(train_dense) <- 'shmatrix'
  expect_true(is.matrix(train_dense))
@@ -221,7 +269,7 @@ test_that("train and predict with non-strict classes", {
               , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)

  # dense matrix-like input of non-matrix class with some inheritance
  class(train_dense) <- c('pphmatrix','shmatrix')
  expect_true(is.matrix(train_dense))
@@ -231,7 +279,7 @@ test_that("train and predict with non-strict classes", {
               , regexp = NA)
  expect_error(pr <- predict(bst, train_dense), regexp = NA)
  expect_equal(pr0, pr)

  # when someone inhertis from xgb.Booster, it should still be possible to use it as xgb.Booster
  class(bst) <- c('super.Booster', 'xgb.Booster')
  expect_error(pr <- predict(bst, train_dense), regexp = NA)

R-package/tests/testthat/test_helpers.R

@@ -157,7 +157,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
    params = c(
      list(
        booster = booster,
-       objective = "reg:linear",
+       objective = "reg:squarederror",
        eval_metric = "rmse"),
      if (booster == "dart")
        list(rate_drop = .01, one_drop = T)),
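For reference, reg:linear is the deprecated alias of reg:squarederror; switching the test's objective avoids the deprecation warning without changing what is being fitted.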

src/gbm/gbtree.cc

@@ -435,9 +435,9 @@ class Dart : public GBTree {
      std::fill(out_preds.begin(), out_preds.end(),
                model_.learner_model_param_->base_score);
    }
-   PredLoopSpecalize(p_fmat, &out_preds, num_group, 0,
-                     ntree_limit, training);
+   const int nthread = omp_get_max_threads();
+   InitThreadTemp(nthread);
+   PredLoopSpecalize(p_fmat, &out_preds, num_group, 0, ntree_limit);
  }

  void PredictInstance(const SparsePage::Inst &inst,
@@ -489,11 +489,8 @@ class Dart : public GBTree {
                         std::vector<bst_float>* out_preds,
                         int num_group,
                         unsigned tree_begin,
-                        unsigned tree_end,
-                        bool training) {
-   const int nthread = omp_get_max_threads();
+                        unsigned tree_end) {
    CHECK_EQ(num_group, model_.learner_model_param_->num_output_group);
-   InitThreadTemp(nthread);
    std::vector<bst_float>& preds = *out_preds;
    CHECK_EQ(model_.param.size_leaf_vector, 0)
        << "size_leaf_vector is enforced to 0 so far";