R-callbacks tests + other tests brushup

Vadim Khotilovich 2016-06-09 02:53:37 -05:00
parent 2e0ffcc303
commit f34f9fb9f7
6 changed files with 372 additions and 61 deletions

View File

@@ -8,29 +8,152 @@ train <- agaricus.train
test <- agaricus.test
set.seed(1994)
test_that("train and predict", {
test_that("train and predict binary classification", {
nround = 2
expect_output(
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
eta = 1, nthread = 2, nround = nround, objective = "binary:logistic")
, "train-error")
expect_equal(class(bst), "xgb.Booster")
expect_true(!is.null(bst$evaluation_log))
expect_equal(nrow(bst$evaluation_log), nround)
expect_lt(bst$evaluation_log[, min(train_error)], 0.03)
expect_equal(bst$nboost, bst$ntree)
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
})
test_that("early stopping", {
res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
expect_true(nrow(res) < 20)
test_that("train and predict softprob", {
expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
max.depth = 3, eta = 0.5, nthread = 2, nround = 5,
objective = "multi:softprob", num_class=3)
, "train-merror")
expect_true(!is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$nboost * 3, bst$ntree)
pred <- predict(bst, as.matrix(iris[, -5]))
expect_equal(length(pred), nrow(iris) * 3)
})
test_that("train and predict softmax", {
expect_output(
bst <- xgboost(data = as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1,
max.depth = 3, eta = 0.15, nthread = 2, nround = 25,
objective = "multi:softmax", num_class=3)
, "train-merror")
expect_true(!is.null(bst$evaluation_log))
expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
expect_equal(bst$nboost * 3, bst$ntree)
pred <- predict(bst, as.matrix(iris[, -5]))
expect_equal(length(pred), nrow(iris))
})
test_that("early stopping xgb.train works", {
expect_output(
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE)
, "Stopping. Best iteration")
expect_true(!is.null(bst$best_iteration))
expect_lt(bst$best_iteration, 19)
expect_equal(bst$nboost, bst$ntree)
expect_equal(bst$best_iteration, bst$best_ntreelimit)
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
})
test_that("save_period", {
bst <- xgboost(data = train$data, label = train$label, max.depth = 2,
eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
save_period = 10, save_name = "xgb.model")
pred <- predict(bst, test$data)
expect_equal(length(pred), 1611)
test_that("training continuation works", {
dtrain <- xgb.DMatrix(train$data, label = train$label)
watchlist = list(train=dtrain)
param <- list(objective = "binary:logistic", max.depth = 2, eta = 1, nthread = 2)
# for the reference, use 4 iterations at once:
set.seed(11)
bst <- xgb.train(param, dtrain, nrounds = 4, watchlist)
# first two iterations:
set.seed(11)
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist)
# continue for two more:
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1)
expect_equal(bst$raw, bst2$raw)
expect_true(!is.null(bst2$evaluation_log))
expect_equal(dim(bst2$evaluation_log), c(4, 2))
expect_equal(bst2$evaluation_log, bst$evaluation_log)
# test continuing from raw model data
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = bst1$raw)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file
xgb.save(bst1, "xgboost.model")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, xgb_model = "xgboost.model")
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
})
test_that("xgb.cv works", {
cv <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 1., nthread = 2, nround = 2, objective = "binary:logistic",
verbose=TRUE)
expect_is(cv, 'xgb.cv.synchronous')
expect_true(!is.null(cv$evaluation_log))
expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
expect_lt(cv$evaluation_log[, min(test_error_std)], 0.004)
expect_equal(cv$nboost, cv$ntree)
expect_true(!is.null(cv$folds) && is.list(cv$folds))
expect_length(cv$folds, 5)
expect_true(!is.null(cv$params) && is.list(cv$params))
expect_true(!is.null(cv$callbacks))
expect_true(!is.null(cv$call))
})
test_that("early stopping xgb.cv works", {
expect_output(
cv <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.5, nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE, verbose=TRUE)
, "Stopping. Best iteration")
expect_true(!is.null(cv$best_iteration))
expect_lt(cv$best_iteration, 19)
expect_equal(cv$nboost, cv$ntree)
expect_equal(cv$best_iteration, cv$best_ntreelimit)
})
test_that("prediction in xgb.cv works", {
nround = 2
cv <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
eta = 0.5, nthread = 2, nround = nround, objective = "binary:logistic",
verbose=TRUE, prediction = T)
expect_true(!is.null(cv$evaluation_log))
expect_true(!is.null(cv$pred))
expect_length(cv$pred, nrow(train$data))
err_pred <- sum((cv$pred > 0.5) != train$label)/length(train$label)
err_log <- cv$evaluation_log[nround, test_error_mean]
expect_lt(abs(err_pred - err_log), 10e-6)
})
test_that("prediction in early-stopping xgb.cv works", {
set.seed(123)
# add some label noise
lb <- train$label
lb[sample(length(train$label), 2000)] <- 0
expect_output(
cv <- xgb.cv(data = train$data, label = lb, max.depth = 3, nfold = 5,
eta = 1., nthread = 2, nround = 20, objective = "binary:logistic",
early.stop.round = 3, maximize = FALSE, verbose=TRUE, prediction = TRUE)
, "Stopping. Best iteration")
expect_true(!is.null(cv$best_iteration))
expect_lt(cv$best_iteration, 19)
expect_true(!is.null(cv$evaluation_log))
expect_true(!is.null(cv$pred))
expect_length(cv$pred, nrow(train$data))
err_pred <- sum((cv$pred > 0.5) != lb)/length(lb)
err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean]
expect_lt(abs(err_pred - err_log), 10e-6)
})
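The early-stopping tests above record best_iteration and best_ntreelimit on the returned booster. A minimal usage sketch, not part of this commit, assuming the agaricus train/test objects loaded at the top of this file and predict()'s ntreelimit argument:
# Hedged sketch: score the test data at the iteration chosen by early stopping.
bst_es <- xgboost(data = train$data, label = train$label, max.depth = 2,
                  eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
                  early.stop.round = 3, maximize = FALSE, verbose = 0)
pred_best <- predict(bst_es, test$data, ntreelimit = bst_es$best_ntreelimit)
stopifnot(length(pred_best) == nrow(test$data))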

View File

@@ -0,0 +1,174 @@
# More specific testing of callbacks
require(xgboost)
require(data.table)
context("callbacks")
data(agaricus.train, package='xgboost')
train <- agaricus.train
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
watchlist = list(train=dtrain)
param <- list(objective = "binary:logistic", max.depth = 2, nthread = 2)
test_that("cb.print_evaluation works as expected", {
bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
bst_evaluation_err <- NULL
begin_iteration <- 1
end_iteration <- 7
f0 <- cb.print_evaluation(period=0)
f1 <- cb.print_evaluation(period=1)
f5 <- cb.print_evaluation(period=5)
expect_true(!is.null(attr(f1, 'call')))
expect_equal(attr(f1, 'name'), 'cb.print_evaluation')
iteration <- 1
expect_silent(f0())
expect_output(f1(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
expect_output(f5(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
expect_null(f1())
iteration <- 2
expect_output(f1(), "\\[2\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
expect_silent(f5())
iteration <- 7
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000")
bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000")
})
test_that("cb.log_evaluation works as expected", {
bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8)
bst_evaluation_err <- NULL
evaluation_log <- list()
f <- cb.log_evaluation()
expect_true(!is.null(attr(f, 'call')))
expect_equal(attr(f, 'name'), 'cb.log_evaluation')
iteration <- 1
expect_silent(f())
expect_equal(evaluation_log,
list(c(iter=1, bst_evaluation)))
iteration <- 2
expect_silent(f())
expect_equal(evaluation_log,
list(c(iter=1, bst_evaluation), c(iter=2, bst_evaluation)))
expect_silent(f(finalize = TRUE))
expect_equal(evaluation_log,
data.table(iter=1:2, train_auc=c(0.9,0.9), test_auc=c(0.8,0.8)))
bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2)
evaluation_log <- list()
f <- cb.log_evaluation()
iteration <- 1
expect_silent(f())
expect_equal(evaluation_log,
list(c(iter=1, c(bst_evaluation, bst_evaluation_err))))
iteration <- 2
expect_silent(f())
expect_equal(evaluation_log,
list(c(iter=1, c(bst_evaluation, bst_evaluation_err)),
c(iter=2, c(bst_evaluation, bst_evaluation_err))))
expect_silent(f(finalize = TRUE))
expect_equal(evaluation_log,
data.table(iter=1:2,
train_auc_mean=c(0.9,0.9), train_auc_std=c(0.1,0.1),
test_auc_mean=c(0.8,0.8), test_auc_std=c(0.2,0.2)))
})
test_that("cb.reset_parameters works as expected", {
# fixed eta
set.seed(111)
bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9)
expect_true(!is.null(bst0$evaluation_log))
expect_true(!is.null(bst0$evaluation_log$train_error))
# same eta but re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.9, 0.9))
bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset_parameters(my_par)))
expect_true(!is.null(bst1$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
bst1$evaluation_log$train_error)
# same eta but re-set via a function in the callback
set.seed(111)
my_par <- list(eta = function(itr, itr_end) 0.9)
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset_parameters(my_par)))
expect_true(!is.null(bst2$evaluation_log$train_error))
expect_equal(bst0$evaluation_log$train_error,
bst2$evaluation_log$train_error)
# different eta re-set as a vector parameter in the callback
set.seed(111)
my_par <- list(eta = c(0.6, 0.5))
bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset_parameters(my_par)))
expect_true(!is.null(bst3$evaluation_log$train_error))
expect_true(!all(bst0$evaluation_log$train_error == bst3$evaluation_log$train_error))
# resetting multiple parameters at the same time runs with no error
my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), alpha = c(0.01, 0.02))
expect_error(
bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset_parameters(my_par)))
, NA)
# expect no learning with 0 learning rate
my_par <- list(eta = c(0., 0.))
bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist,
callbacks = list(cb.reset_parameters(my_par)))
expect_true(!is.null(bstX$evaluation_log$train_error))
er <- unique(bstX$evaluation_log$train_error)
expect_length(er, 1)
expect_gt(er, 0.4)
})
# Note: early stopping is tested in test_basic
test_that("cb.save_model works as expected", {
files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model')
for (f in files) if (file.exists(f)) file.remove(f)
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
save_period = 1, save_name = "xgboost_%02d.model")
expect_true(file.exists('xgboost_01.model'))
expect_true(file.exists('xgboost_02.model'))
b1 <- xgb.load('xgboost_01.model')
expect_length(grep('^booster', xgb.dump(b1)), 1)
b2 <- xgb.load('xgboost_02.model')
expect_equal(bst$raw, b2$raw)
# save_period = 0 saves the last iteration's model
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, save_period = 0)
expect_true(file.exists('xgboost.model'))
b2 <- xgb.load('xgboost.model')
expect_equal(bst$raw, b2$raw)
for (f in files) if (file.exists(f)) file.remove(f)
})
test_that("can store evaluation_log without printing", {
expect_silent(
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1,
verbose = 0, callbacks = list(cb.log_evaluation()))
)
expect_true(!is.null(bst$evaluation_log))
expect_true(!is.null(bst$evaluation_log$train_error))
expect_lt(bst$evaluation_log[2, train_error], 0.03)
})
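The callback tests in this file drive each cb.* closure directly by assigning variables such as iteration, end_iteration and bst_evaluation in the test's own frame. A minimal sketch of that calling convention, not taken from the package: cb.count is a hypothetical name, and the env/finalize details are assumptions based on how the tests above call the callbacks.
# Hedged sketch: a callback closure that reads shared state from its calling frame.
cb.count <- function() {
  callback <- function(env = parent.frame(), finalize = FALSE) {
    if (finalize) return(invisible(NULL))
    cat("iteration", env$iteration, "of", env$end_iteration, "\n")
  }
  attr(callback, 'call') <- match.call()
  attr(callback, 'name') <- 'cb.count'
  callback
}
# Driven the same way the tests above drive cb.print_evaluation:
iteration <- 1; end_iteration <- 7
f <- cb.count()
f()  # prints "iteration 1 of 7"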

View File

@@ -2,35 +2,50 @@ context('Test models with custom objective')
require(xgboost)
set.seed(1994)
data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
watchlist <- list(eval = dtest, train = dtrain)
test_that("custom objective works", {
watchlist <- list(eval = dtest, train = dtrain)
num_round <- 2
logregobj <- function(preds, dtrain) {
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
return(list(metric = "error", value = err))
}
}
param <- list(max.depth=2, eta=1, nthread = 2, silent=1,
param <- list(max.depth=2, eta=1, nthread = 2,
objective=logregobj, eval_metric=evalerror)
num_round <- 2
test_that("custom objective works", {
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)
expect_true(!is.null(bst$evaluation_log))
expect_true(!is.null(bst$evaluation_log$eval_error))
expect_lt(bst$evaluation_log[num_round, eval_error], 0.03)
})
test_that("custom objective in CV works", {
cv <- xgb.cv(param, dtrain, num_round, nfold=10, verbose=FALSE)
expect_true(!is.null(cv$evaluation_log))
expect_equal(dim(cv$evaluation_log), c(2, 5))
expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03)
})
test_that("custom objective using DMatrix attr works", {
attr(dtrain, 'label') <- getinfo(dtrain, 'label')
logregobjattr <- function(preds, dtrain) {
@@ -40,8 +55,7 @@ test_that("custom objective works", {
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
param <- list(max.depth=2, eta=1, nthread = 2, silent = 1,
objective = logregobjattr, eval_metric = evalerror)
param$objective = logregobjattr
bst <- xgb.train(param, dtrain, num_round, watchlist)
expect_equal(class(bst), "xgb.Booster")
expect_equal(length(bst$raw), 1064)

View File

@ -7,25 +7,27 @@ require(vcd)
set.seed(1982)
data(Arthritis)
data(agaricus.train, package='xgboost')
df <- data.table(Arthritis, keep.rownames = F)
df[,AgeDiscret := as.factor(round(Age / 10,0))]
df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]
df[,ID := NULL]
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y]
bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9,
label <- df[, ifelse(Improved == "Marked", 1, 0)]
bst.Tree <- xgboost(data = sparse_matrix, label = label, max.depth = 9,
eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gbtree")
bst.GLM <- xgboost(data = sparse_matrix, label = output_vector,
bst.GLM <- xgboost(data = sparse_matrix, label = label,
eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear")
feature.names <- colnames(agaricus.train$data)
feature.names <- colnames(sparse_matrix)
test_that("xgb.dump works", {
capture.output(print(xgb.dump(bst.Tree)))
capture.output(print(xgb.dump(bst.GLM)))
expect_length(xgb.dump(bst.Tree), 172)
expect_length(xgb.dump(bst.GLM), 14)
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with.stats = T))
expect_true(file.exists('xgb.model.dump'))
expect_gt(file.size('xgb.model.dump'), 8000)
})
test_that("xgb-attribute functionality", {
@@ -61,19 +63,19 @@ test_that("xgb.model.dt.tree works with and without feature names", {
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
expect_equal(dim(dt.tree), c(162, 10))
xgb.model.dt.tree(model = bst.Tree)
expect_output(str(xgb.model.dt.tree(model = bst.Tree)), 'Feature.*\\"3\\"')
})
test_that("xgb.importance works with and without feature names", {
importance.Tree <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.Tree)
importance.Tree <- xgb.importance(feature_names = feature.names, model = bst.Tree)
expect_equal(dim(importance.Tree), c(7, 4))
expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency"))
xgb.importance(model = bst.Tree)
expect_output(str(xgb.importance(model = bst.Tree)), 'Feature.*\\"3\\"')
xgb.plot.importance(importance_matrix = importance.Tree)
})
test_that("xgb.importance works with GLM model", {
importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM)
importance.GLM <- xgb.importance(feature_names = feature.names, model = bst.GLM)
expect_equal(dim(importance.GLM), c(10, 2))
expect_equal(colnames(importance.GLM), c("Feature", "Weight"))
xgb.importance(model = bst.GLM)

View File

@@ -9,7 +9,7 @@ dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
bst <- xgboost(data = dtrain,
max.depth = 2,
max_depth = 2,
eta = 1,
nround = 10,
nthread = 1,
@@ -17,16 +17,14 @@ bst <- xgboost(data = dtrain,
objective = "binary:logistic")
test_that("call is exposed to R", {
model_call <- attr(bst, "call")
expect_is(model_call, "call")
expect_true(!is.null(bst$call))
expect_is(bst$call, "call")
})
test_that("params is exposed to R", {
model_params <- attr(bst, "params")
model_params <- bst$params
expect_is(model_params, "list")
expect_equal(model_params$eta, 1)
expect_equal(model_params$max.depth, 2)
expect_equal(model_params$max_depth, 2)
expect_equal(model_params$objective, "binary:logistic")
})

View File

@@ -5,10 +5,10 @@ set.seed(1994)
test_that("poisson regression works", {
data(mtcars)
bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11],
objective = 'count:poisson', nrounds=5)
bst <- xgboost(data = as.matrix(mtcars[,-11]), label = mtcars[,11],
objective = 'count:poisson', nrounds=10, verbose=0)
expect_equal(class(bst), "xgb.Booster")
pred <- predict(bst,as.matrix(mtcars[, -11]))
pred <- predict(bst, as.matrix(mtcars[, -11]))
expect_equal(length(pred), 32)
expect_less_than(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 2.5)
expect_lt(sqrt(mean( (pred - mtcars[,11])^2 )), 1.2)
})