adding feature contributions to R and gblinear (#2295)
* [gblinear] add feature contribution prediction; fix DumpModel bug
* [gbtree] minor changes to PredContrib
* [R] add feature contribution prediction to R
* [R] bump up version; update NEWS
* [gblinear] fix the base_margin issue; fixes #1969
* [R] list of matrices as output of multiclass feature contributions
* [gblinear] make order of DumpModel coefficients consistent: group index changes the fastest
This commit is contained in:
committed by
Yuan (Terry) Tang
parent
e5e721722e
commit
b52db87d5c
@@ -14,18 +14,19 @@ df[,ID := NULL]
|
||||
sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df)
|
||||
label <- df[, ifelse(Improved == "Marked", 1, 0)]
|
||||
|
||||
nrounds <- 12
|
||||
bst.Tree <- xgboost(data = sparse_matrix, label = label, max_depth = 9,
|
||||
eta = 1, nthread = 2, nrounds = 10, verbose = 0,
|
||||
eta = 1, nthread = 2, nrounds = nrounds, verbose = 0,
|
||||
objective = "binary:logistic", booster = "gbtree")
|
||||
|
||||
bst.GLM <- xgboost(data = sparse_matrix, label = label,
|
||||
eta = 1, nthread = 2, nrounds = 10, verbose = 0,
|
||||
eta = 1, nthread = 1, nrounds = nrounds, verbose = 0,
|
||||
objective = "binary:logistic", booster = "gblinear")
|
||||
|
||||
feature.names <- colnames(sparse_matrix)
|
||||
|
||||
test_that("xgb.dump works", {
|
||||
expect_length(xgb.dump(bst.Tree), 172)
|
||||
expect_length(xgb.dump(bst.Tree), 200)
|
||||
expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with_stats = T))
|
||||
expect_true(file.exists('xgb.model.dump'))
|
||||
expect_gt(file.size('xgb.model.dump'), 8000)
|
||||
@@ -33,7 +34,7 @@ test_that("xgb.dump works", {
|
||||
# JSON format
|
||||
dmp <- xgb.dump(bst.Tree, dump_format = "json")
|
||||
expect_length(dmp, 1)
|
||||
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 162)
|
||||
expect_length(grep('nodeid', strsplit(dmp, '\n')[[1]]), 188)
|
||||
})
|
||||
|
||||
test_that("xgb.dump works for gblinear", {
|
||||
@@ -52,13 +53,74 @@ test_that("xgb.dump works for gblinear", {
|
||||
expect_length(grep('\\d', strsplit(dmp, '\n')[[1]]), 11)
|
||||
})
|
||||
|
||||
# Checks leaf-index prediction (predleaf = TRUE) for both boosters:
# it must succeed for the tree model and fail for the linear model.
test_that("predict leafs works", {
|
||||
# no error for gbtree
|
||||
# regexp = NA is testthat's way of asserting that NO error is raised;
# the assignment inside the call keeps pred_leaf for the next check.
expect_error(pred_leaf <- predict(bst.Tree, sparse_matrix, predleaf = TRUE), regexp = NA)
|
||||
# result must have one row per input row and nrounds columns
expect_equal(dim(pred_leaf), c(nrow(sparse_matrix), nrounds))
|
||||
# error for gblinear
|
||||
# no regexp given: any error satisfies this expectation
expect_error(predict(bst.GLM, sparse_matrix, predleaf = TRUE))
|
||||
})
|
||||
|
||||
# Checks predict(..., predcontrib = TRUE) for four models: gbtree and gblinear
# binary classifiers (bst.Tree / bst.GLM) and gbtree and gblinear multiclass
# models trained on iris. In each case the per-row sum of contributions
# must match the margin prediction, and for gblinear the contributions must
# also match coefficient * feature value computed by hand from the model dump.
test_that("predict feature contributions works", {
|
||||
# gbtree binary classifier
|
||||
# regexp = NA asserts that the call raises no error
expect_error(pred_contr <- predict(bst.Tree, sparse_matrix, predcontrib = TRUE), regexp = NA)
|
||||
# one column per feature plus one extra column
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
|
||||
# the extra (last) column is named "BIAS"
expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
|
||||
pred <- predict(bst.Tree, sparse_matrix, outputmargin = TRUE)
|
||||
# contributions (including BIAS) must add up to the margin prediction
expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-6)
|
||||
|
||||
# gblinear binary classifier
|
||||
expect_error(pred_contr <- predict(bst.GLM, sparse_matrix, predcontrib = TRUE), regexp = NA)
|
||||
expect_equal(dim(pred_contr), c(nrow(sparse_matrix), ncol(sparse_matrix) + 1))
|
||||
expect_equal(colnames(pred_contr), c(colnames(sparse_matrix), "BIAS"))
|
||||
pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE)
|
||||
# same additivity check as above, with a slightly looser bound (2e-6)
expect_lt(max(abs(rowSums(pred_contr) - pred)), 2e-6)
|
||||
# manual calculation of linear terms
|
||||
# drops dump lines 1, 2 and 4 and parses the rest as numbers
# NOTE(review): presumably those lines are the non-numeric section headers of
# the gblinear text dump -- confirm against the dump format
coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric
|
||||
coefs <- c(coefs[-1], coefs[1]) # intercept must be the last
|
||||
# append a constant column of 1s for the intercept, then scale column j by coefs[j]
pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*")
|
||||
# third positional argument (2e-6) is passed through as the comparison tolerance
expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), 2e-6)
|
||||
|
||||
# gbtree multiclass
|
||||
# zero-based class labels derived from the Species factor
lb <- as.numeric(iris$Species) - 1
|
||||
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
|
||||
max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
|
||||
objective = "multi:softprob", num_class = 3)
|
||||
# reshape = TRUE so that pred can be indexed by class column below
pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
|
||||
pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
|
||||
# multiclass contributions come back as a list with one element per class
expect_is(pred_contr, "list")
|
||||
expect_length(pred_contr, 3)
|
||||
for (g in seq_along(pred_contr)) {
|
||||
expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
|
||||
# element g must add up to the margin in column g of pred
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
|
||||
}
|
||||
|
||||
# gblinear multiclass (set base_score = 0, which is base margin in multiclass)
|
||||
bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, verbose = 0,
|
||||
booster = "gblinear", eta = 0.1, nthread = 1, nrounds = 10,
|
||||
objective = "multi:softprob", num_class = 3, base_score = 0)
|
||||
pred <- predict(bst, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE)
|
||||
pred_contr <- predict(bst, as.matrix(iris[, -5]), predcontrib = TRUE)
|
||||
expect_length(pred_contr, 3)
|
||||
# drops dump lines 1, 2 and 6 and parses the rest as numbers
# NOTE(review): presumably header lines, with 3 bias values in between -- confirm
coefs_all <- xgb.dump(bst)[-c(1,2,6)] %>% as.numeric
|
||||
for (g in seq_along(pred_contr)) {
|
||||
expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS"))
|
||||
expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), 2e-6)
|
||||
# manual calculation of linear terms
|
||||
# every 3rd value starting at position g: relies on the dump interleaving
# the 3 classes so that the class (group) index varies fastest
coefs <- coefs_all[seq(g, length(coefs_all), by = 3)]
|
||||
coefs <- c(coefs[-1], coefs[1]) # intercept needs to be the last
|
||||
# constant column of 1s for the intercept; column j is scaled by coefs[j]
pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*")
|
||||
expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), 2e-6)
|
||||
}
|
||||
})
|
||||
|
||||
test_that("xgb-attribute functionality", {
|
||||
val <- "my attribute value"
|
||||
list.val <- list(my_attr=val, a=123, b='ok')
|
||||
list.ch <- list.val[order(names(list.val))]
|
||||
list.ch <- lapply(list.ch, as.character)
|
||||
# note: iter is 0-index in xgb attributes
|
||||
list.default <- list(niter = "9")
|
||||
list.default <- list(niter = as.character(nrounds - 1))
|
||||
list.ch <- c(list.ch, list.default)
|
||||
# proper input:
|
||||
expect_error(xgb.attr(bst.Tree, NULL))
|
||||
@@ -85,7 +147,9 @@ test_that("xgb-attribute functionality", {
|
||||
expect_null(xgb.attributes(bst))
|
||||
})
|
||||
|
||||
if (grepl('Windows', Sys.info()[['sysname']]) || grepl('Linux', Sys.info()[['sysname']]) || grepl('Darwin', Sys.info()[['sysname']])) {
|
||||
if (grepl('Windows', Sys.info()[['sysname']]) ||
|
||||
grepl('Linux', Sys.info()[['sysname']]) ||
|
||||
grepl('Darwin', Sys.info()[['sysname']])) {
|
||||
test_that("xgb-attribute numeric precision", {
|
||||
# check that lossless conversion works with 17 digits
|
||||
# numeric -> character -> numeric
|
||||
@@ -121,7 +185,7 @@ test_that("xgb.model.dt.tree works with and without feature names", {
|
||||
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
|
||||
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
|
||||
expect_equal(names.dt.trees, names(dt.tree))
|
||||
expect_equal(dim(dt.tree), c(162, 10))
|
||||
expect_equal(dim(dt.tree), c(188, 10))
|
||||
expect_output(str(dt.tree), 'Feature.*\\"Age\\"')
|
||||
|
||||
dt.tree.0 <- xgb.model.dt.tree(model = bst.Tree)
|
||||
|
||||
Reference in New Issue
Block a user