[R] rename Quality -> Gain (#9938)

This commit is contained in:
david-cortes 2023-12-31 06:01:00 +01:00 committed by GitHub
parent 8b9c98b65b
commit 73713de601
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 22 additions and 22 deletions

View File

@ -28,7 +28,7 @@
#' - `Yes`: ID of the next node when the split condition is met.
#' - `No`: ID of the next node when the split condition is not met.
#' - `Missing`: ID of the next node when the branch value is missing.
#' - `Quality`: either the split gain (change in loss) or the leaf value.
#' - `Gain`: either the split gain (change in loss) or the leaf value.
#' - `Cover`: metric related to the number of observations either seen by a split
#' or collected by a leaf during training.
#'
@ -122,7 +122,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
# parse branch lines
branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),",
"gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
td[
isLeaf == FALSE,
(branch_cols) := {
@ -132,7 +132,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree)
if (length(xtr) == 0) {
as.data.table(
list(Feature = "NA", Split = "NA", Yes = "NA", No = "NA", Missing = "NA", Quality = "NA", Cover = "NA")
list(Feature = "NA", Split = "NA", Yes = "NA", No = "NA", Missing = "NA", Gain = "NA", Cover = "NA")
)
} else {
as.data.table(xtr)
@ -152,7 +152,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
# parse leaf lines
leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")")
leaf_cols <- c("Feature", "Quality", "Cover")
leaf_cols <- c("Feature", "Gain", "Cover")
td[
isLeaf == TRUE,
(leaf_cols) := {
@ -167,7 +167,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL,
]
# convert some columns to numeric
numeric_cols <- c("Split", "Quality", "Cover")
numeric_cols <- c("Split", "Gain", "Cover")
td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols]
if (use_int_id) {
int_cols <- c("Yes", "No", "Missing")

View File

@ -92,7 +92,7 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d
stop("Model tree columns are not as expected!\n",
" Note that this function works only for tree models.")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID")
dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Gain)], by = "ID")
setkeyv(dt_depths, c("Tree", "ID"))
# count by depth levels, and also calculate average cover at a depth
dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth]
@ -157,6 +157,6 @@ get.leaf.depth <- function(dt_tree) {
# They are mainly column names inferred by data.table...
globalVariables(
c(
".N", "N", "Depth", "Quality", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
".N", "N", "Depth", "Gain", "Cover", "Tree", "ID", "Yes", "No", "Feature", "Leaf", "Weight"
)
)

View File

@ -95,13 +95,13 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5,
data.table::set(tree.matrix, j = nm, value = sub("^\\d+-", "", tree.matrix[[nm]]))
nodes.dt <- tree.matrix[
, .(Quality = sum(Quality))
, .(Gain = sum(Gain))
, by = .(abs.node.position, Feature)
][, .(Text = paste0(
paste0(
Feature[seq_len(min(length(Feature), features_keep))],
" (",
format(Quality[seq_len(min(length(Quality), features_keep))], digits = 5),
format(Gain[seq_len(min(length(Gain), features_keep))], digits = 5),
")"
),
collapse = "\n"

View File

@ -132,7 +132,7 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees)
dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)]
dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Gain)]
if (show_node_id)
dt[, label := paste0(ID, ": ", label)]
dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)]
@ -199,4 +199,4 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot
# Avoid error messages during CRAN check.
# The reason is that these variables are never declared:
# they are mainly column names inferred by data.table.
globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))
globalVariables(c("Feature", "ID", "Cover", "Gain", "Split", "Yes", "No", "Missing", ".", "shape", "filledcolor", "label"))

View File

@ -46,7 +46,7 @@ for a leaf node, it simply labels it as \code{"Leaf"}.
\item \code{Yes}: ID of the next node when the split condition is met.
\item \code{No}: ID of the next node when the split condition is not met.
\item \code{Missing}: ID of the next node when the branch value is missing.
\item \code{Quality}: either the split gain (change in loss) or the leaf value.
\item \code{Gain}: either the split gain (change in loss) or the leaf value.
\item \code{Cover}: metric related to the number of observations either seen by a split
or collected by a leaf during training.
}

View File

@ -275,7 +275,7 @@ test_that("xgb.Booster serializing as R object works", {
test_that("xgb.model.dt.tree works with and without feature names", {
.skip_if_vcd_not_available()
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover")
names.dt.trees <- c("Tree", "Node", "ID", "Feature", "Split", "Yes", "No", "Missing", "Gain", "Cover")
dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree)
expect_equal(names.dt.trees, names(dt.tree))
if (!flag_32bit)
@ -341,7 +341,7 @@ test_that("xgb.importance works with and without feature names", {
trees = trees
)[
Feature != "Leaf", .(
Gain = sum(Quality),
Gain = sum(Gain),
Cover = sum(Cover),
Frequency = .N
),

View File

@ -53,9 +53,9 @@ test_that("updating the model works", {
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2r$evaluation_log)
if (!win32_flag) {
expect_equal(tr2[Feature == 'Leaf']$Quality, tr2r[Feature == 'Leaf']$Quality)
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2r[Feature == 'Leaf']$Gain)
}
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2r[Feature != 'Leaf']$Quality)), 100)
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Gain - tr2r[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr2r$Cover) / sum(tr2$Cover), 1.5)
# process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data:
@ -72,8 +72,8 @@ test_that("updating the model works", {
tr2u <- xgb.model.dt.tree(model = bst2u)
# should be the same evaluation but different gains and larger cover
expect_equal(bst2$evaluation_log, bst2u$evaluation_log)
expect_equal(tr2[Feature == 'Leaf']$Quality, tr2u[Feature == 'Leaf']$Quality)
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Quality - tr2u[Feature != 'Leaf']$Quality)), 100)
expect_equal(tr2[Feature == 'Leaf']$Gain, tr2u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr2[Feature != 'Leaf']$Gain - tr2u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr2u$Cover) / sum(tr2$Cover), 1.5)
# the results should be the same as for the model with an extra 'refresh' updater
expect_equal(bst2r$evaluation_log, bst2u$evaluation_log)
@ -87,8 +87,8 @@ test_that("updating the model works", {
tr1ut <- xgb.model.dt.tree(model = bst1ut)
# should be the same evaluations but different gains and smaller cover (test data is smaller)
expect_equal(bst1$evaluation_log, bst1ut$evaluation_log)
expect_equal(tr1[Feature == 'Leaf']$Quality, tr1ut[Feature == 'Leaf']$Quality)
expect_gt(sum(abs(tr1[Feature != 'Leaf']$Quality - tr1ut[Feature != 'Leaf']$Quality)), 100)
expect_equal(tr1[Feature == 'Leaf']$Gain, tr1ut[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr1[Feature != 'Leaf']$Gain - tr1ut[Feature != 'Leaf']$Gain)), 100)
expect_lt(sum(tr1ut$Cover) / sum(tr1$Cover), 0.5)
})
@ -111,7 +111,7 @@ test_that("updating works for multiclass & multitree", {
# should be the same evaluation but different gains and larger cover
expect_equal(bst0$evaluation_log, bst0u$evaluation_log)
expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality)
expect_gt(sum(abs(tr0[Feature != 'Leaf']$Quality - tr0u[Feature != 'Leaf']$Quality)), 100)
expect_equal(tr0[Feature == 'Leaf']$Gain, tr0u[Feature == 'Leaf']$Gain)
expect_gt(sum(abs(tr0[Feature != 'Leaf']$Gain - tr0u[Feature != 'Leaf']$Gain)), 100)
expect_gt(sum(tr0u$Cover) / sum(tr0$Cover), 1.5)
})