[R] Fix global feature importance and predict with 1 sample. (#7394)

* [R] Fix global feature importance. * Add implementation for tree index. The parameter is not documented in C API since we should work on porting the model slicing to R instead of supporting more use of tree index. * Fix the difference between "gain" and "total_gain". * debug. * Fix prediction.
2021-11-05 10:07:00 +08:00
parent 48aff0eabd
commit c968217ca8
11 changed files with 119 additions and 49 deletions
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  shape <- predts$shape
  ret <- predts$results

+  n_ret <- length(ret)
  n_row <- nrow(newdata)
  if (n_row != shape[1]) {
    stop("Incorrect predict shape.")
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
  arr <- array(data = ret, dim = rev(shape))

  cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
+
+  ## Needed regardless of whether strict shape is being used.
  if (predcontrib) {
    dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
  } else if (predinteraction) {
    dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
+  }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
  }

-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
    }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol =  1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
    }
  }
  return(arr)
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
  } else {
    concatenated <- list()
    output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
      results <- .Call(
        XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
      )
      names(results) <- c("features", "shape", importance_type)
      concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
      ] <- results[importance_type]
      output_names <- results$features
    }
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
  # for multiclass
  imp.Tree <- xgb.importance(model = mbst.Tree)
  expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {