diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 000000000..760a28108
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,52 @@
+# This is a basic workflow to help you get started with Actions
+
+name: XGBoost-CI
+
+# Controls when the action will run. Triggers the workflow on push or
+# pull request events for any branch
+on: [push, pull_request]
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  test-with-R:
+    runs-on: ${{ matrix.config.os }}
+
+    name: Test R on OS ${{ matrix.config.os }}, R (${{ matrix.config.r }})
+
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - {os: windows-latest, r: 'release'}
+    env:
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      RSPM: ${{ matrix.config.rspm }}
+
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+
+    - uses: r-lib/actions/setup-r@master
+      with:
+        r-version: ${{ matrix.config.r }}
+
+    - name: Install dependencies
+      shell: Rscript {0}
+      run: |
+        install.packages(c('XML','igraph'))
+        install.packages(c('data.table','magrittr','stringi','ggplot2','DiagrammeR','Ckmeans.1d.dp','vcd','testthat','lintr','knitr','rmarkdown'))
+
+    - name: Config R
+      run: |
+        mkdir build && cd build
+        cmake .. -DCMAKE_CONFIGURATION_TYPES="Release" -DR_LIB=ON
+
+    - name: Build R
+      run: |
+        cmake --build build --target install --config Release
+
+    - name: Test R
+      run: |
+        cd R-package
+        R.exe -q -e "library(testthat); setwd('tests'); source('testthat.R')"
diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index e6f9f04b2..7b55c35d4 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -62,11 +62,11 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) { callback <- function(env = parent.frame()) { if (length(env$bst_evaluation) == 0 || period == 0 || - NVL(env$rank, 0) != 0 ) + NVL(env$rank, 0) != 0) return() i <- env$iteration - if ((i-1) %% period == 0 || + if ((i - 1) %% period == 0 || i == env$begin_iteration || i == env$end_iteration) { stdev <- if (showsd) env$bst_evaluation_err else NULL @@ -115,7 +115,7 @@ cb.evaluation.log <- function() { stop("bst_evaluation must have non-empty names") mnames <<- gsub('-', '_', names(env$bst_evaluation)) - if(!is.null(env$bst_evaluation_err)) + if (!is.null(env$bst_evaluation_err)) mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std')) } @@ -123,12 +123,12 @@ cb.evaluation.log <- function() { env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log))) setnames(env$evaluation_log, c('iter', mnames)) - if(!is.null(env$bst_evaluation_err)) { + if (!is.null(env$bst_evaluation_err)) { # rearrange col order from _mean,_mean,...,_std,_std,... # to be _mean,_std,_mean,_std,...
len <- length(mnames) - means <- mnames[seq_len(len/2)] - stds <- mnames[(len/2 + 1):len] + means <- mnames[seq_len(len / 2)] + stds <- mnames[(len / 2 + 1):len] cnames <- numeric(len) cnames[c(TRUE, FALSE)] <- means cnames[c(FALSE, TRUE)] <- stds @@ -144,7 +144,7 @@ cb.evaluation.log <- function() { return(finalizer(env)) ev <- env$bst_evaluation - if(!is.null(env$bst_evaluation_err)) + if (!is.null(env$bst_evaluation_err)) ev <- c(ev, env$bst_evaluation_err) env$evaluation_log <- c(env$evaluation_log, list(c(iter = env$iteration, ev))) @@ -351,13 +351,13 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, finalizer <- function(env) { if (!is.null(env$bst)) { - attr_best_score = as.numeric(xgb.attr(env$bst$handle, 'best_score')) + attr_best_score <- as.numeric(xgb.attr(env$bst$handle, 'best_score')) if (best_score != attr_best_score) stop("Inconsistent 'best_score' values between the closure state: ", best_score, " and the xgb.attr: ", attr_best_score) - env$bst$best_iteration = best_iteration - env$bst$best_ntreelimit = best_ntreelimit - env$bst$best_score = best_score + env$bst$best_iteration <- best_iteration + env$bst$best_ntreelimit <- best_ntreelimit + env$bst$best_score <- best_score } else { env$basket$best_iteration <- best_iteration env$basket$best_ntreelimit <- best_ntreelimit @@ -372,9 +372,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, return(finalizer(env)) i <- env$iteration - score = env$bst_evaluation[metric_idx] + score <- env$bst_evaluation[metric_idx] - if (( maximize && score > best_score) || + if ((maximize && score > best_score) || (!maximize && score < best_score)) { best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err) @@ -500,7 +500,7 @@ cb.cv.predict <- function(save_models = FALSE) { for (fd in env$bst_folds) { pr <- predict(fd$bst, fd$watchlist[[2]], ntreelimit = ntreelimit, reshape = TRUE) if (is.matrix(pred)) { - pred[fd$index,] <- pr + pred[fd$index, ] <- pr } else { pred[fd$index] <- pr } @@ -613,9 +613,7 @@ cb.gblinear.history <- function(sparse=FALSE) { init <- function(env) { if (!is.null(env$bst)) { # xgb.train: - coef_path <- list() } else if (!is.null(env$bst_folds)) { # xgb.cv: - coef_path <- rep(list(), length(env$bst_folds)) } else stop("Parent frame has neither 'bst' nor 'bst_folds'") } @@ -705,11 +703,11 @@ xgb.gblinear.history <- function(model, class_index = NULL) { if (!is_cv) { # extract num_class & num_feat from the internal model dmp <- xgb.dump(model) - if(length(dmp) < 2 || dmp[2] != "bias:") + if (length(dmp) < 2 || dmp[2] != "bias:") stop("It does not appear to be a gblinear model") - dmp <- dmp[-c(1,2)] + dmp <- dmp[-c(1, 2)] n <- which(dmp == 'weight:') - if(length(n) != 1) + if (length(n) != 1) stop("It does not appear to be a gblinear model") num_class <- n - 1 num_feat <- (length(dmp) - 4) / num_class @@ -732,9 +730,9 @@ xgb.gblinear.history <- function(model, class_index = NULL) { if (!is.null(class_index) && num_class > 1) { coef_path <- if (is.list(coef_path)) { lapply(coef_path, - function(x) x[, seq(1 + class_index, by=num_class, length.out=num_feat)]) + function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)]) } else { - coef_path <- coef_path[, seq(1 + class_index, by=num_class, length.out=num_feat)] + coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)] } } coef_path diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 0edbf1240..3c4e09183 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -69,23 
+69,23 @@ check.booster.params <- function(params, ...) { if (!is.null(params[['monotone_constraints']]) && typeof(params[['monotone_constraints']]) != "character") { - vec2str = paste(params[['monotone_constraints']], collapse = ',') - vec2str = paste0('(', vec2str, ')') - params[['monotone_constraints']] = vec2str + vec2str <- paste(params[['monotone_constraints']], collapse = ',') + vec2str <- paste0('(', vec2str, ')') + params[['monotone_constraints']] <- vec2str } # interaction constraints parser (convert from list of column indices to string) if (!is.null(params[['interaction_constraints']]) && typeof(params[['interaction_constraints']]) != "character"){ # check input class - if (!identical(class(params[['interaction_constraints']]),'list')) stop('interaction_constraints should be class list') - if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric','integer'))) { + if (!identical(class(params[['interaction_constraints']]), 'list')) stop('interaction_constraints should be class list') + if (!all(unique(sapply(params[['interaction_constraints']], class)) %in% c('numeric', 'integer'))) { stop('interaction_constraints should be a list of numeric/integer vectors') } # recast parameter as string - interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse=','), ']')) - params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse=','), ']') + interaction_constraints <- sapply(params[['interaction_constraints']], function(x) paste0('[', paste(x, collapse = ','), ']')) + params[['interaction_constraints']] <- paste0('[', paste(interaction_constraints, collapse = ','), ']') } return(params) } @@ -167,8 +167,8 @@ xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) { if (is.null(feval)) { msg <- .Call(XGBoosterEvalOneIter_R, booster_handle, as.integer(iter), watchlist, as.list(evnames)) msg <- stri_split_regex(msg, '(\\s+|:|\\s+)')[[1]][-1] - res <- as.numeric(msg[c(FALSE,TRUE)]) # even indices are the values - names(res) <- msg[c(TRUE,FALSE)] # odds are the names + res <- as.numeric(msg[c(FALSE, TRUE)]) # even indices are the values + names(res) <- msg[c(TRUE, FALSE)] # odds are the names } else { res <- sapply(seq_along(watchlist), function(j) { w <- watchlist[[j]] @@ -315,8 +315,8 @@ depr_par_lut <- matrix(c( 'with.stats', 'with_stats', 'numberOfClusters', 'n_clusters', 'features.keep', 'features_keep', - 'plot.height','plot_height', - 'plot.width','plot_width', + 'plot.height', 'plot_height', + 'plot.width', 'plot_width', 'n_first_tree', 'trees', 'dummy', 'DUMMY' ), ncol = 2, byrow = TRUE) @@ -329,20 +329,20 @@ colnames(depr_par_lut) <- c('old', 'new') check.deprecation <- function(..., env = parent.frame()) { pars <- list(...) # exact and partial matches - all_match <- pmatch(names(pars), depr_par_lut[,1]) + all_match <- pmatch(names(pars), depr_par_lut[, 1]) # indices of matched pars' names idx_pars <- which(!is.na(all_match)) if (length(idx_pars) == 0) return() # indices of matched LUT rows idx_lut <- all_match[idx_pars] # which of idx_lut were the exact matches? 
- ex_match <- depr_par_lut[idx_lut,1] %in% names(pars) + ex_match <- depr_par_lut[idx_lut, 1] %in% names(pars) for (i in seq_along(idx_pars)) { pars_par <- names(pars)[idx_pars[i]] old_par <- depr_par_lut[idx_lut[i], 1] new_par <- depr_par_lut[idx_lut[i], 2] if (!ex_match[i]) { - warning("'", pars_par, "' was partially matched to '", old_par,"'") + warning("'", pars_par, "' was partially matched to '", old_par, "'") } .Deprecated(new_par, old = old_par, package = 'xgboost') if (new_par != 'NULL') { diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index dcc446985..a2bde19cf 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,6 +1,7 @@ # Construct an internal xgboost Booster and return a handle to it. # internal utility function -xgb.Booster.handle <- function(params = list(), cachelist = list(), modelfile = NULL) { +xgb.Booster.handle <- function(params = list(), cachelist = list(), + modelfile = NULL) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { stop("cachelist must be a list of xgb.DMatrix objects") @@ -62,8 +63,8 @@ is.null.handle <- function(handle) { return(FALSE) } -# Return a verified to be valid handle out of either xgb.Booster.handle or xgb.Booster -# internal utility function +# Return a verified to be valid handle out of either xgb.Booster.handle or +# xgb.Booster internal utility function xgb.get.handle <- function(object) { if (inherits(object, "xgb.Booster")) { handle <- object$handle @@ -369,8 +370,8 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA matrix(ret, nrow = n_row, byrow = TRUE, dimnames = list(NULL, cnames)) } else { arr <- array(ret, c(n_col1, n_group, n_row), - dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2,3,1)) # [group, row, col] - lapply(seq_len(n_group), function(g) arr[g,,]) + dimnames = list(cnames, NULL, NULL)) %>% aperm(c(2, 3, 1)) # [group, row, col] + lapply(seq_len(n_group), function(g) arr[g, , ]) } } else if (predinteraction) { n_col1 <- ncol(newdata) + 1 @@ -379,11 +380,11 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA ret <- if (n_ret == n_row) { matrix(ret, ncol = 1, dimnames = list(NULL, cnames)) } else if (n_group == 1) { - array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3,1,2)) + array(ret, c(n_col1, n_col1, n_row), dimnames = list(cnames, cnames, NULL)) %>% aperm(c(3, 1, 2)) } else { arr <- array(ret, c(n_col1, n_col1, n_group, n_row), - dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3,4,1,2)) # [group, row, col1, col2] - lapply(seq_len(n_group), function(g) arr[g,,,]) + dimnames = list(cnames, cnames, NULL, NULL)) %>% aperm(c(3, 4, 1, 2)) # [group, row, col1, col2] + lapply(seq_len(n_group), function(g) arr[g, , , ]) } } else if (reshape && npred_per_case > 1) { ret <- matrix(ret, nrow = n_row, byrow = TRUE) @@ -656,7 +657,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) { if (!is.null(x$params)) { cat('params (as set within xgb.train):\n') - cat( ' ', + cat(' ', paste(names(x$params), paste0('"', unlist(x$params), '"'), sep = ' = ', collapse = ', '), '\n', sep = '') @@ -669,9 +670,9 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) 
{ if (length(attrs) > 0) { cat('xgb.attributes:\n') if (verbose) { - cat( paste(paste0(' ',names(attrs)), - paste0('"', unlist(attrs), '"'), - sep = ' = ', collapse = '\n'), '\n', sep = '') + cat(paste(paste0(' ', names(attrs)), + paste0('"', unlist(attrs), '"'), + sep = ' = ', collapse = '\n'), '\n', sep = '') } else { cat(' ', paste(names(attrs), collapse = ', '), '\n', sep = '') } @@ -693,7 +694,7 @@ print.xgb.Booster <- function(x, verbose = FALSE, ...) { #cat('ntree: ', xgb.ntree(x), '\n', sep='') for (n in setdiff(names(x), c('handle', 'raw', 'call', 'params', 'callbacks', - 'evaluation_log','niter','feature_names'))) { + 'evaluation_log', 'niter', 'feature_names'))) { if (is.atomic(x[[n]])) { cat(n, ':', x[[n]], '\n', sep = ' ') } else { diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 4201a8302..9c7ba2684 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -322,7 +322,7 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { for (i in seq_along(ind)) { obj_attr <- attr(object, nms[i]) if (NCOL(obj_attr) > 1) { - attr(ret, nms[i]) <- obj_attr[idxset,] + attr(ret, nms[i]) <- obj_attr[idxset, ] } else { attr(ret, nms[i]) <- obj_attr[idxset] } @@ -360,9 +360,9 @@ slice.xgb.DMatrix <- function(object, idxset, ...) { print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ') infos <- c() - if(length(getinfo(x, 'label')) > 0) infos <- 'label' - if(length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') - if(length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') + if (length(getinfo(x, 'label')) > 0) infos <- 'label' + if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight') + if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin') if (length(infos) == 0) infos <- 'NA' cat(infos) cnames <- colnames(x) diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index 1c659e529..85e328a1f 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -1,10 +1,10 @@ #' Save xgb.DMatrix object to binary file -#' +#' #' Save xgb.DMatrix object to binary file -#' +#' #' @param dmatrix the \code{xgb.DMatrix} object #' @param fname the name of the file to write. -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train @@ -18,7 +18,7 @@ xgb.DMatrix.save <- function(dmatrix, fname) { stop("fname must be character") if (!inherits(dmatrix, "xgb.DMatrix")) stop("dmatrix must be xgb.DMatrix") - + .Call(XGDMatrixSaveBinary_R, dmatrix, fname[1], 0L) return(TRUE) } diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index b8be64922..47e8f232e 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -1,50 +1,50 @@ #' Create new features from a previously learned model -#' +#' #' May improve the learning by adding new features to the training data based on the decision trees from a previously learned model. -#' +#' #' @param model decision tree boosting model learned on the original data #' @param data original data (usually provided as a \code{dgCMatrix} matrix) #' @param ... currently not used -#' +#' #' @return \code{dgCMatrix} matrix including both the original data and the new features. 
#' -#' @details +#' @details #' This is the function inspired from the paragraph 3.1 of the paper: -#' +#' #' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook} -#' -#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +#' +#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, #' Joaquin Quinonero Candela)} -#' +#' #' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 -#' +#' #' \url{https://research.fb.com/publications/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. -#' +#' #' Extract explaining the method: -#' +#' #' "We found that boosted decision trees are a powerful and very #' convenient way to implement non-linear and tuple transformations #' of the kind we just described. We treat each individual #' tree as a categorical feature that takes as value the -#' index of the leaf an instance ends up falling in. We use -#' 1-of-K coding of this type of features. -#' -#' For example, consider the boosted tree model in Figure 1 with 2 subtrees, +#' index of the leaf an instance ends up falling in. We use +#' 1-of-K coding of this type of features. +#' +#' For example, consider the boosted tree model in Figure 1 with 2 subtrees, #' where the first subtree has 3 leafs and the second 2 leafs. If an #' instance ends up in leaf 2 in the first subtree and leaf 1 in #' second subtree, the overall input to the linear classifier will #' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries #' correspond to the leaves of the first subtree and last 2 to #' those of the second subtree. -#' +#' #' [...] -#' +#' #' We can understand boosted decision tree #' based transformation as a supervised feature encoding that #' converts a real-valued vector into a compact binary-valued #' vector. A traversal from root node to a leaf node represents #' a rule on certain features." -#' +#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') @@ -55,33 +55,33 @@ #' nrounds = 4 #' #' bst = xgb.train(params = param, data = dtrain, nrounds = nrounds, nthread = 2) -#' +#' #' # Model accuracy without new features #' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / #' length(agaricus.test$label) -#' +#' #' # Convert previous features to one hot encoding #' new.features.train <- xgb.create.features(model = bst, agaricus.train$data) #' new.features.test <- xgb.create.features(model = bst, agaricus.test$data) -#' +#' #' # learning with new features #' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) #' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) #' watchlist <- list(train = new.dtrain) #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) -#' +#' #' # Model accuracy with new features #' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / #' length(agaricus.test$label) -#' +#' #' # Here the accuracy was already good and is now perfect. #' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", #' accuracy.after, "!\n")) -#' +#' #' @export xgb.create.features <- function(model, data, ...){ check.deprecation(...) 
pred_with_leaf <- predict(model, data, predleaf = TRUE) cols <- lapply(as.data.frame(pred_with_leaf), factor) - cbind(data, sparse.model.matrix( ~ . -1, cols)) + cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint } diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 484581a92..fd74d0f6b 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -2,7 +2,7 @@ #' #' The cross validation function of xgboost #' -#' @param params the list of parameters. The complete list of parameters is +#' @param params the list of parameters. The complete list of parameters is #' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below #' is a shorter summary: #' \itemize{ @@ -137,20 +137,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = # stop("Either 'eval_metric' or 'feval' must be provided for CV") # Check the labels - if ( (inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) || - (!inherits(data, 'xgb.DMatrix') && is.null(label))) { + if ((inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) || + (!inherits(data, 'xgb.DMatrix') && is.null(label))) { stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") } else if (inherits(data, 'xgb.DMatrix')) { if (!is.null(label)) warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") - cv_label = getinfo(data, 'label') + cv_label <- getinfo(data, 'label') } else { - cv_label = label + cv_label <- label } # CV folds - if(!is.null(folds)) { - if(!is.list(folds) || length(folds) < 2) + if (!is.null(folds)) { + if (!is.list(folds) || length(folds) < 2) stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold") nfold <- length(folds) } else { @@ -165,7 +165,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = # verbosity & evaluation printing callback: params <- c(params, list(silent = 1)) - print_every_n <- max( as.integer(print_every_n), 1L) + print_every_n <- max(as.integer(print_every_n), 1L) if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd)) } @@ -196,20 +196,20 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = bst_folds <- lapply(seq_along(folds), function(k) { dtest <- slice(dall, folds[[k]]) # code originally contributed by @RolandASc on stackoverflow - if(is.null(train_folds)) + if (is.null(train_folds)) dtrain <- slice(dall, unlist(folds[-k])) else dtrain <- slice(dall, train_folds[[k]]) handle <- xgb.Booster.handle(params, list(dtrain, dtest)) - list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test=dtest), index = folds[[k]]) + list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) }) rm(dall) # a "basket" to collect some results from callbacks basket <- list() # extract parameters that can affect the relationship b/w #trees and #iterations - num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) - num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) + num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint + num_parallel_tree <- max(as.numeric(NVL(params[['num_parallel_tree']], 1)), 1) # nolint # those are fixed for CV (no training continuation) begin_iteration <- 1 @@ -226,7 +226,7 @@ xgb.cv <- function(params=list(), data, nrounds, 
nfold, label = NULL, missing = }) msg <- simplify2array(msg) bst_evaluation <- rowMeans(msg) - bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) + bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) # nolint for (f in cb$post_iter) f() @@ -285,10 +285,10 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) { } if (!is.null(x$params)) { cat('params (as set within xgb.cv):\n') - cat( ' ', - paste(names(x$params), - paste0('"', unlist(x$params), '"'), - sep = ' = ', collapse = ', '), '\n', sep = '') + cat(' ', + paste(names(x$params), + paste0('"', unlist(x$params), '"'), + sep = ' = ', collapse = ', '), '\n', sep = '') } if (!is.null(x$callbacks) && length(x$callbacks) > 0) { cat('callbacks:\n') diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index ffa3cbc03..0e8ae4f3e 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -1,15 +1,15 @@ #' Dump an xgboost model in text format. -#' +#' #' Dump an xgboost model in text format. -#' +#' #' @param model the model object. -#' @param fname the name of the text file where to save the model text dump. +#' @param fname the name of the text file where to save the model text dump. #' If not provided or set to \code{NULL}, the model is returned as a \code{character} vector. #' @param fmap feature map file representing feature types. -#' Detailed description could be found at +#' Detailed description could be found at #' \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. #' See demo/ for walkthrough example in R, and -#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} +#' \url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} #' for example Format. #' @param with_stats whether to dump some additional statistics about the splits. #' When this option is on, the model dump contains two additional values: @@ -27,18 +27,18 @@ #' data(agaricus.test, package='xgboost') #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, +#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' # save the model in file 'xgb.model.dump' #' dump_path = file.path(tempdir(), 'model.dump') #' xgb.dump(bst, dump_path, with_stats = TRUE) -#' +#' #' # print the model without saving it to a file #' print(xgb.dump(bst, with_stats = TRUE)) -#' +#' #' # print in JSON format: #' cat(xgb.dump(bst, with_stats = TRUE, dump_format='json')) -#' +#' #' @export xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE, dump_format = c("text", "json"), ...) 
{ @@ -50,19 +50,19 @@ xgb.dump <- function(model, fname = NULL, fmap = "", with_stats=FALSE, stop("fname: argument must be a character string (when provided)") if (!(is.null(fmap) || is.character(fmap))) stop("fmap: argument must be a character string (when provided)") - + model <- xgb.Booster.complete(model) model_dump <- .Call(XGBoosterDumpModel_R, model$handle, NVL(fmap, "")[1], as.integer(with_stats), as.character(dump_format)) - if (is.null(fname)) + if (is.null(fname)) model_dump <- stri_replace_all_regex(model_dump, '\t', '') - + if (dump_format == "text") model_dump <- unlist(stri_split_regex(model_dump, '\n')) - + model_dump <- grep('^\\s*$', model_dump, invert = TRUE, value = TRUE) - + if (is.null(fname)) { return(model_dump) } else { diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index eceb5c4fb..3b76e9fac 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -3,9 +3,9 @@ #' @rdname xgb.plot.importance #' @export -xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, +xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measure = NULL, rel_to_first = FALSE, n_clusters = c(1:10), ...) { - + importance_matrix <- xgb.plot.importance(importance_matrix, top_n = top_n, measure = measure, rel_to_first = rel_to_first, plot = FALSE, ...) if (!requireNamespace("ggplot2", quietly = TRUE)) { @@ -14,21 +14,21 @@ xgb.ggplot.importance <- function(importance_matrix = NULL, top_n = NULL, measur if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { stop("Ckmeans.1d.dp package is required", call. = FALSE) } - + clusters <- suppressWarnings( Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix$Importance, n_clusters) ) importance_matrix[, Cluster := as.character(clusters$cluster)] plot <- - ggplot2::ggplot(importance_matrix, + ggplot2::ggplot(importance_matrix, ggplot2::aes(x = factor(Feature, levels = rev(Feature)), y = Importance, width = 0.5), - environment = environment()) + - ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") + - ggplot2::coord_flip() + - ggplot2::xlab("Features") + - ggplot2::ggtitle("Feature importance") + - ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), + environment = environment()) + + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = "identity") + + ggplot2::coord_flip() + + ggplot2::xlab("Features") + + ggplot2::ggtitle("Feature importance") + + ggplot2::theme(plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank()) return(plot) } @@ -42,7 +42,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med stop("ggplot2 package is required for plotting the graph deepness.", call. 
= FALSE) which <- match.arg(which) - + dt_depths <- xgb.plot.deepness(model = model, plot = FALSE) dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth] setkey(dt_summaries, 'Depth') @@ -60,30 +60,30 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med axis.ticks = ggplot2::element_blank(), axis.text.x = ggplot2::element_blank() ) - - p2 <- + + p2 <- ggplot2::ggplot(dt_summaries) + - ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") + + ggplot2::geom_bar(ggplot2::aes(x = Depth, y = Cover), stat = "Identity") + ggplot2::xlab("Leaf depth") + ggplot2::ylab("Weighted cover") - + multiplot(p1, p2, cols = 1) return(invisible(list(p1, p2))) - + } else if (which == "max.depth") { p <- ggplot2::ggplot(dt_depths[, max(Depth), Tree]) + ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1), - height = 0.15, alpha=0.4, size=3, stroke=0) + + height = 0.15, alpha = 0.4, size = 3, stroke = 0) + ggplot2::xlab("tree #") + ggplot2::ylab("Max tree leaf depth") return(p) - + } else if (which == "med.depth") { p <- ggplot2::ggplot(dt_depths[, median(as.numeric(Depth)), Tree]) + ggplot2::geom_jitter(ggplot2::aes(x = Tree, y = V1), - height = 0.15, alpha=0.4, size=3, stroke=0) + + height = 0.15, alpha = 0.4, size = 3, stroke = 0) + ggplot2::xlab("tree #") + ggplot2::ylab("Median tree leaf depth") return(p) @@ -92,7 +92,7 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med p <- ggplot2::ggplot(dt_depths[, median(abs(Weight)), Tree]) + ggplot2::geom_point(ggplot2::aes(x = Tree, y = V1), - alpha=0.4, size=3, stroke=0) + + alpha = 0.4, size = 3, stroke = 0) + ggplot2::xlab("tree #") + ggplot2::ylab("Median absolute leaf weight") return(p) @@ -105,11 +105,11 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med # internal utility function multiplot <- function(..., cols = 1) { plots <- list(...) - num_plots = length(plots) - + num_plots <- length(plots) + layout <- matrix(seq(1, cols * ceiling(num_plots / cols)), ncol = cols, nrow = ceiling(num_plots / cols)) - + if (num_plots == 1) { print(plots[[1]]) } else { @@ -118,7 +118,7 @@ multiplot <- function(..., cols = 1) { for (i in 1:num_plots) { # Get the i,j matrix positions of the regions that contain this subplot matchidx <- as.data.table(which(layout == i, arr.ind = TRUE)) - + print( plots[[i]], vp = grid::viewport( layout.pos.row = matchidx$row, diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 62e37e8d5..b1c59d98d 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -1,66 +1,66 @@ #' Importance of features in a model. -#' +#' #' Creates a \code{data.table} of feature importances in a model. -#' +#' #' @param feature_names character vector of feature names. If the model already #' contains feature names, those would be used when \code{feature_names=NULL} (default value). #' Non-null \code{feature_names} could be provided to override those in the model. #' @param model object of class \code{xgb.Booster}. #' @param trees (only for the gbtree booster) an integer vector of tree indices that should be included #' into the importance calculation. If set to \code{NULL}, all trees of the model are parsed. -#' It could be useful, e.g., in multiclass classification to get feature importances +#' It could be useful, e.g., in multiclass classification to get feature importances #' for each class separately. 
IMPORTANT: the tree index in xgboost models #' is zero-based (e.g., use \code{trees = 0:4} for first 5 trees). #' @param data deprecated. #' @param label deprecated. #' @param target deprecated. #' -#' @details -#' +#' @details +#' #' This function works for both linear and tree models. -#' -#' For linear models, the importance is the absolute magnitude of linear coefficients. -#' For that reason, in order to obtain a meaningful ranking by importance for a linear model, -#' the features need to be on the same scale (which you also would want to do when using either +#' +#' For linear models, the importance is the absolute magnitude of linear coefficients. +#' For that reason, in order to obtain a meaningful ranking by importance for a linear model, +#' the features need to be on the same scale (which you also would want to do when using either #' L1 or L2 regularization). -#' +#' #' @return -#' +#' #' For a tree model, a \code{data.table} with the following columns: #' \itemize{ #' \item \code{Features} names of the features used in the model; #' \item \code{Gain} represents fractional contribution of each feature to the model based on -#' the total gain of this feature's splits. Higher percentage means a more important +#' the total gain of this feature's splits. Higher percentage means a more important #' predictive feature. #' \item \code{Cover} metric of the number of observation related to this feature; #' \item \code{Frequency} percentage representing the relative number of times #' a feature have been used in trees. #' } -#' +#' #' A linear model's importance \code{data.table} has the following columns: #' \itemize{ #' \item \code{Features} names of the features used in the model; #' \item \code{Weight} the linear coefficient of this feature; #' \item \code{Class} (only for multiclass models) class label. #' } -#' -#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, +#' +#' If \code{feature_names} is not provided and \code{model} doesn't have \code{feature_names}, #' index of the features will be used instead. Because the index is extracted from the model dump #' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). 
-#' +#' #' @examples -#' +#' #' # binomial classification using gbtree: #' data(agaricus.train, package='xgboost') -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' xgb.importance(model = bst) -#' +#' #' # binomial classification using gblinear: -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, booster = "gblinear", #' eta = 0.3, nthread = 1, nrounds = 20, objective = "binary:logistic") #' xgb.importance(model = bst) -#' +#' #' # multiclass classification using gbtree: #' nclass <- 3 #' nrounds <- 10 @@ -73,7 +73,7 @@ #' xgb.importance(model = mbst, trees = seq(from=0, by=nclass, length.out=nrounds)) #' xgb.importance(model = mbst, trees = seq(from=1, by=nclass, length.out=nrounds)) #' xgb.importance(model = mbst, trees = seq(from=2, by=nclass, length.out=nrounds)) -#' +#' #' # multiclass classification using gblinear: #' mbst <- xgboost(data = scale(as.matrix(iris[, -5])), label = as.numeric(iris$Species) - 1, #' booster = "gblinear", eta = 0.2, nthread = 1, nrounds = 15, @@ -83,33 +83,33 @@ #' @export xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, data = NULL, label = NULL, target = NULL){ - + if (!(is.null(data) && is.null(label) && is.null(target))) warning("xgb.importance: parameters 'data', 'label' and 'target' are deprecated") - + if (!inherits(model, "xgb.Booster")) stop("model: must be an object of class xgb.Booster") - + if (is.null(feature_names) && !is.null(model$feature_names)) feature_names <- model$feature_names - + if (!(is.null(feature_names) || is.character(feature_names))) stop("feature_names: Has to be a character vector") model_text_dump <- xgb.dump(model = model, with_stats = TRUE) - + # linear model - if(model_text_dump[2] == "bias:"){ + if (model_text_dump[2] == "bias:"){ weights <- which(model_text_dump == "weight:") %>% {model_text_dump[(. 
+ 1):length(model_text_dump)]} %>% as.numeric - + num_class <- NVL(model$params$num_class, 1) - if(is.null(feature_names)) + if (is.null(feature_names)) feature_names <- seq(to = length(weights) / num_class) - 1 if (length(feature_names) * num_class != length(weights)) stop("feature_names length does not match the number of features used in the model") - + result <- if (num_class == 1) { data.table(Feature = feature_names, Weight = weights)[order(-abs(Weight))] } else { @@ -117,18 +117,17 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL, Weight = weights, Class = seq_len(num_class) - 1)[order(Class, -abs(Weight))] } - } else { - # tree model - result <- xgb.model.dt.tree(feature_names = feature_names, - text = model_text_dump, - trees = trees)[ - Feature != "Leaf", .(Gain = sum(Quality), - Cover = sum(Cover), - Frequency = .N), by = Feature][ - ,`:=`(Gain = Gain / sum(Gain), - Cover = Cover / sum(Cover), - Frequency = Frequency / sum(Frequency))][ - order(Gain, decreasing = TRUE)] + } else { # tree model + result <- xgb.model.dt.tree(feature_names = feature_names, + text = model_text_dump, + trees = trees)[ + Feature != "Leaf", .(Gain = sum(Quality), + Cover = sum(Cover), + Frequency = .N), by = Feature][ + , `:=`(Gain = Gain / sum(Gain), + Cover = Cover / sum(Cover), + Frequency = Frequency / sum(Frequency))][ + order(Gain, decreasing = TRUE)] } result } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 6a00797fe..af872c3cb 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -1,12 +1,12 @@ #' Parse a boosted tree model text dump -#' +#' #' Parse a boosted tree model text dump into a \code{data.table} structure. -#' +#' #' @param feature_names character vector of feature names. If the model already #' contains feature names, those would be used when \code{feature_names=NULL} (default value). #' Non-null \code{feature_names} could be provided to override those in the model. #' @param model object of class \code{xgb.Booster} -#' @param text \code{character} vector previously generated by the \code{xgb.dump} +#' @param text \code{character} vector previously generated by the \code{xgb.dump} #' function (where parameter \code{with_stats = TRUE} should have been set). #' \code{text} takes precedence over \code{model}. #' @param trees an integer vector of tree indices that should be parsed. @@ -18,11 +18,11 @@ #' represented as integers (when FALSE) or as "Tree-Node" character strings (when FALSE). #' @param ... currently not used. #' -#' @return +#' @return #' A \code{data.table} with detailed information about model trees' nodes. #' #' The columns of the \code{data.table} are: -#' +#' #' \itemize{ #' \item \code{Tree}: integer ID of a tree in a model (zero-based index) #' \item \code{Node}: integer ID of a node in a tree (zero-based index) @@ -36,79 +36,79 @@ #' \item \code{Quality}: either the split gain (change in loss) or the leaf value #' \item \code{Cover}: metric related to the number of observation either seen by a split #' or collected by a leaf during training. -#' } -#' +#' } +#' #' When \code{use_int_id=FALSE}, columns "Yes", "No", and "Missing" point to model-wide node identifiers -#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from +#' in the "ID" column. When \code{use_int_id=TRUE}, those columns point to node identifiers from #' the corresponding trees in the "Node" column. 
-#' +#' #' @examples #' # Basic use: -#' +#' #' data(agaricus.train, package='xgboost') -#' -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") -#' +#' #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) -#' -#' # This bst model already has feature_names stored with it, so those would be used when +#' +#' # This bst model already has feature_names stored with it, so those would be used when #' # feature_names is not set: #' (dt <- xgb.model.dt.tree(model = bst)) -#' +#' #' # How to match feature names of splits that are following a current 'Yes' branch: -#' +#' #' merge(dt, dt[, .(ID, Y.Feature=Feature)], by.x='Yes', by.y='ID', all.x=TRUE)[order(Tree,Node)] -#' +#' #' @export xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, trees = NULL, use_int_id = FALSE, ...){ check.deprecation(...) - + if (!inherits(model, "xgb.Booster") && !is.character(text)) { stop("Either 'model' must be an object of class xgb.Booster\n", " or 'text' must be a character vector with the result of xgb.dump\n", " (or NULL if 'model' was provided).") } - + if (is.null(feature_names) && !is.null(model) && !is.null(model$feature_names)) feature_names <- model$feature_names - + if (!(is.null(feature_names) || is.character(feature_names))) { stop("feature_names: must be a character vector") } - + if (!(is.null(trees) || is.numeric(trees))) { stop("trees: must be a vector of integers.") } - + if (is.null(text)){ text <- xgb.dump(model = model, with_stats = TRUE) } - + if (length(text) < 2 || sum(stri_detect_regex(text, 'yes=(\\d+),no=(\\d+)')) < 1) { stop("Non-tree model detected! This function can only be used with tree models.") } - + position <- which(!is.na(stri_match_first_regex(text, "booster"))) - + add.tree.id <- function(node, tree) if (use_int_id) node else paste(tree, node, sep = "-") - + anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" 
- + td <- data.table(t = text) td[position, Tree := 1L] td[, Tree := cumsum(ifelse(is.na(Tree), 0L, Tree)) - 1L] - + if (is.null(trees)) { trees <- 0:max(td$Tree) } else { trees <- trees[trees >= 0 & trees <= max(td$Tree)] } td <- td[Tree %in% trees & !grepl('^booster', t)] - - td[, Node := stri_match_first_regex(t, "(\\d+):")[,2] %>% as.integer ] + + td[, Node := stri_match_first_regex(t, "(\\d+):")[, 2] %>% as.integer] if (!use_int_id) td[, ID := add.tree.id(Node, Tree)] td[, isLeaf := !is.na(stri_match_first_regex(t, "leaf"))] @@ -116,29 +116,29 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, branch_rx <- paste0("f(\\d+)<(", anynumber_regex, ")\\] yes=(\\d+),no=(\\d+),missing=(\\d+),", "gain=(", anynumber_regex, "),cover=(", anynumber_regex, ")") branch_cols <- c("Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover") - td[isLeaf == FALSE, + td[isLeaf == FALSE, (branch_cols) := { # skip some indices with spurious capture groups from anynumber_regex - xtr <- stri_match_first_regex(t, branch_rx)[, c(2,3,5,6,7,8,10), drop = FALSE] + xtr <- stri_match_first_regex(t, branch_rx)[, c(2, 3, 5, 6, 7, 8, 10), drop = FALSE] xtr[, 3:5] <- add.tree.id(xtr[, 3:5], Tree) - lapply(seq_len(ncol(xtr)), function(i) xtr[,i]) + lapply(seq_len(ncol(xtr)), function(i) xtr[, i]) }] # assign feature_names when available if (!is.null(feature_names)) { if (length(feature_names) <= max(as.numeric(td$Feature), na.rm = TRUE)) stop("feature_names has less elements than there are features used in the model") - td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1] ] + td[isLeaf == FALSE, Feature := feature_names[as.numeric(Feature) + 1]] } - + # parse leaf lines leaf_rx <- paste0("leaf=(", anynumber_regex, "),cover=(", anynumber_regex, ")") leaf_cols <- c("Feature", "Quality", "Cover") td[isLeaf == TRUE, (leaf_cols) := { - xtr <- stri_match_first_regex(t, leaf_rx)[, c(2,4)] - c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[,i])) + xtr <- stri_match_first_regex(t, leaf_rx)[, c(2, 4)] + c("Leaf", lapply(seq_len(ncol(xtr)), function(i) xtr[, i])) }] - + # convert some columns to numeric numeric_cols <- c("Split", "Quality", "Cover") td[, (numeric_cols) := lapply(.SD, as.numeric), .SDcols = numeric_cols] @@ -146,14 +146,14 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, int_cols <- c("Yes", "No", "Missing") td[, (int_cols) := lapply(.SD, as.integer), .SDcols = int_cols] } - + td[, t := NULL] td[, isLeaf := NULL] - + td[order(Tree, Node)] } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf",".SD", ".SDcols")) +globalVariables(c("Tree", "Node", "ID", "Feature", "t", "isLeaf", ".SD", ".SDcols")) diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 87d632ace..6579fb511 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -2,48 +2,48 @@ #' #' Visualizes distributions related to depth of tree leafs. #' \code{xgb.plot.deepness} uses base R graphics, while \code{xgb.ggplot.deepness} uses the ggplot backend. -#' +#' #' @param model either an \code{xgb.Booster} model generated by the \code{xgb.train} function #' or a data.table result of the \code{xgb.model.dt.tree} function. -#' @param plot (base R barplot) whether a barplot should be produced. 
+#' @param plot (base R barplot) whether a barplot should be produced. #' If FALSE, only a data.table is returned. #' @param which which distribution to plot (see details). #' @param ... other parameters passed to \code{barplot} or \code{plot}. -#' +#' #' @details -#' +#' #' When \code{which="2x1"}, two distributions with respect to the leaf depth #' are plotted on top of each other: #' \itemize{ #' \item the distribution of the number of leafs in a tree model at a certain depth; -#' \item the distribution of average weighted number of observations ("cover") +#' \item the distribution of average weighted number of observations ("cover") #' ending up in leafs at certain depth. #' } -#' Those could be helpful in determining sensible ranges of the \code{max_depth} +#' Those could be helpful in determining sensible ranges of the \code{max_depth} #' and \code{min_child_weight} parameters. -#' +#' #' When \code{which="max.depth"} or \code{which="med.depth"}, plots of either maximum or median depth #' per tree with respect to tree number are created. And \code{which="med.weight"} allows to see how #' a tree's median absolute leaf weight changes through the iterations. #' #' This function was inspired by the blog post #' \url{https://github.com/aysent/random-forest-leaf-visualization}. -#' +#' #' @return -#' +#' #' Other than producing plots (when \code{plot=TRUE}), the \code{xgb.plot.deepness} function #' silently returns a processed data.table where each row corresponds to a terminal leaf in a tree model, #' and contains information about leaf's depth, cover, and weight (which is used in calculating predictions). -#' +#' #' The \code{xgb.ggplot.deepness} silently returns either a list of two ggplot graphs when \code{which="2x1"} #' or a single ggplot graph for the other \code{which} options. #' -#' @seealso -#' +#' @seealso +#' #' \code{\link{xgb.train}}, \code{\link{xgb.model.dt.tree}}. -#' +#' #' @examples -#' +#' #' data(agaricus.train, package='xgboost') #' #' # Change max_depth to a higher number to get a more significant result @@ -53,16 +53,16 @@ #' #' xgb.plot.deepness(bst) #' xgb.ggplot.deepness(bst) -#' +#' #' xgb.plot.deepness(bst, which='max.depth', pch=16, col=rgb(0,0,1,0.3), cex=2) -#' +#' #' xgb.plot.deepness(bst, which='med.weight', pch=16, col=rgb(0,0,1,0.3), cex=2) #' #' @rdname xgb.plot.deepness #' @export xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.depth", "med.weight"), plot = TRUE, ...) { - + if (!(inherits(model, "xgb.Booster") || is.data.table(model))) stop("model: Has to be either an xgb.Booster model generaged by the xgb.train function\n", "or a data.table result of the xgb.importance function") @@ -71,32 +71,32 @@ xgb.plot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med.d stop("igraph package is required for plotting the graph deepness.", call. 
= FALSE) which <- match.arg(which) - + dt_tree <- model if (inherits(model, "xgb.Booster")) dt_tree <- xgb.model.dt.tree(model = model) - + if (!all(c("Feature", "Tree", "ID", "Yes", "No", "Cover") %in% colnames(dt_tree))) stop("Model tree columns are not as expected!\n", " Note that this function works only for tree models.") - + dt_depths <- merge(get.leaf.depth(dt_tree), dt_tree[, .(ID, Cover, Weight = Quality)], by = "ID") setkeyv(dt_depths, c("Tree", "ID")) # count by depth levels, and also calculate average cover at a depth dt_summaries <- dt_depths[, .(.N, Cover = mean(Cover)), Depth] setkey(dt_summaries, "Depth") - + if (plot) { if (which == "2x1") { op <- par(no.readonly = TRUE) - par(mfrow = c(2,1), - oma = c(3,1,3,1) + 0.1, - mar = c(1,4,1,0) + 0.1) + par(mfrow = c(2, 1), + oma = c(3, 1, 3, 1) + 0.1, + mar = c(1, 4, 1, 0) + 0.1) dt_summaries[, barplot(N, border = NA, ylab = 'Number of leafs', ...)] dt_summaries[, barplot(Cover, border = NA, ylab = "Weighted cover", names.arg = Depth, ...)] - + title("Model complexity", xlab = "Leaf depth", outer = TRUE, line = 1) par(op) } else if (which == "max.depth") { @@ -123,14 +123,14 @@ get.leaf.depth <- function(dt_tree) { dt_tree[Feature != "Leaf", .(ID, To = No, Tree)] )) # whether "To" is a leaf: - dt_edges <- + dt_edges <- merge(dt_edges, dt_tree[Feature == "Leaf", .(ID, Leaf = TRUE)], all.x = TRUE, by.x = "To", by.y = "ID") dt_edges[is.na(Leaf), Leaf := FALSE] dt_edges[, { - graph <- igraph::graph_from_data_frame(.SD[,.(ID, To)]) + graph <- igraph::graph_from_data_frame(.SD[, .(ID, To)]) # min(ID) in a tree is a root node paths_tmp <- igraph::shortest_paths(graph, from = min(ID), to = To[Leaf == TRUE]) # list of paths to each leaf in a tree diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 598bd3b1f..b6bcade1f 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -92,10 +92,10 @@ xgb.plot.importance <- function(importance_matrix = NULL, top_n = NULL, measure importance_matrix <- head(importance_matrix, top_n) } if (rel_to_first) { - importance_matrix[, Importance := Importance/max(abs(Importance))] + importance_matrix[, Importance := Importance / max(abs(Importance))] } if (is.null(cex)) { - cex <- 2.5/log2(1 + nrow(importance_matrix)) + cex <- 2.5 / log2(1 + nrow(importance_matrix)) } if (plot) { diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 3e7b04b8c..fc02baee1 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -9,7 +9,7 @@ #' @param plot_height height in pixels of the graph to produce #' @param render a logical flag for whether the graph should be rendered (see Value). #' @param ... 
currently not used -#' +#' #' @details #' #' This function tries to capture the complexity of a gradient boosted tree model @@ -72,53 +72,53 @@ xgb.plot.multi.trees <- function(model, feature_names = NULL, features_keep = 5, precedent.nodes <- root.nodes - while(tree.matrix[,sum(is.na(abs.node.position))] > 0) { + while (tree.matrix[, sum(is.na(abs.node.position))] > 0) { yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)] no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)] yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0") no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1") - + tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos] tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos] precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) } - + tree.matrix[!is.na(Yes), Yes := paste0(abs.node.position, "_0")] tree.matrix[!is.na(No), No := paste0(abs.node.position, "_1")] - + remove.tree <- . %>% stri_replace_first_regex(pattern = "^\\d+-", replacement = "") - - tree.matrix[,`:=`(abs.node.position = remove.tree(abs.node.position), - Yes = remove.tree(Yes), - No = remove.tree(No))] - + + tree.matrix[, `:=`(abs.node.position = remove.tree(abs.node.position), + Yes = remove.tree(Yes), + No = remove.tree(No))] + nodes.dt <- tree.matrix[ , .(Quality = sum(Quality)) , by = .(abs.node.position, Feature) ][, .(Text = paste0(Feature[1:min(length(Feature), features_keep)], " (", - format(Quality[1:min(length(Quality), features_keep)], digits=5), + format(Quality[1:min(length(Quality), features_keep)], digits = 5), ")") %>% paste0(collapse = "\n")) , by = abs.node.position] - + edges.dt <- tree.matrix[Feature != "Leaf", .(abs.node.position, Yes)] %>% - list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% + list(tree.matrix[Feature != "Leaf", .(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[, .N, .(From, To)] %>% - .[, N:=NULL] - + .[, N := NULL] + nodes <- DiagrammeR::create_node_df( n = nrow(nodes.dt), - label = nodes.dt[,Text] + label = nodes.dt[, Text] ) - + edges <- DiagrammeR::create_edge_df( - from = match(edges.dt[,From], nodes.dt[,abs.node.position]), - to = match(edges.dt[,To], nodes.dt[,abs.node.position]), + from = match(edges.dt[, From], nodes.dt[, abs.node.position]), + to = match(edges.dt[, To], nodes.dt[, abs.node.position]), rel = "leading_to") - + graph <- DiagrammeR::create_graph( nodes_df = nodes, edges_df = edges, diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 18f6aaa44..0eb3c1330 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -125,12 +125,12 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data)) idx <- sample(1:nrow(data), nsample) - data <- data[idx,] + data <- data[idx, ] if (is.null(shap_contrib)) { shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib) } else { - shap_contrib <- shap_contrib[idx,] + shap_contrib <- shap_contrib[idx, ] } which <- match.arg(which) @@ -168,8 +168,8 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, if (plot && which == "1d") { op <- par(mfrow = c(ceiling(length(features) / n_col), n_col), - oma = c(0,0,0,0) + 0.2, - mar = c(3.5,3.5,0,0) + 0.1, + oma = c(0, 0, 0, 0) + 0.2, + mar = c(3.5, 3.5, 0, 0) + 
0.1, mgp = c(1.7, 0.6, 0)) for (f in cols) { ord <- order(data[, f]) @@ -192,7 +192,7 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, grid() if (plot_loess) { # compress x to 3 digits, and mean-aggredate y - zz <- data.table(x = signif(x, 3), y)[, .(.N, y=mean(y)), x] + zz <- data.table(x = signif(x, 3), y)[, .(.N, y = mean(y)), x] if (nrow(zz) <= 5) { lines(zz$x, zz$y, col = col_loess) } else { diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 29c37d606..6f0efd509 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -1,7 +1,7 @@ #' Plot a boosted tree model -#' -#' Read a tree model text dump and plot the model. -#' +#' +#' Read a tree model text dump and plot the model. +#' #' @param feature_names names of each feature as a \code{character} vector. #' @param model produced by the \code{xgb.train} function. #' @param trees an integer vector of tree indices that should be visualized. @@ -14,10 +14,10 @@ #' @param show_node_id a logical flag for whether to show node id's in the graph. #' @param ... currently not used. #' -#' @details -#' +#' @details +#' #' The content of each node is organised that way: -#' +#' #' \itemize{ #' \item Feature name. #' \item \code{Cover}: The sum of second order gradient of training data classified to the leaf. @@ -27,21 +27,21 @@ #' \item \code{Gain} (for split nodes): the information gain metric of a split #' (corresponds to the importance of the node in the model). #' \item \code{Value} (for leafs): the margin value that the leaf may contribute to prediction. -#' } +#' } #' The tree root nodes also indicate the Tree index (0-based). -#' +#' #' The "Yes" branches are marked by the "< split_value" label. #' The branches that also used for missing values are marked as bold #' (as in "carrying extra capacity"). -#' +#' #' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR. -#' +#' #' @return -#' +#' #' When \code{render = TRUE}: #' returns a rendered graph object which is an \code{htmlwidget} of class \code{grViz}. #' Similar to ggplot objects, it needs to be printed to see it when not running from command line. -#' +#' #' When \code{render = FALSE}: #' silently returns a graph object which is of DiagrammeR's class \code{dgr_graph}. #' This could be useful if one wants to modify some of the graph attributes @@ -49,23 +49,23 @@ #' #' @examples #' data(agaricus.train, package='xgboost') -#' +#' #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' # plot all the trees #' xgb.plot.tree(model = bst) #' # plot only the first tree and display the node ID: #' xgb.plot.tree(model = bst, trees = 0, show_node_id = TRUE) -#' +#' #' \dontrun{ -#' # Below is an example of how to save this plot to a file. +#' # Below is an example of how to save this plot to a file. #' # Note that for `export_graph` to work, the DiagrammeRsvg and rsvg packages must also be installed. 
#' library(DiagrammeR) #' gr <- xgb.plot.tree(model=bst, trees=0:1, render=FALSE) #' export_graph(gr, 'tree.pdf', width=1500, height=1900) #' export_graph(gr, 'tree.png', width=1500, height=1900) #' } -#' +#' #' @export xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot_width = NULL, plot_height = NULL, render = TRUE, show_node_id = FALSE, ...){ @@ -77,18 +77,18 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot if (!requireNamespace("DiagrammeR", quietly = TRUE)) { stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) } - + dt <- xgb.model.dt.tree(feature_names = feature_names, model = model, trees = trees) - dt[, label:= paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)] + dt[, label := paste0(Feature, "\nCover: ", Cover, ifelse(Feature == "Leaf", "\nValue: ", "\nGain: "), Quality)] if (show_node_id) dt[, label := paste0(ID, ": ", label)] dt[Node == 0, label := paste0("Tree ", Tree, "\n", label)] - dt[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] - dt[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] + dt[, shape := "rectangle"][Feature == "Leaf", shape := "oval"] + dt[, filledcolor := "Beige"][Feature == "Leaf", filledcolor := "Khaki"] # in order to draw the first tree on top: dt <- dt[order(-Tree)] - + nodes <- DiagrammeR::create_node_df( n = nrow(dt), ID = dt$ID, @@ -97,7 +97,7 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot shape = dt$shape, data = dt$Feature, fontcolor = "black") - + edges <- DiagrammeR::create_edge_df( from = match(dt[Feature != "Leaf", c(ID)] %>% rep(2), dt$ID), to = match(dt[Feature != "Leaf", c(Yes, No)], dt$ID), @@ -126,9 +126,9 @@ xgb.plot.tree <- function(feature_names = NULL, model = NULL, trees = NULL, plot attr_type = "edge", attr = c("color", "arrowsize", "arrowhead", "fontname"), value = c("DimGray", "1.5", "vee", "Helvetica")) - + if (!render) return(invisible(graph)) - + DiagrammeR::render_graph(graph, width = plot_width, height = plot_height) } diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index d969dae61..4a5d462ec 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -1,29 +1,29 @@ #' Save xgboost model to binary file -#' +#' #' Save xgboost model to a file in binary format. -#' +#' #' @param model model object of \code{xgb.Booster} class. #' @param fname name of the file to write. -#' -#' @details -#' This method allows saving a model in an xgboost-internal binary format which is universal +#' +#' @details +#' This method allows saving a model in an xgboost-internal binary format which is universal #' among the various xgboost interfaces. In R, the saved model file could be read-in later -#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter +#' using either the \code{\link{xgb.load}} function or the \code{xgb_model} parameter #' of \code{\link{xgb.train}}. -#' -#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{saveRDS}} -#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and +#' +#' Note: a model can also be saved as an R-object (e.g., by using \code{\link[base]{saveRDS}} +#' or \code{\link[base]{save}}). However, it would then only be compatible with R, and #' corresponding R-methods would need to be used to load it. -#' -#' @seealso -#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
-#' +#' +#' @seealso +#' \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}. +#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, +#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, #' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") #' xgb.save(bst, 'xgb.model') #' bst <- xgb.load('xgb.model') diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 2f00e09a2..a6755f5fa 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -3,7 +3,7 @@ #' \code{xgb.train} is an advanced interface for training an xgboost model. #' The \code{xgboost} function is a simpler wrapper for \code{xgb.train}. #' -#' @param params the list of parameters. The complete list of parameters is +#' @param params the list of parameters. The complete list of parameters is #' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below #' is a shorter summary: #' @@ -278,7 +278,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), # evaluation printing callback params <- c(params) - print_every_n <- max( as.integer(print_every_n), 1L) + print_every_n <- max(as.integer(print_every_n), 1L) if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) @@ -328,12 +328,9 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), niter_init <- xgb.ntree(bst) %/% (num_parallel_tree * num_class) } } - if(is_update && nrounds > niter_init) + if (is_update && nrounds > niter_init) stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") - # TODO: distributed code - rank <- 0 - niter_skip <- ifelse(is_update, 0, niter_init) begin_iteration <- niter_skip + 1 end_iteration <- niter_skip + nrounds @@ -345,7 +342,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) - bst_evaluation <- numeric(0) if (length(watchlist) > 0) bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval) @@ -360,7 +356,7 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), bst <- xgb.Booster.complete(bst, saveraw = TRUE) # store the total number of boosting iterations - bst$niter = end_iteration + bst$niter <- end_iteration # store the evaluation results if (length(evaluation_log) > 0 && diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index b23e4dd70..3cbf7f991 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -2,19 +2,19 @@ require(xgboost) context("basic functions") -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test set.seed(1994) # disable some tests for Win32 -windows_flag = .Platform$OS.type == "windows" && +windows_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8 -solaris_flag = (Sys.info()['sysname'] == "SunOS") +solaris_flag <- (Sys.info()['sysname'] == "SunOS") test_that("train and predict binary classification", { - nrounds = 2 + nrounds <- 2 expect_output( bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, 
nthread = 2, nrounds = nrounds, objective = "binary:logistic") @@ -30,24 +30,24 @@ test_that("train and predict binary classification", { pred1 <- predict(bst, train$data, ntreelimit = 1) expect_length(pred1, 6513) - err_pred1 <- sum((pred1 > 0.5) != train$label)/length(train$label) + err_pred1 <- sum((pred1 > 0.5) != train$label) / length(train$label) err_log <- bst$evaluation_log[1, train_error] expect_lt(abs(err_pred1 - err_log), 10e-6) }) test_that("parameter validation works", { p <- list(foo = "bar") - nrounds = 1 + nrounds <- 1 set.seed(1994) d <- cbind( x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10)) - y <- d[,"x1"] + d[,"x2"]^2 + - ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) + + y <- d[, "x1"] + d[, "x2"]^2 + + ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + rnorm(10) - dtrain <- xgb.DMatrix(data=d, info = list(label=y)) + dtrain <- xgb.DMatrix(data = d, info = list(label = y)) correct <- function() { params <- list(max_depth = 2, booster = "dart", @@ -70,15 +70,15 @@ test_that("parameter validation works", { test_that("dart prediction works", { - nrounds = 32 + nrounds <- 32 set.seed(1994) d <- cbind( x1 = rnorm(100), x2 = rnorm(100), x3 = rnorm(100)) - y <- d[,"x1"] + d[,"x2"]^2 + - ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) + + y <- d[, "x1"] + d[, "x2"]^2 + + ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + rnorm(100) set.seed(1994) @@ -87,23 +87,23 @@ test_that("dart prediction works", { eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror") pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0) pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds) - expect_true(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE))) + expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) pred_by_xgboost_2 <- predict(booster_by_xgboost, newdata = d, training = TRUE) - expect_false(all(matrix(pred_by_xgboost_0, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE))) + expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) set.seed(1994) - dtrain <- xgb.DMatrix(data=d, info = list(label=y)) - booster_by_train <- xgb.train( params = list( - booster = "dart", - max_depth = 2, - eta = 1, - rate_drop = 0.5, - one_drop = TRUE, - nthread = 1, - tree_method= "exact", - objective = "reg:squarederror" - ), + dtrain <- xgb.DMatrix(data = d, info = list(label = y)) + booster_by_train <- xgb.train(params = list( + booster = "dart", + max_depth = 2, + eta = 1, + rate_drop = 0.5, + one_drop = TRUE, + nthread = 1, + tree_method = "exact", + objective = "reg:squarederror" + ), data = dtrain, nrounds = nrounds ) @@ -111,9 +111,9 @@ test_that("dart prediction works", { pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) - expect_true(all(matrix(pred_by_train_0, byrow=TRUE) == matrix(pred_by_xgboost_0, byrow=TRUE))) - expect_true(all(matrix(pred_by_train_1, byrow=TRUE) == matrix(pred_by_xgboost_1, byrow=TRUE))) - expect_true(all(matrix(pred_by_train_2, byrow=TRUE) == matrix(pred_by_xgboost_2, byrow=TRUE))) + expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE))) + expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) + expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) 
}) test_that("train and predict softprob", { @@ -122,7 +122,7 @@ test_that("train and predict softprob", { expect_output( bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, - objective = "multi:softprob", num_class=3) + objective = "multi:softprob", num_class = 3) , "train-merror") expect_false(is.null(bst$evaluation_log)) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) @@ -130,17 +130,17 @@ test_that("train and predict softprob", { pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris) * 3) # row sums add up to total probability of 1: - expect_equal(rowSums(matrix(pred, ncol=3, byrow=TRUE)), rep(1, nrow(iris)), tolerance = 1e-7) + expect_equal(rowSums(matrix(pred, ncol = 3, byrow = TRUE)), rep(1, nrow(iris)), tolerance = 1e-7) # manually calculate error at the last iteration: mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) expect_equal(as.numeric(t(mpred)), pred) pred_labels <- max.col(mpred) - 1 - err <- sum(pred_labels != lb)/length(lb) + err <- sum(pred_labels != lb) / length(lb) expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) # manually calculate error at the 1st iteration: mpred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 1) pred_labels <- max.col(mpred) - 1 - err <- sum(pred_labels != lb)/length(lb) + err <- sum(pred_labels != lb) / length(lb) expect_equal(bst$evaluation_log[1, train_merror], err, tolerance = 5e-6) }) @@ -150,7 +150,7 @@ test_that("train and predict softmax", { expect_output( bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5, - objective = "multi:softmax", num_class=3) + objective = "multi:softmax", num_class = 3) , "train-merror") expect_false(is.null(bst$evaluation_log)) expect_lt(bst$evaluation_log[, min(train_merror)], 0.025) @@ -158,7 +158,7 @@ test_that("train and predict softmax", { pred <- predict(bst, as.matrix(iris[, -5])) expect_length(pred, nrow(iris)) - err <- sum(pred != lb)/length(lb) + err <- sum(pred != lb) / length(lb) expect_equal(bst$evaluation_log[5, train_merror], err, tolerance = 5e-6) }) @@ -173,12 +173,12 @@ test_that("train and predict RF", { expect_equal(xgb.ntree(bst), 20) pred <- predict(bst, train$data) - pred_err <- sum((pred > 0.5) != lb)/length(lb) + pred_err <- sum((pred > 0.5) != lb) / length(lb) expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6) #expect_lt(pred_err, 0.03) pred <- predict(bst, train$data, ntreelimit = 20) - pred_err_20 <- sum((pred > 0.5) != lb)/length(lb) + pred_err_20 <- sum((pred > 0.5) != lb) / length(lb) expect_equal(pred_err_20, pred_err) #pred <- predict(bst, train$data, ntreelimit = 1) @@ -193,19 +193,19 @@ test_that("train and predict RF with softprob", { set.seed(11) bst <- xgboost(data = as.matrix(iris[, -5]), label = lb, max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds, - objective = "multi:softprob", num_class=3, verbose = 0, + objective = "multi:softprob", num_class = 3, verbose = 0, num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5) expect_equal(bst$niter, 15) - expect_equal(xgb.ntree(bst), 15*3*4) + expect_equal(xgb.ntree(bst), 15 * 3 * 4) # predict for all iterations: - pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE) + pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE) expect_equal(dim(pred), c(nrow(iris), 3)) pred_labels <- max.col(pred) - 1 - err <- sum(pred_labels != lb)/length(lb) + err <- sum(pred_labels != lb) / length(lb) 
expect_equal(bst$evaluation_log[nrounds, train_merror], err, tolerance = 5e-6) # predict for 7 iterations and adjust for 4 parallel trees per iteration - pred <- predict(bst, as.matrix(iris[, -5]), reshape=TRUE, ntreelimit = 7 * 4) - err <- sum((max.col(pred) - 1) != lb)/length(lb) + pred <- predict(bst, as.matrix(iris[, -5]), reshape = TRUE, ntreelimit = 7 * 4) + err <- sum((max.col(pred) - 1) != lb) / length(lb) expect_equal(bst$evaluation_log[7, train_merror], err, tolerance = 5e-6) }) @@ -223,7 +223,7 @@ test_that("use of multiple eval metrics works", { test_that("training continuation works", { dtrain <- xgb.DMatrix(train$data, label = train$label) - watchlist = list(train=dtrain) + watchlist <- list(train = dtrain) param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2) # for the reference, use 4 iterations at once: @@ -255,7 +255,7 @@ test_that("training continuation works", { test_that("model serialization works", { out_path <- "model_serialization" dtrain <- xgb.DMatrix(train$data, label = train$label) - watchlist = list(train=dtrain) + watchlist <- list(train = dtrain) param <- list(objective = "binary:logistic") booster <- xgb.train(param, dtrain, nrounds = 4, watchlist) raw <- xgb.serialize(booster) @@ -273,7 +273,7 @@ test_that("xgb.cv works", { expect_output( cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5, eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", - verbose=TRUE) + verbose = TRUE) , "train-error:") expect_is(cv, 'xgb.cv.synchronous') expect_false(is.null(cv$evaluation_log)) @@ -292,11 +292,11 @@ test_that("xgb.cv works with stratified folds", { set.seed(314159) cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", - verbose=TRUE, stratified = FALSE) + verbose = TRUE, stratified = FALSE) set.seed(314159) cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic", - verbose=TRUE, stratified = TRUE) + verbose = TRUE, stratified = TRUE) # Stratified folds should result in different evaluation logs expect_true(all(cv$evaluation_log[, test_error_mean] != cv2$evaluation_log[, test_error_mean])) }) @@ -319,7 +319,7 @@ test_that("train and predict with non-strict classes", { expect_equal(pr0, pr) # dense matrix-like input of non-matrix class with some inheritance - class(train_dense) <- c('pphmatrix','shmatrix') + class(train_dense) <- c('pphmatrix', 'shmatrix') expect_true(is.matrix(train_dense)) expect_error( bst <- xgboost(data = train_dense, label = train$label, max_depth = 2, @@ -337,15 +337,15 @@ test_that("train and predict with non-strict classes", { test_that("max_delta_step works", { dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) watchlist <- list(train = dtrain) - param <- list(objective = "binary:logistic", eval_metric="logloss", max_depth = 2, nthread = 2, eta = 0.5) - nrounds = 5 + param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5) + nrounds <- 5 # model with no restriction on max_delta_step bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1) # model with restricted max_delta_step bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1) # the no-restriction model is expected to have consistently lower loss during the initial iterations expect_true(all(bst1$evaluation_log$train_logloss < bst2$evaluation_log$train_logloss)) -
expect_lt(mean(bst1$evaluation_log$train_logloss)/mean(bst2$evaluation_log$train_logloss), 0.8) + expect_lt(mean(bst1$evaluation_log$train_logloss) / mean(bst2$evaluation_log$train_logloss), 0.8) }) test_that("colsample_bytree works", { diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index e7230d18e..9016c1bcb 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -5,8 +5,8 @@ require(data.table) context("callbacks") -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test @@ -21,24 +21,24 @@ ltrain <- add.noise(train$label, 0.2) ltest <- add.noise(test$label, 0.2) dtrain <- xgb.DMatrix(train$data, label = ltrain) dtest <- xgb.DMatrix(test$data, label = ltest) -watchlist = list(train=dtrain, test=dtest) +watchlist <- list(train = dtrain, test = dtest) -err <- function(label, pr) sum((pr > 0.5) != label)/length(label) +err <- function(label, pr) sum((pr > 0.5) != label) / length(label) param <- list(objective = "binary:logistic", max_depth = 2, nthread = 2) test_that("cb.print.evaluation works as expected", { - bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8) + bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) bst_evaluation_err <- NULL begin_iteration <- 1 end_iteration <- 7 - f0 <- cb.print.evaluation(period=0) - f1 <- cb.print.evaluation(period=1) - f5 <- cb.print.evaluation(period=5) + f0 <- cb.print.evaluation(period = 0) + f1 <- cb.print.evaluation(period = 1) + f5 <- cb.print.evaluation(period = 5) expect_false(is.null(attr(f1, 'call'))) expect_equal(attr(f1, 'name'), 'cb.print.evaluation') @@ -57,13 +57,13 @@ test_that("cb.print.evaluation works as expected", { expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2) + bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\\+0.100000\ttest-auc:0.800000\\+0.200000") }) test_that("cb.evaluation.log works as expected", { - bst_evaluation <- c('train-auc'=0.9, 'test-auc'=0.8) + bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) bst_evaluation_err <- NULL evaluation_log <- list() @@ -75,33 +75,33 @@ test_that("cb.evaluation.log works as expected", { iteration <- 1 expect_silent(f()) expect_equal(evaluation_log, - list(c(iter=1, bst_evaluation))) + list(c(iter = 1, bst_evaluation))) iteration <- 2 expect_silent(f()) expect_equal(evaluation_log, - list(c(iter=1, bst_evaluation), c(iter=2, bst_evaluation))) + list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation))) expect_silent(f(finalize = TRUE)) expect_equal(evaluation_log, - data.table(iter=1:2, train_auc=c(0.9,0.9), test_auc=c(0.8,0.8))) + data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8))) - bst_evaluation_err <- c('train-auc'=0.1, 'test-auc'=0.2) + bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) evaluation_log <- list() f <- cb.evaluation.log() iteration <- 1 expect_silent(f()) expect_equal(evaluation_log, - list(c(iter=1, c(bst_evaluation, bst_evaluation_err)))) + list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)))) iteration <- 2 expect_silent(f()) expect_equal(evaluation_log, - list(c(iter=1, c(bst_evaluation, bst_evaluation_err)), - c(iter=2, c(bst_evaluation, 
bst_evaluation_err)))) + list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)), + c(iter = 2, c(bst_evaluation, bst_evaluation_err)))) expect_silent(f(finalize = TRUE)) expect_equal(evaluation_log, - data.table(iter=1:2, - train_auc_mean=c(0.9,0.9), train_auc_std=c(0.1,0.1), - test_auc_mean=c(0.8,0.8), test_auc_std=c(0.2,0.2))) + data.table(iter = 1:2, + train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1), + test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2))) }) @@ -237,7 +237,7 @@ test_that("early stopping using a specific metric works", { set.seed(11) expect_output( bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.6, - eval_metric="logloss", eval_metric="auc", + eval_metric = "logloss", eval_metric = "auc", callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE, metric_name = 'test_logloss'))) , "Stopping. Best iteration") @@ -267,12 +267,12 @@ test_that("early stopping xgb.cv works", { test_that("prediction in xgb.cv works", { set.seed(11) - nrounds = 4 + nrounds <- 4 cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) expect_false(is.null(cv$pred)) expect_length(cv$pred, nrow(train$data)) - err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) ) + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) err_log <- cv$evaluation_log[nrounds, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) @@ -308,7 +308,7 @@ test_that("prediction in early-stopping xgb.cv works", { expect_false(is.null(cv$pred)) expect_length(cv$pred, nrow(train$data)) - err_pred <- mean( sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f]))) ) + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) err_log_last <- cv$evaluation_log[cv$niter, test_error_mean] diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index ab01147ba..8b504e7f5 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -4,8 +4,8 @@ require(xgboost) set.seed(1994) -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) watchlist <- list(eval = dtest, train = dtrain) @@ -24,8 +24,8 @@ evalerror <- function(preds, dtrain) { return(list(metric = "error", value = err)) } -param <- list(max_depth=2, eta=1, nthread = 2, - objective=logregobj, eval_metric=evalerror) +param <- list(max_depth = 2, eta = 1, nthread = 2, + objective = logregobj, eval_metric = evalerror) num_round <- 2 test_that("custom objective works", { @@ -37,7 +37,7 @@ test_that("custom objective works", { }) test_that("custom objective in CV works", { - cv <- xgb.cv(param, dtrain, num_round, nfold=10, verbose=FALSE) + cv <- xgb.cv(param, dtrain, num_round, nfold = 10, verbose = FALSE) expect_false(is.null(cv$evaluation_log)) expect_equal(dim(cv$evaluation_log), c(2, 5)) expect_lt(cv$evaluation_log[num_round, test_error_mean], 0.03) @@ -54,14 +54,14 @@ test_that("custom objective using DMatrix attr works", { hess <- preds * (1 - preds) return(list(grad = grad, hess = 
hess)) } - param$objective = logregobjattr + param$objective <- logregobjattr bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") }) test_that("custom objective with multi-class works", { - data = as.matrix(iris[, -5]) - label = as.numeric(iris$Species) - 1 + data <- as.matrix(iris[, -5]) + label <- as.numeric(iris$Species) - 1 dtrain <- xgb.DMatrix(data = data, label = label) nclasses <- 3 @@ -72,6 +72,6 @@ test_that("custom objective with multi-class works", { hess <- rnorm(dim(as.matrix(preds))[1]) return (list(grad = grad, hess = hess)) } - param$objective = fake_softprob - bst <- xgb.train(param, dtrain, 1, num_class=nclasses) + param$objective <- fake_softprob + bst <- xgb.train(param, dtrain, 1, num_class = nclasses) }) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index c06358962..a261527df 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -3,29 +3,29 @@ require(Matrix) context("testing xgb.DMatrix functionality") -data(agaricus.test, package='xgboost') -test_data <- agaricus.test$data[1:100,] +data(agaricus.test, package = 'xgboost') +test_data <- agaricus.test$data[1:100, ] test_label <- agaricus.test$label[1:100] test_that("xgb.DMatrix: basic construction", { # from sparse matrix - dtest1 <- xgb.DMatrix(test_data, label=test_label) + dtest1 <- xgb.DMatrix(test_data, label = test_label) # from dense matrix - dtest2 <- xgb.DMatrix(as.matrix(test_data), label=test_label) + dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label) expect_equal(getinfo(dtest1, 'label'), getinfo(dtest2, 'label')) expect_equal(dim(dtest1), dim(dtest2)) #from dense integer matrix int_data <- as.matrix(test_data) storage.mode(int_data) <- "integer" - dtest3 <- xgb.DMatrix(int_data, label=test_label) + dtest3 <- xgb.DMatrix(int_data, label = test_label) expect_equal(dim(dtest1), dim(dtest3)) }) test_that("xgb.DMatrix: saving, loading", { # save to a local file - dtest1 <- xgb.DMatrix(test_data, label=test_label) + dtest1 <- xgb.DMatrix(test_data, label = test_label) tmp_file <- tempfile('xgb.DMatrix_') expect_true(xgb.DMatrix.save(dtest1, tmp_file)) # read from a local file @@ -35,12 +35,12 @@ test_that("xgb.DMatrix: saving, loading", { expect_equal(getinfo(dtest1, 'label'), getinfo(dtest3, 'label')) # from a libsvm text file - tmp <- c("0 1:1 2:1","1 3:1","0 1:1") + tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1") tmp_file <- 'tmp.libsvm' writeLines(tmp, tmp_file) dtest4 <- xgb.DMatrix(tmp_file, silent = TRUE) expect_equal(dim(dtest4), c(3, 4)) - expect_equal(getinfo(dtest4, 'label'), c(0,1,0)) + expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) unlink(tmp_file) }) @@ -61,7 +61,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", { expect_true(setinfo(dtest, 'weight', test_label)) expect_true(setinfo(dtest, 'base_margin', test_label)) - expect_true(setinfo(dtest, 'group', c(50,50))) + expect_true(setinfo(dtest, 'group', c(50, 50))) expect_error(setinfo(dtest, 'group', test_label)) # providing character values will give a warning @@ -72,35 +72,35 @@ test_that("xgb.DMatrix: getinfo & setinfo", { }) test_that("xgb.DMatrix: slice, dim", { - dtest <- xgb.DMatrix(test_data, label=test_label) + dtest <- xgb.DMatrix(test_data, label = test_label) expect_equal(dim(dtest), dim(test_data)) dsub1 <- slice(dtest, 1:42) expect_equal(nrow(dsub1), 42) expect_equal(ncol(dsub1), ncol(test_data)) - dsub2 <- dtest[1:42,] + dsub2 <- dtest[1:42, ] expect_equal(dim(dtest), dim(test_data)) 
expect_equal(getinfo(dsub1, 'label'), getinfo(dsub2, 'label')) }) test_that("xgb.DMatrix: slice, trailing empty rows", { - data(agaricus.train, package='xgboost') + data(agaricus.train, package = 'xgboost') train_data <- agaricus.train$data train_label <- agaricus.train$label - dtrain <- xgb.DMatrix(data=train_data, label=train_label) + dtrain <- xgb.DMatrix(data = train_data, label = train_label) slice(dtrain, 6513L) train_data[6513, ] <- 0 - dtrain <- xgb.DMatrix(data=train_data, label=train_label) + dtrain <- xgb.DMatrix(data = train_data, label = train_label) slice(dtrain, 6513L) expect_equal(nrow(dtrain), 6513) }) test_that("xgb.DMatrix: colnames", { - dtest <- xgb.DMatrix(test_data, label=test_label) + dtest <- xgb.DMatrix(test_data, label = test_label) expect_equal(colnames(dtest), colnames(test_data)) - expect_error( colnames(dtest) <- 'asdf') + expect_error(colnames(dtest) <- 'asdf') new_names <- make.names(1:ncol(test_data)) - expect_silent( colnames(dtest) <- new_names) + expect_silent(colnames(dtest) <- new_names) expect_equal(colnames(dtest), new_names) expect_silent(colnames(dtest) <- NULL) expect_null(colnames(dtest)) @@ -109,7 +109,7 @@ test_that("xgb.DMatrix: colnames", { test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", { set.seed(123) nr <- 1000 - x <- rsparsematrix(nr, 100, density=0.0005) + x <- rsparsematrix(nr, 100, density = 0.0005) # we want it very sparse, so that last rows are empty expect_lt(max(x@i), nr) dtest <- xgb.DMatrix(x) diff --git a/R-package/tests/testthat/test_gc_safety.R b/R-package/tests/testthat/test_gc_safety.R index b90f0f4ca..210674e19 100644 --- a/R-package/tests/testthat/test_gc_safety.R +++ b/R-package/tests/testthat/test_gc_safety.R @@ -3,8 +3,8 @@ require(xgboost) context("Garbage Collection Safety Check") test_that("train and prediction when gctorture is on", { - data(agaricus.train, package='xgboost') - data(agaricus.test, package='xgboost') + data(agaricus.train, package = 'xgboost') + data(agaricus.test, package = 'xgboost') train <- agaricus.train test <- agaricus.test gctorture(TRUE) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index 9b4aa73ad..1eaeee2ff 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -3,8 +3,8 @@ context('Test generalized linear models') require(xgboost) test_that("gblinear works", { - data(agaricus.train, package='xgboost') - data(agaricus.test, package='xgboost') + data(agaricus.train, package = 'xgboost') + data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) @@ -16,7 +16,7 @@ test_that("gblinear works", { ERR_UL <- 0.005 # upper limit for the test set error VERB <- 0 # chatterbox switch - param$updater = 'shotgun' + param$updater <- 'shotgun' bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') ypred <- predict(bst, dtest) expect_equal(length(getinfo(dtest, 'label')), 1611) @@ -29,7 +29,7 @@ test_that("gblinear works", { expect_equal(dim(h), c(n, ncol(dtrain) + 1)) expect_is(h, "matrix") - param$updater = 'coord_descent' + param$updater <- 'coord_descent' bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic') expect_lt(bst$evaluation_log$eval_error[n], ERR_UL) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 4a44f333c..0fbcae180 100644 --- 
a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -5,18 +5,18 @@ require(data.table) require(Matrix) require(vcd, quietly = TRUE) -float_tolerance = 5e-6 +float_tolerance <- 5e-6 # disable some tests for 32-bit environment -flag_32bit = .Machine$sizeof.pointer != 8 +flag_32bit <- .Machine$sizeof.pointer != 8 set.seed(1982) data(Arthritis) df <- data.table(Arthritis, keep.rownames = FALSE) -df[,AgeDiscret := as.factor(round(Age / 10,0))] -df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] -df[,ID := NULL] -sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) +df[, AgeDiscret := as.factor(round(Age / 10, 0))] +df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] +df[, ID := NULL] +sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) # nolint label <- df[, ifelse(Improved == "Marked", 1, 0)] # binary @@ -46,7 +46,7 @@ mbst.GLM <- xgboost(data = as.matrix(iris[, -5]), label = mlabel, verbose = 0, test_that("xgb.dump works", { if (!flag_32bit) expect_length(xgb.dump(bst.Tree), 200) - dump_file = file.path(tempdir(), 'xgb.model.dump') + dump_file <- file.path(tempdir(), 'xgb.model.dump') expect_true(xgb.dump(bst.Tree, dump_file, with_stats = TRUE)) expect_true(file.exists(dump_file)) expect_gt(file.size(dump_file), 8000) @@ -63,7 +63,7 @@ test_that("xgb.dump works for gblinear", { # also make sure that it works properly for a sparse model where some coefficients # are 0 from setting large L1 regularization: bst.GLM.sp <- xgboost(data = sparse_matrix, label = label, eta = 1, nthread = 2, nrounds = 1, - alpha=2, objective = "binary:logistic", booster = "gblinear") + alpha = 2, objective = "binary:logistic", booster = "gblinear") d.sp <- xgb.dump(bst.GLM.sp) expect_length(d.sp, 14) expect_gt(sum(d.sp == "0"), 0) @@ -110,9 +110,9 @@ test_that("predict feature contributions works", { pred <- predict(bst.GLM, sparse_matrix, outputmargin = TRUE) expect_lt(max(abs(rowSums(pred_contr) - pred)), 1e-5) # manual calculation of linear terms - coefs <- xgb.dump(bst.GLM)[-c(1,2,4)] %>% as.numeric + coefs <- xgb.dump(bst.GLM)[-c(1, 2, 4)] %>% as.numeric coefs <- c(coefs[-1], coefs[1]) # intercept must be the last - pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN="*") + pred_contr_manual <- sweep(cbind(sparse_matrix, 1), 2, coefs, FUN = "*") expect_equal(as.numeric(pred_contr), as.numeric(pred_contr_manual), tolerance = float_tolerance) @@ -130,13 +130,13 @@ test_that("predict feature contributions works", { pred <- predict(mbst.GLM, as.matrix(iris[, -5]), outputmargin = TRUE, reshape = TRUE) pred_contr <- predict(mbst.GLM, as.matrix(iris[, -5]), predcontrib = TRUE) expect_length(pred_contr, 3) - coefs_all <- xgb.dump(mbst.GLM)[-c(1,2,6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE) + coefs_all <- xgb.dump(mbst.GLM)[-c(1, 2, 6)] %>% as.numeric %>% matrix(ncol = 3, byrow = TRUE) for (g in seq_along(pred_contr)) { expect_equal(colnames(pred_contr[[g]]), c(colnames(iris[, -5]), "BIAS")) expect_lt(max(abs(rowSums(pred_contr[[g]]) - pred[, g])), float_tolerance) # manual calculation of linear terms coefs <- c(coefs_all[-1, g], coefs_all[1, g]) # intercept needs to be the last - pred_contr_manual <- sweep(as.matrix(cbind(iris[,-5], 1)), 2, coefs, FUN="*") + pred_contr_manual <- sweep(as.matrix(cbind(iris[, -5], 1)), 2, coefs, FUN = "*") expect_equal(as.numeric(pred_contr[[g]]), as.numeric(pred_contr_manual), tolerance = float_tolerance) } @@ -147,8 +147,8 @@ test_that("SHAPs sum to predictions, with or without DART", { x1 
= rnorm(100), x2 = rnorm(100), x3 = rnorm(100)) - y <- d[,"x1"] + d[,"x2"]^2 + - ifelse(d[,"x3"] > .5, d[,"x3"]^2, 2^d[,"x3"]) + + y <- d[, "x1"] + d[, "x2"]^2 + + ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) + rnorm(100) nrounds <- 30 @@ -170,19 +170,19 @@ test_that("SHAPs sum to predictions, with or without DART", { pred <- pr() shap <- pr(predcontrib = TRUE) shapi <- pr(predinteraction = TRUE) - tol = 1e-5 + tol <- 1e-5 expect_equal(rowSums(shap), pred, tol = tol) expect_equal(apply(shapi, 1, sum), pred, tol = tol) for (i in 1 : nrow(d)) for (f in list(rowSums, colSums)) - expect_equal(f(shapi[i,,]), shap[i,], tol = tol) + expect_equal(f(shapi[i, , ]), shap[i, ], tol = tol) } }) test_that("xgb-attribute functionality", { val <- "my attribute value" - list.val <- list(my_attr=val, a=123, b='ok') + list.val <- list(my_attr = val, a = 123, b = 'ok') list.ch <- list.val[order(names(list.val))] list.ch <- lapply(list.ch, as.character) # note: iter is 0-index in xgb attributes @@ -208,9 +208,9 @@ test_that("xgb-attribute functionality", { xgb.attr(bst, "my_attr") <- NULL expect_null(xgb.attr(bst, "my_attr")) expect_equal(xgb.attributes(bst), list.ch[c("a", "b", "niter")]) - xgb.attributes(bst) <- list(a=NULL, b=NULL) + xgb.attributes(bst) <- list(a = NULL, b = NULL) expect_equal(xgb.attributes(bst), list.default) - xgb.attributes(bst) <- list(niter=NULL) + xgb.attributes(bst) <- list(niter = NULL) expect_null(xgb.attributes(bst)) }) @@ -268,7 +268,7 @@ test_that("xgb.model.dt.tree works with and without feature names", { bst.Tree.x$feature_names <- NULL dt.tree.x <- xgb.model.dt.tree(model = bst.Tree.x) expect_output(str(dt.tree.x), 'Feature.*\\"3\\"') - expect_equal(dt.tree[, -4, with=FALSE], dt.tree.x[, -4, with=FALSE]) + expect_equal(dt.tree[, -4, with = FALSE], dt.tree.x[, -4, with = FALSE]) # using integer node ID instead of character dt.tree.int <- xgb.model.dt.tree(model = bst.Tree, use_int_id = TRUE) @@ -295,7 +295,7 @@ test_that("xgb.importance works with and without feature names", { bst.Tree.x <- bst.Tree bst.Tree.x$feature_names <- NULL importance.Tree.x <- xgb.importance(model = bst.Tree) - expect_equal(importance.Tree[, -1, with=FALSE], importance.Tree.x[, -1, with=FALSE], + expect_equal(importance.Tree[, -1, with = FALSE], importance.Tree.x[, -1, with = FALSE], tolerance = float_tolerance) imp2plot <- xgb.plot.importance(importance_matrix = importance.Tree) @@ -305,7 +305,7 @@ test_that("xgb.importance works with and without feature names", { # for multiclass imp.Tree <- xgb.importance(model = mbst.Tree) expect_equal(dim(imp.Tree), c(4, 4)) - xgb.importance(model = mbst.Tree, trees = seq(from=0, by=nclass, length.out=nrounds)) + xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds)) }) test_that("xgb.importance works with GLM model", { @@ -320,7 +320,7 @@ test_that("xgb.importance works with GLM model", { # for multiclass imp.GLM <- xgb.importance(model = mbst.GLM) expect_equal(dim(imp.GLM), c(12, 3)) - expect_equal(imp.GLM$Class, rep(0:2, each=4)) + expect_equal(imp.GLM$Class, rep(0:2, each = 4)) }) test_that("xgb.model.dt.tree and xgb.importance work with a single split model", { diff --git a/R-package/tests/testthat/test_interaction_constraints.R b/R-package/tests/testthat/test_interaction_constraints.R index 9a3ddf442..7f6a8b09b 100644 --- a/R-package/tests/testthat/test_interaction_constraints.R +++ b/R-package/tests/testthat/test_interaction_constraints.R @@ -5,20 +5,20 @@ context("interaction constraints") set.seed(1024) x1 <- 
rnorm(1000, 1) x2 <- rnorm(1000, 1) -x3 <- sample(c(1,2,3), size=1000, replace=TRUE) -y <- x1 + x2 + x3 + x1*x2*x3 + rnorm(1000, 0.001) + 3*sin(x1) -train <- matrix(c(x1,x2,x3), ncol = 3) +x3 <- sample(c(1, 2, 3), size = 1000, replace = TRUE) +y <- x1 + x2 + x3 + x1 * x2 * x3 + rnorm(1000, 0.001) + 3 * sin(x1) +train <- matrix(c(x1, x2, x3), ncol = 3) test_that("interaction constraints for regression", { # Fit a model that only allows interaction between x1 and x2 bst <- xgboost(data = train, label = y, max_depth = 3, eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, - interaction_constraints = list(c(0,1))) + interaction_constraints = list(c(0, 1))) # Set all observations to have the same x3 values then increment # by the same amount - preds <- lapply(c(1,2,3), function(x){ - tmat <- matrix(c(x1,x2,rep(x,1000)), ncol=3) + preds <- lapply(c(1, 2, 3), function(x){ + tmat <- matrix(c(x1, x2, rep(x, 1000)), ncol = 3) return(predict(bst, tmat)) }) @@ -40,16 +40,16 @@ test_that("interaction constraints scientific representation", { rows <- 10 ## When number exceeds 1e5, R paste function uses scientific representation. ## See: https://github.com/dmlc/xgboost/issues/5179 - cols <- 1e5+10 + cols <- 1e5 + 10 - d <- matrix(rexp(rows, rate=.1), nrow=rows, ncol=cols) + d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols) y <- rnorm(rows) - dtrain <- xgb.DMatrix(data=d, info = list(label=y)) + dtrain <- xgb.DMatrix(data = d, info = list(label = y)) inc <- list(c(seq.int(from = 0, to = cols, by = 1))) - with_inc <- xgb.train(data=dtrain, tree_method='hist', - interaction_constraints=inc, nrounds=10) - without_inc <- xgb.train(data=dtrain, tree_method='hist', nrounds=10) + with_inc <- xgb.train(data = dtrain, tree_method = 'hist', + interaction_constraints = inc, nrounds = 10) + without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10) expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc)) }) diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R index ee35339a3..be7698ce9 100644 --- a/R-package/tests/testthat/test_interactions.R +++ b/R-package/tests/testthat/test_interactions.R @@ -9,9 +9,9 @@ test_that("predict feature interactions works", { # simulate some binary data and a linear outcome with an interaction term N <- 1000 P <- 5 - X <- matrix(rbinom(N * P, 1, 0.5), ncol=P, dimnames = list(NULL, letters[1:P])) + X <- matrix(rbinom(N * P, 1, 0.5), ncol = P, dimnames = list(NULL, letters[1:P])) # center the data (as contributions are computed WRT feature means) - X <- scale(X, scale=FALSE) + X <- scale(X, scale = FALSE) # outcome without any interactions, without any noise: f <- function(x) 2 * x[, 1] - 3 * x[, 2] @@ -23,14 +23,14 @@ test_that("predict feature interactions works", { y <- f_int(X) dm <- xgb.DMatrix(X, label = y) - param <- list(eta=0.1, max_depth=4, base_score=mean(y), lambda=0, nthread=2) + param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2) b <- xgb.train(param, dm, 100) - - pred = predict(b, dm, outputmargin=TRUE) + + pred <- predict(b, dm, outputmargin = TRUE) # SHAP contributions: - cont <- predict(b, dm, predcontrib=TRUE) - expect_equal(dim(cont), c(N, P+1)) + cont <- predict(b, dm, predcontrib = TRUE) + expect_equal(dim(cont), c(N, P + 1)) # make sure for each row they add up to marginal predictions max(abs(rowSums(cont) - pred)) %>% expect_lt(0.001) # Hand-construct the 'ground truth' feature contributions: @@ -39,43 +39,43 @@ test_that("predict feature 
interactions works", { -3. * X[, 2] + 1. * X[, 2] * X[, 3], # attribute a HALF of the interaction term to feature #2 1. * X[, 2] * X[, 3] # and another HALF of the interaction term to feature #3 ) - gt_cont <- cbind(gt_cont, matrix(0, nrow=N, ncol=P + 1 - 3)) + gt_cont <- cbind(gt_cont, matrix(0, nrow = N, ncol = P + 1 - 3)) # These should be relatively close: expect_lt(max(abs(cont - gt_cont)), 0.05) # SHAP interaction contributions: - intr <- predict(b, dm, predinteraction=TRUE) - expect_equal(dim(intr), c(N, P+1, P+1)) + intr <- predict(b, dm, predinteraction = TRUE) + expect_equal(dim(intr), c(N, P + 1, P + 1)) # check assigned colnames cn <- c(letters[1:P], "BIAS") expect_equal(dimnames(intr), list(NULL, cn, cn)) # check the symmetry - max(abs(aperm(intr, c(1,3,2)) - intr)) %>% expect_lt(0.00001) + max(abs(aperm(intr, c(1, 3, 2)) - intr)) %>% expect_lt(0.00001) # sums WRT columns must be close to feature contributions - max(abs(apply(intr, c(1,2), sum) - cont)) %>% expect_lt(0.00001) + max(abs(apply(intr, c(1, 2), sum) - cont)) %>% expect_lt(0.00001) # diagonal terms for features 3,4,5 must be close to zero Reduce(max, sapply(3:P, function(i) max(abs(intr[, i, i])))) %>% expect_lt(0.05) # BIAS must have no interactions - max(abs(intr[, 1:P, P+1])) %>% expect_lt(0.00001) + max(abs(intr[, 1:P, P + 1])) %>% expect_lt(0.00001) # interactions other than 2 x 3 must be close to zero intr23 <- intr - intr23[,2,3] <- 0 - Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i+1):(P+1)])))) %>% expect_lt(0.05) + intr23[, 2, 3] <- 0 + Reduce(max, sapply(1:P, function(i) max(abs(intr23[, i, (i + 1):(P + 1)])))) %>% expect_lt(0.05) # Construct the 'ground truth' contributions of interactions directly from the linear terms: - gt_intr <- array(0, c(N, P+1, P+1)) - gt_intr[,2,3] <- 1. * X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element - gt_intr[,3,2] <- gt_intr[, 2, 3] + gt_intr <- array(0, c(N, P + 1, P + 1)) + gt_intr[, 2, 3] <- 1. 
* X[, 2] * X[, 3] # attribute a HALF of the interaction term to each symmetric element + gt_intr[, 3, 2] <- gt_intr[, 2, 3] # merge-in the diagonal based on 'ground truth' feature contributions - intr_diag = gt_cont - apply(gt_intr, c(1,2), sum) - for(j in seq_len(P)) { - gt_intr[,j,j] = intr_diag[,j] + intr_diag <- gt_cont - apply(gt_intr, c(1, 2), sum) + for (j in seq_len(P)) { + gt_intr[, j, j] <- intr_diag[, j] } # These should be relatively close: expect_lt(max(abs(intr - gt_intr)), 0.1) @@ -116,26 +116,26 @@ test_that("SHAP contribution values are not NAN", { test_that("multiclass feature interactions work", { - dm <- xgb.DMatrix(as.matrix(iris[,-5]), label=as.numeric(iris$Species)-1) - param <- list(eta=0.1, max_depth=4, objective='multi:softprob', num_class=3) + dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1) + param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3) b <- xgb.train(param, dm, 40) - pred = predict(b, dm, outputmargin=TRUE) %>% array(c(3, 150)) %>% t + pred <- predict(b, dm, outputmargin = TRUE) %>% array(c(3, 150)) %>% t # SHAP contributions: - cont <- predict(b, dm, predcontrib=TRUE) + cont <- predict(b, dm, predcontrib = TRUE) expect_length(cont, 3) # rewrap them as a 3d array cont <- unlist(cont) %>% array(c(150, 5, 3)) # make sure for each row they add up to marginal predictions - max(abs(apply(cont, c(1,3), sum) - pred)) %>% expect_lt(0.001) + max(abs(apply(cont, c(1, 3), sum) - pred)) %>% expect_lt(0.001) # SHAP interaction contributions: - intr <- predict(b, dm, predinteraction=TRUE) + intr <- predict(b, dm, predinteraction = TRUE) expect_length(intr, 3) # rewrap them as a 4d array intr <- unlist(intr) %>% array(c(150, 5, 5, 3)) %>% aperm(c(4, 1, 2, 3)) # [grp, row, col, col] # check the symmetry - max(abs(aperm(intr, c(1,2,4,3)) - intr)) %>% expect_lt(0.00001) + max(abs(aperm(intr, c(1, 2, 4, 3)) - intr)) %>% expect_lt(0.00001) # sums WRT columns must be close to feature contributions - max(abs(apply(intr, c(1,2,3), sum) - aperm(cont, c(3,1,2)))) %>% expect_lt(0.00001) + max(abs(apply(intr, c(1, 2, 3), sum) - aperm(cont, c(3, 1, 2)))) %>% expect_lt(0.00001) }) diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R index 168eb0fc8..5d32c85d6 100644 --- a/R-package/tests/testthat/test_lint.R +++ b/R-package/tests/testthat/test_lint.R @@ -2,25 +2,25 @@ context("Code is of high quality and lint free") test_that("Code Lint", { skip_on_cran() my_linters <- list( - absolute_paths_linter=lintr::absolute_paths_linter, - assignment_linter=lintr::assignment_linter, - closed_curly_linter=lintr::closed_curly_linter, - commas_linter=lintr::commas_linter, - # commented_code_linter=lintr::commented_code_linter, - infix_spaces_linter=lintr::infix_spaces_linter, - line_length_linter=lintr::line_length_linter, - no_tab_linter=lintr::no_tab_linter, - object_usage_linter=lintr::object_usage_linter, - # snake_case_linter=lintr::snake_case_linter, - # multiple_dots_linter=lintr::multiple_dots_linter, - object_length_linter=lintr::object_length_linter, - open_curly_linter=lintr::open_curly_linter, - # single_quotes_linter=lintr::single_quotes_linter, - spaces_inside_linter=lintr::spaces_inside_linter, - spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter, - trailing_blank_lines_linter=lintr::trailing_blank_lines_linter, - trailing_whitespace_linter=lintr::trailing_whitespace_linter, - true_false=lintr::T_and_F_symbol_linter + absolute_paths_linter = 
lintr::absolute_paths_linter, + assignment_linter = lintr::assignment_linter, + closed_curly_linter = lintr::closed_curly_linter, + commas_linter = lintr::commas_linter, + # commented_code_linter = lintr::commented_code_linter, + infix_spaces_linter = lintr::infix_spaces_linter, + line_length_linter = lintr::line_length_linter, + no_tab_linter = lintr::no_tab_linter, + object_usage_linter = lintr::object_usage_linter, + # snake_case_linter = lintr::snake_case_linter, + # multiple_dots_linter = lintr::multiple_dots_linter, + object_length_linter = lintr::object_length_linter, + open_curly_linter = lintr::open_curly_linter, + # single_quotes_linter = lintr::single_quotes_linter, + spaces_inside_linter = lintr::spaces_inside_linter, + spaces_left_parentheses_linter = lintr::spaces_left_parentheses_linter, + trailing_blank_lines_linter = lintr::trailing_blank_lines_linter, + trailing_whitespace_linter = lintr::trailing_whitespace_linter, + true_false = lintr::T_and_F_symbol_linter ) - lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality + lintr::expect_lint_free(linters = my_linters) # uncomment this if you want to check code quality }) diff --git a/R-package/tests/testthat/test_monotone.R b/R-package/tests/testthat/test_monotone.R index 9991e917e..54070cd61 100644 --- a/R-package/tests/testthat/test_monotone.R +++ b/R-package/tests/testthat/test_monotone.R @@ -3,22 +3,21 @@ require(xgboost) context("monotone constraints") set.seed(1024) -x = rnorm(1000, 10) -y = -1*x + rnorm(1000, 0.001) + 3*sin(x) -train = matrix(x, ncol = 1) +x <- rnorm(1000, 10) +y <- -1 * x + rnorm(1000, 0.001) + 3 * sin(x) +train <- matrix(x, ncol = 1) test_that("monotone constraints for regression", { - bst = xgboost(data = train, label = y, max_depth = 2, - eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, - monotone_constraints = -1) - - pred = predict(bst, train) - - ind = order(train[,1]) - pred.ord = pred[ind] - expect_true({ - !any(diff(pred.ord) > 0) - }, "Monotone Contraint Satisfied") - + bst <- xgboost(data = train, label = y, max_depth = 2, + eta = 0.1, nthread = 2, nrounds = 100, verbose = 0, + monotone_constraints = -1) + + pred <- predict(bst, train) + + ind <- order(train[, 1]) + pred.ord <- pred[ind] + expect_true({ + !any(diff(pred.ord) > 0) + }, "Monotone Contraint Satisfied") }) diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R index 1a0dcb39f..86413174b 100644 --- a/R-package/tests/testthat/test_parameter_exposure.R +++ b/R-package/tests/testthat/test_parameter_exposure.R @@ -2,8 +2,8 @@ context('Test model params and call are exposed to R') require(xgboost) -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') +data(agaricus.train, package = 'xgboost') +data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R index a48f2fc47..a731dc23a 100644 --- a/R-package/tests/testthat/test_poisson_regression.R +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -5,10 +5,10 @@ set.seed(1994) test_that("poisson regression works", { data(mtcars) - bst <- xgboost(data = as.matrix(mtcars[,-11]), label = mtcars[,11], - objective = 'count:poisson', nrounds=10, verbose=0) + bst <- xgboost(data = as.matrix(mtcars[, -11]), label = 
mtcars[, 11], + objective = 'count:poisson', nrounds = 10, verbose = 0) expect_equal(class(bst), "xgb.Booster") pred <- predict(bst, as.matrix(mtcars[, -11])) expect_equal(length(pred), 32) - expect_lt(sqrt(mean( (pred - mtcars[,11])^2 )), 1.2) + expect_lt(sqrt(mean((pred - mtcars[, 11])^2)), 1.2) }) diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index fa48c9144..541fdf68e 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -9,23 +9,23 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # Disable flaky tests for 32-bit Windows. # See https://github.com/dmlc/xgboost/issues/3720 -win32_flag = .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8 +win32_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8 test_that("updating the model works", { - watchlist = list(train = dtrain, test = dtest) + watchlist <- list(train = dtrain, test = dtest) # no-subsampling p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2) set.seed(11) bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0) tr1 <- xgb.model.dt.tree(model = bst1) - + # with subsampling p2 <- modifyList(p1, list(subsample = 0.1)) set.seed(11) bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0) tr2 <- xgb.model.dt.tree(model = bst2) - + # the same no-subsampling boosting with an extra 'refresh' updater: p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) set.seed(11) @@ -57,7 +57,7 @@ test_that("updating the model works", { # all should be the same when no subsampling expect_equal(bst1$evaluation_log, bst1u$evaluation_log) expect_equal(tr1, tr1u, tolerance = 0.00001, check.attributes = FALSE) - + # process type 'update' for model with subsampling, refreshing only the tree stats from training data: p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2) @@ -72,7 +72,7 @@ test_that("updating the model works", { if (!win32_flag) { expect_equal(tr2r, tr2u, tolerance = 0.00001, check.attributes = FALSE) } - + # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data: p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) @@ -93,12 +93,12 @@ test_that("updating works for multiclass & multitree", { set.seed(121) bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0) tr0 <- xgb.model.dt.tree(model = bst0) - + # run update process for an original model with subsampling - p0u <- modifyList(p0, list(process_type='update', updater='refresh', refresh_leaf=FALSE)) + p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) bst0u <- xgb.train(p0u, dtr, nrounds = bst0$niter, watchlist, xgb_model = bst0, verbose = 0) tr0u <- xgb.model.dt.tree(model = bst0u) - + # should be the same evaluation but different gains and larger cover expect_equal(bst0$evaluation_log, bst0u$evaluation_log) expect_equal(tr0[Feature == 'Leaf']$Quality, tr0u[Feature == 'Leaf']$Quality)
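# A minimal sketch (not part of the patch above) of the refresh workflow exercised by
# test_update.R: train a booster, then re-run the same number of rounds with
# process_type = 'update' and updater = 'refresh' so the node statistics are recomputed
# from the supplied data while the tree structure stays fixed. It uses only the public API
# already shown in the diff (xgb.DMatrix, xgb.train with xgb_model, xgb.model.dt.tree);
# the object names and parameter values are illustrative, not taken from the test file.
library(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
params <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2)
bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain), verbose = 0)
# rerun with the 'refresh' updater against the already trained model;
# refresh_leaf = FALSE recomputes only the node stats (Cover/Gain), leaving leaf values as-is
params_refresh <- modifyList(params, list(process_type = 'update',
                                          updater = 'refresh',
                                          refresh_leaf = FALSE))
bst_refreshed <- xgb.train(params_refresh, dtrain, nrounds = bst$niter,
                           watchlist = list(train = dtrain), verbose = 0,
                           xgb_model = bst)
# same tree structure, refreshed statistics:
tr <- xgb.model.dt.tree(model = bst_refreshed)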