[R] Don't cap global number of threads for serialization (#10028)

This commit is contained in:
david-cortes 2024-02-20 04:13:00 +01:00 committed by GitHub
parent edf501d227
commit 6e3c899ba7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 37 additions and 20 deletions

View File

@ -56,7 +56,8 @@ Suggests:
testthat, testthat,
igraph (>= 1.0.1), igraph (>= 1.0.1),
float, float,
titanic titanic,
RhpcBLASctl
Depends: Depends:
R (>= 4.3.0) R (>= 4.3.0)
Imports: Imports:

View File

@ -6,6 +6,7 @@
#' @param fname the name of the file to write. #' @param fname the name of the file to write.
#' #'
#' @examples #' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
#' fname <- file.path(tempdir(), "xgb.DMatrix.data") #' fname <- file.path(tempdir(), "xgb.DMatrix.data")

View File

@ -4,7 +4,14 @@
#' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current #' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current
#' values of all global-scope parameters (listed in #' values of all global-scope parameters (listed in
#' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}). #' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
#' @details
#' Note that serialization-related functions might use a globally-configured number of threads,
#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
#' accept an `nthread` parameter, but some methods like `readRDS` might get executed before such
#' a parameter can be supplied.
#' #'
#' The number of OMP threads can in turn be configured for example through an environment variable
#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`.
#' @rdname xgbConfig #' @rdname xgbConfig
#' @title Set and get global configuration #' @title Set and get global configuration
#' @name xgb.set.config, xgb.get.config #' @name xgb.set.config, xgb.get.config

View File

@ -24,6 +24,7 @@
#' as a \code{character} vector. Otherwise it will return \code{TRUE}. #' as a \code{character} vector. Otherwise it will return \code{TRUE}.
#' #'
#' @examples #' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' train <- agaricus.train #' train <- agaricus.train

View File

@ -20,6 +20,7 @@
#' \code{\link{xgb.save}} #' \code{\link{xgb.save}}
#' #'
#' @examples #' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' #'

View File

@ -35,6 +35,7 @@
#' \code{\link{xgb.load}} #' \code{\link{xgb.load}}
#' #'
#' @examples #' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' #'

View File

@ -12,6 +12,7 @@
#' } #' }
#' #'
#' @examples #' @examples
#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)}
#' data(agaricus.train, package='xgboost') #' data(agaricus.train, package='xgboost')
#' data(agaricus.test, package='xgboost') #' data(agaricus.test, package='xgboost')
#' #'

View File

@ -55,6 +55,8 @@ print(paste("test-error=", err))
# save model to binary local file # save model to binary local file
xgb.save(bst, "xgboost.model") xgb.save(bst, "xgboost.model")
# load binary model to R # load binary model to R
# 'xgb.load' doesn't take an 'nthread' argument, but the number of OpenMP threads it uses can be set like this:
RhpcBLASctl::omp_set_num_threads(1)
bst2 <- xgb.load("xgboost.model") bst2 <- xgb.load("xgboost.model")
pred2 <- predict(bst2, test$data) pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred # pred2 should be identical to pred

View File

@ -15,6 +15,7 @@ xgb.DMatrix.save(dmatrix, fname)
Save xgb.DMatrix object to binary file Save xgb.DMatrix object to binary file
} }
\examples{ \examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
fname <- file.path(tempdir(), "xgb.DMatrix.data") fname <- file.path(tempdir(), "xgb.DMatrix.data")

View File

@ -44,6 +44,7 @@ as a \code{character} vector. Otherwise it will return \code{TRUE}.
Dump an xgboost model in text format. Dump an xgboost model in text format.
} }
\examples{ \examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')
train <- agaricus.train train <- agaricus.train

View File

@ -25,6 +25,7 @@ Note: a model saved as an R-object, has to be loaded using corresponding R-metho
not \code{xgb.load}. not \code{xgb.load}.
} }
\examples{ \examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')

View File

@ -41,6 +41,7 @@ how to persist models in a future-proof way, i.e. to make the model accessible i
releases of XGBoost. releases of XGBoost.
} }
\examples{ \examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')

View File

@ -21,6 +21,7 @@ xgb.save.raw(model, raw_format = "ubj")
Save xgboost model from xgboost or xgb.train Save xgboost model from xgboost or xgb.train
} }
\examples{ \examples{
\dontshow{RhpcBLASctl::omp_set_num_threads(1)}
data(agaricus.train, package='xgboost') data(agaricus.train, package='xgboost')
data(agaricus.test, package='xgboost') data(agaricus.test, package='xgboost')

View File

@ -25,6 +25,15 @@ values of one or more global-scope parameters. Use \code{xgb.get.config} to fetc
values of all global-scope parameters (listed in values of all global-scope parameters (listed in
\url{https://xgboost.readthedocs.io/en/stable/parameter.html}). \url{https://xgboost.readthedocs.io/en/stable/parameter.html}).
} }
\details{
Note that serialization-related functions might use a globally-configured number of threads,
which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods
accept an \code{nthread} parameter, but some methods like \code{readRDS} might get executed before such
a parameter can be supplied.
The number of OMP threads can in turn be configured for example through an environment variable
\code{OMP_NUM_THREADS} (needs to be set before R is started), or through \code{RhpcBLASctl::omp_set_num_threads}.
}
\examples{ \examples{
# Set verbosity level to silent (0) # Set verbosity level to silent (0)
xgb.set.config(verbosity = 0) xgb.set.config(verbosity = 0)

View File

@ -20,6 +20,7 @@ pkgs <- c(
"igraph", "igraph",
"float", "float",
"titanic", "titanic",
"RhpcBLASctl",
## imports ## imports
"Matrix", "Matrix",
"methods", "methods",

View File

@ -2,3 +2,4 @@ library(testthat)
library(xgboost) library(xgboost)
test_check("xgboost", reporter = ProgressReporter) test_check("xgboost", reporter = ProgressReporter)
RhpcBLASctl::omp_set_num_threads(1)

View File

@ -496,6 +496,9 @@ An interesting test to see how identical our saved model is to the original one
```{r loadModel, message=F, warning=F} ```{r loadModel, message=F, warning=F}
# load binary model to R # load binary model to R
# Note that the number of threads used by 'xgb.load' is taken from the global
# OpenMP configuration, which can be modified like this:
RhpcBLASctl::omp_set_num_threads(1)
bst2 <- xgb.load(fname) bst2 <- xgb.load(fname)
xgb.parameters(bst2) <- list(nthread = 2) xgb.parameters(bst2) <- list(nthread = 2)
pred2 <- predict(bst2, test$data) pred2 <- predict(bst2, test$data)

View File

@ -106,30 +106,13 @@ void GBTreeModel::Load(dmlc::Stream* fi) {
Validate(*this); Validate(*this);
} }
namespace {
// Pick the number of threads handed to the parallel loops that save and load trees.
//
// `ctx` is verified with CHECK before it is dereferenced. The result is `ctx->Threads()`,
// except that it is capped at 2 when XGBOOST_STRICT_R_MODE is defined as 1.
std::int32_t IOThreads(Context const* ctx) {
CHECK(ctx);
std::int32_t n_threads = ctx->Threads();
// CRAN checks for number of threads used by examples, but we might not have the right
// number of threads when serializing/unserializing models as nthread is a booster
// parameter, which is only effective after booster initialization.
//
// The threshold ratio of CPU time to user time for R is 2.5, we set the number of
// threads to 2.
#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1
// Only strict R builds apply the cap; other builds return the context value unchanged.
n_threads = std::min(2, n_threads);
#endif
return n_threads;
}
} // namespace
void GBTreeModel::SaveModel(Json* p_out) const { void GBTreeModel::SaveModel(Json* p_out) const {
auto& out = *p_out; auto& out = *p_out;
CHECK_EQ(param.num_trees, static_cast<int>(trees.size())); CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
out["gbtree_model_param"] = ToJson(param); out["gbtree_model_param"] = ToJson(param);
std::vector<Json> trees_json(trees.size()); std::vector<Json> trees_json(trees.size());
common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) { common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) {
auto const& tree = trees[t]; auto const& tree = trees[t];
Json jtree{Object{}}; Json jtree{Object{}};
tree->SaveModel(&jtree); tree->SaveModel(&jtree);
@ -167,7 +150,7 @@ void GBTreeModel::LoadModel(Json const& in) {
CHECK_EQ(tree_info_json.size(), param.num_trees); CHECK_EQ(tree_info_json.size(), param.num_trees);
tree_info.resize(param.num_trees); tree_info.resize(param.num_trees);
common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) { common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) {
auto tree_id = get<Integer const>(trees_json[t]["id"]); auto tree_id = get<Integer const>(trees_json[t]["id"]);
trees.at(tree_id).reset(new RegTree{}); trees.at(tree_id).reset(new RegTree{});
trees[tree_id]->LoadModel(trees_json[t]); trees[tree_id]->LoadModel(trees_json[t]);