[R] Use new interface for creating DMatrix from CSR. (#8455)

* [R] Use new interface for creating DMatrix from CSR. - CSC is still using the old API. The old API is not aware of the `nthread` parameter, which makes DMatrix use all available threads during construction and during transformations like `SparsePage` -> `CSCPage`.
2022-11-23 21:36:43 +08:00 · 2022-11-23 21:36:43 +08:00 · 5f1a6fca0d
commit 5f1a6fca0d
parent 58d211545f
6 changed files with 68 additions and 33 deletions
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@ -66,5 +66,5 @@ Imports:
    methods,
    data.table (>= 1.9.6),
    jsonlite (>= 1.0),
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.2
 SystemRequirements: GNU make, C++14
--- a/R-package/R/callbacks.R
+++ b/R-package/R/callbacks.R
@ -592,12 +592,12 @@ cb.cv.predict <- function(save_models = FALSE) {
 #'
 #' #### Multiclass classification:
 #' #
-#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
+#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
 #' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-#'               lambda = 0.0003, alpha = 0.0003, nthread = 2)
+#'               lambda = 0.0003, alpha = 0.0003, nthread = 1)
 #' # For the default linear updater 'shotgun' it sometimes is helpful
 #' # to use smaller eta to reduce instability
-#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
+#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
 #'                  callbacks = list(cb.gblinear.history()))
 #' # Will plot the coefficient paths separately for each class:
 #' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
--- a/R-package/man/cb.gblinear.history.Rd
+++ b/R-package/man/cb.gblinear.history.Rd
@ -72,12 +72,12 @@ matplot(xgb.gblinear.history(bst)[[3]], type = 'l')

 #### Multiclass classification:
 #
-dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
+dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
 param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
-              lambda = 0.0003, alpha = 0.0003, nthread = 2)
+              lambda = 0.0003, alpha = 0.0003, nthread = 1)
 # For the default linear updater 'shotgun' it sometimes is helpful
 # to use smaller eta to reduce instability
-bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
+bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
                 callbacks = list(cb.gblinear.history()))
 # Will plot the coefficient paths separately for each class:
 matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@ -164,33 +164,68 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data,
  return ret;
 }

-XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data,
-                                      SEXP num_col, SEXP n_threads) {
+XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col,
+                                      SEXP n_threads) {
  SEXP ret;
  R_API_BEGIN();
  const int *p_indptr = INTEGER(indptr);
  const int *p_indices = INTEGER(indices);
  const double *p_data = REAL(data);
-  size_t nindptr = static_cast<size_t>(length(indptr));
-  size_t ndata = static_cast<size_t>(length(data));
-  size_t ncol = static_cast<size_t>(INTEGER(num_col)[0]);
-  std::vector<size_t> row_ptr_(nindptr);
-  std::vector<unsigned> indices_(ndata);
-  std::vector<float> data_(ndata);

-  for (size_t i = 0; i < nindptr; ++i) {
-    row_ptr_[i] = static_cast<size_t>(p_indptr[i]);
+  auto nindptr = static_cast<std::size_t>(length(indptr));
+  auto ndata = static_cast<std::size_t>(length(data));
+  auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);
+  std::int32_t threads = asInteger(n_threads);
+
+  using xgboost::Array;
+  using xgboost::Integer;
+  using xgboost::Json;
+  using xgboost::Object;
+  using xgboost::String;
+  // Construct array interfaces
+  Json jindptr{Object{}};
+  Json jindices{Object{}};
+  Json jdata{Object{}};
+  jindptr["data"] =
+      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indptr)}, Json{true}}};
+  jindptr["shape"] = std::vector<Json>{Json{nindptr}};
+  jindptr["version"] = Integer{3};
+
+  jindices["data"] =
+      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indices)}, Json{true}}};
+  jindices["shape"] = std::vector<Json>{Json{ndata}};
+  jindices["version"] = Integer{3};
+
+  jdata["data"] =
+      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_data)}, Json{true}}};
+  jdata["shape"] = std::vector<Json>{Json{ndata}};
+  jdata["version"] = Integer{3};
+
+  if (DMLC_LITTLE_ENDIAN) {
+    jindptr["typestr"] = String{"<i4"};
+    jindices["typestr"] = String{"<i4"};
+    jdata["typestr"] = String{"<f8"};  // p_data comes from REAL(): 8-byte floats, not ints
+  } else {
+    jindptr["typestr"] = String{">i4"};
+    jindices["typestr"] = String{">i4"};
+    jdata["typestr"] = String{">f8"};  // p_data comes from REAL(): 8-byte floats, not ints
  }
-  int32_t threads = xgboost::common::OmpGetNumThreads(asInteger(n_threads));
-  xgboost::common::ParallelFor(ndata, threads, [&](xgboost::omp_ulong i) {
-    indices_[i] = static_cast<unsigned>(p_indices[i]);
-    data_[i] = static_cast<float>(p_data[i]);
-  });
+  std::string indptr, indices, data;
+  Json::Dump(jindptr, &indptr);
+  Json::Dump(jindices, &indices);
+  Json::Dump(jdata, &data);
+
  DMatrixHandle handle;
-  CHECK_CALL(XGDMatrixCreateFromCSREx(BeginPtr(row_ptr_), BeginPtr(indices_),
-                                      BeginPtr(data_), nindptr, ndata,
-                                      ncol, &handle));
+  Json jconfig{Object{}};
+  // Construct configuration
+  jconfig["nthread"] = Integer{threads};
+  jconfig["missing"] = xgboost::Number{std::numeric_limits<float>::quiet_NaN()};
+  std::string config;
+  Json::Dump(jconfig, &config);
+  CHECK_CALL(XGDMatrixCreateFromCSR(indptr.c_str(), indices.c_str(), data.c_str(), ncol,
+                                    config.c_str(), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
+
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -397,17 +397,14 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
  API_END();
 }

-XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
-                                   char const *indices, char const *data,
-                                   xgboost::bst_ulong ncol,
-                                   char const* c_json_config,
+XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char const *data,
+                                   xgboost::bst_ulong ncol, char const *c_json_config,
                                   DMatrixHandle *out) {
  API_BEGIN();
  xgboost_CHECK_C_ARG_PTR(indptr);
  xgboost_CHECK_C_ARG_PTR(indices);
  xgboost_CHECK_C_ARG_PTR(data);
-  data::CSRArrayAdapter adapter(StringView{indptr}, StringView{indices},
-                                StringView{data}, ncol);
+  data::CSRArrayAdapter adapter(StringView{indptr}, StringView{indices}, StringView{data}, ncol);
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
--- a/tests/ci_build/test_r_package.py
+++ b/tests/ci_build/test_r_package.py
@ -165,7 +165,10 @@ def check_rmarkdown() -> None:
    subprocess.check_call([rscript, "-e", "devtools::document()"], env=env)
    output = subprocess.run(["git", "diff", "--name-only"], capture_output=True)
    if len(output.stdout.decode("utf-8").strip()) != 0:
-        raise ValueError("Please run `devtools::document()`.")
+        output = subprocess.run(["git", "diff"], capture_output=True)
+        raise ValueError(
+            "Please run `devtools::document()`. Diff:\n", output.stdout.decode("utf-8")
+        )


@cd(r_package)