[R] Use new interface for creating DMatrix from CSR. (#8455)
* [R] Use new interface for creating DMatrix from CSR. - CSC is still using the old API. The old API is not aware of the `nthread` parameter, which makes DMatrix use all available threads during construction and during transformations like `SparsePage` -> `CSCPage`.
This commit is contained in:
parent
58d211545f
commit
5f1a6fca0d
@ -66,5 +66,5 @@ Imports:
|
|||||||
methods,
|
methods,
|
||||||
data.table (>= 1.9.6),
|
data.table (>= 1.9.6),
|
||||||
jsonlite (>= 1.0),
|
jsonlite (>= 1.0),
|
||||||
RoxygenNote: 7.2.1
|
RoxygenNote: 7.2.2
|
||||||
SystemRequirements: GNU make, C++14
|
SystemRequirements: GNU make, C++14
|
||||||
|
|||||||
@ -592,12 +592,12 @@ cb.cv.predict <- function(save_models = FALSE) {
|
|||||||
#'
|
#'
|
||||||
#' #### Multiclass classification:
|
#' #### Multiclass classification:
|
||||||
#' #
|
#' #
|
||||||
#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
|
#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
|
||||||
#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
|
#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
|
||||||
#' lambda = 0.0003, alpha = 0.0003, nthread = 2)
|
#' lambda = 0.0003, alpha = 0.0003, nthread = 1)
|
||||||
#' # For the default linear updater 'shotgun' it sometimes is helpful
|
#' # For the default linear updater 'shotgun' it sometimes is helpful
|
||||||
#' # to use smaller eta to reduce instability
|
#' # to use smaller eta to reduce instability
|
||||||
#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
|
#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
|
||||||
#' callbacks = list(cb.gblinear.history()))
|
#' callbacks = list(cb.gblinear.history()))
|
||||||
#' # Will plot the coefficient paths separately for each class:
|
#' # Will plot the coefficient paths separately for each class:
|
||||||
#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
|
#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
|
||||||
|
|||||||
@ -72,12 +72,12 @@ matplot(xgb.gblinear.history(bst)[[3]], type = 'l')
|
|||||||
|
|
||||||
#### Multiclass classification:
|
#### Multiclass classification:
|
||||||
#
|
#
|
||||||
dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 2)
|
dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1)
|
||||||
param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
|
param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3,
|
||||||
lambda = 0.0003, alpha = 0.0003, nthread = 2)
|
lambda = 0.0003, alpha = 0.0003, nthread = 1)
|
||||||
# For the default linear updater 'shotgun' it sometimes is helpful
|
# For the default linear updater 'shotgun' it sometimes is helpful
|
||||||
# to use smaller eta to reduce instability
|
# to use smaller eta to reduce instability
|
||||||
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 70, eta = 0.5,
|
bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5,
|
||||||
callbacks = list(cb.gblinear.history()))
|
callbacks = list(cb.gblinear.history()))
|
||||||
# Will plot the coefficient paths separately for each class:
|
# Will plot the coefficient paths separately for each class:
|
||||||
matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
|
matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l')
|
||||||
|
|||||||
@ -164,33 +164,68 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data,
|
XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col,
|
||||||
SEXP num_col, SEXP n_threads) {
|
SEXP n_threads) {
|
||||||
SEXP ret;
|
SEXP ret;
|
||||||
R_API_BEGIN();
|
R_API_BEGIN();
|
||||||
const int *p_indptr = INTEGER(indptr);
|
const int *p_indptr = INTEGER(indptr);
|
||||||
const int *p_indices = INTEGER(indices);
|
const int *p_indices = INTEGER(indices);
|
||||||
const double *p_data = REAL(data);
|
const double *p_data = REAL(data);
|
||||||
size_t nindptr = static_cast<size_t>(length(indptr));
|
|
||||||
size_t ndata = static_cast<size_t>(length(data));
|
|
||||||
size_t ncol = static_cast<size_t>(INTEGER(num_col)[0]);
|
|
||||||
std::vector<size_t> row_ptr_(nindptr);
|
|
||||||
std::vector<unsigned> indices_(ndata);
|
|
||||||
std::vector<float> data_(ndata);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < nindptr; ++i) {
|
auto nindptr = static_cast<std::size_t>(length(indptr));
|
||||||
row_ptr_[i] = static_cast<size_t>(p_indptr[i]);
|
auto ndata = static_cast<std::size_t>(length(data));
|
||||||
|
auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);
|
||||||
|
std::int32_t threads = asInteger(n_threads);
|
||||||
|
|
||||||
|
using xgboost::Array;
|
||||||
|
using xgboost::Integer;
|
||||||
|
using xgboost::Json;
|
||||||
|
using xgboost::Object;
|
||||||
|
using xgboost::String;
|
||||||
|
// Construct array interfaces
|
||||||
|
Json jindptr{Object{}};
|
||||||
|
Json jindices{Object{}};
|
||||||
|
Json jdata{Object{}};
|
||||||
|
jindptr["data"] =
|
||||||
|
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indptr)}, Json{true}}};
|
||||||
|
jindptr["shape"] = std::vector<Json>{Json{nindptr}};
|
||||||
|
jindptr["version"] = Integer{3};
|
||||||
|
|
||||||
|
jindices["data"] =
|
||||||
|
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indices)}, Json{true}}};
|
||||||
|
jindices["shape"] = std::vector<Json>{Json{ndata}};
|
||||||
|
jindices["version"] = Integer{3};
|
||||||
|
|
||||||
|
jdata["data"] =
|
||||||
|
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_data)}, Json{true}}};
|
||||||
|
jdata["shape"] = std::vector<Json>{Json{ndata}};
|
||||||
|
jdata["version"] = Integer{3};
|
||||||
|
|
||||||
|
if (DMLC_LITTLE_ENDIAN) {
|
||||||
|
jindptr["typestr"] = String{"<i4"};
|
||||||
|
jindices["typestr"] = String{"<i4"};
|
||||||
|
jdata["typestr"] = String{"<i8"};
|
||||||
|
} else {
|
||||||
|
jindptr["typestr"] = String{">i4"};
|
||||||
|
jindices["typestr"] = String{">i4"};
|
||||||
|
jdata["typestr"] = String{">i8"};
|
||||||
}
|
}
|
||||||
int32_t threads = xgboost::common::OmpGetNumThreads(asInteger(n_threads));
|
std::string indptr, indices, data;
|
||||||
xgboost::common::ParallelFor(ndata, threads, [&](xgboost::omp_ulong i) {
|
Json::Dump(jindptr, &indptr);
|
||||||
indices_[i] = static_cast<unsigned>(p_indices[i]);
|
Json::Dump(jindices, &indices);
|
||||||
data_[i] = static_cast<float>(p_data[i]);
|
Json::Dump(jdata, &data);
|
||||||
});
|
|
||||||
DMatrixHandle handle;
|
DMatrixHandle handle;
|
||||||
CHECK_CALL(XGDMatrixCreateFromCSREx(BeginPtr(row_ptr_), BeginPtr(indices_),
|
Json jconfig{Object{}};
|
||||||
BeginPtr(data_), nindptr, ndata,
|
// Construct configuration
|
||||||
ncol, &handle));
|
jconfig["nthread"] = Integer{threads};
|
||||||
|
jconfig["missing"] = xgboost::Number{std::numeric_limits<float>::quiet_NaN()};
|
||||||
|
std::string config;
|
||||||
|
Json::Dump(jconfig, &config);
|
||||||
|
CHECK_CALL(XGDMatrixCreateFromCSR(indptr.c_str(), indices.c_str(), data.c_str(), ncol,
|
||||||
|
config.c_str(), &handle));
|
||||||
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
|
||||||
|
|
||||||
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
|
||||||
R_API_END();
|
R_API_END();
|
||||||
UNPROTECT(1);
|
UNPROTECT(1);
|
||||||
|
|||||||
@ -397,17 +397,14 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
|
|||||||
API_END();
|
API_END();
|
||||||
}
|
}
|
||||||
|
|
||||||
XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
|
XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char const *data,
|
||||||
char const *indices, char const *data,
|
xgboost::bst_ulong ncol, char const *c_json_config,
|
||||||
xgboost::bst_ulong ncol,
|
|
||||||
char const* c_json_config,
|
|
||||||
DMatrixHandle *out) {
|
DMatrixHandle *out) {
|
||||||
API_BEGIN();
|
API_BEGIN();
|
||||||
xgboost_CHECK_C_ARG_PTR(indptr);
|
xgboost_CHECK_C_ARG_PTR(indptr);
|
||||||
xgboost_CHECK_C_ARG_PTR(indices);
|
xgboost_CHECK_C_ARG_PTR(indices);
|
||||||
xgboost_CHECK_C_ARG_PTR(data);
|
xgboost_CHECK_C_ARG_PTR(data);
|
||||||
data::CSRArrayAdapter adapter(StringView{indptr}, StringView{indices},
|
data::CSRArrayAdapter adapter(StringView{indptr}, StringView{indices}, StringView{data}, ncol);
|
||||||
StringView{data}, ncol);
|
|
||||||
xgboost_CHECK_C_ARG_PTR(c_json_config);
|
xgboost_CHECK_C_ARG_PTR(c_json_config);
|
||||||
auto config = Json::Load(StringView{c_json_config});
|
auto config = Json::Load(StringView{c_json_config});
|
||||||
float missing = GetMissing(config);
|
float missing = GetMissing(config);
|
||||||
|
|||||||
@ -165,7 +165,10 @@ def check_rmarkdown() -> None:
|
|||||||
subprocess.check_call([rscript, "-e", "devtools::document()"], env=env)
|
subprocess.check_call([rscript, "-e", "devtools::document()"], env=env)
|
||||||
output = subprocess.run(["git", "diff", "--name-only"], capture_output=True)
|
output = subprocess.run(["git", "diff", "--name-only"], capture_output=True)
|
||||||
if len(output.stdout.decode("utf-8").strip()) != 0:
|
if len(output.stdout.decode("utf-8").strip()) != 0:
|
||||||
raise ValueError("Please run `devtools::document()`.")
|
output = subprocess.run(["git", "diff"], capture_output=True)
|
||||||
|
raise ValueError(
|
||||||
|
"Please run `devtools::document()`. Diff:\n", output.stdout.decode("utf-8")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@cd(r_package)
|
@cd(r_package)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user