Use array interface for CSC matrix. (#8672)

* Use array interface for CSC matrix.

Use array interface for CSC matrix and align the interface with CSR and dense.

- Fix nthread issue in the R package DMatrix.
- Unify the handling of `missing` with other input types.
- Unify the handling of `missing` across the R, Python, Java, and Scala DMatrix constructors.
- Expose `num_non_missing` to the JVM interface.
- Deprecate old CSR and CSC constructors.
Jiaming Yuan 2023-02-05 01:59:46 +08:00 committed by GitHub
parent 213b5602d9
commit c1786849e3
23 changed files with 673 additions and 380 deletions
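For readers coming from the Python package, a minimal sketch of the aligned behaviour described above (illustrative only, not part of this commit; it assumes nothing beyond the public xgboost, numpy, and scipy packages):

    import numpy as np
    import scipy.sparse
    import xgboost as xgb

    X = np.array([[1.0, 2.0], [0.0, 1.0]])
    y = np.array([0.0, 1.0])

    # Dense, CSR, and CSC inputs now go through the same construction path,
    # so `missing` and `nthread` are honoured for all three.
    d_dense = xgb.DMatrix(X, label=y, missing=1.0, nthread=2)
    d_csr = xgb.DMatrix(scipy.sparse.csr_matrix(X), label=y, missing=1.0, nthread=2)
    d_csc = xgb.DMatrix(scipy.sparse.csc_matrix(X), label=y, missing=1.0, nthread=2)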

View File

@@ -36,19 +36,37 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre
     cnames <- colnames(data)
   } else if (inherits(data, "dgCMatrix")) {
     handle <- .Call(
-      XGDMatrixCreateFromCSC_R, data@p, data@i, data@x, nrow(data), as.integer(NVL(nthread, -1))
+      XGDMatrixCreateFromCSC_R,
+      data@p,
+      data@i,
+      data@x,
+      nrow(data),
+      missing,
+      as.integer(NVL(nthread, -1))
     )
     cnames <- colnames(data)
   } else if (inherits(data, "dgRMatrix")) {
     handle <- .Call(
-      XGDMatrixCreateFromCSR_R, data@p, data@j, data@x, ncol(data), as.integer(NVL(nthread, -1))
+      XGDMatrixCreateFromCSR_R,
+      data@p,
+      data@j,
+      data@x,
+      ncol(data),
+      missing,
+      as.integer(NVL(nthread, -1))
     )
     cnames <- colnames(data)
   } else if (inherits(data, "dsparseVector")) {
     indptr <- c(0L, as.integer(length(data@i)))
     ind <- as.integer(data@i) - 1L
     handle <- .Call(
-      XGDMatrixCreateFromCSR_R, indptr, ind, data@x, length(data), as.integer(NVL(nthread, -1))
+      XGDMatrixCreateFromCSR_R,
+      indptr,
+      ind,
+      data@x,
+      length(data),
+      missing,
+      as.integer(NVL(nthread, -1))
     )
   } else {
     stop("xgb.DMatrix does not support construction from ", typeof(data))

View File

@@ -36,8 +36,8 @@ extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP);
 extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP);
 extern SEXP XGCheckNullPtr_R(SEXP);
-extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP);
-extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixGetInfo_R(SEXP, SEXP);
@@ -73,8 +73,8 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3},
   {"XGBoosterUpdateOneIter_R", (DL_FUNC) &XGBoosterUpdateOneIter_R, 3},
   {"XGCheckNullPtr_R", (DL_FUNC) &XGCheckNullPtr_R, 1},
-  {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 5},
-  {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 5},
+  {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6},
+  {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6},
   {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2},
   {"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
   {"XGDMatrixGetInfo_R", (DL_FUNC) &XGDMatrixGetInfo_R, 2},

View File

@@ -16,10 +16,11 @@
 #include <vector>
 #include "../../src/c_api/c_api_error.h"
+#include "../../src/c_api/c_api_utils.h"  // MakeSparseFromPtr
 #include "../../src/common/threading_utils.h"
-#include "./xgboost_R.h"  // Must follow other include.
+#include "./xgboost_R.h"  // Must follow other includes.
+#include "Rinternals.h"

 /*!
  * \brief macro to annotate begin of api
@@ -134,34 +135,47 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
   return ret;
 }

-XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data,
-                                      SEXP num_row, SEXP n_threads) {
-  SEXP ret;
-  R_API_BEGIN();
+namespace {
+void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
+                      std::string *indices_str, std::string *data_str) {
   const int *p_indptr = INTEGER(indptr);
   const int *p_indices = INTEGER(indices);
   const double *p_data = REAL(data);
-  size_t nindptr = static_cast<size_t>(length(indptr));
-  size_t ndata = static_cast<size_t>(length(data));
-  size_t nrow = static_cast<size_t>(INTEGER(num_row)[0]);
-  std::vector<size_t> col_ptr_(nindptr);
-  std::vector<unsigned> indices_(ndata);
-  std::vector<float> data_(ndata);
-  for (size_t i = 0; i < nindptr; ++i) {
-    col_ptr_[i] = static_cast<size_t>(p_indptr[i]);
-  }
-  xgboost::Context ctx;
-  ctx.nthread = asInteger(n_threads);
-  xgboost::common::ParallelFor(ndata, ctx.Threads(), [&](xgboost::omp_ulong i) {
-    indices_[i] = static_cast<unsigned>(p_indices[i]);
-    data_[i] = static_cast<float>(p_data[i]);
-  });
+
+  auto nindptr = static_cast<std::size_t>(length(indptr));
+  auto ndata = static_cast<std::size_t>(length(data));
+  CHECK_EQ(ndata, p_indptr[nindptr - 1]);
+  xgboost::detail::MakeSparseFromPtr(p_indptr, p_indices, p_data, nindptr, indptr_str, indices_str,
+                                     data_str);
+}
+}  // namespace
+
+XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row,
+                                      SEXP missing, SEXP n_threads) {
+  SEXP ret;
+  R_API_BEGIN();
+  std::int32_t threads = asInteger(n_threads);
+
+  using xgboost::Integer;
+  using xgboost::Json;
+  using xgboost::Object;
+
+  std::string sindptr, sindices, sdata;
+  CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
+  auto nrow = static_cast<std::size_t>(INTEGER(num_row)[0]);

   DMatrixHandle handle;
-  CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_),
-                                      BeginPtr(data_), nindptr, ndata,
-                                      nrow, &handle));
+  Json jconfig{Object{}};
+  // Construct configuration
+  jconfig["nthread"] = Integer{threads};
+  jconfig["missing"] = xgboost::Number{asReal(missing)};
+
+  std::string config;
+  Json::Dump(jconfig, &config);
+
+  CHECK_CALL(XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
+                                    config.c_str(), &handle));
+
   ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
   R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
   R_API_END();
   UNPROTECT(1);
@@ -169,64 +183,27 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data,
 }

 XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col,
-                                      SEXP n_threads) {
+                                      SEXP missing, SEXP n_threads) {
   SEXP ret;
   R_API_BEGIN();
-  const int *p_indptr = INTEGER(indptr);
-  const int *p_indices = INTEGER(indices);
-  const double *p_data = REAL(data);
-  auto nindptr = static_cast<std::size_t>(length(indptr));
-  auto ndata = static_cast<std::size_t>(length(data));
-  auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);
   std::int32_t threads = asInteger(n_threads);

-  using xgboost::Array;
   using xgboost::Integer;
   using xgboost::Json;
   using xgboost::Object;
-  using xgboost::String;

-  // Construct array interfaces
-  Json jindptr{Object{}};
-  Json jindices{Object{}};
-  Json jdata{Object{}};
-  jindptr["data"] =
-      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indptr)}, Json{true}}};
-  jindptr["shape"] = std::vector<Json>{Json{nindptr}};
-  jindptr["version"] = Integer{3};
-
-  jindices["data"] =
-      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indices)}, Json{true}}};
-  jindices["shape"] = std::vector<Json>{Json{ndata}};
-  jindices["version"] = Integer{3};
-
-  jdata["data"] =
-      Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_data)}, Json{true}}};
-  jdata["shape"] = std::vector<Json>{Json{ndata}};
-  jdata["version"] = Integer{3};
-
-  if (DMLC_LITTLE_ENDIAN) {
-    jindptr["typestr"] = String{"<i4"};
-    jindices["typestr"] = String{"<i4"};
-    jdata["typestr"] = String{"<f8"};
-  } else {
-    jindptr["typestr"] = String{">i4"};
-    jindices["typestr"] = String{">i4"};
-    jdata["typestr"] = String{">f8"};
-  }
-
-  std::string indptr, indices, data;
-  Json::Dump(jindptr, &indptr);
-  Json::Dump(jindices, &indices);
-  Json::Dump(jdata, &data);
+  std::string sindptr, sindices, sdata;
+  CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
+  auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);

   DMatrixHandle handle;
   Json jconfig{Object{}};
   // Construct configuration
   jconfig["nthread"] = Integer{threads};
-  jconfig["missing"] = xgboost::Number{std::numeric_limits<float>::quiet_NaN()};
+  jconfig["missing"] = xgboost::Number{asReal(missing)};
   std::string config;
   Json::Dump(jconfig, &config);
-  CHECK_CALL(XGDMatrixCreateFromCSR(indptr.c_str(), indices.c_str(), data.c_str(), ncol,
+  CHECK_CALL(XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
                                     config.c_str(), &handle));
   ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));

View File

@@ -59,11 +59,12 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
  * \param indices row indices
  * \param data content of the data
  * \param num_row numer of rows (when it's set to 0, then guess from data)
+ * \param missing which value to represent missing value
  * \param n_threads Number of threads used to construct DMatrix from csc matrix.
  * \return created dmatrix
  */
 XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row,
-                                      SEXP n_threads);
+                                      SEXP missing, SEXP n_threads);

 /*!
  * \brief create a matrix content from CSR format
@@ -71,11 +72,12 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP
  * \param indices column indices
  * \param data content of the data
  * \param num_col numer of columns (when it's set to 0, then guess from data)
+ * \param missing which value to represent missing value
  * \param n_threads Number of threads used to construct DMatrix from csr matrix.
  * \return created dmatrix
  */
 XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col,
-                                      SEXP n_threads);
+                                      SEXP missing, SEXP n_threads);

 /*!
  * \brief create a new dmatrix from sliced content of existing matrix

View File

@@ -22,20 +22,20 @@ test_that("xgb.DMatrix: basic construction", {
   n_samples <- 100
   X <- cbind(
-    x1 = rnorm(n_samples),
-    x2 = rnorm(n_samples),
-    x3 = rnorm(n_samples)
+    x1 = sample(x = 4, size = n_samples, replace = TRUE),
+    x2 = sample(x = 4, size = n_samples, replace = TRUE),
+    x3 = sample(x = 4, size = n_samples, replace = TRUE)
   )
   X <- matrix(X, nrow = n_samples)
   y <- rbinom(n = n_samples, size = 1, prob = 1 / 2)
-  fd <- xgb.DMatrix(X, label = y)
+  fd <- xgb.DMatrix(X, label = y, missing = 1)
   dgc <- as(X, "dgCMatrix")
-  fdgc <- xgb.DMatrix(dgc, label = y)
+  fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0)
   dgr <- as(X, "dgRMatrix")
-  fdgr <- xgb.DMatrix(dgr, label = y)
+  fdgr <- xgb.DMatrix(dgr, label = y, missing = 1)
   params <- list(tree_method = "hist")
   bst_fd <- xgb.train(

View File

@@ -1,13 +1,16 @@
-/*!
- * Copyright 2019 XGBoost contributors
+/**
+ * Copyright 2019-2023 by XGBoost contributors
  *
  * \file c-api-demo.c
  * \brief A simple example of using xgboost C API.
  */

 #include <assert.h>
+#include <stddef.h>
+#include <stdint.h>  /* uint32_t,uint64_t */
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <xgboost/c_api.h>

 #define safe_xgboost(call) { \
@@ -18,6 +21,21 @@ if (err != 0) { \
   } \
 }

+/* Make Json encoded array interface. */
+static void MakeArrayInterface(size_t data, size_t n, char const* typestr, size_t length,
+                               char* out) {
+  static char const kTemplate[] =
+      "{\"data\": [%lu, true], \"shape\": [%lu, %lu], \"typestr\": \"%s\", \"version\": 3}";
+  memset(out, '\0', length);
+  sprintf(out, kTemplate, data, n, 1ul, typestr);
+}
+
+/* Make Json encoded DMatrix configuration. */
+static void MakeConfig(int n_threads, size_t length, char* out) {
+  static char const kTemplate[] = "{\"missing\": NaN, \"nthread\": %d}";
+  memset(out, '\0', length);
+  sprintf(out, kTemplate, n_threads);
+}
+
 int main() {
   int silent = 0;
   int use_gpu = 0;  // set to 1 to use the GPU for training
@@ -121,17 +139,27 @@ int main() {
   }

   {
-    printf("Sparse Matrix Example (XGDMatrixCreateFromCSREx): ");
-    const size_t indptr[] = {0, 22};
-    const unsigned indices[] = {1, 9, 19, 21, 24, 34, 36, 39, 42, 53, 56, 65,
-                                69, 77, 86, 88, 92, 95, 102, 106, 117, 122};
+    printf("Sparse Matrix Example (XGDMatrixCreateFromCSR): ");
+    const uint64_t indptr[] = {0, 22};
+    const uint32_t indices[] = {1, 9, 19, 21, 24, 34, 36, 39, 42, 53, 56,
+                                65, 69, 77, 86, 88, 92, 95, 102, 106, 117, 122};
     const float data[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
     DMatrixHandle dmat;
-    safe_xgboost(XGDMatrixCreateFromCSREx(indptr, indices, data, 2, 22, 127,
-                                          &dmat));
+    char j_indptr[128];
+    MakeArrayInterface((size_t)indptr, 2ul, "<u8", sizeof(j_indptr), j_indptr);
+    char j_indices[128];
+    MakeArrayInterface((size_t)indices, sizeof(indices) / sizeof(uint32_t), "<u4",
+                       sizeof(j_indices), j_indices);
+    char j_data[128];
+    MakeArrayInterface((size_t)data, sizeof(data) / sizeof(float), "<f4", sizeof(j_data), j_data);
+    char j_config[64];
+    MakeConfig(0, sizeof(j_config), j_config);
+    safe_xgboost(XGDMatrixCreateFromCSR(j_indptr, j_indices, j_data, 127, j_config, &dmat));

     const float* out_result = NULL;
@@ -145,25 +173,34 @@ int main() {
   }

   {
-    printf("Sparse Matrix Example (XGDMatrixCreateFromCSCEx): ");
-    const size_t col_ptr[] = {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
-      2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8,
-      8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11, 11,
-      11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
-      14, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18,
-      18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
-      20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22};
-    const unsigned indices[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                0, 0, 0, 0, 0, 0};
+    printf("Sparse Matrix Example (XGDMatrixCreateFromCSC): ");
+    const uint64_t indptr[] = {
+        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
+        4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9,
+        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+        12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15,
+        15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20,
+        20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22};
+    const uint32_t indices[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     const float data[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                           1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+    char j_indptr[128];
+    MakeArrayInterface((size_t)indptr, 128ul, "<u8", sizeof(j_indptr), j_indptr);
+    char j_indices[128];
+    MakeArrayInterface((size_t)indices, sizeof(indices) / sizeof(unsigned), "<u4",
+                       sizeof(j_indices), j_indices);
+    char j_data[128];
+    MakeArrayInterface((size_t)data, sizeof(data) / sizeof(float), "<f4", sizeof(j_data), j_data);
+    char j_config[64];
+    MakeConfig(0, sizeof(j_config), j_config);
     DMatrixHandle dmat;
-    safe_xgboost(XGDMatrixCreateFromCSCEx(col_ptr, indices, data, 128, 22, 1,
-                                          &dmat));
+    safe_xgboost(XGDMatrixCreateFromCSC(j_indptr, j_indices, j_data, 1, j_config, &dmat));

     const float* out_result = NULL;

View File

@@ -1,5 +1,5 @@
-/*!
- * Copyright (c) 2015~2022 by XGBoost Contributors
+/**
+ * Copyright 2015~2023 by XGBoost Contributors
  * \file c_api.h
  * \author Tianqi Chen
  * \brief C API of XGBoost, used for interfacing to other languages.
@@ -148,29 +148,19 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle
  */
 XGB_DLL int XGDMatrixCreateFromURI(char const *config, DMatrixHandle *out);

+/**
+ * @example c-api-demo.c
+ */
+
 /*!
  * \brief create a matrix content from CSR format
- * \param indptr pointer to row headers
- * \param indices findex
- * \param data fvalue
- * \param nindptr number of rows in the matrix + 1
- * \param nelem number of nonzero elements in the matrix
- * \param num_col number of columns; when it's set to kAdapterUnknownSize, then guess from data
- * \param out created dmatrix
- * \return 0 when success, -1 when failure happens
+ * \deprecated since 2.0.0
+ * \see XGDMatrixCreateFromCSR()
  */
-XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
-                                     const unsigned* indices,
-                                     const float* data,
-                                     size_t nindptr,
-                                     size_t nelem,
-                                     size_t num_col,
-                                     DMatrixHandle* out);
-/**
- * @example c-api-demo.c
- */
+XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices,
+                                     const float *data, size_t nindptr, size_t nelem,
+                                     size_t num_col, DMatrixHandle *out);

 /*!
  * \brief Create a matrix from CSR matrix.
  * \param indptr JSON encoded __array_interface__ to row pointers in CSR.
@@ -198,23 +188,28 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char
 XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatrixHandle *out);

 /*!
- * \brief create a matrix content from CSC format
- * \param col_ptr pointer to col headers
- * \param indices findex
- * \param data fvalue
- * \param nindptr number of rows in the matrix + 1
- * \param nelem number of nonzero elements in the matrix
- * \param num_row number of rows; when it's set to 0, then guess from data
+ * \brief Create a matrix from a CSC matrix.
+ * \param indptr  JSON encoded __array_interface__ to column pointers in CSC.
+ * \param indices JSON encoded __array_interface__ to row indices in CSC.
+ * \param data    JSON encoded __array_interface__ to values in CSC.
+ * \param nrow    number of rows in the matrix.
+ * \param config  JSON encoded configuration.  Supported values are:
+ *   - missing: Which value to represent missing value.
+ *   - nthread (optional): Number of threads used for initializing DMatrix.
  * \param out created dmatrix
  * \return 0 when success, -1 when failure happens
  */
-XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
-                                     const unsigned* indices,
-                                     const float* data,
-                                     size_t nindptr,
-                                     size_t nelem,
-                                     size_t num_row,
-                                     DMatrixHandle* out);
+XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
+                                   bst_ulong nrow, char const *c_json_config, DMatrixHandle *out);
+
+/*!
+ * \brief create a matrix content from CSC format
+ * \deprecated since 2.0.0
+ * \see XGDMatrixCreateFromCSC()
+ */
+XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indices,
+                                     const float *data, size_t nindptr, size_t nelem,
+                                     size_t num_row, DMatrixHandle *out);

 /*!
  * \brief create matrix content from dense matrix
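To make the documentation above concrete, here is a rough illustration of the payloads the new XGDMatrixCreateFromCSR/XGDMatrixCreateFromCSC functions consume, written in Python (a sketch only, not part of this commit; the pointer value is process specific and the helper below is hypothetical):

    import json
    import numpy as np

    def array_interface(arr: np.ndarray) -> str:
        # One JSON document per buffer: address, read-only flag, shape,
        # typestr, and version, mirroring numpy's __array_interface__.
        inf = arr.__array_interface__
        doc = {
            "data": [inf["data"][0], True],
            "shape": list(arr.shape),
            "typestr": arr.dtype.str,
            "version": 3,
        }
        return json.dumps(doc)

    indptr = np.array([0, 2, 3], dtype=np.uint64)
    config = json.dumps({"missing": float("nan"), "nthread": 2})
    print(array_interface(indptr))  # e.g. {"data": [..., true], "shape": [3], "typestr": "<u8", "version": 3}
    print(config)                   # {"missing": NaN, "nthread": 2}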

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -79,17 +79,9 @@ public class DMatrix {
    * @throws XGBoostError
    */
   @Deprecated
-  public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st)
-      throws XGBoostError {
-    long[] out = new long[1];
-    if (st == SparseType.CSR) {
-      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSREx(headers, indices, data, 0, out));
-    } else if (st == SparseType.CSC) {
-      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSCEx(headers, indices, data, 0, out));
-    } else {
-      throw new UnknownError("unknow sparsetype");
-    }
-    handle = out[0];
+  public DMatrix(long[] headers, int[] indices, float[] data,
+                 DMatrix.SparseType st) throws XGBoostError {
+    this(headers, indices, data, st, 0, Float.NaN, -1);
   }

   /**
@@ -102,15 +94,20 @@ public class DMatrix {
    *                   row number
    * @throws XGBoostError
    */
-  public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st, int shapeParam)
-      throws XGBoostError {
+  public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st,
+                 int shapeParam) throws XGBoostError {
+    this(headers, indices, data, st, shapeParam, Float.NaN, -1);
+  }
+
+  public DMatrix(long[] headers, int[] indices, float[] data, DMatrix.SparseType st, int shapeParam,
+                 float missing, int nthread) throws XGBoostError {
     long[] out = new long[1];
     if (st == SparseType.CSR) {
-      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSREx(headers, indices, data,
-              shapeParam, out));
+      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSR(headers, indices, data,
+              shapeParam, missing, nthread, out));
     } else if (st == SparseType.CSC) {
-      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSCEx(headers, indices, data,
-              shapeParam, out));
+      XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixCreateFromCSC(headers, indices, data,
+              shapeParam, missing, nthread, out));
     } else {
       throw new UnknownError("unknow sparsetype");
     }
@@ -425,6 +422,18 @@ public class DMatrix {
     return rowNum[0];
   }

+  /**
+   * Get the number of non-missing values of DMatrix.
+   *
+   * @return The number of non-missing values
+   * @throws XGBoostError native error
+   */
+  public long nonMissingNum() throws XGBoostError {
+    long[] n = new long[1];
+    XGBoostJNI.checkCall(XGBoostJNI.XGDMatrixNumNonMissing(handle, n));
+    return n[0];
+  }
+
   /**
    * save DMatrix to filePath
    */

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014-2022 by Contributors
+ Copyright (c) 2014-2023 by Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -56,11 +56,15 @@ class XGBoostJNI {
   final static native int XGDMatrixCreateFromDataIter(java.util.Iterator<DataBatch> iter,
                                                       String cache_info, long[] out);

-  public final static native int XGDMatrixCreateFromCSREx(long[] indptr, int[] indices, float[] data,
-                                                          int shapeParam, long[] out);
+  public final static native int XGDMatrixCreateFromCSR(long[] indptr, int[] indices,
+                                                        float[] data, int shapeParam,
+                                                        float missing, int nthread,
+                                                        long[] out);

-  public final static native int XGDMatrixCreateFromCSCEx(long[] colptr, int[] indices, float[] data,
-                                                          int shapeParam, long[] out);
+  public final static native int XGDMatrixCreateFromCSC(long[] colptr, int[] indices,
+                                                        float[] data, int shapeParam,
+                                                        float missing, int nthread,
+                                                        long[] out);

   public final static native int XGDMatrixCreateFromMat(float[] data, int nrow, int ncol,
                                                         float missing, long[] out);
@@ -96,6 +100,7 @@ class XGBoostJNI {
                                                        long[] outLength, String[][] outValues);

   public final static native int XGDMatrixNumRow(long handle, long[] row);
+  public final static native int XGDMatrixNumNonMissing(long handle, long[] nonMissings);

   public final static native int XGBoosterCreate(long[] handles, long[] out);

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014,2021 by Contributors
+ Copyright (c) 2014-2023 by Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -54,7 +54,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
   @throws(classOf[XGBoostError])
   @deprecated
   def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType) {
-    this(new JDMatrix(headers, indices, data, st))
+    this(new JDMatrix(headers, indices, data, st, 0, Float.NaN, -1))
   }

   /**
@@ -70,7 +70,25 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
   @throws(classOf[XGBoostError])
   def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
            shapeParam: Int) {
-    this(new JDMatrix(headers, indices, data, st, shapeParam))
+    this(new JDMatrix(headers, indices, data, st, shapeParam, Float.NaN, -1))
+  }
+
+  /**
+   * create DMatrix from sparse matrix
+   *
+   * @param headers index to headers (rowHeaders for CSR or colHeaders for CSC)
+   * @param indices Indices (colIndexs for CSR or rowIndexs for CSC)
+   * @param data non zero values (sequence by row for CSR or by col for CSC)
+   * @param st sparse matrix type (CSR or CSC)
+   * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as
+   *                   row number
+   * @param missing missing value
+   * @param nthread The number of threads used for constructing DMatrix
+   */
+  @throws(classOf[XGBoostError])
+  def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType,
+           shapeParam: Int, missing: Float, nthread: Int) {
+    this(new JDMatrix(headers, indices, data, st, shapeParam, missing, nthread))
   }

   /**
@@ -78,7 +96,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
    * @param columnBatch the XGBoost ColumnBatch to provide the cuda array interface
    *                    of feature columns
    * @param missing missing value
-   * @param nthread threads number
+   * @param nthread The number of threads used for constructing DMatrix
    */
   @throws(classOf[XGBoostError])
   def this(columnBatch: ColumnBatch, missing: Float, nthread: Int) {
@@ -246,6 +264,16 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) {
     jDMatrix.rowNum
   }

+  /**
+   * Get the number of non-missing values of DMatrix.
+   *
+   * @return The number of non-missing values
+   */
+  @throws(classOf[XGBoostError])
+  def nonMissingNum: Long = {
+    jDMatrix.nonMissingNum
+  }
+
   /**
    * save DMatrix to filePath
    *
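The same counter is also reachable from Python; a small sketch, under the assumption that the Python DMatrix exposes num_nonmissing() as a wrapper over the same XGDMatrixNumNonMissing C function:

    import numpy as np
    import scipy.sparse
    import xgboost as xgb

    X = scipy.sparse.csc_matrix(np.array([[1.0, 2.0], [0.0, 1.0]]))
    # With missing=1.0, stored entries equal to 1.0 are dropped during
    # construction, so only the remaining values count as non-missing.
    d = xgb.DMatrix(X, missing=1.0, nthread=2)
    print(d.num_nonmissing())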

View File

@@ -1,5 +1,5 @@
-/*
- Copyright (c) 2014-2022 by Contributors
+/**
+ Copyright 2014-2023 by XGBoost Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
@@ -12,18 +12,23 @@
   limitations under the License.
  */
+#include "./xgboost4j.h"
+
+#include <rabit/c_api.h>
+#include <xgboost/base.h>
+#include <xgboost/c_api.h>
+#include <xgboost/json.h>
+#include <xgboost/logging.h>
+
 #include <cstddef>
 #include <cstdint>
-#include <limits>
-#include <rabit/c_api.h>
-#include <xgboost/c_api.h>
-#include <xgboost/base.h>
-#include <xgboost/logging.h>
-#include <xgboost/json.h>
-#include "./xgboost4j.h"
 #include <cstring>
-#include <vector>
+#include <limits>
 #include <string>
+#include <type_traits>
+#include <vector>
+
+#include "../../../src/c_api/c_api_utils.h"

 #define JVM_CHECK_CALL(__expr) \
   { \
@@ -219,58 +224,89 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro
   return ret;
 }

-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixCreateFromCSREx
- * Signature: ([J[I[FI[J)I
+namespace {
+/**
+ * \brief Create from sparse matrix.
+ *
+ * \param maker Indirect call to XGBoost C function for creating CSC and CSR.
+ *
+ * \return Status
  */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx
-  (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol, jlongArray jout) {
+template <typename Fn>
+jint MakeJVMSparseInput(JNIEnv *jenv, jlongArray jindptr, jintArray jindices, jfloatArray jdata,
+                        jfloat jmissing, jint jnthread, Fn &&maker, jlongArray jout) {
   DMatrixHandle result;
-  jlong* indptr = jenv->GetLongArrayElements(jindptr, 0);
-  jint* indices = jenv->GetIntArrayElements(jindices, 0);
-  jfloat* data = jenv->GetFloatArrayElements(jdata, 0);
-  bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr);
-  bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata);
-  jint ret = (jint) XGDMatrixCreateFromCSREx((size_t const *)indptr,
-                                             (unsigned int const *)indices,
-                                             (float const *)data,
-                                             nindptr, nelem, jcol, &result);
+
+  jlong *indptr = jenv->GetLongArrayElements(jindptr, nullptr);
+  jint *indices = jenv->GetIntArrayElements(jindices, nullptr);
+  jfloat *data = jenv->GetFloatArrayElements(jdata, nullptr);
+  bst_ulong nindptr = static_cast<bst_ulong>(jenv->GetArrayLength(jindptr));
+  bst_ulong nelem = static_cast<bst_ulong>(jenv->GetArrayLength(jdata));
+
+  std::string sindptr, sindices, sdata;
+  CHECK_EQ(indptr[nindptr - 1], nelem);
+  using IndPtrT = std::conditional_t<std::is_convertible<jlong *, long *>::value, long, long long>;
+  using IndT =
+      std::conditional_t<std::is_convertible<jint *, std::int32_t *>::value, std::int32_t, long>;
+  xgboost::detail::MakeSparseFromPtr(
+      static_cast<IndPtrT const *>(indptr), static_cast<IndT const *>(indices),
+      static_cast<float const *>(data), nindptr, &sindptr, &sindices, &sdata);
+
+  xgboost::Json jconfig{xgboost::Object{}};
+  auto missing = static_cast<float>(jmissing);
+  auto n_threads = static_cast<std::int32_t>(jnthread);
+  // Construct configuration
+  jconfig["nthread"] = xgboost::Integer{n_threads};
+  jconfig["missing"] = xgboost::Number{missing};
+  std::string config;
+  xgboost::Json::Dump(jconfig, &config);
+
+  jint ret = maker(sindptr.c_str(), sindices.c_str(), sdata.c_str(), config.c_str(), &result);
   JVM_CHECK_CALL(ret);
   setHandle(jenv, jout, result);
   // Release
   jenv->ReleaseLongArrayElements(jindptr, indptr, 0);
   jenv->ReleaseIntArrayElements(jindices, indices, 0);
   jenv->ReleaseFloatArrayElements(jdata, data, 0);

   return ret;
 }
+}  // anonymous namespace

 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixCreateFromCSCEx
- * Signature: ([J[I[FI[J)I
+ * Method:    XGDMatrixCreateFromCSR
+ * Signature: ([J[I[FIFI[J)I
  */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx
-  (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow, jlongArray jout) {
-  DMatrixHandle result;
-  jlong* indptr = jenv->GetLongArrayElements(jindptr, NULL);
-  jint* indices = jenv->GetIntArrayElements(jindices, 0);
-  jfloat* data = jenv->GetFloatArrayElements(jdata, NULL);
-  bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr);
-  bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata);
-
-  jint ret = (jint) XGDMatrixCreateFromCSCEx((size_t const *)indptr,
-                                             (unsigned int const *)indices,
-                                             (float const *)data,
-                                             nindptr, nelem, jrow, &result);
-  JVM_CHECK_CALL(ret);
-  setHandle(jenv, jout, result);
-  //release
-  jenv->ReleaseLongArrayElements(jindptr, indptr, 0);
-  jenv->ReleaseIntArrayElements(jindices, indices, 0);
-  jenv->ReleaseFloatArrayElements(jdata, data, 0);
-
-  return ret;
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR(
+    JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol,
+    jfloat jmissing, jint jnthread, jlongArray jout) {
+  using CSTR = char const *;
+  return MakeJVMSparseInput(
+      jenv, jindptr, jindices, jdata, jmissing, jnthread,
+      [&](CSTR sindptr, CSTR sindices, CSTR sdata, CSTR sconfig, DMatrixHandle *result) {
+        return XGDMatrixCreateFromCSR(sindptr, sindices, sdata, static_cast<std::int32_t>(jcol),
+                                      sconfig, result);
+      },
+      jout);
+}
+
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGDMatrixCreateFromCSC
+ * Signature: ([J[I[FIFI[J)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC(
+    JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow,
+    jfloat jmissing, jint jnthread, jlongArray jout) {
+  using CSTR = char const *;
+  return MakeJVMSparseInput(
+      jenv, jindptr, jindices, jdata, jmissing, jnthread,
+      [&](CSTR sindptr, CSTR sindices, CSTR sdata, CSTR sconfig, DMatrixHandle *result) {
+        return XGDMatrixCreateFromCSC(sindptr, sindices, sdata, static_cast<bst_ulong>(jrow),
+                                      sconfig, result);
+      },
+      jout);
 }

@@ -459,6 +495,23 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow
   return ret;
 }

+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGDMatrixNumNonMissing
+ * Signature: (J[J)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing(
+    JNIEnv *jenv, jclass, jlong jhandle, jlongArray jout) {
+  DMatrixHandle handle = reinterpret_cast<DMatrixHandle>(jhandle);
+  CHECK(handle);
+  bst_ulong result[1];
+  auto ret = static_cast<jint>(XGDMatrixNumNonMissing(handle, result));
+  jlong jresult[1]{static_cast<jlong>(result[0])};
+  jenv->SetLongArrayRegion(jout, 0, 1, jresult);
+  JVM_CHECK_CALL(ret);
+  return ret;
+}
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterCreate

View File

@@ -33,19 +33,19 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixCreateFromCSREx
- * Signature: ([J[I[FI[J)I
+ * Method:    XGDMatrixCreateFromCSR
+ * Signature: ([J[I[FIFI[J)I
  */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx
-  (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray);
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR
+  (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray);

 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixCreateFromCSCEx
- * Signature: ([J[I[FI[J)I
+ * Method:    XGDMatrixCreateFromCSC
+ * Signature: ([J[I[FIFI[J)I
  */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx
-  (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray);
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC
+  (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray);

 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
@@ -119,6 +119,22 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetFloatI
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntInfo
   (JNIEnv *, jclass, jlong, jstring, jobjectArray);

+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGDMatrixSetStrFeatureInfo
+ * Signature: (JLjava/lang/String;[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo
+  (JNIEnv *, jclass, jlong, jstring, jobjectArray);
+
+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGDMatrixGetStrFeatureInfo
+ * Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo
+  (JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray);
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGDMatrixNumRow
@@ -127,6 +143,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntIn
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow
   (JNIEnv *, jclass, jlong, jlongArray);

+/*
+ * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
+ * Method:    XGDMatrixNumNonMissing
+ * Signature: (J[J)I
+ */
+JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing
+  (JNIEnv *, jclass, jlong, jlongArray);
+
 /*
  * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
  * Method:    XGBoosterCreate
@@ -359,22 +383,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixC
 JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromArrayInterfaceColumns
   (JNIEnv *, jclass, jstring, jfloat, jint, jlongArray);

-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixSetStrFeatureInfo
- * Signature: (JLjava/lang/String;[Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo
-  (JNIEnv *, jclass, jlong, jstring, jobjectArray);
-
-/*
- * Class:     ml_dmlc_xgboost4j_java_XGBoostJNI
- * Method:    XGDMatrixGetStrFeatureInfo
- * Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I
- */
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo
-  (JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray);
-
 #ifdef __cplusplus
 }
 #endif

View File

@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2023 by Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -54,6 +54,9 @@ class DMatrixSuite extends FunSuite {
     dmat1.setLabel(label1)
     val label2 = dmat1.getLabel
     assert(label2 === label1)
+
+    val dmat2 = new DMatrix(rowHeaders, colIndex, data, JDMatrix.SparseType.CSR, 5, 1.0f, -1)
+    assert(dmat2.nonMissingNum === 9);
   }

   test("create DMatrix from CSREx") {
@@ -94,6 +97,9 @@ class DMatrixSuite extends FunSuite {
     dmat1.setLabel(label1)
     val label2 = dmat1.getLabel
     assert(label2 === label1)
+
+    val dmat2 = new DMatrix(colHeaders, rowIndex, data, JDMatrix.SparseType.CSC, 5, 1.0f, -1)
+    assert(dmat2.nonMissingNum === 9);
   }

   test("create DMatrix from CSCEx") {

View File

@@ -2311,9 +2311,9 @@ class Booster:
             )
             return _prediction_output(shape, dims, preds, False)
         if isinstance(data, scipy.sparse.csr_matrix):
-            from .data import _transform_scipy_csr
+            from .data import transform_scipy_sparse

-            data = _transform_scipy_csr(data)
+            data = transform_scipy_sparse(data, True)
             _check_call(
                 _LIB.XGBoosterPredictFromCSR(
                     self.handle,

View File

@@ -28,7 +28,6 @@ from .core import (
     _check_call,
     _cuda_array_interface,
     _ProxyDMatrix,
-    c_array,
     c_str,
     from_pystr_to_cstr,
     make_jcargs,
@@ -76,8 +75,15 @@ def _array_interface(data: np.ndarray) -> bytes:
     return interface_str


-def _transform_scipy_csr(data: DataType) -> DataType:
-    from scipy.sparse import csr_matrix
+def transform_scipy_sparse(data: DataType, is_csr: bool) -> DataType:
+    """Ensure correct data alignment and data type for scipy sparse inputs. Input should
+    be either csr or csc matrix.
+
+    """
+    from scipy.sparse import csc_matrix, csr_matrix
+
+    if len(data.indices) != len(data.data):
+        raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")

     indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
     indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
@@ -87,7 +93,10 @@ def _transform_scipy_csr(data: DataType) -> DataType:
         or indices is not data.indices
         or values is not data.data
     ):
-        data = csr_matrix((values, indices, indptr), shape=data.shape)
+        if is_csr:
+            data = csr_matrix((values, indices, indptr), shape=data.shape)
+        else:
+            data = csc_matrix((values, indices, indptr), shape=data.shape)

     return data

@@ -99,12 +108,8 @@ def _from_scipy_csr(
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
     """Initialize data from a CSR matrix."""
-    if len(data.indices) != len(data.data):
-        raise ValueError(
-            f"length mismatch: {len(data.indices)} vs {len(data.data)}"
-        )
     handle = ctypes.c_void_p()
-    data = _transform_scipy_csr(data)
+    data = transform_scipy_sparse(data, True)
     _check_call(
         _LIB.XGDMatrixCreateFromCSR(
             _array_interface(data.indptr),
@@ -128,22 +133,24 @@ def _is_scipy_csc(data: DataType) -> bool:

 def _from_scipy_csc(
     data: DataType,
-    missing: Optional[FloatCompatible],
+    missing: FloatCompatible,
+    nthread: int,
     feature_names: Optional[FeatureNames],
     feature_types: Optional[FeatureTypes],
 ) -> DispatchedDataBackendReturnType:
-    if len(data.indices) != len(data.data):
-        raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
-    _warn_unused_missing(data, missing)
+    """Initialize data from a CSC matrix."""
     handle = ctypes.c_void_p()
-    _check_call(_LIB.XGDMatrixCreateFromCSCEx(
-        c_array(ctypes.c_size_t, data.indptr),
-        c_array(ctypes.c_uint, data.indices),
-        c_array(ctypes.c_float, data.data),
-        ctypes.c_size_t(len(data.indptr)),
-        ctypes.c_size_t(len(data.data)),
-        ctypes.c_size_t(data.shape[0]),
-        ctypes.byref(handle)))
+    transform_scipy_sparse(data, False)
+    _check_call(
+        _LIB.XGDMatrixCreateFromCSC(
+            _array_interface(data.indptr),
+            _array_interface(data.indices),
+            _array_interface(data.data),
+            c_bst_ulong(data.shape[0]),
+            make_jcargs(missing=float(missing), nthread=int(nthread)),
+            ctypes.byref(handle),
+        )
+    )
     return handle, feature_names, feature_types

@@ -1032,7 +1039,7 @@ def dispatch_data_backend(
     if _is_scipy_csr(data):
         return _from_scipy_csr(data, missing, threads, feature_names, feature_types)
     if _is_scipy_csc(data):
-        return _from_scipy_csc(data, missing, feature_names, feature_types)
+        return _from_scipy_csc(data, missing, threads, feature_names, feature_types)
     if _is_scipy_coo(data):
         return _from_scipy_csr(
             data.tocsr(), missing, threads, feature_names, feature_types
@@ -1288,7 +1295,7 @@ def _proxy_transform(
         data, _ = _ensure_np_dtype(data, data.dtype)
         return data, None, feature_names, feature_types
     if _is_scipy_csr(data):
-        data = _transform_scipy_csr(data)
+        data = transform_scipy_sparse(data, True)
         return data, None, feature_names, feature_types
     if _is_pandas_series(data):
         import pandas as pd

View File

@@ -112,8 +112,8 @@ def _objective_decorator(

 def _metric_decorator(func: Callable) -> Metric:
     """Decorate a metric function from sklearn.

-    Converts an metric function that uses the typical sklearn metric signature so that it
-    is compatible with :py:func:`train`
+    Converts an metric function that uses the typical sklearn metric signature so that
+    it is compatible with :py:func:`train`

     """
@@ -122,7 +122,6 @@ def _metric_decorator(func: Callable) -> Metric:
         weight = dmatrix.get_weight()
         if weight.size == 0:
             return func.__name__, func(y_true, y_score)
-        else:
-            return func.__name__, func(y_true, y_score, sample_weight=weight)
+        return func.__name__, func(y_true, y_score, sample_weight=weight)

     return inner

View File

@@ -1,31 +1,32 @@
 /**
  * Copyright 2014-2023 by XGBoost Contributors
  */
+#include "xgboost/c_api.h"
+
 #include <rabit/c_api.h>

 #include <cstring>
 #include <fstream>
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>

-#include "xgboost/base.h"
-#include "xgboost/data.h"
-#include "xgboost/host_device_vector.h"
-#include "xgboost/learner.h"
-#include "xgboost/c_api.h"
-#include "xgboost/logging.h"
-#include "xgboost/version_config.h"
-#include "xgboost/json.h"
-#include "xgboost/global_config.h"
-#include "c_api_error.h"
-#include "c_api_utils.h"
 #include "../collective/communicator-inl.h"
-#include "../common/io.h"
 #include "../common/charconv.h"
+#include "../common/io.h"
 #include "../data/adapter.h"
 #include "../data/simple_dmatrix.h"
+#include "c_api_error.h"
+#include "c_api_utils.h"
+#include "xgboost/base.h"
+#include "xgboost/data.h"
+#include "xgboost/global_config.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/json.h"
+#include "xgboost/learner.h"
+#include "xgboost/logging.h"
+#include "xgboost/string_view.h"  // StringView
+#include "xgboost/version_config.h"

 #if defined(XGBOOST_USE_FEDERATED)
 #include "../../plugin/federated/federated_server.h"
@@ -58,6 +59,13 @@
 }  // namespace xgboost
 #endif

+namespace {
+void DeprecatedFunc(StringView old, StringView since, StringView replacement) {
+  LOG(WARNING) << "`" << old << "` is deprecated since" << since << ", use `" << replacement
+               << "` instead.";
+}
+}  // anonymous namespace
+
 XGB_DLL int XGBuildInfo(char const **out) {
   API_BEGIN();
   xgboost_CHECK_C_ARG_PTR(out);
@@ -298,7 +306,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
                                                       int nthread, int max_bin,
                                                       DMatrixHandle *out) {
   API_BEGIN();
-  LOG(WARNING) << __func__ << " is deprecated. Use `XGQuantileDMatrixCreateFromCallback` instead.";
+  DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
   *out = new std::shared_ptr<xgboost::DMatrix>{
       xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)};
   API_END();
@@ -398,14 +406,11 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
 // End Create from data iterator

-XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
-                                     const unsigned* indices,
-                                     const bst_float* data,
-                                     size_t nindptr,
-                                     size_t nelem,
-                                     size_t num_col,
-                                     DMatrixHandle* out) {
+XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices,
+                                     const bst_float *data, size_t nindptr, size_t nelem,
+                                     size_t num_col, DMatrixHandle *out) {
   API_BEGIN();
+  DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
   data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col);
   *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
   API_END();
@@ -443,14 +448,29 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
   API_END();
 }

-XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
-                                     const unsigned* indices,
-                                     const bst_float* data,
-                                     size_t nindptr,
-                                     size_t,
-                                     size_t num_row,
-                                     DMatrixHandle* out) {
+XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
+                                   xgboost::bst_ulong nrow, char const *c_json_config,
+                                   DMatrixHandle *out) {
   API_BEGIN();
+  xgboost_CHECK_C_ARG_PTR(indptr);
+  xgboost_CHECK_C_ARG_PTR(indices);
+  xgboost_CHECK_C_ARG_PTR(data);
+  data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};
+  xgboost_CHECK_C_ARG_PTR(c_json_config);
+  auto config = Json::Load(StringView{c_json_config});
+  float missing = GetMissing(config);
+  auto n_threads = OptionalArg<Integer, int64_t>(config, "nthread", common::OmpGetNumThreads(0));
+  xgboost_CHECK_C_ARG_PTR(out);
+  *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
+  API_END();
+}
+
+XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indices,
+                                     const bst_float *data, size_t nindptr, size_t, size_t num_row,
+                                     DMatrixHandle *out) {
+  API_BEGIN();
+  DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
   data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row);
   xgboost_CHECK_C_ARG_PTR(out);
   *out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
@@ -1203,8 +1223,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l
   raw_str.resize(0);
   common::MemoryBufferStream fo(&raw_str);

-  LOG(WARNING) << "`" << __func__
-               << "` is deprecated, please use `XGBoosterSaveModelToBuffer` instead.";
+  DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");

   learner->Configure();
   learner->SaveModel(&fo);

View File

@ -1,10 +1,11 @@
/*! /**
* Copyright (c) 2021-2022 by XGBoost Contributors * Copyright 2021-2023 by XGBoost Contributors
*/ */
#ifndef XGBOOST_C_API_C_API_UTILS_H_ #ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_ #define XGBOOST_C_API_C_API_UTILS_H_
#include <algorithm> #include <algorithm>
#include <cstddef>
#include <functional> #include <functional>
#include <memory> // std::shared_ptr #include <memory> // std::shared_ptr
#include <string> #include <string>
@ -14,6 +15,7 @@
#include "xgboost/data.h" // DMatrix #include "xgboost/data.h" // DMatrix
#include "xgboost/json.h" #include "xgboost/json.h"
#include "xgboost/learner.h" #include "xgboost/learner.h"
#include "xgboost/linalg.h" // ArrayInterfaceHandler
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView #include "xgboost/string_view.h" // StringView
@ -281,5 +283,55 @@ inline std::shared_ptr<DMatrix> CastDMatrixHandle(DMatrixHandle const handle) {
CHECK(p_m) << msg; CHECK(p_m) << msg;
return p_m; return p_m;
} }
namespace detail {
template <typename PtrT, typename I, typename T>
void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data,
std::size_t nindptr, std::string *indptr_str, std::string *indices_str,
std::string *data_str) {
auto ndata = static_cast<Integer::Int>(p_indptr[nindptr - 1]);
// Construct array interfaces
Json jindptr{Object{}};
Json jindices{Object{}};
Json jdata{Object{}};
CHECK(p_indptr);
jindptr["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indptr)}, Json{true}}};
jindptr["shape"] = std::vector<Json>{Json{nindptr}};
jindptr["version"] = Integer{3};
CHECK(p_indices);
jindices["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indices)}, Json{true}}};
jindices["shape"] = std::vector<Json>{Json{ndata}};
jindices["version"] = Integer{3};
CHECK(p_data);
jdata["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_data)}, Json{true}}};
jdata["shape"] = std::vector<Json>{Json{ndata}};
jdata["version"] = Integer{3};
std::string pindptr_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<PtrT>() + std::to_string(sizeof(PtrT));
std::string ind_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<I>() + std::to_string(sizeof(I));
std::string data_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<T>() + std::to_string(sizeof(T));
if (DMLC_LITTLE_ENDIAN) {
jindptr["typestr"] = String{"<" + pindptr_typestr};
jindices["typestr"] = String{"<" + ind_typestr};
jdata["typestr"] = String{"<" + data_typestr};
} else {
jindptr["typestr"] = String{">" + pindptr_typestr};
jindices["typestr"] = String{">" + ind_typestr};
jdata["typestr"] = String{">" + data_typestr};
}
Json::Dump(jindptr, indptr_str);
Json::Dump(jindices, indices_str);
Json::Dump(jdata, data_str);
}
} // namespace detail
} // namespace xgboost

#endif // XGBOOST_C_API_C_API_UTILS_H_
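As a concrete illustration of what `MakeSparseFromPtr` emits, here is a rough sketch (not part of the patch) that wraps raw CSC buffers. The buffer names and the include path are made up, and the exact `typestr` depends on the platform; the `<u8` shown assumes a 64-bit little-endian build.

#include <cstddef>
#include <string>
#include <vector>

#include "c_api_utils.h"  // xgboost::detail::MakeSparseFromPtr (path assumed)

std::string WrapIndptrExample() {
  std::vector<std::size_t> col_ptr{0, 2, 3};  // nindptr == 3, i.e. two columns
  std::vector<unsigned> row_idx{0, 1, 2};
  std::vector<float> values{1.f, 2.f, 3.f};

  std::string sindptr, sindices, svalues;
  xgboost::detail::MakeSparseFromPtr(col_ptr.data(), row_idx.data(), values.data(),
                                     col_ptr.size(), &sindptr, &sindices, &svalues);
  // sindptr now looks roughly like:
  //   {"data": [<address>, true], "shape": [3], "typestr": "<u8", "version": 3}
  // and the three strings can be forwarded to XGDMatrixCreateFromCSC/CSR.
  return sindptr;
}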

View File

@ -6,25 +6,25 @@
#define XGBOOST_DATA_ADAPTER_H_
#include <dmlc/data.h>

#include <algorithm>
#include <cstddef>  // std::size_t
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <utility>  // std::move
#include <vector>

#include "../c_api/c_api_error.h"
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/logging.h"
#include "xgboost/span.h"
#include "xgboost/string_view.h"

namespace xgboost {
namespace data {
@ -472,6 +472,84 @@ class CSCAdapter : public detail::SingleBatchDataIter<CSCAdapterBatch> {
size_t num_columns_;
};
class CSCArrayAdapterBatch : public detail::NoMetaInfo {
ArrayInterface<1> indptr_;
ArrayInterface<1> indices_;
ArrayInterface<1> values_;
bst_row_t n_rows_;
class Line {
std::size_t column_idx_;
ArrayInterface<1> row_idx_;
ArrayInterface<1> values_;
std::size_t offset_;
public:
Line(std::size_t idx, ArrayInterface<1> row_idx, ArrayInterface<1> values, std::size_t offset)
: column_idx_{idx},
row_idx_{std::move(row_idx)},
values_{std::move(values)},
offset_{offset} {}
std::size_t Size() const { return values_.Shape(0); }
COOTuple GetElement(std::size_t idx) const {
return {TypedIndex<std::size_t, 1>{row_idx_}(offset_ + idx), column_idx_,
values_(offset_ + idx)};
}
};
public:
static constexpr bool kIsRowMajor = false;
CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices,
ArrayInterface<1> values, bst_row_t n_rows)
: indptr_{std::move(indptr)},
indices_{std::move(indices)},
values_{std::move(values)},
n_rows_{n_rows} {}
std::size_t Size() const { return indptr_.n - 1; }
Line GetLine(std::size_t idx) const {
auto begin_no_stride = TypedIndex<std::size_t, 1>{indptr_}(idx);
auto end_no_stride = TypedIndex<std::size_t, 1>{indptr_}(idx + 1);
auto indices = indices_;
auto values = values_;
// Slice indices and values, stride remains unchanged since this is slicing by
// specific index.
auto offset = indices.strides[0] * begin_no_stride;
indices.shape[0] = end_no_stride - begin_no_stride;
values.shape[0] = end_no_stride - begin_no_stride;
return Line{idx, indices, values, offset};
}
};
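The batch is consumed column by column through `GetLine`; below is a rough sketch of such a walk (my illustration, not from the patch). It assumes `COOTuple` exposes a `value` member, as with the other adapter batches.

#include <cmath>    // std::isnan
#include <cstddef>  // std::size_t

// Count entries a SparsePage-style consumer would keep for a given `missing` value.
inline std::size_t CountPresent(xgboost::data::CSCArrayAdapterBatch const &batch, float missing) {
  std::size_t n = 0;
  for (std::size_t cidx = 0; cidx < batch.Size(); ++cidx) {  // one Line per column
    auto line = batch.GetLine(cidx);
    for (std::size_t k = 0; k < line.Size(); ++k) {
      auto elem = line.GetElement(k);  // COOTuple {row_idx, column_idx, value}
      if (!std::isnan(elem.value) && elem.value != missing) {
        ++n;
      }
    }
  }
  return n;
}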
/**
* \brief CSC adapter with support for array interface.
*/
class CSCArrayAdapter : public detail::SingleBatchDataIter<CSCArrayAdapterBatch> {
ArrayInterface<1> indptr_;
ArrayInterface<1> indices_;
ArrayInterface<1> values_;
size_t num_rows_;
CSCArrayAdapterBatch batch_;
public:
CSCArrayAdapter(StringView indptr, StringView indices, StringView values, std::size_t num_rows)
: indptr_{indptr},
indices_{indices},
values_{values},
num_rows_{num_rows},
batch_{
CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast<bst_row_t>(num_rows_)}} {}
// JVM package sends 0 as unknown
size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; }
size_t NumColumns() const { return indptr_.n - 1; }
const CSCArrayAdapterBatch& Value() const override { return batch_; }
};
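To connect the adapter back to the C API at the top of this diff, the following sketch (illustrative only; the function name and include paths are hypothetical) builds a DMatrix from three array-interface JSON strings much as `XGDMatrixCreateFromCSC` does, using NaN as the missing value and a single thread.

#include <cstddef>
#include <limits>
#include <memory>
#include <string>

#include "adapter.h"              // xgboost::data::CSCArrayAdapter (path assumed)
#include "xgboost/data.h"         // xgboost::DMatrix
#include "xgboost/string_view.h"  // xgboost::StringView

// jindptr/jindices/jvalues must be valid 1-D array-interface documents.
std::shared_ptr<xgboost::DMatrix> FromCSCStrings(std::string const &jindptr,
                                                 std::string const &jindices,
                                                 std::string const &jvalues,
                                                 std::size_t n_rows) {
  xgboost::data::CSCArrayAdapter adapter{xgboost::StringView{jindptr},
                                         xgboost::StringView{jindices},
                                         xgboost::StringView{jvalues}, n_rows};
  float missing = std::numeric_limits<float>::quiet_NaN();
  return std::shared_ptr<xgboost::DMatrix>{
      xgboost::DMatrix::Create(&adapter, missing, /*nthread=*/1)};
}

Note that `NumRows()` treats a row count of zero as unknown, which is how the JVM bindings signal an unspecified shape (see the comment above).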
class DataTableAdapterBatch : public detail::NoMetaInfo {
enum class DTType : std::uint8_t {
kFloat32 = 0,

View File

@ -945,30 +945,32 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const st
return new data::SimpleDMatrix(adapter, missing, nthread);
}

template DMatrix* DMatrix::Create<data::DenseAdapter>(data::DenseAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(data::ArrayAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSRAdapter>(data::CSRAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSCAdapter>(data::CSCAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::DataTableAdapter>(data::DataTableAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::FileAdapter>(data::FileAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSRArrayAdapter>(data::CSRArrayAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::CSCArrayAdapter>(data::CSCArrayAdapter* adapter, float missing, std::int32_t nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create(data::IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>* adapter, float missing, int nthread, const std::string& cache_prefix);
template DMatrix* DMatrix::Create<data::RecordBatchesIterAdapter>(data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&);
@ -1221,20 +1223,19 @@ void SparsePage::PushCSC(const SparsePage &batch) {
self_offset = std::move(offset);
}

template uint64_t SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::CSCArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread);
template uint64_t SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread);

namespace data {

View File

@ -1,23 +1,23 @@
/**
 * Copyright 2014~2023 by XGBoost Contributors
 * \file simple_dmatrix.cc
 * \brief the input data structure for gradient boosting
 * \author Tianqi Chen
 */
#include "simple_dmatrix.h"

#include <algorithm>
#include <limits>
#include <type_traits>
#include <vector>

#include "../common/random.h"
#include "../common/threading_utils.h"
#include "./simple_batch_iterator.h"
#include "adapter.h"
#include "gradient_index.h"
#include "xgboost/c_api.h"
#include "xgboost/data.h"

namespace xgboost {
namespace data {
@ -229,7 +229,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
offset_vec.emplace_back(offset_vec.back());
}
} else {
CHECK((std::is_same<AdapterT, CSCAdapter>::value ||
std::is_same<AdapterT, CSCArrayAdapter>::value))
<< "Expecting CSCAdapter";
info_.num_row_ = offset_vec.size() - 1;
}
} else {
@ -267,20 +269,14 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
fo->Write(sparse_page_->data.HostVector());
}

template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread);
template SimpleDMatrix::SimpleDMatrix(
IteratorAdapter<DataIterHandle, XGBCallbackDataIterNext, XGBoostBatchCSR>
*adapter,

View File

@ -8,6 +8,7 @@
#include "../../../src/common/hist_util.h" #include "../../../src/common/hist_util.h"
#include "../../../src/common/quantile.h" #include "../../../src/common/quantile.h"
#include "../../../src/data/adapter.h" #include "../../../src/data/adapter.h"
#include "xgboost/context.h"
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@ -183,7 +184,7 @@ void TestSameOnAllWorkers() {
}
auto m = RandomDataGenerator{kRows, kCols, 0}
.Device(Context::kCpuId)
.Type(ft)
.MaxCategory(17)
.Seed(rank + seed)

View File

@ -82,10 +82,6 @@ class TestDMatrix:
assert len(record) == 0
with pytest.warns(UserWarning):
csr = csr_matrix(x)
xgb.DMatrix(csr.tocsc(), y, missing=4)
def test_dmatrix_numpy_init(self):
data = np.random.randn(5, 5)
dm = xgb.DMatrix(data)
@ -130,6 +126,12 @@ class TestDMatrix:
assert dtrain.num_row() == 3
assert dtrain.num_col() == 3
indptr = np.array([0, 3, 5])
data = np.array([0, 1, 2, 3, 4])
row_idx = np.array([0, 1, 2, 0, 2])
X = scipy.sparse.csc_matrix((data, row_idx, indptr), shape=(3, 2))
assert tm.predictor_equal(xgb.DMatrix(X.tocsr()), xgb.DMatrix(X))
def test_coo(self):
row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])