From c1786849e37e12fa69c2a86f2aff7ba77b5ee178 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 5 Feb 2023 01:59:46 +0800 Subject: [PATCH] Use array interface for CSC matrix. (#8672) * Use array interface for CSC matrix. Use array interface for CSC matrix and align the interface with CSR and dense. - Fix nthread issue in the R package DMatrix. - Unify the behavior of handling `missing` with other inputs. - Unify the behavior of handling `missing` around R, Python, Java, and Scala DMatrix. - Expose `num_non_missing` to the JVM interface. - Deprecate old CSR and CSC constructors. --- R-package/R/xgb.DMatrix.R | 24 ++- R-package/src/init.c | 8 +- R-package/src/xgboost_R.cc | 111 ++++++------- R-package/src/xgboost_R.h | 6 +- R-package/tests/testthat/test_dmatrix.R | 12 +- demo/c-api/basic/c-api-demo.c | 81 +++++++--- include/xgboost/c_api.h | 63 ++++---- .../java/ml/dmlc/xgboost4j/java/DMatrix.java | 45 +++--- .../ml/dmlc/xgboost4j/java/XGBoostJNI.java | 15 +- .../ml/dmlc/xgboost4j/scala/DMatrix.scala | 36 ++++- .../xgboost4j/src/native/xgboost4j.cpp | 149 ++++++++++++------ jvm-packages/xgboost4j/src/native/xgboost4j.h | 58 ++++--- .../dmlc/xgboost4j/scala/DMatrixSuite.scala | 8 +- python-package/xgboost/core.py | 4 +- python-package/xgboost/data.py | 53 ++++--- python-package/xgboost/sklearn.py | 7 +- src/c_api/c_api.cc | 83 ++++++---- src/c_api/c_api_utils.h | 56 ++++++- src/data/adapter.h | 98 ++++++++++-- src/data/data.cc | 79 +++++----- src/data/simple_dmatrix.cc | 44 +++--- tests/cpp/common/test_quantile.cc | 3 +- tests/python/test_dmatrix.py | 10 +- 23 files changed, 673 insertions(+), 380 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 0a2c45dbf..20aab5335 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -36,19 +36,37 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre cnames <- colnames(data) } else if (inherits(data, "dgCMatrix")) { handle <- .Call( - XGDMatrixCreateFromCSC_R, data@p, data@i, data@x, nrow(data), as.integer(NVL(nthread, -1)) + XGDMatrixCreateFromCSC_R, + data@p, + data@i, + data@x, + nrow(data), + missing, + as.integer(NVL(nthread, -1)) ) cnames <- colnames(data) } else if (inherits(data, "dgRMatrix")) { handle <- .Call( - XGDMatrixCreateFromCSR_R, data@p, data@j, data@x, ncol(data), as.integer(NVL(nthread, -1)) + XGDMatrixCreateFromCSR_R, + data@p, + data@j, + data@x, + ncol(data), + missing, + as.integer(NVL(nthread, -1)) ) cnames <- colnames(data) } else if (inherits(data, "dsparseVector")) { indptr <- c(0L, as.integer(length(data@i))) ind <- as.integer(data@i) - 1L handle <- .Call( - XGDMatrixCreateFromCSR_R, indptr, ind, data@x, length(data), as.integer(NVL(nthread, -1)) + XGDMatrixCreateFromCSR_R, + indptr, + ind, + data@x, + length(data), + missing, + as.integer(NVL(nthread, -1)) ) } else { stop("xgb.DMatrix does not support construction from ", typeof(data)) diff --git a/R-package/src/init.c b/R-package/src/init.c index b0dce8959..583dc7e32 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -36,8 +36,8 @@ extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterUpdateOneIter_R(SEXP, SEXP, SEXP); extern SEXP XGCheckNullPtr_R(SEXP); -extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP); extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixGetInfo_R(SEXP, SEXP); @@ -73,8 +73,8 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3}, {"XGBoosterUpdateOneIter_R", (DL_FUNC) &XGBoosterUpdateOneIter_R, 3}, {"XGCheckNullPtr_R", (DL_FUNC) &XGCheckNullPtr_R, 1}, - {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 5}, - {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 5}, + {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6}, + {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6}, {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2}, {"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3}, {"XGDMatrixGetInfo_R", (DL_FUNC) &XGDMatrixGetInfo_R, 2}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 990274100..805e63a32 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -16,10 +16,11 @@ #include #include "../../src/c_api/c_api_error.h" +#include "../../src/c_api/c_api_utils.h" // MakeSparseFromPtr #include "../../src/common/threading_utils.h" - -#include "./xgboost_R.h" // Must follow other include. +#include "./xgboost_R.h" // Must follow other includes. +#include "Rinternals.h" /*! * \brief macro to annotate begin of api @@ -134,34 +135,47 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { return ret; } -XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, - SEXP num_row, SEXP n_threads) { - SEXP ret; - R_API_BEGIN(); +namespace { +void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str, + std::string *indices_str, std::string *data_str) { const int *p_indptr = INTEGER(indptr); const int *p_indices = INTEGER(indices); const double *p_data = REAL(data); - size_t nindptr = static_cast(length(indptr)); - size_t ndata = static_cast(length(data)); - size_t nrow = static_cast(INTEGER(num_row)[0]); - std::vector col_ptr_(nindptr); - std::vector indices_(ndata); - std::vector data_(ndata); - for (size_t i = 0; i < nindptr; ++i) { - col_ptr_[i] = static_cast(p_indptr[i]); - } - xgboost::Context ctx; - ctx.nthread = asInteger(n_threads); - xgboost::common::ParallelFor(ndata, ctx.Threads(), [&](xgboost::omp_ulong i) { - indices_[i] = static_cast(p_indices[i]); - data_[i] = static_cast(p_data[i]); - }); + auto nindptr = static_cast(length(indptr)); + auto ndata = static_cast(length(data)); + CHECK_EQ(ndata, p_indptr[nindptr - 1]); + xgboost::detail::MakeSparseFromPtr(p_indptr, p_indices, p_data, nindptr, indptr_str, indices_str, + data_str); +} +} // namespace + +XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row, + SEXP missing, SEXP n_threads) { + SEXP ret; + R_API_BEGIN(); + std::int32_t threads = asInteger(n_threads); + + using xgboost::Integer; + using xgboost::Json; + using xgboost::Object; + + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + auto nrow = static_cast(INTEGER(num_row)[0]); + DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromCSCEx(BeginPtr(col_ptr_), BeginPtr(indices_), - BeginPtr(data_), nindptr, ndata, - nrow, &handle)); + Json jconfig{Object{}}; + // Construct configuration + jconfig["nthread"] = Integer{threads}; + jconfig["missing"] = xgboost::Number{asReal(missing)}; + std::string config; + Json::Dump(jconfig, &config); + CHECK_CALL(XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, + config.c_str(), &handle)); + ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); + R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); UNPROTECT(1); @@ -169,64 +183,27 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, } XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col, - SEXP n_threads) { + SEXP missing, SEXP n_threads) { SEXP ret; R_API_BEGIN(); - const int *p_indptr = INTEGER(indptr); - const int *p_indices = INTEGER(indices); - const double *p_data = REAL(data); - - auto nindptr = static_cast(length(indptr)); - auto ndata = static_cast(length(data)); - auto ncol = static_cast(INTEGER(num_col)[0]); std::int32_t threads = asInteger(n_threads); - using xgboost::Array; using xgboost::Integer; using xgboost::Json; using xgboost::Object; - using xgboost::String; - // Construct array interfaces - Json jindptr{Object{}}; - Json jindices{Object{}}; - Json jdata{Object{}}; - jindptr["data"] = - Array{std::vector{Json{reinterpret_cast(p_indptr)}, Json{true}}}; - jindptr["shape"] = std::vector{Json{nindptr}}; - jindptr["version"] = Integer{3}; - jindices["data"] = - Array{std::vector{Json{reinterpret_cast(p_indices)}, Json{true}}}; - jindices["shape"] = std::vector{Json{ndata}}; - jindices["version"] = Integer{3}; - - jdata["data"] = - Array{std::vector{Json{reinterpret_cast(p_data)}, Json{true}}}; - jdata["shape"] = std::vector{Json{ndata}}; - jdata["version"] = Integer{3}; - - if (DMLC_LITTLE_ENDIAN) { - jindptr["typestr"] = String{"i4"}; - jindices["typestr"] = String{">i4"}; - jdata["typestr"] = String{">f8"}; - } - std::string indptr, indices, data; - Json::Dump(jindptr, &indptr); - Json::Dump(jindices, &indices); - Json::Dump(jdata, &data); + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + auto ncol = static_cast(INTEGER(num_col)[0]); DMatrixHandle handle; Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{std::numeric_limits::quiet_NaN()}; + jconfig["missing"] = xgboost::Number{asReal(missing)}; std::string config; Json::Dump(jconfig, &config); - CHECK_CALL(XGDMatrixCreateFromCSR(indptr.c_str(), indices.c_str(), data.c_str(), ncol, + CHECK_CALL(XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, config.c_str(), &handle)); ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue)); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 766ef53e8..45a43a5bd 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -59,11 +59,12 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, * \param indices row indices * \param data content of the data * \param num_row numer of rows (when it's set to 0, then guess from data) + * \param missing which value to represent missing value * \param n_threads Number of threads used to construct DMatrix from csc matrix. * \return created dmatrix */ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row, - SEXP n_threads); + SEXP missing, SEXP n_threads); /*! * \brief create a matrix content from CSR format @@ -71,11 +72,12 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP * \param indices column indices * \param data content of the data * \param num_col numer of columns (when it's set to 0, then guess from data) + * \param missing which value to represent missing value * \param n_threads Number of threads used to construct DMatrix from csr matrix. * \return created dmatrix */ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col, - SEXP n_threads); + SEXP missing, SEXP n_threads); /*! * \brief create a new dmatrix from sliced content of existing matrix diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 913ccca82..1d8cb0f23 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -22,20 +22,20 @@ test_that("xgb.DMatrix: basic construction", { n_samples <- 100 X <- cbind( - x1 = rnorm(n_samples), - x2 = rnorm(n_samples), - x3 = rnorm(n_samples) + x1 = sample(x = 4, size = n_samples, replace = TRUE), + x2 = sample(x = 4, size = n_samples, replace = TRUE), + x3 = sample(x = 4, size = n_samples, replace = TRUE) ) X <- matrix(X, nrow = n_samples) y <- rbinom(n = n_samples, size = 1, prob = 1 / 2) - fd <- xgb.DMatrix(X, label = y) + fd <- xgb.DMatrix(X, label = y, missing = 1) dgc <- as(X, "dgCMatrix") - fdgc <- xgb.DMatrix(dgc, label = y) + fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0) dgr <- as(X, "dgRMatrix") - fdgr <- xgb.DMatrix(dgr, label = y) + fdgr <- xgb.DMatrix(dgr, label = y, missing = 1) params <- list(tree_method = "hist") bst_fd <- xgb.train( diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c index aee6285f9..ca6e689aa 100644 --- a/demo/c-api/basic/c-api-demo.c +++ b/demo/c-api/basic/c-api-demo.c @@ -1,13 +1,16 @@ -/*! - * Copyright 2019 XGBoost contributors +/** + * Copyright 2019-2023 by XGBoost contributors * * \file c-api-demo.c * \brief A simple example of using xgboost C API. */ #include +#include +#include /* uint32_t,uint64_t */ #include #include +#include #include #define safe_xgboost(call) { \ @@ -18,6 +21,21 @@ if (err != 0) { \ } \ } +/* Make Json encoded array interface. */ +static void MakeArrayInterface(size_t data, size_t n, char const* typestr, size_t length, + char* out) { + static char const kTemplate[] = + "{\"data\": [%lu, true], \"shape\": [%lu, %lu], \"typestr\": \"%s\", \"version\": 3}"; + memset(out, '\0', length); + sprintf(out, kTemplate, data, n, 1ul, typestr); +} +/* Make Json encoded DMatrix configuration. */ +static void MakeConfig(int n_threads, size_t length, char* out) { + static char const kTemplate[] = "{\"missing\": NaN, \"nthread\": %d}"; + memset(out, '\0', length); + sprintf(out, kTemplate, n_threads); +} + int main() { int silent = 0; int use_gpu = 0; // set to 1 to use the GPU for training @@ -121,17 +139,27 @@ int main() { } { - printf("Sparse Matrix Example (XGDMatrixCreateFromCSREx): "); + printf("Sparse Matrix Example (XGDMatrixCreateFromCSR): "); - const size_t indptr[] = {0, 22}; - const unsigned indices[] = {1, 9, 19, 21, 24, 34, 36, 39, 42, 53, 56, 65, - 69, 77, 86, 88, 92, 95, 102, 106, 117, 122}; + const uint64_t indptr[] = {0, 22}; + const uint32_t indices[] = {1, 9, 19, 21, 24, 34, 36, 39, 42, 53, 56, + 65, 69, 77, 86, 88, 92, 95, 102, 106, 117, 122}; const float data[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; DMatrixHandle dmat; - safe_xgboost(XGDMatrixCreateFromCSREx(indptr, indices, data, 2, 22, 127, - &dmat)); + char j_indptr[128]; + MakeArrayInterface((size_t)indptr, 2ul, " iter, String cache_info, long[] out); - public final static native int XGDMatrixCreateFromCSREx(long[] indptr, int[] indices, float[] data, - int shapeParam, long[] out); + public final static native int XGDMatrixCreateFromCSR(long[] indptr, int[] indices, + float[] data, int shapeParam, + float missing, int nthread, + long[] out); - public final static native int XGDMatrixCreateFromCSCEx(long[] colptr, int[] indices, float[] data, - int shapeParam, long[] out); + public final static native int XGDMatrixCreateFromCSC(long[] colptr, int[] indices, + float[] data, int shapeParam, + float missing, int nthread, + long[] out); public final static native int XGDMatrixCreateFromMat(float[] data, int nrow, int ncol, float missing, long[] out); @@ -96,6 +100,7 @@ class XGBoostJNI { long[] outLength, String[][] outValues); public final static native int XGDMatrixNumRow(long handle, long[] row); + public final static native int XGDMatrixNumNonMissing(long handle, long[] nonMissings); public final static native int XGBoosterCreate(long[] handles, long[] out); diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala index 2c7dd2b2c..9269f3fde 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014,2021 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { @throws(classOf[XGBoostError]) @deprecated def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType) { - this(new JDMatrix(headers, indices, data, st)) + this(new JDMatrix(headers, indices, data, st, 0, Float.NaN, -1)) } /** @@ -70,7 +70,25 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { @throws(classOf[XGBoostError]) def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType, shapeParam: Int) { - this(new JDMatrix(headers, indices, data, st, shapeParam)) + this(new JDMatrix(headers, indices, data, st, shapeParam, Float.NaN, -1)) + } + + /** + * create DMatrix from sparse matrix + * + * @param headers index to headers (rowHeaders for CSR or colHeaders for CSC) + * @param indices Indices (colIndexs for CSR or rowIndexs for CSC) + * @param data non zero values (sequence by row for CSR or by col for CSC) + * @param st sparse matrix type (CSR or CSC) + * @param shapeParam when st is CSR, it specifies the column number, otherwise it is taken as + * row number + * @param missing missing value + * @param nthread The number of threads used for constructing DMatrix + */ + @throws(classOf[XGBoostError]) + def this(headers: Array[Long], indices: Array[Int], data: Array[Float], st: JDMatrix.SparseType, + shapeParam: Int, missing: Float, nthread: Int) { + this(new JDMatrix(headers, indices, data, st, shapeParam, missing, nthread)) } /** @@ -78,7 +96,7 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { * @param columnBatch the XGBoost ColumnBatch to provide the cuda array interface * of feature columns * @param missing missing value - * @param nthread threads number + * @param nthread The number of threads used for constructing DMatrix */ @throws(classOf[XGBoostError]) def this(columnBatch: ColumnBatch, missing: Float, nthread: Int) { @@ -246,6 +264,16 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { jDMatrix.rowNum } + /** + * Get the number of non-missing values of DMatrix. + * + * @return The number of non-missing values + */ + @throws(classOf[XGBoostError]) + def nonMissingNum: Long = { + jDMatrix.nonMissingNum + } + /** * save DMatrix to filePath * diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index 5ca2dc42d..141ec51bc 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -1,5 +1,5 @@ -/* - Copyright (c) 2014-2022 by Contributors +/** + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -12,18 +12,23 @@ limitations under the License. */ +#include "./xgboost4j.h" + +#include +#include +#include +#include +#include + #include #include -#include -#include -#include -#include -#include -#include -#include "./xgboost4j.h" #include -#include +#include #include +#include +#include + +#include "../../../src/c_api/c_api_utils.h" #define JVM_CHECK_CALL(__expr) \ { \ @@ -219,58 +224,89 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro return ret; } -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixCreateFromCSREx - * Signature: ([J[I[FI[J)I +namespace { +/** + * \brief Create from sparse matrix. + * + * \param maker Indirect call to XGBoost C function for creating CSC and CSR. + * + * \return Status */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx - (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol, jlongArray jout) { +template +jint MakeJVMSparseInput(JNIEnv *jenv, jlongArray jindptr, jintArray jindices, jfloatArray jdata, + jfloat jmissing, jint jnthread, Fn &&maker, jlongArray jout) { DMatrixHandle result; - jlong* indptr = jenv->GetLongArrayElements(jindptr, 0); - jint* indices = jenv->GetIntArrayElements(jindices, 0); - jfloat* data = jenv->GetFloatArrayElements(jdata, 0); - bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr); - bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata); - jint ret = (jint) XGDMatrixCreateFromCSREx((size_t const *)indptr, - (unsigned int const *)indices, - (float const *)data, - nindptr, nelem, jcol, &result); + + jlong *indptr = jenv->GetLongArrayElements(jindptr, nullptr); + jint *indices = jenv->GetIntArrayElements(jindices, nullptr); + jfloat *data = jenv->GetFloatArrayElements(jdata, nullptr); + bst_ulong nindptr = static_cast(jenv->GetArrayLength(jindptr)); + bst_ulong nelem = static_cast(jenv->GetArrayLength(jdata)); + + std::string sindptr, sindices, sdata; + CHECK_EQ(indptr[nindptr - 1], nelem); + using IndPtrT = std::conditional_t::value, long, long long>; + using IndT = + std::conditional_t::value, std::int32_t, long>; + xgboost::detail::MakeSparseFromPtr( + static_cast(indptr), static_cast(indices), + static_cast(data), nindptr, &sindptr, &sindices, &sdata); + + xgboost::Json jconfig{xgboost::Object{}}; + auto missing = static_cast(jmissing); + auto n_threads = static_cast(jnthread); + // Construct configuration + jconfig["nthread"] = xgboost::Integer{n_threads}; + jconfig["missing"] = xgboost::Number{missing}; + std::string config; + xgboost::Json::Dump(jconfig, &config); + + jint ret = maker(sindptr.c_str(), sindices.c_str(), sdata.c_str(), config.c_str(), &result); JVM_CHECK_CALL(ret); setHandle(jenv, jout, result); - //Release + + // Release jenv->ReleaseLongArrayElements(jindptr, indptr, 0); jenv->ReleaseIntArrayElements(jindices, indices, 0); jenv->ReleaseFloatArrayElements(jdata, data, 0); return ret; } +} // anonymous namespace /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixCreateFromCSCEx - * Signature: ([J[I[FI[J)I + * Method: XGDMatrixCreateFromCSR + * Signature: ([J[I[FIFI[J)I */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx - (JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow, jlongArray jout) { - DMatrixHandle result; - jlong* indptr = jenv->GetLongArrayElements(jindptr, NULL); - jint* indices = jenv->GetIntArrayElements(jindices, 0); - jfloat* data = jenv->GetFloatArrayElements(jdata, NULL); - bst_ulong nindptr = (bst_ulong)jenv->GetArrayLength(jindptr); - bst_ulong nelem = (bst_ulong)jenv->GetArrayLength(jdata); +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR( + JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jcol, + jfloat jmissing, jint jnthread, jlongArray jout) { + using CSTR = char const *; + return MakeJVMSparseInput( + jenv, jindptr, jindices, jdata, jmissing, jnthread, + [&](CSTR sindptr, CSTR sindices, CSTR sdata, CSTR sconfig, DMatrixHandle *result) { + return XGDMatrixCreateFromCSR(sindptr, sindices, sdata, static_cast(jcol), + sconfig, result); + }, + jout); +} - jint ret = (jint) XGDMatrixCreateFromCSCEx((size_t const *)indptr, - (unsigned int const *)indices, - (float const *)data, - nindptr, nelem, jrow, &result); - JVM_CHECK_CALL(ret); - setHandle(jenv, jout, result); - //release - jenv->ReleaseLongArrayElements(jindptr, indptr, 0); - jenv->ReleaseIntArrayElements(jindices, indices, 0); - jenv->ReleaseFloatArrayElements(jdata, data, 0); - - return ret; +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixCreateFromCSC + * Signature: ([J[I[FIFI[J)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC( + JNIEnv *jenv, jclass jcls, jlongArray jindptr, jintArray jindices, jfloatArray jdata, jint jrow, + jfloat jmissing, jint jnthread, jlongArray jout) { + using CSTR = char const *; + return MakeJVMSparseInput( + jenv, jindptr, jindices, jdata, jmissing, jnthread, + [&](CSTR sindptr, CSTR sindices, CSTR sdata, CSTR sconfig, DMatrixHandle *result) { + return XGDMatrixCreateFromCSC(sindptr, sindices, sdata, static_cast(jrow), + sconfig, result); + }, + jout); } /* @@ -459,6 +495,23 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow return ret; } +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixNumNonMissing + * Signature: (J[J)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing( + JNIEnv *jenv, jclass, jlong jhandle, jlongArray jout) { + DMatrixHandle handle = reinterpret_cast(jhandle); + CHECK(handle); + bst_ulong result[1]; + auto ret = static_cast(XGDMatrixNumNonMissing(handle, result)); + jlong jresult[1]{static_cast(result[0])}; + jenv->SetLongArrayRegion(jout, 0, 1, jresult); + JVM_CHECK_CALL(ret); + return ret; +} + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterCreate diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h index adc5e814c..75dcd4b77 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.h +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h @@ -33,19 +33,19 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixCreateFromCSREx - * Signature: ([J[I[FI[J)I + * Method: XGDMatrixCreateFromCSR + * Signature: ([J[I[FIFI[J)I */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSREx - (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray); +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSR + (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray); /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixCreateFromCSCEx - * Signature: ([J[I[FI[J)I + * Method: XGDMatrixCreateFromCSC + * Signature: ([J[I[FIFI[J)I */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSCEx - (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jlongArray); +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromCSC + (JNIEnv *, jclass, jlongArray, jintArray, jfloatArray, jint, jfloat, jint, jlongArray); /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI @@ -119,6 +119,22 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetFloatI JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntInfo (JNIEnv *, jclass, jlong, jstring, jobjectArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixSetStrFeatureInfo + * Signature: (JLjava/lang/String;[Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo + (JNIEnv *, jclass, jlong, jstring, jobjectArray); + +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixGetStrFeatureInfo + * Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo + (JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray); + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGDMatrixNumRow @@ -127,6 +143,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetUIntIn JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumRow (JNIEnv *, jclass, jlong, jlongArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGDMatrixNumNonMissing + * Signature: (J[J)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixNumNonMissing + (JNIEnv *, jclass, jlong, jlongArray); + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterCreate @@ -351,7 +375,7 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDeviceQuantileDM JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixCreateFromCallback (JNIEnv *, jclass, jobject, jobject, jstring, jlongArray); - /* +/* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGDMatrixCreateFromArrayInterfaceColumns * Signature: (Ljava/lang/String;FI[J)I @@ -359,22 +383,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixC JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromArrayInterfaceColumns (JNIEnv *, jclass, jstring, jfloat, jint, jlongArray); -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixSetStrFeatureInfo - * Signature: (JLjava/lang/String;[Ljava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetStrFeatureInfo - (JNIEnv *, jclass, jlong, jstring, jobjectArray); - -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDMatrixGetStrFeatureInfo - * Signature: (JLjava/lang/String;[J[[Ljava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFeatureInfo - (JNIEnv *, jclass, jlong, jstring, jlongArray, jobjectArray); - #ifdef __cplusplus } #endif diff --git a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala index 50bc1a548..05200f49e 100644 --- a/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala +++ b/jvm-packages/xgboost4j/src/test/scala/ml/dmlc/xgboost4j/scala/DMatrixSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,6 +54,9 @@ class DMatrixSuite extends FunSuite { dmat1.setLabel(label1) val label2 = dmat1.getLabel assert(label2 === label1) + + val dmat2 = new DMatrix(rowHeaders, colIndex, data, JDMatrix.SparseType.CSR, 5, 1.0f, -1) + assert(dmat2.nonMissingNum === 9); } test("create DMatrix from CSREx") { @@ -94,6 +97,9 @@ class DMatrixSuite extends FunSuite { dmat1.setLabel(label1) val label2 = dmat1.getLabel assert(label2 === label1) + + val dmat2 = new DMatrix(colHeaders, rowIndex, data, JDMatrix.SparseType.CSC, 5, 1.0f, -1) + assert(dmat2.nonMissingNum === 9); } test("create DMatrix from CSCEx") { diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 011afbb9e..c64737957 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2311,9 +2311,9 @@ class Booster: ) return _prediction_output(shape, dims, preds, False) if isinstance(data, scipy.sparse.csr_matrix): - from .data import _transform_scipy_csr + from .data import transform_scipy_sparse - data = _transform_scipy_csr(data) + data = transform_scipy_sparse(data, True) _check_call( _LIB.XGBoosterPredictFromCSR( self.handle, diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index db7fdd960..3cfa84395 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -28,7 +28,6 @@ from .core import ( _check_call, _cuda_array_interface, _ProxyDMatrix, - c_array, c_str, from_pystr_to_cstr, make_jcargs, @@ -76,8 +75,15 @@ def _array_interface(data: np.ndarray) -> bytes: return interface_str -def _transform_scipy_csr(data: DataType) -> DataType: - from scipy.sparse import csr_matrix +def transform_scipy_sparse(data: DataType, is_csr: bool) -> DataType: + """Ensure correct data alignment and data type for scipy sparse inputs. Input should + be either csr or csc matrix. + + """ + from scipy.sparse import csc_matrix, csr_matrix + + if len(data.indices) != len(data.data): + raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}") indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype) indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype) @@ -87,7 +93,10 @@ def _transform_scipy_csr(data: DataType) -> DataType: or indices is not data.indices or values is not data.data ): - data = csr_matrix((values, indices, indptr), shape=data.shape) + if is_csr: + data = csr_matrix((values, indices, indptr), shape=data.shape) + else: + data = csc_matrix((values, indices, indptr), shape=data.shape) return data @@ -99,12 +108,8 @@ def _from_scipy_csr( feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: """Initialize data from a CSR matrix.""" - if len(data.indices) != len(data.data): - raise ValueError( - f"length mismatch: {len(data.indices)} vs {len(data.data)}" - ) handle = ctypes.c_void_p() - data = _transform_scipy_csr(data) + data = transform_scipy_sparse(data, True) _check_call( _LIB.XGDMatrixCreateFromCSR( _array_interface(data.indptr), @@ -128,22 +133,24 @@ def _is_scipy_csc(data: DataType) -> bool: def _from_scipy_csc( data: DataType, - missing: Optional[FloatCompatible], + missing: FloatCompatible, + nthread: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: - if len(data.indices) != len(data.data): - raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}") - _warn_unused_missing(data, missing) + """Initialize data from a CSC matrix.""" handle = ctypes.c_void_p() - _check_call(_LIB.XGDMatrixCreateFromCSCEx( - c_array(ctypes.c_size_t, data.indptr), - c_array(ctypes.c_uint, data.indices), - c_array(ctypes.c_float, data.data), - ctypes.c_size_t(len(data.indptr)), - ctypes.c_size_t(len(data.data)), - ctypes.c_size_t(data.shape[0]), - ctypes.byref(handle))) + transform_scipy_sparse(data, False) + _check_call( + _LIB.XGDMatrixCreateFromCSC( + _array_interface(data.indptr), + _array_interface(data.indices), + _array_interface(data.data), + c_bst_ulong(data.shape[0]), + make_jcargs(missing=float(missing), nthread=int(nthread)), + ctypes.byref(handle), + ) + ) return handle, feature_names, feature_types @@ -1032,7 +1039,7 @@ def dispatch_data_backend( if _is_scipy_csr(data): return _from_scipy_csr(data, missing, threads, feature_names, feature_types) if _is_scipy_csc(data): - return _from_scipy_csc(data, missing, feature_names, feature_types) + return _from_scipy_csc(data, missing, threads, feature_names, feature_types) if _is_scipy_coo(data): return _from_scipy_csr( data.tocsr(), missing, threads, feature_names, feature_types @@ -1288,7 +1295,7 @@ def _proxy_transform( data, _ = _ensure_np_dtype(data, data.dtype) return data, None, feature_names, feature_types if _is_scipy_csr(data): - data = _transform_scipy_csr(data) + data = transform_scipy_sparse(data, True) return data, None, feature_names, feature_types if _is_pandas_series(data): import pandas as pd diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index e1f8e087d..946c87235 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -112,8 +112,8 @@ def _objective_decorator( def _metric_decorator(func: Callable) -> Metric: """Decorate a metric function from sklearn. - Converts an metric function that uses the typical sklearn metric signature so that it - is compatible with :py:func:`train` + Converts an metric function that uses the typical sklearn metric signature so that + it is compatible with :py:func:`train` """ @@ -122,8 +122,7 @@ def _metric_decorator(func: Callable) -> Metric: weight = dmatrix.get_weight() if weight.size == 0: return func.__name__, func(y_true, y_score) - else: - return func.__name__, func(y_true, y_score, sample_weight=weight) + return func.__name__, func(y_true, y_score, sample_weight=weight) return inner diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d388f1506..dcb2eb4c2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,31 +1,32 @@ /** * Copyright 2014-2023 by XGBoost Contributors */ +#include "xgboost/c_api.h" + #include #include #include -#include -#include #include +#include +#include -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/learner.h" -#include "xgboost/c_api.h" -#include "xgboost/logging.h" -#include "xgboost/version_config.h" -#include "xgboost/json.h" -#include "xgboost/global_config.h" - -#include "c_api_error.h" -#include "c_api_utils.h" #include "../collective/communicator-inl.h" -#include "../common/io.h" #include "../common/charconv.h" +#include "../common/io.h" #include "../data/adapter.h" #include "../data/simple_dmatrix.h" +#include "c_api_error.h" +#include "c_api_utils.h" +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/global_config.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/json.h" +#include "xgboost/learner.h" +#include "xgboost/logging.h" +#include "xgboost/string_view.h" // StringView +#include "xgboost/version_config.h" #if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_server.h" @@ -58,6 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) { } // namespace xgboost #endif +namespace { +void DeprecatedFunc(StringView old, StringView since, StringView replacement) { + LOG(WARNING) << "`" << old << "` is deprecated since" << since << ", use `" << replacement + << "` instead."; +} +} // anonymous namespace + XGB_DLL int XGBuildInfo(char const **out) { API_BEGIN(); xgboost_CHECK_C_ARG_PTR(out); @@ -298,7 +306,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr int nthread, int max_bin, DMatrixHandle *out) { API_BEGIN(); - LOG(WARNING) << __func__ << " is deprecated. Use `XGQuantileDMatrixCreateFromCallback` instead."; + DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback"); *out = new std::shared_ptr{ xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)}; API_END(); @@ -398,14 +406,11 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, // End Create from data iterator -XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr, - const unsigned* indices, - const bst_float* data, - size_t nindptr, - size_t nelem, - size_t num_col, - DMatrixHandle* out) { +XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices, + const bst_float *data, size_t nindptr, size_t nelem, + size_t num_col, DMatrixHandle *out) { API_BEGIN(); + DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR"); data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col); *out = new std::shared_ptr(DMatrix::Create(&adapter, std::nan(""), 1)); API_END(); @@ -443,14 +448,29 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, API_END(); } -XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, - const unsigned* indices, - const bst_float* data, - size_t nindptr, - size_t, - size_t num_row, - DMatrixHandle* out) { +XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data, + xgboost::bst_ulong nrow, char const *c_json_config, + DMatrixHandle *out) { API_BEGIN(); + xgboost_CHECK_C_ARG_PTR(indptr); + xgboost_CHECK_C_ARG_PTR(indices); + xgboost_CHECK_C_ARG_PTR(data); + data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow}; + xgboost_CHECK_C_ARG_PTR(c_json_config); + auto config = Json::Load(StringView{c_json_config}); + float missing = GetMissing(config); + auto n_threads = OptionalArg(config, "nthread", common::OmpGetNumThreads(0)); + xgboost_CHECK_C_ARG_PTR(out); + *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); + + API_END(); +} + +XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indices, + const bst_float *data, size_t nindptr, size_t, size_t num_row, + DMatrixHandle *out) { + API_BEGIN(); + DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC"); data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row); xgboost_CHECK_C_ARG_PTR(out); *out = new std::shared_ptr(DMatrix::Create(&adapter, std::nan(""), 1)); @@ -1203,8 +1223,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l raw_str.resize(0); common::MemoryBufferStream fo(&raw_str); - LOG(WARNING) << "`" << __func__ - << "` is deprecated, please use `XGBoosterSaveModelToBuffer` instead."; + DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer"); learner->Configure(); learner->SaveModel(&fo); diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index 2ccf628bf..78c477f42 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -1,10 +1,11 @@ -/*! - * Copyright (c) 2021-2022 by XGBoost Contributors +/** + * Copyright 2021-2023 by XGBoost Contributors */ #ifndef XGBOOST_C_API_C_API_UTILS_H_ #define XGBOOST_C_API_C_API_UTILS_H_ #include +#include #include #include // std::shared_ptr #include @@ -14,6 +15,7 @@ #include "xgboost/data.h" // DMatrix #include "xgboost/json.h" #include "xgboost/learner.h" +#include "xgboost/linalg.h" // ArrayInterfaceHandler #include "xgboost/logging.h" #include "xgboost/string_view.h" // StringView @@ -281,5 +283,55 @@ inline std::shared_ptr CastDMatrixHandle(DMatrixHandle const handle) { CHECK(p_m) << msg; return p_m; } + +namespace detail { +template +void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data, + std::size_t nindptr, std::string *indptr_str, std::string *indices_str, + std::string *data_str) { + auto ndata = static_cast(p_indptr[nindptr - 1]); + // Construct array interfaces + Json jindptr{Object{}}; + Json jindices{Object{}}; + Json jdata{Object{}}; + CHECK(p_indptr); + jindptr["data"] = + Array{std::vector{Json{reinterpret_cast(p_indptr)}, Json{true}}}; + jindptr["shape"] = std::vector{Json{nindptr}}; + jindptr["version"] = Integer{3}; + + CHECK(p_indices); + jindices["data"] = + Array{std::vector{Json{reinterpret_cast(p_indices)}, Json{true}}}; + jindices["shape"] = std::vector{Json{ndata}}; + jindices["version"] = Integer{3}; + + CHECK(p_data); + jdata["data"] = + Array{std::vector{Json{reinterpret_cast(p_data)}, Json{true}}}; + jdata["shape"] = std::vector{Json{ndata}}; + jdata["version"] = Integer{3}; + + std::string pindptr_typestr = + linalg::detail::ArrayInterfaceHandler::TypeChar() + std::to_string(sizeof(PtrT)); + std::string ind_typestr = + linalg::detail::ArrayInterfaceHandler::TypeChar() + std::to_string(sizeof(I)); + std::string data_typestr = + linalg::detail::ArrayInterfaceHandler::TypeChar() + std::to_string(sizeof(T)); + if (DMLC_LITTLE_ENDIAN) { + jindptr["typestr"] = String{"<" + pindptr_typestr}; + jindices["typestr"] = String{"<" + ind_typestr}; + jdata["typestr"] = String{"<" + data_typestr}; + } else { + jindptr["typestr"] = String{">" + pindptr_typestr}; + jindices["typestr"] = String{">" + ind_typestr}; + jdata["typestr"] = String{">" + data_typestr}; + } + + Json::Dump(jindptr, indptr_str); + Json::Dump(jindices, indices_str); + Json::Dump(jdata, data_str); +} +} // namespace detail } // namespace xgboost #endif // XGBOOST_C_API_C_API_UTILS_H_ diff --git a/src/data/adapter.h b/src/data/adapter.h index 34e918cd2..b027084aa 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -6,25 +6,25 @@ #define XGBOOST_DATA_ADAPTER_H_ #include -#include +#include +#include // std::size_t #include #include +#include #include #include -#include +#include // std::move #include -#include -#include -#include "xgboost/logging.h" -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/span.h" - -#include "array_interface.h" #include "../c_api/c_api_error.h" #include "../common/math.h" +#include "array_interface.h" #include "arrow-cdi.h" +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/logging.h" +#include "xgboost/span.h" +#include "xgboost/string_view.h" namespace xgboost { namespace data { @@ -472,6 +472,84 @@ class CSCAdapter : public detail::SingleBatchDataIter { size_t num_columns_; }; +class CSCArrayAdapterBatch : public detail::NoMetaInfo { + ArrayInterface<1> indptr_; + ArrayInterface<1> indices_; + ArrayInterface<1> values_; + bst_row_t n_rows_; + + class Line { + std::size_t column_idx_; + ArrayInterface<1> row_idx_; + ArrayInterface<1> values_; + std::size_t offset_; + + public: + Line(std::size_t idx, ArrayInterface<1> row_idx, ArrayInterface<1> values, std::size_t offset) + : column_idx_{idx}, + row_idx_{std::move(row_idx)}, + values_{std::move(values)}, + offset_{offset} {} + + std::size_t Size() const { return values_.Shape(0); } + COOTuple GetElement(std::size_t idx) const { + return {TypedIndex{row_idx_}(offset_ + idx), column_idx_, + values_(offset_ + idx)}; + } + }; + + public: + static constexpr bool kIsRowMajor = false; + + CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices, + ArrayInterface<1> values, bst_row_t n_rows) + : indptr_{std::move(indptr)}, + indices_{std::move(indices)}, + values_{std::move(values)}, + n_rows_{n_rows} {} + + std::size_t Size() const { return indptr_.n - 1; } + Line GetLine(std::size_t idx) const { + auto begin_no_stride = TypedIndex{indptr_}(idx); + auto end_no_stride = TypedIndex{indptr_}(idx + 1); + + auto indices = indices_; + auto values = values_; + // Slice indices and values, stride remains unchanged since this is slicing by + // specific index. + auto offset = indices.strides[0] * begin_no_stride; + indices.shape[0] = end_no_stride - begin_no_stride; + values.shape[0] = end_no_stride - begin_no_stride; + + return Line{idx, indices, values, offset}; + } +}; + +/** + * \brief CSC adapter with support for array interface. + */ +class CSCArrayAdapter : public detail::SingleBatchDataIter { + ArrayInterface<1> indptr_; + ArrayInterface<1> indices_; + ArrayInterface<1> values_; + size_t num_rows_; + CSCArrayAdapterBatch batch_; + + public: + CSCArrayAdapter(StringView indptr, StringView indices, StringView values, std::size_t num_rows) + : indptr_{indptr}, + indices_{indices}, + values_{values}, + num_rows_{num_rows}, + batch_{ + CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast(num_rows_)}} {} + + // JVM package sends 0 as unknown + size_t NumRows() const { return num_rows_ == 0 ? kAdapterUnknownSize : num_rows_; } + size_t NumColumns() const { return indptr_.n - 1; } + const CSCArrayAdapterBatch& Value() const override { return batch_; } +}; + class DataTableAdapterBatch : public detail::NoMetaInfo { enum class DTType : std::uint8_t { kFloat32 = 0, diff --git a/src/data/data.cc b/src/data/data.cc index 91052f274..9aa0271c2 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -945,31 +945,33 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const st return new data::SimpleDMatrix(adapter, missing, nthread); } -template DMatrix* DMatrix::Create( - data::DenseAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::ArrayAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::CSRAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::CSCAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::DataTableAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::FileAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix* DMatrix::Create( - data::CSRArrayAdapter* adapter, float missing, int nthread, - const std::string& cache_prefix); -template DMatrix * -DMatrix::Create(data::IteratorAdapter *adapter, - float missing, int nthread, const std::string &cache_prefix); +template DMatrix* DMatrix::Create(data::DenseAdapter* adapter, float missing, + std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::ArrayAdapter* adapter, float missing, + std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::CSRAdapter* adapter, float missing, + std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::CSCAdapter* adapter, float missing, + std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::DataTableAdapter* adapter, + float missing, std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::FileAdapter* adapter, float missing, + std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::CSRArrayAdapter* adapter, + float missing, std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create(data::CSCArrayAdapter* adapter, + float missing, std::int32_t nthread, + const std::string& cache_prefix); +template DMatrix* DMatrix::Create( + data::IteratorAdapter* adapter, + float missing, int nthread, const std::string& cache_prefix); template DMatrix* DMatrix::Create( data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&); @@ -1221,20 +1223,19 @@ void SparsePage::PushCSC(const SparsePage &batch) { self_offset = std::move(offset); } -template uint64_t -SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, int nthread); -template uint64_t -SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread); +template uint64_t SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, + int nthread); +template uint64_t SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, + int nthread); +template uint64_t SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread); +template uint64_t SparsePage::Push(const data::CSRArrayAdapterBatch& batch, float missing, + int nthread); +template uint64_t SparsePage::Push(const data::CSCArrayAdapterBatch& batch, float missing, + int nthread); +template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread); +template uint64_t SparsePage::Push(const data::DataTableAdapterBatch& batch, float missing, + int nthread); +template uint64_t SparsePage::Push(const data::FileAdapterBatch& batch, float missing, int nthread); namespace data { diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 28868da7d..33992f5f7 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -1,23 +1,23 @@ -/*! - * Copyright 2014~2022 by XGBoost Contributors +/** + * Copyright 2014~2023 by XGBoost Contributors * \file simple_dmatrix.cc * \brief the input data structure for gradient boosting * \author Tianqi Chen */ -#include +#include "simple_dmatrix.h" + +#include #include #include -#include +#include -#include "xgboost/data.h" -#include "xgboost/c_api.h" - -#include "simple_dmatrix.h" -#include "./simple_batch_iterator.h" #include "../common/random.h" #include "../common/threading_utils.h" +#include "./simple_batch_iterator.h" #include "adapter.h" #include "gradient_index.h" +#include "xgboost/c_api.h" +#include "xgboost/data.h" namespace xgboost { namespace data { @@ -229,7 +229,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { offset_vec.emplace_back(offset_vec.back()); } } else { - CHECK((std::is_same::value)) << "Expecting CSCAdapter"; + CHECK((std::is_same::value || + std::is_same::value)) + << "Expecting CSCAdapter"; info_.num_row_ = offset_vec.size() - 1; } } else { @@ -267,20 +269,14 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) { fo->Write(sparse_page_->data.HostVector()); } -template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, - int nthread); -template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, - int nthread); +template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(CSCArrayAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(CSCAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(DataTableAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix( IteratorAdapter *adapter, diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 3541b977f..7b609f476 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -8,6 +8,7 @@ #include "../../../src/common/hist_util.h" #include "../../../src/common/quantile.h" #include "../../../src/data/adapter.h" +#include "xgboost/context.h" namespace xgboost { namespace common { @@ -183,7 +184,7 @@ void TestSameOnAllWorkers() { } auto m = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(Context::kCpuId) .Type(ft) .MaxCategory(17) .Seed(rank + seed) diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index ed557da32..610a46639 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -82,10 +82,6 @@ class TestDMatrix: assert len(record) == 0 - with pytest.warns(UserWarning): - csr = csr_matrix(x) - xgb.DMatrix(csr.tocsc(), y, missing=4) - def test_dmatrix_numpy_init(self): data = np.random.randn(5, 5) dm = xgb.DMatrix(data) @@ -130,6 +126,12 @@ class TestDMatrix: assert dtrain.num_row() == 3 assert dtrain.num_col() == 3 + indptr = np.array([0, 3, 5]) + data = np.array([0, 1, 2, 3, 4]) + row_idx = np.array([0, 1, 2, 0, 2]) + X = scipy.sparse.csc_matrix((data, row_idx, indptr), shape=(3, 2)) + assert tm.predictor_equal(xgb.DMatrix(X.tocsr()), xgb.DMatrix(X)) + def test_coo(self): row = np.array([0, 2, 2, 0, 1, 2]) col = np.array([0, 0, 1, 2, 2, 2])