Use array interface for CSC matrix. (#8672)

* Use array interface for CSC matrix.

Use array interface for CSC matrix and align the interface with CSR and dense.

- Fix nthread issue in the R package DMatrix.
- Unify the behavior of handling `missing` with other inputs.
- Unify the behavior of handling `missing` around R, Python, Java, and Scala DMatrix.
- Expose `num_non_missing` to the JVM interface.
- Deprecate old CSR and CSC constructors.
This commit is contained in:
Jiaming Yuan
2023-02-05 01:59:46 +08:00
committed by GitHub
parent 213b5602d9
commit c1786849e3
23 changed files with 673 additions and 380 deletions

View File

@@ -1,31 +1,32 @@
/**
* Copyright 2014-2023 by XGBoost Contributors
*/
#include "xgboost/c_api.h"
#include <rabit/c_api.h>
#include <cstring>
#include <fstream>
#include <vector>
#include <string>
#include <memory>
#include <string>
#include <vector>
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/learner.h"
#include "xgboost/c_api.h"
#include "xgboost/logging.h"
#include "xgboost/version_config.h"
#include "xgboost/json.h"
#include "xgboost/global_config.h"
#include "c_api_error.h"
#include "c_api_utils.h"
#include "../collective/communicator-inl.h"
#include "../common/io.h"
#include "../common/charconv.h"
#include "../common/io.h"
#include "../data/adapter.h"
#include "../data/simple_dmatrix.h"
#include "c_api_error.h"
#include "c_api_utils.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/global_config.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/learner.h"
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView
#include "xgboost/version_config.h"
#if defined(XGBOOST_USE_FEDERATED)
#include "../../plugin/federated/federated_server.h"
@@ -58,6 +59,13 @@ void XGBBuildInfoDevice(Json *p_info) {
} // namespace xgboost
#endif
namespace {
/**
 * @brief Log a warning telling the user that a C API function is deprecated.
 *
 * @param old         Name of the deprecated function (callers pass `__func__`).
 * @param since       Version string from which the function is deprecated, e.g. "2.0.0".
 * @param replacement Name of the function that should be used instead.
 */
void DeprecatedFunc(StringView old, StringView since, StringView replacement) {
  // Keep the trailing space after "since"; without it the version string is glued to the
  // preceding word ("deprecated since2.0.0").
  LOG(WARNING) << "`" << old << "` is deprecated since " << since << ", use `" << replacement
               << "` instead.";
}
} // anonymous namespace
XGB_DLL int XGBuildInfo(char const **out) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(out);
@@ -298,7 +306,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
int nthread, int max_bin,
DMatrixHandle *out) {
API_BEGIN();
LOG(WARNING) << __func__ << " is deprecated. Use `XGQuantileDMatrixCreateFromCallback` instead.";
DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback");
*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)};
API_END();
@@ -398,14 +406,11 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr,
// End Create from data iterator
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t* indptr,
const unsigned* indices,
const bst_float* data,
size_t nindptr,
size_t nelem,
size_t num_col,
DMatrixHandle* out) {
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices,
const bst_float *data, size_t nindptr, size_t nelem,
size_t num_col, DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR");
data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
API_END();
@@ -443,14 +448,29 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data,
API_END();
}
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
const unsigned* indices,
const bst_float* data,
size_t nindptr,
size_t,
size_t num_row,
DMatrixHandle* out) {
/**
 * @brief Create a DMatrix from CSC-format data passed as array-interface strings.
 *
 * @param indptr        Array-interface string for the column pointer array.
 * @param indices       Array-interface string for the row index array.
 * @param data          Array-interface string for the value array.
 * @param nrow          Number of rows, forwarded to the CSC adapter.
 * @param c_json_config JSON-encoded configuration. The missing value is extracted with
 *                      `GetMissing`; `nthread` is read through `OptionalArg` with
 *                      `common::OmpGetNumThreads(0)` supplied as its default.
 * @param out           Receives a newly allocated handle owning the created DMatrix.
 *
 * Each pointer argument is validated with `xgboost_CHECK_C_ARG_PTR` right before its first
 * use, so the order of the statements below determines which error is reported first.
 */
XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char const *data,
                                   xgboost::bst_ulong nrow, char const *c_json_config,
                                   DMatrixHandle *out) {
  API_BEGIN();

  // Input buffers: all three descriptions are required to build the adapter.
  xgboost_CHECK_C_ARG_PTR(indptr);
  xgboost_CHECK_C_ARG_PTR(indices);
  xgboost_CHECK_C_ARG_PTR(data);
  data::CSCArrayAdapter adapter{StringView{indptr}, StringView{indices}, StringView{data}, nrow};

  // Construction parameters come from the JSON configuration.
  xgboost_CHECK_C_ARG_PTR(c_json_config);
  auto jconfig = Json::Load(StringView{c_json_config});
  float missing_value = GetMissing(jconfig);
  auto nthread = OptionalArg<Integer, int64_t>(jconfig, "nthread", common::OmpGetNumThreads(0));

  // Validate the output slot before building the matrix, then hand ownership to the handle.
  xgboost_CHECK_C_ARG_PTR(out);
  auto p_fmat = DMatrix::Create(&adapter, missing_value, nthread);
  *out = new std::shared_ptr<DMatrix>(p_fmat);

  API_END();
}
XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indices,
const bst_float *data, size_t nindptr, size_t, size_t num_row,
DMatrixHandle *out) {
API_BEGIN();
DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC");
data::CSCAdapter adapter(col_ptr, indices, data, nindptr - 1, num_row);
xgboost_CHECK_C_ARG_PTR(out);
*out = new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, std::nan(""), 1));
@@ -1203,8 +1223,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l
raw_str.resize(0);
common::MemoryBufferStream fo(&raw_str);
LOG(WARNING) << "`" << __func__
<< "` is deprecated, please use `XGBoosterSaveModelToBuffer` instead.";
DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer");
learner->Configure();
learner->SaveModel(&fo);

View File

@@ -1,10 +1,11 @@
/*!
* Copyright (c) 2021-2022 by XGBoost Contributors
/**
* Copyright 2021-2023 by XGBoost Contributors
*/
#ifndef XGBOOST_C_API_C_API_UTILS_H_
#define XGBOOST_C_API_C_API_UTILS_H_
#include <algorithm>
#include <cstddef>
#include <functional>
#include <memory> // std::shared_ptr
#include <string>
@@ -14,6 +15,7 @@
#include "xgboost/data.h" // DMatrix
#include "xgboost/json.h"
#include "xgboost/learner.h"
#include "xgboost/linalg.h" // ArrayInterfaceHandler
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // StringView
@@ -281,5 +283,55 @@ inline std::shared_ptr<DMatrix> CastDMatrixHandle(DMatrixHandle const handle) {
CHECK(p_m) << msg;
return p_m;
}
namespace detail {
template <typename PtrT, typename I, typename T>
void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data,
std::size_t nindptr, std::string *indptr_str, std::string *indices_str,
std::string *data_str) {
auto ndata = static_cast<Integer::Int>(p_indptr[nindptr - 1]);
// Construct array interfaces
Json jindptr{Object{}};
Json jindices{Object{}};
Json jdata{Object{}};
CHECK(p_indptr);
jindptr["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indptr)}, Json{true}}};
jindptr["shape"] = std::vector<Json>{Json{nindptr}};
jindptr["version"] = Integer{3};
CHECK(p_indices);
jindices["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_indices)}, Json{true}}};
jindices["shape"] = std::vector<Json>{Json{ndata}};
jindices["version"] = Integer{3};
CHECK(p_data);
jdata["data"] =
Array{std::vector<Json>{Json{reinterpret_cast<Integer::Int>(p_data)}, Json{true}}};
jdata["shape"] = std::vector<Json>{Json{ndata}};
jdata["version"] = Integer{3};
std::string pindptr_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<PtrT>() + std::to_string(sizeof(PtrT));
std::string ind_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<I>() + std::to_string(sizeof(I));
std::string data_typestr =
linalg::detail::ArrayInterfaceHandler::TypeChar<T>() + std::to_string(sizeof(T));
if (DMLC_LITTLE_ENDIAN) {
jindptr["typestr"] = String{"<" + pindptr_typestr};
jindices["typestr"] = String{"<" + ind_typestr};
jdata["typestr"] = String{"<" + data_typestr};
} else {
jindptr["typestr"] = String{">" + pindptr_typestr};
jindices["typestr"] = String{">" + ind_typestr};
jdata["typestr"] = String{">" + data_typestr};
}
Json::Dump(jindptr, indptr_str);
Json::Dump(jindices, indices_str);
Json::Dump(jdata, data_str);
}
} // namespace detail
} // namespace xgboost
#endif // XGBOOST_C_API_C_API_UTILS_H_