Support numpy array interface (#6998)
parent ab6fd304c4
commit 4cf95a6041
@@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
                                    char const* json_config,
                                    DMatrixHandle* out);
 
+/*!
+ * \brief Create a matrix from dense array.
+ *
+ * \param data JSON encoded __array_interface__ to array values.
+ * \param json_config JSON encoded configuration. Required values are:
+ *
+ *   - missing
+ *   - nthread
+ *
+ * \param out created dmatrix
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+                                     char const *json_config,
+                                     DMatrixHandle *out);
+
 /*!
  * \brief create a matrix content from CSC format
  * \param col_ptr pointer to col headers
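The new entry point mirrors the CSR variant: the array itself travels as a JSON encoded __array_interface__ string and the scalar options travel in a second JSON document. Below is a minimal ctypes sketch of calling it directly, assuming a built libxgboost.so is on the loader path; the wrapper name create_from_array and the library path are illustrative, not part of the library.

    import ctypes
    import json
    import numpy as np

    _LIB = ctypes.cdll.LoadLibrary("libxgboost.so")  # assumption: library name/location

    def create_from_array(arr, missing, nthread):
        """Illustrative wrapper over the new XGDMatrixCreateFromArray call."""
        arr = np.ascontiguousarray(arr)             # row-major, numeric dtype expected
        data = json.dumps(arr.__array_interface__)  # JSON encoded __array_interface__
        config = json.dumps({"missing": float(missing), "nthread": int(nthread)})
        handle = ctypes.c_void_p()
        ret = _LIB.XGDMatrixCreateFromArray(
            ctypes.c_char_p(data.encode("utf-8")),
            ctypes.c_char_p(config.encode("utf-8")),
            ctypes.byref(handle),
        )
        if ret != 0:
            raise RuntimeError("XGDMatrixCreateFromArray failed")
        return handle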
@@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype):
     '''
     try:
         if not data.flags.c_contiguous:
-            warnings.warn(
-                "Use of np.ndarray subsets (sliced data) is not recommended " +
-                "because it will generate extra copies and increase " +
-                "memory consumption. Consider using np.ascontiguousarray to " +
-                "make the array contiguous.")
             data = np.array(data, copy=True, dtype=dtype)
         else:
             data = np.array(data, copy=False, dtype=dtype)
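The warning is dropped here, but the contiguity check still decides whether a copy is made. As a reminder of when that branch triggers, a short numpy-only illustration (not XGBoost code):

    import numpy as np

    mat = np.arange(12, dtype=np.float32).reshape(3, 4)
    cols = mat[:, :2]                  # a view, but not C-contiguous
    print(cols.flags.c_contiguous)     # False -> the copy=True branch runs
    cols = np.ascontiguousarray(cols)  # one explicit copy makes it contiguous
    print(cols.flags.c_contiguous)     # True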
@@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype):
     return data
 
 
-def _transform_np_array(data: np.ndarray) -> np.ndarray:
-    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
-        data = np.array(data, copy=False)
-    if len(data.shape) != 2:
-        raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ',
-                         data.shape)
-    # flatten the array by rows and ensure it is float32. we try to avoid
-    # data copies if possible (reshape returns a view when possible and we
-    # explicitly tell np.array to try and avoid copying)
-    flatten = np.array(data.reshape(data.size), copy=False,
-                       dtype=np.float32)
-    flatten = _maybe_np_slice(flatten, np.float32)
-    _check_complex(data)
-    return flatten
-
-
 def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
     """Initialize data from a 2-D numpy matrix.
 
-    If ``mat`` does not have ``order='C'`` (aka row-major) or is
-    not contiguous, a temporary copy will be made.
-
-    If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will
-    be made.
-
-    So there could be as many as two temporary data copies; be mindful of
-    input layout and type if memory use is a concern.
-
     """
-    flatten: np.ndarray = _transform_np_array(data)
+    if len(data.shape) != 2:
+        raise ValueError(
+            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
+        )
+    data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
-        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-        c_bst_ulong(data.shape[0]),
-        c_bst_ulong(data.shape[1]),
-        ctypes.c_float(missing),
-        ctypes.byref(handle),
-        ctypes.c_int(nthread)))
+    args = {
+        "missing": float(missing),
+        "nthread": int(nthread),
+    }
+    config = bytes(json.dumps(args), "utf-8")
+    _check_call(
+        _LIB.XGDMatrixCreateFromArray(
+            _array_interface(data),
+            config,
+            ctypes.byref(handle),
+        )
+    )
     return handle, feature_names, feature_types
 
 
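From the user's side the constructor call is unchanged; _from_numpy_array now forwards the array interface plus a small JSON config instead of flattening to float32 first. A brief usage sketch follows; the keyword names come from the existing DMatrix constructor, and reading the avoided float32 copy as a benefit is my interpretation of the new code path rather than a stated guarantee.

    import numpy as np
    import xgboost as xgb

    X = np.random.default_rng(0).random((100, 4))  # float64, C-contiguous
    y = np.random.default_rng(1).random(100)
    dtrain = xgb.DMatrix(X, label=y, missing=np.nan, nthread=2)
    print(dtrain.num_row(), dtrain.num_col())      # 100 4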
@@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
   API_END();
 }
 
+XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+                                     char const *c_json_config,
+                                     DMatrixHandle *out) {
+  API_BEGIN();
+  xgboost::data::ArrayAdapter adapter{
+      xgboost::data::ArrayAdapter(StringView{data})};
+  auto config = Json::Load(StringView{c_json_config});
+  float missing = GetMissing(config);
+  auto nthread = get<Integer const>(config["nthread"]);
+  *out =
+      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
+  API_END();
+}
+
 XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
                                      const unsigned* indices,
                                      const bst_float* data,
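The string handed to ArrayAdapter here is numpy's __array_interface__ dict serialized to JSON, as the header comment above states. For orientation, this is what that payload looks like (plain numpy, shown only for illustration):

    import json
    import numpy as np

    arr = np.ones((2, 3), dtype=np.float32)
    # Keys include 'data' (pointer, read-only flag), 'shape', 'typestr',
    # 'strides' (None for C-contiguous arrays) and 'version'.
    print(json.dumps(arr.__array_interface__, indent=2))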
@@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
 };
 
 class ArrayAdapterBatch : public detail::NoMetaInfo {
+ public:
+  static constexpr bool kIsRowMajor = true;
+
+ private:
   ArrayInterface array_interface_;
 
   class Line {
@@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
   Line const GetLine(size_t idx) const {
     return Line{array_interface_, idx};
   }
+  size_t Size() const { return array_interface_.num_rows; }
 
   explicit ArrayAdapterBatch(ArrayInterface array_interface)
       : array_interface_{std::move(array_interface)} {}
@@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
 template DMatrix* DMatrix::Create<data::DenseAdapter>(
     data::DenseAdapter* adapter, float missing, int nthread,
     const std::string& cache_prefix, size_t page_size);
+template DMatrix* DMatrix::Create<data::ArrayAdapter>(
+    data::ArrayAdapter* adapter, float missing, int nthread,
+    const std::string& cache_prefix, size_t page_size);
 template DMatrix* DMatrix::Create<data::CSRAdapter>(
     data::CSRAdapter* adapter, float missing, int nthread,
     const std::string& cache_prefix, size_t page_size);
@@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) {
 template uint64_t
 SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
 template uint64_t
+SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
+template uint64_t
 SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
 template uint64_t
 SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
@@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
 
 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
                                       int nthread);
+template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing,
+                                      int nthread);
 template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing,
                                       int nthread);
 template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,