Support numpy array interface (#6998)

Jiaming Yuan 2021-05-27 16:08:22 +08:00 committed by GitHub
parent ab6fd304c4
commit 4cf95a6041
6 changed files with 59 additions and 38 deletions


@@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
                                   char const* json_config,
                                   DMatrixHandle* out);

/*!
 * \brief Create a matrix from dense array.
 * \param data JSON encoded __array_interface__ to array values.
 * \param json_config JSON encoded configuration. Required values are:
 *
 *   - missing
 *   - nthread
 *
 * \param out created dmatrix
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *json_config,
                                     DMatrixHandle *out);

/*!
 * \brief create a matrix content from CSC format
 * \param col_ptr pointer to col headers

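Both arguments of the new entry point are JSON strings: `data` carries NumPy's `__array_interface__` description of the buffer, and `json_config` carries the required options (`missing`, `nthread`). A minimal sketch of what a caller might build before invoking the function, using plain NumPy and `json`; the helper name `make_array_interface_json` is hypothetical and not part of the XGBoost API:

```python
import json

import numpy as np


def make_array_interface_json(arr: np.ndarray) -> bytes:
    """Hypothetical helper: serialize a NumPy array's __array_interface__
    (data pointer, typestr, shape, ...) into the JSON payload that
    XGDMatrixCreateFromArray expects as its first argument."""
    interface = arr.__array_interface__
    # Tuples in the dict (pointer/readonly pair, shape) become JSON lists.
    return json.dumps(interface).encode("utf-8")


X = np.ascontiguousarray(np.random.rand(100, 4), dtype=np.float32)
data_json = make_array_interface_json(X)

# Configuration keys named as required by the doc comment above.
# Python's json module writes NaN as the (non-standard) NaN literal.
config_json = json.dumps({"missing": float("nan"), "nthread": 4}).encode("utf-8")
```

The actual Python binding builds the equivalent payloads itself (see `_array_interface` in the next hunk); this sketch only illustrates the shape of the two strings.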

@@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype):
    '''
    try:
        if not data.flags.c_contiguous:
            warnings.warn(
                "Use of np.ndarray subsets (sliced data) is not recommended " +
                "because it will generate extra copies and increase " +
                "memory consumption. Consider using np.ascontiguousarray to " +
                "make the array contiguous.")
            data = np.array(data, copy=True, dtype=dtype)
        else:
            data = np.array(data, copy=False, dtype=dtype)
@@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype):
    return data


def _transform_np_array(data: np.ndarray) -> np.ndarray:
    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
        data = np.array(data, copy=False)
    if len(data.shape) != 2:
        raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ',
                         data.shape)
    # flatten the array by rows and ensure it is float32. we try to avoid
    # data copies if possible (reshape returns a view when possible and we
    # explicitly tell np.array to try and avoid copying)
    flatten = np.array(data.reshape(data.size), copy=False,
                       dtype=np.float32)
    flatten = _maybe_np_slice(flatten, np.float32)
    _check_complex(data)
    return flatten


def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
    """Initialize data from a 2-D numpy matrix.

    If ``mat`` does not have ``order='C'`` (aka row-major) or is
    not contiguous, a temporary copy will be made.

    If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will
    be made.

    So there could be as many as two temporary data copies; be mindful of
    input layout and type if memory use is a concern.

    """
    flatten: np.ndarray = _transform_np_array(data)
    if len(data.shape) != 2:
        raise ValueError(
            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
        )
    data, _ = _ensure_np_dtype(data, data.dtype)
    handle = ctypes.c_void_p()
    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        c_bst_ulong(data.shape[0]),
        c_bst_ulong(data.shape[1]),
        ctypes.c_float(missing),
        ctypes.byref(handle),
        ctypes.c_int(nthread)))
    args = {
        "missing": float(missing),
        "nthread": int(nthread),
    }
    config = bytes(json.dumps(args), "utf-8")
    _check_call(
        _LIB.XGDMatrixCreateFromArray(
            _array_interface(data),
            config,
            ctypes.byref(handle),
        )
    )
    return handle, feature_names, feature_types
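Nothing changes at the user level: a 2-D `numpy.ndarray` still goes straight into `DMatrix`, but the binding now ships it as an `__array_interface__` JSON plus a small config JSON instead of calling `XGDMatrixCreateFromMat_omp`. A short usage sketch, assuming a build that includes this change:

```python
import numpy as np
import xgboost as xgb

# A contiguous float32 array avoids extra copies on the way in.
X = np.ascontiguousarray(np.random.rand(64, 8), dtype=np.float32)
y = np.random.randint(0, 2, size=64)

# DMatrix construction from a 2-D ndarray now routes through the
# __array_interface__-based XGDMatrixCreateFromArray entry point.
dtrain = xgb.DMatrix(X, label=y, missing=np.nan, nthread=2)
print(dtrain.num_row(), dtrain.num_col())
```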


@@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
  API_END();
}

XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *c_json_config,
                                     DMatrixHandle *out) {
  API_BEGIN();
  xgboost::data::ArrayAdapter adapter{
      xgboost::data::ArrayAdapter(StringView{data})};
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
  auto nthread = get<Integer const>(config["nthread"]);
  *out =
      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
  API_END();
}

XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
                                     const unsigned* indices,
                                     const bst_float* data,


@@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
};

class ArrayAdapterBatch : public detail::NoMetaInfo {
 public:
  static constexpr bool kIsRowMajor = true;

 private:
  ArrayInterface array_interface_;

  class Line {
@@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
  Line const GetLine(size_t idx) const {
    return Line{array_interface_, idx};
  }

  size_t Size() const { return array_interface_.num_rows; }

  explicit ArrayAdapterBatch(ArrayInterface array_interface)
      : array_interface_{std::move(array_interface)} {}

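`ArrayAdapterBatch` wraps an `ArrayInterface` parsed from the same `__array_interface__` dictionary that NumPy exposes, and `Size()` is simply the leading entry of its shape. For reference, a quick look at that protocol from Python (nothing XGBoost-specific):

```python
import numpy as np

X = np.arange(6, dtype=np.float32).reshape(2, 3)
iface = X.__array_interface__
# Typical keys: 'data' (pointer, readonly flag), 'typestr' ('<f4' here),
# 'shape' ((2, 3) here), 'strides', 'version'.
print(iface["typestr"], iface["shape"])
# shape[0] is what the adapter reports as the number of rows.
print("rows:", iface["shape"][0])
```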

@@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
template DMatrix* DMatrix::Create<data::DenseAdapter>(
    data::DenseAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
    data::ArrayAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::CSRAdapter>(
    data::CSRAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);

@@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) {
template uint64_t
SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);


@@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,