diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index f4d62439e..90e4b185f 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const* json_config, DMatrixHandle* out); + +/*! + * \brief Create a matrix from dense array. + * \param data JSON encoded __array_interface__ to array values. + * \param json_config JSON encoded configuration. Required values are: + * + * - missing + * - nthread + * + * \param out created dmatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixCreateFromArray(char const *data, + char const *json_config, + DMatrixHandle *out); + /*! * \brief create a matrix content from CSC format * \param col_ptr pointer to col headers diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index b664fc3a2..1c6305f38 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype): ''' try: if not data.flags.c_contiguous: - warnings.warn( - "Use of np.ndarray subsets (sliced data) is not recommended " + - "because it will generate extra copies and increase " + - "memory consumption. Consider using np.ascontiguousarray to " + - "make the array contiguous.") data = np.array(data, copy=True, dtype=dtype) else: data = np.array(data, copy=False, dtype=dtype) @@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype): return data -def _transform_np_array(data: np.ndarray) -> np.ndarray: - if not isinstance(data, np.ndarray) and hasattr(data, '__array__'): - data = np.array(data, copy=False) - if len(data.shape) != 2: - raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ', - data.shape) - # flatten the array by rows and ensure it is float32. 
we try to avoid - # data copies if possible (reshape returns a view when possible and we - # explicitly tell np.array to try and avoid copying) - flatten = np.array(data.reshape(data.size), copy=False, - dtype=np.float32) - flatten = _maybe_np_slice(flatten, np.float32) - _check_complex(data) - return flatten - - def _from_numpy_array(data, missing, nthread, feature_names, feature_types): """Initialize data from a 2-D numpy matrix. - If ``mat`` does not have ``order='C'`` (aka row-major) or is - not contiguous, a temporary copy will be made. - - If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will - be made. - - So there could be as many as two temporary data copies; be mindful of - input layout and type if memory use is a concern. - """ - flatten: np.ndarray = _transform_np_array(data) + if len(data.shape) != 2: + raise ValueError( + "Expecting 2 dimensional numpy.ndarray, got: ", data.shape + ) + data, _ = _ensure_np_dtype(data, data.dtype) handle = ctypes.c_void_p() - _check_call(_LIB.XGDMatrixCreateFromMat_omp( - flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), - c_bst_ulong(data.shape[0]), - c_bst_ulong(data.shape[1]), - ctypes.c_float(missing), - ctypes.byref(handle), - ctypes.c_int(nthread))) + args = { + "missing": float(missing), + "nthread": int(nthread), + } + config = bytes(json.dumps(args), "utf-8") + _check_call( + _LIB.XGDMatrixCreateFromArray( + _array_interface(data), + config, + ctypes.byref(handle), + ) + ) return handle, feature_names, feature_types diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index f7586c192..73c405ba7 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, API_END(); } +XGB_DLL int XGDMatrixCreateFromArray(char const *data, + char const *c_json_config, + DMatrixHandle *out) { + API_BEGIN(); + xgboost::data::ArrayAdapter adapter{ + xgboost::data::ArrayAdapter(StringView{data})}; + auto config = 
Json::Load(StringView{c_json_config}); + float missing = GetMissing(config); + auto nthread = get<Integer const>(config["nthread"]); + *out = + new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread)); + API_END(); +} + XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr, const unsigned* indices, const bst_float* data, diff --git a/src/data/adapter.h b/src/data/adapter.h index 80c14e272..924fb9f82 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> { }; class ArrayAdapterBatch : public detail::NoMetaInfo { + public: + static constexpr bool kIsRowMajor = true; + + private: ArrayInterface array_interface_; class Line { @@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo { Line const GetLine(size_t idx) const { return Line{array_interface_, idx}; } + size_t Size() const { return array_interface_.num_rows; } explicit ArrayAdapterBatch(ArrayInterface array_interface) : array_interface_{std::move(array_interface)} {} diff --git a/src/data/data.cc b/src/data/data.cc index df606849f..536a836ec 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, template DMatrix* DMatrix::Create( data::DenseAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); +template DMatrix* DMatrix::Create( + data::ArrayAdapter* adapter, float missing, int nthread, + const std::string& cache_prefix, size_t page_size); template DMatrix* DMatrix::Create( data::CSRAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, size_t page_size); @@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) { template uint64_t SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread); template uint64_t +SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread); +template uint64_t 
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread); template uint64_t SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index ec2d5d36d..85e38a52d 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) { template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing, int nthread); +template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing, + int nthread); template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing, int nthread); template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,