Support numpy array interface (#6998)

Jiaming Yuan 2021-05-27 16:08:22 +08:00 committed by GitHub
parent ab6fd304c4
commit 4cf95a6041
6 changed files with 59 additions and 38 deletions


@@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
                                   char const* json_config,
                                   DMatrixHandle* out);

/*!
 * \brief Create a matrix from dense array.
 * \param data JSON encoded __array_interface__ to array values.
 * \param json_config JSON encoded configuration. Required values are:
 *
 *   - missing
 *   - nthread
 *
 * \param out created dmatrix
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *json_config,
                                     DMatrixHandle *out);

/*!
 * \brief create a matrix content from CSC format
 * \param col_ptr pointer to col headers

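Both arguments of the new entry point are JSON strings: `data` carries NumPy's `__array_interface__` description of the buffer, and `json_config` carries the required options (`missing`, `nthread`). A minimal sketch of what a caller might build before invoking the function, using plain NumPy and `json`; the helper name `make_array_interface_json` is hypothetical and not part of the XGBoost API:

```python
import json

import numpy as np


def make_array_interface_json(arr: np.ndarray) -> bytes:
    """Hypothetical helper: serialize a NumPy array's __array_interface__
    (data pointer, typestr, shape, ...) into the JSON payload that
    XGDMatrixCreateFromArray expects as its first argument."""
    interface = arr.__array_interface__
    # Tuples in the dict (pointer/readonly pair, shape) become JSON lists.
    return json.dumps(interface).encode("utf-8")


X = np.ascontiguousarray(np.random.rand(100, 4), dtype=np.float32)
data_json = make_array_interface_json(X)

# Configuration keys named as required by the doc comment above.
# Python's json module writes NaN as the (non-standard) NaN literal.
config_json = json.dumps({"missing": float("nan"), "nthread": 4}).encode("utf-8")
```

The actual Python binding builds the equivalent payloads itself (see `_array_interface` in the next hunk); this sketch only illustrates the shape of the two strings.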

@@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype):
    '''
    try:
        if not data.flags.c_contiguous:
            warnings.warn(
                "Use of np.ndarray subsets (sliced data) is not recommended " +
                "because it will generate extra copies and increase " +
                "memory consumption. Consider using np.ascontiguousarray to " +
                "make the array contiguous.")
            data = np.array(data, copy=True, dtype=dtype)
        else:
            data = np.array(data, copy=False, dtype=dtype)
@@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype):
    return data


def _transform_np_array(data: np.ndarray) -> np.ndarray:
    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
        data = np.array(data, copy=False)
    if len(data.shape) != 2:
        raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ',
                         data.shape)
    # flatten the array by rows and ensure it is float32. we try to avoid
    # data copies if possible (reshape returns a view when possible and we
    # explicitly tell np.array to try and avoid copying)
    flatten = np.array(data.reshape(data.size), copy=False,
                       dtype=np.float32)
    flatten = _maybe_np_slice(flatten, np.float32)
    _check_complex(data)
    return flatten


def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
    """Initialize data from a 2-D numpy matrix.

    If ``mat`` does not have ``order='C'`` (aka row-major) or is
    not contiguous, a temporary copy will be made.

    If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will
    be made.

    So there could be as many as two temporary data copies; be mindful of
    input layout and type if memory use is a concern.

    """
    flatten: np.ndarray = _transform_np_array(data)
    if len(data.shape) != 2:
        raise ValueError(
            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
        )
    data, _ = _ensure_np_dtype(data, data.dtype)
    handle = ctypes.c_void_p()
    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        c_bst_ulong(data.shape[0]),
        c_bst_ulong(data.shape[1]),
        ctypes.c_float(missing),
        ctypes.byref(handle),
        ctypes.c_int(nthread)))
    args = {
        "missing": float(missing),
        "nthread": int(nthread),
    }
    config = bytes(json.dumps(args), "utf-8")
    _check_call(
        _LIB.XGDMatrixCreateFromArray(
            _array_interface(data),
            config,
            ctypes.byref(handle),
        )
    )
    return handle, feature_names, feature_types
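Nothing changes at the user level: a 2-D `numpy.ndarray` still goes straight into `DMatrix`, but the binding now ships it as an `__array_interface__` JSON plus a small config JSON instead of calling `XGDMatrixCreateFromMat_omp`. A short usage sketch, assuming a build that includes this change:

```python
import numpy as np
import xgboost as xgb

# A contiguous float32 array avoids extra copies on the way in.
X = np.ascontiguousarray(np.random.rand(64, 8), dtype=np.float32)
y = np.random.randint(0, 2, size=64)

# DMatrix construction from a 2-D ndarray now routes through the
# __array_interface__-based XGDMatrixCreateFromArray entry point.
dtrain = xgb.DMatrix(X, label=y, missing=np.nan, nthread=2)
print(dtrain.num_row(), dtrain.num_col())
```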


@@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
  API_END();
}

XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *c_json_config,
                                     DMatrixHandle *out) {
  API_BEGIN();
  xgboost::data::ArrayAdapter adapter{
      xgboost::data::ArrayAdapter(StringView{data})};
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
  auto nthread = get<Integer const>(config["nthread"]);
  *out =
      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
  API_END();
}

XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
                                     const unsigned* indices,
                                     const bst_float* data,


@@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
};

class ArrayAdapterBatch : public detail::NoMetaInfo {
 public:
  static constexpr bool kIsRowMajor = true;

 private:
  ArrayInterface array_interface_;

  class Line {
@@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
  Line const GetLine(size_t idx) const {
    return Line{array_interface_, idx};
  }

  size_t Size() const { return array_interface_.num_rows; }

  explicit ArrayAdapterBatch(ArrayInterface array_interface)
      : array_interface_{std::move(array_interface)} {}

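`ArrayAdapterBatch` wraps an `ArrayInterface` parsed from the same `__array_interface__` dictionary that NumPy exposes, and `Size()` is simply the leading entry of its shape. For reference, a quick look at that protocol from Python (nothing XGBoost-specific):

```python
import numpy as np

X = np.arange(6, dtype=np.float32).reshape(2, 3)
iface = X.__array_interface__
# Typical keys: 'data' (pointer, readonly flag), 'typestr' ('<f4' here),
# 'shape' ((2, 3) here), 'strides', 'version'.
print(iface["typestr"], iface["shape"])
# shape[0] is what the adapter reports as the number of rows.
print("rows:", iface["shape"][0])
```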

@@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
template DMatrix* DMatrix::Create<data::DenseAdapter>(
    data::DenseAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
    data::ArrayAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::CSRAdapter>(
    data::CSRAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);

@@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) {
template uint64_t
SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);


@@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,