Support numpy array interface (#6998)
parent ab6fd304c4
commit 4cf95a6041
@@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
                                    char const* json_config,
                                    DMatrixHandle* out);
 
+/*!
+ * \brief Create a matrix from dense array.
+ *
+ * \param data JSON encoded __array_interface__ to array values.
+ * \param json_config JSON encoded configuration. Required values are:
+ *
+ *   - missing
+ *   - nthread
+ *
+ * \param out created dmatrix
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+                                     char const *json_config,
+                                     DMatrixHandle *out);
+
 /*!
  * \brief create a matrix content from CSC format
  * \param col_ptr pointer to col headers
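The new entry point mirrors the CSR variant: the array itself travels as a JSON encoded __array_interface__ string and the scalar options travel in a second JSON document. Below is a minimal ctypes sketch of calling it directly, assuming a built libxgboost.so is on the loader path; the wrapper name create_from_array and the library path are illustrative, not part of the library.

    import ctypes
    import json
    import numpy as np

    _LIB = ctypes.cdll.LoadLibrary("libxgboost.so")  # assumption: library name/location

    def create_from_array(arr, missing, nthread):
        """Illustrative wrapper over the new XGDMatrixCreateFromArray call."""
        arr = np.ascontiguousarray(arr)             # row-major, numeric dtype expected
        data = json.dumps(arr.__array_interface__)  # JSON encoded __array_interface__
        config = json.dumps({"missing": float(missing), "nthread": int(nthread)})
        handle = ctypes.c_void_p()
        ret = _LIB.XGDMatrixCreateFromArray(
            ctypes.c_char_p(data.encode("utf-8")),
            ctypes.c_char_p(config.encode("utf-8")),
            ctypes.byref(handle),
        )
        if ret != 0:
            raise RuntimeError("XGDMatrixCreateFromArray failed")
        return handle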
@@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype):
     '''
     try:
         if not data.flags.c_contiguous:
-            warnings.warn(
-                "Use of np.ndarray subsets (sliced data) is not recommended " +
-                "because it will generate extra copies and increase " +
-                "memory consumption. Consider using np.ascontiguousarray to " +
-                "make the array contiguous.")
             data = np.array(data, copy=True, dtype=dtype)
         else:
             data = np.array(data, copy=False, dtype=dtype)
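The warning is dropped here, but the contiguity check still decides whether a copy is made. As a reminder of when that branch triggers, a short numpy-only illustration (not XGBoost code):

    import numpy as np

    mat = np.arange(12, dtype=np.float32).reshape(3, 4)
    cols = mat[:, :2]                  # a view, but not C-contiguous
    print(cols.flags.c_contiguous)     # False -> the copy=True branch runs
    cols = np.ascontiguousarray(cols)  # one explicit copy makes it contiguous
    print(cols.flags.c_contiguous)     # True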
@@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype):
     return data
 
 
-def _transform_np_array(data: np.ndarray) -> np.ndarray:
-    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
-        data = np.array(data, copy=False)
-    if len(data.shape) != 2:
-        raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ',
-                         data.shape)
-    # flatten the array by rows and ensure it is float32. we try to avoid
-    # data copies if possible (reshape returns a view when possible and we
-    # explicitly tell np.array to try and avoid copying)
-    flatten = np.array(data.reshape(data.size), copy=False,
-                       dtype=np.float32)
-    flatten = _maybe_np_slice(flatten, np.float32)
-    _check_complex(data)
-    return flatten
-
-
 def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
     """Initialize data from a 2-D numpy matrix.
 
-    If ``mat`` does not have ``order='C'`` (aka row-major) or is
-    not contiguous, a temporary copy will be made.
-
-    If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will
-    be made.
-
-    So there could be as many as two temporary data copies; be mindful of
-    input layout and type if memory use is a concern.
-
     """
-    flatten: np.ndarray = _transform_np_array(data)
+    if len(data.shape) != 2:
+        raise ValueError(
+            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
+        )
+    data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
-        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-        c_bst_ulong(data.shape[0]),
-        c_bst_ulong(data.shape[1]),
-        ctypes.c_float(missing),
-        ctypes.byref(handle),
-        ctypes.c_int(nthread)))
+    args = {
+        "missing": float(missing),
+        "nthread": int(nthread),
+    }
+    config = bytes(json.dumps(args), "utf-8")
+    _check_call(
+        _LIB.XGDMatrixCreateFromArray(
+            _array_interface(data),
+            config,
+            ctypes.byref(handle),
+        )
+    )
     return handle, feature_names, feature_types
 
 
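From the user's side the constructor call is unchanged; _from_numpy_array now forwards the array interface plus a small JSON config instead of flattening to float32 first. A brief usage sketch follows; the keyword names come from the existing DMatrix constructor, and reading the avoided float32 copy as a benefit is my interpretation of the new code path rather than a stated guarantee.

    import numpy as np
    import xgboost as xgb

    X = np.random.default_rng(0).random((100, 4))  # float64, C-contiguous
    y = np.random.default_rng(1).random(100)
    dtrain = xgb.DMatrix(X, label=y, missing=np.nan, nthread=2)
    print(dtrain.num_row(), dtrain.num_col())      # 100 4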
@@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
   API_END();
 }
 
+XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+                                     char const *c_json_config,
+                                     DMatrixHandle *out) {
+  API_BEGIN();
+  xgboost::data::ArrayAdapter adapter{
+      xgboost::data::ArrayAdapter(StringView{data})};
+  auto config = Json::Load(StringView{c_json_config});
+  float missing = GetMissing(config);
+  auto nthread = get<Integer const>(config["nthread"]);
+  *out =
+      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
+  API_END();
+}
+
 XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
                                      const unsigned* indices,
                                      const bst_float* data,
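The string handed to ArrayAdapter here is numpy's __array_interface__ dict serialized to JSON, as the header comment above states. For orientation, this is what that payload looks like (plain numpy, shown only for illustration):

    import json
    import numpy as np

    arr = np.ones((2, 3), dtype=np.float32)
    # Keys include 'data' (pointer, read-only flag), 'shape', 'typestr',
    # 'strides' (None for C-contiguous arrays) and 'version'.
    print(json.dumps(arr.__array_interface__, indent=2))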
@@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
 };
 
 class ArrayAdapterBatch : public detail::NoMetaInfo {
+ public:
+  static constexpr bool kIsRowMajor = true;
+
+ private:
   ArrayInterface array_interface_;
 
   class Line {
@@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
   Line const GetLine(size_t idx) const {
     return Line{array_interface_, idx};
   }
+  size_t Size() const { return array_interface_.num_rows; }
 
   explicit ArrayAdapterBatch(ArrayInterface array_interface)
       : array_interface_{std::move(array_interface)} {}
@@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
 template DMatrix* DMatrix::Create<data::DenseAdapter>(
     data::DenseAdapter* adapter, float missing, int nthread,
     const std::string& cache_prefix, size_t page_size);
+template DMatrix* DMatrix::Create<data::ArrayAdapter>(
+    data::ArrayAdapter* adapter, float missing, int nthread,
+    const std::string& cache_prefix, size_t page_size);
 template DMatrix* DMatrix::Create<data::CSRAdapter>(
     data::CSRAdapter* adapter, float missing, int nthread,
     const std::string& cache_prefix, size_t page_size);
@@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) {
 template uint64_t
 SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
 template uint64_t
+SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
+template uint64_t
 SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
 template uint64_t
 SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
@@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
 
 template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
                                       int nthread);
+template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing,
+                                      int nthread);
 template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing,
                                       int nthread);
 template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,