Support numpy array interface (#6998)
parent ab6fd304c4
commit 4cf95a6041
@@ -130,6 +130,22 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
                                   char const* json_config,
                                   DMatrixHandle* out);

/*!
 * \brief Create a matrix from dense array.
 * \param data JSON encoded __array_interface__ to array values.
 * \param json_config JSON encoded configuration. Required values are:
 *
 *   - missing
 *   - nthread
 *
 * \param out created dmatrix
 * \return 0 when success, -1 when failure happens
 */
XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *json_config,
                                     DMatrixHandle *out);

/*!
 * \brief create a matrix content from CSC format
 * \param col_ptr pointer to col headers
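For reference, the `__array_interface__` payload this new function consumes is numpy's standard array-description protocol, JSON-encoded. A minimal illustration of what the two parameters contain (plain numpy/json, shown only to make the payload shape concrete; not part of the commit):

import json

import numpy as np

X = np.arange(6, dtype=np.float32).reshape(2, 3)
# Typical keys: "data" [pointer, readonly], "typestr" "<f4", "shape" [2, 3],
# "strides" null for C-contiguous arrays, plus "descr" and "version".
print(json.dumps(X.__array_interface__))

# json_config carries the two required keys named in the doc comment above.
print(json.dumps({"missing": -1.0, "nthread": 4}))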
@@ -116,11 +116,6 @@ def _maybe_np_slice(data, dtype):
    '''
    try:
        if not data.flags.c_contiguous:
            warnings.warn(
                "Use of np.ndarray subsets (sliced data) is not recommended " +
                "because it will generate extra copies and increase " +
                "memory consumption. Consider using np.ascontiguousarray to " +
                "make the array contiguous.")
            data = np.array(data, copy=True, dtype=dtype)
        else:
            data = np.array(data, copy=False, dtype=dtype)
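The warning above fires for non-contiguous input such as row slices; a quick demonstration of the condition and the recommended fix (plain numpy, illustration only):

import numpy as np

X = np.random.rand(100, 10).astype(np.float32)
view = X[::2]                       # row-sliced view: strides skip rows
print(view.flags.c_contiguous)      # False -> takes the copy=True branch above
fixed = np.ascontiguousarray(view)  # explicit contiguous copy up front
print(fixed.flags.c_contiguous)     # True: no warning, no hidden extra copy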
@@ -130,44 +125,28 @@ def _maybe_np_slice(data, dtype):
    return data


def _transform_np_array(data: np.ndarray) -> np.ndarray:
    if not isinstance(data, np.ndarray) and hasattr(data, '__array__'):
        data = np.array(data, copy=False)
    if len(data.shape) != 2:
        raise ValueError('Expecting 2 dimensional numpy.ndarray, got: ',
                         data.shape)
    # flatten the array by rows and ensure it is float32. we try to avoid
    # data copies if possible (reshape returns a view when possible and we
    # explicitly tell np.array to try and avoid copying)
    flatten = np.array(data.reshape(data.size), copy=False,
                       dtype=np.float32)
    flatten = _maybe_np_slice(flatten, np.float32)
    _check_complex(data)
    return flatten


def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
    """Initialize data from a 2-D numpy matrix.

    If ``mat`` does not have ``order='C'`` (aka row-major) or is
    not contiguous, a temporary copy will be made.

    If ``mat`` does not have ``dtype=numpy.float32``, a temporary copy will
    be made.

    So there could be as many as two temporary data copies; be mindful of
    input layout and type if memory use is a concern.

    """
    flatten: np.ndarray = _transform_np_array(data)
    if len(data.shape) != 2:
        raise ValueError(
            "Expecting 2 dimensional numpy.ndarray, got: ", data.shape
        )
    data, _ = _ensure_np_dtype(data, data.dtype)
    handle = ctypes.c_void_p()
    _check_call(_LIB.XGDMatrixCreateFromMat_omp(
        flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
        c_bst_ulong(data.shape[0]),
        c_bst_ulong(data.shape[1]),
        ctypes.c_float(missing),
        ctypes.byref(handle),
        ctypes.c_int(nthread)))
    args = {
        "missing": float(missing),
        "nthread": int(nthread),
    }
    config = bytes(json.dumps(args), "utf-8")
    _check_call(
        _LIB.XGDMatrixCreateFromArray(
            _array_interface(data),
            config,
            ctypes.byref(handle),
        )
    )
    return handle, feature_names, feature_types
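This hunk swaps the old flatten-and-copy path (`_transform_np_array` feeding `XGDMatrixCreateFromMat_omp`) for the new `XGDMatrixCreateFromArray` call, which passes `_array_interface(data)` instead of a raw float pointer. `_array_interface` is called here but not defined in the hunk; a plausible sketch of such a helper, assuming it simply JSON-encodes numpy's `__array_interface__` dict (name and details are an assumption, not the commit's code):

import json

import numpy as np


def _array_interface(data: np.ndarray) -> bytes:
    # Hypothetical sketch: serialize the array-description protocol so the
    # C side can locate the buffer directly; no values are copied here.
    interface = data.__array_interface__
    return bytes(json.dumps(interface), "utf-8")

The pointer embedded in the "data" field is only valid while the owning array is alive, which is presumably why the DMatrix handle is created immediately, before `data` can be released.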
@@ -261,6 +261,20 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
  API_END();
}

XGB_DLL int XGDMatrixCreateFromArray(char const *data,
                                     char const *c_json_config,
                                     DMatrixHandle *out) {
  API_BEGIN();
  xgboost::data::ArrayAdapter adapter{
      xgboost::data::ArrayAdapter(StringView{data})};
  auto config = Json::Load(StringView{c_json_config});
  float missing = GetMissing(config);
  auto nthread = get<Integer const>(config["nthread"]);
  *out =
      new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
  API_END();
}

XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
                                     const unsigned* indices,
                                     const bst_float* data,
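Putting the two sides together, here is a hedged end-to-end sketch of driving this entry point through raw ctypes; the shared-library path, NaN-as-missing config, and minimal error handling are assumptions for illustration, not part of the commit:

import ctypes
import json

import numpy as np

lib = ctypes.cdll.LoadLibrary("libxgboost.so")  # adjust path per platform

X = np.random.rand(4, 3).astype(np.float32)
interface = bytes(json.dumps(X.__array_interface__), "utf-8")
config = bytes(json.dumps({"missing": float("nan"), "nthread": 1}), "utf-8")

handle = ctypes.c_void_p()
ret = lib.XGDMatrixCreateFromArray(interface, config, ctypes.byref(handle))
assert ret == 0, "creation failed; see XGBGetLastError for details"
lib.XGDMatrixFree(handle)  # release through the existing C API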
@@ -231,6 +231,10 @@ class DenseAdapter : public detail::SingleBatchDataIter<DenseAdapterBatch> {
};

class ArrayAdapterBatch : public detail::NoMetaInfo {
 public:
  static constexpr bool kIsRowMajor = true;

 private:
  ArrayInterface array_interface_;

  class Line {
@@ -253,6 +257,7 @@ class ArrayAdapterBatch : public detail::NoMetaInfo {
  Line const GetLine(size_t idx) const {
    return Line{array_interface_, idx};
  }
  size_t Size() const { return array_interface_.num_rows; }

  explicit ArrayAdapterBatch(ArrayInterface array_interface)
      : array_interface_{std::move(array_interface)} {}
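Conceptually, the batch exposes the 2-D array as `num_rows` lines whose elements are consumed downstream with values equal to `missing` filtered out; a rough Python analogy of that iteration contract (not the C++ code, names invented for illustration):

import numpy as np

def iter_elements(array, missing):
    # Batch -> GetLine(i) -> per-line (row, column, value) elements.
    n_rows, n_cols = array.shape  # Size() is the number of lines
    for i in range(n_rows):
        for j in range(n_cols):
            v = float(array[i, j])
            is_missing = np.isnan(v) if np.isnan(missing) else v == missing
            if not is_missing:
                yield i, j, v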
@@ -803,6 +803,9 @@ DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
template DMatrix* DMatrix::Create<data::DenseAdapter>(
    data::DenseAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::ArrayAdapter>(
    data::ArrayAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
template DMatrix* DMatrix::Create<data::CSRAdapter>(
    data::CSRAdapter* adapter, float missing, int nthread,
    const std::string& cache_prefix, size_t page_size);
@@ -1037,6 +1040,8 @@ void SparsePage::PushCSC(const SparsePage &batch) {
template uint64_t
SparsePage::Push(const data::DenseAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::ArrayAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSRAdapterBatch& batch, float missing, int nthread);
template uint64_t
SparsePage::Push(const data::CSCAdapterBatch& batch, float missing, int nthread);
@@ -203,6 +203,8 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {

template SimpleDMatrix::SimpleDMatrix(DenseAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(ArrayAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRAdapter* adapter, float missing,
                                      int nthread);
template SimpleDMatrix::SimpleDMatrix(CSRArrayAdapter* adapter, float missing,