From a985a99cf0dacb26a5d734835473d492d3c2a0df Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 18 Feb 2019 22:21:34 +0800 Subject: [PATCH] Accept numpy array view. (#4147) * Accept array view (slice) in metainfo. --- include/xgboost/c_api.h | 36 +++++++++++++++++++++- include/xgboost/data.h | 1 + python-package/xgboost/core.py | 50 +++++++++++++++++++++--------- src/c_api/c_api.cc | 46 +++++++++++++++++++++------- src/data/data.cc | 56 ++++++++++++++++++++++++---------- tests/python/test_basic.py | 6 ++++ 6 files changed, 152 insertions(+), 43 deletions(-) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 787397ad9..452cb33ef 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2015-2019 by Contributors * \file c_api.h * \author Tianqi Chen * \brief C API of XGBoost, used for interfacing to other languages. @@ -283,6 +283,23 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const float *array, bst_ulong len); +/*! + * \brief `XGDMatrixSetFloatInfo' with strided array as input. + * + * \param handle a instance of data matrix + * \param field field name, can be label, weight + * \param array pointer to float vector + * \param stride stride of input vector + * \param len length of array + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetFloatInfoStrided(DMatrixHandle handle, + const char *field, + const float *array, + const bst_ulong stride, + bst_ulong len); + /*! * \brief set uint32 vector to a content in info * \param handle a instance of data matrix @@ -295,6 +312,23 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len); + +/*! + * \brief `XGDMatrixSetUIntInfo' with strided array as input. 
+ * + * \param handle a instance of data matrix + * \param field field name + * \param array pointer to unsigned int vector + * \param stride stride of input vector + * \param len length of array + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetUIntInfoStrided(DMatrixHandle handle, + const char *field, + const unsigned *array, + const bst_ulong stride, + bst_ulong len); /*! * \brief set label of the training matrix * \param handle a instance of data matrix diff --git a/include/xgboost/data.h b/include/xgboost/data.h index e2d800ca4..5e0e84b65 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -122,6 +122,7 @@ class MetaInfo { * \param num Number of elements in the source array. */ void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num); + void SetInfo(const char* key, const void* dptr, DataType dtype, size_t stride, size_t num); private: /*! \brief argsort of labels */ diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index c03079d1e..34bc68031 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -219,6 +219,17 @@ def c_array(ctype, values): return (ctype * len(values))(*values) +def _get_length_and_stride(data): + "Return length and stride of 1-D data." 
+ stride = 1 + if isinstance(data, np.ndarray) and data.ndim == 1 and len(data) > 1: + stride, rem = divmod(data.strides[0], data.dtype.itemsize) + if stride < 1 or rem: + raise ValueError('Unsupported array layout; pass a C-contiguous copy.') + length = (len(data) - 1) * stride + 1 if len(data) else 0 # elements spanned from first to last item + return length, stride + + PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'float32': 'float', 'float64': 'float', @@ -585,10 +596,13 @@ class DMatrix(object): The array of data to be set """ c_data = c_array(ctypes.c_float, data) - _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle, - c_str(field), - c_data, - c_bst_ulong(len(data)))) + length, stride = len(data), 1 # c_array builds a packed ctypes array, so the view's stride no longer applies + _check_call(_LIB.XGDMatrixSetFloatInfoStrided( + self.handle, + c_str(field), + c_data, + c_bst_ulong(stride), + c_bst_ulong(length))) def set_float_info_npy2d(self, field, data): """Set float type property into the DMatrix @@ -604,10 +618,13 @@ class DMatrix(object): """ data = np.array(data, copy=False, dtype=np.float32) c_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - _check_call(_LIB.XGDMatrixSetFloatInfo(self.handle, - c_str(field), - c_data, - c_bst_ulong(len(data)))) + length, stride = _get_length_and_stride(data) + _check_call(_LIB.XGDMatrixSetFloatInfoStrided( + self.handle, + c_str(field), + c_data, + c_bst_ulong(stride), + c_bst_ulong(length))) def set_uint_info(self, field, data): """Set uint type property into the DMatrix. 
@@ -620,10 +637,15 @@ class DMatrix(object): data: numpy array The array of data to be set """ - _check_call(_LIB.XGDMatrixSetUIntInfo(self.handle, - c_str(field), - c_array(ctypes.c_uint, data), - c_bst_ulong(len(data)))) + data = np.array(data, copy=False, dtype=ctypes.c_uint) + c_data = c_array(ctypes.c_uint, data) + length, stride = len(data), 1 # c_array builds a packed ctypes array, so the view's stride no longer applies + _check_call(_LIB.XGDMatrixSetUIntInfoStrided( + self.handle, + c_str(field), + c_data, + c_bst_ulong(stride), + c_bst_ulong(length))) def save_binary(self, fname, silent=True): """Save DMatrix to an XGBoost buffer. @@ -719,9 +741,7 @@ class DMatrix(object): group : array like Group size of each group """ - _check_call(_LIB.XGDMatrixSetGroup(self.handle, - c_array(ctypes.c_uint, group), - c_bst_ulong(len(group)))) + self.set_uint_info('group', group) def get_label(self): """Get the label of the DMatrix. diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 45749a7db..34901d967 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014 by Contributors +// Copyright (c) 2014-2019 by Contributors #include #include @@ -768,9 +768,9 @@ XGB_DLL int XGDMatrixSaveBinary(DMatrixHandle handle, } XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, - const char* field, - const bst_float* info, - xgboost::bst_ulong len) { + const char* field, + const xgboost::bst_float* info, + xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); static_cast*>(handle) @@ -778,14 +778,38 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, API_END(); } -XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, - const char* field, - const unsigned* info, - xgboost::bst_ulong len) { +XGB_DLL int XGDMatrixSetFloatInfoStrided(DMatrixHandle handle, + const char* field, + const xgboost::bst_float* info, + const xgboost::bst_ulong stride, + xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); static_cast*>(handle) - ->get()->Info().SetInfo(field, info, kUInt32, len); + 
->get()->Info().SetInfo(field, info, kFloat32, stride, len); + API_END(); +} + +XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, + const char* field, + const unsigned* array, + xgboost::bst_ulong len) { + API_BEGIN(); + CHECK_HANDLE(); + static_cast*>(handle) + ->get()->Info().SetInfo(field, array, kUInt32, len); + API_END(); +} + +XGB_DLL int XGDMatrixSetUIntInfoStrided(DMatrixHandle handle, + const char* field, + const unsigned* array, + const xgboost::bst_ulong stride, + xgboost::bst_ulong len) { + API_BEGIN(); + CHECK_HANDLE(); + static_cast*>(handle) + ->get()->Info().SetInfo(field, array, kUInt32, stride, len); API_END(); } @@ -864,8 +888,8 @@ XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle, // xgboost implementation XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], - xgboost::bst_ulong len, - BoosterHandle *out) { + xgboost::bst_ulong len, + BoosterHandle *out) { API_BEGIN(); std::vector > mats; for (xgboost::bst_ulong i = 0; i < len; ++i) { diff --git a/src/data/data.cc b/src/data/data.cc index 24791d7a1..7357bb5f6 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -1,5 +1,5 @@ /*! 
- * Copyright 2015 by Contributors + * Copyright 2015-2019 by Contributors * \file data.cc */ #include @@ -100,45 +100,70 @@ inline bool MetaTryLoadFloatInfo(const std::string& fname, #define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \ switch (dtype) { \ case kFloat32: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + auto cast_ptr = reinterpret_cast(old_ptr); proc; \ + break; \ } \ case kDouble: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + auto cast_ptr = reinterpret_cast(old_ptr); proc; \ + break; \ } \ case kUInt32: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + auto cast_ptr = reinterpret_cast(old_ptr); proc; \ + break; \ } \ case kUInt64: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ + auto cast_ptr = reinterpret_cast(old_ptr); proc; \ + break; \ } \ default: LOG(FATAL) << "Unknown data type" << dtype; \ } \ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) { + this->SetInfo(key, dptr, dtype, 1, num); +} + +template <typename IterIn, typename IterOut> +void StridedCopy(IterIn in_beg, IterIn in_end, IterOut out_beg, size_t stride) { + if (stride != 1) { + const size_t n_in = static_cast<size_t>(in_end - in_beg); // index from in_beg so no iterator is stepped past in_end + for (size_t i = 0; i < n_in; i += stride) { + *out_beg = in_beg[i]; + ++out_beg; + } + } else { + // There can be builtin optimization in std::copy + std::copy(in_beg, in_end, out_beg); + } +} + +void MetaInfo::SetInfo( + const char* key, const void* dptr, DataType dtype, size_t stride, size_t num) { + if (stride == 0) { LOG(FATAL) << "SetInfo: stride must be non-zero"; } + const size_t view_length = num / stride + (num % stride != 0 ? 1 : 0); if (!std::strcmp(key, "root_index")) { - root_index_.resize(num); + root_index_.resize(view_length); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, root_index_.begin())); + StridedCopy(cast_dptr, cast_dptr + num, root_index_.begin(), stride)); } else if (!std::strcmp(key, "label")) { auto& labels = labels_.HostVector(); - labels.resize(num); + 
labels.resize(view_length); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, labels.begin())); + StridedCopy(cast_dptr, cast_dptr + num, labels.begin(), stride)); } else if (!std::strcmp(key, "weight")) { auto& weights = weights_.HostVector(); - weights.resize(num); + weights.resize(view_length); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, weights.begin())); + StridedCopy(cast_dptr, cast_dptr + num, weights.begin(), stride)); } else if (!std::strcmp(key, "base_margin")) { auto& base_margin = base_margin_.HostVector(); - base_margin.resize(num); + base_margin.resize(view_length); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, base_margin.begin())); + StridedCopy(cast_dptr, cast_dptr + num, base_margin.begin(), stride)); } else if (!std::strcmp(key, "group")) { - group_ptr_.resize(num + 1); + group_ptr_.resize(view_length+1); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1)); + StridedCopy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1, stride)); group_ptr_[0] = 0; for (size_t i = 1; i < group_ptr_.size(); ++i) { group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i]; @@ -146,7 +171,6 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t } } - DMatrix* DMatrix::Load(const std::string& uri, bool silent, bool load_row_split, diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index dc22b4d1e..316895d83 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -63,6 +63,12 @@ class TestBasic(unittest.TestCase): # assert they are the same assert np.sum(np.abs(preds2 - preds)) == 0 + def test_np_view(self): + y = np.array([12, 34, 56], np.float32)[::2] + from_view = xgb.DMatrix([], label=y).get_label() + from_array = xgb.DMatrix([], label=y + 0).get_label() + assert (from_view == from_array).all() + def test_record_results(self): dtrain = 
xgb.DMatrix(dpath + 'agaricus.txt.train') dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')