/*!
 * Copyright 2021 by XGBoost Contributors
 * \file linalg.h
 * \brief Linear algebra related utilities.
 */
#ifndef XGBOOST_LINALG_H_
#define XGBOOST_LINALG_H_

#include <dmlc/endian.h>
#include <xgboost/base.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/json.h>
#include <xgboost/span.h>

#include <algorithm>
#include <cassert>
#include <limits>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Decouple LINALG_HD from xgboost.
#ifndef LINALG_HD
#if defined(__CUDA__) || defined(__NVCC__)
#define LINALG_HD __host__ __device__
#else
#define LINALG_HD
#endif  // defined (__CUDA__) || defined(__NVCC__)
#endif  // LINALG_HD

namespace xgboost {
namespace linalg {
namespace detail {

struct ArrayInterfaceHandler {
  template <typename T>
  static constexpr char TypeChar() {
    return (std::is_floating_point<T>::value
                ? 'f'
                : (std::is_integral<T>::value ? (std::is_signed<T>::value ? 'i' : 'u') : '\0'));
  }
};

template <size_t dim, typename S, typename Head, size_t D>
constexpr size_t Offset(S (&strides)[D], size_t n, Head head) {
  static_assert(dim < D, "");
  return n + head * strides[dim];
}

template <size_t dim, typename S, size_t D, typename Head, typename... Tail>
constexpr std::enable_if_t<sizeof...(Tail) != 0, size_t> Offset(S (&strides)[D], size_t n,
                                                                Head head, Tail &&...rest) {
  static_assert(dim < D, "");
  return Offset<dim + 1>(strides, n + (head * strides[dim]), std::forward<Tail>(rest)...);
}

template <int32_t D, bool f_array = false>
constexpr void CalcStride(size_t const (&shape)[D], size_t (&stride)[D]) {
  if (f_array) {
    stride[0] = 1;
    for (int32_t s = 1; s < D; ++s) {
      stride[s] = shape[s - 1] * stride[s - 1];
    }
  } else {
    stride[D - 1] = 1;
    for (int32_t s = D - 2; s >= 0; --s) {
      stride[s] = shape[s + 1] * stride[s + 1];
    }
  }
}

struct AllTag {};

struct IntTag {};

template <typename I>
struct RangeTag {
  I beg;
  I end;
  constexpr size_t Size() const { return end - beg; }
};

/**
 * \brief Calculate the dimension of the sliced tensor.
 */
template <typename T>
constexpr int32_t CalcSliceDim() {
  return std::is_same<T, IntTag>::value ? 0 : 1;
}

template <typename T, typename... S>
constexpr std::enable_if_t<sizeof...(S) != 0, int32_t> CalcSliceDim() {
  return CalcSliceDim<T>() + CalcSliceDim<S...>();
}

template <int32_t D>
constexpr size_t CalcSize(size_t (&shape)[D]) {
  size_t size = 1;
  for (auto d : shape) {
    size *= d;
  }
  return size;
}

template <typename S>
using RemoveCRType = std::remove_const_t<std::remove_reference_t<S>>;

template <typename S>
using IndexToTag = std::conditional_t<std::is_integral<RemoveCRType<S>>::value, IntTag, S>;

template <int32_t n, typename Fn>
LINALG_HD constexpr auto UnrollLoop(Fn fn) {
#if defined __CUDA_ARCH__
#pragma unroll n
#endif  // defined __CUDA_ARCH__
  for (int32_t i = 0; i < n; ++i) {
    fn(i);
  }
}

template <typename T>
int32_t NativePopc(T v) {
  int c = 0;
  for (; v != 0; v &= v - 1) c++;
  return c;
}

inline LINALG_HD int Popc(uint32_t v) {
#if defined(__CUDA_ARCH__)
  return __popc(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcount(v);
#elif defined(_MSC_VER)
  return __popcnt(v);
#else
  return NativePopc(v);
#endif  // compiler
}

inline LINALG_HD int Popc(uint64_t v) {
#if defined(__CUDA_ARCH__)
  return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(v);
#elif defined(_MSC_VER)
  return __popcnt64(v);
#else
  return NativePopc(v);
#endif  // compiler
}

template <class T, std::size_t N, std::size_t... Idx>
constexpr auto Arr2Tup(T (&arr)[N], std::index_sequence<Idx...>) {
  return std::make_tuple(arr[Idx]...);
}

template <class T, std::size_t N>
constexpr auto Arr2Tup(T (&arr)[N]) {
  return Arr2Tup(arr, std::make_index_sequence<N>{});
}

// uint division optimization inspired by the CIndexer in cupy.  Division is slow on both
// CPU and GPU, especially for 64-bit integers.  So we first try to avoid 64-bit arithmetic
// when the index is small enough, then try to avoid division when the dimension is a
// power of 2.
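// As a worked illustration of the two branches below (not part of the implementation):
// for a dimension of size s = 8, `s & (s - 1)` is 0, so the coordinate is recovered with
// bit operations only: `idx & 7` gives the coordinate and `idx >>= Popc(7)` (Popc(7) == 3)
// removes that dimension from the linear index.  For s = 6 the general branch performs
// one division: with idx = 13, t = 13 / 6 = 2, so the coordinate is 13 - 2 * 6 = 1 and
// idx becomes 2.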
template <typename I, int32_t D>
LINALG_HD auto UnravelImpl(I idx, common::Span<size_t const, D> shape) {
  size_t index[D]{0};
  static_assert(std::is_signed<decltype(D)>::value,
                "Don't change the type without changing the for loop.");
  for (int32_t dim = D; --dim > 0;) {
    auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape[dim]);
    if (s & (s - 1)) {
      auto t = idx / s;
      index[dim] = idx - t * s;
      idx = t;
    } else {  // exp of 2
      index[dim] = idx & (s - 1);
      idx >>= Popc(s - 1);
    }
  }
  index[0] = idx;
  return Arr2Tup(index);
}

template <size_t dim, typename I, int32_t D>
void ReshapeImpl(size_t (&out_shape)[D], I s) {
  static_assert(dim < D, "");
  out_shape[dim] = s;
}

template <size_t dim, int32_t D, typename... S, typename I,
          std::enable_if_t<sizeof...(S) != 0> * = nullptr>
void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) {
  static_assert(dim < D, "");
  out_shape[dim] = s;
  ReshapeImpl<dim + 1>(out_shape, std::forward<S>(rest)...);
}

template <typename Fn, typename Tup, size_t... I>
LINALG_HD decltype(auto) constexpr Apply(Fn &&f, Tup &&t, std::index_sequence<I...>) {
  return f(std::get<I>(t)...);
}

/**
 * C++17 style apply.
 *
 * \param f function to apply
 * \param t tuple of arguments
 */
template <typename Fn, typename Tup>
LINALG_HD decltype(auto) constexpr Apply(Fn &&f, Tup &&t) {
  constexpr auto kSize = std::tuple_size<Tup>::value;
  return Apply(std::forward<Fn>(f), std::forward<Tup>(t), std::make_index_sequence<kSize>{});
}
}  // namespace detail

/**
 * \brief Specify all elements in the axis for slicing.
 */
constexpr detail::AllTag All() { return {}; }
/**
 * \brief Specify a range of elements in the axis for slicing.
 */
template <typename I>
constexpr detail::RangeTag<I> Range(I beg, I end) {
  return {beg, end};
}

/**
 * \brief A tensor view with static type and dimension.  It implements indexing and
 *        slicing.
 *
 * Most of the algorithms in XGBoost are implemented for both CPU and GPU without using
 * many linear algebra routines; this class is a helper intended to ease some high-level
 * operations like indexing into the prediction tensor or the gradient matrix.  It can be
 * passed into a CUDA kernel as a normal argument for GPU algorithms.
 *
 * Ideally we should add a template parameter `bool on_host` so that the compiler can
 * prevent passing/accessing the wrong view, but inheritance is heavily used in XGBoost,
 * so some functions expect data types that can be used everywhere (updating the
 * prediction cache, for example).
 */
template <typename T, int32_t kDim>
class TensorView {
 public:
  using ShapeT = size_t[kDim];
  using StrideT = ShapeT;

 private:
  StrideT stride_{1};
  ShapeT shape_{0};
  common::Span<T> data_;
  T *ptr_{nullptr};  // pointer of data_ to avoid bound check.

  size_t size_{0};
  int32_t device_{-1};

  // Unlike `Tensor`, the data_ can have arbitrary size since this is just a view.
  LINALG_HD void CalcSize() {
    if (data_.empty()) {
      size_ = 0;
    } else {
      size_ = detail::CalcSize(shape_);
    }
  }

  template <size_t old_dim, size_t new_dim, int32_t D, typename I>
  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
                                detail::RangeTag<I> &&range) const {
    static_assert(new_dim < D, "");
    static_assert(old_dim < kDim, "");
    new_stride[new_dim] = stride_[old_dim];
    new_shape[new_dim] = range.Size();
    assert(static_cast<size_t>(range.end) <= shape_[old_dim]);

    auto offset = stride_[old_dim] * range.beg;
    return offset;
  }
  /**
   * \brief Slice dimension for Range tag.
   */
  template <size_t old_dim, size_t new_dim, int32_t D, typename I, typename... S>
  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D],
                                detail::RangeTag<I> &&range, S &&...slices) const {
    static_assert(new_dim < D, "");
    static_assert(old_dim < kDim, "");
    new_stride[new_dim] = stride_[old_dim];
    new_shape[new_dim] = range.Size();
    assert(static_cast<size_t>(range.end) <= shape_[old_dim]);

    auto offset = stride_[old_dim] * range.beg;
    return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,
                                                     std::forward<S>(slices)...) +
           offset;
  }
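  // How the MakeSliceDim overloads compose (an illustrative sketch, not part of the API):
  // for a C-contiguous 3-d view `t` with shape {4, 3, 2} and strides {6, 2, 1},
  //
  //   t.Slice(1, All(), Range(0, 1))
  //
  // accumulates offset 1 * 6 = 6 from the integral index, keeps dimension 1 as
  // {shape 3, stride 2}, and maps the range to {shape 1, stride 1}.  The result is a 2-d
  // view over data_.subspan(6) with shape {3, 1} and stride {2, 1}.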
  template <size_t old_dim, size_t new_dim, int32_t D>
  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const {
    static_assert(new_dim < D, "");
    static_assert(old_dim < kDim, "");
    new_stride[new_dim] = stride_[old_dim];
    new_shape[new_dim] = shape_[old_dim];
    return 0;
  }
  /**
   * \brief Slice dimension for All tag.
   */
  template <size_t old_dim, size_t new_dim, int32_t D, typename... S>
  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag,
                                S &&...slices) const {
    static_assert(new_dim < D, "");
    static_assert(old_dim < kDim, "");
    new_stride[new_dim] = stride_[old_dim];
    new_shape[new_dim] = shape_[old_dim];
    return MakeSliceDim<old_dim + 1, new_dim + 1, D>(new_shape, new_stride,
                                                     std::forward<S>(slices)...);
  }

  template <size_t old_dim, size_t new_dim, int32_t D, typename Index>
  LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], Index i) const {
    static_assert(old_dim < kDim, "");
    return stride_[old_dim] * i;
  }
  /**
   * \brief Slice dimension for Index tag.
   */
  template <size_t old_dim, size_t new_dim, int32_t D, typename Index, typename... S>
  LINALG_HD std::enable_if_t<std::is_integral<Index>::value, size_t> MakeSliceDim(
      size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const {
    static_assert(old_dim < kDim, "");
    auto offset = stride_[old_dim] * i;
    auto res =
        MakeSliceDim<old_dim + 1, new_dim, D>(new_shape, new_stride, std::forward<S>(slices)...);
    return res + offset;
  }

 public:
  size_t constexpr static kValueSize = sizeof(T);
  size_t constexpr static kDimension = kDim;

 public:
  /**
   * \brief Create a tensor with data and shape.
   *
   * \tparam I     Type of the shape array element.
   * \tparam D     Size of the shape array, can be less than or equal to the tensor
   *               dimension.
   *
   * \param data   Raw data input, can be const if this tensor has const type in its
   *               template parameter.
   * \param shape  shape of the tensor
   * \param device Device ordinal
   */
  template <typename I, int32_t D>
  LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], int32_t device)
      : data_{data}, ptr_{data_.data()}, device_{device} {
    static_assert(D > 0 && D <= kDim, "Invalid shape.");
    // shape
    detail::UnrollLoop<D>([&](auto i) { shape_[i] = shape[i]; });
    for (auto i = D; i < kDim; ++i) {
      shape_[i] = 1;
    }
    // stride
    detail::CalcStride(shape_, stride_);
    // size
    this->CalcSize();
  }

  /**
   * \brief Create a tensor with data, shape and strides.  Don't use this constructor if
   *        stride can be calculated from shape.
   */
  template <typename I, int32_t D>
  LINALG_HD TensorView(common::Span<T> data, I const (&shape)[D], I const (&stride)[D],
                       int32_t device)
      : data_{data}, ptr_{data_.data()}, device_{device} {
    static_assert(D == kDim, "Invalid shape & stride.");
    detail::UnrollLoop<D>([&](auto i) {
      shape_[i] = shape[i];
      stride_[i] = stride[i];
    });
    this->CalcSize();
  }

  template <
      typename U,
      std::enable_if_t<common::detail::IsAllowedElementTypeConversion<U, T>::value> * = nullptr>
  LINALG_HD TensorView(TensorView<U, kDim> const &that)  // NOLINT
      : data_{that.Values()}, ptr_{data_.data()}, size_{that.Size()}, device_{that.DeviceIdx()} {
    detail::UnrollLoop<kDim>([&](auto i) {
      stride_[i] = that.Stride(i);
      shape_[i] = that.Shape(i);
    });
  }

  /**
   * \brief Index the tensor to obtain a scalar value.
   *
   * \code
   *
   *   // Create a 3-dim tensor.
   *   Tensor<float, 3> t {data, shape, 0};
   *   float pi = 3.14159;
   *   t(1, 2, 3) = pi;
   *   ASSERT_EQ(t(1, 2, 3), pi);
   *
   * \endcode
   */
  template <typename... Index>
  LINALG_HD T &operator()(Index &&...index) {
    static_assert(sizeof...(index) <= kDim, "Invalid index.");
    size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward<Index>(index)...);
    assert(offset < data_.size() && "Out of bound access.");
    return ptr_[offset];
  }
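  // Construction sketch (illustration only; the variable names are arbitrary): a shape
  // shorter than kDim is padded with trailing 1s by the constructors above, so a 3-dim
  // view over 24 floats can be built from a 2-element shape array:
  //
  //   std::vector<float> storage(24);
  //   size_t shape[2] = {4, 6};
  //   TensorView<float, 3> t{{storage.data(), storage.size()}, shape, -1};
  //   // t.Shape() == {4, 6, 1}, C-contiguous strides {6, 1, 1}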
  /**
   * \brief Index the tensor to obtain a scalar value.
   */
  template <typename... Index>
  LINALG_HD T const &operator()(Index &&...index) const {
    static_assert(sizeof...(index) <= kDim, "Invalid index.");
    size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward<Index>(index)...);
    assert(offset < data_.size() && "Out of bound access.");
    return ptr_[offset];
  }

  /**
   * \brief Slice the tensor.  The returned tensor has inferred dim and shape.  Scalar
   *        result is not supported.
   *
   * \code
   *
   *   // Create a 3-dim tensor.
   *   Tensor<float, 3> t {data, shape, 0};
   *   // s has 2 dimensions (matrix)
   *   auto s = t.Slice(1, All(), All());
   *
   * \endcode
   */
  template <typename... S>
  LINALG_HD auto Slice(S &&...slices) const {
    static_assert(sizeof...(slices) <= kDim, "Invalid slice.");
    int32_t constexpr kNewDim{detail::CalcSliceDim<detail::IndexToTag<S>...>()};
    size_t new_shape[kNewDim];
    size_t new_stride[kNewDim];
    auto offset = MakeSliceDim<0, 0, kNewDim>(new_shape, new_stride, std::forward<S>(slices)...);
    // ret is a different type due to changed dimension, so we can not access its private
    // fields.
    TensorView<T, kNewDim> ret{data_.subspan(data_.empty() ? 0 : offset), new_shape, new_stride,
                               device_};
    return ret;
  }

  LINALG_HD auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
  /**
   * Get the shape for the i^th dimension.
   */
  LINALG_HD auto Shape(size_t i) const { return shape_[i]; }
  LINALG_HD auto Stride() const { return common::Span<size_t const, kDim>{stride_}; }
  /**
   * Get the stride for the i^th dimension; stride is specified as number of items instead
   * of bytes.
   */
  LINALG_HD auto Stride(size_t i) const { return stride_[i]; }

  /**
   * \brief Number of items in the tensor.
   */
  LINALG_HD size_t Size() const { return size_; }
  /**
   * \brief Whether this is a contiguous array; both C and F contiguous arrays return true.
   */
  LINALG_HD bool Contiguous() const {
    return data_.size() == this->Size() || this->CContiguous() || this->FContiguous();
  }
  /**
   * \brief Whether it's a c-contiguous array.
   */
  LINALG_HD bool CContiguous() const {
    StrideT stride;
    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
    // It's contiguous if the stride can be calculated from shape.
    detail::CalcStride(shape_, stride);
    return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
  }
  /**
   * \brief Whether it's a f-contiguous array.
   */
  LINALG_HD bool FContiguous() const {
    StrideT stride;
    static_assert(std::is_same<decltype(stride), decltype(stride_)>::value, "");
    // It's contiguous if the stride can be calculated from shape.
    detail::CalcStride<kDim, true>(shape_, stride);
    return common::Span<size_t const, kDim>{stride_} == common::Span<size_t const, kDim>{stride};
  }
  /**
   * \brief Obtain a reference to the raw data.
   */
  LINALG_HD auto Values() const -> decltype(data_) const & { return data_; }
  /**
   * \brief Obtain the CUDA device ordinal.
   */
  LINALG_HD auto DeviceIdx() const { return device_; }
};

/**
 * \brief Constructor for automatic type deduction.
 */
template <typename Container, typename I, int32_t D,
          std::enable_if_t<!common::detail::IsSpan<Container>::value> * = nullptr>
auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device) {  // NOLINT
  using T = typename Container::value_type;
  return TensorView<T, D>{data, shape, device};
}

template <typename T, typename I, int32_t D>
LINALG_HD auto MakeTensorView(common::Span<T> data, I const (&shape)[D], int32_t device) {
  return TensorView<T, D>{data, shape, device};
}

/**
 * \brief Turn a linear index into a multi-dimension index.  Similar to numpy unravel.
 */
template <size_t D>
LINALG_HD auto UnravelIndex(size_t idx, common::Span<size_t const, D> shape) {
  if (idx > std::numeric_limits<uint32_t>::max()) {
    return detail::UnravelImpl<uint64_t, D>(static_cast<uint64_t>(idx), shape);
  } else {
    return detail::UnravelImpl<uint32_t, D>(static_cast<uint32_t>(idx), shape);
  }
}
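// A worked example of UnravelIndex (illustration only): for a row-major shape {3, 4},
// linear index 7 maps to the multi-dimension index (1, 3).
//
//   size_t shape[] = {3, 4};
//   auto idx = UnravelIndex(7, common::Span<size_t const, 2>{shape});
//   // std::get<0>(idx) == 1, std::get<1>(idx) == 3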
/**
 * \brief A view over a vector, specialization of TensorView.
 *
 * \tparam T data type of vector
 */
template <typename T>
using VectorView = TensorView<T, 1>;

/**
 * \brief Create a vector view from contiguous memory.
 *
 * \param ptr    Pointer to the contiguous memory.
 * \param s      Size of the vector.
 * \param device (optional) Device ordinal, defaults to host.
 */
template <typename T>
auto MakeVec(T *ptr, size_t s, int32_t device = -1) {
  using U = std::remove_const_t<std::remove_reference_t<T>> const;
  return linalg::TensorView<U, 1>{{ptr, s}, {s}, device};
}

/**
 * \brief A view over a matrix, specialization of TensorView.
 *
 * \tparam T data type of matrix
 */
template <typename T>
using MatrixView = TensorView<T, 2>;

/**
 * \brief Array Interface defined by numpy.
 *
 * `stream` is optionally included when data is on CUDA device.
 */
template <typename T, int32_t D>
Json ArrayInterface(TensorView<T const, D> const &t) {
  Json array_interface{Object{}};
  array_interface["data"] = std::vector<Json>(2);
  array_interface["data"][0] = Integer(reinterpret_cast<int64_t>(t.Values().data()));
  array_interface["data"][1] = Boolean{true};
  if (t.DeviceIdx() >= 0) {
    // Change this once we have different CUDA stream.
    array_interface["stream"] = Null{};
  }
  std::vector<Json> shape(t.Shape().size());
  std::vector<Json> stride(t.Stride().size());
  for (size_t i = 0; i < t.Shape().size(); ++i) {
    shape[i] = Integer(t.Shape(i));
    stride[i] = Integer(t.Stride(i) * sizeof(T));
  }
  array_interface["shape"] = Array{shape};
  array_interface["strides"] = Array{stride};
  array_interface["version"] = 3;
  char constexpr kT = detail::ArrayInterfaceHandler::TypeChar<T>();
  static_assert(kT != '\0', "");
  if (DMLC_LITTLE_ENDIAN) {
    array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))};
  } else {
    array_interface["typestr"] = String{">" + (kT + std::to_string(sizeof(T)))};
  }
  return array_interface;
}

/**
 * \brief Same as the const version, but returns a non-readonly data pointer.
 */
template <typename T, int32_t D>
Json ArrayInterface(TensorView<T, D> const &t) {
  TensorView<T const, D> const &as_const = t;
  auto res = ArrayInterface(as_const);
  res["data"][1] = Boolean{false};
  return res;
}

/**
 * \brief Return the string representation of the array interface.
 */
template <typename T, int32_t D>
auto ArrayInterfaceStr(TensorView<T const, D> const &t) {
  std::string str;
  Json::Dump(ArrayInterface(t), &str);
  return str;
}

template <typename T, int32_t D>
auto ArrayInterfaceStr(TensorView<T, D> const &t) {
  std::string str;
  Json::Dump(ArrayInterface(t), &str);
  return str;
}

/**
 * \brief A tensor storage.  To use it for other functionality like slicing, one needs to
 *        obtain a view first.  This way we can use it on both host and device.
 */
template <typename T, int32_t kDim>
class Tensor {
 public:
  using ShapeT = size_t[kDim];
  using StrideT = ShapeT;

 private:
  HostDeviceVector<T> data_;
  ShapeT shape_{0};

 public:
  Tensor() = default;

  /**
   * \brief Create a tensor with shape and device ordinal.  The storage is initialized
   *        automatically.
   *
   * See \ref TensorView for parameters of this constructor.
   */
  template <typename I, int32_t D>
  explicit Tensor(I const (&shape)[D], int32_t device) {
    // No device unroll as this is a host only function.
    std::copy(shape, shape + D, shape_);
    for (auto i = D; i < kDim; ++i) {
      shape_[i] = 1;
    }
    auto size = detail::CalcSize(shape_);
    if (device >= 0) {
      data_.SetDevice(device);
    }
    data_.Resize(size);
    if (device >= 0) {
      data_.DevicePointer();  // Pull to device
    }
  }
  /**
   * Initialize from 2 host iterators.
   */
  template <typename It, typename I, int32_t D>
  explicit Tensor(It begin, It end, I const (&shape)[D], int32_t device) {
    // shape
    static_assert(D <= kDim, "Invalid shape.");
    std::copy(shape, shape + D, shape_);
    for (auto i = D; i < kDim; ++i) {
      shape_[i] = 1;
    }
    auto &h_vec = data_.HostVector();
    h_vec.insert(h_vec.begin(), begin, end);
    if (device >= 0) {
      data_.SetDevice(device);
      data_.DevicePointer();  // Pull to device
    }
    CHECK_EQ(data_.Size(), detail::CalcSize(shape_));
  }
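  // Usage sketch (illustration only; names are arbitrary): storage is owned by the
  // tensor, while indexing and slicing go through a view obtained from View():
  //
  //   size_t shape[2] = {3, 4};
  //   Tensor<float, 2> m{shape, -1};      // 3x4 matrix on the host
  //   auto h_view = m.View(-1);           // TensorView<float, 2>
  //   h_view(0, 1) = 1.0f;
  //   auto row = h_view.Slice(0, All());  // 1-dim view over the first row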
  /**
   * \brief Get a \ref TensorView for this tensor.
   */
  TensorView<T, kDim> View(int32_t device) {
    if (device >= 0) {
      data_.SetDevice(device);
      auto span = data_.DeviceSpan();
      return {span, shape_, device};
    } else {
      auto span = data_.HostSpan();
      return {span, shape_, device};
    }
  }
  TensorView<T const, kDim> View(int32_t device) const {
    if (device >= 0) {
      data_.SetDevice(device);
      auto span = data_.ConstDeviceSpan();
      return {span, shape_, device};
    } else {
      auto span = data_.ConstHostSpan();
      return {span, shape_, device};
    }
  }

  size_t Size() const { return data_.Size(); }
  auto Shape() const { return common::Span<size_t const, kDim>{shape_}; }
  auto Shape(size_t i) const { return shape_[i]; }

  HostDeviceVector<T> *Data() { return &data_; }
  HostDeviceVector<T> const *Data() const { return &data_; }

  /**
   * \brief Visitor function for modifications that change both shape and data.
   *
   * \tparam Fn function that takes a pointer to `HostDeviceVector` and a statically sized
   *            span as parameters.
   */
  template <typename Fn>
  void ModifyInplace(Fn &&fn) {
    fn(this->Data(), common::Span<size_t, kDim>{this->shape_});
    CHECK_EQ(this->Data()->Size(), detail::CalcSize(this->shape_))
        << "Inconsistent size after modification.";
  }

  /**
   * \brief Reshape the tensor.
   *
   * If the total size is changed, then data in this tensor is no longer valid.
   */
  template <typename... S>
  void Reshape(S &&...s) {
    static_assert(sizeof...(S) <= kDim, "Invalid shape.");
    detail::ReshapeImpl<0>(shape_, std::forward<S>(s)...);
    auto constexpr kEnd = sizeof...(S);
    static_assert(kEnd <= kDim, "Invalid shape.");
    std::fill(shape_ + kEnd, shape_ + kDim, 1);
    auto n = detail::CalcSize(shape_);
    data_.Resize(n);
  }

  /**
   * \brief Reshape the tensor.
   *
   * If the total size is changed, then data in this tensor is no longer valid.
   */
  template <int32_t D>
  void Reshape(size_t (&shape)[D]) {
    static_assert(D <= kDim, "Invalid shape.");
    std::copy(shape, shape + D, this->shape_);
    std::fill(shape_ + D, shape_ + kDim, 1);
    auto n = detail::CalcSize(shape_);
    data_.Resize(n);
  }

  /**
   * \brief Set device ordinal for this tensor.
   */
  void SetDevice(int32_t device) { data_.SetDevice(device); }
};

// Only stacking along the first axis is supported for now.
template <typename T, int32_t D>
void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
  if (r.Data()->DeviceIdx() >= 0) {
    l->Data()->SetDevice(r.Data()->DeviceIdx());
  }
  l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
    for (size_t i = 1; i < D; ++i) {
      if (shape[i] == 0) {
        shape[i] = r.Shape(i);
      } else {
        CHECK_EQ(shape[i], r.Shape(i));
      }
    }

    data->Extend(*r.Data());
    shape[0] = l->Shape(0) + r.Shape(0);
  });
}
}  // namespace linalg
}  // namespace xgboost

#if defined(LINALG_HD)
#undef LINALG_HD
#endif  // defined(LINALG_HD)
#endif  // XGBOOST_LINALG_H_