/**
 * Copyright 2017-2023 by XGBoost Contributors
 * \file compressed_iterator.h
 */
#pragma once
#include <xgboost/base.h>

#include <algorithm>
#include <cmath>
#include <cstddef>  // for size_t

#include "common.h"

#if defined(__CUDACC__)
#include "device_helpers.cuh"
#elif defined(__HIP_PLATFORM_AMD__)
#include "device_helpers.hip.h"
#endif  // __CUDACC__ || __HIP_PLATFORM_AMD__

namespace xgboost {
namespace common {

using CompressedByteT = unsigned char;

namespace detail {
inline void SetBit(CompressedByteT *byte, int bit_idx) {
  *byte |= 1 << bit_idx;
}
template <typename T>
inline T CheckBit(const T &byte, int bit_idx) {
  return byte & (1 << bit_idx);
}
inline void ClearBit(CompressedByteT *byte, int bit_idx) {
  *byte &= ~(1 << bit_idx);
}
static const int kPadding = 4;  // Assign padding so we can read slightly off
                                // the beginning of the array

// The number of bits required to represent a given unsigned range
inline XGBOOST_DEVICE size_t SymbolBits(size_t num_symbols) {
  auto bits = std::ceil(log2(static_cast<double>(num_symbols)));
  return common::Max(static_cast<size_t>(bits), static_cast<size_t>(1));
}
}  // namespace detail
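
// Worked example (illustrative, not from the original header): SymbolBits
// returns the number of bits needed to represent one symbol, with a floor of
// a single bit.
//   SymbolBits(2)   == 1   (values 0..1 fit in one bit)
//   SymbolBits(256) == 8   (values 0..255 fit in one byte)
//   SymbolBits(257) == 9   (ceil(log2(257)) rounds up to the next bit)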
/**
 * \class CompressedBufferWriter
 *
 * \brief Writes bit compressed symbols to a memory buffer. Use
 * CompressedIterator to read symbols back from buffer. Currently limited to a
 * maximum symbol size of 28 bits.
 *
 * \author Rory
 * \date 7/9/2017
 */
class CompressedBufferWriter {
  size_t symbol_bits_;

 public:
  XGBOOST_DEVICE explicit CompressedBufferWriter(size_t num_symbols)
      : symbol_bits_(detail::SymbolBits(num_symbols)) {}

  /**
   * \fn static size_t CompressedBufferWriter::CalculateBufferSize(size_t
   * num_elements, size_t num_symbols)
   *
   * \brief Calculates number of bytes required for a given number of elements
   * and a symbol range.
   *
   * \author Rory
   * \date 7/9/2017
   *
   * \param num_elements Number of elements.
   * \param num_symbols  Max number of symbols (alphabet size).
   *
   * \return The calculated buffer size.
   */
  static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
    constexpr int kBitsPerByte = 8;
    size_t compressed_size = static_cast<size_t>(std::ceil(
        static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
        kBitsPerByte));
    // Handle atomicOr where input must be unsigned int, hence 4 bytes aligned.
    size_t ret =
        std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
                  static_cast<double>(sizeof(unsigned int))) *
        sizeof(unsigned int);
    return ret;
  }

  template <typename T>
  void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) {
    const int bits_per_byte = 8;

    for (size_t i = 0; i < symbol_bits_; i++) {
      size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte;
      byte_idx += detail::kPadding;
      size_t bit_idx =
          ((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte;

      if (detail::CheckBit(symbol, i)) {
        detail::SetBit(&buffer[byte_idx], bit_idx);
      } else {
        detail::ClearBit(&buffer[byte_idx], bit_idx);
      }
    }
  }

#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
  __device__ void AtomicWriteSymbol
  (CompressedByteT* buffer, uint64_t symbol, size_t offset) {
    size_t ibit_start = offset * symbol_bits_;
    size_t ibit_end = (offset + 1) * symbol_bits_ - 1;
    size_t ibyte_start = ibit_start / 8, ibyte_end = ibit_end / 8;

    symbol <<= 7 - ibit_end % 8;
    for (ptrdiff_t ibyte = ibyte_end;
         ibyte >= static_cast<ptrdiff_t>(ibyte_start); --ibyte) {
      dh::AtomicOrByte(
          reinterpret_cast<unsigned int *>(buffer + detail::kPadding), ibyte,
          symbol & 0xff);
      symbol >>= 8;
    }
  }
#endif  // __CUDACC__ || __HIP_PLATFORM_AMD__

  template <typename IterT>
  void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
    uint64_t tmp = 0;
    size_t stored_bits = 0;
    const size_t max_stored_bits = 64 - symbol_bits_;
    size_t buffer_position = detail::kPadding;
    const size_t num_symbols = input_end - input_begin;
    for (size_t i = 0; i < num_symbols; i++) {
      typename std::iterator_traits<IterT>::value_type symbol = input_begin[i];
      if (stored_bits > max_stored_bits) {
        // Eject only full bytes
        size_t tmp_bytes = stored_bits / 8;
        for (size_t j = 0; j < tmp_bytes; j++) {
          buffer[buffer_position] = static_cast<CompressedByteT>(
              tmp >> (stored_bits - (j + 1) * 8));
          buffer_position++;
        }
        stored_bits -= tmp_bytes * 8;
        tmp &= (1 << stored_bits) - 1;
      }
      // Store symbol
      tmp <<= symbol_bits_;
      tmp |= symbol;
      stored_bits += symbol_bits_;
    }

    // Eject all bytes
    int tmp_bytes =
        static_cast<int>(std::ceil(static_cast<double>(stored_bits) / 8));
    for (int j = 0; j < tmp_bytes; j++) {
      int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
      if (shift_bits >= 0) {
        buffer[buffer_position] =
            static_cast<CompressedByteT>(tmp >> shift_bits);
      } else {
        buffer[buffer_position] =
            static_cast<CompressedByteT>(tmp << std::abs(shift_bits));
      }
      buffer_position++;
    }
  }
};
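
// Worked example (illustrative, not from the original header): with
// num_symbols = 100, each element needs SymbolBits(100) = 7 bits, so
// CalculateBufferSize(1000, 100) packs 7 * 1000 = 7000 bits into
// ceil(7000 / 8) = 875 payload bytes. Adding detail::kPadding = 4 gives 879,
// which is then rounded up to a multiple of sizeof(unsigned int) = 4, i.e.
// 880 bytes in total (the alignment assumes a 4-byte unsigned int).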
/**
 * \brief Read symbols from a bit compressed memory buffer. Usable on device and host.
 *
 * \author Rory
 * \date 7/9/2017
 *
 * \tparam T Generic type parameter.
 */
template <typename T>
class CompressedIterator {
 public:
  // Type definitions for thrust
  typedef CompressedIterator<T> self_type;  // NOLINT
  typedef ptrdiff_t difference_type;        // NOLINT
  typedef T value_type;                     // NOLINT
  typedef value_type *pointer;              // NOLINT
  typedef value_type reference;             // NOLINT

 private:
  const CompressedByteT *buffer_ {nullptr};
  size_t symbol_bits_ {0};
  size_t offset_ {0};

 public:
  CompressedIterator() = default;
  CompressedIterator(const CompressedByteT *buffer, size_t num_symbols)
      : buffer_(buffer) {
    symbol_bits_ = detail::SymbolBits(num_symbols);
  }

  XGBOOST_DEVICE reference operator*() const {
    const int bits_per_byte = 8;
    size_t start_bit_idx = ((offset_ + 1) * symbol_bits_ - 1);
    size_t start_byte_idx = start_bit_idx / bits_per_byte;
    start_byte_idx += detail::kPadding;

    // Read 5 bytes - the maximum we will need
    uint64_t tmp = static_cast<uint64_t>(buffer_[start_byte_idx - 4]) << 32 |
                   static_cast<uint64_t>(buffer_[start_byte_idx - 3]) << 24 |
                   static_cast<uint64_t>(buffer_[start_byte_idx - 2]) << 16 |
                   static_cast<uint64_t>(buffer_[start_byte_idx - 1]) << 8 |
                   buffer_[start_byte_idx];

    int bit_shift =
        (bits_per_byte - ((offset_ + 1) * symbol_bits_)) % bits_per_byte;
    tmp >>= bit_shift;
    // Mask off unneeded bits
    uint64_t mask = (static_cast<uint64_t>(1) << symbol_bits_) - 1;
    return static_cast<T>(tmp & mask);
  }

  XGBOOST_DEVICE reference operator[](size_t idx) const {
    self_type offset = (*this);
    offset.offset_ += idx;
    return *offset;
  }
};
}  // namespace common
}  // namespace xgboost
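
/**
 * Usage sketch (illustrative, not part of the original header): write a small
 * vector of integers and read each one back. Assumes <vector> and <cassert>
 * are available and that every value is smaller than the alphabet size passed
 * as num_symbols.
 *
 * \code
 *   std::vector<int> symbols{0, 5, 17, 63, 42};
 *   constexpr std::size_t kNumSymbols = 64;  // alphabet size, not element count
 *
 *   using xgboost::common::CompressedByteT;
 *   using xgboost::common::CompressedBufferWriter;
 *   using xgboost::common::CompressedIterator;
 *
 *   // Buffer sized for 5 elements of 6 bits each, plus padding and alignment.
 *   std::vector<CompressedByteT> buffer(
 *       CompressedBufferWriter::CalculateBufferSize(symbols.size(), kNumSymbols));
 *   CompressedBufferWriter writer{kNumSymbols};
 *   writer.Write(buffer.data(), symbols.begin(), symbols.end());
 *
 *   CompressedIterator<int> it{buffer.data(), kNumSymbols};
 *   for (std::size_t i = 0; i < symbols.size(); ++i) {
 *     assert(it[i] == symbols[i]);  // round trip recovers each symbol
 *   }
 * \endcode
 */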