Clang-tidy static analysis (#3222)
* Clang-tidy static analysis
* Modernise checks
* Google coding standard checks
* Identifier renaming according to Google style
This commit is contained in:
@@ -68,10 +68,10 @@ inline Float8 round(const Float8& x) {
|
||||
|
||||
// Overload std::max/min
|
||||
namespace std {
|
||||
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) {
|
||||
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) { // NOLINT
|
||||
return avx::Float8(_mm256_max_ps(a.x, b.x));
|
||||
}
|
||||
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) {
|
||||
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) { // NOLINT
|
||||
return avx::Float8(_mm256_min_ps(a.x, b.x));
|
||||
}
|
||||
} // namespace std
|
||||
@@ -172,7 +172,7 @@ inline Float8 Sigmoid(Float8 x) {
|
||||
}
|
||||
|
||||
// Store 8 gradient pairs given vectors containing gradient and Hessian
|
||||
inline void StoreGpair(xgboost::bst_gpair* dst, const Float8& grad,
|
||||
inline void StoreGpair(xgboost::GradientPair* dst, const Float8& grad,
|
||||
const Float8& hess) {
|
||||
float* ptr = reinterpret_cast<float*>(dst);
|
||||
__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
|
||||
@@ -190,11 +190,11 @@ namespace avx {
|
||||
* \brief Fallback implementation not using AVX.
|
||||
*/
|
||||
|
||||
struct Float8 {
|
||||
struct Float8 { // NOLINT
|
||||
float x[8];
|
||||
explicit Float8(const float& val) {
|
||||
for (int i = 0; i < 8; i++) {
|
||||
x[i] = val;
|
||||
for (float & i : x) {
|
||||
i = val;
|
||||
}
|
||||
}
|
||||
explicit Float8(const float* vec) {
|
||||
@@ -202,7 +202,7 @@ struct Float8 {
|
||||
x[i] = vec[i];
|
||||
}
|
||||
}
|
||||
Float8() {}
|
||||
Float8() = default;
|
||||
Float8& operator+=(const Float8& rhs) {
|
||||
for (int i = 0; i < 8; i++) {
|
||||
x[i] += rhs.x[i];
|
||||
@@ -228,7 +228,7 @@ struct Float8 {
|
||||
return *this;
|
||||
}
|
||||
void Print() {
|
||||
float* f = reinterpret_cast<float*>(&x);
|
||||
auto* f = reinterpret_cast<float*>(&x);
|
||||
printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
|
||||
f[6], f[7]);
|
||||
}
|
||||
@@ -252,10 +252,10 @@ inline Float8 operator/(Float8 lhs, const Float8& rhs) {
|
||||
}
|
||||
|
||||
// Store 8 gradient pairs given vectors containing gradient and Hessian
|
||||
inline void StoreGpair(xgboost::bst_gpair* dst, const Float8& grad,
|
||||
inline void StoreGpair(xgboost::GradientPair* dst, const Float8& grad,
|
||||
const Float8& hess) {
|
||||
for (int i = 0; i < 8; i++) {
|
||||
dst[i] = xgboost::bst_gpair(grad.x[i], hess.x[i]);
|
||||
dst[i] = xgboost::GradientPair(grad.x[i], hess.x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -269,14 +269,14 @@ inline Float8 Sigmoid(Float8 x) {
|
||||
} // namespace avx
|
||||
|
||||
namespace std {
|
||||
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) {
|
||||
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) { // NOLINT
|
||||
avx::Float8 max;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
max.x[i] = std::max(a.x[i], b.x[i]);
|
||||
}
|
||||
return max;
|
||||
}
|
||||
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) {
|
||||
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) { // NOLINT
|
||||
avx::Float8 min;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
min.x[i] = std::min(a.x[i], b.x[i]);
|
||||
|
||||
@@ -42,7 +42,7 @@ struct BitMap {
|
||||
inline void InitFromBool(const std::vector<int>& vec) {
|
||||
this->Resize(vec.size());
|
||||
// parallel over the full cases
|
||||
bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
|
||||
auto nsize = static_cast<bst_omp_uint>(vec.size() / 32);
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (bst_omp_uint i = 0; i < nsize; ++i) {
|
||||
uint32_t res = 0;
|
||||
|
||||
@@ -8,21 +8,27 @@
|
||||
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
|
||||
|
||||
/*!
 * \brief Run a statement with DType bound to the integer type selected by a
 *        type flag.
 *
 * Expands to a switch over \p dtype. For the flags xgboost::common::uint32,
 * xgboost::common::uint16 and xgboost::common::uint8 it introduces a local
 * alias DType (uint32_t, uint16_t, uint8_t respectively) and then executes
 * \p OP, which may refer to DType. Any other flag value is reported through
 * LOG(FATAL).
 *
 * \param dtype type flag expression to switch on
 * \param OP    statement executed inside the matching case
 */
#define XGBOOST_TYPE_SWITCH(dtype, OP)                            \
  switch (dtype) {                                                \
    case xgboost::common::uint32: {                               \
      using DType = uint32_t;                                     \
      OP;                                                         \
      break;                                                      \
    }                                                             \
    case xgboost::common::uint16: {                               \
      using DType = uint16_t;                                     \
      OP;                                                         \
      break;                                                      \
    }                                                             \
    case xgboost::common::uint8: {                                \
      using DType = uint8_t;                                      \
      OP;                                                         \
      break;                                                      \
    }                                                             \
    default:                                                      \
      LOG(FATAL) << "don't recognize type flag " << (dtype);      \
  }
|
||||
|
||||
#include <type_traits>
|
||||
@@ -31,11 +37,12 @@ switch (dtype) { \
|
||||
#include "hist_util.h"
|
||||
#include "../tree/fast_hist_param.h"
|
||||
|
||||
using xgboost::tree::FastHistParam;
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
using tree::FastHistParam;
|
||||
|
||||
/*! \brief indicator of data type used for storing bin id's in a column. */
|
||||
enum DataType {
|
||||
uint8 = 1,
|
||||
@@ -78,7 +85,7 @@ class ColumnMatrix {
|
||||
slot of internal buffer. */
|
||||
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
|
||||
|
||||
const bst_uint nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
|
||||
const auto nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
|
||||
// identify type of each column
|
||||
|
||||
@@ -14,7 +14,7 @@ struct RandomThreadLocalEntry {
|
||||
GlobalRandomEngine engine;
|
||||
};
|
||||
|
||||
typedef dmlc::ThreadLocalStore<RandomThreadLocalEntry> RandomThreadLocalStore;
|
||||
using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
|
||||
|
||||
GlobalRandomEngine& GlobalRandom() {
|
||||
return RandomThreadLocalStore::Get()->engine;
|
||||
|
||||
@@ -11,20 +11,20 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
typedef unsigned char compressed_byte_t;
|
||||
using CompressedByteT = unsigned char;
|
||||
|
||||
namespace detail {
|
||||
inline void SetBit(compressed_byte_t *byte, int bit_idx) {
|
||||
inline void SetBit(CompressedByteT *byte, int bit_idx) {
|
||||
*byte |= 1 << bit_idx;
|
||||
}
|
||||
/*!
 * \brief Test a single bit of an integer value.
 * \param byte    value to inspect
 * \param bit_idx zero-based bit position, counted from the least significant bit
 * \return \p byte with every bit other than \p bit_idx cleared, i.e. non-zero
 *         exactly when the requested bit is set
 */
template <typename T>
inline T CheckBit(const T &byte, int bit_idx) {
  // Build the mask in T rather than int, so that bit positions at or beyond
  // the width of int (e.g. bit 40 of a 64-bit value) are well defined.
  return byte & (static_cast<T>(1) << bit_idx);
}
|
||||
inline void ClearBit(compressed_byte_t *byte, int bit_idx) {
|
||||
inline void ClearBit(CompressedByteT *byte, int bit_idx) {
|
||||
*byte &= ~(1 << bit_idx);
|
||||
}
|
||||
static const int padding = 4; // Assign padding so we can read slightly off
|
||||
static const int kPadding = 4; // Assign padding so we can read slightly off
|
||||
// the beginning of the array
|
||||
|
||||
// The number of bits required to represent a given unsigned range
|
||||
@@ -76,16 +76,16 @@ class CompressedBufferWriter {
|
||||
size_t compressed_size = static_cast<size_t>(std::ceil(
|
||||
static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
|
||||
bits_per_byte));
|
||||
return compressed_size + detail::padding;
|
||||
return compressed_size + detail::kPadding;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void WriteSymbol(compressed_byte_t *buffer, T symbol, size_t offset) {
|
||||
void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) {
|
||||
const int bits_per_byte = 8;
|
||||
|
||||
for (size_t i = 0; i < symbol_bits_; i++) {
|
||||
size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte;
|
||||
byte_idx += detail::padding;
|
||||
byte_idx += detail::kPadding;
|
||||
size_t bit_idx =
|
||||
((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte;
|
||||
|
||||
@@ -96,20 +96,20 @@ class CompressedBufferWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
template <typename iter_t>
|
||||
void Write(compressed_byte_t *buffer, iter_t input_begin, iter_t input_end) {
|
||||
template <typename IterT>
|
||||
void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
|
||||
uint64_t tmp = 0;
|
||||
size_t stored_bits = 0;
|
||||
const size_t max_stored_bits = 64 - symbol_bits_;
|
||||
size_t buffer_position = detail::padding;
|
||||
size_t buffer_position = detail::kPadding;
|
||||
const size_t num_symbols = input_end - input_begin;
|
||||
for (size_t i = 0; i < num_symbols; i++) {
|
||||
typename std::iterator_traits<iter_t>::value_type symbol = input_begin[i];
|
||||
typename std::iterator_traits<IterT>::value_type symbol = input_begin[i];
|
||||
if (stored_bits > max_stored_bits) {
|
||||
// Eject only full bytes
|
||||
size_t tmp_bytes = stored_bits / 8;
|
||||
for (size_t j = 0; j < tmp_bytes; j++) {
|
||||
buffer[buffer_position] = static_cast<compressed_byte_t>(
|
||||
buffer[buffer_position] = static_cast<CompressedByteT>(
|
||||
tmp >> (stored_bits - (j + 1) * 8));
|
||||
buffer_position++;
|
||||
}
|
||||
@@ -129,10 +129,10 @@ class CompressedBufferWriter {
|
||||
int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
|
||||
if (shift_bits >= 0) {
|
||||
buffer[buffer_position] =
|
||||
static_cast<compressed_byte_t>(tmp >> shift_bits);
|
||||
static_cast<CompressedByteT>(tmp >> shift_bits);
|
||||
} else {
|
||||
buffer[buffer_position] =
|
||||
static_cast<compressed_byte_t>(tmp << std::abs(shift_bits));
|
||||
static_cast<CompressedByteT>(tmp << std::abs(shift_bits));
|
||||
}
|
||||
buffer_position++;
|
||||
}
|
||||
@@ -153,23 +153,21 @@ template <typename T>
|
||||
|
||||
class CompressedIterator {
|
||||
public:
|
||||
typedef CompressedIterator<T> self_type; ///< My own type
|
||||
typedef ptrdiff_t
|
||||
difference_type; ///< Type to express the result of subtracting
|
||||
/// one iterator from another
|
||||
typedef T value_type; ///< The type of the element the iterator can point to
|
||||
typedef value_type *pointer; ///< The type of a pointer to an element the
|
||||
/// iterator can point to
|
||||
typedef value_type reference; ///< The type of a reference to an element the
|
||||
/// iterator can point to
|
||||
// Type definitions for thrust
|
||||
typedef CompressedIterator<T> self_type; // NOLINT
|
||||
typedef ptrdiff_t difference_type; // NOLINT
|
||||
typedef T value_type; // NOLINT
|
||||
typedef value_type *pointer; // NOLINT
|
||||
typedef value_type reference; // NOLINT
|
||||
|
||||
private:
|
||||
compressed_byte_t *buffer_;
|
||||
CompressedByteT *buffer_;
|
||||
size_t symbol_bits_;
|
||||
size_t offset_;
|
||||
|
||||
public:
|
||||
CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {}
|
||||
CompressedIterator(compressed_byte_t *buffer, int num_symbols)
|
||||
CompressedIterator(CompressedByteT *buffer, int num_symbols)
|
||||
: buffer_(buffer), offset_(0) {
|
||||
symbol_bits_ = detail::SymbolBits(num_symbols);
|
||||
}
|
||||
@@ -178,7 +176,7 @@ class CompressedIterator {
|
||||
const int bits_per_byte = 8;
|
||||
size_t start_bit_idx = ((offset_ + 1) * symbol_bits_ - 1);
|
||||
size_t start_byte_idx = start_bit_idx / bits_per_byte;
|
||||
start_byte_idx += detail::padding;
|
||||
start_byte_idx += detail::kPadding;
|
||||
|
||||
// Read 5 bytes - the maximum we will need
|
||||
uint64_t tmp = static_cast<uint64_t>(buffer_[start_byte_idx - 4]) << 32 |
|
||||
|
||||
@@ -24,33 +24,33 @@ class ConfigReaderBase {
|
||||
* \brief get current name, called after Next returns true
|
||||
* \return current parameter name
|
||||
*/
|
||||
inline const char *name(void) const {
|
||||
return s_name.c_str();
|
||||
inline const char *Name() const {
|
||||
return s_name_.c_str();
|
||||
}
|
||||
/*!
|
||||
* \brief get current value, called after Next returns true
|
||||
* \return current parameter value
|
||||
*/
|
||||
inline const char *val(void) const {
|
||||
return s_val.c_str();
|
||||
inline const char *Val() const {
|
||||
return s_val_.c_str();
|
||||
}
|
||||
/*!
|
||||
* \brief move iterator to next position
|
||||
* \return true if there is value in next position
|
||||
*/
|
||||
inline bool Next(void) {
|
||||
inline bool Next() {
|
||||
while (!this->IsEnd()) {
|
||||
GetNextToken(&s_name);
|
||||
if (s_name == "=") return false;
|
||||
if (GetNextToken(&s_buf) || s_buf != "=") return false;
|
||||
if (GetNextToken(&s_val) || s_val == "=") return false;
|
||||
GetNextToken(&s_name_);
|
||||
if (s_name_ == "=") return false;
|
||||
if (GetNextToken(&s_buf_) || s_buf_ != "=") return false;
|
||||
if (GetNextToken(&s_val_) || s_val_ == "=") return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// called before usage
|
||||
inline void Init(void) {
|
||||
ch_buf = this->GetChar();
|
||||
inline void Init() {
|
||||
ch_buf_ = this->GetChar();
|
||||
}
|
||||
|
||||
protected:
|
||||
@@ -58,38 +58,38 @@ class ConfigReaderBase {
|
||||
* \brief to be implemented by subclass,
|
||||
* get next character, return EOF if end of file
|
||||
*/
|
||||
virtual char GetChar(void) = 0;
|
||||
virtual char GetChar() = 0;
|
||||
/*! \brief to be implemented by child, check if end of stream */
|
||||
virtual bool IsEnd(void) = 0;
|
||||
virtual bool IsEnd() = 0;
|
||||
|
||||
private:
|
||||
char ch_buf;
|
||||
std::string s_name, s_val, s_buf;
|
||||
char ch_buf_;
|
||||
std::string s_name_, s_val_, s_buf_;
|
||||
|
||||
inline void SkipLine(void) {
|
||||
inline void SkipLine() {
|
||||
do {
|
||||
ch_buf = this->GetChar();
|
||||
} while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
|
||||
ch_buf_ = this->GetChar();
|
||||
} while (ch_buf_ != EOF && ch_buf_ != '\n' && ch_buf_ != '\r');
|
||||
}
|
||||
|
||||
inline void ParseStr(std::string *tok) {
|
||||
while ((ch_buf = this->GetChar()) != EOF) {
|
||||
switch (ch_buf) {
|
||||
while ((ch_buf_ = this->GetChar()) != EOF) {
|
||||
switch (ch_buf_) {
|
||||
case '\\': *tok += this->GetChar(); break;
|
||||
case '\"': return;
|
||||
case '\r':
|
||||
case '\n': LOG(FATAL)<< "ConfigReader: unterminated string";
|
||||
default: *tok += ch_buf;
|
||||
default: *tok += ch_buf_;
|
||||
}
|
||||
}
|
||||
LOG(FATAL) << "ConfigReader: unterminated string";
|
||||
}
|
||||
inline void ParseStrML(std::string *tok) {
|
||||
while ((ch_buf = this->GetChar()) != EOF) {
|
||||
switch (ch_buf) {
|
||||
while ((ch_buf_ = this->GetChar()) != EOF) {
|
||||
switch (ch_buf_) {
|
||||
case '\\': *tok += this->GetChar(); break;
|
||||
case '\'': return;
|
||||
default: *tok += ch_buf;
|
||||
default: *tok += ch_buf_;
|
||||
}
|
||||
}
|
||||
LOG(FATAL) << "unterminated string";
|
||||
@@ -98,24 +98,24 @@ class ConfigReaderBase {
|
||||
inline bool GetNextToken(std::string *tok) {
|
||||
tok->clear();
|
||||
bool new_line = false;
|
||||
while (ch_buf != EOF) {
|
||||
switch (ch_buf) {
|
||||
while (ch_buf_ != EOF) {
|
||||
switch (ch_buf_) {
|
||||
case '#' : SkipLine(); new_line = true; break;
|
||||
case '\"':
|
||||
if (tok->length() == 0) {
|
||||
ParseStr(tok); ch_buf = this->GetChar(); return new_line;
|
||||
ParseStr(tok); ch_buf_ = this->GetChar(); return new_line;
|
||||
} else {
|
||||
LOG(FATAL) << "ConfigReader: token followed directly by string";
|
||||
}
|
||||
case '\'':
|
||||
if (tok->length() == 0) {
|
||||
ParseStrML(tok); ch_buf = this->GetChar(); return new_line;
|
||||
ParseStrML(tok); ch_buf_ = this->GetChar(); return new_line;
|
||||
} else {
|
||||
LOG(FATAL) << "ConfigReader: token followed directly by string";
|
||||
}
|
||||
case '=':
|
||||
if (tok->length() == 0) {
|
||||
ch_buf = this->GetChar();
|
||||
ch_buf_ = this->GetChar();
|
||||
*tok = '=';
|
||||
}
|
||||
return new_line;
|
||||
@@ -124,12 +124,12 @@ class ConfigReaderBase {
|
||||
if (tok->length() == 0) new_line = true;
|
||||
case '\t':
|
||||
case ' ' :
|
||||
ch_buf = this->GetChar();
|
||||
ch_buf_ = this->GetChar();
|
||||
if (tok->length() != 0) return new_line;
|
||||
break;
|
||||
default:
|
||||
*tok += ch_buf;
|
||||
ch_buf = this->GetChar();
|
||||
*tok += ch_buf_;
|
||||
ch_buf_ = this->GetChar();
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -149,19 +149,19 @@ class ConfigStreamReader: public ConfigReaderBase {
|
||||
* \brief constructor
|
||||
* \param fin istream input stream
|
||||
*/
|
||||
explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}
|
||||
explicit ConfigStreamReader(std::istream &fin) : fin_(fin) {}
|
||||
|
||||
protected:
|
||||
virtual char GetChar(void) {
|
||||
return fin.get();
|
||||
char GetChar() override {
|
||||
return fin_.get();
|
||||
}
|
||||
/*! \brief check if the underlying input stream has reached its end */
|
||||
virtual bool IsEnd(void) {
|
||||
return fin.eof();
|
||||
bool IsEnd() override {
|
||||
return fin_.eof();
|
||||
}
|
||||
|
||||
private:
|
||||
std::istream &fin;
|
||||
std::istream &fin_;
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -173,20 +173,20 @@ class ConfigIterator: public ConfigStreamReader {
|
||||
* \brief constructor
|
||||
* \param fname name of configure file
|
||||
*/
|
||||
explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
|
||||
fi.open(fname);
|
||||
if (fi.fail()) {
|
||||
explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi_) {
|
||||
fi_.open(fname);
|
||||
if (fi_.fail()) {
|
||||
LOG(FATAL) << "cannot open file " << fname;
|
||||
}
|
||||
ConfigReaderBase::Init();
|
||||
}
|
||||
/*! \brief destructor */
|
||||
~ConfigIterator(void) {
|
||||
fi.close();
|
||||
~ConfigIterator() {
|
||||
fi_.close();
|
||||
}
|
||||
|
||||
private:
|
||||
std::ifstream fi;
|
||||
std::ifstream fi_;
|
||||
};
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -25,16 +25,16 @@
|
||||
|
||||
namespace dh {
|
||||
|
||||
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
|
||||
#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
|
||||
#define DEV_INLINE __device__ __forceinline__
|
||||
|
||||
/*
|
||||
* Error handling functions
|
||||
*/
|
||||
|
||||
#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)
|
||||
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
|
||||
inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
|
||||
int line) {
|
||||
if (code != cudaSuccess) {
|
||||
std::stringstream ss;
|
||||
@@ -48,9 +48,9 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
|
||||
}
|
||||
|
||||
#ifdef XGBOOST_USE_NCCL
|
||||
#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
|
||||
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
|
||||
|
||||
inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
|
||||
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
|
||||
int line) {
|
||||
if (code != ncclSuccess) {
|
||||
std::stringstream ss;
|
||||
@@ -64,16 +64,16 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
|
||||
#endif
|
||||
|
||||
template <typename T>
|
||||
T *raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
T *Raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T *raw(const thrust::device_vector<T> &v) { // NOLINT
|
||||
const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
inline int n_visible_devices() {
|
||||
inline int NVisibleDevices() {
|
||||
int n_visgpus = 0;
|
||||
|
||||
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
|
||||
@@ -81,40 +81,40 @@ inline int n_visible_devices() {
|
||||
return n_visgpus;
|
||||
}
|
||||
|
||||
inline int n_devices_all(int n_gpus) {
|
||||
int n_devices_visible = dh::n_visible_devices();
|
||||
inline int NDevicesAll(int n_gpus) {
|
||||
int n_devices_visible = dh::NVisibleDevices();
|
||||
int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus;
|
||||
return (n_devices);
|
||||
}
|
||||
inline int n_devices(int n_gpus, int num_rows) {
|
||||
int n_devices = dh::n_devices_all(n_gpus);
|
||||
inline int NDevices(int n_gpus, int num_rows) {
|
||||
int n_devices = dh::NDevicesAll(n_gpus);
|
||||
// fix-up device number to be limited by number of rows
|
||||
n_devices = n_devices > num_rows ? num_rows : n_devices;
|
||||
return (n_devices);
|
||||
}
|
||||
|
||||
// if n_devices=-1, then use all visible devices
|
||||
inline void synchronize_n_devices(int n_devices, std::vector<int> dList) {
|
||||
inline void SynchronizeNDevices(int n_devices, std::vector<int> dList) {
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
inline void synchronize_all() {
|
||||
for (int device_idx = 0; device_idx < n_visible_devices(); device_idx++) {
|
||||
inline void SynchronizeAll() {
|
||||
for (int device_idx = 0; device_idx < NVisibleDevices(); device_idx++) {
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaDeviceSynchronize());
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string device_name(int device_idx) {
|
||||
inline std::string DeviceName(int device_idx) {
|
||||
cudaDeviceProp prop;
|
||||
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
|
||||
return std::string(prop.name);
|
||||
}
|
||||
|
||||
inline size_t available_memory(int device_idx) {
|
||||
inline size_t AvailableMemory(int device_idx) {
|
||||
size_t device_free = 0;
|
||||
size_t device_total = 0;
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
@@ -130,20 +130,20 @@ inline size_t available_memory(int device_idx) {
|
||||
* \param device_idx Zero-based index of the device.
|
||||
*/
|
||||
|
||||
inline size_t max_shared_memory(int device_idx) {
|
||||
inline size_t MaxSharedMemory(int device_idx) {
|
||||
cudaDeviceProp prop;
|
||||
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
|
||||
return prop.sharedMemPerBlock;
|
||||
}
|
||||
|
||||
// ensure gpu_id is correct, so not dependent upon user knowing details
|
||||
inline int get_device_idx(int gpu_id) {
|
||||
inline int GetDeviceIdx(int gpu_id) {
|
||||
// protect against overrun for gpu_id
|
||||
return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
|
||||
return (std::abs(gpu_id) + 0) % dh::NVisibleDevices();
|
||||
}
|
||||
|
||||
inline void check_compute_capability() {
|
||||
int n_devices = n_visible_devices();
|
||||
inline void CheckComputeCapability() {
|
||||
int n_devices = NVisibleDevices();
|
||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||
cudaDeviceProp prop;
|
||||
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
|
||||
@@ -159,72 +159,72 @@ inline void check_compute_capability() {
|
||||
* Range iterator
|
||||
*/
|
||||
|
||||
class range {
|
||||
class Range {
|
||||
public:
|
||||
class iterator {
|
||||
friend class range;
|
||||
class Iterator {
|
||||
friend class Range;
|
||||
|
||||
public:
|
||||
__host__ __device__ int64_t operator*() const { return i_; }
|
||||
__host__ __device__ const iterator &operator++() {
|
||||
XGBOOST_DEVICE int64_t operator*() const { return i_; }
|
||||
XGBOOST_DEVICE const Iterator &operator++() {
|
||||
i_ += step_;
|
||||
return *this;
|
||||
}
|
||||
__host__ __device__ iterator operator++(int) {
|
||||
iterator copy(*this);
|
||||
XGBOOST_DEVICE Iterator operator++(int) {
|
||||
Iterator copy(*this);
|
||||
i_ += step_;
|
||||
return copy;
|
||||
}
|
||||
|
||||
__host__ __device__ bool operator==(const iterator &other) const {
|
||||
XGBOOST_DEVICE bool operator==(const Iterator &other) const {
|
||||
return i_ >= other.i_;
|
||||
}
|
||||
__host__ __device__ bool operator!=(const iterator &other) const {
|
||||
XGBOOST_DEVICE bool operator!=(const Iterator &other) const {
|
||||
return i_ < other.i_;
|
||||
}
|
||||
|
||||
__host__ __device__ void step(int s) { step_ = s; }
|
||||
XGBOOST_DEVICE void Step(int s) { step_ = s; }
|
||||
|
||||
protected:
|
||||
__host__ __device__ explicit iterator(int64_t start) : i_(start) {}
|
||||
XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
|
||||
|
||||
public:
|
||||
uint64_t i_;
|
||||
int step_ = 1;
|
||||
};
|
||||
|
||||
__host__ __device__ iterator begin() const { return begin_; }
|
||||
__host__ __device__ iterator end() const { return end_; }
|
||||
__host__ __device__ range(int64_t begin, int64_t end)
|
||||
XGBOOST_DEVICE Iterator begin() const { return begin_; } // NOLINT
|
||||
XGBOOST_DEVICE Iterator end() const { return end_; } // NOLINT
|
||||
XGBOOST_DEVICE Range(int64_t begin, int64_t end)
|
||||
: begin_(begin), end_(end) {}
|
||||
__host__ __device__ void step(int s) { begin_.step(s); }
|
||||
XGBOOST_DEVICE void Step(int s) { begin_.Step(s); }
|
||||
|
||||
private:
|
||||
iterator begin_;
|
||||
iterator end_;
|
||||
Iterator begin_;
|
||||
Iterator end_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ range grid_stride_range(T begin, T end) {
|
||||
__device__ Range GridStrideRange(T begin, T end) {
|
||||
begin += blockDim.x * blockIdx.x + threadIdx.x;
|
||||
range r(begin, end);
|
||||
r.step(gridDim.x * blockDim.x);
|
||||
Range r(begin, end);
|
||||
r.Step(gridDim.x * blockDim.x);
|
||||
return r;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ range block_stride_range(T begin, T end) {
|
||||
__device__ Range BlockStrideRange(T begin, T end) {
|
||||
begin += threadIdx.x;
|
||||
range r(begin, end);
|
||||
r.step(blockDim.x);
|
||||
Range r(begin, end);
|
||||
r.Step(blockDim.x);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Threadblock iterates over range, filling with value. Requires all threads in
|
||||
// block to be active.
|
||||
template <typename IterT, typename ValueT>
|
||||
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
|
||||
__device__ void BlockFill(IterT begin, size_t n, ValueT value) {
|
||||
for (auto i : BlockStrideRange(static_cast<size_t>(0), n)) {
|
||||
begin[i] = value;
|
||||
}
|
||||
}
|
||||
@@ -234,34 +234,34 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
*/
|
||||
|
||||
template <typename T1, typename T2>
|
||||
T1 div_round_up(const T1 a, const T2 b) {
|
||||
T1 DivRoundUp(const T1 a, const T2 b) {
|
||||
return static_cast<T1>(ceil(static_cast<double>(a) / b));
|
||||
}
|
||||
|
||||
template <typename L>
|
||||
__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) {
|
||||
for (auto i : grid_stride_range(begin, end)) {
|
||||
__global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
|
||||
for (auto i : GridStrideRange(begin, end)) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
template <typename L>
|
||||
__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end,
|
||||
__global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
|
||||
L lambda) {
|
||||
for (auto i : grid_stride_range(begin, end)) {
|
||||
for (auto i : GridStrideRange(begin, end)) {
|
||||
lambda(i, device_idx);
|
||||
}
|
||||
}
|
||||
|
||||
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
|
||||
inline void launch_n(int device_idx, size_t n, L lambda) {
|
||||
inline void LaunchN(int device_idx, size_t n, L lambda) {
|
||||
if (n == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
const int GRID_SIZE =
|
||||
static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
|
||||
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
|
||||
static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
|
||||
LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
|
||||
lambda);
|
||||
}
|
||||
|
||||
@@ -269,91 +269,91 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
|
||||
* Memory
|
||||
*/
|
||||
|
||||
enum memory_type { DEVICE, DEVICE_MANAGED };
|
||||
enum MemoryType { kDevice, kDeviceManaged };
|
||||
|
||||
template <memory_type MemoryT>
|
||||
class bulk_allocator;
|
||||
template <MemoryType MemoryT>
|
||||
class BulkAllocator;
|
||||
template <typename T>
|
||||
class dvec2;
|
||||
class DVec2;
|
||||
|
||||
template <typename T>
|
||||
class dvec {
|
||||
friend class dvec2<T>;
|
||||
class DVec {
|
||||
friend class DVec2<T>;
|
||||
|
||||
private:
|
||||
T *_ptr;
|
||||
size_t _size;
|
||||
int _device_idx;
|
||||
T *ptr_;
|
||||
size_t size_;
|
||||
int device_idx_;
|
||||
|
||||
public:
|
||||
void external_allocate(int device_idx, void *ptr, size_t size) {
|
||||
if (!empty()) {
|
||||
throw std::runtime_error("Tried to allocate dvec but already allocated");
|
||||
void ExternalAllocate(int device_idx, void *ptr, size_t size) {
|
||||
if (!Empty()) {
|
||||
throw std::runtime_error("Tried to allocate DVec but already allocated");
|
||||
}
|
||||
_ptr = static_cast<T *>(ptr);
|
||||
_size = size;
|
||||
_device_idx = device_idx;
|
||||
safe_cuda(cudaSetDevice(_device_idx));
|
||||
ptr_ = static_cast<T *>(ptr);
|
||||
size_ = size;
|
||||
device_idx_ = device_idx;
|
||||
safe_cuda(cudaSetDevice(device_idx_));
|
||||
}
|
||||
|
||||
dvec() : _ptr(NULL), _size(0), _device_idx(-1) {}
|
||||
size_t size() const { return _size; }
|
||||
int device_idx() const { return _device_idx; }
|
||||
bool empty() const { return _ptr == NULL || _size == 0; }
|
||||
DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
|
||||
size_t Size() const { return size_; }
|
||||
int DeviceIdx() const { return device_idx_; }
|
||||
bool Empty() const { return ptr_ == NULL || size_ == 0; }
|
||||
|
||||
T *data() { return _ptr; }
|
||||
T *Data() { return ptr_; }
|
||||
|
||||
const T *data() const { return _ptr; }
|
||||
const T *Data() const { return ptr_; }
|
||||
|
||||
std::vector<T> as_vector() const {
|
||||
std::vector<T> h_vector(size());
|
||||
safe_cuda(cudaSetDevice(_device_idx));
|
||||
safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
|
||||
std::vector<T> AsVector() const {
|
||||
std::vector<T> h_vector(Size());
|
||||
safe_cuda(cudaSetDevice(device_idx_));
|
||||
safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return h_vector;
|
||||
}
|
||||
|
||||
void fill(T value) {
|
||||
auto d_ptr = _ptr;
|
||||
launch_n(_device_idx, size(),
|
||||
void Fill(T value) {
|
||||
auto d_ptr = ptr_;
|
||||
LaunchN(device_idx_, Size(),
|
||||
[=] __device__(size_t idx) { d_ptr[idx] = value; });
|
||||
}
|
||||
|
||||
void print() {
|
||||
auto h_vector = this->as_vector();
|
||||
void Print() {
|
||||
auto h_vector = this->AsVector();
|
||||
for (auto e : h_vector) {
|
||||
std::cout << e << " ";
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(_ptr); }
|
||||
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(ptr_); }
|
||||
|
||||
thrust::device_ptr<T> tend() {
|
||||
return thrust::device_pointer_cast(_ptr + size());
|
||||
return thrust::device_pointer_cast(ptr_ + Size());
|
||||
}
|
||||
|
||||
template <typename T2>
|
||||
dvec &operator=(const std::vector<T2> &other) {
|
||||
DVec &operator=(const std::vector<T2> &other) {
|
||||
this->copy(other.begin(), other.end());
|
||||
return *this;
|
||||
}
|
||||
|
||||
dvec &operator=(dvec<T> &other) {
|
||||
if (other.size() != size()) {
|
||||
DVec &operator=(DVec<T> &other) {
|
||||
if (other.Size() != Size()) {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign dvec to dvec, sizes are different");
|
||||
"Cannot copy assign DVec to DVec, sizes are different");
|
||||
}
|
||||
safe_cuda(cudaSetDevice(this->device_idx()));
|
||||
if (other.device_idx() == this->device_idx()) {
|
||||
dh::safe_cuda(cudaMemcpy(this->data(), other.data(),
|
||||
other.size() * sizeof(T),
|
||||
safe_cuda(cudaSetDevice(this->DeviceIdx()));
|
||||
if (other.DeviceIdx() == this->DeviceIdx()) {
|
||||
dh::safe_cuda(cudaMemcpy(this->Data(), other.Data(),
|
||||
other.Size() * sizeof(T),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
} else {
|
||||
std::cout << "deviceother: " << other.device_idx()
|
||||
<< " devicethis: " << this->device_idx() << std::endl;
|
||||
std::cout << "size deviceother: " << other.size()
|
||||
<< " devicethis: " << this->device_idx() << std::endl;
|
||||
std::cout << "deviceother: " << other.DeviceIdx()
|
||||
<< " devicethis: " << this->DeviceIdx() << std::endl;
|
||||
std::cout << "size deviceother: " << other.Size()
|
||||
<< " devicethis: " << this->DeviceIdx() << std::endl;
|
||||
throw std::runtime_error("Cannot copy to/from different devices");
|
||||
}
|
||||
|
||||
@@ -362,177 +362,178 @@ class dvec {
|
||||
|
||||
template <typename IterT>
|
||||
void copy(IterT begin, IterT end) {
|
||||
safe_cuda(cudaSetDevice(this->device_idx()));
|
||||
if (end - begin != size()) {
|
||||
safe_cuda(cudaSetDevice(this->DeviceIdx()));
|
||||
if (end - begin != Size()) {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
"Cannot copy assign vector to DVec, sizes are different");
|
||||
}
|
||||
thrust::copy(begin, end, this->tbegin());
|
||||
}
|
||||
|
||||
void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
safe_cuda(cudaSetDevice(this->device_idx()));
|
||||
if (end - begin != size()) {
|
||||
safe_cuda(cudaSetDevice(this->DeviceIdx()));
|
||||
if (end - begin != Size()) {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
"Cannot copy assign vector to DVec, sizes are different");
|
||||
}
|
||||
safe_cuda(cudaMemcpy(this->data(), begin.get(),
|
||||
size() * sizeof(T), cudaMemcpyDefault));
|
||||
safe_cuda(cudaMemcpy(this->Data(), begin.get(),
|
||||
Size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @class dvec2 device_helpers.cuh
|
||||
* @brief wrapper for storing 2 dvec's which are needed for cub::DoubleBuffer
|
||||
* @class DVec2 device_helpers.cuh
|
||||
* @brief wrapper for storing 2 DVec's which are needed for cub::DoubleBuffer
|
||||
*/
|
||||
template <typename T>
|
||||
class dvec2 {
|
||||
class DVec2 {
|
||||
private:
|
||||
dvec<T> _d1, _d2;
|
||||
cub::DoubleBuffer<T> _buff;
|
||||
int _device_idx;
|
||||
DVec<T> d1_, d2_;
|
||||
cub::DoubleBuffer<T> buff_;
|
||||
int device_idx_;
|
||||
|
||||
public:
|
||||
void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
|
||||
if (!empty()) {
|
||||
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
|
||||
void ExternalAllocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
|
||||
if (!Empty()) {
|
||||
throw std::runtime_error("Tried to allocate DVec2 but already allocated");
|
||||
}
|
||||
_device_idx = device_idx;
|
||||
_d1.external_allocate(_device_idx, ptr1, size);
|
||||
_d2.external_allocate(_device_idx, ptr2, size);
|
||||
_buff.d_buffers[0] = static_cast<T *>(ptr1);
|
||||
_buff.d_buffers[1] = static_cast<T *>(ptr2);
|
||||
_buff.selector = 0;
|
||||
device_idx_ = device_idx;
|
||||
d1_.ExternalAllocate(device_idx_, ptr1, size);
|
||||
d2_.ExternalAllocate(device_idx_, ptr2, size);
|
||||
buff_.d_buffers[0] = static_cast<T *>(ptr1);
|
||||
buff_.d_buffers[1] = static_cast<T *>(ptr2);
|
||||
buff_.selector = 0;
|
||||
}
|
||||
dvec2() : _d1(), _d2(), _buff(), _device_idx(-1) {}
|
||||
DVec2() : d1_(), d2_(), buff_(), device_idx_(-1) {}
|
||||
|
||||
size_t size() const { return _d1.size(); }
|
||||
int device_idx() const { return _device_idx; }
|
||||
bool empty() const { return _d1.empty() || _d2.empty(); }
|
||||
size_t Size() const { return d1_.Size(); }
|
||||
int DeviceIdx() const { return device_idx_; }
|
||||
bool Empty() const { return d1_.Empty() || d2_.Empty(); }
|
||||
|
||||
cub::DoubleBuffer<T> &buff() { return _buff; }
|
||||
cub::DoubleBuffer<T> &buff() { return buff_; }
|
||||
|
||||
dvec<T> &d1() { return _d1; }
|
||||
dvec<T> &d2() { return _d2; }
|
||||
DVec<T> &D1() { return d1_; }
|
||||
|
||||
T *current() { return _buff.Current(); }
|
||||
DVec<T> &D2() { return d2_; }
|
||||
|
||||
dvec<T> &current_dvec() { return _buff.selector == 0 ? d1() : d2(); }
|
||||
T *Current() { return buff_.Current(); }
|
||||
|
||||
T *other() { return _buff.Alternate(); }
|
||||
DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
|
||||
|
||||
T *other() { return buff_.Alternate(); }
|
||||
};
|
||||
|
||||
template <memory_type MemoryT>
|
||||
class bulk_allocator {
|
||||
std::vector<char *> d_ptr;
|
||||
std::vector<size_t> _size;
|
||||
std::vector<int> _device_idx;
|
||||
template <MemoryType MemoryT>
|
||||
class BulkAllocator {
|
||||
std::vector<char *> d_ptr_;
|
||||
std::vector<size_t> size_;
|
||||
std::vector<int> device_idx_;
|
||||
|
||||
const int align = 256;
|
||||
static const int kAlign = 256;
|
||||
|
||||
size_t align_round_up(size_t n) const {
|
||||
n = (n + align - 1) / align;
|
||||
return n * align;
|
||||
size_t AlignRoundUp(size_t n) const {
|
||||
n = (n + kAlign - 1) / kAlign;
|
||||
return n * kAlign;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size) {
|
||||
return align_round_up(first_size * sizeof(T));
|
||||
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size) {
|
||||
return AlignRoundUp(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size, Args... args) {
|
||||
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
|
||||
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size, Args... args) {
|
||||
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
|
||||
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
|
||||
size_t first_size) {
|
||||
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
|
||||
first_vec->ExternalAllocate(device_idx, static_cast<void *>(ptr),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
|
||||
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
|
||||
size_t first_size, Args... args) {
|
||||
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += align_round_up(first_size * sizeof(T));
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += AlignRoundUp(first_size * sizeof(T));
|
||||
AllocateDVec(device_idx, ptr, args...);
|
||||
}
|
||||
|
||||
char *allocate_device(int device_idx, size_t bytes, memory_type t) {
|
||||
char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
|
||||
char *ptr;
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaMalloc(&ptr, bytes));
|
||||
return ptr;
|
||||
}
|
||||
template <typename T>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size) {
|
||||
return 2 * align_round_up(first_size * sizeof(T));
|
||||
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size) {
|
||||
return 2 * AlignRoundUp(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
|
||||
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
|
||||
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size, Args... args) {
|
||||
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
|
||||
size_t first_size) {
|
||||
first_vec->external_allocate(
|
||||
first_vec->ExternalAllocate(
|
||||
device_idx, static_cast<void *>(ptr),
|
||||
static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
|
||||
static_cast<void *>(ptr + AlignRoundUp(first_size * sizeof(T))),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
|
||||
size_t first_size, Args... args) {
|
||||
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += (align_round_up(first_size * sizeof(T)) * 2);
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += (AlignRoundUp(first_size * sizeof(T)) * 2);
|
||||
AllocateDVec(device_idx, ptr, args...);
|
||||
}
|
||||
|
||||
public:
|
||||
bulk_allocator() {}
|
||||
BulkAllocator() = default;
|
||||
// prevent accidental copying, moving or assignment of this object
|
||||
bulk_allocator(const bulk_allocator<MemoryT>&) = delete;
|
||||
bulk_allocator(bulk_allocator<MemoryT>&&) = delete;
|
||||
void operator=(const bulk_allocator<MemoryT>&) = delete;
|
||||
void operator=(bulk_allocator<MemoryT>&&) = delete;
|
||||
BulkAllocator(const BulkAllocator<MemoryT>&) = delete;
|
||||
BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
|
||||
void operator=(const BulkAllocator<MemoryT>&) = delete;
|
||||
void operator=(BulkAllocator<MemoryT>&&) = delete;
|
||||
|
||||
~bulk_allocator() {
|
||||
for (size_t i = 0; i < d_ptr.size(); i++) {
|
||||
if (!(d_ptr[i] == nullptr)) {
|
||||
safe_cuda(cudaSetDevice(_device_idx[i]));
|
||||
safe_cuda(cudaFree(d_ptr[i]));
|
||||
d_ptr[i] = nullptr;
|
||||
~BulkAllocator() {
|
||||
for (size_t i = 0; i < d_ptr_.size(); i++) {
|
||||
if (!(d_ptr_[i] == nullptr)) {
|
||||
safe_cuda(cudaSetDevice(device_idx_[i]));
|
||||
safe_cuda(cudaFree(d_ptr_[i]));
|
||||
d_ptr_[i] = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns sum of bytes for all allocations
|
||||
size_t size() {
|
||||
return std::accumulate(_size.begin(), _size.end(), static_cast<size_t>(0));
|
||||
size_t Size() {
|
||||
return std::accumulate(size_.begin(), size_.end(), static_cast<size_t>(0));
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void allocate(int device_idx, bool silent, Args... args) {
|
||||
size_t size = get_size_bytes(args...);
|
||||
void Allocate(int device_idx, bool silent, Args... args) {
|
||||
size_t size = GetSizeBytes(args...);
|
||||
|
||||
char *ptr = allocate_device(device_idx, size, MemoryT);
|
||||
char *ptr = AllocateDevice(device_idx, size, MemoryT);
|
||||
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
AllocateDVec(device_idx, ptr, args...);
|
||||
|
||||
d_ptr.push_back(ptr);
|
||||
_size.push_back(size);
|
||||
_device_idx.push_back(device_idx);
|
||||
d_ptr_.push_back(ptr);
|
||||
size_.push_back(size);
|
||||
device_idx_.push_back(device_idx);
|
||||
|
||||
if (!silent) {
|
||||
const int mb_size = 1048576;
|
||||
LOG(CONSOLE) << "Allocated " << size / mb_size << "MB on [" << device_idx
|
||||
<< "] " << device_name(device_idx) << ", "
|
||||
<< available_memory(device_idx) / mb_size << "MB remaining.";
|
||||
<< "] " << DeviceName(device_idx) << ", "
|
||||
<< AvailableMemory(device_idx) / mb_size << "MB remaining.";
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -543,7 +544,7 @@ struct CubMemory {
|
||||
size_t temp_storage_bytes;
|
||||
|
||||
// Thrust
|
||||
typedef char value_type;
|
||||
using ValueT = char;
|
||||
|
||||
CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}
|
||||
|
||||
@@ -568,17 +569,18 @@ struct CubMemory {
|
||||
}
|
||||
}
|
||||
// Thrust
|
||||
char *allocate(std::ptrdiff_t num_bytes) {
|
||||
char *allocate(std::ptrdiff_t num_bytes) { // NOLINT
|
||||
LazyAllocate(num_bytes);
|
||||
return reinterpret_cast<char *>(d_temp_storage);
|
||||
}
|
||||
|
||||
// Thrust
|
||||
void deallocate(char *ptr, size_t n) {
|
||||
void deallocate(char *ptr, size_t n) { // NOLINT
|
||||
|
||||
// Do nothing
|
||||
}
|
||||
|
||||
bool IsAllocated() { return d_temp_storage != NULL; }
|
||||
bool IsAllocated() { return d_temp_storage != nullptr; }
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -586,7 +588,7 @@ struct CubMemory {
|
||||
*/
|
||||
|
||||
template <typename T>
|
||||
void print(const dvec<T> &v, size_t max_items = 10) {
|
||||
void Print(const DVec<T> &v, size_t max_items = 10) {
|
||||
std::vector<T> h = v.as_vector();
|
||||
for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
|
||||
std::cout << " " << h[i];
|
||||
@@ -609,14 +611,14 @@ void print(const dvec<T> &v, size_t max_items = 10) {
|
||||
|
||||
// Load balancing search
|
||||
|
||||
template <typename coordinate_t, typename segments_t, typename offset_t>
|
||||
void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
|
||||
size_t num_tiles, int tile_size, segments_t segments,
|
||||
offset_t num_rows, offset_t num_elements) {
|
||||
dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
|
||||
offset_t diagonal = idx * tile_size;
|
||||
coordinate_t tile_coordinate;
|
||||
cub::CountingInputIterator<offset_t> nonzero_indices(0);
|
||||
template <typename CoordinateT, typename SegmentT, typename OffsetT>
|
||||
void FindMergePartitions(int device_idx, CoordinateT *d_tile_coordinates,
|
||||
size_t num_tiles, int tile_size, SegmentT segments,
|
||||
OffsetT num_rows, OffsetT num_elements) {
|
||||
dh::LaunchN(device_idx, num_tiles + 1, [=] __device__(int idx) {
|
||||
OffsetT diagonal = idx * tile_size;
|
||||
CoordinateT tile_coordinate;
|
||||
cub::CountingInputIterator<OffsetT> nonzero_indices(0);
|
||||
|
||||
// Search the merge path
|
||||
// Cast to signed integer as this function can have negatives
|
||||
@@ -630,27 +632,27 @@ void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
|
||||
}
|
||||
|
||||
template <int TILE_SIZE, int ITEMS_PER_THREAD, int BLOCK_THREADS,
|
||||
typename offset_t, typename coordinate_t, typename func_t,
|
||||
typename segments_iter>
|
||||
__global__ void LbsKernel(coordinate_t *d_coordinates,
|
||||
segments_iter segment_end_offsets, func_t f,
|
||||
offset_t num_segments) {
|
||||
typename OffsetT, typename CoordinateT, typename FunctionT,
|
||||
typename SegmentIterT>
|
||||
__global__ void LbsKernel(CoordinateT *d_coordinates,
|
||||
SegmentIterT segment_end_offsets, FunctionT f,
|
||||
OffsetT num_segments) {
|
||||
int tile = blockIdx.x;
|
||||
coordinate_t tile_start_coord = d_coordinates[tile];
|
||||
coordinate_t tile_end_coord = d_coordinates[tile + 1];
|
||||
CoordinateT tile_start_coord = d_coordinates[tile];
|
||||
CoordinateT tile_end_coord = d_coordinates[tile + 1];
|
||||
int64_t tile_num_rows = tile_end_coord.x - tile_start_coord.x;
|
||||
int64_t tile_num_elements = tile_end_coord.y - tile_start_coord.y;
|
||||
|
||||
cub::CountingInputIterator<offset_t> tile_element_indices(tile_start_coord.y);
|
||||
coordinate_t thread_start_coord;
|
||||
cub::CountingInputIterator<OffsetT> tile_element_indices(tile_start_coord.y);
|
||||
CoordinateT thread_start_coord;
|
||||
|
||||
typedef typename std::iterator_traits<segments_iter>::value_type segment_t;
|
||||
typedef typename std::iterator_traits<SegmentIterT>::value_type SegmentT;
|
||||
__shared__ struct {
|
||||
segment_t tile_segment_end_offsets[TILE_SIZE + 1];
|
||||
segment_t output_segment[TILE_SIZE];
|
||||
SegmentT tile_segment_end_offsets[TILE_SIZE + 1];
|
||||
SegmentT output_segment[TILE_SIZE];
|
||||
} temp_storage;
|
||||
|
||||
for (auto item : dh::block_stride_range(int(0), int(tile_num_rows + 1))) {
|
||||
for (auto item : dh::BlockStrideRange(int(0), int(tile_num_rows + 1))) {
|
||||
temp_storage.tile_segment_end_offsets[item] =
|
||||
segment_end_offsets[min(static_cast<size_t>(tile_start_coord.x + item),
|
||||
static_cast<size_t>(num_segments - 1))];
|
||||
@@ -665,7 +667,7 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
|
||||
tile_element_indices, // List B
|
||||
tile_num_rows, tile_num_elements, thread_start_coord);
|
||||
|
||||
coordinate_t thread_current_coord = thread_start_coord;
|
||||
CoordinateT thread_current_coord = thread_start_coord;
|
||||
#pragma unroll
|
||||
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
|
||||
if (tile_element_indices[thread_current_coord.y] <
|
||||
@@ -679,50 +681,50 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (auto item : dh::block_stride_range(int(0), int(tile_num_elements))) {
|
||||
for (auto item : dh::BlockStrideRange(int(0), int(tile_num_elements))) {
|
||||
f(tile_start_coord.y + item, temp_storage.output_segment[item]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename func_t, typename segments_iter, typename offset_t>
|
||||
template <typename FunctionT, typename SegmentIterT, typename OffsetT>
|
||||
void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
|
||||
offset_t count, segments_iter segments,
|
||||
offset_t num_segments, func_t f) {
|
||||
typedef typename cub::CubVector<offset_t, 2>::Type coordinate_t;
|
||||
OffsetT count, SegmentIterT segments,
|
||||
OffsetT num_segments, FunctionT f) {
|
||||
typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
const int BLOCK_THREADS = 256;
|
||||
const int ITEMS_PER_THREAD = 1;
|
||||
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
|
||||
auto num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
|
||||
auto num_tiles = dh::DivRoundUp(count + num_segments, BLOCK_THREADS);
|
||||
CHECK(num_tiles < std::numeric_limits<unsigned int>::max());
|
||||
|
||||
temp_memory->LazyAllocate(sizeof(coordinate_t) * (num_tiles + 1));
|
||||
coordinate_t *tmp_tile_coordinates =
|
||||
reinterpret_cast<coordinate_t *>(temp_memory->d_temp_storage);
|
||||
temp_memory->LazyAllocate(sizeof(CoordinateT) * (num_tiles + 1));
|
||||
CoordinateT *tmp_tile_coordinates =
|
||||
reinterpret_cast<CoordinateT *>(temp_memory->d_temp_storage);
|
||||
|
||||
FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles,
|
||||
BLOCK_THREADS, segments, num_segments, count);
|
||||
|
||||
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
|
||||
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, OffsetT>
|
||||
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
|
||||
segments + 1, f, num_segments);
|
||||
}
|
||||
|
||||
template <typename func_t, typename offset_t>
|
||||
void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
|
||||
func_t f) {
|
||||
template <typename FunctionT, typename OffsetT>
|
||||
void DenseTransformLbs(int device_idx, OffsetT count, OffsetT num_segments,
|
||||
FunctionT f) {
|
||||
CHECK(count % num_segments == 0) << "Data is not dense.";
|
||||
|
||||
launch_n(device_idx, count, [=] __device__(offset_t idx) {
|
||||
offset_t segment = idx / (count / num_segments);
|
||||
LaunchN(device_idx, count, [=] __device__(OffsetT idx) {
|
||||
OffsetT segment = idx / (count / num_segments);
|
||||
f(idx, segment);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn template <typename func_t, typename segments_iter, typename offset_t>
|
||||
* void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
* segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
|
||||
* \fn template <typename FunctionT, typename SegmentIterT, typename OffsetT>
|
||||
* void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
|
||||
* SegmentIterT segments, OffsetT num_segments, bool is_dense, FunctionT f)
|
||||
*
|
||||
* \brief Load balancing search function. Reads a CSR type matrix description
|
||||
* and allows a function to be executed on each element. Search 'modern GPU load
|
||||
@@ -731,9 +733,9 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
|
||||
* \author Rory
|
||||
* \date 7/9/2017
|
||||
*
|
||||
* \tparam func_t Type of the function t.
|
||||
* \tparam segments_iter Type of the segments iterator.
|
||||
* \tparam offset_t Type of the offset.
|
||||
* \tparam FunctionT Type of the function t.
|
||||
* \tparam SegmentIterT Type of the segments iterator.
|
||||
* \tparam OffsetT Type of the offset.
|
||||
* \param device_idx Zero-based index of the device.
|
||||
* \param [in,out] temp_memory Temporary memory allocator.
|
||||
* \param count Number of elements.
|
||||
@@ -743,10 +745,10 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
|
||||
* \param f Lambda to be executed on matrix elements.
|
||||
*/
|
||||
|
||||
template <typename func_t, typename segments_iter, typename offset_t>
|
||||
void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
segments_iter segments, offset_t num_segments, bool is_dense,
|
||||
func_t f) {
|
||||
template <typename FunctionT, typename SegmentIterT, typename OffsetT>
|
||||
void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
|
||||
SegmentIterT segments, OffsetT num_segments, bool is_dense,
|
||||
FunctionT f) {
|
||||
if (is_dense) {
|
||||
DenseTransformLbs(device_idx, count, num_segments, f);
|
||||
} else {
|
||||
@@ -765,18 +767,18 @@ void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
* @param offsets the segments
|
||||
*/
|
||||
template <typename T1, typename T2>
|
||||
void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
|
||||
dh::dvec2<T2> *vals, int nVals, int nSegs,
|
||||
const dh::dvec<int> &offsets, int start = 0,
|
||||
void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
|
||||
dh::DVec2<T2> *vals, int nVals, int nSegs,
|
||||
const dh::DVec<int> &offsets, int start = 0,
|
||||
int end = sizeof(T1) * 8) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.data(),
|
||||
offsets.data() + 1, start, end));
|
||||
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.Data(),
|
||||
offsets.Data() + 1, start, end));
|
||||
tmp_mem->LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
|
||||
nSegs, offsets.data(), offsets.data() + 1, start, end));
|
||||
nSegs, offsets.Data(), offsets.Data() + 1, start, end));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -787,14 +789,14 @@ void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
|
||||
int nVals) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(
|
||||
cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
|
||||
cub::DeviceReduce::Sum(NULL, tmpSize, in.Data(), out.Data(), nVals));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
|
||||
in.data(), out.data(), nVals));
|
||||
in.Data(), out.Data(), nVals));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -805,7 +807,7 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
T sumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
|
||||
T SumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
|
||||
// Allocate small extra memory for the return value
|
||||
@@ -827,8 +829,8 @@ T sumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
|
||||
* @param def default value to be filled
|
||||
*/
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void fillConst(int device_idx, T *out, int len, T def) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
|
||||
void FillConst(int device_idx, T *out, int len, T def) {
|
||||
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, len,
|
||||
[=] __device__(int i) { out[i] = def; });
|
||||
}
|
||||
|
||||
@@ -842,9 +844,9 @@ void fillConst(int device_idx, T *out, int len, T def) {
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
|
||||
void Gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
|
||||
const int *instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
T1 v1 = in1[iid];
|
||||
@@ -862,8 +864,8 @@ void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
|
||||
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
out[i] = in[iid];
|
||||
|
||||
@@ -29,12 +29,12 @@ struct ParallelGroupBuilder {
|
||||
// parallel group builder of data
|
||||
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
|
||||
std::vector<ValueType> *p_data)
|
||||
: rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
|
||||
: rptr_(*p_rptr), data_(*p_data), thread_rptr_(tmp_thread_rptr_) {
|
||||
}
|
||||
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
|
||||
std::vector<ValueType> *p_data,
|
||||
std::vector< std::vector<SizeType> > *p_thread_rptr)
|
||||
: rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
|
||||
: rptr_(*p_rptr), data_(*p_data), thread_rptr_(*p_thread_rptr) {
|
||||
}
|
||||
|
||||
public:
|
||||
@@ -45,10 +45,10 @@ struct ParallelGroupBuilder {
|
||||
* \param nthread number of thread that will be used in construction
|
||||
*/
|
||||
inline void InitBudget(size_t nkeys, int nthread) {
|
||||
thread_rptr.resize(nthread);
|
||||
for (size_t i = 0; i < thread_rptr.size(); ++i) {
|
||||
thread_rptr[i].resize(nkeys);
|
||||
std::fill(thread_rptr[i].begin(), thread_rptr[i].end(), 0);
|
||||
thread_rptr_.resize(nthread);
|
||||
for (size_t i = 0; i < thread_rptr_.size(); ++i) {
|
||||
thread_rptr_[i].resize(nkeys);
|
||||
std::fill(thread_rptr_[i].begin(), thread_rptr_[i].end(), 0);
|
||||
}
|
||||
}
|
||||
/*!
|
||||
@@ -58,34 +58,34 @@ struct ParallelGroupBuilder {
|
||||
* \param nelem number of element budget add to this row
|
||||
*/
|
||||
inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) {
|
||||
std::vector<SizeType> &trptr = thread_rptr[threadid];
|
||||
std::vector<SizeType> &trptr = thread_rptr_[threadid];
|
||||
if (trptr.size() < key + 1) {
|
||||
trptr.resize(key + 1, 0);
|
||||
}
|
||||
trptr[key] += nelem;
|
||||
}
|
||||
/*! \brief step 3: initialize the necessary storage */
|
||||
inline void InitStorage(void) {
|
||||
inline void InitStorage() {
|
||||
// set rptr to correct size
|
||||
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
|
||||
if (rptr.size() <= thread_rptr[tid].size()) {
|
||||
rptr.resize(thread_rptr[tid].size() + 1);
|
||||
for (size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
|
||||
if (rptr_.size() <= thread_rptr_[tid].size()) {
|
||||
rptr_.resize(thread_rptr_[tid].size() + 1);
|
||||
}
|
||||
}
|
||||
// initialize rptr to be beginning of each segment
|
||||
size_t start = 0;
|
||||
for (size_t i = 0; i + 1 < rptr.size(); ++i) {
|
||||
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
|
||||
std::vector<SizeType> &trptr = thread_rptr[tid];
|
||||
for (size_t i = 0; i + 1 < rptr_.size(); ++i) {
|
||||
for (size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
|
||||
std::vector<SizeType> &trptr = thread_rptr_[tid];
|
||||
if (i < trptr.size()) {
|
||||
size_t ncnt = trptr[i];
|
||||
trptr[i] = start;
|
||||
start += ncnt;
|
||||
}
|
||||
}
|
||||
rptr[i + 1] = start;
|
||||
rptr_[i + 1] = start;
|
||||
}
|
||||
data.resize(start);
|
||||
data_.resize(start);
|
||||
}
|
||||
/*!
|
||||
* \brief step 4: add data to the allocated space,
|
||||
@@ -96,19 +96,19 @@ struct ParallelGroupBuilder {
|
||||
* \param threadid the id of thread that calls this function
|
||||
*/
|
||||
inline void Push(size_t key, ValueType value, int threadid) {
|
||||
SizeType &rp = thread_rptr[threadid][key];
|
||||
data[rp++] = value;
|
||||
SizeType &rp = thread_rptr_[threadid][key];
|
||||
data_[rp++] = value;
|
||||
}
|
||||
|
||||
private:
|
||||
/*! \brief pointer to the beginning and end of each continuous key */
|
||||
std::vector<SizeType> &rptr;
|
||||
std::vector<SizeType> &rptr_;
|
||||
/*! \brief index of nonzero entries in each row */
|
||||
std::vector<ValueType> &data;
|
||||
std::vector<ValueType> &data_;
|
||||
/*! \brief thread local data structure */
|
||||
std::vector<std::vector<SizeType> > &thread_rptr;
|
||||
std::vector<std::vector<SizeType> > &thread_rptr_;
|
||||
/*! \brief local temp thread ptr, use this if not specified by the constructor */
|
||||
std::vector<std::vector<SizeType> > tmp_thread_rptr;
|
||||
std::vector<std::vector<SizeType> > tmp_thread_rptr_;
|
||||
};
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -17,20 +17,20 @@ namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
|
||||
const MetaInfo& info = p_fmat->info();
|
||||
using WXQSketch = common::WXQuantileSketch<bst_float, bst_float>;
|
||||
const MetaInfo& info = p_fmat->Info();
|
||||
|
||||
// safe factor for better accuracy
|
||||
const int kFactor = 8;
|
||||
constexpr int kFactor = 8;
|
||||
std::vector<WXQSketch> sketchs;
|
||||
|
||||
const int nthread = omp_get_max_threads();
|
||||
|
||||
unsigned nstep = static_cast<unsigned>((info.num_col + nthread - 1) / nthread);
|
||||
unsigned ncol = static_cast<unsigned>(info.num_col);
|
||||
sketchs.resize(info.num_col);
|
||||
auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
|
||||
auto ncol = static_cast<unsigned>(info.num_col_);
|
||||
sketchs.resize(info.num_col_);
|
||||
for (auto& s : sketchs) {
|
||||
s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
|
||||
s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
|
||||
}
|
||||
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
@@ -40,7 +40,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
#pragma omp parallel num_threads(nthread)
|
||||
{
|
||||
CHECK_EQ(nthread, omp_get_num_threads());
|
||||
unsigned tid = static_cast<unsigned>(omp_get_thread_num());
|
||||
auto tid = static_cast<unsigned>(omp_get_thread_num());
|
||||
unsigned begin = std::min(nstep * tid, ncol);
|
||||
unsigned end = std::min(nstep * (tid + 1), ncol);
|
||||
for (size_t i = 0; i < batch.size; ++i) { // NOLINT(*)
|
||||
@@ -68,7 +68,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
|
||||
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
|
||||
|
||||
this->min_val.resize(info.num_col);
|
||||
this->min_val.resize(info.num_col_);
|
||||
row_ptr.push_back(0);
|
||||
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
|
||||
WXQSketch::SummaryContainer a;
|
||||
@@ -105,7 +105,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
}
|
||||
|
||||
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
CHECK(cut != nullptr);
|
||||
CHECK(cut != nullptr); // NOLINT
|
||||
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
|
||||
|
||||
const int nthread = omp_get_max_threads();
|
||||
@@ -126,7 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
CHECK_GT(cut->cut.size(), 0U);
|
||||
CHECK_EQ(cut->row_ptr.back(), cut->cut.size());
|
||||
|
||||
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
|
||||
auto bsize = static_cast<omp_ulong>(batch.size);
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
|
||||
const int tid = omp_get_thread_num();
|
||||
@@ -217,7 +217,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
||||
std::vector<std::vector<bool>> conflict_marks;
|
||||
std::vector<size_t> group_nnz;
|
||||
std::vector<size_t> group_conflict_cnt;
|
||||
const size_t max_conflict_cnt
|
||||
const auto max_conflict_cnt
|
||||
= static_cast<size_t>(param.max_conflict_rate * nrow);
|
||||
|
||||
for (auto fid : feature_list) {
|
||||
@@ -336,14 +336,14 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
|
||||
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
const ColumnMatrix& colmat,
|
||||
const FastHistParam& param) {
|
||||
cut = gmat.cut;
|
||||
cut_ = gmat.cut;
|
||||
|
||||
const size_t nrow = gmat.row_ptr.size() - 1;
|
||||
const uint32_t nbins = gmat.cut->row_ptr.back();
|
||||
|
||||
/* step 1: form feature groups */
|
||||
auto groups = FastFeatureGrouping(gmat, colmat, param);
|
||||
const uint32_t nblock = static_cast<uint32_t>(groups.size());
|
||||
const auto nblock = static_cast<uint32_t>(groups.size());
|
||||
|
||||
/* step 2: build a new CSR matrix for each feature group */
|
||||
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
|
||||
@@ -380,24 +380,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
index_blk_ptr.push_back(0);
|
||||
row_ptr_blk_ptr.push_back(0);
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
|
||||
row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
|
||||
index_blk_ptr.push_back(index.size());
|
||||
row_ptr_blk_ptr.push_back(row_ptr.size());
|
||||
index_.insert(index_.end(), index_temp[block_id].begin(), index_temp[block_id].end());
|
||||
row_ptr_.insert(row_ptr_.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
|
||||
index_blk_ptr.push_back(index_.size());
|
||||
row_ptr_blk_ptr.push_back(row_ptr_.size());
|
||||
}
|
||||
|
||||
// save shortcut for each block
|
||||
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
|
||||
Block blk;
|
||||
blk.index_begin = &index[index_blk_ptr[block_id]];
|
||||
blk.row_ptr_begin = &row_ptr[row_ptr_blk_ptr[block_id]];
|
||||
blk.index_end = &index[index_blk_ptr[block_id + 1]];
|
||||
blk.row_ptr_end = &row_ptr[row_ptr_blk_ptr[block_id + 1]];
|
||||
blocks.push_back(blk);
|
||||
blk.index_begin = &index_[index_blk_ptr[block_id]];
|
||||
blk.row_ptr_begin = &row_ptr_[row_ptr_blk_ptr[block_id]];
|
||||
blk.index_end = &index_[index_blk_ptr[block_id + 1]];
|
||||
blk.row_ptr_end = &row_ptr_[row_ptr_blk_ptr[block_id + 1]];
|
||||
blocks_.push_back(blk);
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
@@ -405,30 +405,30 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
data_.resize(nbins_ * nthread_, GHistEntry());
|
||||
std::fill(data_.begin(), data_.end(), GHistEntry());
|
||||
|
||||
const int K = 8; // loop unrolling factor
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
const size_t nrows = row_indices.end - row_indices.begin;
|
||||
const size_t rest = nrows % K;
|
||||
const size_t rest = nrows % kUnroll;
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
|
||||
for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
|
||||
const bst_omp_uint tid = omp_get_thread_num();
|
||||
const size_t off = tid * nbins_;
|
||||
size_t rid[K];
|
||||
size_t ibegin[K];
|
||||
size_t iend[K];
|
||||
bst_gpair stat[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
size_t rid[kUnroll];
|
||||
size_t ibegin[kUnroll];
|
||||
size_t iend[kUnroll];
|
||||
GradientPair stat[kUnroll];
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
ibegin[k] = gmat.row_ptr[rid[k]];
|
||||
iend[k] = gmat.row_ptr[rid[k] + 1];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
data_[off + bin].Add(stat[k]);
|
||||
@@ -439,7 +439,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
const GradientPair stat = gpair[rid];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
data_[bin].Add(stat);
|
||||
@@ -456,37 +456,40 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
}
|
||||
|
||||
void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
const int K = 8; // loop unrolling factor
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
const size_t nrows = row_indices.end - row_indices.begin;
|
||||
const size_t rest = nrows % K;
|
||||
const size_t rest = nrows % kUnroll;
|
||||
|
||||
#if defined(_OPENMP)
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
#endif
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(guided)
|
||||
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
|
||||
auto gmat = gmatb[bid];
|
||||
|
||||
for (size_t i = 0; i < nrows - rest; i += K) {
|
||||
size_t rid[K];
|
||||
size_t ibegin[K];
|
||||
size_t iend[K];
|
||||
bst_gpair stat[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
|
||||
size_t rid[kUnroll];
|
||||
size_t ibegin[kUnroll];
|
||||
size_t iend[kUnroll];
|
||||
GradientPair stat[kUnroll];
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
rid[k] = row_indices.begin[i + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
ibegin[k] = gmat.row_ptr[rid[k]];
|
||||
iend[k] = gmat.row_ptr[rid[k] + 1];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
stat[k] = gpair[rid[k]];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
hist.begin[bin].Add(stat[k]);
|
||||
@@ -497,7 +500,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
const bst_gpair stat = gpair[rid];
|
||||
const GradientPair stat = gpair[rid];
|
||||
for (size_t j = ibegin; j < iend; ++j) {
|
||||
const uint32_t bin = gmat.index[j];
|
||||
hist.begin[bin].Add(stat);
|
||||
@@ -507,21 +510,26 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
|
||||
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
const uint32_t nbins = static_cast<bst_omp_uint>(nbins_);
|
||||
const int K = 8; // loop unrolling factor
|
||||
const uint32_t rest = nbins % K;
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const uint32_t rest = nbins % kUnroll;
|
||||
|
||||
#if defined(_OPENMP)
|
||||
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
#endif
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint bin_id = 0; bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += K) {
|
||||
GHistEntry pb[K];
|
||||
GHistEntry sb[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (bst_omp_uint bin_id = 0;
|
||||
bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += kUnroll) {
|
||||
GHistEntry pb[kUnroll];
|
||||
GHistEntry sb[kUnroll];
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
pb[k] = parent.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
sb[k] = sibling.begin[bin_id + k];
|
||||
}
|
||||
for (int k = 0; k < K; ++k) {
|
||||
for (int k = 0; k < kUnroll; ++k) {
|
||||
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,26 +13,26 @@
|
||||
#include "row_set.h"
|
||||
#include "../tree/fast_hist_param.h"
|
||||
|
||||
using xgboost::tree::FastHistParam;
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
using tree::FastHistParam;
|
||||
|
||||
/*! \brief sums of gradient statistics corresponding to a histogram bin */
|
||||
struct GHistEntry {
|
||||
/*! \brief sum of first-order gradient statistics */
|
||||
double sum_grad;
|
||||
double sum_grad{0};
|
||||
/*! \brief sum of second-order gradient statistics */
|
||||
double sum_hess;
|
||||
double sum_hess{0};
|
||||
|
||||
GHistEntry() : sum_grad(0), sum_hess(0) {}
|
||||
GHistEntry() = default;
|
||||
|
||||
inline void Clear() {
|
||||
sum_grad = sum_hess = 0;
|
||||
}
|
||||
|
||||
/*! \brief add a bst_gpair to the sum */
|
||||
inline void Add(const bst_gpair& e) {
|
||||
/*! \brief add a GradientPair to the sum */
|
||||
inline void Add(const GradientPair& e) {
|
||||
sum_grad += e.GetGrad();
|
||||
sum_hess += e.GetHess();
|
||||
}
|
||||
@@ -58,7 +58,7 @@ struct HistCutUnit {
|
||||
/*! \brief number of cutting point, containing the maximum point */
|
||||
uint32_t size;
|
||||
// default constructor
|
||||
HistCutUnit() {}
|
||||
HistCutUnit() = default;
|
||||
// constructor
|
||||
HistCutUnit(const bst_float* cut, uint32_t size)
|
||||
: cut(cut), size(size) {}
|
||||
@@ -74,8 +74,8 @@ struct HistCutMatrix {
|
||||
std::vector<bst_float> cut;
|
||||
/*! \brief Get histogram bound for fid */
|
||||
inline HistCutUnit operator[](bst_uint fid) const {
|
||||
return HistCutUnit(dmlc::BeginPtr(cut) + row_ptr[fid],
|
||||
row_ptr[fid + 1] - row_ptr[fid]);
|
||||
return {dmlc::BeginPtr(cut) + row_ptr[fid],
|
||||
row_ptr[fid + 1] - row_ptr[fid]};
|
||||
}
|
||||
// create histogram cut matrix given statistics from data
|
||||
// using approximate quantile sketch approach
|
||||
@@ -92,7 +92,7 @@ struct GHistIndexRow {
|
||||
const uint32_t* index;
|
||||
/*! \brief The size of the histogram */
|
||||
size_t size;
|
||||
GHistIndexRow() {}
|
||||
GHistIndexRow() = default;
|
||||
GHistIndexRow(const uint32_t* index, size_t size)
|
||||
: index(index), size(size) {}
|
||||
};
|
||||
@@ -115,7 +115,7 @@ struct GHistIndexMatrix {
|
||||
void Init(DMatrix* p_fmat);
|
||||
// get i-th row
|
||||
inline GHistIndexRow operator[](size_t i) const {
|
||||
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
|
||||
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
||||
}
|
||||
inline void GetFeatureCounts(size_t* counts) const {
|
||||
auto nfeature = cut->row_ptr.size() - 1;
|
||||
@@ -141,7 +141,7 @@ struct GHistIndexBlock {
|
||||
|
||||
// get i-th row
|
||||
inline GHistIndexRow operator[](size_t i) const {
|
||||
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
|
||||
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
|
||||
}
|
||||
};
|
||||
|
||||
@@ -154,24 +154,24 @@ class GHistIndexBlockMatrix {
|
||||
const FastHistParam& param);
|
||||
|
||||
inline GHistIndexBlock operator[](size_t i) const {
|
||||
return GHistIndexBlock(blocks[i].row_ptr_begin, blocks[i].index_begin);
|
||||
return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
|
||||
}
|
||||
|
||||
inline size_t GetNumBlock() const {
|
||||
return blocks.size();
|
||||
return blocks_.size();
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<size_t> row_ptr;
|
||||
std::vector<uint32_t> index;
|
||||
const HistCutMatrix* cut;
|
||||
std::vector<size_t> row_ptr_;
|
||||
std::vector<uint32_t> index_;
|
||||
const HistCutMatrix* cut_;
|
||||
struct Block {
|
||||
const size_t* row_ptr_begin;
|
||||
const size_t* row_ptr_end;
|
||||
const uint32_t* index_begin;
|
||||
const uint32_t* index_end;
|
||||
};
|
||||
std::vector<Block> blocks;
|
||||
std::vector<Block> blocks_;
|
||||
};
|
||||
|
||||
/*!
|
||||
@@ -186,7 +186,7 @@ struct GHistRow {
|
||||
/*! \brief number of entries */
|
||||
uint32_t size;
|
||||
|
||||
GHistRow() {}
|
||||
GHistRow() = default;
|
||||
GHistRow(GHistEntry* begin, uint32_t size)
|
||||
: begin(begin), size(size) {}
|
||||
};
|
||||
@@ -198,15 +198,15 @@ class HistCollection {
|
||||
public:
|
||||
// access histogram for i-th node
|
||||
inline GHistRow operator[](bst_uint nid) const {
|
||||
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
CHECK_NE(row_ptr_[nid], kMax);
|
||||
return GHistRow(const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_);
|
||||
return {const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_};
|
||||
}
|
||||
|
||||
// have we computed a histogram for i-th node?
|
||||
inline bool RowExists(bst_uint nid) const {
|
||||
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
return (nid < row_ptr_.size() && row_ptr_[nid] != kMax);
|
||||
const uint32_t k_max = std::numeric_limits<uint32_t>::max();
|
||||
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
|
||||
}
|
||||
|
||||
// initialize histogram collection
|
||||
@@ -218,7 +218,7 @@ class HistCollection {
|
||||
|
||||
// create an empty histogram for i-th node
|
||||
inline void AddHistRow(bst_uint nid) {
|
||||
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
|
||||
if (nid >= row_ptr_.size()) {
|
||||
row_ptr_.resize(nid + 1, kMax);
|
||||
}
|
||||
@@ -250,13 +250,13 @@ class GHistBuilder {
|
||||
}
|
||||
|
||||
// construct a histogram via histogram aggregation
|
||||
void BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
void BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// same, with feature grouping
|
||||
void BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
void BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
// dummy implementation of HostDeviceVector in case CUDA is not used
|
||||
|
||||
#include <xgboost/base.h>
|
||||
|
||||
#include <utility>
|
||||
#include "./host_device_vector.h"
|
||||
|
||||
namespace xgboost {
|
||||
@@ -13,8 +15,8 @@ namespace xgboost {
|
||||
template <typename T>
|
||||
struct HostDeviceVectorImpl {
|
||||
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
|
||||
explicit HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
|
||||
explicit HostDeviceVectorImpl(const std::vector<T>& init) : data_h_(init) {}
|
||||
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
|
||||
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
|
||||
std::vector<T> data_h_;
|
||||
};
|
||||
|
||||
@@ -43,25 +45,25 @@ HostDeviceVector<T>::~HostDeviceVector() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::size() const { return impl_->data_h_.size(); }
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::device() const { return -1; }
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::ptr_d(int device) { return nullptr; }
|
||||
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
|
||||
|
||||
template <typename T>
|
||||
std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h_; }
|
||||
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
|
||||
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
|
||||
impl_->data_h_.resize(new_size, v);
|
||||
}
|
||||
|
||||
// explicit instantiations are required, as HostDeviceVector isn't header-only
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<bst_gpair>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -35,27 +35,27 @@ struct HostDeviceVectorImpl {
|
||||
void operator=(const HostDeviceVectorImpl<T>&) = delete;
|
||||
void operator=(HostDeviceVectorImpl<T>&&) = delete;
|
||||
|
||||
size_t size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
|
||||
size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
|
||||
|
||||
int device() const { return device_; }
|
||||
int DeviceIdx() const { return device_; }
|
||||
|
||||
T* ptr_d(int device) {
|
||||
lazy_sync_device(device);
|
||||
T* DevicePointer(int device) {
|
||||
LazySyncDevice(device);
|
||||
return data_d_.data().get();
|
||||
}
|
||||
thrust::device_ptr<T> tbegin(int device) {
|
||||
return thrust::device_ptr<T>(ptr_d(device));
|
||||
thrust::device_ptr<T> tbegin(int device) { // NOLINT
|
||||
return thrust::device_ptr<T>(DevicePointer(device));
|
||||
}
|
||||
thrust::device_ptr<T> tend(int device) {
|
||||
thrust::device_ptr<T> tend(int device) { // NOLINT
|
||||
auto begin = tbegin(device);
|
||||
return begin + size();
|
||||
return begin + Size();
|
||||
}
|
||||
std::vector<T>& data_h() {
|
||||
lazy_sync_host();
|
||||
std::vector<T>& HostVector() {
|
||||
LazySyncHost();
|
||||
return data_h_;
|
||||
}
|
||||
void resize(size_t new_size, T v, int new_device) {
|
||||
if (new_size == this->size() && new_device == device_)
|
||||
void Resize(size_t new_size, T v, int new_device) {
|
||||
if (new_size == this->Size() && new_device == device_)
|
||||
return;
|
||||
if (new_device != -1)
|
||||
device_ = new_device;
|
||||
@@ -70,26 +70,26 @@ struct HostDeviceVectorImpl {
|
||||
}
|
||||
}
|
||||
|
||||
void lazy_sync_host() {
|
||||
void LazySyncHost() {
|
||||
if (!on_d_)
|
||||
return;
|
||||
if (data_h_.size() != this->size())
|
||||
data_h_.resize(this->size());
|
||||
if (data_h_.size() != this->Size())
|
||||
data_h_.resize(this->Size());
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin());
|
||||
on_d_ = false;
|
||||
}
|
||||
|
||||
void lazy_sync_device(int device) {
|
||||
void LazySyncDevice(int device) {
|
||||
if (on_d_)
|
||||
return;
|
||||
if (device != device_) {
|
||||
CHECK_EQ(device_, -1);
|
||||
device_ = device;
|
||||
}
|
||||
if (data_d_.size() != this->size()) {
|
||||
if (data_d_.size() != this->Size()) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
data_d_.resize(this->size());
|
||||
data_d_.resize(this->Size());
|
||||
}
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin());
|
||||
@@ -128,34 +128,34 @@ HostDeviceVector<T>::~HostDeviceVector() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t HostDeviceVector<T>::size() const { return impl_->size(); }
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::device() const { return impl_->device(); }
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::ptr_d(int device) { return impl_->ptr_d(device); }
|
||||
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
|
||||
return impl_->tbegin(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {
|
||||
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
|
||||
return impl_->tend(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h(); }
|
||||
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
|
||||
impl_->resize(new_size, v, new_device);
|
||||
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
|
||||
impl_->Resize(new_size, v, new_device);
|
||||
}
|
||||
|
||||
// explicit instantiations are required, as HostDeviceVector isn't header-only
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<bst_gpair>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -70,10 +70,10 @@ class HostDeviceVector {
|
||||
HostDeviceVector(HostDeviceVector<T>&&) = delete;
|
||||
void operator=(const HostDeviceVector<T>&) = delete;
|
||||
void operator=(HostDeviceVector<T>&&) = delete;
|
||||
size_t size() const;
|
||||
int device() const;
|
||||
T* ptr_d(int device);
|
||||
T* ptr_h() { return data_h().data(); }
|
||||
size_t Size() const;
|
||||
int DeviceIdx() const;
|
||||
T* DevicePointer(int device);
|
||||
T* HostPointer() { return HostVector().data(); }
|
||||
|
||||
// only define functions returning device_ptr
|
||||
// if HostDeviceVector.h is included from a .cu file
|
||||
@@ -82,10 +82,10 @@ class HostDeviceVector {
|
||||
thrust::device_ptr<T> tend(int device);
|
||||
#endif
|
||||
|
||||
std::vector<T>& data_h();
|
||||
std::vector<T>& HostVector();
|
||||
|
||||
// passing in new_device == -1 keeps the device as is
|
||||
void resize(size_t new_size, T v = T(), int new_device = -1);
|
||||
void Resize(size_t new_size, T v = T(), int new_device = -1);
|
||||
|
||||
private:
|
||||
HostDeviceVectorImpl<T>* impl_;
|
||||
|
||||
@@ -15,8 +15,8 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
|
||||
typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
|
||||
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
|
||||
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
|
||||
|
||||
/*!
|
||||
* \brief Input stream that support additional PeekRead
|
||||
|
||||
@@ -39,12 +39,12 @@ inline void Softmax(std::vector<float>* p_rec) {
|
||||
wmax = std::max(rec[i], wmax);
|
||||
}
|
||||
double wsum = 0.0f;
|
||||
for (size_t i = 0; i < rec.size(); ++i) {
|
||||
rec[i] = std::exp(rec[i] - wmax);
|
||||
wsum += rec[i];
|
||||
for (float & elem : rec) {
|
||||
elem = std::exp(elem - wmax);
|
||||
wsum += elem;
|
||||
}
|
||||
for (size_t i = 0; i < rec.size(); ++i) {
|
||||
rec[i] /= static_cast<float>(wsum);
|
||||
for (float & elem : rec) {
|
||||
elem /= static_cast<float>(wsum);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ struct WQSummary {
|
||||
/*! \brief the value of data */
|
||||
DType value;
|
||||
// constructor
|
||||
Entry() {}
|
||||
Entry() = default;
|
||||
// constructor
|
||||
Entry(RType rmin, RType rmax, RType wmin, DType value)
|
||||
: rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
|
||||
@@ -48,11 +48,11 @@ struct WQSummary {
|
||||
CHECK(rmax- rmin - wmin > -eps) << "relation constraint: min/max";
|
||||
}
|
||||
/*! \return rmin estimation for v strictly bigger than value */
|
||||
inline RType rmin_next() const {
|
||||
inline RType RMinNext() const {
|
||||
return rmin + wmin;
|
||||
}
|
||||
/*! \return rmax estimation for v strictly smaller than value */
|
||||
inline RType rmax_prev() const {
|
||||
inline RType RMaxPrev() const {
|
||||
return rmax - wmin;
|
||||
}
|
||||
};
|
||||
@@ -65,7 +65,7 @@ struct WQSummary {
|
||||
// weight of instance
|
||||
RType weight;
|
||||
// default constructor
|
||||
QEntry() {}
|
||||
QEntry() = default;
|
||||
// constructor
|
||||
QEntry(DType value, RType weight)
|
||||
: value(value), weight(weight) {}
|
||||
@@ -116,7 +116,7 @@ struct WQSummary {
|
||||
inline RType MaxError() const {
|
||||
RType res = data[0].rmax - data[0].rmin - data[0].wmin;
|
||||
for (size_t i = 1; i < size; ++i) {
|
||||
res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
|
||||
res = std::max(data[i].RMaxPrev() - data[i - 1].RMinNext(), res);
|
||||
res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
|
||||
}
|
||||
return res;
|
||||
@@ -140,8 +140,8 @@ struct WQSummary {
|
||||
if (istart == 0) {
|
||||
return Entry(0.0f, 0.0f, 0.0f, qvalue);
|
||||
} else {
|
||||
return Entry(data[istart - 1].rmin_next(),
|
||||
data[istart].rmax_prev(),
|
||||
return Entry(data[istart - 1].RMinNext(),
|
||||
data[istart].RMaxPrev(),
|
||||
0.0f, qvalue);
|
||||
}
|
||||
}
|
||||
@@ -197,7 +197,7 @@ struct WQSummary {
|
||||
while (i < src.size - 1
|
||||
&& dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
|
||||
CHECK(i != src.size - 1);
|
||||
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
|
||||
if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
|
||||
if (i != lastidx) {
|
||||
data[size++] = src.data[i]; lastidx = i;
|
||||
}
|
||||
@@ -236,20 +236,20 @@ struct WQSummary {
|
||||
*dst = Entry(a->rmin + b->rmin,
|
||||
a->rmax + b->rmax,
|
||||
a->wmin + b->wmin, a->value);
|
||||
aprev_rmin = a->rmin_next();
|
||||
bprev_rmin = b->rmin_next();
|
||||
aprev_rmin = a->RMinNext();
|
||||
bprev_rmin = b->RMinNext();
|
||||
++dst; ++a; ++b;
|
||||
} else if (a->value < b->value) {
|
||||
*dst = Entry(a->rmin + bprev_rmin,
|
||||
a->rmax + b->rmax_prev(),
|
||||
a->rmax + b->RMaxPrev(),
|
||||
a->wmin, a->value);
|
||||
aprev_rmin = a->rmin_next();
|
||||
aprev_rmin = a->RMinNext();
|
||||
++dst; ++a;
|
||||
} else {
|
||||
*dst = Entry(b->rmin + aprev_rmin,
|
||||
b->rmax + a->rmax_prev(),
|
||||
b->rmax + a->RMaxPrev(),
|
||||
b->wmin, b->value);
|
||||
bprev_rmin = b->rmin_next();
|
||||
bprev_rmin = b->RMinNext();
|
||||
++dst; ++b;
|
||||
}
|
||||
}
|
||||
@@ -307,7 +307,7 @@ struct WQSummary {
|
||||
data[i].rmax = prev_rmax;
|
||||
*err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
|
||||
}
|
||||
RType rmin_next = data[i].rmin_next();
|
||||
RType rmin_next = data[i].RMinNext();
|
||||
if (data[i].rmax < rmin_next) {
|
||||
data[i].rmax = rmin_next;
|
||||
*err_wgap = std::max(*err_wgap, data[i].rmax - rmin_next);
|
||||
@@ -334,13 +334,13 @@ struct WQSummary {
|
||||
template<typename DType, typename RType>
|
||||
struct WXQSummary : public WQSummary<DType, RType> {
|
||||
// redefine entry type
|
||||
typedef typename WQSummary<DType, RType>::Entry Entry;
|
||||
using Entry = typename WQSummary<DType, RType>::Entry;
|
||||
// constructor
|
||||
WXQSummary(Entry *data, size_t size)
|
||||
: WQSummary<DType, RType>(data, size) {}
|
||||
// check if the block is large chunk
|
||||
inline static bool CheckLarge(const Entry &e, RType chunk) {
|
||||
return e.rmin_next() > e.rmax_prev() + chunk;
|
||||
return e.RMinNext() > e.RMaxPrev() + chunk;
|
||||
}
|
||||
// set prune
|
||||
inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
|
||||
@@ -377,13 +377,13 @@ struct WXQSummary : public WQSummary<DType, RType> {
|
||||
if (CheckLarge(src.data[i], chunk)) {
|
||||
if (bid != i - 1) {
|
||||
// accumulate the range of the rest points
|
||||
mrange += src.data[i].rmax_prev() - src.data[bid].rmin_next();
|
||||
mrange += src.data[i].RMaxPrev() - src.data[bid].RMinNext();
|
||||
}
|
||||
bid = i; ++nbig;
|
||||
}
|
||||
}
|
||||
if (bid != src.size - 2) {
|
||||
mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
|
||||
mrange += src.data[src.size-1].RMaxPrev() - src.data[bid].RMinNext();
|
||||
}
|
||||
}
|
||||
// assert: there cannot be more than n big data points
|
||||
@@ -405,14 +405,14 @@ struct WXQSummary : public WQSummary<DType, RType> {
|
||||
if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
|
||||
if (bid != end - 1) {
|
||||
size_t i = bid;
|
||||
RType maxdx2 = src.data[end].rmax_prev() * 2;
|
||||
RType maxdx2 = src.data[end].RMaxPrev() * 2;
|
||||
for (; k < n; ++k) {
|
||||
RType dx2 = 2 * ((k * mrange) / n + begin);
|
||||
if (dx2 >= maxdx2) break;
|
||||
while (i < end &&
|
||||
dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
|
||||
if (i == end) break;
|
||||
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
|
||||
if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
|
||||
if (i != lastidx) {
|
||||
this->data[this->size++] = src.data[i]; lastidx = i;
|
||||
}
|
||||
@@ -429,7 +429,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
|
||||
}
|
||||
bid = end;
|
||||
// shift base by the gap
|
||||
begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev();
|
||||
begin += src.data[bid].RMinNext() - src.data[bid].RMaxPrev();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -448,7 +448,7 @@ struct GKSummary {
|
||||
/*! \brief the value of data */
|
||||
DType value;
|
||||
// constructor
|
||||
Entry() {}
|
||||
Entry() = default;
|
||||
// constructor
|
||||
Entry(RType rmin, RType rmax, DType value)
|
||||
: rmin(rmin), rmax(rmax), value(value) {}
|
||||
@@ -591,17 +591,17 @@ template<typename DType, typename RType, class TSummary>
|
||||
class QuantileSketchTemplate {
|
||||
public:
|
||||
/*! \brief type of summary type */
|
||||
typedef TSummary Summary;
|
||||
using Summary = TSummary;
|
||||
/*! \brief the entry type */
|
||||
typedef typename Summary::Entry Entry;
|
||||
using Entry = typename Summary::Entry;
|
||||
/*! \brief same as summary, but use STL to backup the space */
|
||||
struct SummaryContainer : public Summary {
|
||||
std::vector<Entry> space;
|
||||
SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
|
||||
SummaryContainer(const SummaryContainer &src) : Summary(nullptr, src.size) {
|
||||
this->space = src.space;
|
||||
this->data = dmlc::BeginPtr(this->space);
|
||||
}
|
||||
SummaryContainer() : Summary(NULL, 0) {
|
||||
SummaryContainer() : Summary(nullptr, 0) {
|
||||
}
|
||||
/*! \brief reserve space for summary */
|
||||
inline void Reserve(size_t size) {
|
||||
@@ -775,7 +775,7 @@ class QuantileSketchTemplate {
|
||||
inline void InitLevel(size_t nlevel) {
|
||||
if (level.size() >= nlevel) return;
|
||||
data.resize(limit_size * nlevel);
|
||||
level.resize(nlevel, Summary(NULL, 0));
|
||||
level.resize(nlevel, Summary(nullptr, 0));
|
||||
for (size_t l = 0; l < level.size(); ++l) {
|
||||
level[l].data = dmlc::BeginPtr(data) + l * limit_size;
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ namespace common {
|
||||
/*!
|
||||
* \brief Define mt19937 as default type Random Engine.
|
||||
*/
|
||||
typedef std::mt19937 RandomEngine;
|
||||
using RandomEngine = std::mt19937;
|
||||
|
||||
#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
|
||||
/*!
|
||||
@@ -56,7 +56,7 @@ typedef CustomGlobalRandomEngine GlobalRandomEngine;
|
||||
/*!
|
||||
* \brief global random engine
|
||||
*/
|
||||
typedef RandomEngine GlobalRandomEngine;
|
||||
using GlobalRandomEngine = RandomEngine;
|
||||
#endif
|
||||
|
||||
/*!
|
||||
|
||||
@@ -21,18 +21,18 @@ class RowSetCollection {
|
||||
* rows (instances) associated with a particular node in a decision
|
||||
* tree. */
|
||||
struct Elem {
|
||||
const size_t* begin;
|
||||
const size_t* end;
|
||||
int node_id;
|
||||
const size_t* begin{nullptr};
|
||||
const size_t* end{nullptr};
|
||||
int node_id{-1};
|
||||
// id of node associated with this instance set; -1 means uninitialized
|
||||
Elem(void)
|
||||
: begin(nullptr), end(nullptr), node_id(-1) {}
|
||||
Elem()
|
||||
= default;
|
||||
Elem(const size_t* begin,
|
||||
const size_t* end,
|
||||
int node_id)
|
||||
: begin(begin), end(end), node_id(node_id) {}
|
||||
|
||||
inline size_t size() const {
|
||||
inline size_t Size() const {
|
||||
return end - begin;
|
||||
}
|
||||
};
|
||||
@@ -42,11 +42,11 @@ class RowSetCollection {
|
||||
std::vector<size_t> right;
|
||||
};
|
||||
|
||||
inline std::vector<Elem>::const_iterator begin() const {
|
||||
inline std::vector<Elem>::const_iterator begin() const { // NOLINT
|
||||
return elem_of_each_node_.begin();
|
||||
}
|
||||
|
||||
inline std::vector<Elem>::const_iterator end() const {
|
||||
inline std::vector<Elem>::const_iterator end() const { // NOLINT
|
||||
return elem_of_each_node_.end();
|
||||
}
|
||||
|
||||
@@ -88,7 +88,7 @@ class RowSetCollection {
|
||||
unsigned left_node_id,
|
||||
unsigned right_node_id) {
|
||||
const Elem e = elem_of_each_node_[node_id];
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
|
||||
const auto nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
|
||||
CHECK(e.begin != nullptr);
|
||||
size_t* all_begin = dmlc::BeginPtr(row_indices_);
|
||||
size_t* begin = all_begin + (e.begin - all_begin);
|
||||
|
||||
@@ -12,10 +12,10 @@
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
struct Timer {
|
||||
typedef std::chrono::high_resolution_clock ClockT;
|
||||
typedef std::chrono::high_resolution_clock::time_point TimePointT;
|
||||
typedef std::chrono::high_resolution_clock::duration DurationT;
|
||||
typedef std::chrono::duration<double> SecondsT;
|
||||
using ClockT = std::chrono::high_resolution_clock;
|
||||
using TimePointT = std::chrono::high_resolution_clock::time_point;
|
||||
using DurationT = std::chrono::high_resolution_clock::duration;
|
||||
using SecondsT = std::chrono::duration<double>;
|
||||
|
||||
TimePointT start;
|
||||
DurationT elapsed;
|
||||
@@ -70,7 +70,7 @@ struct Monitor {
|
||||
if (debug_verbose) {
|
||||
#ifdef __CUDACC__
|
||||
#include "device_helpers.cuh"
|
||||
dh::synchronize_n_devices(dList.size(), dList);
|
||||
dh::SynchronizeNDevices(dList.size(), dList);
|
||||
#endif
|
||||
}
|
||||
timer_map[name].Start();
|
||||
@@ -80,7 +80,7 @@ struct Monitor {
|
||||
if (debug_verbose) {
|
||||
#ifdef __CUDACC__
|
||||
#include "device_helpers.cuh"
|
||||
dh::synchronize_n_devices(dList.size(), dList);
|
||||
dh::SynchronizeNDevices(dList.size(), dList);
|
||||
#endif
|
||||
}
|
||||
timer_map[name].Stop();
|
||||
|
||||
Reference in New Issue
Block a user