Clang-tidy static analysis (#3222)

* Clang-tidy static analysis

* Modernise checks

* Google coding standard checks

* Identifier renaming according to Google style
This commit is contained in:
Rory Mitchell
2018-04-19 18:57:13 +12:00
committed by GitHub
parent 3242b0a378
commit ccf80703ef
97 changed files with 3407 additions and 3354 deletions

View File

@@ -68,10 +68,10 @@ inline Float8 round(const Float8& x) {
// Overload std::max/min
namespace std {
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) {
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) { // NOLINT
return avx::Float8(_mm256_max_ps(a.x, b.x));
}
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) {
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) { // NOLINT
return avx::Float8(_mm256_min_ps(a.x, b.x));
}
} // namespace std
@@ -172,7 +172,7 @@ inline Float8 Sigmoid(Float8 x) {
}
// Store 8 gradient pairs given vectors containing gradient and Hessian
inline void StoreGpair(xgboost::bst_gpair* dst, const Float8& grad,
inline void StoreGpair(xgboost::GradientPair* dst, const Float8& grad,
const Float8& hess) {
float* ptr = reinterpret_cast<float*>(dst);
__m256 gpair_low = _mm256_unpacklo_ps(grad.x, hess.x);
@@ -190,11 +190,11 @@ namespace avx {
* \brief Fallback implementation not using AVX.
*/
struct Float8 {
struct Float8 { // NOLINT
float x[8];
explicit Float8(const float& val) {
for (int i = 0; i < 8; i++) {
x[i] = val;
for (float & i : x) {
i = val;
}
}
explicit Float8(const float* vec) {
@@ -202,7 +202,7 @@ struct Float8 {
x[i] = vec[i];
}
}
Float8() {}
Float8() = default;
Float8& operator+=(const Float8& rhs) {
for (int i = 0; i < 8; i++) {
x[i] += rhs.x[i];
@@ -228,7 +228,7 @@ struct Float8 {
return *this;
}
void Print() {
float* f = reinterpret_cast<float*>(&x);
auto* f = reinterpret_cast<float*>(&x);
printf("%f %f %f %f %f %f %f %f\n", f[0], f[1], f[2], f[3], f[4], f[5],
f[6], f[7]);
}
@@ -252,10 +252,10 @@ inline Float8 operator/(Float8 lhs, const Float8& rhs) {
}
// Store 8 gradient pairs given vectors containing gradient and Hessian
inline void StoreGpair(xgboost::bst_gpair* dst, const Float8& grad,
inline void StoreGpair(xgboost::GradientPair* dst, const Float8& grad,
const Float8& hess) {
for (int i = 0; i < 8; i++) {
dst[i] = xgboost::bst_gpair(grad.x[i], hess.x[i]);
dst[i] = xgboost::GradientPair(grad.x[i], hess.x[i]);
}
}
@@ -269,14 +269,14 @@ inline Float8 Sigmoid(Float8 x) {
} // namespace avx
namespace std {
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) {
inline avx::Float8 max(const avx::Float8& a, const avx::Float8& b) { // NOLINT
avx::Float8 max;
for (int i = 0; i < 8; i++) {
max.x[i] = std::max(a.x[i], b.x[i]);
}
return max;
}
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) {
inline avx::Float8 min(const avx::Float8& a, const avx::Float8& b) { // NOLINT
avx::Float8 min;
for (int i = 0; i < 8; i++) {
min.x[i] = std::min(a.x[i], b.x[i]);

View File

@@ -42,7 +42,7 @@ struct BitMap {
inline void InitFromBool(const std::vector<int>& vec) {
this->Resize(vec.size());
// parallel over the full cases
bst_omp_uint nsize = static_cast<bst_omp_uint>(vec.size() / 32);
auto nsize = static_cast<bst_omp_uint>(vec.size() / 32);
#pragma omp parallel for schedule(static)
for (bst_omp_uint i = 0; i < nsize; ++i) {
uint32_t res = 0;

View File

@@ -8,21 +8,27 @@
#ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_COMMON_COLUMN_MATRIX_H_
#define XGBOOST_TYPE_SWITCH(dtype, OP) \
switch (dtype) { \
case xgboost::common::uint32 : { \
typedef uint32_t DType; \
OP; break; \
} \
case xgboost::common::uint16 : { \
typedef uint16_t DType; \
OP; break; \
} \
case xgboost::common::uint8 : { \
typedef uint8_t DType; \
OP; break; \
default: LOG(FATAL) << "don't recognize type flag" << dtype; \
} \
/*!
 * \brief Dispatch on a runtime type flag for column bin ids.
 *
 *  Expands to a switch statement over dtype. In every recognised case the
 *  alias DType names the matching unsigned integer type while OP executes.
 *  A flag that matches none of the cases is reported through LOG(FATAL).
 *
 * \param dtype flag compared against xgboost::common::uint32 / uint16 / uint8
 * \param OP    statement(s) to execute with DType in scope
 */
#define XGBOOST_TYPE_SWITCH(dtype, OP)                      \
  switch (dtype) {                                          \
    case xgboost::common::uint32: {                         \
      using DType = uint32_t;                               \
      OP;                                                   \
      break;                                                \
    }                                                       \
    case xgboost::common::uint16: {                         \
      using DType = uint16_t;                               \
      OP;                                                   \
      break;                                                \
    }                                                       \
    case xgboost::common::uint8: {                          \
      using DType = uint8_t;                                \
      OP;                                                   \
      break;                                                \
    }                                                       \
    default:                                                \
      LOG(FATAL) << "don't recognize type flag" << dtype;   \
  }
#include <type_traits>
@@ -31,11 +37,12 @@ switch (dtype) { \
#include "hist_util.h"
#include "../tree/fast_hist_param.h"
using xgboost::tree::FastHistParam;
namespace xgboost {
namespace common {
using tree::FastHistParam;
/*! \brief indicator of data type used for storing bin id's in a column. */
enum DataType {
uint8 = 1,
@@ -78,7 +85,7 @@ class ColumnMatrix {
slot of internal buffer. */
packing_factor_ = sizeof(uint32_t) / static_cast<size_t>(this->dtype);
const bst_uint nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
const auto nfeature = static_cast<bst_uint>(gmat.cut->row_ptr.size() - 1);
const size_t nrow = gmat.row_ptr.size() - 1;
// identify type of each column

View File

@@ -14,7 +14,7 @@ struct RandomThreadLocalEntry {
GlobalRandomEngine engine;
};
typedef dmlc::ThreadLocalStore<RandomThreadLocalEntry> RandomThreadLocalStore;
using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;

View File

@@ -11,20 +11,20 @@
namespace xgboost {
namespace common {
typedef unsigned char compressed_byte_t;
using CompressedByteT = unsigned char;
namespace detail {
inline void SetBit(compressed_byte_t *byte, int bit_idx) {
inline void SetBit(CompressedByteT *byte, int bit_idx) {
*byte |= 1 << bit_idx;
}
/*!
 * \brief Test a single bit of an integral value.
 * \param byte    value to inspect
 * \param bit_idx zero-based position of the bit, counted from the least
 *                significant bit
 * \return byte masked with the selected bit: non-zero when the bit is set,
 *         zero otherwise
 */
template <typename T>
inline T CheckBit(const T &byte, int bit_idx) {
  // Build the mask in T instead of int so that bit positions beyond the
  // width of int stay valid when T is a wider type such as uint64_t.
  return byte & (static_cast<T>(1) << bit_idx);
}
inline void ClearBit(compressed_byte_t *byte, int bit_idx) {
inline void ClearBit(CompressedByteT *byte, int bit_idx) {
*byte &= ~(1 << bit_idx);
}
static const int padding = 4; // Assign padding so we can read slightly off
static const int kPadding = 4; // Assign padding so we can read slightly off
// the beginning of the array
// The number of bits required to represent a given unsigned range
@@ -76,16 +76,16 @@ class CompressedBufferWriter {
size_t compressed_size = static_cast<size_t>(std::ceil(
static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
bits_per_byte));
return compressed_size + detail::padding;
return compressed_size + detail::kPadding;
}
template <typename T>
void WriteSymbol(compressed_byte_t *buffer, T symbol, size_t offset) {
void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) {
const int bits_per_byte = 8;
for (size_t i = 0; i < symbol_bits_; i++) {
size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte;
byte_idx += detail::padding;
byte_idx += detail::kPadding;
size_t bit_idx =
((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte;
@@ -96,20 +96,20 @@ class CompressedBufferWriter {
}
}
}
template <typename iter_t>
void Write(compressed_byte_t *buffer, iter_t input_begin, iter_t input_end) {
template <typename IterT>
void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
uint64_t tmp = 0;
size_t stored_bits = 0;
const size_t max_stored_bits = 64 - symbol_bits_;
size_t buffer_position = detail::padding;
size_t buffer_position = detail::kPadding;
const size_t num_symbols = input_end - input_begin;
for (size_t i = 0; i < num_symbols; i++) {
typename std::iterator_traits<iter_t>::value_type symbol = input_begin[i];
typename std::iterator_traits<IterT>::value_type symbol = input_begin[i];
if (stored_bits > max_stored_bits) {
// Eject only full bytes
size_t tmp_bytes = stored_bits / 8;
for (size_t j = 0; j < tmp_bytes; j++) {
buffer[buffer_position] = static_cast<compressed_byte_t>(
buffer[buffer_position] = static_cast<CompressedByteT>(
tmp >> (stored_bits - (j + 1) * 8));
buffer_position++;
}
@@ -129,10 +129,10 @@ class CompressedBufferWriter {
int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
if (shift_bits >= 0) {
buffer[buffer_position] =
static_cast<compressed_byte_t>(tmp >> shift_bits);
static_cast<CompressedByteT>(tmp >> shift_bits);
} else {
buffer[buffer_position] =
static_cast<compressed_byte_t>(tmp << std::abs(shift_bits));
static_cast<CompressedByteT>(tmp << std::abs(shift_bits));
}
buffer_position++;
}
@@ -153,23 +153,21 @@ template <typename T>
class CompressedIterator {
public:
typedef CompressedIterator<T> self_type; ///< My own type
typedef ptrdiff_t
difference_type; ///< Type to express the result of subtracting
/// one iterator from another
typedef T value_type; ///< The type of the element the iterator can point to
typedef value_type *pointer; ///< The type of a pointer to an element the
/// iterator can point to
typedef value_type reference; ///< The type of a reference to an element the
/// iterator can point to
// Type definitions for thrust
typedef CompressedIterator<T> self_type; // NOLINT
typedef ptrdiff_t difference_type; // NOLINT
typedef T value_type; // NOLINT
typedef value_type *pointer; // NOLINT
typedef value_type reference; // NOLINT
private:
compressed_byte_t *buffer_;
CompressedByteT *buffer_;
size_t symbol_bits_;
size_t offset_;
public:
CompressedIterator() : buffer_(nullptr), symbol_bits_(0), offset_(0) {}
CompressedIterator(compressed_byte_t *buffer, int num_symbols)
CompressedIterator(CompressedByteT *buffer, int num_symbols)
: buffer_(buffer), offset_(0) {
symbol_bits_ = detail::SymbolBits(num_symbols);
}
@@ -178,7 +176,7 @@ class CompressedIterator {
const int bits_per_byte = 8;
size_t start_bit_idx = ((offset_ + 1) * symbol_bits_ - 1);
size_t start_byte_idx = start_bit_idx / bits_per_byte;
start_byte_idx += detail::padding;
start_byte_idx += detail::kPadding;
// Read 5 bytes - the maximum we will need
uint64_t tmp = static_cast<uint64_t>(buffer_[start_byte_idx - 4]) << 32 |

View File

@@ -24,33 +24,33 @@ class ConfigReaderBase {
* \brief get current name, called after Next returns true
* \return current parameter name
*/
inline const char *name(void) const {
return s_name.c_str();
inline const char *Name() const {
return s_name_.c_str();
}
/*!
* \brief get current value, called after Next returns true
* \return current parameter value
*/
inline const char *val(void) const {
return s_val.c_str();
inline const char *Val() const {
return s_val_.c_str();
}
/*!
* \brief move iterator to next position
* \return true if there is value in next position
*/
inline bool Next(void) {
inline bool Next() {
while (!this->IsEnd()) {
GetNextToken(&s_name);
if (s_name == "=") return false;
if (GetNextToken(&s_buf) || s_buf != "=") return false;
if (GetNextToken(&s_val) || s_val == "=") return false;
GetNextToken(&s_name_);
if (s_name_ == "=") return false;
if (GetNextToken(&s_buf_) || s_buf_ != "=") return false;
if (GetNextToken(&s_val_) || s_val_ == "=") return false;
return true;
}
return false;
}
// called before usage
inline void Init(void) {
ch_buf = this->GetChar();
inline void Init() {
ch_buf_ = this->GetChar();
}
protected:
@@ -58,38 +58,38 @@ class ConfigReaderBase {
* \brief to be implemented by subclass,
* get next token, return EOF if end of file
*/
virtual char GetChar(void) = 0;
virtual char GetChar() = 0;
/*! \brief to be implemented by child, check if end of stream */
virtual bool IsEnd(void) = 0;
virtual bool IsEnd() = 0;
private:
char ch_buf;
std::string s_name, s_val, s_buf;
char ch_buf_;
std::string s_name_, s_val_, s_buf_;
inline void SkipLine(void) {
inline void SkipLine() {
do {
ch_buf = this->GetChar();
} while (ch_buf != EOF && ch_buf != '\n' && ch_buf != '\r');
ch_buf_ = this->GetChar();
} while (ch_buf_ != EOF && ch_buf_ != '\n' && ch_buf_ != '\r');
}
inline void ParseStr(std::string *tok) {
while ((ch_buf = this->GetChar()) != EOF) {
switch (ch_buf) {
while ((ch_buf_ = this->GetChar()) != EOF) {
switch (ch_buf_) {
case '\\': *tok += this->GetChar(); break;
case '\"': return;
case '\r':
case '\n': LOG(FATAL)<< "ConfigReader: unterminated string";
default: *tok += ch_buf;
default: *tok += ch_buf_;
}
}
LOG(FATAL) << "ConfigReader: unterminated string";
}
inline void ParseStrML(std::string *tok) {
while ((ch_buf = this->GetChar()) != EOF) {
switch (ch_buf) {
while ((ch_buf_ = this->GetChar()) != EOF) {
switch (ch_buf_) {
case '\\': *tok += this->GetChar(); break;
case '\'': return;
default: *tok += ch_buf;
default: *tok += ch_buf_;
}
}
LOG(FATAL) << "unterminated string";
@@ -98,24 +98,24 @@ class ConfigReaderBase {
inline bool GetNextToken(std::string *tok) {
tok->clear();
bool new_line = false;
while (ch_buf != EOF) {
switch (ch_buf) {
while (ch_buf_ != EOF) {
switch (ch_buf_) {
case '#' : SkipLine(); new_line = true; break;
case '\"':
if (tok->length() == 0) {
ParseStr(tok); ch_buf = this->GetChar(); return new_line;
ParseStr(tok); ch_buf_ = this->GetChar(); return new_line;
} else {
LOG(FATAL) << "ConfigReader: token followed directly by string";
}
case '\'':
if (tok->length() == 0) {
ParseStrML(tok); ch_buf = this->GetChar(); return new_line;
ParseStrML(tok); ch_buf_ = this->GetChar(); return new_line;
} else {
LOG(FATAL) << "ConfigReader: token followed directly by string";
}
case '=':
if (tok->length() == 0) {
ch_buf = this->GetChar();
ch_buf_ = this->GetChar();
*tok = '=';
}
return new_line;
@@ -124,12 +124,12 @@ class ConfigReaderBase {
if (tok->length() == 0) new_line = true;
case '\t':
case ' ' :
ch_buf = this->GetChar();
ch_buf_ = this->GetChar();
if (tok->length() != 0) return new_line;
break;
default:
*tok += ch_buf;
ch_buf = this->GetChar();
*tok += ch_buf_;
ch_buf_ = this->GetChar();
break;
}
}
@@ -149,19 +149,19 @@ class ConfigStreamReader: public ConfigReaderBase {
* \brief constructor
* \param fin istream input stream
*/
explicit ConfigStreamReader(std::istream &fin) : fin(fin) {}
explicit ConfigStreamReader(std::istream &fin) : fin_(fin) {}
protected:
virtual char GetChar(void) {
return fin.get();
char GetChar() override {
return fin_.get();
}
/*! \brief to be implemented by child, check if end of stream */
virtual bool IsEnd(void) {
return fin.eof();
bool IsEnd() override {
return fin_.eof();
}
private:
std::istream &fin;
std::istream &fin_;
};
/*!
@@ -173,20 +173,20 @@ class ConfigIterator: public ConfigStreamReader {
* \brief constructor
* \param fname name of configure file
*/
explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi) {
fi.open(fname);
if (fi.fail()) {
explicit ConfigIterator(const char *fname) : ConfigStreamReader(fi_) {
fi_.open(fname);
if (fi_.fail()) {
LOG(FATAL) << "cannot open file " << fname;
}
ConfigReaderBase::Init();
}
/*! \brief destructor */
~ConfigIterator(void) {
fi.close();
~ConfigIterator() {
fi_.close();
}
private:
std::ifstream fi;
std::ifstream fi_;
};
} // namespace common
} // namespace xgboost

View File

@@ -25,16 +25,16 @@
namespace dh {
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
#define HOST_DEV_INLINE XGBOOST_DEVICE __forceinline__
#define DEV_INLINE __device__ __forceinline__
/*
* Error handling functions
*/
#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)
#define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)
inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
int line) {
if (code != cudaSuccess) {
std::stringstream ss;
@@ -48,9 +48,9 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
}
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file,
int line) {
if (code != ncclSuccess) {
std::stringstream ss;
@@ -64,16 +64,16 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
#endif
template <typename T>
T *raw(thrust::device_vector<T> &v) { // NOLINT
T *Raw(thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}
template <typename T>
const T *raw(const thrust::device_vector<T> &v) { // NOLINT
const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}
inline int n_visible_devices() {
inline int NVisibleDevices() {
int n_visgpus = 0;
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
@@ -81,40 +81,40 @@ inline int n_visible_devices() {
return n_visgpus;
}
inline int n_devices_all(int n_gpus) {
int n_devices_visible = dh::n_visible_devices();
inline int NDevicesAll(int n_gpus) {
int n_devices_visible = dh::NVisibleDevices();
int n_devices = n_gpus < 0 ? n_devices_visible : n_gpus;
return (n_devices);
}
inline int n_devices(int n_gpus, int num_rows) {
int n_devices = dh::n_devices_all(n_gpus);
inline int NDevices(int n_gpus, int num_rows) {
int n_devices = dh::NDevicesAll(n_gpus);
// fix-up device number to be limited by number of rows
n_devices = n_devices > num_rows ? num_rows : n_devices;
return (n_devices);
}
// if n_devices=-1, then use all visible devices
inline void synchronize_n_devices(int n_devices, std::vector<int> dList) {
// For each of the first n_devices entries of dList: make that device the
// current CUDA device, then call cudaDeviceSynchronize() on it. Failures of
// either CUDA call are routed through safe_cuda.
// NOTE(review): the comment preceding this function says n_devices = -1 means
// "use all visible devices", but the loop below simply does not execute for a
// non-positive n_devices - confirm which behaviour callers expect.
// dList is indexed without a range check, so it must hold at least n_devices
// entries.
inline void SynchronizeNDevices(int n_devices, std::vector<int> dList) {
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}
inline void synchronize_all() {
for (int device_idx = 0; device_idx < n_visible_devices(); device_idx++) {
/*!
 * \brief Select every visible CUDA device in turn and synchronize it.
 *  Failures of the CUDA runtime calls are routed through safe_cuda.
 */
inline void SynchronizeAll() {
  // NVisibleDevices() issues a cudaGetDeviceCount query on every call, so ask
  // once up front instead of once per loop iteration.
  const int n_devices = NVisibleDevices();
  for (int device_idx = 0; device_idx < n_devices; ++device_idx) {
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaDeviceSynchronize());
  }
}
inline std::string device_name(int device_idx) {
inline std::string DeviceName(int device_idx) {
cudaDeviceProp prop;
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
return std::string(prop.name);
}
inline size_t available_memory(int device_idx) {
inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
safe_cuda(cudaSetDevice(device_idx));
@@ -130,20 +130,20 @@ inline size_t available_memory(int device_idx) {
* \param device_idx Zero-based index of the device.
*/
inline size_t max_shared_memory(int device_idx) {
inline size_t MaxSharedMemory(int device_idx) {
cudaDeviceProp prop;
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
return prop.sharedMemPerBlock;
}
// ensure gpu_id is correct, so not dependent upon user knowing details
inline int get_device_idx(int gpu_id) {
inline int GetDeviceIdx(int gpu_id) {
// protect against overrun for gpu_id
return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
return (std::abs(gpu_id) + 0) % dh::NVisibleDevices();
}
inline void check_compute_capability() {
int n_devices = n_visible_devices();
inline void CheckComputeCapability() {
int n_devices = NVisibleDevices();
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
@@ -159,72 +159,72 @@ inline void check_compute_capability() {
* Range iterator
*/
class range {
class Range {
public:
class iterator {
friend class range;
class Iterator {
friend class Range;
public:
__host__ __device__ int64_t operator*() const { return i_; }
__host__ __device__ const iterator &operator++() {
XGBOOST_DEVICE int64_t operator*() const { return i_; }
XGBOOST_DEVICE const Iterator &operator++() {
i_ += step_;
return *this;
}
__host__ __device__ iterator operator++(int) {
iterator copy(*this);
XGBOOST_DEVICE Iterator operator++(int) {
Iterator copy(*this);
i_ += step_;
return copy;
}
__host__ __device__ bool operator==(const iterator &other) const {
XGBOOST_DEVICE bool operator==(const Iterator &other) const {
return i_ >= other.i_;
}
__host__ __device__ bool operator!=(const iterator &other) const {
XGBOOST_DEVICE bool operator!=(const Iterator &other) const {
return i_ < other.i_;
}
__host__ __device__ void step(int s) { step_ = s; }
XGBOOST_DEVICE void Step(int s) { step_ = s; }
protected:
__host__ __device__ explicit iterator(int64_t start) : i_(start) {}
XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
public:
uint64_t i_;
int step_ = 1;
};
__host__ __device__ iterator begin() const { return begin_; }
__host__ __device__ iterator end() const { return end_; }
__host__ __device__ range(int64_t begin, int64_t end)
XGBOOST_DEVICE Iterator begin() const { return begin_; } // NOLINT
XGBOOST_DEVICE Iterator end() const { return end_; } // NOLINT
XGBOOST_DEVICE Range(int64_t begin, int64_t end)
: begin_(begin), end_(end) {}
__host__ __device__ void step(int s) { begin_.step(s); }
XGBOOST_DEVICE void Step(int s) { begin_.Step(s); }
private:
iterator begin_;
iterator end_;
Iterator begin_;
Iterator end_;
};
template <typename T>
__device__ range grid_stride_range(T begin, T end) {
__device__ Range GridStrideRange(T begin, T end) {
begin += blockDim.x * blockIdx.x + threadIdx.x;
range r(begin, end);
r.step(gridDim.x * blockDim.x);
Range r(begin, end);
r.Step(gridDim.x * blockDim.x);
return r;
}
template <typename T>
__device__ range block_stride_range(T begin, T end) {
__device__ Range BlockStrideRange(T begin, T end) {
begin += threadIdx.x;
range r(begin, end);
r.step(blockDim.x);
Range r(begin, end);
r.Step(blockDim.x);
return r;
}
// Threadblock iterates over range, filling with value. Requires all threads in
// block to be active.
template <typename IterT, typename ValueT>
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
__device__ void BlockFill(IterT begin, size_t n, ValueT value) {
for (auto i : BlockStrideRange(static_cast<size_t>(0), n)) {
begin[i] = value;
}
}
@@ -234,34 +234,34 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
*/
template <typename T1, typename T2>
T1 div_round_up(const T1 a, const T2 b) {
// Returns a / b rounded up to the next whole number, converted back to T1.
// The quotient is evaluated in double precision (a is cast to double before
// the division), so integer operands too large for a double to represent
// exactly are rounded before the division takes place.
T1 DivRoundUp(const T1 a, const T2 b) {
return static_cast<T1>(ceil(static_cast<double>(a) / b));
}
template <typename L>
__global__ void launch_n_kernel(size_t begin, size_t end, L lambda) {
for (auto i : grid_stride_range(begin, end)) {
__global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
for (auto i : GridStrideRange(begin, end)) {
lambda(i);
}
}
template <typename L>
__global__ void launch_n_kernel(int device_idx, size_t begin, size_t end,
__global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
L lambda) {
for (auto i : grid_stride_range(begin, end)) {
for (auto i : GridStrideRange(begin, end)) {
lambda(i, device_idx);
}
}
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void launch_n(int device_idx, size_t n, L lambda) {
inline void LaunchN(int device_idx, size_t n, L lambda) {
if (n == 0) {
return;
}
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE =
static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
static_cast<int>(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
lambda);
}
@@ -269,91 +269,91 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
* Memory
*/
enum memory_type { DEVICE, DEVICE_MANAGED };
enum MemoryType { kDevice, kDeviceManaged };
template <memory_type MemoryT>
class bulk_allocator;
template <MemoryType MemoryT>
class BulkAllocator;
template <typename T>
class dvec2;
class DVec2;
template <typename T>
class dvec {
friend class dvec2<T>;
class DVec {
friend class DVec2<T>;
private:
T *_ptr;
size_t _size;
int _device_idx;
T *ptr_;
size_t size_;
int device_idx_;
public:
void external_allocate(int device_idx, void *ptr, size_t size) {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec but already allocated");
void ExternalAllocate(int device_idx, void *ptr, size_t size) {
if (!Empty()) {
throw std::runtime_error("Tried to allocate DVec but already allocated");
}
_ptr = static_cast<T *>(ptr);
_size = size;
_device_idx = device_idx;
safe_cuda(cudaSetDevice(_device_idx));
ptr_ = static_cast<T *>(ptr);
size_ = size;
device_idx_ = device_idx;
safe_cuda(cudaSetDevice(device_idx_));
}
dvec() : _ptr(NULL), _size(0), _device_idx(-1) {}
size_t size() const { return _size; }
int device_idx() const { return _device_idx; }
bool empty() const { return _ptr == NULL || _size == 0; }
DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
size_t Size() const { return size_; }
int DeviceIdx() const { return device_idx_; }
bool Empty() const { return ptr_ == NULL || size_ == 0; }
T *data() { return _ptr; }
T *Data() { return ptr_; }
const T *data() const { return _ptr; }
const T *Data() const { return ptr_; }
std::vector<T> as_vector() const {
std::vector<T> h_vector(size());
safe_cuda(cudaSetDevice(_device_idx));
safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
std::vector<T> AsVector() const {
std::vector<T> h_vector(Size());
safe_cuda(cudaSetDevice(device_idx_));
safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
cudaMemcpyDeviceToHost));
return h_vector;
}
void fill(T value) {
auto d_ptr = _ptr;
launch_n(_device_idx, size(),
void Fill(T value) {
auto d_ptr = ptr_;
LaunchN(device_idx_, Size(),
[=] __device__(size_t idx) { d_ptr[idx] = value; });
}
void print() {
auto h_vector = this->as_vector();
void Print() {
auto h_vector = this->AsVector();
for (auto e : h_vector) {
std::cout << e << " ";
}
std::cout << "\n";
}
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(_ptr); }
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(ptr_); }
thrust::device_ptr<T> tend() {
return thrust::device_pointer_cast(_ptr + size());
return thrust::device_pointer_cast(ptr_ + Size());
}
template <typename T2>
dvec &operator=(const std::vector<T2> &other) {
DVec &operator=(const std::vector<T2> &other) {
this->copy(other.begin(), other.end());
return *this;
}
dvec &operator=(dvec<T> &other) {
if (other.size() != size()) {
DVec &operator=(DVec<T> &other) {
if (other.Size() != Size()) {
throw std::runtime_error(
"Cannot copy assign dvec to dvec, sizes are different");
"Cannot copy assign DVec to DVec, sizes are different");
}
safe_cuda(cudaSetDevice(this->device_idx()));
if (other.device_idx() == this->device_idx()) {
dh::safe_cuda(cudaMemcpy(this->data(), other.data(),
other.size() * sizeof(T),
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (other.DeviceIdx() == this->DeviceIdx()) {
dh::safe_cuda(cudaMemcpy(this->Data(), other.Data(),
other.Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
} else {
std::cout << "deviceother: " << other.device_idx()
<< " devicethis: " << this->device_idx() << std::endl;
std::cout << "size deviceother: " << other.size()
<< " devicethis: " << this->device_idx() << std::endl;
std::cout << "deviceother: " << other.DeviceIdx()
<< " devicethis: " << this->DeviceIdx() << std::endl;
std::cout << "size deviceother: " << other.Size()
<< " devicethis: " << this->DeviceIdx() << std::endl;
throw std::runtime_error("Cannot copy to/from different devices");
}
@@ -362,177 +362,178 @@ class dvec {
template <typename IterT>
void copy(IterT begin, IterT end) {
safe_cuda(cudaSetDevice(this->device_idx()));
if (end - begin != size()) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
"Cannot copy assign vector to DVec, sizes are different");
}
thrust::copy(begin, end, this->tbegin());
}
void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
safe_cuda(cudaSetDevice(this->device_idx()));
if (end - begin != size()) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
"Cannot copy assign vector to DVec, sizes are different");
}
safe_cuda(cudaMemcpy(this->data(), begin.get(),
size() * sizeof(T), cudaMemcpyDefault));
safe_cuda(cudaMemcpy(this->Data(), begin.get(),
Size() * sizeof(T), cudaMemcpyDefault));
}
};
/**
* @class dvec2 device_helpers.cuh
* @brief wrapper for storing 2 dvec's which are needed for cub::DoubleBuffer
* @class DVec2 device_helpers.cuh
* @brief wrapper for storing 2 DVec's which are needed for cub::DoubleBuffer
*/
template <typename T>
class dvec2 {
class DVec2 {
private:
dvec<T> _d1, _d2;
cub::DoubleBuffer<T> _buff;
int _device_idx;
DVec<T> d1_, d2_;
cub::DoubleBuffer<T> buff_;
int device_idx_;
public:
void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
void ExternalAllocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
if (!Empty()) {
throw std::runtime_error("Tried to allocate DVec2 but already allocated");
}
_device_idx = device_idx;
_d1.external_allocate(_device_idx, ptr1, size);
_d2.external_allocate(_device_idx, ptr2, size);
_buff.d_buffers[0] = static_cast<T *>(ptr1);
_buff.d_buffers[1] = static_cast<T *>(ptr2);
_buff.selector = 0;
device_idx_ = device_idx;
d1_.ExternalAllocate(device_idx_, ptr1, size);
d2_.ExternalAllocate(device_idx_, ptr2, size);
buff_.d_buffers[0] = static_cast<T *>(ptr1);
buff_.d_buffers[1] = static_cast<T *>(ptr2);
buff_.selector = 0;
}
dvec2() : _d1(), _d2(), _buff(), _device_idx(-1) {}
DVec2() : d1_(), d2_(), buff_(), device_idx_(-1) {}
size_t size() const { return _d1.size(); }
int device_idx() const { return _device_idx; }
bool empty() const { return _d1.empty() || _d2.empty(); }
size_t Size() const { return d1_.Size(); }
int DeviceIdx() const { return device_idx_; }
bool Empty() const { return d1_.Empty() || d2_.Empty(); }
cub::DoubleBuffer<T> &buff() { return _buff; }
cub::DoubleBuffer<T> &buff() { return buff_; }
dvec<T> &d1() { return _d1; }
dvec<T> &d2() { return _d2; }
DVec<T> &D1() { return d1_; }
T *current() { return _buff.Current(); }
DVec<T> &D2() { return d2_; }
dvec<T> &current_dvec() { return _buff.selector == 0 ? d1() : d2(); }
T *Current() { return buff_.Current(); }
T *other() { return _buff.Alternate(); }
DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
T *other() { return buff_.Alternate(); }
};
template <memory_type MemoryT>
// Bulk device-memory allocator. Each Allocate() call performs a single
// cudaMalloc sized for all (vector, element-count) pairs passed in, then hands
// every vector its own slice of that buffer through ExternalAllocate. All
// buffers obtained this way are released in the destructor.
// NOTE(review): in this listing a renamed line is immediately followed by its
// replacement (e.g. bulk_allocator -> BulkAllocator).
class bulk_allocator {
std::vector<char *> d_ptr;
std::vector<size_t> _size;
std::vector<int> _device_idx;
template <MemoryType MemoryT>
class BulkAllocator {
// Parallel arrays with one entry per Allocate() call: base pointer, byte
// count and device index of that allocation.
std::vector<char *> d_ptr_;
std::vector<size_t> size_;
std::vector<int> device_idx_;
const int align = 256;
// Granularity in bytes to which every slice size is rounded up.
static const int kAlign = 256;
// Rounds n up to the next multiple of the alignment constant.
size_t align_round_up(size_t n) const {
n = (n + align - 1) / align;
return n * align;
size_t AlignRoundUp(size_t n) const {
n = (n + kAlign - 1) / kAlign;
return n * kAlign;
}
// Aligned byte size needed for first_size elements of T. first_vec is not
// read in the body; it only fixes T and selects the single-buffer overload.
template <typename T>
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size) {
return align_round_up(first_size * sizeof(T));
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size) {
return AlignRoundUp(first_size * sizeof(T));
}
// Variadic form: size of the first (vector, count) pair plus, recursively,
// the size of the remaining pairs in args.
template <typename T, typename... Args>
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size, Args... args) {
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size, Args... args) {
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
}
// Points first_vec at the memory starting at ptr (first_size elements) by
// forwarding to its ExternalAllocate.
template <typename T>
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
size_t first_size) {
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
first_vec->ExternalAllocate(device_idx, static_cast<void *>(ptr),
first_size);
}
// Variadic form: assigns the first vector, advances ptr by that vector's
// aligned size (the same amount GetSizeBytes counted for it) and recurses on
// the remaining pairs.
template <typename T, typename... Args>
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
size_t first_size, Args... args) {
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
ptr += align_round_up(first_size * sizeof(T));
allocate_dvec(device_idx, ptr, args...);
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
ptr += AlignRoundUp(first_size * sizeof(T));
AllocateDVec(device_idx, ptr, args...);
}
// Selects device_idx and allocates `bytes` bytes with cudaMalloc.
// NOTE(review): parameter t is never read in this body, so every memory type
// takes the same cudaMalloc path here - confirm that is intended.
char *allocate_device(int device_idx, size_t bytes, memory_type t) {
char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
char *ptr;
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaMalloc(&ptr, bytes));
return ptr;
}
// Double-buffer overloads: a DVec2 is given two slices, so it needs twice the
// aligned size of one buffer.
template <typename T>
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size) {
return 2 * align_round_up(first_size * sizeof(T));
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size) {
return 2 * AlignRoundUp(first_size * sizeof(T));
}
template <typename T, typename... Args>
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size, Args... args) {
return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
}
// Passes two pointers to ExternalAllocate: ptr itself and ptr advanced by one
// aligned buffer size.
template <typename T>
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
size_t first_size) {
first_vec->external_allocate(
first_vec->ExternalAllocate(
device_idx, static_cast<void *>(ptr),
static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
static_cast<void *>(ptr + AlignRoundUp(first_size * sizeof(T))),
first_size);
}
// Variadic form for DVec2: ptr is advanced past both buffers before recursing.
template <typename T, typename... Args>
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
size_t first_size, Args... args) {
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
ptr += (align_round_up(first_size * sizeof(T)) * 2);
allocate_dvec(device_idx, ptr, args...);
AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
ptr += (AlignRoundUp(first_size * sizeof(T)) * 2);
AllocateDVec(device_idx, ptr, args...);
}
public:
bulk_allocator() {}
BulkAllocator() = default;
// prevent accidental copying, moving or assignment of this object
bulk_allocator(const bulk_allocator<MemoryT>&) = delete;
bulk_allocator(bulk_allocator<MemoryT>&&) = delete;
void operator=(const bulk_allocator<MemoryT>&) = delete;
void operator=(bulk_allocator<MemoryT>&&) = delete;
BulkAllocator(const BulkAllocator<MemoryT>&) = delete;
BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
void operator=(const BulkAllocator<MemoryT>&) = delete;
void operator=(BulkAllocator<MemoryT>&&) = delete;
// Frees every recorded non-null pointer after selecting the device index that
// was recorded alongside it, then nulls the stored pointer.
~bulk_allocator() {
for (size_t i = 0; i < d_ptr.size(); i++) {
if (!(d_ptr[i] == nullptr)) {
safe_cuda(cudaSetDevice(_device_idx[i]));
safe_cuda(cudaFree(d_ptr[i]));
d_ptr[i] = nullptr;
~BulkAllocator() {
for (size_t i = 0; i < d_ptr_.size(); i++) {
if (!(d_ptr_[i] == nullptr)) {
safe_cuda(cudaSetDevice(device_idx_[i]));
safe_cuda(cudaFree(d_ptr_[i]));
d_ptr_[i] = nullptr;
}
}
}
// returns sum of bytes for all allocations
size_t size() {
return std::accumulate(_size.begin(), _size.end(), static_cast<size_t>(0));
size_t Size() {
return std::accumulate(size_.begin(), size_.end(), static_cast<size_t>(0));
}
// Allocates one buffer on device_idx large enough for all (vector, count)
// pairs in args, distributes slices of it to the vectors, and records the
// pointer, size and device for Size() and the destructor. Unless `silent` is
// set, logs the allocated and remaining amounts.
template <typename... Args>
void allocate(int device_idx, bool silent, Args... args) {
size_t size = get_size_bytes(args...);
void Allocate(int device_idx, bool silent, Args... args) {
size_t size = GetSizeBytes(args...);
char *ptr = allocate_device(device_idx, size, MemoryT);
char *ptr = AllocateDevice(device_idx, size, MemoryT);
allocate_dvec(device_idx, ptr, args...);
AllocateDVec(device_idx, ptr, args...);
d_ptr.push_back(ptr);
_size.push_back(size);
_device_idx.push_back(device_idx);
d_ptr_.push_back(ptr);
size_.push_back(size);
device_idx_.push_back(device_idx);
if (!silent) {
// 1048576 = 1024 * 1024; the divisions below are integer divisions, so the
// logged figures are truncated to whole units.
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << size / mb_size << "MB on [" << device_idx
<< "] " << device_name(device_idx) << ", "
<< available_memory(device_idx) / mb_size << "MB remaining.";
<< "] " << DeviceName(device_idx) << ", "
<< AvailableMemory(device_idx) / mb_size << "MB remaining.";
}
}
};
@@ -543,7 +544,7 @@ struct CubMemory {
size_t temp_storage_bytes;
// Thrust
typedef char value_type;
using ValueT = char;
CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}
@@ -568,17 +569,18 @@ struct CubMemory {
}
}
// Thrust
// Calls LazyAllocate(num_bytes) and returns d_temp_storage viewed as char*.
// The lower-case name is kept on purpose (hence the NOLINT); presumably it is
// the allocator hook name Thrust expects - confirm against the call sites.
char *allocate(std::ptrdiff_t num_bytes) {
char *allocate(std::ptrdiff_t num_bytes) { // NOLINT
LazyAllocate(num_bytes);
return reinterpret_cast<char *>(d_temp_storage);
}
// Thrust
// Counterpart of allocate(); intentionally a no-op, so neither ptr nor n is
// used and nothing is released here.
void deallocate(char *ptr, size_t n) {
void deallocate(char *ptr, size_t n) { // NOLINT
// Do nothing
}
// True when d_temp_storage is a non-null pointer.
bool IsAllocated() { return d_temp_storage != NULL; }
bool IsAllocated() { return d_temp_storage != nullptr; }
};
/*
@@ -586,7 +588,7 @@ struct CubMemory {
*/
template <typename T>
void print(const dvec<T> &v, size_t max_items = 10) {
void Print(const DVec<T> &v, size_t max_items = 10) {
std::vector<T> h = v.as_vector();
for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
std::cout << " " << h[i];
@@ -609,14 +611,14 @@ void print(const dvec<T> &v, size_t max_items = 10) {
// Load balancing search
template <typename coordinate_t, typename segments_t, typename offset_t>
void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
size_t num_tiles, int tile_size, segments_t segments,
offset_t num_rows, offset_t num_elements) {
dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
offset_t diagonal = idx * tile_size;
coordinate_t tile_coordinate;
cub::CountingInputIterator<offset_t> nonzero_indices(0);
template <typename CoordinateT, typename SegmentT, typename OffsetT>
void FindMergePartitions(int device_idx, CoordinateT *d_tile_coordinates,
size_t num_tiles, int tile_size, SegmentT segments,
OffsetT num_rows, OffsetT num_elements) {
dh::LaunchN(device_idx, num_tiles + 1, [=] __device__(int idx) {
OffsetT diagonal = idx * tile_size;
CoordinateT tile_coordinate;
cub::CountingInputIterator<OffsetT> nonzero_indices(0);
// Search the merge path
// Cast to signed integer as this function can have negatives
@@ -630,27 +632,27 @@ void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
}
template <int TILE_SIZE, int ITEMS_PER_THREAD, int BLOCK_THREADS,
typename offset_t, typename coordinate_t, typename func_t,
typename segments_iter>
__global__ void LbsKernel(coordinate_t *d_coordinates,
segments_iter segment_end_offsets, func_t f,
offset_t num_segments) {
typename OffsetT, typename CoordinateT, typename FunctionT,
typename SegmentIterT>
__global__ void LbsKernel(CoordinateT *d_coordinates,
SegmentIterT segment_end_offsets, FunctionT f,
OffsetT num_segments) {
int tile = blockIdx.x;
coordinate_t tile_start_coord = d_coordinates[tile];
coordinate_t tile_end_coord = d_coordinates[tile + 1];
CoordinateT tile_start_coord = d_coordinates[tile];
CoordinateT tile_end_coord = d_coordinates[tile + 1];
int64_t tile_num_rows = tile_end_coord.x - tile_start_coord.x;
int64_t tile_num_elements = tile_end_coord.y - tile_start_coord.y;
cub::CountingInputIterator<offset_t> tile_element_indices(tile_start_coord.y);
coordinate_t thread_start_coord;
cub::CountingInputIterator<OffsetT> tile_element_indices(tile_start_coord.y);
CoordinateT thread_start_coord;
typedef typename std::iterator_traits<segments_iter>::value_type segment_t;
typedef typename std::iterator_traits<SegmentIterT>::value_type SegmentT;
__shared__ struct {
segment_t tile_segment_end_offsets[TILE_SIZE + 1];
segment_t output_segment[TILE_SIZE];
SegmentT tile_segment_end_offsets[TILE_SIZE + 1];
SegmentT output_segment[TILE_SIZE];
} temp_storage;
for (auto item : dh::block_stride_range(int(0), int(tile_num_rows + 1))) {
for (auto item : dh::BlockStrideRange(int(0), int(tile_num_rows + 1))) {
temp_storage.tile_segment_end_offsets[item] =
segment_end_offsets[min(static_cast<size_t>(tile_start_coord.x + item),
static_cast<size_t>(num_segments - 1))];
@@ -665,7 +667,7 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
tile_element_indices, // List B
tile_num_rows, tile_num_elements, thread_start_coord);
coordinate_t thread_current_coord = thread_start_coord;
CoordinateT thread_current_coord = thread_start_coord;
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
if (tile_element_indices[thread_current_coord.y] <
@@ -679,50 +681,50 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
}
__syncthreads();
for (auto item : dh::block_stride_range(int(0), int(tile_num_elements))) {
for (auto item : dh::BlockStrideRange(int(0), int(tile_num_elements))) {
f(tile_start_coord.y + item, temp_storage.output_segment[item]);
}
}
// Sparse path of the load-balancing transform: splits the work into tiles,
// computes the tile start coordinates with FindMergePartitions into temporary
// storage, then launches LbsKernel with one block per tile to apply f.
template <typename func_t, typename segments_iter, typename offset_t>
template <typename FunctionT, typename SegmentIterT, typename OffsetT>
void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
offset_t count, segments_iter segments,
offset_t num_segments, func_t f) {
typedef typename cub::CubVector<offset_t, 2>::Type coordinate_t;
OffsetT count, SegmentIterT segments,
OffsetT num_segments, FunctionT f) {
typedef typename cub::CubVector<OffsetT, 2>::Type CoordinateT;
dh::safe_cuda(cudaSetDevice(device_idx));
// With one item per thread, TILE_SIZE equals BLOCK_THREADS (256).
const int BLOCK_THREADS = 256;
const int ITEMS_PER_THREAD = 1;
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
// Tiles cover count + num_segments work items, rounded up.
auto num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
auto num_tiles = dh::DivRoundUp(count + num_segments, BLOCK_THREADS);
// num_tiles is narrowed to uint32_t for the launch below, so it must fit.
CHECK(num_tiles < std::numeric_limits<unsigned int>::max());
// One coordinate per tile boundary, i.e. num_tiles + 1 entries.
temp_memory->LazyAllocate(sizeof(coordinate_t) * (num_tiles + 1));
coordinate_t *tmp_tile_coordinates =
reinterpret_cast<coordinate_t *>(temp_memory->d_temp_storage);
temp_memory->LazyAllocate(sizeof(CoordinateT) * (num_tiles + 1));
CoordinateT *tmp_tile_coordinates =
reinterpret_cast<CoordinateT *>(temp_memory->d_temp_storage);
FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles,
BLOCK_THREADS, segments, num_segments, count);
// The kernel receives segments + 1, matching its segment_end_offsets
// parameter name.
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, OffsetT>
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
segments + 1, f, num_segments);
}
// Dense path: every segment has the same length count / num_segments, so the
// segment of element idx is found by integer division with no search.
// Calls f(idx, segment) for each idx in the range launched over `count`.
template <typename func_t, typename offset_t>
void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
func_t f) {
template <typename FunctionT, typename OffsetT>
void DenseTransformLbs(int device_idx, OffsetT count, OffsetT num_segments,
FunctionT f) {
// Requires count to be an exact multiple of num_segments.
CHECK(count % num_segments == 0) << "Data is not dense.";
launch_n(device_idx, count, [=] __device__(offset_t idx) {
offset_t segment = idx / (count / num_segments);
LaunchN(device_idx, count, [=] __device__(OffsetT idx) {
OffsetT segment = idx / (count / num_segments);
f(idx, segment);
});
}
/**
* \fn template <typename func_t, typename segments_iter, typename offset_t>
* void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
* segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
* \fn template <typename FunctionT, typename SegmentIterT, typename OffsetT>
* void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
* SegmentIterT segments, OffsetT num_segments, bool is_dense, FunctionT f)
*
* \brief Load balancing search function. Reads a CSR type matrix description
* and allows a function to be executed on each element. Search 'modern GPU load
@@ -731,9 +733,9 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
* \author Rory
* \date 7/9/2017
*
* \tparam func_t Type of the function t.
* \tparam segments_iter Type of the segments iterator.
* \tparam offset_t Type of the offset.
* \tparam FunctionT Type of the function t.
* \tparam SegmentIterT Type of the segments iterator.
* \tparam OffsetT Type of the offset.
* \param device_idx Zero-based index of the device.
* \param [in,out] temp_memory Temporary memory allocator.
* \param count Number of elements.
@@ -743,10 +745,10 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
* \param f Lambda to be executed on matrix elements.
*/
template <typename func_t, typename segments_iter, typename offset_t>
void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
segments_iter segments, offset_t num_segments, bool is_dense,
func_t f) {
template <typename FunctionT, typename SegmentIterT, typename OffsetT>
void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
SegmentIterT segments, OffsetT num_segments, bool is_dense,
FunctionT f) {
if (is_dense) {
DenseTransformLbs(device_idx, count, num_segments, f);
} else {
@@ -765,18 +767,18 @@ void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
* @param offsets the segments
*/
// Sorts key/value pairs within segments using
// cub::DeviceSegmentedRadixSort::SortPairs. Segment i spans
// [offsets[i], offsets[i + 1]), since offsets.Data() and offsets.Data() + 1
// are passed as the begin and end offset arrays. `start` and `end` are
// forwarded as the last two SortPairs arguments; `end` defaults to
// sizeof(T1) * 8.
template <typename T1, typename T2>
void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
dh::dvec2<T2> *vals, int nVals, int nSegs,
const dh::dvec<int> &offsets, int start = 0,
void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
dh::DVec2<T2> *vals, int nVals, int nSegs,
const dh::DVec<int> &offsets, int start = 0,
int end = sizeof(T1) * 8) {
size_t tmpSize;
// First call passes NULL storage so that tmpSize receives the required
// temporary-storage size (CUB's two-phase calling convention).
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.data(),
offsets.data() + 1, start, end));
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.Data(),
offsets.Data() + 1, start, end));
tmp_mem->LazyAllocate(tmpSize);
// Second call runs with the storage held by tmp_mem.
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
nSegs, offsets.data(), offsets.data() + 1, start, end));
nSegs, offsets.Data(), offsets.Data() + 1, start, end));
}
/**
@@ -787,14 +789,14 @@ void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
* @param nVals number of elements in the input array
*/
// Sums the first nVals elements of `in` with cub::DeviceReduce::Sum and writes
// the result through out.Data(). Temporary storage comes from tmp_mem.
template <typename T>
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
int nVals) {
size_t tmpSize;
// First call passes NULL storage so that tmpSize receives the required
// temporary-storage size (CUB's two-phase calling convention).
dh::safe_cuda(
cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
cub::DeviceReduce::Sum(NULL, tmpSize, in.Data(), out.Data(), nVals));
tmp_mem.LazyAllocate(tmpSize);
// Second call performs the reduction using the allocated storage.
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
in.data(), out.data(), nVals));
in.Data(), out.Data(), nVals));
}
/**
@@ -805,7 +807,7 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
* @param nVals number of elements in the input array
*/
template <typename T>
T sumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
T SumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
size_t tmpSize;
dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
// Allocate small extra memory for the return value
@@ -827,8 +829,8 @@ T sumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
* @param def default value to be filled
*/
// Writes the value `def` to out[i] for every index i that the launch helper
// invokes the lambda with; the launch is sized by `len`. BlkDim and
// ItemsPerThread are forwarded to the launch helper as template arguments.
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
void fillConst(int device_idx, T *out, int len, T def) {
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
void FillConst(int device_idx, T *out, int len, T def) {
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, len,
[=] __device__(int i) { out[i] = def; });
}
@@ -842,9 +844,9 @@ void fillConst(int device_idx, T *out, int len, T def) {
* @param nVals length of the buffers
*/
template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
void Gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
const int *instId, int nVals) {
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, nVals,
[=] __device__(int i) {
int iid = instId[i];
T1 v1 = in1[iid];
@@ -862,8 +864,8 @@ void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
* @param nVals length of the buffers
*/
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, nVals,
[=] __device__(int i) {
int iid = instId[i];
out[i] = in[iid];

View File

@@ -29,12 +29,12 @@ struct ParallelGroupBuilder {
// parallel group builder of data
// Binds the builder to the caller's row-pointer and data vectors. No
// per-thread buffer is supplied, so the thread-local reference is bound to the
// builder's own tmp_thread_rptr member.
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
std::vector<ValueType> *p_data)
: rptr(*p_rptr), data(*p_data), thread_rptr(tmp_thread_rptr) {
: rptr_(*p_rptr), data_(*p_data), thread_rptr_(tmp_thread_rptr_) {
}
// Same as the two-argument constructor, but the per-thread buffers live in the
// caller-provided *p_thread_rptr instead of the builder's internal vector.
ParallelGroupBuilder(std::vector<SizeType> *p_rptr,
std::vector<ValueType> *p_data,
std::vector< std::vector<SizeType> > *p_thread_rptr)
: rptr(*p_rptr), data(*p_data), thread_rptr(*p_thread_rptr) {
: rptr_(*p_rptr), data_(*p_data), thread_rptr_(*p_thread_rptr) {
}
public:
@@ -45,10 +45,10 @@ struct ParallelGroupBuilder {
* \param nthread number of thread that will be used in construction
*/
// Creates one counter vector per thread, each with nkeys entries, and resets
// every entry to zero (existing entries are overwritten by the fill).
inline void InitBudget(size_t nkeys, int nthread) {
thread_rptr.resize(nthread);
for (size_t i = 0; i < thread_rptr.size(); ++i) {
thread_rptr[i].resize(nkeys);
std::fill(thread_rptr[i].begin(), thread_rptr[i].end(), 0);
thread_rptr_.resize(nthread);
for (size_t i = 0; i < thread_rptr_.size(); ++i) {
thread_rptr_[i].resize(nkeys);
std::fill(thread_rptr_[i].begin(), thread_rptr_[i].end(), 0);
}
}
/*!
@@ -58,34 +58,34 @@ struct ParallelGroupBuilder {
* \param nelem number of element budget add to this row
*/
// Adds nelem to the counter of `key` in the calling thread's vector.
inline void AddBudget(size_t key, int threadid, SizeType nelem = 1) {
std::vector<SizeType> &trptr = thread_rptr[threadid];
std::vector<SizeType> &trptr = thread_rptr_[threadid];
// Keys beyond the current size are accommodated by growing the vector with
// zero-initialised counters.
if (trptr.size() < key + 1) {
trptr.resize(key + 1, 0);
}
trptr[key] += nelem;
}
/*! \brief step 3: initialize the necessary storage */
// Converts the per-thread counters into write offsets: afterwards
// thread_rptr_[tid][i] is the position in data_ where thread tid starts
// writing for key i, rptr_[i + 1] is the end offset of key i, and data_ is
// sized to the total count.
inline void InitStorage(void) {
inline void InitStorage() {
// set rptr to correct size
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
if (rptr.size() <= thread_rptr[tid].size()) {
rptr.resize(thread_rptr[tid].size() + 1);
for (size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
if (rptr_.size() <= thread_rptr_[tid].size()) {
rptr_.resize(thread_rptr_[tid].size() + 1);
}
}
// initialize rptr to be beginning of each segment
// `start` is a running total: for each key, threads are visited in order and
// each thread's count is swapped for the offset accumulated so far.
size_t start = 0;
for (size_t i = 0; i + 1 < rptr.size(); ++i) {
for (size_t tid = 0; tid < thread_rptr.size(); ++tid) {
std::vector<SizeType> &trptr = thread_rptr[tid];
for (size_t i = 0; i + 1 < rptr_.size(); ++i) {
for (size_t tid = 0; tid < thread_rptr_.size(); ++tid) {
std::vector<SizeType> &trptr = thread_rptr_[tid];
// A thread's vector may be shorter than the number of keys; such keys
// contribute nothing for that thread.
if (i < trptr.size()) {
size_t ncnt = trptr[i];
trptr[i] = start;
start += ncnt;
}
}
rptr[i + 1] = start;
rptr_[i + 1] = start;
}
data.resize(start);
data_.resize(start);
}
/*!
* \brief step 4: add data to the allocated space,
@@ -96,19 +96,19 @@ struct ParallelGroupBuilder {
* \param threadid the id of thread that calls this function
*/
// Stores `value` at the current write position of (threadid, key) and advances
// that position by one; rp is a reference, so the increment is persisted in
// the thread-local offset table.
inline void Push(size_t key, ValueType value, int threadid) {
SizeType &rp = thread_rptr[threadid][key];
data[rp++] = value;
SizeType &rp = thread_rptr_[threadid][key];
data_[rp++] = value;
}
private:
/*! \brief pointer to the beginning and end of each continuous key */
std::vector<SizeType> &rptr;
std::vector<SizeType> &rptr_;
/*! \brief index of nonzero entries in each row */
std::vector<ValueType> &data;
std::vector<ValueType> &data_;
/*! \brief thread local data structure */
std::vector<std::vector<SizeType> > &thread_rptr;
std::vector<std::vector<SizeType> > &thread_rptr_;
/*! \brief local temp thread ptr, use this if not specified by the constructor */
std::vector<std::vector<SizeType> > tmp_thread_rptr;
std::vector<std::vector<SizeType> > tmp_thread_rptr_;
};
} // namespace common
} // namespace xgboost

View File

@@ -17,20 +17,20 @@ namespace xgboost {
namespace common {
void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
typedef common::WXQuantileSketch<bst_float, bst_float> WXQSketch;
const MetaInfo& info = p_fmat->info();
using WXQSketch = common::WXQuantileSketch<bst_float, bst_float>;
const MetaInfo& info = p_fmat->Info();
// safe factor for better accuracy
const int kFactor = 8;
constexpr int kFactor = 8;
std::vector<WXQSketch> sketchs;
const int nthread = omp_get_max_threads();
unsigned nstep = static_cast<unsigned>((info.num_col + nthread - 1) / nthread);
unsigned ncol = static_cast<unsigned>(info.num_col);
sketchs.resize(info.num_col);
auto nstep = static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread);
auto ncol = static_cast<unsigned>(info.num_col_);
sketchs.resize(info.num_col_);
for (auto& s : sketchs) {
s.Init(info.num_row, 1.0 / (max_num_bins * kFactor));
s.Init(info.num_row_, 1.0 / (max_num_bins * kFactor));
}
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
@@ -40,7 +40,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
#pragma omp parallel num_threads(nthread)
{
CHECK_EQ(nthread, omp_get_num_threads());
unsigned tid = static_cast<unsigned>(omp_get_thread_num());
auto tid = static_cast<unsigned>(omp_get_thread_num());
unsigned begin = std::min(nstep * tid, ncol);
unsigned end = std::min(nstep * (tid + 1), ncol);
for (size_t i = 0; i < batch.size; ++i) { // NOLINT(*)
@@ -68,7 +68,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
size_t nbytes = WXQSketch::SummaryContainer::CalcMemCost(max_num_bins * kFactor);
sreducer.Allreduce(dmlc::BeginPtr(summary_array), nbytes, summary_array.size());
this->min_val.resize(info.num_col);
this->min_val.resize(info.num_col_);
row_ptr.push_back(0);
for (size_t fid = 0; fid < summary_array.size(); ++fid) {
WXQSketch::SummaryContainer a;
@@ -105,7 +105,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
}
void GHistIndexMatrix::Init(DMatrix* p_fmat) {
CHECK(cut != nullptr);
CHECK(cut != nullptr); // NOLINT
dmlc::DataIter<RowBatch>* iter = p_fmat->RowIterator();
const int nthread = omp_get_max_threads();
@@ -126,7 +126,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
CHECK_GT(cut->cut.size(), 0U);
CHECK_EQ(cut->row_ptr.back(), cut->cut.size());
omp_ulong bsize = static_cast<omp_ulong>(batch.size);
auto bsize = static_cast<omp_ulong>(batch.size);
#pragma omp parallel for num_threads(nthread) schedule(static)
for (omp_ulong i = 0; i < bsize; ++i) { // NOLINT(*)
const int tid = omp_get_thread_num();
@@ -217,7 +217,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
std::vector<std::vector<bool>> conflict_marks;
std::vector<size_t> group_nnz;
std::vector<size_t> group_conflict_cnt;
const size_t max_conflict_cnt
const auto max_conflict_cnt
= static_cast<size_t>(param.max_conflict_rate * nrow);
for (auto fid : feature_list) {
@@ -336,14 +336,14 @@ FastFeatureGrouping(const GHistIndexMatrix& gmat,
void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
const ColumnMatrix& colmat,
const FastHistParam& param) {
cut = gmat.cut;
cut_ = gmat.cut;
const size_t nrow = gmat.row_ptr.size() - 1;
const uint32_t nbins = gmat.cut->row_ptr.back();
/* step 1: form feature groups */
auto groups = FastFeatureGrouping(gmat, colmat, param);
const uint32_t nblock = static_cast<uint32_t>(groups.size());
const auto nblock = static_cast<uint32_t>(groups.size());
/* step 2: build a new CSR matrix for each feature group */
std::vector<uint32_t> bin2block(nbins); // lookup table [bin id] => [block id]
@@ -380,24 +380,24 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
index_blk_ptr.push_back(0);
row_ptr_blk_ptr.push_back(0);
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
index.insert(index.end(), index_temp[block_id].begin(), index_temp[block_id].end());
row_ptr.insert(row_ptr.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
index_blk_ptr.push_back(index.size());
row_ptr_blk_ptr.push_back(row_ptr.size());
index_.insert(index_.end(), index_temp[block_id].begin(), index_temp[block_id].end());
row_ptr_.insert(row_ptr_.end(), row_ptr_temp[block_id].begin(), row_ptr_temp[block_id].end());
index_blk_ptr.push_back(index_.size());
row_ptr_blk_ptr.push_back(row_ptr_.size());
}
// save shortcut for each block
for (uint32_t block_id = 0; block_id < nblock; ++block_id) {
Block blk;
blk.index_begin = &index[index_blk_ptr[block_id]];
blk.row_ptr_begin = &row_ptr[row_ptr_blk_ptr[block_id]];
blk.index_end = &index[index_blk_ptr[block_id + 1]];
blk.row_ptr_end = &row_ptr[row_ptr_blk_ptr[block_id + 1]];
blocks.push_back(blk);
blk.index_begin = &index_[index_blk_ptr[block_id]];
blk.row_ptr_begin = &row_ptr_[row_ptr_blk_ptr[block_id]];
blk.index_end = &index_[index_blk_ptr[block_id + 1]];
blk.row_ptr_end = &row_ptr_[row_ptr_blk_ptr[block_id + 1]];
blocks_.push_back(blk);
}
}
void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const std::vector<bst_uint>& feat_set,
@@ -405,30 +405,30 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
data_.resize(nbins_ * nthread_, GHistEntry());
std::fill(data_.begin(), data_.end(), GHistEntry());
const int K = 8; // loop unrolling factor
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
constexpr int kUnroll = 8; // loop unrolling factor
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % K;
const size_t rest = nrows % kUnroll;
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint i = 0; i < nrows - rest; i += K) {
for (bst_omp_uint i = 0; i < nrows - rest; i += kUnroll) {
const bst_omp_uint tid = omp_get_thread_num();
const size_t off = tid * nbins_;
size_t rid[K];
size_t ibegin[K];
size_t iend[K];
bst_gpair stat[K];
for (int k = 0; k < K; ++k) {
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
data_[off + bin].Add(stat[k]);
@@ -439,7 +439,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const bst_gpair stat = gpair[rid];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
data_[bin].Add(stat);
@@ -456,37 +456,40 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
}
}
void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
const std::vector<bst_uint>& feat_set,
GHistRow hist) {
const int K = 8; // loop unrolling factor
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
constexpr int kUnroll = 8; // loop unrolling factor
const size_t nblock = gmatb.GetNumBlock();
const size_t nrows = row_indices.end - row_indices.begin;
const size_t rest = nrows % K;
const size_t rest = nrows % kUnroll;
#if defined(_OPENMP)
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
#endif
#pragma omp parallel for num_threads(nthread) schedule(guided)
for (bst_omp_uint bid = 0; bid < nblock; ++bid) {
auto gmat = gmatb[bid];
for (size_t i = 0; i < nrows - rest; i += K) {
size_t rid[K];
size_t ibegin[K];
size_t iend[K];
bst_gpair stat[K];
for (int k = 0; k < K; ++k) {
for (size_t i = 0; i < nrows - rest; i += kUnroll) {
size_t rid[kUnroll];
size_t ibegin[kUnroll];
size_t iend[kUnroll];
GradientPair stat[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
rid[k] = row_indices.begin[i + k];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
ibegin[k] = gmat.row_ptr[rid[k]];
iend[k] = gmat.row_ptr[rid[k] + 1];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
stat[k] = gpair[rid[k]];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
for (size_t j = ibegin[k]; j < iend[k]; ++j) {
const uint32_t bin = gmat.index[j];
hist.begin[bin].Add(stat[k]);
@@ -497,7 +500,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
const size_t rid = row_indices.begin[i];
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const bst_gpair stat = gpair[rid];
const GradientPair stat = gpair[rid];
for (size_t j = ibegin; j < iend; ++j) {
const uint32_t bin = gmat.index[j];
hist.begin[bin].Add(stat);
@@ -507,21 +510,26 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
}
void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent) {
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
const uint32_t nbins = static_cast<bst_omp_uint>(nbins_);
const int K = 8; // loop unrolling factor
const uint32_t rest = nbins % K;
constexpr int kUnroll = 8; // loop unrolling factor
const uint32_t rest = nbins % kUnroll;
#if defined(_OPENMP)
const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
#endif
#pragma omp parallel for num_threads(nthread) schedule(static)
for (bst_omp_uint bin_id = 0; bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += K) {
GHistEntry pb[K];
GHistEntry sb[K];
for (int k = 0; k < K; ++k) {
for (bst_omp_uint bin_id = 0;
bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += kUnroll) {
GHistEntry pb[kUnroll];
GHistEntry sb[kUnroll];
for (int k = 0; k < kUnroll; ++k) {
pb[k] = parent.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
sb[k] = sibling.begin[bin_id + k];
}
for (int k = 0; k < K; ++k) {
for (int k = 0; k < kUnroll; ++k) {
self.begin[bin_id + k].SetSubtract(pb[k], sb[k]);
}
}

View File

@@ -13,26 +13,26 @@
#include "row_set.h"
#include "../tree/fast_hist_param.h"
using xgboost::tree::FastHistParam;
namespace xgboost {
namespace common {
using tree::FastHistParam;
/*! \brief sums of gradient statistics corresponding to a histogram bin */
struct GHistEntry {
/*! \brief sum of first-order gradient statistics */
double sum_grad;
double sum_grad{0};
/*! \brief sum of second-order gradient statistics */
double sum_hess;
double sum_hess{0};
GHistEntry() : sum_grad(0), sum_hess(0) {}
GHistEntry() = default;
// Resets both accumulated sums to zero.
inline void Clear() {
sum_grad = sum_hess = 0;
}
/*! \brief add a bst_gpair to the sum */
inline void Add(const bst_gpair& e) {
/*! \brief add a GradientPair to the sum */
// Accumulates e.GetGrad() into sum_grad and e.GetHess() into sum_hess.
inline void Add(const GradientPair& e) {
sum_grad += e.GetGrad();
sum_hess += e.GetHess();
}
@@ -58,7 +58,7 @@ struct HistCutUnit {
/*! \brief number of cutting point, containing the maximum point */
uint32_t size;
// default constructor
HistCutUnit() {}
HistCutUnit() = default;
// constructor
HistCutUnit(const bst_float* cut, uint32_t size)
: cut(cut), size(size) {}
@@ -74,8 +74,8 @@ struct HistCutMatrix {
std::vector<bst_float> cut;
/*! \brief Get histogram bound for fid */
// Returns a HistCutUnit viewing the slice of `cut` that starts at offset
// row_ptr[fid] and has row_ptr[fid + 1] - row_ptr[fid] entries. No copy is
// made: the unit holds a pointer into `cut`.
inline HistCutUnit operator[](bst_uint fid) const {
return HistCutUnit(dmlc::BeginPtr(cut) + row_ptr[fid],
row_ptr[fid + 1] - row_ptr[fid]);
return {dmlc::BeginPtr(cut) + row_ptr[fid],
row_ptr[fid + 1] - row_ptr[fid]};
}
// create histogram cut matrix given statistics from data
// using approximate quantile sketch approach
@@ -92,7 +92,7 @@ struct GHistIndexRow {
const uint32_t* index;
/*! \brief The size of the histogram */
size_t size;
GHistIndexRow() {}
GHistIndexRow() = default;
GHistIndexRow(const uint32_t* index, size_t size)
: index(index), size(size) {}
};
@@ -115,7 +115,7 @@ struct GHistIndexMatrix {
void Init(DMatrix* p_fmat);
// get i-th row
// The returned GHistIndexRow points into `index` at offset row_ptr[i] and
// spans row_ptr[i + 1] - row_ptr[i] entries.
inline GHistIndexRow operator[](size_t i) const {
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
}
inline void GetFeatureCounts(size_t* counts) const {
auto nfeature = cut->row_ptr.size() - 1;
@@ -141,7 +141,7 @@ struct GHistIndexBlock {
// get i-th row
inline GHistIndexRow operator[](size_t i) const {
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
return {&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]};
}
};
@@ -154,24 +154,24 @@ class GHistIndexBlockMatrix {
const FastHistParam& param);
inline GHistIndexBlock operator[](size_t i) const {
return GHistIndexBlock(blocks[i].row_ptr_begin, blocks[i].index_begin);
return {blocks_[i].row_ptr_begin, blocks_[i].index_begin};
}
inline size_t GetNumBlock() const {
return blocks.size();
return blocks_.size();
}
private:
std::vector<size_t> row_ptr;
std::vector<uint32_t> index;
const HistCutMatrix* cut;
std::vector<size_t> row_ptr_;
std::vector<uint32_t> index_;
const HistCutMatrix* cut_;
struct Block {
const size_t* row_ptr_begin;
const size_t* row_ptr_end;
const uint32_t* index_begin;
const uint32_t* index_end;
};
std::vector<Block> blocks;
std::vector<Block> blocks_;
};
/*!
@@ -186,7 +186,7 @@ struct GHistRow {
/*! \brief number of entries */
uint32_t size;
GHistRow() {}
GHistRow() = default;
GHistRow(GHistEntry* begin, uint32_t size)
: begin(begin), size(size) {}
};
@@ -198,15 +198,15 @@ class HistCollection {
public:
// access histogram for i-th node
inline GHistRow operator[](bst_uint nid) const {
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
CHECK_NE(row_ptr_[nid], kMax);
return GHistRow(const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_);
return {const_cast<GHistEntry*>(dmlc::BeginPtr(data_) + row_ptr_[nid]), nbins_};
}
// have we computed a histogram for i-th node?
inline bool RowExists(bst_uint nid) const {
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != kMax);
const uint32_t k_max = std::numeric_limits<uint32_t>::max();
return (nid < row_ptr_.size() && row_ptr_[nid] != k_max);
}
// initialize histogram collection
@@ -218,7 +218,7 @@ class HistCollection {
// create an empty histogram for i-th node
inline void AddHistRow(bst_uint nid) {
const uint32_t kMax = std::numeric_limits<uint32_t>::max();
constexpr uint32_t kMax = std::numeric_limits<uint32_t>::max();
if (nid >= row_ptr_.size()) {
row_ptr_.resize(nid + 1, kMax);
}
@@ -250,13 +250,13 @@ class GHistBuilder {
}
// construct a histogram via histogram aggregation
void BuildHist(const std::vector<bst_gpair>& gpair,
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const std::vector<bst_uint>& feat_set,
GHistRow hist);
// same, with feature grouping
void BuildBlockHist(const std::vector<bst_gpair>& gpair,
void BuildBlockHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexBlockMatrix& gmatb,
const std::vector<bst_uint>& feat_set,

View File

@@ -6,6 +6,8 @@
// dummy implementation of HostDeviceVector in case CUDA is not used
#include <xgboost/base.h>
#include <utility>
#include "./host_device_vector.h"
namespace xgboost {
@@ -13,8 +15,8 @@ namespace xgboost {
template <typename T>
struct HostDeviceVectorImpl {
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
explicit HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
explicit HostDeviceVectorImpl(const std::vector<T>& init) : data_h_(init) {}
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
std::vector<T> data_h_;
};
@@ -43,25 +45,25 @@ HostDeviceVector<T>::~HostDeviceVector() {
}
template <typename T>
size_t HostDeviceVector<T>::size() const { return impl_->data_h_.size(); }
size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
template <typename T>
int HostDeviceVector<T>::device() const { return -1; }
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
template <typename T>
T* HostDeviceVector<T>::ptr_d(int device) { return nullptr; }
T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
template <typename T>
std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h_; }
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
template <typename T>
void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
impl_->data_h_.resize(new_size, v);
}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<bst_gpair>;
template class HostDeviceVector<GradientPair>;
} // namespace xgboost

View File

@@ -35,27 +35,27 @@ struct HostDeviceVectorImpl {
void operator=(const HostDeviceVectorImpl<T>&) = delete;
void operator=(HostDeviceVectorImpl<T>&&) = delete;
size_t size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
size_t Size() const { return on_d_ ? data_d_.size() : data_h_.size(); }
int device() const { return device_; }
int DeviceIdx() const { return device_; }
T* ptr_d(int device) {
lazy_sync_device(device);
T* DevicePointer(int device) {
LazySyncDevice(device);
return data_d_.data().get();
}
thrust::device_ptr<T> tbegin(int device) {
return thrust::device_ptr<T>(ptr_d(device));
thrust::device_ptr<T> tbegin(int device) { // NOLINT
return thrust::device_ptr<T>(DevicePointer(device));
}
thrust::device_ptr<T> tend(int device) {
thrust::device_ptr<T> tend(int device) { // NOLINT
auto begin = tbegin(device);
return begin + size();
return begin + Size();
}
std::vector<T>& data_h() {
lazy_sync_host();
std::vector<T>& HostVector() {
LazySyncHost();
return data_h_;
}
void resize(size_t new_size, T v, int new_device) {
if (new_size == this->size() && new_device == device_)
void Resize(size_t new_size, T v, int new_device) {
if (new_size == this->Size() && new_device == device_)
return;
if (new_device != -1)
device_ = new_device;
@@ -70,26 +70,26 @@ struct HostDeviceVectorImpl {
}
}
void lazy_sync_host() {
void LazySyncHost() {
if (!on_d_)
return;
if (data_h_.size() != this->size())
data_h_.resize(this->size());
if (data_h_.size() != this->Size())
data_h_.resize(this->Size());
dh::safe_cuda(cudaSetDevice(device_));
thrust::copy(data_d_.begin(), data_d_.end(), data_h_.begin());
on_d_ = false;
}
void lazy_sync_device(int device) {
void LazySyncDevice(int device) {
if (on_d_)
return;
if (device != device_) {
CHECK_EQ(device_, -1);
device_ = device;
}
if (data_d_.size() != this->size()) {
if (data_d_.size() != this->Size()) {
dh::safe_cuda(cudaSetDevice(device_));
data_d_.resize(this->size());
data_d_.resize(this->Size());
}
dh::safe_cuda(cudaSetDevice(device_));
thrust::copy(data_h_.begin(), data_h_.end(), data_d_.begin());
@@ -128,34 +128,34 @@ HostDeviceVector<T>::~HostDeviceVector() {
}
template <typename T>
size_t HostDeviceVector<T>::size() const { return impl_->size(); }
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T>
int HostDeviceVector<T>::device() const { return impl_->device(); }
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
template <typename T>
T* HostDeviceVector<T>::ptr_d(int device) { return impl_->ptr_d(device); }
T* HostDeviceVector<T>::DevicePointer(int device) { return impl_->DevicePointer(device); }
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {
thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) { // NOLINT
return impl_->tbegin(device);
}
template <typename T>
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {
thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) { // NOLINT
return impl_->tend(device);
}
template <typename T>
std::vector<T>& HostDeviceVector<T>::data_h() { return impl_->data_h(); }
std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->HostVector(); }
template <typename T>
void HostDeviceVector<T>::resize(size_t new_size, T v, int new_device) {
impl_->resize(new_size, v, new_device);
void HostDeviceVector<T>::Resize(size_t new_size, T v, int new_device) {
impl_->Resize(new_size, v, new_device);
}
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;
template class HostDeviceVector<bst_gpair>;
template class HostDeviceVector<GradientPair>;
} // namespace xgboost

View File

@@ -70,10 +70,10 @@ class HostDeviceVector {
HostDeviceVector(HostDeviceVector<T>&&) = delete;
void operator=(const HostDeviceVector<T>&) = delete;
void operator=(HostDeviceVector<T>&&) = delete;
size_t size() const;
int device() const;
T* ptr_d(int device);
T* ptr_h() { return data_h().data(); }
size_t Size() const;
int DeviceIdx() const;
T* DevicePointer(int device);
T* HostPointer() { return HostVector().data(); }
// only define functions returning device_ptr
// if HostDeviceVector.h is included from a .cu file
@@ -82,10 +82,10 @@ class HostDeviceVector {
thrust::device_ptr<T> tend(int device);
#endif
std::vector<T>& data_h();
std::vector<T>& HostVector();
// passing in new_device == -1 keeps the device as is
void resize(size_t new_size, T v = T(), int new_device = -1);
void Resize(size_t new_size, T v = T(), int new_device = -1);
private:
HostDeviceVectorImpl<T>* impl_;

View File

@@ -15,8 +15,8 @@
namespace xgboost {
namespace common {
typedef rabit::utils::MemoryFixSizeBuffer MemoryFixSizeBuffer;
typedef rabit::utils::MemoryBufferStream MemoryBufferStream;
using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
using MemoryBufferStream = rabit::utils::MemoryBufferStream;
/*!
* \brief Input stream that support additional PeekRead

View File

@@ -39,12 +39,12 @@ inline void Softmax(std::vector<float>* p_rec) {
wmax = std::max(rec[i], wmax);
}
double wsum = 0.0f;
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] = std::exp(rec[i] - wmax);
wsum += rec[i];
for (float & elem : rec) {
elem = std::exp(elem - wmax);
wsum += elem;
}
for (size_t i = 0; i < rec.size(); ++i) {
rec[i] /= static_cast<float>(wsum);
for (float & elem : rec) {
elem /= static_cast<float>(wsum);
}
}

View File

@@ -35,7 +35,7 @@ struct WQSummary {
/*! \brief the value of data */
DType value;
// constructor
Entry() {}
Entry() = default;
// constructor
Entry(RType rmin, RType rmax, RType wmin, DType value)
: rmin(rmin), rmax(rmax), wmin(wmin), value(value) {}
@@ -48,11 +48,11 @@ struct WQSummary {
CHECK(rmax- rmin - wmin > -eps) << "relation constraint: min/max";
}
/*! \return rmin estimation for v strictly bigger than value */
inline RType rmin_next() const {
inline RType RMinNext() const {
return rmin + wmin;
}
/*! \return rmax estimation for v strictly smaller than value */
inline RType rmax_prev() const {
inline RType RMaxPrev() const {
return rmax - wmin;
}
};
@@ -65,7 +65,7 @@ struct WQSummary {
// weight of instance
RType weight;
// default constructor
QEntry() {}
QEntry() = default;
// constructor
QEntry(DType value, RType weight)
: value(value), weight(weight) {}
@@ -116,7 +116,7 @@ struct WQSummary {
inline RType MaxError() const {
RType res = data[0].rmax - data[0].rmin - data[0].wmin;
for (size_t i = 1; i < size; ++i) {
res = std::max(data[i].rmax_prev() - data[i - 1].rmin_next(), res);
res = std::max(data[i].RMaxPrev() - data[i - 1].RMinNext(), res);
res = std::max(data[i].rmax - data[i].rmin - data[i].wmin, res);
}
return res;
@@ -140,8 +140,8 @@ struct WQSummary {
if (istart == 0) {
return Entry(0.0f, 0.0f, 0.0f, qvalue);
} else {
return Entry(data[istart - 1].rmin_next(),
data[istart].rmax_prev(),
return Entry(data[istart - 1].RMinNext(),
data[istart].RMaxPrev(),
0.0f, qvalue);
}
}
@@ -197,7 +197,7 @@ struct WQSummary {
while (i < src.size - 1
&& dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
CHECK(i != src.size - 1);
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
if (i != lastidx) {
data[size++] = src.data[i]; lastidx = i;
}
@@ -236,20 +236,20 @@ struct WQSummary {
*dst = Entry(a->rmin + b->rmin,
a->rmax + b->rmax,
a->wmin + b->wmin, a->value);
aprev_rmin = a->rmin_next();
bprev_rmin = b->rmin_next();
aprev_rmin = a->RMinNext();
bprev_rmin = b->RMinNext();
++dst; ++a; ++b;
} else if (a->value < b->value) {
*dst = Entry(a->rmin + bprev_rmin,
a->rmax + b->rmax_prev(),
a->rmax + b->RMaxPrev(),
a->wmin, a->value);
aprev_rmin = a->rmin_next();
aprev_rmin = a->RMinNext();
++dst; ++a;
} else {
*dst = Entry(b->rmin + aprev_rmin,
b->rmax + a->rmax_prev(),
b->rmax + a->RMaxPrev(),
b->wmin, b->value);
bprev_rmin = b->rmin_next();
bprev_rmin = b->RMinNext();
++dst; ++b;
}
}
@@ -307,7 +307,7 @@ struct WQSummary {
data[i].rmax = prev_rmax;
*err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
}
RType rmin_next = data[i].rmin_next();
RType rmin_next = data[i].RMinNext();
if (data[i].rmax < rmin_next) {
data[i].rmax = rmin_next;
*err_wgap = std::max(*err_wgap, data[i].rmax - rmin_next);
@@ -334,13 +334,13 @@ struct WQSummary {
template<typename DType, typename RType>
struct WXQSummary : public WQSummary<DType, RType> {
// redefine entry type
typedef typename WQSummary<DType, RType>::Entry Entry;
using Entry = typename WQSummary<DType, RType>::Entry;
// constructor
WXQSummary(Entry *data, size_t size)
: WQSummary<DType, RType>(data, size) {}
// check if the block is large chunk
inline static bool CheckLarge(const Entry &e, RType chunk) {
return e.rmin_next() > e.rmax_prev() + chunk;
return e.RMinNext() > e.RMaxPrev() + chunk;
}
// set prune
inline void SetPrune(const WQSummary<DType, RType> &src, size_t maxsize) {
@@ -377,13 +377,13 @@ struct WXQSummary : public WQSummary<DType, RType> {
if (CheckLarge(src.data[i], chunk)) {
if (bid != i - 1) {
// accumulate the range of the rest points
mrange += src.data[i].rmax_prev() - src.data[bid].rmin_next();
mrange += src.data[i].RMaxPrev() - src.data[bid].RMinNext();
}
bid = i; ++nbig;
}
}
if (bid != src.size - 2) {
mrange += src.data[src.size-1].rmax_prev() - src.data[bid].rmin_next();
mrange += src.data[src.size-1].RMaxPrev() - src.data[bid].RMinNext();
}
}
// assert: there cannot be more than n big data points
@@ -405,14 +405,14 @@ struct WXQSummary : public WQSummary<DType, RType> {
if (end == src.size - 1 || CheckLarge(src.data[end], chunk)) {
if (bid != end - 1) {
size_t i = bid;
RType maxdx2 = src.data[end].rmax_prev() * 2;
RType maxdx2 = src.data[end].RMaxPrev() * 2;
for (; k < n; ++k) {
RType dx2 = 2 * ((k * mrange) / n + begin);
if (dx2 >= maxdx2) break;
while (i < end &&
dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
if (i == end) break;
if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
if (dx2 < src.data[i].RMinNext() + src.data[i + 1].RMaxPrev()) {
if (i != lastidx) {
this->data[this->size++] = src.data[i]; lastidx = i;
}
@@ -429,7 +429,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
}
bid = end;
// shift base by the gap
begin += src.data[bid].rmin_next() - src.data[bid].rmax_prev();
begin += src.data[bid].RMinNext() - src.data[bid].RMaxPrev();
}
}
}
@@ -448,7 +448,7 @@ struct GKSummary {
/*! \brief the value of data */
DType value;
// constructor
Entry() {}
Entry() = default;
// constructor
Entry(RType rmin, RType rmax, DType value)
: rmin(rmin), rmax(rmax), value(value) {}
@@ -591,17 +591,17 @@ template<typename DType, typename RType, class TSummary>
class QuantileSketchTemplate {
public:
/*! \brief type of summary type */
typedef TSummary Summary;
using Summary = TSummary;
/*! \brief the entry type */
typedef typename Summary::Entry Entry;
using Entry = typename Summary::Entry;
/*! \brief same as summary, but use STL to backup the space */
struct SummaryContainer : public Summary {
std::vector<Entry> space;
SummaryContainer(const SummaryContainer &src) : Summary(NULL, src.size) {
SummaryContainer(const SummaryContainer &src) : Summary(nullptr, src.size) {
this->space = src.space;
this->data = dmlc::BeginPtr(this->space);
}
SummaryContainer() : Summary(NULL, 0) {
SummaryContainer() : Summary(nullptr, 0) {
}
/*! \brief reserve space for summary */
inline void Reserve(size_t size) {
@@ -775,7 +775,7 @@ class QuantileSketchTemplate {
inline void InitLevel(size_t nlevel) {
if (level.size() >= nlevel) return;
data.resize(limit_size * nlevel);
level.resize(nlevel, Summary(NULL, 0));
level.resize(nlevel, Summary(nullptr, 0));
for (size_t l = 0; l < level.size(); ++l) {
level[l].data = dmlc::BeginPtr(data) + l * limit_size;
}

View File

@@ -15,7 +15,7 @@ namespace common {
/*!
* \brief Define mt19937 as default type Random Engine.
*/
typedef std::mt19937 RandomEngine;
using RandomEngine = std::mt19937;
#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG
/*!
@@ -56,7 +56,7 @@ typedef CustomGlobalRandomEngine GlobalRandomEngine;
/*!
* \brief global random engine
*/
typedef RandomEngine GlobalRandomEngine;
using GlobalRandomEngine = RandomEngine;
#endif
/*!

View File

@@ -21,18 +21,18 @@ class RowSetCollection {
* rows (instances) associated with a particular node in a decision
* tree. */
struct Elem {
const size_t* begin;
const size_t* end;
int node_id;
const size_t* begin{nullptr};
const size_t* end{nullptr};
int node_id{-1};
// id of node associated with this instance set; -1 means uninitialized
Elem(void)
: begin(nullptr), end(nullptr), node_id(-1) {}
Elem()
= default;
Elem(const size_t* begin,
const size_t* end,
int node_id)
: begin(begin), end(end), node_id(node_id) {}
inline size_t size() const {
inline size_t Size() const {
return end - begin;
}
};
@@ -42,11 +42,11 @@ class RowSetCollection {
std::vector<size_t> right;
};
inline std::vector<Elem>::const_iterator begin() const {
inline std::vector<Elem>::const_iterator begin() const { // NOLINT
return elem_of_each_node_.begin();
}
inline std::vector<Elem>::const_iterator end() const {
inline std::vector<Elem>::const_iterator end() const { // NOLINT
return elem_of_each_node_.end();
}
@@ -88,7 +88,7 @@ class RowSetCollection {
unsigned left_node_id,
unsigned right_node_id) {
const Elem e = elem_of_each_node_[node_id];
const bst_omp_uint nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
const auto nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
CHECK(e.begin != nullptr);
size_t* all_begin = dmlc::BeginPtr(row_indices_);
size_t* begin = all_begin + (e.begin - all_begin);

View File

@@ -12,10 +12,10 @@
namespace xgboost {
namespace common {
struct Timer {
typedef std::chrono::high_resolution_clock ClockT;
typedef std::chrono::high_resolution_clock::time_point TimePointT;
typedef std::chrono::high_resolution_clock::duration DurationT;
typedef std::chrono::duration<double> SecondsT;
using ClockT = std::chrono::high_resolution_clock;
using TimePointT = std::chrono::high_resolution_clock::time_point;
using DurationT = std::chrono::high_resolution_clock::duration;
using SecondsT = std::chrono::duration<double>;
TimePointT start;
DurationT elapsed;
@@ -70,7 +70,7 @@ struct Monitor {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::synchronize_n_devices(dList.size(), dList);
dh::SynchronizeNDevices(dList.size(), dList);
#endif
}
timer_map[name].Start();
@@ -80,7 +80,7 @@ struct Monitor {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::synchronize_n_devices(dList.size(), dList);
dh::SynchronizeNDevices(dList.size(), dList);
#endif
}
timer_map[name].Stop();