Various bug fixes (#2825)
* Fatal error if GPU algorithm is selected without GPU support compiled in
* Resolve type conversion warnings
* Fix GPU unit test failure
* Fix compressed iterator edge case
* Fix Python unit test failures due to flake8 update on pip
This commit is contained in:
@@ -6,7 +6,7 @@
|
||||
#include <xgboost/base.h>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include "dmlc/logging.h"
|
||||
#include <algorithm>
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -28,8 +28,9 @@ static const int padding = 4; // Assign padding so we can read slightly off
|
||||
// the beginning of the array
|
||||
|
||||
// The number of bits required to represent a given unsigned range
|
||||
static int SymbolBits(int num_symbols) {
|
||||
return std::ceil(std::log2(num_symbols));
|
||||
static size_t SymbolBits(size_t num_symbols) {
|
||||
auto bits = std::ceil(std::log2(num_symbols));
|
||||
return std::max(static_cast<size_t>(bits), size_t(1));
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
@@ -72,9 +73,9 @@ class CompressedBufferWriter {
|
||||
|
||||
static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
|
||||
const int bits_per_byte = 8;
|
||||
size_t compressed_size = std::ceil(
|
||||
size_t compressed_size = static_cast<size_t>(std::ceil(
|
||||
static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
|
||||
bits_per_byte);
|
||||
bits_per_byte));
|
||||
return compressed_size + detail::padding;
|
||||
}
|
||||
|
||||
@@ -98,8 +99,8 @@ class CompressedBufferWriter {
|
||||
template <typename iter_t>
|
||||
void Write(compressed_byte_t *buffer, iter_t input_begin, iter_t input_end) {
|
||||
uint64_t tmp = 0;
|
||||
int stored_bits = 0;
|
||||
const int max_stored_bits = 64 - symbol_bits_;
|
||||
size_t stored_bits = 0;
|
||||
const size_t max_stored_bits = 64 - symbol_bits_;
|
||||
size_t buffer_position = detail::padding;
|
||||
const size_t num_symbols = input_end - input_begin;
|
||||
for (size_t i = 0; i < num_symbols; i++) {
|
||||
@@ -108,7 +109,8 @@ class CompressedBufferWriter {
|
||||
// Eject only full bytes
|
||||
size_t tmp_bytes = stored_bits / 8;
|
||||
for (size_t j = 0; j < tmp_bytes; j++) {
|
||||
buffer[buffer_position] = tmp >> (stored_bits - (j + 1) * 8);
|
||||
buffer[buffer_position] = static_cast<compressed_byte_t>(
|
||||
tmp >> (stored_bits - (j + 1) * 8));
|
||||
buffer_position++;
|
||||
}
|
||||
stored_bits -= tmp_bytes * 8;
|
||||
@@ -121,13 +123,16 @@ class CompressedBufferWriter {
|
||||
}
|
||||
|
||||
// Eject all bytes
|
||||
size_t tmp_bytes = std::ceil(static_cast<float>(stored_bits) / 8);
|
||||
for (size_t j = 0; j < tmp_bytes; j++) {
|
||||
int shift_bits = stored_bits - (j + 1) * 8;
|
||||
int tmp_bytes =
|
||||
static_cast<int>(std::ceil(static_cast<float>(stored_bits) / 8));
|
||||
for (int j = 0; j < tmp_bytes; j++) {
|
||||
int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
|
||||
if (shift_bits >= 0) {
|
||||
buffer[buffer_position] = tmp >> shift_bits;
|
||||
buffer[buffer_position] =
|
||||
static_cast<compressed_byte_t>(tmp >> shift_bits);
|
||||
} else {
|
||||
buffer[buffer_position] = tmp << std::abs(shift_bits);
|
||||
buffer[buffer_position] =
|
||||
static_cast<compressed_byte_t>(tmp << std::abs(shift_bits));
|
||||
}
|
||||
buffer_position++;
|
||||
}
|
||||
|
||||
@@ -125,7 +125,7 @@ inline size_t available_memory(int device_idx) {
|
||||
* \param device_idx Zero-based index of the device.
|
||||
*/
|
||||
|
||||
inline int max_shared_memory(int device_idx) {
|
||||
inline size_t max_shared_memory(int device_idx) {
|
||||
cudaDeviceProp prop;
|
||||
dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
|
||||
return prop.sharedMemPerBlock;
|
||||
@@ -241,8 +241,7 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
|
||||
}
|
||||
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
// TODO: Template on n so GRID_SIZE always fits into int.
|
||||
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
|
||||
const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
|
||||
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
|
||||
lambda);
|
||||
}
|
||||
@@ -428,74 +427,66 @@ class bulk_allocator {
|
||||
|
||||
const int align = 256;
|
||||
|
||||
template <typename SizeT>
|
||||
size_t align_round_up(SizeT n) {
|
||||
size_t align_round_up(size_t n) const {
|
||||
n = (n + align - 1) / align;
|
||||
return n * align;
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
|
||||
return align_round_up<SizeT>(first_size * sizeof(T));
|
||||
template <typename T>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size) {
|
||||
return align_round_up(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return get_size_bytes<T, SizeT>(first_vec, first_size) +
|
||||
get_size_bytes(args...);
|
||||
template <typename T, typename... Args>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, size_t first_size, Args... args) {
|
||||
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
template <typename T>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
|
||||
SizeT first_size) {
|
||||
size_t first_size) {
|
||||
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
template <typename T, typename... Args>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
|
||||
SizeT first_size, Args... args) {
|
||||
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
|
||||
first_size);
|
||||
size_t first_size, Args... args) {
|
||||
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += align_round_up(first_size * sizeof(T));
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
}
|
||||
|
||||
// template <memory_type MemoryT>
|
||||
char *allocate_device(int device_idx, size_t bytes, memory_type t) {
|
||||
char *ptr;
|
||||
if (t == memory_type::DEVICE) {
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaMalloc(&ptr, bytes));
|
||||
} else {
|
||||
safe_cuda(cudaMallocManaged(&ptr, bytes));
|
||||
}
|
||||
safe_cuda(cudaSetDevice(device_idx));
|
||||
safe_cuda(cudaMalloc(&ptr, bytes));
|
||||
return ptr;
|
||||
}
|
||||
template <typename T, typename SizeT>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
|
||||
template <typename T>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size) {
|
||||
return 2 * align_round_up(first_size * sizeof(T));
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return get_size_bytes<T, SizeT>(first_vec, first_size) +
|
||||
template <typename T, typename... Args>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
|
||||
return get_size_bytes<T>(first_vec, first_size) +
|
||||
get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
template <typename T>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
SizeT first_size) {
|
||||
size_t first_size) {
|
||||
first_vec->external_allocate(
|
||||
device_idx, static_cast<void *>(ptr),
|
||||
static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
template <typename T, typename... Args>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
SizeT first_size, Args... args) {
|
||||
allocate_dvec<T, SizeT>(device_idx, ptr, first_vec, first_size);
|
||||
size_t first_size, Args... args) {
|
||||
allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += (align_round_up(first_size * sizeof(T)) * 2);
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
}
|
||||
@@ -544,14 +535,13 @@ struct CubMemory {
|
||||
// Thrust
|
||||
typedef char value_type;
|
||||
|
||||
CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
|
||||
CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}
|
||||
|
||||
~CubMemory() { Free(); }
|
||||
|
||||
template <typename T>
|
||||
T* Pointer()
|
||||
{
|
||||
return static_cast<T*>(d_temp_storage);
|
||||
T *Pointer() {
|
||||
return static_cast<T *>(d_temp_storage);
|
||||
}
|
||||
|
||||
void Free() {
|
||||
@@ -611,7 +601,7 @@ void print(const dvec<T> &v, size_t max_items = 10) {
|
||||
|
||||
template <typename coordinate_t, typename segments_t, typename offset_t>
|
||||
void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
|
||||
int num_tiles, int tile_size, segments_t segments,
|
||||
size_t num_tiles, int tile_size, segments_t segments,
|
||||
offset_t num_rows, offset_t num_elements) {
|
||||
dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
|
||||
offset_t diagonal = idx * tile_size;
|
||||
@@ -692,7 +682,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
|
||||
const int BLOCK_THREADS = 256;
|
||||
const int ITEMS_PER_THREAD = 1;
|
||||
const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
|
||||
int num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
|
||||
auto num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
|
||||
CHECK(num_tiles < std::numeric_limits<unsigned int>::max());
|
||||
|
||||
temp_memory->LazyAllocate(sizeof(coordinate_t) * (num_tiles + 1));
|
||||
coordinate_t *tmp_tile_coordinates =
|
||||
@@ -702,7 +693,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
|
||||
BLOCK_THREADS, segments, num_segments, count);
|
||||
|
||||
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
|
||||
<<<num_tiles, BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
|
||||
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
|
||||
num_segments);
|
||||
}
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
|
||||
const int nthread = omp_get_max_threads();
|
||||
|
||||
unsigned nstep = (info.num_col + nthread - 1) / nthread;
|
||||
unsigned nstep = static_cast<unsigned>((info.num_col + nthread - 1) / nthread);
|
||||
unsigned ncol = static_cast<unsigned>(info.num_col);
|
||||
sketchs.resize(info.num_col);
|
||||
for (auto& s : sketchs) {
|
||||
@@ -79,7 +79,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
if (a.size > 1 && a.size <= 16) {
|
||||
/* specialized code for categorical / ordinal data -- use midpoints */
|
||||
for (size_t i = 1; i < a.size; ++i) {
|
||||
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
|
||||
bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0f;
|
||||
if (i == 1 || cpt > cut.back()) {
|
||||
cut.push_back(cpt);
|
||||
}
|
||||
@@ -99,7 +99,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
|
||||
bst_float last = cpt + fabs(cpt);
|
||||
cut.push_back(last);
|
||||
}
|
||||
row_ptr.push_back(cut.size());
|
||||
row_ptr.push_back(static_cast<bst_uint>(cut.size()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,7 +148,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
|
||||
}
|
||||
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint idx = 0; idx < nbins; ++idx) {
|
||||
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
|
||||
for (int tid = 0; tid < nthread; ++tid) {
|
||||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
|
||||
}
|
||||
@@ -226,7 +226,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
|
||||
bool need_new_group = true;
|
||||
|
||||
// randomly choose some of existing groups as candidates
|
||||
std::vector<unsigned> search_groups;
|
||||
std::vector<size_t> search_groups;
|
||||
for (size_t gid = 0; gid < groups.size(); ++gid) {
|
||||
if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
|
||||
search_groups.push_back(gid);
|
||||
@@ -434,7 +434,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
@@ -448,7 +448,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
|
||||
/* reduction */
|
||||
const uint32_t nbins = nbins_;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
|
||||
for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) {
|
||||
for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
|
||||
hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]);
|
||||
}
|
||||
@@ -462,7 +462,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
GHistRow hist) {
|
||||
const int K = 8; // loop unrolling factor
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
|
||||
const uint32_t nblock = gmatb.GetNumBlock();
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
const size_t nrows = row_indices.end - row_indices.begin;
|
||||
const size_t rest = nrows % K;
|
||||
|
||||
@@ -492,7 +492,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
|
||||
}
|
||||
}
|
||||
}
|
||||
for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
|
||||
for (size_t i = nrows - rest; i < nrows; ++i) {
|
||||
const size_t rid = row_indices.begin[i];
|
||||
const size_t ibegin = gmat.row_ptr[rid];
|
||||
const size_t iend = gmat.row_ptr[rid + 1];
|
||||
@@ -511,7 +511,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
|
||||
const int K = 8; // loop unrolling factor
|
||||
const uint32_t rest = nbins % K;
|
||||
#pragma omp parallel for num_threads(nthread) schedule(static)
|
||||
for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
|
||||
for (bst_omp_uint bin_id = 0; bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += K) {
|
||||
GHistEntry pb[K];
|
||||
GHistEntry sb[K];
|
||||
for (int k = 0; k < K; ++k) {
|
||||
|
||||
@@ -118,11 +118,11 @@ struct GHistIndexMatrix {
|
||||
return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
|
||||
}
|
||||
inline void GetFeatureCounts(size_t* counts) const {
|
||||
const unsigned nfeature = cut->row_ptr.size() - 1;
|
||||
auto nfeature = cut->row_ptr.size() - 1;
|
||||
for (unsigned fid = 0; fid < nfeature; ++fid) {
|
||||
const unsigned ibegin = cut->row_ptr[fid];
|
||||
const unsigned iend = cut->row_ptr[fid + 1];
|
||||
for (unsigned i = ibegin; i < iend; ++i) {
|
||||
auto ibegin = cut->row_ptr[fid];
|
||||
auto iend = cut->row_ptr[fid + 1];
|
||||
for (auto i = ibegin; i < iend; ++i) {
|
||||
counts[fid] += hit_count[i];
|
||||
}
|
||||
}
|
||||
@@ -235,7 +235,7 @@ class HistCollection {
|
||||
std::vector<GHistEntry> data_;
|
||||
|
||||
/*! \brief row_ptr_[nid] locates bin for histogram of node nid */
|
||||
std::vector<uint32_t> row_ptr_;
|
||||
std::vector<size_t> row_ptr_;
|
||||
};
|
||||
|
||||
/*!
|
||||
|
||||
@@ -680,12 +680,12 @@ class QuantileSketchTemplate {
|
||||
nlevel = 1;
|
||||
while (true) {
|
||||
limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
|
||||
size_t n = (1UL << nlevel);
|
||||
size_t n = (1ULL << nlevel);
|
||||
if (n * limit_size >= maxn) break;
|
||||
++nlevel;
|
||||
}
|
||||
// check invariant
|
||||
size_t n = (1UL << nlevel);
|
||||
size_t n = (1ULL << nlevel);
|
||||
CHECK(n * limit_size >= maxn) << "invalid init parameter";
|
||||
CHECK(nlevel <= limit_size * eps) << "invalid init parameter";
|
||||
// lazy reserve the space, if there is only one value, no need to allocate space
|
||||
|
||||
@@ -88,7 +88,7 @@ class RowSetCollection {
|
||||
unsigned left_node_id,
|
||||
unsigned right_node_id) {
|
||||
const Elem e = elem_of_each_node_[node_id];
|
||||
const unsigned nthread = row_split_tloc.size();
|
||||
const bst_omp_uint nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
|
||||
CHECK(e.begin != nullptr);
|
||||
size_t* all_begin = dmlc::BeginPtr(row_indices_);
|
||||
size_t* begin = all_begin + (e.begin - all_begin);
|
||||
|
||||
Reference in New Issue
Block a user