diff --git a/include/xgboost/base.h b/include/xgboost/base.h
index 259c7a891..117c2f7a0 100644
--- a/include/xgboost/base.h
+++ b/include/xgboost/base.h
@@ -171,19 +171,19 @@ class bst_gpair_internal {
 template<>
 inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetGrad() const {
-  return grad_ * 1e-5;
+  return grad_ * 1e-5f;
 }
 template<>
 inline XGBOOST_DEVICE float bst_gpair_internal<int64_t>::GetHess() const {
-  return hess_ * 1e-5;
+  return hess_ * 1e-5f;
 }
 template<>
 inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetGrad(float g) {
-  grad_ = std::round(g * 1e5);
+  grad_ = static_cast<int64_t>(std::round(g * 1e5));
 }
 template<>
 inline XGBOOST_DEVICE void bst_gpair_internal<int64_t>::SetHess(float h) {
-  hess_ = std::round(h * 1e5);
+  hess_ = static_cast<int64_t>(std::round(h * 1e5));
 }
 }
 }  // namespace detail
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 42596f344..0859e64ab 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -328,7 +328,7 @@ class DMatrix {
 // implementation of inline functions
 inline bst_uint RowSet::operator[](size_t i) const {
-  return rows_.size() == 0 ? i : rows_[i];
+  return rows_.size() == 0 ? static_cast<bst_uint>(i) : rows_[i];
 }
 
 inline size_t RowSet::size() const {
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index fb8b16ec1..e95b83e77 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -651,7 +651,7 @@ inline void ExtendPath(PathElement *unique_path, unsigned unique_depth,
   unique_path[unique_depth].feature_index = feature_index;
   unique_path[unique_depth].zero_fraction = zero_fraction;
   unique_path[unique_depth].one_fraction = one_fraction;
-  unique_path[unique_depth].pweight = (unique_depth == 0 ? 1 : 0);
+  unique_path[unique_depth].pweight = static_cast<bst_float>(unique_depth == 0 ? 1 : 0);
   for (int i = unique_depth-1; i >= 0; i--) {
     unique_path[i+1].pweight += one_fraction*unique_path[i].pweight*(i+1)
                                 / static_cast<bst_float>(unique_depth+1);
@@ -679,7 +679,7 @@ inline void UnwindPath(PathElement *unique_path, unsigned unique_depth, unsigned
     }
   }
-  for (int i = path_index; i < unique_depth; ++i) {
+  for (auto i = path_index; i < unique_depth; ++i) {
     unique_path[i].feature_index = unique_path[i+1].feature_index;
     unique_path[i].zero_fraction = unique_path[i+1].zero_fraction;
     unique_path[i].one_fraction = unique_path[i+1].one_fraction;
@@ -725,7 +725,7 @@ inline void RegTree::TreeShap(const RegTree::FVec& feat, bst_float *phi,
   // leaf node
   if (node.is_leaf()) {
-    for (int i = 1; i <= unique_depth; ++i) {
+    for (unsigned i = 1; i <= unique_depth; ++i) {
       const bst_float w = UnwoundPathSum(unique_path, unique_depth, i);
       const PathElement &el = unique_path[i];
       phi[el.feature_index] += w*(el.one_fraction-el.zero_fraction)*node.leaf_value();
@@ -775,7 +775,7 @@ inline void RegTree::CalculateContributions(const RegTree::FVec& feat, unsigned
   // find the expected value of the tree's predictions
   bst_float base_value = 0.0;
   bst_float total_cover = 0;
-  for (unsigned i = 0; i < (*this).param.num_nodes; ++i) {
+  for (int i = 0; i < (*this).param.num_nodes; ++i) {
     const auto node = (*this)[i];
     if (node.is_leaf()) {
       const auto cover = this->stat(i).sum_hess;
diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h
index c40d6c897..42f75b81f 100644
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -6,7 +6,7 @@
 #include
 #include
 #include
-#include "dmlc/logging.h"
+#include <dmlc/logging.h>
 
 namespace xgboost {
 namespace common {
@@ -28,8 +28,9 @@
 static const int padding = 4;  // Assign padding so we can read slightly off
                                 // the beginning of the array
 
 // The number of bits required to represent a given unsigned range
-static int SymbolBits(int num_symbols) {
-  return std::ceil(std::log2(num_symbols));
+static size_t SymbolBits(size_t num_symbols) {
+  auto bits = std::ceil(std::log2(num_symbols));
+  return std::max(static_cast<size_t>(bits), size_t(1));
 }
 }  // namespace detail
@@ -72,9 +73,9 @@ class CompressedBufferWriter {
   static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
     const int bits_per_byte = 8;
-    size_t compressed_size = std::ceil(
+    size_t compressed_size = static_cast<size_t>(std::ceil(
         static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
-        bits_per_byte);
+        bits_per_byte));
     return compressed_size + detail::padding;
   }
@@ -98,8 +99,8 @@ class CompressedBufferWriter {
   template <typename iter_t>
   void Write(compressed_byte_t *buffer, iter_t input_begin, iter_t input_end) {
     uint64_t tmp = 0;
-    int stored_bits = 0;
-    const int max_stored_bits = 64 - symbol_bits_;
+    size_t stored_bits = 0;
+    const size_t max_stored_bits = 64 - symbol_bits_;
     size_t buffer_position = detail::padding;
     const size_t num_symbols = input_end - input_begin;
     for (size_t i = 0; i < num_symbols; i++) {
@@ -108,7 +109,8 @@ class CompressedBufferWriter {
         // Eject only full bytes
         size_t tmp_bytes = stored_bits / 8;
         for (size_t j = 0; j < tmp_bytes; j++) {
-          buffer[buffer_position] = tmp >> (stored_bits - (j + 1) * 8);
+          buffer[buffer_position] = static_cast<compressed_byte_t>(
+              tmp >> (stored_bits - (j + 1) * 8));
           buffer_position++;
         }
         stored_bits -= tmp_bytes * 8;
@@ -121,13 +123,16 @@ class CompressedBufferWriter {
     }
 
     // Eject all bytes
-    size_t tmp_bytes = std::ceil(static_cast<float>(stored_bits) / 8);
-    for (size_t j = 0; j < tmp_bytes; j++) {
-      int shift_bits = stored_bits - (j + 1) * 8;
+    int tmp_bytes =
+        static_cast<int>(std::ceil(static_cast<float>(stored_bits) / 8));
+    for (int j = 0; j < tmp_bytes; j++) {
+      int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
       if (shift_bits >= 0) {
-        buffer[buffer_position] = tmp >> shift_bits;
+        buffer[buffer_position] =
+            static_cast<compressed_byte_t>(tmp >> shift_bits);
       } else {
-        buffer[buffer_position] = tmp << std::abs(shift_bits);
+        buffer[buffer_position] =
+            static_cast<compressed_byte_t>(tmp << std::abs(shift_bits));
       }
       buffer_position++;
     }
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 40cc62200..dce0e3be2 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -125,7 +125,7 @@ inline size_t available_memory(int device_idx) {
 * \param device_idx Zero-based index of the device.
 */
-inline int max_shared_memory(int device_idx) {
+inline size_t max_shared_memory(int device_idx) {
   cudaDeviceProp prop;
   dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
   return prop.sharedMemPerBlock;
@@ -241,8 +241,7 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
   }
 
   safe_cuda(cudaSetDevice(device_idx));
-  // TODO: Template on n so GRID_SIZE always fits into int.
-  const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
+  const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
   launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n, lambda);
 }
@@ -428,74 +427,66 @@ class bulk_allocator {
   const int align = 256;
 
-  template <typename SizeT>
-  size_t align_round_up(SizeT n) {
+  size_t align_round_up(size_t n) const {
     n = (n + align - 1) / align;
     return n * align;
   }
 
-  template <typename T, typename SizeT>
-  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
-    return align_round_up(first_size * sizeof(T));
+  template <typename T>
+  size_t get_size_bytes(dvec<T> *first_vec, size_t first_size) {
+    return align_round_up(first_size * sizeof(T));
   }
 
-  template <typename T, typename SizeT, typename... Args>
-  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes(first_vec, first_size) +
-           get_size_bytes(args...);
+  template <typename T, typename... Args>
+  size_t get_size_bytes(dvec<T> *first_vec, size_t first_size, Args... args) {
+    return get_size_bytes(first_vec, first_size) + get_size_bytes(args...);
   }
 
-  template <typename T, typename SizeT>
+  template <typename T>
   void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
-                     SizeT first_size) {
+                     size_t first_size) {
     first_vec->external_allocate(device_idx, static_cast<T *>(ptr), first_size);
   }
 
-  template <typename T, typename SizeT, typename... Args>
+  template <typename T, typename... Args>
   void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
-                     SizeT first_size, Args... args) {
-    first_vec->external_allocate(device_idx, static_cast<T *>(ptr),
-                                 first_size);
+                     size_t first_size, Args... args) {
+    allocate_dvec(device_idx, ptr, first_vec, first_size);
     ptr += align_round_up(first_size * sizeof(T));
     allocate_dvec(device_idx, ptr, args...);
   }
 
-  // template
   char *allocate_device(int device_idx, size_t bytes, memory_type t) {
     char *ptr;
-    if (t == memory_type::DEVICE) {
-      safe_cuda(cudaSetDevice(device_idx));
-      safe_cuda(cudaMalloc(&ptr, bytes));
-    } else {
-      safe_cuda(cudaMallocManaged(&ptr, bytes));
-    }
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaMalloc(&ptr, bytes));
     return ptr;
   }
 
-  template <typename T, typename SizeT>
-  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
+  template <typename T>
+  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size) {
     return 2 * align_round_up(first_size * sizeof(T));
   }
 
-  template <typename T, typename SizeT, typename... Args>
-  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes(first_vec, first_size) +
+  template <typename T, typename... Args>
+  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
+    return get_size_bytes(first_vec, first_size) +
           get_size_bytes(args...);
  }
 
-  template <typename T, typename SizeT>
+  template <typename T>
  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
-                     SizeT first_size) {
+                     size_t first_size) {
    first_vec->external_allocate(
        device_idx, static_cast<T *>(ptr),
        static_cast<T *>(ptr + align_round_up(first_size * sizeof(T))),
        first_size);
  }
 
-  template <typename T, typename SizeT, typename... Args>
+  template <typename T, typename... Args>
  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
-                     SizeT first_size, Args... args) {
args) { + allocate_dvec(device_idx, ptr, first_vec, first_size); ptr += (align_round_up(first_size * sizeof(T)) * 2); allocate_dvec(device_idx, ptr, args...); } @@ -544,14 +535,13 @@ struct CubMemory { // Thrust typedef char value_type; - CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {} + CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {} ~CubMemory() { Free(); } template - T* Pointer() - { - return static_cast(d_temp_storage); + T *Pointer() { + return static_cast(d_temp_storage); } void Free() { @@ -611,7 +601,7 @@ void print(const dvec &v, size_t max_items = 10) { template void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates, - int num_tiles, int tile_size, segments_t segments, + size_t num_tiles, int tile_size, segments_t segments, offset_t num_rows, offset_t num_elements) { dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) { offset_t diagonal = idx * tile_size; @@ -692,7 +682,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, const int BLOCK_THREADS = 256; const int ITEMS_PER_THREAD = 1; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; - int num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS); + auto num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS); + CHECK(num_tiles < std::numeric_limits::max()); temp_memory->LazyAllocate(sizeof(coordinate_t) * (num_tiles + 1)); coordinate_t *tmp_tile_coordinates = @@ -702,7 +693,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, BLOCK_THREADS, segments, num_segments, count); LbsKernel - <<>>(tmp_tile_coordinates, segments + 1, f, + <<>>(tmp_tile_coordinates, segments + 1, f, num_segments); } diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index ee64d9778..881f89129 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -26,7 +26,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { const int nthread = omp_get_max_threads(); - unsigned nstep = (info.num_col + nthread - 1) / nthread; + unsigned nstep = static_cast((info.num_col + nthread - 1) / nthread); unsigned ncol = static_cast(info.num_col); sketchs.resize(info.num_col); for (auto& s : sketchs) { @@ -79,7 +79,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { if (a.size > 1 && a.size <= 16) { /* specialized code categorial / ordinal data -- use midpoints */ for (size_t i = 1; i < a.size; ++i) { - bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0; + bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0f; if (i == 1 || cpt > cut.back()) { cut.push_back(cpt); } @@ -99,7 +99,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { bst_float last = cpt + fabs(cpt); cut.push_back(last); } - row_ptr.push_back(cut.size()); + row_ptr.push_back(static_cast(cut.size())); } } @@ -148,7 +148,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) { } #pragma omp parallel for num_threads(nthread) schedule(static) - for (bst_omp_uint idx = 0; idx < nbins; ++idx) { + for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) { for (int tid = 0; tid < nthread; ++tid) { hit_count[idx] += hit_count_tloc_[tid * nbins + idx]; } @@ -226,7 +226,7 @@ FindGroups_(const std::vector& feature_list, bool need_new_group = true; // randomly choose some of existing groups as candidates - std::vector search_groups; + std::vector search_groups; for (size_t gid = 0; gid < groups.size(); ++gid) { if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) { search_groups.push_back(gid); @@ 
@@ -434,7 +434,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
       }
     }
   }
-  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+  for (size_t i = nrows - rest; i < nrows; ++i) {
     const size_t rid = row_indices.begin[i];
     const size_t ibegin = gmat.row_ptr[rid];
     const size_t iend = gmat.row_ptr[rid + 1];
@@ -448,7 +448,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
   /* reduction */
   const uint32_t nbins = nbins_;
 #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
+  for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) {
     for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
       hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]);
     }
@@ -462,7 +462,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
                                   GHistRow hist) {
   const int K = 8;  // loop unrolling factor
   const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const uint32_t nblock = gmatb.GetNumBlock();
+  const size_t nblock = gmatb.GetNumBlock();
   const size_t nrows = row_indices.end - row_indices.begin;
   const size_t rest = nrows % K;
@@ -492,7 +492,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
       }
     }
   }
-  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+  for (size_t i = nrows - rest; i < nrows; ++i) {
     const size_t rid = row_indices.begin[i];
     const size_t ibegin = gmat.row_ptr[rid];
     const size_t iend = gmat.row_ptr[rid + 1];
@@ -511,7 +511,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
   const int K = 8;  // loop unrolling factor
   const uint32_t rest = nbins % K;
 #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
+  for (bst_omp_uint bin_id = 0; bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += K) {
     GHistEntry pb[K];
     GHistEntry sb[K];
     for (int k = 0; k < K; ++k) {
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 65f367aa8..8b04410d0 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -118,11 +118,11 @@ struct GHistIndexMatrix {
     return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
   }
   inline void GetFeatureCounts(size_t* counts) const {
-    const unsigned nfeature = cut->row_ptr.size() - 1;
+    auto nfeature = cut->row_ptr.size() - 1;
     for (unsigned fid = 0; fid < nfeature; ++fid) {
-      const unsigned ibegin = cut->row_ptr[fid];
-      const unsigned iend = cut->row_ptr[fid + 1];
-      for (unsigned i = ibegin; i < iend; ++i) {
+      auto ibegin = cut->row_ptr[fid];
+      auto iend = cut->row_ptr[fid + 1];
+      for (auto i = ibegin; i < iend; ++i) {
         counts[fid] += hit_count[i];
       }
     }
@@ -235,7 +235,7 @@ class HistCollection {
   std::vector<GHistEntry> data_;
 
   /*! \brief row_ptr_[nid] locates bin for historgram of node nid */
-  std::vector<unsigned> row_ptr_;
+  std::vector<size_t> row_ptr_;
 };
 
 /*!
diff --git a/src/common/quantile.h b/src/common/quantile.h
index 409279bd9..98f7f732b 100644
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -680,12 +680,12 @@ class QuantileSketchTemplate {
       nlevel = 1;
       while (true) {
         limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
-        size_t n = (1UL << nlevel);
+        size_t n = (1ULL << nlevel);
         if (n * limit_size >= maxn) break;
         ++nlevel;
       }
       // check invariant
-      size_t n = (1UL << nlevel);
+      size_t n = (1ULL << nlevel);
       CHECK(n * limit_size >= maxn) << "invalid init parameter";
       CHECK(nlevel <= limit_size * eps) << "invalid init parameter";
       // lazy reserve the space, if there is only one value, no need to allocate space
diff --git a/src/common/row_set.h b/src/common/row_set.h
index 56eead595..921f4fbe8 100644
--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -88,7 +88,7 @@ class RowSetCollection {
                          unsigned left_node_id, unsigned right_node_id) {
     const Elem e = elem_of_each_node_[node_id];
-    const unsigned nthread = row_split_tloc.size();
+    const bst_omp_uint nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
     CHECK(e.begin != nullptr);
     size_t* all_begin = dmlc::BeginPtr(row_indices_);
     size_t* begin = all_begin + (e.begin - all_begin);
diff --git a/src/data/sparse_page_source.cc b/src/data/sparse_page_source.cc
index 5d87d0502..751c02824 100644
--- a/src/data/sparse_page_source.cc
+++ b/src/data/sparse_page_source.cc
@@ -120,7 +120,7 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
   double tstart = dmlc::GetTime();
   // print every 4 sec.
   const double kStep = 4.0;
-  size_t tick_expected = kStep;
+  size_t tick_expected = static_cast<size_t>(kStep);
 
   while (src->Next()) {
     const dmlc::RowBlock<uint32_t>& batch = src->Value();
@@ -149,7 +149,7 @@ void SparsePageSource::Create(dmlc::Parser<uint32_t>* src,
         LOG(CONSOLE) << "Writing row.page to " << cache_info << " in "
                      << ((bytes_write >> 20UL) / tdiff) << " MB/s, "
                      << (bytes_write >> 20UL) << " written";
-        tick_expected += kStep;
+        tick_expected += static_cast<size_t>(kStep);
       }
     }
   }
diff --git a/src/learner.cc b/src/learner.cc
index 7a9a6cc2d..6979225e1 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -146,6 +146,12 @@ class LearnerImpl : public Learner {
       name_gbm_ = "gbtree";
     }
 
+  static void AssertGPUSupport() {
+#ifndef XGBOOST_USE_CUDA
+    LOG(FATAL) << "XGBoost version not compiled with GPU support.";
+#endif
+  }
+
   void ConfigureUpdaters() {
     if (tparam.tree_method == 0 || tparam.tree_method == 1 ||
         tparam.tree_method == 2) {
@@ -166,6 +172,7 @@ class LearnerImpl : public Learner {
                    << "grow_fast_histmaker.";
       cfg_["updater"] = "grow_fast_histmaker";
     } else if (tparam.tree_method == 4) {
+      this->AssertGPUSupport();
       if (cfg_.count("updater") == 0) {
         cfg_["updater"] = "grow_gpu,prune";
       }
@@ -173,6 +180,7 @@ class LearnerImpl : public Learner {
         cfg_["predictor"] = "gpu_predictor";
       }
     } else if (tparam.tree_method == 5) {
+      this->AssertGPUSupport();
       if (cfg_.count("updater") == 0) {
         cfg_["updater"] = "grow_gpu_hist";
       }
@@ -180,6 +188,7 @@ class LearnerImpl : public Learner {
         cfg_["predictor"] = "gpu_predictor";
       }
     } else if (tparam.tree_method == 6) {
+      this->AssertGPUSupport();
       if (cfg_.count("updater") == 0) {
         cfg_["updater"] = "grow_gpu_hist_experimental,prune";
       }
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index be63bd385..a6f93ec71 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -216,11 +216,11 @@ __device__ float GetLeafWeight(bst_uint ridx, const DevicePredictionNode* tree,
 template
 __global__ void PredictKernel(const DevicePredictionNode* d_nodes,
-                              float* d_out_predictions, int* d_tree_segments,
+                              float* d_out_predictions, size_t* d_tree_segments,
                               int* d_tree_group, size_t* d_row_ptr,
-                              SparseBatch::Entry* d_data, int tree_begin,
-                              int tree_end, int num_features, bst_uint num_rows,
-                              bool use_shared, int num_group) {
+                              SparseBatch::Entry* d_data, size_t tree_begin,
+                              size_t tree_end, size_t num_features,
+                              size_t num_rows, bool use_shared, int num_group) {
   extern __shared__ float smem[];
   bst_uint global_idx = blockDim.x * blockIdx.x + threadIdx.x;
   ElementLoader loader(use_shared, d_row_ptr, d_data, num_features, smem,
@@ -249,8 +249,8 @@ __global__ void PredictKernel(const DevicePredictionNode* d_nodes,
 class GPUPredictor : public xgboost::Predictor {
  private:
  void DevicePredictInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                             const gbm::GBTreeModel& model, int tree_begin,
-                             int tree_end) {
+                             const gbm::GBTreeModel& model, size_t tree_begin,
+                             size_t tree_end) {
    if (tree_end - tree_begin == 0) {
      return;
    }
@@ -267,17 +267,17 @@ class GPUPredictor : public xgboost::Predictor {
    dh::safe_cuda(cudaSetDevice(param.gpu_id));
    CHECK_EQ(model.param.size_leaf_vector, 0);
    // Copy decision trees to device
-    thrust::host_vector<int> h_tree_segments;
+    thrust::host_vector<size_t> h_tree_segments;
    h_tree_segments.reserve((tree_end - tree_end) + 1);
-    int sum = 0;
+    size_t sum = 0;
    h_tree_segments.push_back(sum);
-    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      sum += model.trees[tree_idx]->GetNodes().size();
      h_tree_segments.push_back(sum);
    }
 
    thrust::host_vector<DevicePredictionNode> h_nodes(h_tree_segments.back());
-    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
+    for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      auto& src_nodes = model.trees[tree_idx]->GetNodes();
      std::copy(src_nodes.begin(), src_nodes.end(),
                h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]);
@@ -299,11 +299,11 @@ class GPUPredictor : public xgboost::Predictor {
    }
 
    const int BLOCK_THREADS = 128;
-    const int GRID_SIZE =
-        dh::div_round_up(device_matrix->row_ptr.size() - 1, BLOCK_THREADS);
+    const int GRID_SIZE = static_cast<int>(
+        dh::div_round_up(device_matrix->row_ptr.size() - 1, BLOCK_THREADS));
 
-    int shared_memory_bytes =
-        sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS;
+    int shared_memory_bytes = static_cast<int>(
+        sizeof(float) * device_matrix->p_mat->info().num_col * BLOCK_THREADS);
    bool use_shared = true;
    if (shared_memory_bytes > dh::max_shared_memory(param.gpu_id)) {
      shared_memory_bytes = 0;
@@ -347,8 +347,7 @@ class GPUPredictor : public xgboost::Predictor {
               const gbm::GBTreeModel& model,
               std::vector<std::unique_ptr<TreeUpdater>>* updaters,
               int num_new_trees) override {
-    // dh::Timer t;
-    int old_ntree = model.trees.size() - num_new_trees;
+    auto old_ntree = model.trees.size() - num_new_trees;
    // update cache entry
    for (auto& kv : cache_) {
      PredictionCacheEntry& e = kv.second;
 
      if (e.predictions.size() == 0) {
        cpu_predictor->PredictBatch(dmat, &(e.predictions), model, 0,
-                                    model.trees.size());
+                                    static_cast<unsigned>(model.trees.size()));
      } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                 num_new_trees == 1 &&
                 updaters->back()->UpdatePredictionCache(e.data.get(),
@@ -383,11 +382,10 @@ class GPUPredictor : public xgboost::Predictor {
 
  void PredictContribution(DMatrix* p_fmat,
                           std::vector<bst_float>* out_contribs,
-                           const gbm::GBTreeModel& model,
-                           unsigned ntree_limit,
+                           const gbm::GBTreeModel& model, unsigned ntree_limit,
                            bool approximate) override {
-    cpu_predictor->PredictContribution(p_fmat, out_contribs, model,
-                                       ntree_limit, approximate);
+    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit,
+                                       approximate);
   }
 
   void Init(const std::vector<std::pair<std::string, std::string>>& cfg,
@@ -403,7 +401,7 @@ class GPUPredictor : public xgboost::Predictor {
   std::unordered_map<DMatrix*, std::unique_ptr<DeviceMatrix>> device_matrix_cache_;
   thrust::device_vector<DevicePredictionNode> nodes;
-  thrust::device_vector<int> tree_segments;
+  thrust::device_vector<size_t> tree_segments;
   thrust::device_vector<int> tree_group;
 };
 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
diff --git a/src/tree/param.h b/src/tree/param.h
index 20f0feee2..d5816bf1a 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -241,7 +241,7 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
 template <typename TrainingParams, typename T>
 XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
   if (sum_hess < p.min_child_weight)
-    return 0.0;
+    return T(0.0);
   if (p.max_delta_step == 0.0f) {
     if (p.reg_alpha == 0.0f) {
       return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
@@ -251,11 +251,11 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
     }
   } else {
     T w = CalcWeight(p, sum_grad, sum_hess);
-    T ret = sum_grad * w + 0.5 * (sum_hess + p.reg_lambda) * Sqr(w);
+    T ret = sum_grad * w + T(0.5) * (sum_hess + p.reg_lambda) * Sqr(w);
     if (p.reg_alpha == 0.0f) {
-      return -2.0 * ret;
+      return T(-2.0) * ret;
     } else {
-      return -2.0 * (ret + p.reg_alpha * std::abs(w));
+      return T(-2.0) * (ret + p.reg_alpha * std::abs(w));
     }
   }
 }
diff --git a/src/tree/updater_gpu.cu b/src/tree/updater_gpu.cu
index 4ef24450a..50a466539 100644
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -630,7 +630,8 @@ class GPUMaker : public TreeUpdater {
       throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
     }
     std::vector<float> fval;
-    std::vector<int> fId, offset;
+    std::vector<int> fId;
+    std::vector<size_t> offset;
     convertToCsc(dmat, &fval, &fId, &offset);
     allocateAllData(static_cast<int>(offset.size()));
     transferAndSortData(fval, fId, offset);
   }
 
   void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
-                    std::vector<int>* fId, std::vector<int>* offset) {
+                    std::vector<int>* fId, std::vector<size_t>* offset) {
     MetaInfo info = dmat->info();
-    nRows = info.num_row;
-    nCols = info.num_col;
+    CHECK(info.num_col < std::numeric_limits<int>::max());
+    CHECK(info.num_row < std::numeric_limits<int>::max());
+    nRows = static_cast<int>(info.num_row);
+    nCols = static_cast<int>(info.num_col);
     offset->reserve(nCols + 1);
     offset->push_back(0);
     fval->reserve(nCols * nRows);
@@ -667,12 +670,13 @@ class GPUMaker : public TreeUpdater {
         offset->push_back(fval->size());
       }
     }
-    nVals = fval->size();
+    CHECK(fval->size() < std::numeric_limits<int>::max());
+    nVals = static_cast<int>(fval->size());
   }
 
   void transferAndSortData(const std::vector<float>& fval,
                            const std::vector<int>& fId,
-                           const std::vector<int>& offset) {
+                           const std::vector<size_t>& offset) {
     vals.current_dvec() = fval;
     instIds.current_dvec() = fId;
     colOffsets = offset;
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 22857c9f3..bd7ca94e1 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -104,7 +104,7 @@ struct DeviceHist {
 template
 __global__ void find_split_kernel(
     const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
-    int n_features, int n_bins, DeviceNodeStats* d_nodes,
+    uint64_t n_features, int n_bins, DeviceNodeStats* d_nodes,
     int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
     GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
     bool colsample, int* d_feature_flags) {
@@ -293,7 +293,8 @@ class GPUHistMaker : public TreeUpdater {
     dh::Timer time1;
     // set member num_rows and n_devices for rest of GPUHistBuilder members
     info = &fmat.info();
-    num_rows = info->num_row;
+    CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
+    num_rows = static_cast<bst_uint>(info->num_row);
     n_devices = dh::n_devices(param.n_gpus, num_rows);
 
     if (!initialised) {
@@ -396,15 +397,15 @@ class GPUHistMaker : public TreeUpdater {
         fflush(stdout);
       }
 
-      int n_bins = hmat_.row_ptr.back();
-      int n_features = hmat_.row_ptr.size() - 1;
+      int n_bins = static_cast<int>(hmat_.row_ptr.back());
+      int n_features = static_cast<int>(hmat_.row_ptr.size() - 1);
 
       // deliniate data onto multiple gpus
       device_row_segments.push_back(0);
       device_element_segments.push_back(0);
       bst_uint offset = 0;
-      bst_uint shard_size =
-          std::ceil(static_cast<double>(num_rows) / n_devices);
+      bst_uint shard_size = static_cast<bst_uint>(
+          std::ceil(static_cast<double>(num_rows) / n_devices));
       for (int d_idx = 0; d_idx < n_devices; d_idx++) {
         int device_idx = dList[d_idx];
         offset += shard_size;
@@ -425,7 +426,7 @@ class GPUHistMaker : public TreeUpdater {
       // Construct feature map
       std::vector<int> h_gidx_feature_map(n_bins);
       for (int fidx = 0; fidx < n_features; fidx++) {
-        for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
+        for (auto i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
          h_gidx_feature_map[i] = fidx;
        }
      }
@@ -456,7 +457,7 @@ class GPUHistMaker : public TreeUpdater {
      gidx_feature_map.resize(n_devices);
      gidx_fvalue_map.resize(n_devices);
 
-     int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
+     int find_split_n_devices = static_cast<int>(std::pow(2, std::floor(std::log2(n_devices))));
      find_split_n_devices =
          std::min(n_nodes_level(param.max_depth), find_split_n_devices);
      int max_num_nodes_device =
@@ -707,7 +708,7 @@ class GPUHistMaker : public TreeUpdater {
      int nodes_offset_device = 0;
      find_split_kernel<<>>(
          hist_vec[d_idx].GetLevelPtr(depth), feature_segments[d_idx].data(),
-         depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(),
+         depth, info->num_col, hmat_.row_ptr.back(), nodes[d_idx].data(),
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
@@ -769,7 +770,7 @@ class GPUHistMaker : public TreeUpdater {
      DeviceNodeStats* d_nodes = nodes[d_idx].data();
      auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
      auto d_gidx = device_matrix[d_idx].gidx;
-     int n_columns = info->num_col;
+     auto n_columns = info->num_col;
      size_t begin = device_row_segments[d_idx];
      size_t end = device_row_segments[d_idx + 1];
diff --git a/src/tree/updater_gpu_hist_experimental.cu b/src/tree/updater_gpu_hist_experimental.cu
index 6193b16f4..6b80b6100 100644
--- a/src/tree/updater_gpu_hist_experimental.cu
+++ b/src/tree/updater_gpu_hist_experimental.cu
@@ -113,13 +113,11 @@ __device__ void EvaluateFeature(int fidx, const bst_gpair_integer* hist,
 }
 
 template
-__global__ void evaluate_split_kernel(const bst_gpair_integer* d_hist, int nidx,
-                                      int n_features, DeviceNodeStats nodes,
-                                      const int* d_feature_segments,
-                                      const float* d_fidx_min_map,
-                                      const float* d_gidx_fvalue_map,
-                                      GPUTrainingParam gpu_param,
-                                      DeviceSplitCandidate* d_split) {
+__global__ void evaluate_split_kernel(
+    const bst_gpair_integer* d_hist, int nidx, uint64_t n_features,
+    DeviceNodeStats nodes, const int* d_feature_segments,
+    const float* d_fidx_min_map, const float* d_gidx_fvalue_map,
+    GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split) {
   typedef cub::KeyValuePair ArgMaxT;
   typedef cub::BlockScan
@@ -190,24 +188,6 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
   return -1;
 }
 
-template
-__global__ void RadixSortSmall(bst_uint* d_ridx, int* d_position, bst_uint n) {
-  typedef cub::BlockRadixSort BlockRadixSort;
-  __shared__ typename BlockRadixSort::TempStorage temp_storage;
-
-  bool thread_active = threadIdx.x < n;
-  int thread_key[1];
-  bst_uint thread_value[1];
-  thread_key[0] = thread_active ? d_position[threadIdx.x] : INT_MAX;
-  thread_value[0] = thread_active ? d_ridx[threadIdx.x] : UINT_MAX;
-  BlockRadixSort(temp_storage).Sort(thread_key, thread_value);
-
-  if (thread_active) {
-    d_position[threadIdx.x] = thread_key[0];
-    d_ridx[threadIdx.x] = thread_value[0];
-  }
-}
-
 struct DeviceHistogram {
   dh::bulk_allocator ba;
   dh::dvec data;
@@ -269,7 +249,7 @@ struct DeviceShard {
         null_gidx_value(n_bins) {
     // Convert to ELLPACK matrix representation
     int max_elements_row = 0;
-    for (int i = row_begin; i < row_end; i++) {
+    for (auto i = row_begin; i < row_end; i++) {
       max_elements_row =
           (std::max)(max_elements_row,
                      static_cast<int>(gmat.row_ptr[i + 1] - gmat.row_ptr[i]));
@@ -277,9 +257,9 @@ struct DeviceShard {
     row_stride = max_elements_row;
     std::vector<int> ellpack_matrix(row_stride * n_rows, null_gidx_value);
 
-    for (int i = row_begin; i < row_end; i++) {
+    for (auto i = row_begin; i < row_end; i++) {
       int row_count = 0;
-      for (int j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
+      for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
         ellpack_matrix[i * row_stride + row_count] = gmat.index[j];
         row_count++;
       }
     }
@@ -394,13 +374,8 @@ struct DeviceShard {
                     int right_nidx) {
     auto n = segment.second - segment.first;
     int min_bits = 0;
-    int max_bits = std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1));
-    // const int SINGLE_TILE_SIZE = 1024;
-    // if (n < SINGLE_TILE_SIZE) {
-    //   RadixSortSmall
-    //       <<<1, SINGLE_TILE_SIZE>>>(ridx.current() + segment.first,
-    //                                 position.current() + segment.first, n);
-    //} else {
+    int max_bits = static_cast<int>(
+        std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1)));
 
     size_t temp_storage_bytes = 0;
     cub::DeviceRadixSort::SortPairs(
@@ -509,7 +484,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
                                    nidx_set.size());
     auto d_split = shard.temp_memory.Pointer<DeviceSplitCandidate>();
 
-    auto& streams = shard.GetStreams(nidx_set.size());
+    auto& streams = shard.GetStreams(static_cast<int>(nidx_set.size()));
 
     // Use streams to process nodes concurrently
     for (auto i = 0; i < nidx_set.size(); i++) {
@@ -518,7 +493,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
       const int BLOCK_THREADS = 256;
       evaluate_split_kernel
-          <<>>(
+          <<>>(
              shard.hist.node_map[nidx], nidx, info->num_col, node,
              shard.feature_segments.data(), shard.min_fvalue.data(),
              shard.gidx_fvalue_map.data(), GPUTrainingParam(param),
@@ -573,10 +548,11 @@ class GPUHistMakerExperimental : public TreeUpdater {
     __host__ __device__ int operator()(int x) const { return x == val; }
   };
 
-  __device__ void CountLeft(bst_uint* d_count, int val, int left_nidx) {
+  __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
     unsigned ballot = __ballot(val == left_nidx);
     if (threadIdx.x % 32 == 0) {
-      atomicAdd(d_count, __popc(ballot));
+      atomicAdd(reinterpret_cast<unsigned long long*>(d_count),    // NOLINT
+                static_cast<unsigned long long>(__popc(ballot)));  // NOLINT
     }
   }
 
@@ -601,9 +577,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
     for (auto& shard : shards) {
       monitor.Start("update position kernel");
-      shard.temp_memory.LazyAllocate(sizeof(bst_uint));
-      auto d_left_count = shard.temp_memory.Pointer<bst_uint>();
-      dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(bst_uint)));
+      shard.temp_memory.LazyAllocate(sizeof(int64_t));
+      auto d_left_count = shard.temp_memory.Pointer<int64_t>();
+      dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t)));
       dh::safe_cuda(cudaSetDevice(shard.device_idx));
       auto segment = shard.ridx_segments[nidx];
       CHECK_GT(segment.second - segment.first, 0);
@@ -639,8 +615,8 @@ class GPUHistMakerExperimental : public TreeUpdater {
         d_position[idx] = position;
       });
 
-      bst_uint left_count;
-      dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(bst_uint),
+      int64_t left_count;
+      dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(int64_t),
                                cudaMemcpyDeviceToHost));
       monitor.Stop("update position kernel");
@@ -722,7 +698,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
     this->InitRoot(gpair, p_tree);
     monitor.Stop("InitRoot");
 
-    unsigned timestamp = qexpand_->size();
+    auto timestamp = qexpand_->size();
     auto num_leaves = 1;
 
     while (!qexpand_->empty()) {
@@ -764,9 +740,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
     int nid;
     int depth;
     DeviceSplitCandidate split;
-    unsigned timestamp;
+    uint64_t timestamp;
     ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split,
-                unsigned timestamp)
+                uint64_t timestamp)
         : nid(nid), depth(depth), split(split), timestamp(timestamp) {}
     bool IsValid(const TrainParam& param, int num_leaves) const {
       if (split.loss_chg <= rt_eps) return false;
diff --git a/tests/cpp/common/test_compressed_iterator.cc b/tests/cpp/common/test_compressed_iterator.cc
index 8e57cea77..fe52a2be2 100644
--- a/tests/cpp/common/test_compressed_iterator.cc
+++ b/tests/cpp/common/test_compressed_iterator.cc
@@ -7,7 +7,7 @@ namespace common {
 TEST(CompressedIterator, Test) {
   ASSERT_TRUE(detail::SymbolBits(256) == 8);
   ASSERT_TRUE(detail::SymbolBits(150) == 8);
-  std::vector<int> test_cases = {3, 426, 21, 64, 256, 100000, INT32_MAX};
+  std::vector<int> test_cases = {1, 3, 426, 21, 64, 256, 100000, INT32_MAX};
   int num_elements = 1000;
   int repetitions = 1000;
   srand(9);
diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu
index 3c49a9993..56b98a1cc 100644
--- a/tests/cpp/common/test_device_helpers.cu
+++ b/tests/cpp/common/test_device_helpers.cu
@@ -12,7 +12,7 @@ void CreateTestData(xgboost::bst_uint num_rows, int max_row_size,
                     thrust::host_vector *rows) {
   row_ptr->resize(num_rows + 1);
   int sum = 0;
-  for (int i = 0; i <= num_rows; i++) {
+  for (xgboost::bst_uint i = 0; i <= num_rows; i++) {
     (*row_ptr)[i] = sum;
     sum += rand() % max_row_size;  // NOLINT
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 712fad986..3aa5f6f8a 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -16,7 +16,7 @@ TEST(gpu_predictor, Test) {
       std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor"));
 
   std::vector<std::unique_ptr<RegTree>> trees;
-  trees.push_back(std::unique_ptr<RegTree>());
+  trees.push_back(std::unique_ptr<RegTree>(new RegTree()));
   trees.back()->InitModel();
   (*trees.back())[0].set_leaf(1.5f);
   (*trees.back()).stat(0).sum_hess = 1.0f;
@@ -39,7 +39,6 @@ TEST(gpu_predictor, Test) {
     ASSERT_LT(std::abs(gpu_out_predictions[i] - cpu_out_predictions[i]),
               abs_tolerance);
   }
-
   // Test predict instance
   auto batch = dmat->RowIterator()->Value();
   for (int i = 0; i < batch.size; i++) {
diff --git a/tests/cpp/tree/test_gpu_hist_experimental.cu b/tests/cpp/tree/test_gpu_hist_experimental.cu
index fd12aabb7..46fd99d5f 100644
--- a/tests/cpp/tree/test_gpu_hist_experimental.cu
+++ b/tests/cpp/tree/test_gpu_hist_experimental.cu
@@ -16,7 +16,7 @@ TEST(gpu_hist_experimental, TestSparseShard) {
   int rows = 100;
   int columns = 80;
   int max_bins = 4;
-  auto dmat = CreateDMatrix(rows, columns, 0.9);
+  auto dmat = CreateDMatrix(rows, columns, 0.9f);
   common::HistCutMatrix hmat;
   common::GHistIndexMatrix gmat;
   hmat.Init(dmat.get(), max_bins);
@@ -33,7 +33,7 @@ TEST(gpu_hist_experimental, TestSparseShard) {
   for (int i = 0; i < rows; i++) {
     int row_offset = 0;
-    for (int j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
+    for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
       ASSERT_EQ(gidx[i * shard.row_stride + row_offset], gmat.index[j]);
       row_offset++;
     }
   }
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 5fc3ddcea..f1fea3273 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -61,7 +61,7 @@ if [ ${TASK} == "python_lightweight_test" ]; then
     conda install numpy scipy nose
     python -m pip install graphviz
     python -m nose tests/python || exit -1
-    python -m pip install flake8
+    python -m pip install flake8==3.4.1
     flake8 --ignore E501 python-package || exit -1
     flake8 --ignore E501 tests/python || exit -1
    exit 0
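
For reference, the include/xgboost/base.h hunk at the top of this diff touches a fixed-point gradient representation: values are written as round(x * 1e5) and read back as stored * 1e-5f, which is the integer-backed gpair form (bst_gpair_integer) that the GPU histogram code elsewhere in the diff accumulates into. A minimal, self-contained round-trip sketch, assuming the int64_t backing type reconstructed in that hunk (the helper names below are illustrative, not part of the patch):

#include <cmath>
#include <cstdint>
#include <iostream>

// Mirrors the fixed-point storage scheme: encode with round(x * 1e5),
// decode with stored * 1e-5f.
int64_t EncodeGrad(float g) { return static_cast<int64_t>(std::round(g * 1e5)); }
float DecodeGrad(int64_t v) { return v * 1e-5f; }

int main() {
  float grad = 0.123456f;
  int64_t stored = EncodeGrad(grad);  // 12346
  std::cout << stored << " -> " << DecodeGrad(stored) << "\n";  // 12346 -> 0.12346
  return 0;
}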