Various bug fixes (#2825)

* Fatal error if GPU algorithm selected without GPU support compiled
* Resolve type conversion warnings
* Fix GPU unit test failure
* Fix compressed iterator edge case
* Fix Python unit test failures due to flake8 update on pip
2017-10-25 14:45:01 +13:00
parent c71b62d48d
commit 13e7a2cff0
21 changed files with 163 additions and 180 deletions
--- a/src/common/compressed_iterator.h
+++ b/src/common/compressed_iterator.h
@@ -6,7 +6,7 @@
 #include <xgboost/base.h>
 #include <cmath>
 #include <cstddef>
-#include "dmlc/logging.h"
+#include <algorithm>

 namespace xgboost {
 namespace common {
@@ -28,8 +28,9 @@ static const int padding = 4;  // Assign padding so we can read slightly off
                               // the beginning of the array

 // The number of bits required to represent a given unsigned range
-static int SymbolBits(int num_symbols) {
-  return std::ceil(std::log2(num_symbols));
+static size_t SymbolBits(size_t num_symbols) {
+  auto bits = std::ceil(std::log2(num_symbols));
+  return std::max(static_cast<size_t>(bits), size_t(1));
 }
 }  // namespace detail

@@ -72,9 +73,9 @@ class CompressedBufferWriter {

  static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
    const int bits_per_byte = 8;
-    size_t compressed_size = std::ceil(
+    size_t compressed_size = static_cast<size_t>(std::ceil(
        static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
-        bits_per_byte);
+        bits_per_byte));
    return compressed_size + detail::padding;
  }

@@ -98,8 +99,8 @@ class CompressedBufferWriter {
  template <typename iter_t>
  void Write(compressed_byte_t *buffer, iter_t input_begin, iter_t input_end) {
    uint64_t tmp = 0;
-    int stored_bits = 0;
-    const int max_stored_bits = 64 - symbol_bits_;
+    size_t stored_bits = 0;
+    const size_t max_stored_bits = 64 - symbol_bits_;
    size_t buffer_position = detail::padding;
    const size_t num_symbols = input_end - input_begin;
    for (size_t i = 0; i < num_symbols; i++) {
@@ -108,7 +109,8 @@ class CompressedBufferWriter {
        // Eject only full bytes
        size_t tmp_bytes = stored_bits / 8;
        for (size_t j = 0; j < tmp_bytes; j++) {
-          buffer[buffer_position] = tmp >> (stored_bits - (j + 1) * 8);
+          buffer[buffer_position] = static_cast<compressed_byte_t>(
+              tmp >> (stored_bits - (j + 1) * 8));
          buffer_position++;
        }
        stored_bits -= tmp_bytes * 8;
@@ -121,13 +123,16 @@ class CompressedBufferWriter {
    }

    // Eject all bytes
-    size_t tmp_bytes = std::ceil(static_cast<float>(stored_bits) / 8);
-    for (size_t j = 0; j < tmp_bytes; j++) {
-      int shift_bits = stored_bits - (j + 1) * 8;
+    int tmp_bytes =
+        static_cast<int>(std::ceil(static_cast<float>(stored_bits) / 8));
+    for (int j = 0; j < tmp_bytes; j++) {
+      int shift_bits = static_cast<int>(stored_bits) - (j + 1) * 8;
      if (shift_bits >= 0) {
-        buffer[buffer_position] = tmp >> shift_bits;
+        buffer[buffer_position] =
+            static_cast<compressed_byte_t>(tmp >> shift_bits);
      } else {
-        buffer[buffer_position] = tmp << std::abs(shift_bits);
+        buffer[buffer_position] =
+            static_cast<compressed_byte_t>(tmp << std::abs(shift_bits));
      }
      buffer_position++;
    }
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -125,7 +125,7 @@ inline size_t available_memory(int device_idx) {
 * \param device_idx  Zero-based index of the device.
 */

-inline int max_shared_memory(int device_idx) {
+inline size_t max_shared_memory(int device_idx) {
  cudaDeviceProp prop;
  dh::safe_cuda(cudaGetDeviceProperties(&prop, device_idx));
  return prop.sharedMemPerBlock;
@@ -241,8 +241,7 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
  }

  safe_cuda(cudaSetDevice(device_idx));
-  // TODO: Template on n so GRID_SIZE always fits into int.
-  const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
+  const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
                                                lambda);
 }
@@ -428,74 +427,66 @@ class bulk_allocator {

  const int align = 256;

-  template <typename SizeT>
-  size_t align_round_up(SizeT n) {
+  size_t align_round_up(size_t n) const {
    n = (n + align - 1) / align;
    return n * align;
  }

-  template <typename T, typename SizeT>
-  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
-    return align_round_up<SizeT>(first_size * sizeof(T));
+  template <typename T>
+  size_t get_size_bytes(dvec<T> *first_vec, size_t first_size) {
+    return align_round_up(first_size * sizeof(T));
  }

-  template <typename T, typename SizeT, typename... Args>
-  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes<T, SizeT>(first_vec, first_size) +
-           get_size_bytes(args...);
+  template <typename T, typename... Args>
+  size_t get_size_bytes(dvec<T> *first_vec, size_t first_size, Args... args) {
+    return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
  }

-  template <typename T, typename SizeT>
+  template <typename T>
  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
-                     SizeT first_size) {
+                            size_t first_size) {
    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
                                 first_size);
  }

-  template <typename T, typename SizeT, typename... Args>
+  template <typename T, typename... Args>
  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
-                     SizeT first_size, Args... args) {
-    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
-                                 first_size);
+                     size_t first_size, Args... args) {
+    allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
    ptr += align_round_up(first_size * sizeof(T));
    allocate_dvec(device_idx, ptr, args...);
  }

-  //    template <memory_type MemoryT>
  char *allocate_device(int device_idx, size_t bytes, memory_type t) {
    char *ptr;
-    if (t == memory_type::DEVICE) {
-      safe_cuda(cudaSetDevice(device_idx));
-      safe_cuda(cudaMalloc(&ptr, bytes));
-    } else {
-      safe_cuda(cudaMallocManaged(&ptr, bytes));
-    }
+    safe_cuda(cudaSetDevice(device_idx));
+    safe_cuda(cudaMalloc(&ptr, bytes));
    return ptr;
  }
-  template <typename T, typename SizeT>
-  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
+  template <typename T>
+  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size) {
    return 2 * align_round_up(first_size * sizeof(T));
  }

-  template <typename T, typename SizeT, typename... Args>
-  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
-    return get_size_bytes<T, SizeT>(first_vec, first_size) +
+  template <typename T, typename... Args>
+  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
+    return get_size_bytes<T>(first_vec, first_size) +
           get_size_bytes(args...);
  }

-  template <typename T, typename SizeT>
+  template <typename T>
  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
-                     SizeT first_size) {
+                     size_t first_size) {
    first_vec->external_allocate(
        device_idx, static_cast<void *>(ptr),
        static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
        first_size);
  }

-  template <typename T, typename SizeT, typename... Args>
+  template <typename T, typename... Args>
  void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
-                     SizeT first_size, Args... args) {
-    allocate_dvec<T, SizeT>(device_idx, ptr, first_vec, first_size);
+                     size_t first_size, Args... args) {
+    allocate_dvec<T>(device_idx, ptr, first_vec, first_size);
    ptr += (align_round_up(first_size * sizeof(T)) * 2);
    allocate_dvec(device_idx, ptr, args...);
  }
@@ -544,14 +535,13 @@ struct CubMemory {
  // Thrust
  typedef char value_type;

-  CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
+  CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}

  ~CubMemory() { Free(); }

  template <typename T>
-  T* Pointer()
-  {
-    return static_cast<T*>(d_temp_storage);
+  T *Pointer() {
+    return static_cast<T *>(d_temp_storage);
  }

  void Free() {
@@ -611,7 +601,7 @@ void print(const dvec<T> &v, size_t max_items = 10) {

 template <typename coordinate_t, typename segments_t, typename offset_t>
 void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
-                         int num_tiles, int tile_size, segments_t segments,
+                         size_t num_tiles, int tile_size, segments_t segments,
                         offset_t num_rows, offset_t num_elements) {
  dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
    offset_t diagonal = idx * tile_size;
@@ -692,7 +682,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
  const int BLOCK_THREADS = 256;
  const int ITEMS_PER_THREAD = 1;
  const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
-  int num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
+  auto num_tiles = dh::div_round_up(count + num_segments, BLOCK_THREADS);
+  CHECK(num_tiles < std::numeric_limits<unsigned int>::max());

  temp_memory->LazyAllocate(sizeof(coordinate_t) * (num_tiles + 1));
  coordinate_t *tmp_tile_coordinates =
@@ -702,7 +693,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                      BLOCK_THREADS, segments, num_segments, count);

  LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
-      <<<num_tiles, BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
+      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
                                     num_segments);
 }

--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -26,7 +26,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {

  const int nthread = omp_get_max_threads();

-  unsigned nstep = (info.num_col + nthread - 1) / nthread;
+  unsigned nstep = static_cast<unsigned>((info.num_col + nthread - 1) / nthread);
  unsigned ncol = static_cast<unsigned>(info.num_col);
  sketchs.resize(info.num_col);
  for (auto& s : sketchs) {
@@ -79,7 +79,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
    if (a.size > 1 && a.size <= 16) {
      /* specialized code categorial / ordinal data -- use midpoints */
      for (size_t i = 1; i < a.size; ++i) {
-        bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0;
+        bst_float cpt = (a.data[i].value + a.data[i - 1].value) / 2.0f;
        if (i == 1 || cpt > cut.back()) {
          cut.push_back(cpt);
        }
@@ -99,7 +99,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) {
      bst_float last = cpt + fabs(cpt);
      cut.push_back(last);
    }
-    row_ptr.push_back(cut.size());
+    row_ptr.push_back(static_cast<bst_uint>(cut.size()));
  }
 }

@@ -148,7 +148,7 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat) {
    }

    #pragma omp parallel for num_threads(nthread) schedule(static)
-    for (bst_omp_uint idx = 0; idx < nbins; ++idx) {
+    for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) {
      for (int tid = 0; tid < nthread; ++tid) {
        hit_count[idx] += hit_count_tloc_[tid * nbins + idx];
      }
@@ -226,7 +226,7 @@ FindGroups_(const std::vector<unsigned>& feature_list,
    bool need_new_group = true;

    // randomly choose some of existing groups as candidates
-    std::vector<unsigned> search_groups;
+    std::vector<size_t> search_groups;
    for (size_t gid = 0; gid < groups.size(); ++gid) {
      if (group_nnz[gid] + cur_fid_nnz <= nrow + max_conflict_cnt) {
        search_groups.push_back(gid);
@@ -434,7 +434,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
      }
    }
  }
-  for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+  for (size_t i = nrows - rest; i < nrows; ++i) {
    const size_t rid = row_indices.begin[i];
    const size_t ibegin = gmat.row_ptr[rid];
    const size_t iend = gmat.row_ptr[rid + 1];
@@ -448,7 +448,7 @@ void GHistBuilder::BuildHist(const std::vector<bst_gpair>& gpair,
  /* reduction */
  const uint32_t nbins = nbins_;
  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < nbins; ++bin_id) {
+  for (bst_omp_uint bin_id = 0; bin_id < bst_omp_uint(nbins); ++bin_id) {
    for (bst_omp_uint tid = 0; tid < nthread; ++tid) {
      hist.begin[bin_id].Add(data_[tid * nbins_ + bin_id]);
    }
@@ -462,7 +462,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
                                  GHistRow hist) {
  const int K = 8;  // loop unrolling factor
  const bst_omp_uint nthread = static_cast<bst_omp_uint>(this->nthread_);
-  const uint32_t nblock = gmatb.GetNumBlock();
+  const size_t nblock = gmatb.GetNumBlock();
  const size_t nrows = row_indices.end - row_indices.begin;
  const size_t rest = nrows % K;

@@ -492,7 +492,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<bst_gpair>& gpair,
        }
      }
    }
-    for (bst_omp_uint i = nrows - rest; i < nrows; ++i) {
+    for (size_t i = nrows - rest; i < nrows; ++i) {
      const size_t rid = row_indices.begin[i];
      const size_t ibegin = gmat.row_ptr[rid];
      const size_t iend = gmat.row_ptr[rid + 1];
@@ -511,7 +511,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
  const int K = 8;  // loop unrolling factor
  const uint32_t rest = nbins % K;
  #pragma omp parallel for num_threads(nthread) schedule(static)
-  for (bst_omp_uint bin_id = 0; bin_id < nbins - rest; bin_id += K) {
+  for (bst_omp_uint bin_id = 0; bin_id < static_cast<bst_omp_uint>(nbins - rest); bin_id += K) {
    GHistEntry pb[K];
    GHistEntry sb[K];
    for (int k = 0; k < K; ++k) {
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -118,11 +118,11 @@ struct GHistIndexMatrix {
    return GHistIndexRow(&index[0] + row_ptr[i], row_ptr[i + 1] - row_ptr[i]);
  }
  inline void GetFeatureCounts(size_t* counts) const {
-    const unsigned nfeature = cut->row_ptr.size() - 1;
+    auto nfeature = cut->row_ptr.size() - 1;
    for (unsigned fid = 0; fid < nfeature; ++fid) {
-      const unsigned ibegin = cut->row_ptr[fid];
-      const unsigned iend = cut->row_ptr[fid + 1];
-      for (unsigned i = ibegin; i < iend; ++i) {
+      auto ibegin = cut->row_ptr[fid];
+      auto iend = cut->row_ptr[fid + 1];
+      for (auto i = ibegin; i < iend; ++i) {
        counts[fid] += hit_count[i];
      }
    }
@@ -235,7 +235,7 @@ class HistCollection {
  std::vector<GHistEntry> data_;

  /*! \brief row_ptr_[nid] locates bin for historgram of node nid */
-  std::vector<uint32_t> row_ptr_;
+  std::vector<size_t> row_ptr_;
 };

 /*!
--- a/src/common/quantile.h
+++ b/src/common/quantile.h
@@ -680,12 +680,12 @@ class QuantileSketchTemplate {
    nlevel = 1;
    while (true) {
      limit_size = static_cast<size_t>(ceil(nlevel / eps)) + 1;
-      size_t n = (1UL << nlevel);
+      size_t n = (1ULL << nlevel);
      if (n * limit_size >= maxn) break;
      ++nlevel;
    }
    // check invariant
-    size_t n = (1UL << nlevel);
+    size_t n = (1ULL << nlevel);
    CHECK(n * limit_size >= maxn) << "invalid init parameter";
    CHECK(nlevel <= limit_size * eps) << "invalid init parameter";
    // lazy reserve the space, if there is only one value, no need to allocate space
--- a/src/common/row_set.h
+++ b/src/common/row_set.h
@@ -88,7 +88,7 @@ class RowSetCollection {
                       unsigned left_node_id,
                       unsigned right_node_id) {
    const Elem e = elem_of_each_node_[node_id];
-    const unsigned nthread = row_split_tloc.size();
+    const bst_omp_uint nthread = static_cast<bst_omp_uint>(row_split_tloc.size());
    CHECK(e.begin != nullptr);
    size_t* all_begin = dmlc::BeginPtr(row_indices_);
    size_t* begin = all_begin + (e.begin - all_begin);