Various bug fixes (#2825)

* Fatal error if GPU algorithm selected without GPU support compiled
* Resolve type conversion warnings
* Fix gpu unit test failure
* Fix compressed iterator edge case
* Fix python unit test failures due to flake8 update on pip
2017-10-25 14:45:01 +13:00
parent c71b62d48d
commit 13e7a2cff0
21 changed files with 163 additions and 180 deletions
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -241,7 +241,7 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
 template <typename TrainingParams, typename T>
 XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
  if (sum_hess < p.min_child_weight)
-    return 0.0;
+    return T(0.0);
  if (p.max_delta_step == 0.0f) {
    if (p.reg_alpha == 0.0f) {
      return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
@@ -251,11 +251,11 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
    }
  } else {
    T w = CalcWeight(p, sum_grad, sum_hess);
-    T ret = sum_grad * w + 0.5 * (sum_hess + p.reg_lambda) * Sqr(w);
+    T ret = sum_grad * w + T(0.5) * (sum_hess + p.reg_lambda) * Sqr(w);
    if (p.reg_alpha == 0.0f) {
-      return -2.0 * ret;
+      return T(-2.0) * ret;
    } else {
-      return -2.0 * (ret + p.reg_alpha * std::abs(w));
+      return T(-2.0) * (ret + p.reg_alpha * std::abs(w));
    }
  }
 }
--- a/src/tree/updater_gpu.cu
+++ b/src/tree/updater_gpu.cu
@@ -630,7 +630,8 @@ class GPUMaker : public TreeUpdater {
      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
    }
    std::vector<float> fval;
-    std::vector<int> fId, offset;
+    std::vector<int> fId;
+    std::vector<size_t> offset;
    convertToCsc(dmat, &fval, &fId, &offset);
    allocateAllData(static_cast<int>(offset.size()));
    transferAndSortData(fval, fId, offset);
@@ -638,10 +639,12 @@ class GPUMaker : public TreeUpdater {
  }

  void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
-                    std::vector<int>* fId, std::vector<int>* offset) {
+                    std::vector<int>* fId, std::vector<size_t>* offset) {
    MetaInfo info = dmat->info();
-    nRows = info.num_row;
-    nCols = info.num_col;
+    CHECK(info.num_col < std::numeric_limits<int>::max());
+    CHECK(info.num_row < std::numeric_limits<int>::max());
+    nRows = static_cast<int>(info.num_row);
+    nCols = static_cast<int>(info.num_col);
    offset->reserve(nCols + 1);
    offset->push_back(0);
    fval->reserve(nCols * nRows);
@@ -667,12 +670,13 @@ class GPUMaker : public TreeUpdater {
        offset->push_back(fval->size());
      }
    }
-    nVals = fval->size();
+    CHECK(fval->size() < std::numeric_limits<int>::max());
+    nVals = static_cast<int>(fval->size());
  }

  void transferAndSortData(const std::vector<float>& fval,
                           const std::vector<int>& fId,
-                           const std::vector<int>& offset) {
+                           const std::vector<size_t>& offset) {
    vals.current_dvec() = fval;
    instIds.current_dvec() = fId;
    colOffsets = offset;
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -104,7 +104,7 @@ struct DeviceHist {
 template <int BLOCK_THREADS>
 __global__ void find_split_kernel(
    const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
-    int n_features, int n_bins, DeviceNodeStats* d_nodes,
+    uint64_t n_features, int n_bins, DeviceNodeStats* d_nodes,
    int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
    GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
    bool colsample, int* d_feature_flags) {
@@ -293,7 +293,8 @@ class GPUHistMaker : public TreeUpdater {
    dh::Timer time1;
    // set member num_rows and n_devices for rest of GPUHistBuilder members
    info = &fmat.info();
-    num_rows = info->num_row;
+    CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
+    num_rows = static_cast<bst_uint>(info->num_row);
    n_devices = dh::n_devices(param.n_gpus, num_rows);

    if (!initialised) {
@@ -396,15 +397,15 @@ class GPUHistMaker : public TreeUpdater {
        fflush(stdout);
      }

-      int n_bins = hmat_.row_ptr.back();
-      int n_features = hmat_.row_ptr.size() - 1;
+      int n_bins = static_cast<int >(hmat_.row_ptr.back());
+      int n_features = static_cast<int >(hmat_.row_ptr.size() - 1);

      // deliniate data onto multiple gpus
      device_row_segments.push_back(0);
      device_element_segments.push_back(0);
      bst_uint offset = 0;
-      bst_uint shard_size =
-          std::ceil(static_cast<double>(num_rows) / n_devices);
+      bst_uint shard_size = static_cast<bst_uint>(
+          std::ceil(static_cast<double>(num_rows) / n_devices));
      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        offset += shard_size;
@@ -425,7 +426,7 @@ class GPUHistMaker : public TreeUpdater {
      // Construct feature map
      std::vector<int> h_gidx_feature_map(n_bins);
      for (int fidx = 0; fidx < n_features; fidx++) {
-        for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
+        for (auto i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
          h_gidx_feature_map[i] = fidx;
        }
      }
@@ -456,7 +457,7 @@ class GPUHistMaker : public TreeUpdater {
      gidx_feature_map.resize(n_devices);
      gidx_fvalue_map.resize(n_devices);

-      int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
+      int find_split_n_devices = static_cast<int >(std::pow(2, std::floor(std::log2(n_devices))));
      find_split_n_devices =
          std::min(n_nodes_level(param.max_depth), find_split_n_devices);
      int max_num_nodes_device =
@@ -707,7 +708,7 @@ class GPUHistMaker : public TreeUpdater {
      int nodes_offset_device = 0;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          hist_vec[d_idx].GetLevelPtr(depth), feature_segments[d_idx].data(),
-          depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(),
+          depth, info->num_col, hmat_.row_ptr.back(), nodes[d_idx].data(),
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
@@ -769,7 +770,7 @@ class GPUHistMaker : public TreeUpdater {
      DeviceNodeStats* d_nodes = nodes[d_idx].data();
      auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
      auto d_gidx = device_matrix[d_idx].gidx;
-      int n_columns = info->num_col;
+      auto n_columns = info->num_col;
      size_t begin = device_row_segments[d_idx];
      size_t end = device_row_segments[d_idx + 1];

--- a/src/tree/updater_gpu_hist_experimental.cu
+++ b/src/tree/updater_gpu_hist_experimental.cu
@@ -113,13 +113,11 @@ __device__ void EvaluateFeature(int fidx, const bst_gpair_integer* hist,
 }

 template <int BLOCK_THREADS>
-__global__ void evaluate_split_kernel(const bst_gpair_integer* d_hist, int nidx,
-                                      int n_features, DeviceNodeStats nodes,
-                                      const int* d_feature_segments,
-                                      const float* d_fidx_min_map,
-                                      const float* d_gidx_fvalue_map,
-                                      GPUTrainingParam gpu_param,
-                                      DeviceSplitCandidate* d_split) {
+__global__ void evaluate_split_kernel(
+    const bst_gpair_integer* d_hist, int nidx, uint64_t n_features,
+    DeviceNodeStats nodes, const int* d_feature_segments,
+    const float* d_fidx_min_map, const float* d_gidx_fvalue_map,
+    GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split) {
  typedef cub::KeyValuePair<int, float> ArgMaxT;
  typedef cub::BlockScan<bst_gpair_integer, BLOCK_THREADS,
                         cub::BLOCK_SCAN_WARP_SCANS>
@@ -190,24 +188,6 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
  return -1;
 }

-template <int BLOCK_THREADS>
-__global__ void RadixSortSmall(bst_uint* d_ridx, int* d_position, bst_uint n) {
-  typedef cub::BlockRadixSort<int, BLOCK_THREADS, 1, bst_uint> BlockRadixSort;
-  __shared__ typename BlockRadixSort::TempStorage temp_storage;
-
-  bool thread_active = threadIdx.x < n;
-  int thread_key[1];
-  bst_uint thread_value[1];
-  thread_key[0] = thread_active ? d_position[threadIdx.x] : INT_MAX;
-  thread_value[0] = thread_active ? d_ridx[threadIdx.x] : UINT_MAX;
-  BlockRadixSort(temp_storage).Sort(thread_key, thread_value);
-
-  if (thread_active) {
-    d_position[threadIdx.x] = thread_key[0];
-    d_ridx[threadIdx.x] = thread_value[0];
-  }
-}
-
 struct DeviceHistogram {
  dh::bulk_allocator<dh::memory_type::DEVICE> ba;
  dh::dvec<bst_gpair_integer> data;
@@ -269,7 +249,7 @@ struct DeviceShard {
        null_gidx_value(n_bins) {
    // Convert to ELLPACK matrix representation
    int max_elements_row = 0;
-    for (int i = row_begin; i < row_end; i++) {
+    for (auto i = row_begin; i < row_end; i++) {
      max_elements_row =
          (std::max)(max_elements_row,
                     static_cast<int>(gmat.row_ptr[i + 1] - gmat.row_ptr[i]));
@@ -277,9 +257,9 @@ struct DeviceShard {
    row_stride = max_elements_row;
    std::vector<int> ellpack_matrix(row_stride * n_rows, null_gidx_value);

-    for (int i = row_begin; i < row_end; i++) {
+    for (auto i = row_begin; i < row_end; i++) {
      int row_count = 0;
-      for (int j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
+      for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
        ellpack_matrix[i * row_stride + row_count] = gmat.index[j];
        row_count++;
      }
@@ -394,13 +374,8 @@ struct DeviceShard {
                    int right_nidx) {
    auto n = segment.second - segment.first;
    int min_bits = 0;
-    int max_bits = std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1));
-    // const int SINGLE_TILE_SIZE = 1024;
-    // if (n < SINGLE_TILE_SIZE) {
-    //  RadixSortSmall<SINGLE_TILE_SIZE>
-    //      <<<1, SINGLE_TILE_SIZE>>>(ridx.current() + segment.first,
-    //                                position.current() + segment.first, n);
-    //} else {
+    int max_bits = static_cast<int>(
+        std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1)));

    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(
@@ -509,7 +484,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
                                   nidx_set.size());
    auto d_split = shard.temp_memory.Pointer<DeviceSplitCandidate>();

-    auto& streams = shard.GetStreams(nidx_set.size());
+    auto& streams = shard.GetStreams(static_cast<int>(nidx_set.size()));

    // Use streams to process nodes concurrently
    for (auto i = 0; i < nidx_set.size(); i++) {
@@ -518,7 +493,7 @@ class GPUHistMakerExperimental : public TreeUpdater {

      const int BLOCK_THREADS = 256;
      evaluate_split_kernel<BLOCK_THREADS>
-          <<<columns, BLOCK_THREADS, 0, streams[i]>>>(
+          <<<uint32_t(columns), BLOCK_THREADS, 0, streams[i]>>>(
              shard.hist.node_map[nidx], nidx, info->num_col, node,
              shard.feature_segments.data(), shard.min_fvalue.data(),
              shard.gidx_fvalue_map.data(), GPUTrainingParam(param),
@@ -573,10 +548,11 @@ class GPUHistMakerExperimental : public TreeUpdater {
    __host__ __device__ int operator()(int x) const { return x == val; }
  };

-  __device__ void CountLeft(bst_uint* d_count, int val, int left_nidx) {
+  __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
    unsigned ballot = __ballot(val == left_nidx);
    if (threadIdx.x % 32 == 0) {
-      atomicAdd(d_count, __popc(ballot));
+      atomicAdd(reinterpret_cast<unsigned long long*>(d_count), // NOLINT
+                static_cast<unsigned long long>(__popc(ballot))); // NOLINT
    }
  }

@@ -601,9 +577,9 @@ class GPUHistMakerExperimental : public TreeUpdater {

    for (auto& shard : shards) {
      monitor.Start("update position kernel");
-      shard.temp_memory.LazyAllocate(sizeof(bst_uint));
-      auto d_left_count = shard.temp_memory.Pointer<bst_uint>();
-      dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(bst_uint)));
+      shard.temp_memory.LazyAllocate(sizeof(int64_t));
+      auto d_left_count = shard.temp_memory.Pointer<int64_t>();
+      dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t)));
      dh::safe_cuda(cudaSetDevice(shard.device_idx));
      auto segment = shard.ridx_segments[nidx];
      CHECK_GT(segment.second - segment.first, 0);
@@ -639,8 +615,8 @@ class GPUHistMakerExperimental : public TreeUpdater {
            d_position[idx] = position;
          });

-      bst_uint left_count;
-      dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(bst_uint),
+      int64_t left_count;
+      dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(int64_t),
                               cudaMemcpyDeviceToHost));
      monitor.Stop("update position kernel");

@@ -722,7 +698,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
    this->InitRoot(gpair, p_tree);
    monitor.Stop("InitRoot");

-    unsigned timestamp = qexpand_->size();
+    auto timestamp = qexpand_->size();
    auto num_leaves = 1;

    while (!qexpand_->empty()) {
@@ -764,9 +740,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
    int nid;
    int depth;
    DeviceSplitCandidate split;
-    unsigned timestamp;
+    uint64_t timestamp;
    ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split,
-                unsigned timestamp)
+                uint64_t timestamp)
        : nid(nid), depth(depth), split(split), timestamp(timestamp) {}
    bool IsValid(const TrainParam& param, int num_leaves) const {
      if (split.loss_chg <= rt_eps) return false;