Run training with empty DMatrix. (#4990)

This makes GPU Hist robust in a distributed environment, as some workers might not be associated with any data in either training or evaluation. * Disable rabit mock test for now: See #5012. * Disable dask-cudf test at prediction for now: See #5003. * Launch dask job for all workers even though they might not have any data. * Check for 0 rows in elementwise evaluation metrics. Using AUC and AUC-PR still throws an error. See #4663 for a robust fix. * Add tests for edge cases. * Add `LaunchKernel` wrapper handling zero-sized grid. * Move some parts of allreducer into a cu file. * Don't validate feature names when the booster is empty. * Sync number of columns in DMatrix, as num_feature is required to be the same across all workers in data split mode. * Filtering in the dask interface now by default syncs all non-empty boosters, instead of using rank 0. * Fix Jenkins' GPU tests. * Install dask-cuda from source in Jenkins' test. Now all tests are actually running. * Restore GPU Hist tree synchronization test. * Check UUID of running devices. The check is only performed on CUDA version >= 10.x, as 9.x doesn't have the UUID field. * Fix CMake policy and project variables. Use xgboost_SOURCE_DIR uniformly, add policy for CMake >= 3.13. * Fix copying data to CPU. * Fix race condition in CPU predictor. * Fix duplicated DMatrix construction. * Don't download extra NCCL in CI script.
2019-11-06 16:13:13 +08:00
parent 807a244517
commit 7663de956c
44 changed files with 603 additions and 272 deletions
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -320,6 +320,32 @@ void DMatrix::SaveToLocalFile(const std::string& fname) {
 DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
                         const std::string& cache_prefix) {
  if (cache_prefix.length() == 0) {
+    // FIXME(trivialfis): Currently distcol is broken so we here check for number of rows.
+    // If we bring back column split this check will break.
+    bool is_distributed { rabit::IsDistributed() };
+    if (is_distributed) {
+      auto world_size = rabit::GetWorldSize();
+      auto rank = rabit::GetRank();
+      std::vector<uint64_t> ncols(world_size, 0);
+      ncols[rank] = source->info.num_col_;
+      rabit::Allreduce<rabit::op::Sum>(ncols.data(), ncols.size());
+      auto max_cols = std::max_element(ncols.cbegin(), ncols.cend());
+      auto max_ind = std::distance(ncols.cbegin(), max_cols);
+      // FIXME(trivialfis): This is a hack, we should store a reference to global shape if possible.
+      if (source->info.num_col_ == 0 && source->info.num_row_ == 0) {
+        LOG(WARNING) << "DMatrix at rank: " << rank << " worker is empty.";
+        source->info.num_col_ = *max_cols;
+      }
+
+      // validate the number of columns across all workers.
+      for (size_t i = 0; i < ncols.size(); ++i) {
+        auto v = ncols[i];
+        CHECK(v == 0 || v == *max_cols)
+            << "DMatrix at rank: " << i << " worker "
+            << "has different number of columns than rank: " << max_ind << " worker. "
+            << "(" << v << " vs. " << *max_cols << ")";
+      }
+    }
    return new data::SimpleDMatrix(std::move(source));
  } else {
 #if DMLC_ENABLE_STD_THREAD
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -99,13 +99,13 @@ EllpackInfo::EllpackInfo(int device,
                         bool is_dense,
                         size_t row_stride,
                         const common::HistogramCuts& hmat,
-                         dh::BulkAllocator& ba)
+                         dh::BulkAllocator* ba)
    : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {

-  ba.Allocate(device,
-              &feature_segments, hmat.Ptrs().size(),
-              &gidx_fvalue_map, hmat.Values().size(),
-              &min_fvalue, hmat.MinValues().size());
+  ba->Allocate(device,
+               &feature_segments, hmat.Ptrs().size(),
+               &gidx_fvalue_map, hmat.Values().size(),
+               &min_fvalue, hmat.MinValues().size());
  dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
  dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
  dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
@@ -116,7 +116,7 @@ void EllpackPageImpl::InitInfo(int device,
                               bool is_dense,
                               size_t row_stride,
                               const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, ba_);
+  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
 }

 // Initialize the buffer to stored compressed features.
@@ -189,7 +189,8 @@ void EllpackPageImpl::CreateHistIndices(int device,
    const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                     common::DivRoundUp(row_stride, block3.y),
                     1);
-    CompressBinEllpackKernel<<<grid3, block3>>>(
+    dh::LaunchKernel {grid3, block3} (
+        CompressBinEllpackKernel,
        common::CompressedBufferWriter(num_symbols),
        gidx_buffer.data(),
        row_ptrs.data().get(),
--- a/src/data/ellpack_page.cuh
+++ b/src/data/ellpack_page.cuh
@@ -70,7 +70,7 @@ struct EllpackInfo {
                       bool is_dense,
                       size_t row_stride,
                       const common::HistogramCuts& hmat,
-                       dh::BulkAllocator& ba);
+                       dh::BulkAllocator* ba);
 };

 /** \brief Struct for accessing and manipulating an ellpack matrix on the
--- a/src/data/ellpack_page_source.cu
+++ b/src/data/ellpack_page_source.cu
@@ -85,7 +85,7 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
  monitor_.StopCuda("Quantiles");

  monitor_.StartCuda("CreateEllpackInfo");
-  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, hmat, ba_);
+  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, hmat, &ba_);
  monitor_.StopCuda("CreateEllpackInfo");

  monitor_.StartCuda("WriteEllpackPages");
--- a/src/data/simple_csr_source.cu
+++ b/src/data/simple_csr_source.cu
@@ -101,7 +101,7 @@ void CountValid(std::vector<Json> const& j_columns, uint32_t column_id,
                HostDeviceVector<size_t>* out_offset,
                dh::caching_device_vector<int32_t>* out_d_flag,
                uint32_t* out_n_rows) {
-  int32_t constexpr kThreads = 256;
+  uint32_t constexpr kThreads = 256;
  auto const& j_column = j_columns[column_id];
  auto const& column_obj = get<Object const>(j_column);
  Columnar<T> foreign_column = ArrayInterfaceHandler::ExtractArray<T>(column_obj);
@@ -123,8 +123,9 @@ void CountValid(std::vector<Json> const& j_columns, uint32_t column_id,

  common::Span<size_t> s_offsets = out_offset->DeviceSpan();

-  int32_t const kBlocks = common::DivRoundUp(n_rows, kThreads);
-  CountValidKernel<T><<<kBlocks, kThreads>>>(
+  uint32_t const kBlocks = common::DivRoundUp(n_rows, kThreads);
+  dh::LaunchKernel {kBlocks, kThreads} (
+      CountValidKernel<T>,
      foreign_column,
      has_missing, missing,
      out_d_flag->data().get(), s_offsets);
@@ -135,13 +136,15 @@ template <typename T>
 void CreateCSR(std::vector<Json> const& j_columns, uint32_t column_id, uint32_t n_rows,
               bool has_missing, float missing,
               dh::device_vector<size_t>* tmp_offset, common::Span<Entry> s_data) {
-  int32_t constexpr kThreads = 256;
+  uint32_t constexpr kThreads = 256;
  auto const& j_column = j_columns[column_id];
  auto const& column_obj = get<Object const>(j_column);
  Columnar<T> foreign_column = ArrayInterfaceHandler::ExtractArray<T>(column_obj);
-  int32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
-  CreateCSRKernel<T><<<kBlocks, kThreads>>>(foreign_column, column_id, has_missing, missing,
-                                            dh::ToSpan(*tmp_offset), s_data);
+  uint32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
+  dh::LaunchKernel {kBlocks, kThreads} (
+      CreateCSRKernel<T>,
+      foreign_column, column_id, has_missing, missing,
+      dh::ToSpan(*tmp_offset), s_data);
 }

 void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,