Run training with empty DMatrix. (#4990)

This makes GPU Hist robust in distributed environment as some workers might not
be associated with any data in either training or evaluation.

* Disable rabit mock test for now: See #5012 .

* Disable dask-cudf test at prediction for now: See #5003

* Launch dask job for all workers despite they might not have any data.
* Check 0 rows in elementwise evaluation metrics.

   Using AUC and AUC-PR still throws an error.  See #4663 for a robust fix.

* Add tests for edge cases.
* Add `LaunchKernel` wrapper handling zero sized grid.
* Move some parts of allreducer into a cu file.
* Don't validate feature names when the booster is empty.

* Sync number of columns in DMatrix.

  As num_feature is required to be the same across all workers in data split
  mode.

* Filtering in dask interface now by default syncs all booster that's not
empty, instead of using rank 0.

* Fix Jenkins' GPU tests.

* Install dask-cuda from source in Jenkins' test.

  Now all tests are actually running.

* Restore GPU Hist tree synchronization test.

* Check UUID of running devices.

  The check is only performed on CUDA version >= 10.x, as 9.x doesn't have UUID field.

* Fix CMake policy and project variables.

  Use xgboost_SOURCE_DIR uniformly, add policy for CMake >= 3.13.

* Fix copying data to CPU

* Fix race condition in cpu predictor.

* Fix duplicated DMatrix construction.

* Don't download extra nccl in CI script.
This commit is contained in:
Jiaming Yuan
2019-11-06 16:13:13 +08:00
committed by GitHub
parent 807a244517
commit 7663de956c
44 changed files with 603 additions and 272 deletions

View File

@@ -0,0 +1,91 @@
/*!
* Copyright 2017-2019 XGBoost contributors
*
* \brief Utilities for CUDA.
*/
#ifdef XGBOOST_USE_NCCL
#include <nccl.h>
#endif // #ifdef XGBOOST_USE_NCCL
#include <sstream>
#include "device_helpers.cuh"
namespace dh {
#if __CUDACC_VER_MAJOR__ > 9
constexpr std::size_t kUuidLength =
sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
void GetCudaUUID(int world_size, int rank, int device_ord,
xgboost::common::Span<uint64_t, kUuidLength> uuid) {
cudaDeviceProp prob;
safe_cuda(cudaGetDeviceProperties(&prob, device_ord));
std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid));
}
std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
std::stringstream ss;
for (auto v : uuid) {
ss << std::hex << v;
}
return ss.str();
}
#endif // __CUDACC_VER_MAJOR__ > 9
void AllReducer::Init(int _device_ordinal) {
#ifdef XGBOOST_USE_NCCL
LOG(DEBUG) << "Running nccl init on: " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__;
device_ordinal = _device_ordinal;
int32_t const rank = rabit::GetRank();
#if __CUDACC_VER_MAJOR__ > 9
int32_t const world = rabit::GetWorldSize();
std::vector<uint64_t> uuids(world * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
GetCudaUUID(world, rank, device_ordinal, s_this_uuid);
// No allgather yet.
rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);;
size_t j = 0;
for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
converted[j] =
xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
j++;
}
auto iter = std::unique(converted.begin(), converted.end());
auto n_uniques = std::distance(converted.begin(), iter);
CHECK_EQ(n_uniques, world)
<< "Multiple processes within communication group running on same CUDA "
<< "device is not supported";
#endif // __CUDACC_VER_MAJOR__ > 9
id = GetUniqueId();
dh::safe_cuda(cudaSetDevice(device_ordinal));
dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rank));
safe_cuda(cudaStreamCreate(&stream));
initialised_ = true;
#endif // XGBOOST_USE_NCCL
}
AllReducer::~AllReducer() {
#ifdef XGBOOST_USE_NCCL
if (initialised_) {
dh::safe_cuda(cudaStreamDestroy(stream));
ncclCommDestroy(comm);
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
}
#endif // XGBOOST_USE_NCCL
}
} // namespace dh

View File

@@ -7,24 +7,25 @@
#include <thrust/device_malloc_allocator.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <xgboost/logging.h>
#include <omp.h>
#include <rabit/rabit.h>
#include <cub/cub.cuh>
#include <cub/util_allocator.cuh>
#include "xgboost/host_device_vector.h"
#include "xgboost/span.h"
#include "common.h"
#include <algorithm>
#include <omp.h>
#include <chrono>
#include <ctime>
#include <cub/cub.cuh>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
#include "xgboost/logging.h"
#include "xgboost/host_device_vector.h"
#include "xgboost/span.h"
#include "common.h"
#include "timer.h"
#ifdef XGBOOST_USE_NCCL
@@ -205,24 +206,53 @@ __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
}
template <typename L>
__global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
L lambda) {
L lambda) {
for (auto i : GridStrideRange(begin, end)) {
lambda(i, device_idx);
}
}
/* \brief A wrapper around kernel launching syntax, used to guard against empty input.
*
* - nvcc fails to deduce template argument when kernel is a template accepting __device__
* function as argument. Hence functions like `LaunchN` cannot use this wrapper.
*
* - With c++ initialization list `{}` syntax, you are forced to comply with the CUDA type
* spcification.
*/
class LaunchKernel {
size_t shmem_size_;
cudaStream_t stream_;
dim3 grids_;
dim3 blocks_;
public:
LaunchKernel(uint32_t _grids, uint32_t _blk, size_t _shmem=0, cudaStream_t _s=0) :
grids_{_grids, 1, 1}, blocks_{_blk, 1, 1}, shmem_size_{_shmem}, stream_{_s} {}
LaunchKernel(dim3 _grids, dim3 _blk, size_t _shmem=0, cudaStream_t _s=0) :
grids_{_grids}, blocks_{_blk}, shmem_size_{_shmem}, stream_{_s} {}
template <typename K, typename... Args>
void operator()(K kernel, Args... args) {
if (XGBOOST_EXPECT(grids_.x * grids_.y * grids_.z == 0, false)) {
LOG(DEBUG) << "Skipping empty CUDA kernel.";
return;
}
kernel<<<grids_, blocks_, shmem_size_, stream_>>>(args...); // NOLINT
}
};
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
if (n == 0) {
return;
}
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE =
static_cast<int>(xgboost::common::DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
n, lambda);
LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>( // NOLINT
static_cast<size_t>(0), n, lambda);
}
// Default stream version
@@ -301,6 +331,16 @@ inline detail::MemoryLogger &GlobalMemoryLogger() {
return memory_logger;
}
// dh::DebugSyncDevice(__FILE__, __LINE__);
inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
if (file != "" && line != -1) {
auto rank = rabit::GetRank();
LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line;
}
safe_cuda(cudaDeviceSynchronize());
safe_cuda(cudaGetLastError());
}
namespace detail{
/**
* \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
@@ -763,7 +803,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
BLOCK_THREADS, segments, num_segments, count);
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, OffsetT>
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, // NOLINT
segments + 1, f, num_segments);
}
@@ -963,7 +1003,6 @@ class SaveCudaContext {
* streams. Must be initialised before use. If XGBoost is compiled without NCCL
* this is a dummy class that will error if used with more than one GPU.
*/
class AllReducer {
bool initialised_;
size_t allreduce_bytes_; // Keep statistics of the number of bytes communicated
@@ -986,31 +1025,9 @@ class AllReducer {
*
* \param device_ordinal The device ordinal.
*/
void Init(int _device_ordinal);
void Init(int _device_ordinal) {
#ifdef XGBOOST_USE_NCCL
/** \brief this >monitor . init. */
device_ordinal = _device_ordinal;
id = GetUniqueId();
dh::safe_cuda(cudaSetDevice(device_ordinal));
dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank()));
safe_cuda(cudaStreamCreate(&stream));
initialised_ = true;
#endif
}
~AllReducer() {
#ifdef XGBOOST_USE_NCCL
if (initialised_) {
dh::safe_cuda(cudaStreamDestroy(stream));
ncclCommDestroy(comm);
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
}
#endif
}
~AllReducer();
/**
* \brief Allreduce. Use in exactly the same way as NCCL but without needing

View File

@@ -293,6 +293,7 @@ void DenseCuts::Build(DMatrix* p_fmat, uint32_t max_num_bins) {
void DenseCuts::Init
(std::vector<WXQSketch>* in_sketchs, uint32_t max_num_bins) {
monitor_.Start(__func__);
std::vector<WXQSketch>& sketchs = *in_sketchs;
constexpr int kFactor = 8;
// gather the histogram data
@@ -332,6 +333,7 @@ void DenseCuts::Init
CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
p_cuts_->cut_ptrs_.push_back(cut_size);
}
monitor_.Stop(__func__);
}
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {

View File

@@ -252,8 +252,10 @@ class GPUSketcher {
});
} else if (n_cuts_cur_[icol] > 0) {
// if more elements than cuts: use binary search on cumulative weights
int block = 256;
FindCutsK<<<common::DivRoundUp(n_cuts_cur_[icol], block), block>>>(
uint32_t constexpr kBlockThreads = 256;
uint32_t const kGrids = common::DivRoundUp(n_cuts_cur_[icol], kBlockThreads);
dh::LaunchKernel {kGrids, kBlockThreads} (
FindCutsK,
cuts_d_.data().get() + icol * n_cuts_,
fvalues_cur_.data().get(),
weights2_.data().get(),
@@ -403,7 +405,8 @@ class GPUSketcher {
// NOTE: This will typically support ~ 4M features - 64K*64
dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
common::DivRoundUp(num_cols_, block3.y), 1);
UnpackFeaturesK<<<grid3, block3>>>(
dh::LaunchKernel {grid3, block3} (
UnpackFeaturesK,
fvalues_.data().get(),
has_weights_ ? feature_weights_.data().get() : nullptr,
row_ptrs_.data().get() + batch_row_begin,

View File

@@ -13,6 +13,20 @@
namespace xgboost {
namespace common {
void Monitor::Start(std::string const &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
statistics_map[name].timer.Start();
}
}
void Monitor::Stop(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
}
}
std::vector<Monitor::StatMap> Monitor::CollectFromOtherRanks() const {
// Since other nodes might have started timers that this one haven't, so
// we can't simply call all reduce.

38
src/common/timer.cu Normal file
View File

@@ -0,0 +1,38 @@
/*!
* Copyright by Contributors 2019
*/
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)
#include <string>
#include "xgboost/logging.h"
#include "device_helpers.cuh"
#include "timer.h"
namespace xgboost {
namespace common {
void Monitor::StartCuda(const std::string& name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
stats.nvtx_id = nvtxRangeStartA(name.c_str());
#endif // defined(XGBOOST_USE_NVTX)
}
}
void Monitor::StopCuda(const std::string& name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX)
}
}
} // namespace common
} // namespace xgboost

View File

@@ -10,10 +10,6 @@
#include <utility>
#include <vector>
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
namespace xgboost {
namespace common {
@@ -84,37 +80,10 @@ struct Monitor {
void Print() const;
void Init(std::string label) { this->label = label; }
void Start(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
statistics_map[name].timer.Start();
}
}
void Stop(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
}
}
void StartCuda(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
stats.nvtx_id = nvtxRangeStartA(name.c_str());
#endif // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
}
}
void StopCuda(const std::string &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
}
}
void Start(const std::string &name);
void Stop(const std::string &name);
void StartCuda(const std::string &name);
void StopCuda(const std::string &name);
};
} // namespace common
} // namespace xgboost

View File

@@ -133,9 +133,12 @@ class Transform {
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
const int GRID_SIZE =
const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
if (kGrids == 0) {
return;
}
detail::LaunchCUDAKernel<<<kGrids, kBlockThreads>>>( // NOLINT
_func, shard_range, UnpackHDVOnDevice(_vectors)...);
}
#else