Improved gpu_hist_experimental algorithm (#2866)

- Implement colsampling, subsampling for gpu_hist_experimental

 - Optimised multi-GPU implementation for gpu_hist_experimental

 - Make nccl optional

 - Add Volta architecture flag

 - Optimise RegLossObj

 - Add timing utilities for debug verbose mode

 - Bump required cuda version to 8.0
This commit is contained in:
Rory Mitchell
2017-11-11 13:58:40 +13:00
committed by GitHub
parent 16c63f30d0
commit 40c6e2f0c8
14 changed files with 855 additions and 473 deletions

View File

@@ -15,7 +15,10 @@
#include <sstream>
#include <string>
#include <vector>
#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
#endif
// Uncomment to enable
#define TIMERS
@@ -44,6 +47,7 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
return code;
}
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)
inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
@@ -57,6 +61,7 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
return code;
}
#endif
template <typename T>
T *raw(thrust::device_vector<T> &v) { // NOLINT
@@ -137,6 +142,19 @@ inline int get_device_idx(int gpu_id) {
return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
}
inline void check_compute_capability() {
int n_devices = n_visible_devices();
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss;
oss << "CUDA Capability Major/Minor version number: " << prop.major << "."
<< prop.minor << " is insufficient. Need >=3.5";
int failed = prop.major < 3 || prop.major == 3 && prop.minor < 5;
if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
}
}
/*
* Range iterator
*/
@@ -241,37 +259,12 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
}
safe_cuda(cudaSetDevice(device_idx));
const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
const int GRID_SIZE =
static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
lambda);
}
/*
* Timers
*/
struct Timer {
typedef std::chrono::high_resolution_clock ClockT;
typedef std::chrono::high_resolution_clock::time_point TimePointT;
typedef std::chrono::high_resolution_clock::duration DurationT;
typedef std::chrono::duration<double> SecondsT;
TimePointT start;
DurationT elapsed;
Timer() { Reset(); }
void Reset() {
elapsed = DurationT::zero();
Start();
}
void Start() { start = ClockT::now(); }
void Stop() { elapsed += ClockT::now() - start; }
double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
void PrintElapsed(std::string label) {
printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
Reset();
}
};
/*
* Memory
*/
@@ -444,7 +437,7 @@ class bulk_allocator {
template <typename T>
void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
size_t first_size) {
size_t first_size) {
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
first_size);
}
@@ -470,8 +463,7 @@ class bulk_allocator {
template <typename T, typename... Args>
size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
return get_size_bytes<T>(first_vec, first_size) +
get_size_bytes(args...);
return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
}
template <typename T>
@@ -497,6 +489,7 @@ class bulk_allocator {
if (!(d_ptr[i] == nullptr)) {
safe_cuda(cudaSetDevice(_device_idx[i]));
safe_cuda(cudaFree(d_ptr[i]));
d_ptr[i] = nullptr;
}
}
}
@@ -642,7 +635,8 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
for (auto item : dh::block_stride_range(int(0), int(tile_num_rows + 1))) {
temp_storage.tile_segment_end_offsets[item] =
segment_end_offsets[min(tile_start_coord.x + item, num_segments - 1)];
segment_end_offsets[min(static_cast<size_t>(tile_start_coord.x + item),
static_cast<size_t>(num_segments - 1))];
}
__syncthreads();
@@ -693,8 +687,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
BLOCK_THREADS, segments, num_segments, count);
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
num_segments);
<<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
segments + 1, f, num_segments);
}
template <typename func_t, typename offset_t>
@@ -836,4 +830,125 @@ void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
});
}
/**
* \class AllReducer
*
* \brief All reducer class that manages its own communication group and
* streams. Must be initialised before use. If XGBoost is compiled without NCCL this is a dummy class that will error if used with more than one GPU.
*/
class AllReducer {
bool initialised;
#ifdef XGBOOST_USE_NCCL
std::vector<ncclComm_t> comms;
std::vector<cudaStream_t> streams;
std::vector<int> device_ordinals;
#endif
public:
AllReducer() : initialised(false) {}
/**
* \fn void Init(const std::vector<int> &device_ordinals)
*
* \brief Initialise with the desired device ordinals for this communication
* group.
*
* \param device_ordinals The device ordinals.
*/
void Init(const std::vector<int> &device_ordinals) {
#ifdef XGBOOST_USE_NCCL
this->device_ordinals = device_ordinals;
comms.resize(device_ordinals.size());
dh::safe_nccl(ncclCommInitAll(comms.data(),
static_cast<int>(device_ordinals.size()),
device_ordinals.data()));
streams.resize(device_ordinals.size());
for (size_t i = 0; i < device_ordinals.size(); i++) {
safe_cuda(cudaSetDevice(device_ordinals[i]));
safe_cuda(cudaStreamCreate(&streams[i]));
}
initialised = true;
#else
CHECK_EQ(device_ordinals.size(), 1) << "XGBoost must be compiled with NCCL to use more than one GPU.";
#endif
}
~AllReducer() {
#ifdef XGBOOST_USE_NCCL
if (initialised) {
for (auto &stream : streams) {
dh::safe_cuda(cudaStreamDestroy(stream));
}
for (auto &comm : comms) {
ncclCommDestroy(comm);
}
}
#endif
}
/**
* \fn void AllReduceSum(int communication_group_idx, const double *sendbuff,
* double *recvbuff, int count)
*
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
* streams or comms.
*
* \param communication_group_idx Zero-based index of the
* communication group. \param sendbuff The sendbuff. \param
* sendbuff The sendbuff. \param [in,out] recvbuff
* The recvbuff. \param count Number of.
*/
void AllReduceSum(int communication_group_idx, const double *sendbuff,
double *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised);
dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
comms[communication_group_idx],
streams[communication_group_idx]));
#endif
}
/**
* \fn void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, int64_t *recvbuff, int count)
*
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
*
* \param communication_group_idx Zero-based index of the communication group. \param
* sendbuff The sendbuff. \param sendbuff
* The sendbuff. \param [in,out] recvbuff The recvbuff.
* \param count Number of.
* \param sendbuff The sendbuff.
* \param [in,out] recvbuff If non-null, the recvbuff.
* \param count Number of.
*/
void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
int64_t *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
CHECK(initialised);
dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
comms[communication_group_idx],
streams[communication_group_idx]));
#endif
}
/**
* \fn void Synchronize()
*
* \brief Synchronizes the entire communication group.
*/
void Synchronize() {
#ifdef XGBOOST_USE_NCCL
for (int i = 0; i < device_ordinals.size(); i++) {
dh::safe_cuda(cudaSetDevice(device_ordinals[i]));
dh::safe_cuda(cudaStreamSynchronize(streams[i]));
}
#endif
}
};
} // namespace dh

77
src/common/timer.h Normal file
View File

@@ -0,0 +1,77 @@
/*!
* Copyright by Contributors 2017
*/
#pragma once
#include <chrono>
#include <iostream>
#include <map>
#include <string>
namespace xgboost {
namespace common {
struct Timer {
typedef std::chrono::high_resolution_clock ClockT;
typedef std::chrono::high_resolution_clock::time_point TimePointT;
typedef std::chrono::high_resolution_clock::duration DurationT;
typedef std::chrono::duration<double> SecondsT;
TimePointT start;
DurationT elapsed;
Timer() { Reset(); }
void Reset() {
elapsed = DurationT::zero();
Start();
}
void Start() { start = ClockT::now(); }
void Stop() { elapsed += ClockT::now() - start; }
double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
void PrintElapsed(std::string label) {
printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
Reset();
}
};
/**
* \struct Monitor
*
* \brief Timing utility used to measure total method execution time over the
* lifetime of the containing object.
*/
struct Monitor {
bool debug_verbose = false;
std::string label = "";
std::map<std::string, Timer> timer_map;
Timer self_timer;
Monitor() { self_timer.Start(); }
~Monitor() {
if (!debug_verbose) return;
std::cout << "========\n";
std::cout << "Monitor: " << label << "\n";
std::cout << "========\n";
for (auto &kv : timer_map) {
kv.second.PrintElapsed(kv.first);
}
self_timer.Stop();
self_timer.PrintElapsed(label + " Lifetime");
}
void Init(std::string label, bool debug_verbose) {
this->debug_verbose = debug_verbose;
this->label = label;
}
void Start(const std::string &name) { timer_map[name].Start(); }
void Stop(const std::string &name) {
if (debug_verbose) {
#ifdef __CUDACC__
#include "device_helpers.cuh"
dh::synchronize_all();
#endif
}
timer_map[name].Stop();
}
};
} // namespace common
} // namespace xgboost