Improved gpu_hist_experimental algorithm (#2866)
- Implement colsampling, subsampling for gpu_hist_experimental
- Optimised multi-GPU implementation for gpu_hist_experimental
- Make nccl optional
- Add Volta architecture flag
- Optimise RegLossObj
- Add timing utilities for debug verbose mode
- Bump required cuda version to 8.0
@@ -15,7 +15,10 @@
#include <sstream>
#include <string>
#include <vector>

#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
#endif

// Uncomment to enable
#define TIMERS
@@ -44,6 +47,7 @@ inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
  return code;
}

#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) throw_on_nccl_error((ans), __FILE__, __LINE__)

inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
@@ -57,6 +61,7 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,

  return code;
}
#endif

template <typename T>
T *raw(thrust::device_vector<T> &v) {  // NOLINT
@@ -137,6 +142,19 @@ inline int get_device_idx(int gpu_id) {
  return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
}

inline void check_compute_capability() {
  int n_devices = n_visible_devices();
  for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
    cudaDeviceProp prop;
    safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
    std::ostringstream oss;
    oss << "CUDA Capability Major/Minor version number: " << prop.major << "."
        << prop.minor << " is insufficient. Need >=3.5";
    int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
    if (failed) LOG(WARNING) << oss.str() << " for device: " << d_idx;
  }
}

/*
 * Range iterator
 */
@@ -241,37 +259,12 @@ inline void launch_n(int device_idx, size_t n, L lambda) {
  }

  safe_cuda(cudaSetDevice(device_idx));
-  const int GRID_SIZE = static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
+  const int GRID_SIZE =
+      static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
  launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(static_cast<size_t>(0), n,
                                                lambda);
}

/*
 * Timers
 */

struct Timer {
  typedef std::chrono::high_resolution_clock ClockT;
  typedef std::chrono::high_resolution_clock::time_point TimePointT;
  typedef std::chrono::high_resolution_clock::duration DurationT;
  typedef std::chrono::duration<double> SecondsT;

  TimePointT start;
  DurationT elapsed;
  Timer() { Reset(); }
  void Reset() {
    elapsed = DurationT::zero();
    Start();
  }
  void Start() { start = ClockT::now(); }
  void Stop() { elapsed += ClockT::now() - start; }
  double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
  void PrintElapsed(std::string label) {
    printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
    Reset();
  }
};

/*
 * Memory
 */
@@ -444,7 +437,7 @@ class bulk_allocator {

  template <typename T>
  void allocate_dvec(int device_idx, char *ptr, dvec<T> *first_vec,
                     size_t first_size) {
    first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
                                 first_size);
  }
@@ -470,8 +463,7 @@ class bulk_allocator {

  template <typename T, typename... Args>
  size_t get_size_bytes(dvec2<T> *first_vec, size_t first_size, Args... args) {
-    return get_size_bytes<T>(first_vec, first_size) +
-           get_size_bytes(args...);
+    return get_size_bytes<T>(first_vec, first_size) + get_size_bytes(args...);
  }

  template <typename T>
@@ -497,6 +489,7 @@ class bulk_allocator {
      if (!(d_ptr[i] == nullptr)) {
        safe_cuda(cudaSetDevice(_device_idx[i]));
        safe_cuda(cudaFree(d_ptr[i]));
        d_ptr[i] = nullptr;
      }
    }
  }
@@ -642,7 +635,8 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,

  for (auto item : dh::block_stride_range(int(0), int(tile_num_rows + 1))) {
    temp_storage.tile_segment_end_offsets[item] =
-        segment_end_offsets[min(tile_start_coord.x + item, num_segments - 1)];
+        segment_end_offsets[min(static_cast<size_t>(tile_start_coord.x + item),
+                                static_cast<size_t>(num_segments - 1))];
  }
  __syncthreads();

@@ -693,8 +687,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                        BLOCK_THREADS, segments, num_segments, count);

  LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
-      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
-                                               num_segments);
+      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
+                                               segments + 1, f, num_segments);
}

template <typename func_t, typename offset_t>
@@ -836,4 +830,125 @@ void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
  });
}

/**
 * \class AllReducer
 *
 * \brief All reducer class that manages its own communication group and
 * streams. Must be initialised before use. If XGBoost is compiled without
 * NCCL this is a dummy class that will error if used with more than one GPU.
 */

class AllReducer {
  bool initialised;
#ifdef XGBOOST_USE_NCCL
  std::vector<ncclComm_t> comms;
  std::vector<cudaStream_t> streams;
  std::vector<int> device_ordinals;
#endif

 public:
  AllReducer() : initialised(false) {}

  /**
   * \fn void Init(const std::vector<int> &device_ordinals)
   *
   * \brief Initialise with the desired device ordinals for this communication
   * group.
   *
   * \param device_ordinals The device ordinals.
   */

  void Init(const std::vector<int> &device_ordinals) {
#ifdef XGBOOST_USE_NCCL
    this->device_ordinals = device_ordinals;
    comms.resize(device_ordinals.size());
    dh::safe_nccl(ncclCommInitAll(comms.data(),
                                  static_cast<int>(device_ordinals.size()),
                                  device_ordinals.data()));
    streams.resize(device_ordinals.size());
    for (size_t i = 0; i < device_ordinals.size(); i++) {
      safe_cuda(cudaSetDevice(device_ordinals[i]));
      safe_cuda(cudaStreamCreate(&streams[i]));
    }
    initialised = true;
#else
    CHECK_EQ(device_ordinals.size(), 1)
        << "XGBoost must be compiled with NCCL to use more than one GPU.";
#endif
  }
  ~AllReducer() {
#ifdef XGBOOST_USE_NCCL
    if (initialised) {
      for (auto &stream : streams) {
        dh::safe_cuda(cudaStreamDestroy(stream));
      }
      for (auto &comm : comms) {
        ncclCommDestroy(comm);
      }
    }
#endif
  }

  /**
   * \fn void AllReduceSum(int communication_group_idx, const double *sendbuff,
   *     double *recvbuff, int count)
   *
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
   * streams or comms.
   *
   * \param communication_group_idx Zero-based index of the communication
   *     group.
   * \param sendbuff            The send buffer.
   * \param [in,out] recvbuff   The receive buffer.
   * \param count               Number of elements to reduce.
   */

  void AllReduceSum(int communication_group_idx, const double *sendbuff,
                    double *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
    CHECK(initialised);

    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
                                comms[communication_group_idx],
                                streams[communication_group_idx]));
#endif
  }

  /**
   * \fn void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
   *     int64_t *recvbuff, int count)
   *
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
   * streams or comms.
   *
   * \param communication_group_idx Zero-based index of the communication
   *     group.
   * \param sendbuff            The send buffer.
   * \param [in,out] recvbuff   The receive buffer.
   * \param count               Number of elements to reduce.
   */

  void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
                    int64_t *recvbuff, int count) {
#ifdef XGBOOST_USE_NCCL
    CHECK(initialised);

    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
    dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclInt64, ncclSum,
                                comms[communication_group_idx],
                                streams[communication_group_idx]));
#endif
  }

  /**
   * \fn void Synchronize()
   *
   * \brief Synchronizes the entire communication group.
   */
  void Synchronize() {
#ifdef XGBOOST_USE_NCCL
    for (size_t i = 0; i < device_ordinals.size(); i++) {
      dh::safe_cuda(cudaSetDevice(device_ordinals[i]));
      dh::safe_cuda(cudaStreamSynchronize(streams[i]));
    }
#endif
  }
};
} // namespace dh
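
A minimal usage sketch for the new AllReducer (not part of the diff): it assumes XGBOOST_USE_NCCL is defined, two visible GPUs, and per-device send/receive buffers already allocated in device memory. The helper name ReduceGradientSums and the buffer vectors are hypothetical.

// Hypothetical sketch; assumes this header and <vector> are included and
// that d_send[i]/d_recv[i] are device pointers resident on device ordinal i.
void ReduceGradientSums(const std::vector<double *> &d_send,
                        const std::vector<double *> &d_recv, int count) {
  dh::AllReducer reducer;
  reducer.Init({0, 1});  // one NCCL comm and one stream per device ordinal

  for (int i = 0; i < 2; ++i) {
    // Asynchronous sum all-reduce; comms and streams are managed internally.
    reducer.AllReduceSum(i, d_send[i], d_recv[i], count);
  }
  reducer.Synchronize();  // block until every stream in the group is done
}

Without XGBOOST_USE_NCCL the same code still compiles, but Init() fails its CHECK_EQ unless exactly one device ordinal is passed, and the reduce calls are no-ops.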
src/common/timer.h (new file, 77 lines)
@@ -0,0 +1,77 @@
/*!
 * Copyright by Contributors 2017
 */
#pragma once
#include <chrono>
#include <iostream>
#include <map>
#include <string>

#ifdef __CUDACC__
#include "device_helpers.cuh"  // for dh::synchronize_all() in Monitor::Stop()
#endif

namespace xgboost {
namespace common {
struct Timer {
  typedef std::chrono::high_resolution_clock ClockT;
  typedef std::chrono::high_resolution_clock::time_point TimePointT;
  typedef std::chrono::high_resolution_clock::duration DurationT;
  typedef std::chrono::duration<double> SecondsT;

  TimePointT start;
  DurationT elapsed;
  Timer() { Reset(); }
  void Reset() {
    elapsed = DurationT::zero();
    Start();
  }
  void Start() { start = ClockT::now(); }
  void Stop() { elapsed += ClockT::now() - start; }
  double ElapsedSeconds() const { return SecondsT(elapsed).count(); }
  void PrintElapsed(std::string label) {
    printf("%s:\t %fs\n", label.c_str(), SecondsT(elapsed).count());
    Reset();
  }
};

/**
 * \struct Monitor
 *
 * \brief Timing utility used to measure total method execution time over the
 * lifetime of the containing object.
 */

struct Monitor {
  bool debug_verbose = false;
  std::string label = "";
  std::map<std::string, Timer> timer_map;
  Timer self_timer;

  Monitor() { self_timer.Start(); }

  ~Monitor() {
    if (!debug_verbose) return;

    std::cout << "========\n";
    std::cout << "Monitor: " << label << "\n";
    std::cout << "========\n";
    for (auto &kv : timer_map) {
      kv.second.PrintElapsed(kv.first);
    }
    self_timer.Stop();
    self_timer.PrintElapsed(label + " Lifetime");
  }
  void Init(std::string label, bool debug_verbose) {
    this->debug_verbose = debug_verbose;
    this->label = label;
  }
  void Start(const std::string &name) { timer_map[name].Start(); }
  void Stop(const std::string &name) {
    if (debug_verbose) {
#ifdef __CUDACC__
      dh::synchronize_all();
#endif
    }
    timer_map[name].Stop();
  }
};
}  // namespace common
} // namespace xgboost
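
A minimal usage sketch for the new Monitor (not part of the diff): the function name UpdateOnce, the phase labels, and enabling debug_verbose are illustrative assumptions, and the include path is assumed to be relative to src/common.

#include "timer.h"

void UpdateOnce() {
  xgboost::common::Monitor monitor;
  monitor.Init("example_updater", /*debug_verbose=*/true);

  monitor.Start("BuildHistogram");
  // ... work to be timed ...
  monitor.Stop("BuildHistogram");

  monitor.Start("EvaluateSplits");
  // ... more work ...
  monitor.Stop("EvaluateSplits");
}  // ~Monitor() prints each label's elapsed time plus a "Lifetime" total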