Improved gpu_hist_experimental algorithm (#2866)

- Implement colsampling, subsampling for gpu_hist_experimental

 - Optimised multi-GPU implementation for gpu_hist_experimental

 - Make NCCL optional

 - Add Volta architecture flag

 - Optimise RegLossObj

 - Add timing utilities for debug verbose mode

 - Bump required CUDA version to 8.0
This commit is contained in:
Rory Mitchell
2017-11-11 13:58:40 +13:00
committed by GitHub
parent 16c63f30d0
commit 40c6e2f0c8
14 changed files with 855 additions and 473 deletions

View File

@@ -8,6 +8,7 @@
#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"
#include "../common/timer.h"
#include "param.h"
#include "updater_gpu_common.cuh"
@@ -17,7 +18,6 @@ namespace tree {
DMLC_REGISTRY_FILE_TAG(updater_gpu_hist);
typedef bst_gpair_integer gpair_sum_t;
static const ncclDataType_t nccl_sum_t = ncclInt64;
// Helper for explicit template specialisation
template <int N>
@@ -63,15 +63,7 @@ struct HistHelper {
__device__ void Add(bst_gpair gpair, int gidx, int nidx) const {
int hist_idx = nidx * n_bins + gidx;
auto dst_ptr =
reinterpret_cast<unsigned long long int*>(&d_hist[hist_idx]); // NOLINT
gpair_sum_t tmp(gpair.GetGrad(), gpair.GetHess());
auto src_ptr = reinterpret_cast<gpair_sum_t::value_t*>(&tmp);
atomicAdd(dst_ptr,
static_cast<unsigned long long int>(*src_ptr)); // NOLINT
atomicAdd(dst_ptr + 1,
static_cast<unsigned long long int>(*(src_ptr + 1))); // NOLINT
AtomicAddGpair(d_hist + hist_idx, gpair);
}
__device__ gpair_sum_t Get(int gidx, int nidx) const {
return d_hist[nidx * n_bins + gidx];
@@ -244,22 +236,7 @@ class GPUHistMaker : public TreeUpdater {
is_dense(false),
p_last_fmat_(nullptr),
prediction_cache_initialised(false) {}
~GPUHistMaker() {
if (initialised) {
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
ncclCommDestroy(comms[d_idx]);
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
dh::safe_cuda(cudaStreamDestroy(*(streams[d_idx])));
}
for (int num_d = 1; num_d <= n_devices;
++num_d) { // loop over number of devices used
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
ncclCommDestroy(find_split_comms[num_d - 1][d_idx]);
}
}
}
}
~GPUHistMaker() {}
void Init(
const std::vector<std::pair<std::string, std::string>>& args) override {
param.InitAllowUnknown(args);
@@ -290,7 +267,7 @@ class GPUHistMaker : public TreeUpdater {
void InitData(const std::vector<bst_gpair>& gpair, DMatrix& fmat, // NOLINT
const RegTree& tree) {
dh::Timer time1;
common::Timer time1;
// set member num_rows and n_devices for rest of GPUHistBuilder members
info = &fmat.info();
CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
@@ -298,6 +275,12 @@ class GPUHistMaker : public TreeUpdater {
n_devices = dh::n_devices(param.n_gpus, num_rows);
if (!initialised) {
// Check gradients are within acceptable size range
CheckGradientMax(gpair);
// Check compute capability is high enough
dh::check_compute_capability();
// reset static timers used across iterations
cpu_init_time = 0;
gpu_init_time = 0;
@@ -312,57 +295,10 @@ class GPUHistMaker : public TreeUpdater {
}
// initialize nccl
comms.resize(n_devices);
streams.resize(n_devices);
dh::safe_nccl(ncclCommInitAll(comms.data(), n_devices,
dList.data())); // initialize communicator
// (One communicator per
// process)
// printf("# NCCL: Using devices\n");
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
streams[d_idx] =
reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
dh::safe_cuda(cudaStreamCreate(streams[d_idx]));
int cudaDev;
int rank;
cudaDeviceProp prop;
dh::safe_nccl(ncclCommCuDevice(comms[d_idx], &cudaDev));
dh::safe_nccl(ncclCommUserRank(comms[d_idx], &rank));
dh::safe_cuda(cudaGetDeviceProperties(&prop, cudaDev));
// printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
// prop.pciBusID, prop.name);
// cudaDriverGetVersion(&driverVersion);
// cudaRuntimeGetVersion(&runtimeVersion);
std::ostringstream oss;
oss << "CUDA Capability Major/Minor version number: " << prop.major
<< "." << prop.minor << " is insufficient. Need >=3.5.";
int failed = prop.major < 3 || prop.major == 3 && prop.minor < 5;
CHECK(failed == 0) << oss.str();
}
// local find_split group of comms for each case of reduced number of
// GPUs to use
find_split_comms.resize(
n_devices,
std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but
// ok, and best to do
// here instead of
// repeatedly
for (int num_d = 1; num_d <= n_devices;
++num_d) { // loop over number of devices used
dh::safe_nccl(
ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
dList.data())); // initialize communicator
// (One communicator per
// process)
}
reducer.Init(dList);
is_dense = info->num_nonzero == info->num_col * info->num_row;
dh::Timer time0;
common::Timer time0;
hmat_.Init(&fmat, param.max_bin);
cpu_init_time += time0.ElapsedSeconds();
if (param.debug_verbose) { // Only done once for each training session
@@ -397,8 +333,8 @@ class GPUHistMaker : public TreeUpdater {
fflush(stdout);
}
int n_bins = static_cast<int >(hmat_.row_ptr.back());
int n_features = static_cast<int >(hmat_.row_ptr.size() - 1);
int n_bins = static_cast<int>(hmat_.row_ptr.back());
int n_features = static_cast<int>(hmat_.row_ptr.size() - 1);
// deliniate data onto multiple gpus
device_row_segments.push_back(0);
@@ -442,10 +378,7 @@ class GPUHistMaker : public TreeUpdater {
temp_memory.resize(n_devices);
hist_vec.resize(n_devices);
nodes.resize(n_devices);
nodes_temp.resize(n_devices);
nodes_child_temp.resize(n_devices);
left_child_smallest.resize(n_devices);
left_child_smallest_temp.resize(n_devices);
feature_flags.resize(n_devices);
fidx_min_map.resize(n_devices);
feature_segments.resize(n_devices);
@@ -457,12 +390,6 @@ class GPUHistMaker : public TreeUpdater {
gidx_feature_map.resize(n_devices);
gidx_fvalue_map.resize(n_devices);
int find_split_n_devices = static_cast<int >(std::pow(2, std::floor(std::log2(n_devices))));
find_split_n_devices =
std::min(n_nodes_level(param.max_depth), find_split_n_devices);
int max_num_nodes_device =
n_nodes_level(param.max_depth) / find_split_n_devices;
// num_rows_segment: for sharding rows onto gpus for splitting data
// num_elements_segment: for sharding rows (of elements) onto gpus for
// splitting data
@@ -476,26 +403,31 @@ class GPUHistMaker : public TreeUpdater {
device_row_segments[d_idx + 1] - device_row_segments[d_idx];
bst_ulong num_elements_segment =
device_element_segments[d_idx + 1] - device_element_segments[d_idx];
// ensure allocation doesn't overflow
size_t hist_size = static_cast<size_t>(n_nodes(param.max_depth - 1)) *
static_cast<size_t>(n_bins);
size_t nodes_size = static_cast<size_t>(n_nodes(param.max_depth));
size_t hmat_size = static_cast<size_t>(hmat_.min_val.size());
size_t buffer_size = static_cast<size_t>(
common::CompressedBufferWriter::CalculateBufferSize(
static_cast<size_t>(num_elements_segment),
static_cast<size_t>(n_bins)));
ba.allocate(
device_idx, param.silent, &(hist_vec[d_idx].data),
n_nodes(param.max_depth - 1) * n_bins, &nodes[d_idx],
n_nodes(param.max_depth), &nodes_temp[d_idx], max_num_nodes_device,
&nodes_child_temp[d_idx], max_num_nodes_device,
&left_child_smallest[d_idx], n_nodes(param.max_depth),
&left_child_smallest_temp[d_idx], max_num_nodes_device,
&feature_flags[d_idx],
device_idx, param.silent, &(hist_vec[d_idx].data), hist_size,
&nodes[d_idx], n_nodes(param.max_depth),
&left_child_smallest[d_idx], nodes_size, &feature_flags[d_idx],
n_features, // may change but same on all devices
&fidx_min_map[d_idx],
hmat_.min_val.size(), // constant and same on all devices
hmat_size, // constant and same on all devices
&feature_segments[d_idx],
h_feature_segments.size(), // constant and same on all devices
&prediction_cache[d_idx], num_rows_segment, &position[d_idx],
num_rows_segment, &position_tmp[d_idx], num_rows_segment,
&device_gpair[d_idx], num_rows_segment,
&device_matrix[d_idx].gidx_buffer,
common::CompressedBufferWriter::CalculateBufferSize(
num_elements_segment,
n_bins), // constant and same on all devices
buffer_size, // constant and same on all devices
&device_matrix[d_idx].row_ptr, num_rows_segment + 1,
&gidx_feature_map[d_idx],
n_bins, // constant and same on all devices
@@ -529,17 +461,12 @@ class GPUHistMaker : public TreeUpdater {
dh::safe_cuda(cudaSetDevice(device_idx));
nodes[d_idx].fill(DeviceNodeStats());
nodes_temp[d_idx].fill(DeviceNodeStats());
nodes_child_temp[d_idx].fill(DeviceNodeStats());
position[d_idx].fill(0);
device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
gpair.begin() + device_row_segments[d_idx + 1]);
// Check gradients are within acceptable size range
CheckGradientMax(device_gpair[d_idx]);
subsample_gpair(&device_gpair[d_idx], param.subsample,
device_row_segments[d_idx]);
@@ -618,21 +545,16 @@ class GPUHistMaker : public TreeUpdater {
// fprintf(stderr,"sizeof(bst_gpair)/sizeof(float)=%d\n",sizeof(bst_gpair)/sizeof(float));
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_nccl(ncclAllReduce(
reinterpret_cast<const void*>(hist_vec[d_idx].GetLevelPtr(depth)),
reinterpret_cast<void*>(hist_vec[d_idx].GetLevelPtr(depth)),
hist_vec[d_idx].LevelSize(depth) * sizeof(gpair_sum_t) /
sizeof(gpair_sum_t::value_t),
nccl_sum_t, ncclSum, comms[d_idx], *(streams[d_idx])));
reducer.AllReduceSum(device_idx,
reinterpret_cast<gpair_sum_t::value_t*>(
hist_vec[d_idx].GetLevelPtr(depth)),
reinterpret_cast<gpair_sum_t::value_t*>(
hist_vec[d_idx].GetLevelPtr(depth)),
hist_vec[d_idx].LevelSize(depth) *
sizeof(gpair_sum_t) /
sizeof(gpair_sum_t::value_t));
}
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
}
// if no NCCL, then presume only 1 GPU, then already correct
reducer.Synchronize();
// time.printElapsed("Reduce-Add Time");
@@ -955,7 +877,7 @@ class GPUHistMaker : public TreeUpdater {
}
void UpdateTree(const std::vector<bst_gpair>& gpair, DMatrix* p_fmat,
RegTree* p_tree) {
dh::Timer time0;
common::Timer time0;
this->InitData(gpair, *p_fmat, *p_tree);
this->InitFirstNode(gpair);
@@ -1019,10 +941,7 @@ class GPUHistMaker : public TreeUpdater {
std::vector<dh::CubMemory> temp_memory;
std::vector<DeviceHist> hist_vec;
std::vector<dh::dvec<DeviceNodeStats>> nodes;
std::vector<dh::dvec<DeviceNodeStats>> nodes_temp;
std::vector<dh::dvec<DeviceNodeStats>> nodes_child_temp;
std::vector<dh::dvec<bool>> left_child_smallest;
std::vector<dh::dvec<bool>> left_child_smallest_temp;
std::vector<dh::dvec<int>> feature_flags;
std::vector<dh::dvec<float>> fidx_min_map;
std::vector<dh::dvec<int>> feature_segments;
@@ -1034,13 +953,11 @@ class GPUHistMaker : public TreeUpdater {
std::vector<dh::dvec<int>> gidx_feature_map;
std::vector<dh::dvec<float>> gidx_fvalue_map;
std::vector<cudaStream_t*> streams;
std::vector<ncclComm_t> comms;
std::vector<std::vector<ncclComm_t>> find_split_comms;
dh::AllReducer reducer;
double cpu_init_time;
double gpu_init_time;
dh::Timer cpu_time;
common::Timer cpu_time;
double gpu_time;
};