* [GPU-Plugin] Add GPU accelerated prediction
* Improve allocation message
* Update documentation
* Resolve linker error for predictor
* Add unit tests

/*!
 * Copyright 2017 Rory Mitchell
 */
#include <thrust/binary_search.h>
#include <thrust/count.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <cub/cub.cuh>
#include <string>
#include <sstream>
#include <algorithm>
#include <functional>
#include <future>
#include <numeric>
#include "common.cuh"
#include "device_helpers.cuh"
#include "dmlc/timer.h"
#include "gpu_hist_builder.cuh"

namespace xgboost {
namespace tree {

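// Copy this device's shard of the quantised feature matrix to the GPU: the
// gradient index values are re-compressed into gidx_buffer, and the row
// pointer is copied and rebased so that the shard's first entry is zero.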
void DeviceGMat::Init(int device_idx, const common::GHistIndexMatrix& gmat,
                      bst_ulong element_begin, bst_ulong element_end,
                      bst_ulong row_begin, bst_ulong row_end, int n_bins) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  CHECK(gidx_buffer.size()) << "gidx_buffer must be externally allocated";
  CHECK_EQ(row_ptr.size(), (row_end - row_begin) + 1)
      << "row_ptr must be externally allocated";

  common::CompressedBufferWriter cbw(n_bins);
  std::vector<common::compressed_byte_t> host_buffer(gidx_buffer.size());
  cbw.Write(host_buffer.data(), gmat.index.begin() + element_begin,
            gmat.index.begin() + element_end);
  gidx_buffer = host_buffer;
  gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);

  // row_ptr
  thrust::copy(gmat.row_ptr.data() + row_begin,
               gmat.row_ptr.data() + row_end + 1, row_ptr.tbegin());
  // normalise row_ptr
  size_t start = gmat.row_ptr[row_begin];
  thrust::transform(row_ptr.tbegin(), row_ptr.tend(), row_ptr.tbegin(),
                    [=] __device__(size_t val) { return val - start; });
}

void DeviceHist::Init(int n_bins_in) {
  this->n_bins = n_bins_in;
  CHECK(!data.empty()) << "DeviceHist must be externally allocated";
}

void DeviceHist::Reset(int device_idx) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  data.fill(bst_gpair_precise());
}

bst_gpair_precise* DeviceHist::GetLevelPtr(int depth) {
  return data.data() + n_nodes(depth - 1) * n_bins;
}

int DeviceHist::LevelSize(int depth) { return n_bins * n_nodes_level(depth); }

HistBuilder DeviceHist::GetBuilder() {
  return HistBuilder(data.data(), n_bins);
}

HistBuilder::HistBuilder(bst_gpair_precise* ptr, int n_bins)
    : d_hist(ptr), n_bins(n_bins) {}

// Define double precision atomic add for older architectures
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
__device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull =
      (unsigned long long int*)address;                   // NOLINT
  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

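// Histograms are stored node-major: the bin for (nidx, gidx) lives at
// d_hist[nidx * n_bins + gidx]. Add() accumulates a gradient pair into that
// bin with two double-precision atomic adds.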
__device__ void HistBuilder::Add(bst_gpair_precise gpair, int gidx,
                                 int nidx) const {
  int hist_idx = nidx * n_bins + gidx;
  // OPTMARK: These two atomics lead to about a 3X slowdown due to memory
  // dependency and access pattern issues.
  atomicAdd(&(d_hist[hist_idx].grad), gpair.grad);
  atomicAdd(&(d_hist[hist_idx].hess), gpair.hess);
}

__device__ bst_gpair_precise HistBuilder::Get(int gidx, int nidx) const {
  return d_hist[nidx * n_bins + gidx];
}

GPUHistBuilder::GPUHistBuilder()
    : initialised(false),
      is_dense(false),
      p_last_fmat_(nullptr),
      prediction_cache_initialised(false) {}

GPUHistBuilder::~GPUHistBuilder() {
  if (initialised) {
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      ncclCommDestroy(comms[d_idx]);

      dh::safe_cuda(cudaSetDevice(dList[d_idx]));
      dh::safe_cuda(cudaStreamDestroy(*(streams[d_idx])));
    }
    // loop over number of devices used
    for (int num_d = 1; num_d <= n_devices; ++num_d) {
      for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
        ncclCommDestroy(find_split_comms[num_d - 1][d_idx]);
      }
    }
  }
}

void GPUHistBuilder::Init(const TrainParam& param) {
  CHECK(param.max_depth < 16) << "Tree depth too large.";
  CHECK(param.max_depth != 0) << "Tree depth cannot be 0.";
  CHECK(param.grow_policy != TrainParam::kLossGuide)
      << "Loss guided growth policy not supported. Use CPU algorithm.";
  this->param = param;

  CHECK(param.n_gpus != 0) << "Must have at least one device";
}

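// Prepares training data for the GPU updater. On the first call this performs
// one-time setup: choosing the device list, creating NCCL communicators and
// streams, building the quantile cuts (hmat_) and quantised matrix (gmat_) on
// the CPU, sharding rows across devices and allocating all device buffers.
// On every call it resets the per-iteration state: node buffers, row
// positions, gradient pairs (with row subsampling) and histograms.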
void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
                              DMatrix& fmat,  // NOLINT
                              const RegTree& tree) {
  dh::Timer time1;
  // set member num_rows and n_devices for rest of GPUHistBuilder members
  info = &fmat.info();
  num_rows = info->num_row;
  n_devices = dh::n_devices(param.n_gpus, num_rows);

  if (!initialised) {
    // reset static timers used across iterations
    cpu_init_time = 0;
    gpu_init_time = 0;
    cpu_time.reset();
    gpu_time = 0;

    // set dList member
    dList.resize(n_devices);
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
      dList[d_idx] = device_idx;
    }

    // initialize nccl

    comms.resize(n_devices);
    streams.resize(n_devices);
    // initialize communicator (one communicator per process)
    dh::safe_nccl(ncclCommInitAll(comms.data(), n_devices, dList.data()));

    // printf("# NCCL: Using devices\n");
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      streams[d_idx] =
          reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
      dh::safe_cuda(cudaSetDevice(dList[d_idx]));
      dh::safe_cuda(cudaStreamCreate(streams[d_idx]));

      int cudaDev;
      int rank;
      cudaDeviceProp prop;
      dh::safe_nccl(ncclCommCuDevice(comms[d_idx], &cudaDev));
      dh::safe_nccl(ncclCommUserRank(comms[d_idx], &rank));
      dh::safe_cuda(cudaGetDeviceProperties(&prop, cudaDev));
      // printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
      //        prop.pciBusID, prop.name);
      // cudaDriverGetVersion(&driverVersion);
      // cudaRuntimeGetVersion(&runtimeVersion);
      std::ostringstream oss;
      oss << "CUDA Capability Major/Minor version number: " << prop.major
          << "." << prop.minor << " is insufficient. Need >=3.5.";
      int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
      CHECK(failed == 0) << oss.str();
    }

    // local find_split group of comms for each case of reduced number of GPUs
    // to use
    // TODO(JCM): Excessive, but ok, and best to do here instead of repeatedly
    find_split_comms.resize(n_devices, std::vector<ncclComm_t>(n_devices));
    // loop over number of devices used
    for (int num_d = 1; num_d <= n_devices; ++num_d) {
      // initialize communicator (one communicator per process)
      dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
                                    dList.data()));
    }

    is_dense = info->num_nonzero == info->num_col * info->num_row;
    dh::Timer time0;
    hmat_.Init(&fmat, param.max_bin);
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for hmat_.Init "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    gmat_.cut = &hmat_;
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.cut "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    gmat_.Init(&fmat);
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.Init() "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE)
          << "[GPU Plug-in] CPU Time for hmat_.Init, gmat_.cut, gmat_.Init "
          << cpu_init_time << " sec";
      fflush(stdout);
    }

    int n_bins = hmat_.row_ptr.back();
    int n_features = hmat_.row_ptr.size() - 1;

    // delineate data onto multiple gpus
    device_row_segments.push_back(0);
    device_element_segments.push_back(0);
    bst_uint offset = 0;
    bst_uint shard_size = std::ceil(static_cast<double>(num_rows) / n_devices);
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      offset += shard_size;
      offset = std::min(offset, num_rows);
      device_row_segments.push_back(offset);
      device_element_segments.push_back(gmat_.row_ptr[offset]);
    }

    // Build feature segments
    std::vector<int> h_feature_segments;
    for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) {
      for (int fidx = 0; fidx < n_features; fidx++) {
        h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins);
      }
    }
    h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins);

    // Construct feature map
    std::vector<int> h_gidx_feature_map(n_bins);
    for (int fidx = 0; fidx < n_features; fidx++) {
      for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
        h_gidx_feature_map[i] = fidx;
      }
    }

    int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins;

    // allocate unique common data that reside on master device (NOTE: None
    // currently)
    // int master_device=dList[0];
    // ba.allocate(master_device, );

    // allocate vectors across all devices
    temp_memory.resize(n_devices);
    hist_vec.resize(n_devices);
    nodes.resize(n_devices);
    nodes_temp.resize(n_devices);
    nodes_child_temp.resize(n_devices);
    left_child_smallest.resize(n_devices);
    left_child_smallest_temp.resize(n_devices);
    feature_flags.resize(n_devices);
    fidx_min_map.resize(n_devices);
    feature_segments.resize(n_devices);
    prediction_cache.resize(n_devices);
    position.resize(n_devices);
    position_tmp.resize(n_devices);
    device_matrix.resize(n_devices);
    device_gpair.resize(n_devices);
    gidx_feature_map.resize(n_devices);
    gidx_fvalue_map.resize(n_devices);

    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices =
        std::min(n_nodes_level(param.max_depth), find_split_n_devices);
    int max_num_nodes_device =
        n_nodes_level(param.max_depth) / find_split_n_devices;

    // num_rows_segment: for sharding rows onto gpus for splitting data
    // num_elements_segment: for sharding rows (of elements) onto gpus for
    // splitting data
    // max_num_nodes_device: for sharding nodes onto gpus for split finding
    // All other variables have a full copy on each gpu, with the copy either
    // being identical or just the current portion (like for the histogram)
    // before AllReduce
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      bst_uint num_rows_segment =
          device_row_segments[d_idx + 1] - device_row_segments[d_idx];
      bst_ulong num_elements_segment =
          device_element_segments[d_idx + 1] - device_element_segments[d_idx];
      ba.allocate(
          device_idx, param.silent,
          &(hist_vec[d_idx].data), n_nodes(param.max_depth - 1) * n_bins,
          &nodes[d_idx], n_nodes(param.max_depth),
          &nodes_temp[d_idx], max_num_nodes_device,
          &nodes_child_temp[d_idx], max_num_nodes_device,
          &left_child_smallest[d_idx], n_nodes(param.max_depth),
          &left_child_smallest_temp[d_idx], max_num_nodes_device,
          &feature_flags[d_idx],
          n_features,  // may change but same on all devices
          &fidx_min_map[d_idx],
          hmat_.min_val.size(),  // constant and same on all devices
          &feature_segments[d_idx],
          h_feature_segments.size(),  // constant and same on all devices
          &prediction_cache[d_idx], num_rows_segment,
          &position[d_idx], num_rows_segment,
          &position_tmp[d_idx], num_rows_segment,
          &device_gpair[d_idx], num_rows_segment,
          &device_matrix[d_idx].gidx_buffer,
          common::CompressedBufferWriter::CalculateBufferSize(
              num_elements_segment,
              n_bins),  // constant and same on all devices
          &device_matrix[d_idx].row_ptr, num_rows_segment + 1,
          &gidx_feature_map[d_idx], n_bins,  // constant and same on all devices
          &gidx_fvalue_map[d_idx],
          hmat_.cut.size());  // constant and same on all devices

      // Copy Host to Device (assumes comes after ba.allocate that sets device)
      device_matrix[d_idx].Init(
          device_idx, gmat_, device_element_segments[d_idx],
          device_element_segments[d_idx + 1], device_row_segments[d_idx],
          device_row_segments[d_idx + 1], n_bins);
      gidx_feature_map[d_idx] = h_gidx_feature_map;
      gidx_fvalue_map[d_idx] = hmat_.cut;
      feature_segments[d_idx] = h_feature_segments;
      fidx_min_map[d_idx] = hmat_.min_val;

      // Initialize, no copy
      hist_vec[d_idx].Init(n_bins);     // init host object
      prediction_cache[d_idx].fill(0);  // init device object (assumes comes
                                        // after ba.allocate that sets device)
      feature_flags[d_idx].fill(1);     // init device object (assumes comes
                                        // after ba.allocate that sets device)
    }
  }

  // copy or init to do every iteration
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    nodes[d_idx].fill(Node());
    nodes_temp[d_idx].fill(Node());
    nodes_child_temp[d_idx].fill(Node());

    position[d_idx].fill(0);

    device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
                             gpair.begin() + device_row_segments[d_idx + 1]);

    subsample_gpair(&device_gpair[d_idx], param.subsample,
                    device_row_segments[d_idx]);

    hist_vec[d_idx].Reset(device_idx);

    // left_child_smallest and left_child_smallest_temp don't need to be
    // initialized
  }

  dh::synchronize_n_devices(n_devices, dList);

  if (!initialised) {
    gpu_init_time = time1.elapsedSeconds() - cpu_init_time;
    gpu_time = -cpu_init_time;
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] Time for GPU operations during First "
                      "Call to InitData() "
                   << gpu_init_time << " sec";
      fflush(stdout);
    }
  }

  p_last_fmat_ = &fmat;

  initialised = true;
}

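// Build the histograms for the current tree level. Each device processes its
// own row shard and, for depth > 0, only accumulates bins for the smaller
// sibling of each node pair; histograms are then summed across devices with
// an NCCL all-reduce, and the larger sibling is filled in by the subtraction
// trick at the end of this function.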
void GPUHistBuilder::BuildHist(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    size_t begin = device_element_segments[d_idx];
    size_t end = device_element_segments[d_idx + 1];
    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];

    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
    auto d_position = position[d_idx].data();
    auto d_gpair = device_gpair[d_idx].data();
    auto d_left_child_smallest = left_child_smallest[d_idx].data();
    auto hist_builder = hist_vec[d_idx].GetBuilder();
    dh::TransformLbs(
        device_idx, &temp_memory[d_idx], end - begin, d_row_ptr,
        row_end - row_begin, is_dense,
        [=] __device__(size_t local_idx, int local_ridx) {
          int nidx = d_position[local_ridx];  // OPTMARK: latency
          if (!is_active(nidx, depth)) return;

          // Only increment smallest node
          bool is_smallest = (d_left_child_smallest[parent_nidx(nidx)] &&
                              is_left_child(nidx)) ||
                             (!d_left_child_smallest[parent_nidx(nidx)] &&
                              !is_left_child(nidx));
          if (!is_smallest && depth > 0) return;

          int gidx = d_gidx[local_idx];
          bst_gpair gpair = d_gpair[local_ridx];

          // OPTMARK: This is slow, could use shared memory or cache results
          // instead of writing to global memory every time in atomic way.
          hist_builder.Add(gpair, gidx, nidx);
        });
  }

  dh::synchronize_n_devices(n_devices, dList);

  // time.printElapsed("Add Time");

  // (in-place) reduce each element of histogram (for only current level)
  // across multiple gpus
  // TODO(JCM): use out of place with pre-allocated buffer, but then have to
  // copy back on device
  // fprintf(stderr,"sizeof(bst_gpair)/sizeof(float)=%d\n",sizeof(bst_gpair)/sizeof(float));
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_nccl(ncclAllReduce(
        reinterpret_cast<const void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        reinterpret_cast<void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        hist_vec[d_idx].LevelSize(depth) * sizeof(bst_gpair_precise) /
            sizeof(double),
        ncclDouble, ncclSum, comms[d_idx], *(streams[d_idx])));
  }

  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
  }
  // if no NCCL, then presume only 1 GPU, then already correct

  // time.printElapsed("Reduce-Add Time");

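  // The histogram of the larger sibling can be recovered as
  // parent - smaller sibling, so only the smaller child was accumulated in
  // the kernel above; here the remaining half of the level's bins is derived.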
  // Subtraction trick (applied to all devices in same way -- to avoid doing on
  // master and then Bcast)
  if (depth > 0) {
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      auto hist_builder = hist_vec[d_idx].GetBuilder();
      auto d_left_child_smallest = left_child_smallest[d_idx].data();
      int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;

      dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
        int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
        bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
        if (left_smallest) {
          nidx++;  // If left is smallest switch to right child
        }

        int gidx = idx % hist_builder.n_bins;
        bst_gpair_precise parent = hist_builder.Get(gidx, parent_nidx(nidx));
        int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
        bst_gpair_precise other = hist_builder.Get(gidx, other_nidx);
        // OPTMARK: This is slow, could use shared memory or cache results
        // instead of writing to global memory every time in atomic way.
        hist_builder.Add(parent - other, gidx, nidx);
      });
    }
    dh::synchronize_n_devices(n_devices, dList);
  }
}

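// Split evaluation kernel: one thread block per node of the current level.
// For each feature that survives column sampling, the block first reduces the
// node's histogram bins for that feature to get the feature gradient sum,
// then performs a block-wide exclusive scan over the bins so every thread
// holds the left-sum for one candidate threshold. Each thread evaluates the
// gain (including the direction chosen for missing values), an argmax
// reduction picks the best thread, and thread 0 writes the winning split and
// the two child nodes.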
template <int BLOCK_THREADS>
__global__ void find_split_kernel(
    const bst_gpair_precise* d_level_hist, int* d_feature_segments, int depth,
    int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp,
    Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map,
    float* d_gidx_fvalue_map, GPUTrainingParam gpu_param,
    bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) {
  typedef cub::KeyValuePair<int, float> ArgMaxT;
  typedef cub::BlockScan<bst_gpair_precise, BLOCK_THREADS,
                         cub::BLOCK_SCAN_WARP_SCANS>
      BlockScanT;
  typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
  typedef cub::BlockReduce<bst_gpair_precise, BLOCK_THREADS> SumReduceT;

  union TempStorage {
    typename BlockScanT::TempStorage scan;
    typename MaxReduceT::TempStorage max_reduce;
    typename SumReduceT::TempStorage sum_reduce;
  };

  struct UninitializedSplit : cub::Uninitialized<Split> {};
  struct UninitializedGpair : cub::Uninitialized<bst_gpair_precise> {};

  __shared__ UninitializedSplit uninitialized_split;
  Split& split = uninitialized_split.Alias();
  __shared__ UninitializedGpair uninitialized_sum;
  bst_gpair_precise& shared_sum = uninitialized_sum.Alias();
  __shared__ ArgMaxT block_max;
  __shared__ TempStorage temp_storage;

  if (threadIdx.x == 0) {
    split = Split();
  }

  __syncthreads();

  // below two are for accessing full-sized node list stored on each device
  // always one block per node, BLOCK_THREADS threads per block
  int level_node_idx = blockIdx.x + nodes_offset_device;
  int node_idx = n_nodes(depth - 1) + level_node_idx;

  for (int fidx = 0; fidx < n_features; fidx++) {
    if (colsample && d_feature_flags[fidx] == 0) continue;

    int begin = d_feature_segments[level_node_idx * n_features + fidx];
    int end = d_feature_segments[level_node_idx * n_features + fidx + 1];

    bst_gpair_precise feature_sum = bst_gpair_precise();
    for (int reduce_begin = begin; reduce_begin < end;
         reduce_begin += BLOCK_THREADS) {
      bool thread_active = reduce_begin + threadIdx.x < end;
      // Scan histogram
      bst_gpair_precise bin = thread_active
                                  ? d_level_hist[reduce_begin + threadIdx.x]
                                  : bst_gpair_precise();

      feature_sum +=
          SumReduceT(temp_storage.sum_reduce).Reduce(bin, cub::Sum());
    }

    if (threadIdx.x == 0) {
      shared_sum = feature_sum;
    }
    // __syncthreads(); // no need to synch because below there is a Scan

    GpairCallbackOp prefix_op = GpairCallbackOp();
    for (int scan_begin = begin; scan_begin < end;
         scan_begin += BLOCK_THREADS) {
      bool thread_active = scan_begin + threadIdx.x < end;
      bst_gpair_precise bin = thread_active
                                  ? d_level_hist[scan_begin + threadIdx.x]
                                  : bst_gpair_precise();

      BlockScanT(temp_storage.scan)
          .ExclusiveScan(bin, bin, cub::Sum(), prefix_op);

      // Calculate gain
      bst_gpair_precise parent_sum = d_nodes[node_idx].sum_gradients;
      float parent_gain = d_nodes[node_idx].root_gain;

      bst_gpair_precise missing = parent_sum - shared_sum;

      bool missing_left;
      float gain = thread_active
                       ? loss_chg_missing(bin, missing, parent_sum,
                                          parent_gain, gpu_param, missing_left)
                       : -FLT_MAX;
      __syncthreads();

      // Find thread with best gain
      ArgMaxT tuple(threadIdx.x, gain);
      ArgMaxT best =
          MaxReduceT(temp_storage.max_reduce).Reduce(tuple, cub::ArgMax());

      if (threadIdx.x == 0) {
        block_max = best;
      }

      __syncthreads();

      // Best thread updates split
      if (threadIdx.x == block_max.key) {
        float fvalue;
        int gidx = (scan_begin - (level_node_idx * n_bins)) + threadIdx.x;
        if (threadIdx.x == 0 &&
            begin == scan_begin) {  // check at start of first tile
          fvalue = d_fidx_min_map[fidx];
        } else {
          fvalue = d_gidx_fvalue_map[gidx - 1];
        }

        bst_gpair_precise left = missing_left ? bin + missing : bin;
        bst_gpair_precise right = parent_sum - left;

        split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param);
      }
      __syncthreads();
    }  // end scan
  }  // end over features

  // Create node
  if (threadIdx.x == 0) {
    if (d_nodes_temp == NULL) {
      d_nodes[node_idx].split = split;
    } else {
      d_nodes_temp[blockIdx.x] = d_nodes[node_idx];  // first copy node values
      d_nodes_temp[blockIdx.x].split = split;        // now assign split
    }

    // if (depth == 0) {
    //   split.Print();
    // }

    Node *Nodeleft, *Noderight;
    bool* left_child_smallest;
    if (d_nodes_temp == NULL) {
      Nodeleft = &d_nodes[left_child_nidx(node_idx)];
      Noderight = &d_nodes[right_child_nidx(node_idx)];
      // NOTE: not per level, even though _temp variable name
      left_child_smallest = &d_left_child_smallest_temp[node_idx];
    } else {
      Nodeleft = &d_nodes_child_temp[blockIdx.x * 2 + 0];
      Noderight = &d_nodes_child_temp[blockIdx.x * 2 + 1];
      left_child_smallest = &d_left_child_smallest_temp[blockIdx.x];
    }

    *Nodeleft =
        Node(split.left_sum,
             CalcGain(gpu_param, split.left_sum.grad, split.left_sum.hess),
             CalcWeight(gpu_param, split.left_sum.grad, split.left_sum.hess));

    *Noderight =
        Node(split.right_sum,
             CalcGain(gpu_param, split.right_sum.grad, split.right_sum.hess),
             CalcWeight(gpu_param, split.right_sum.grad, split.right_sum.hess));

    // Record smallest node
    if (split.left_sum.hess <= split.right_sum.hess) {
      *left_child_smallest = true;
    } else {
      *left_child_smallest = false;
    }
  }
}

#define MIN_BLOCK_THREADS 32
#define CHUNK_BLOCK_THREADS 32
// MAX_BLOCK_THREADS of 1024 is the hard-coded maximum block size: the code
// requires CUDA compute capability 3.5 or above, for which the maximum number
// of threads per block is 1024
#define MAX_BLOCK_THREADS 1024

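// FindSplit dispatches to a block size large enough to cover max_bin: the
// FindSplitSpecialize recursion below grows the candidate block size in
// CHUNK_BLOCK_THREADS steps until it reaches param.max_bin, capped at
// MAX_BLOCK_THREADS by the explicit specialization.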
void GPUHistBuilder::FindSplit(int depth) {
  // Specialised based on max_bins
  this->FindSplitSpecialize<MIN_BLOCK_THREADS>(depth);
}

template <>
void GPUHistBuilder::FindSplitSpecialize<MAX_BLOCK_THREADS>(int depth) {
  LaunchFindSplit<MAX_BLOCK_THREADS>(depth);
}

template <int BLOCK_THREADS>
void GPUHistBuilder::FindSplitSpecialize(int depth) {
  if (param.max_bin <= BLOCK_THREADS) {
    LaunchFindSplit<BLOCK_THREADS>(depth);
  } else {
    this->FindSplitSpecialize<BLOCK_THREADS + CHUNK_BLOCK_THREADS>(depth);
  }
}

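// Launches split finding with one of three strategies: shard the level's
// nodes across GPUs and all-gather the results (findsplit_shardongpus),
// evaluate on one GPU and broadcast (simuljob == 0), or let every GPU
// evaluate every node so no communication is needed (the default).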
template <int BLOCK_THREADS>
void GPUHistBuilder::LaunchFindSplit(int depth) {
  bool colsample =
      param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;

  int dosimuljob = 1;

  int simuljob = 1;  // whether to do job on single GPU and broadcast (0) or to
                     // do same job on each GPU (1) (could make user parameter,
                     // but too fine-grained maybe)
  int findsplit_shardongpus = 0;  // too expensive generally, disable for now

  if (findsplit_shardongpus) {
    dosimuljob = 0;
    // use power of 2 for split finder because nodes are power of 2 (broadcast
    // result to remaining devices)
    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices =
        std::min(n_nodes_level(depth), find_split_n_devices);
    int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
    int num_nodes_child_device =
        n_nodes_level(depth + 1) / find_split_n_devices;
    const int GRID_SIZE = num_nodes_device;

    // NOTE: No need to scatter before gather as all devices have same copy of
    // nodes, and within find_split_kernel() nodes_temp is given values from
    // nodes

    // for all nodes (split among devices) find best split per node
    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = d_idx * num_nodes_device;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
          nodes_child_temp[d_idx].data(), nodes_offset_device,
          fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(),
          GPUTrainingParam(param), left_child_smallest_temp[d_idx].data(),
          colsample, feature_flags[d_idx].data());
    }

    // nccl only on devices that did split
    dh::synchronize_n_devices(find_split_n_devices, dList);

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
          num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));

      if (depth != param.max_depth) {  // don't copy over children nodes if no
                                       // more nodes
        // Note offset by n_nodes(depth) for recvbuff for child nodes
        dh::safe_nccl(ncclAllGather(
            reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
            num_nodes_child_device * sizeof(Node) / sizeof(char), ncclChar,
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            find_split_comms[find_split_n_devices - 1][d_idx],
            *(streams[d_idx])));
      }

      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
          num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                  n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }

    if (n_devices > find_split_n_devices && n_devices > 1) {
      // if n_devices==1, no need to Bcast
      // if find_split_n_devices==1, this is just a copy operation, else it
      // copies from master to all nodes in case extra devices were not
      // involved in the split
      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));

        int master_device = dList[0];
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));

        if (depth != param.max_depth) {  // don't copy over children nodes if
                                         // no more nodes
          dh::safe_nccl(ncclBcast(
              reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
              n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
              master_device, comms[d_idx], *(streams[d_idx])));
        }

        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                    n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));
        dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
      }
    }
  } else if (simuljob == 0) {
    dosimuljob = 0;
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    int d_idx = 0;
    int master_device = dList[d_idx];
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    int nodes_offset_device = d_idx * num_nodes_device;
    find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
        (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
        feature_segments[d_idx].data(), depth, (info->num_col),
        (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
        nodes_offset_device, fidx_min_map[d_idx].data(),
        gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
        left_child_smallest[d_idx].data(), colsample,
        feature_flags[d_idx].data());

    // broadcast result
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclBcast(
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
          master_device, comms[d_idx], *(streams[d_idx])));

      if (depth != param.max_depth) {  // don't copy over children nodes if no
                                       // more nodes
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      dh::safe_nccl(
          ncclBcast(reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                            n_nodes(depth - 1)),
                    n_nodes_level(depth) * sizeof(bool) / sizeof(char),
                    ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }
  } else {
    dosimuljob = 1;
  }

  if (dosimuljob) {  // if no NCCL or simuljob==1, do this
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    // all GPUs do same work
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = 0;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
          feature_flags[d_idx].data());
    }
  }

  // NOTE: No need to synchronize with host as all of the above are pure P2P
  // or on-device ops
}

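// Sum the gradient pairs on each device in parallel, combine the partial sums
// on the host, then write the identical root node on every device.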
void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
  // Perform asynchronous reduction on each gpu
  std::vector<bst_gpair> device_sums(n_devices);
#pragma omp parallel for num_threads(n_devices)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    auto begin = device_gpair[d_idx].tbegin();
    auto end = device_gpair[d_idx].tend();
    bst_gpair init = bst_gpair();
    auto binary_op = thrust::plus<bst_gpair>();
    device_sums[d_idx] = thrust::reduce(begin, end, init, binary_op);
  }

  bst_gpair sum = bst_gpair();
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    sum += device_sums[d_idx];
  }

  // Setup first node so all devices have same first node (here done same on
  // all devices, or could have done one device and Bcast if worried about
  // exact precision issues)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_nodes = nodes[d_idx].data();
    auto gpu_param = GPUTrainingParam(param);

    dh::launch_n(device_idx, 1, [=] __device__(int idx) {
      bst_gpair sum_gradients = sum;
      d_nodes[idx] =
          Node(sum_gradients,
               CalcGain(gpu_param, sum_gradients.grad, sum_gradients.hess),
               CalcWeight(gpu_param, sum_gradients.grad, sum_gradients.hess));
    });
  }
  // synch all devices to host before moving on (No, can avoid because
  // BuildHist calls another kernel in default stream)
  // dh::synchronize_n_devices(n_devices, dList);
}

void GPUHistBuilder::UpdatePosition(int depth) {
  if (is_dense) {
    this->UpdatePositionDense(depth);
  } else {
    this->UpdatePositionSparse(depth);
  }
}

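// Dense case: every row stores exactly num_col entries, so the gradient index
// of the split feature can be addressed directly as row * n_columns + findex
// without walking the row.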
void GPUHistBuilder::UpdatePositionDense(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    int n_columns = info->num_col;
    size_t begin = device_row_segments[d_idx];
    size_t end = device_row_segments[d_idx + 1];

    dh::launch_n(device_idx, end - begin, [=] __device__(size_t local_idx) {
      int pos = d_position[local_idx];
      if (!is_active(pos, depth)) {
        return;
      }
      Node node = d_nodes[pos];

      if (node.IsLeaf()) {
        return;
      }

      int gidx = d_gidx[local_idx * static_cast<size_t>(n_columns) +
                        static_cast<size_t>(node.split.findex)];

      float fvalue = d_gidx_fvalue_map[gidx];

      if (fvalue <= node.split.fvalue) {
        d_position[local_idx] = left_child_nidx(pos);
      } else {
        d_position[local_idx] = right_child_nidx(pos);
      }
    });
  }
  dh::synchronize_n_devices(n_devices, dList);
  // dh::safe_cuda(cudaDeviceSynchronize());
}

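// Sparse case: first send every active row in its node's default (missing)
// direction, then, for rows that actually contain the split feature,
// overwrite that choice using the feature value.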
void GPUHistBuilder::UpdatePositionSparse(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    auto d_position_tmp = position_tmp[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();

    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];
    size_t element_begin = device_element_segments[d_idx];
    size_t element_end = device_element_segments[d_idx + 1];

    // Update missing direction
    dh::launch_n(device_idx, row_end - row_begin,
                 [=] __device__(int local_idx) {
                   int pos = d_position[local_idx];
                   if (!is_active(pos, depth)) {
                     d_position_tmp[local_idx] = pos;
                     return;
                   }

                   Node node = d_nodes[pos];

                   if (node.IsLeaf()) {
                     d_position_tmp[local_idx] = pos;
                     return;
                   } else if (node.split.missing_left) {
                     d_position_tmp[local_idx] = pos * 2 + 1;
                   } else {
                     d_position_tmp[local_idx] = pos * 2 + 2;
                   }
                 });

    // Update node based on fvalue where it exists
    // OPTMARK: This kernel is very inefficient for both compute and memory,
    // dominated by memory dependency / access patterns

    dh::TransformLbs(
        device_idx, &temp_memory[d_idx], element_end - element_begin,
        d_row_ptr, row_end - row_begin, is_dense,
        [=] __device__(size_t local_idx, int local_ridx) {
          int pos = d_position[local_ridx];
          if (!is_active(pos, depth)) {
            return;
          }

          Node node = d_nodes[pos];

          if (node.IsLeaf()) {
            return;
          }

          int gidx = d_gidx[local_idx];
          // OPTMARK: slowest global memory access, maybe setup position,
          // gidx, etc. as combined structure?
          int findex = d_gidx_feature_map[gidx];

          if (findex == node.split.findex) {
            float fvalue = d_gidx_fvalue_map[gidx];

            if (fvalue <= node.split.fvalue) {
              d_position_tmp[local_ridx] = left_child_nidx(pos);
            } else {
              d_position_tmp[local_ridx] = right_child_nidx(pos);
            }
          }
        });
    position[d_idx] = position_tmp[d_idx];
  }
  dh::synchronize_n_devices(n_devices, dList);
}

void GPUHistBuilder::ColSampleTree() {
  if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;

  feature_set_tree.resize(info->num_col);
  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
  feature_set_tree = col_sample(feature_set_tree, param.colsample_bytree);
}

void GPUHistBuilder::ColSampleLevel() {
  if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;

  feature_set_level.resize(feature_set_tree.size());
  feature_set_level = col_sample(feature_set_tree, param.colsample_bylevel);
  std::vector<int> h_feature_flags(info->num_col, 0);
  for (auto fidx : feature_set_level) {
    h_feature_flags[fidx] = 1;
  }

  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    feature_flags[d_idx] = h_feature_flags;
  }
  dh::synchronize_n_devices(n_devices, dList);
}

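// Reuses the per-device row positions from training to update cached
// predictions: each row's cached prediction is incremented by its leaf weight
// scaled by the learning rate, then copied back into the host output vector.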
bool GPUHistBuilder::UpdatePredictionCache(
    const DMatrix* data, std::vector<bst_float>* p_out_preds) {
  std::vector<bst_float>& out_preds = *p_out_preds;

  if (nodes.empty() || !p_last_fmat_ || data != p_last_fmat_) {
    return false;
  }

  if (!prediction_cache_initialised) {
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      size_t row_begin = device_row_segments[d_idx];
      size_t row_end = device_row_segments[d_idx + 1];

      prediction_cache[d_idx].copy(out_preds.begin() + row_begin,
                                   out_preds.begin() + row_end);
    }
    prediction_cache_initialised = true;
  }
  dh::synchronize_n_devices(n_devices, dList);

  float eps = param.learning_rate;
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];

    auto d_nodes = nodes[d_idx].data();
    auto d_position = position[d_idx].data();
    auto d_prediction_cache = prediction_cache[d_idx].data();

    dh::launch_n(device_idx, prediction_cache[d_idx].size(),
                 [=] __device__(int local_idx) {
                   int pos = d_position[local_idx];
                   d_prediction_cache[local_idx] += d_nodes[pos].weight * eps;
                 });

    thrust::copy(prediction_cache[d_idx].tbegin(),
                 prediction_cache[d_idx].tend(), &out_preds[row_begin]);
  }
  dh::synchronize_n_devices(n_devices, dList);

  return true;
}

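// Top-level driver for one boosting iteration: initialise data and the root
// node, then for each level run column sampling, histogram construction,
// split finding and position updates, and finally copy the dense node array
// from the master device into the sparse RegTree on the host.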
void GPUHistBuilder::Update(const std::vector<bst_gpair>& gpair,
                            DMatrix* p_fmat, RegTree* p_tree) {
  dh::Timer time0;

  this->InitData(gpair, *p_fmat, *p_tree);
  this->InitFirstNode(gpair);
  this->ColSampleTree();

  for (int depth = 0; depth < param.max_depth; depth++) {
    this->ColSampleLevel();
    this->BuildHist(depth);
    this->FindSplit(depth);
    this->UpdatePosition(depth);
  }

  // done with multi-GPU, pass back result from master to tree on host
  int master_device = dList[0];
  dh::safe_cuda(cudaSetDevice(master_device));
  dense2sparse_tree(p_tree, nodes[0].tbegin(), nodes[0].tend(), param);

  gpu_time += time0.elapsedSeconds();

  if (param.debug_verbose) {
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative GPU Time excluding initial time "
                 << (gpu_time - gpu_init_time) << " sec";
    fflush(stdout);
  }

  if (param.debug_verbose) {
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time "
                 << cpu_time.elapsedSeconds() << " sec";
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time excluding initial time "
                 << (cpu_time.elapsedSeconds() - cpu_init_time - gpu_time)
                 << " sec";
    fflush(stdout);
  }
}
}  // namespace tree
}  // namespace xgboost