/*!
 * Copyright 2017 Rory Mitchell
 */
#include <cub/cub.cuh>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/transform.h>
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <numeric>
#include <sstream>
#include <vector>
#include "common.cuh"
#include "device_helpers.cuh"
#include "dmlc/timer.h"
#include "gpu_hist_builder.cuh"

namespace xgboost {
namespace tree {

void DeviceGMat::Init(int device_idx, const common::GHistIndexMatrix& gmat,
                      bst_ulong element_begin, bst_ulong element_end,
                      bst_ulong row_begin, bst_ulong row_end, int n_bins) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  CHECK(gidx_buffer.size()) << "gidx_buffer must be externally allocated";
  CHECK_EQ(row_ptr.size(), (row_end - row_begin) + 1)
      << "row_ptr must be externally allocated";

  common::CompressedBufferWriter cbw(n_bins);
  std::vector<common::compressed_byte_t> host_buffer(gidx_buffer.size());
  cbw.Write(host_buffer.data(), gmat.index.begin() + element_begin,
            gmat.index.begin() + element_end);
  gidx_buffer = host_buffer;
  gidx = common::CompressedIterator<uint32_t>(gidx_buffer.data(), n_bins);

  // row_ptr
  thrust::copy(gmat.row_ptr.data() + row_begin,
               gmat.row_ptr.data() + row_end + 1, row_ptr.tbegin());
  // normalise row_ptr
  size_t start = gmat.row_ptr[row_begin];
  thrust::transform(row_ptr.tbegin(), row_ptr.tend(), row_ptr.tbegin(),
                    [=] __device__(size_t val) { return val - start; });
}

void DeviceHist::Init(int n_bins_in) {
  this->n_bins = n_bins_in;
  CHECK(!data.empty()) << "DeviceHist must be externally allocated";
}

void DeviceHist::Reset(int device_idx) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  data.fill(bst_gpair_precise());
}

bst_gpair_precise* DeviceHist::GetLevelPtr(int depth) {
  return data.data() + n_nodes(depth - 1) * n_bins;
}

int DeviceHist::LevelSize(int depth) { return n_bins * n_nodes_level(depth); }

HistBuilder DeviceHist::GetBuilder() {
  return HistBuilder(data.data(), n_bins);
}

HistBuilder::HistBuilder(bst_gpair_precise* ptr, int n_bins)
    : d_hist(ptr), n_bins(n_bins) {}

// Define double precision atomic add for older architectures
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
#else
__device__ double atomicAdd(double* address, double val) {
  unsigned long long int* address_as_ull =
      (unsigned long long int*)address;                   // NOLINT
  unsigned long long int old = *address_as_ull, assumed;  // NOLINT
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#endif

__device__ void HistBuilder::Add(bst_gpair_precise gpair, int gidx,
                                 int nidx) const {
  int hist_idx = nidx * n_bins + gidx;
  // OPTMARK: This and the line below lead to about 3X slowdown due to memory
  // dependency and access pattern issues.
  atomicAdd(&(d_hist[hist_idx].grad), gpair.grad);
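  // Note: grad and hess are accumulated with two independent atomics; on
  // pre-Pascal devices this goes through the CAS-based double atomicAdd
  // defined above, which presumably contributes to the slowdown noted here.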
  atomicAdd(&(d_hist[hist_idx].hess), gpair.hess);
}

__device__ bst_gpair_precise HistBuilder::Get(int gidx, int nidx) const {
  return d_hist[nidx * n_bins + gidx];
}

GPUHistBuilder::GPUHistBuilder()
    : initialised(false),
      is_dense(false),
      p_last_fmat_(nullptr),
      prediction_cache_initialised(false) {}

GPUHistBuilder::~GPUHistBuilder() {
  if (initialised) {
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      ncclCommDestroy(comms[d_idx]);
      dh::safe_cuda(cudaSetDevice(dList[d_idx]));
      dh::safe_cuda(cudaStreamDestroy(*(streams[d_idx])));
    }
    // loop over number of devices used
    for (int num_d = 1; num_d <= n_devices; ++num_d) {
      for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
        ncclCommDestroy(find_split_comms[num_d - 1][d_idx]);
      }
    }
  }
}

void GPUHistBuilder::Init(const TrainParam& param) {
  CHECK(param.max_depth < 16) << "Tree depth too large.";
  CHECK(param.max_depth != 0) << "Tree depth cannot be 0.";
  CHECK(param.grow_policy != TrainParam::kLossGuide)
      << "Loss guided growth policy not supported. Use CPU algorithm.";
  this->param = param;
  CHECK(param.n_gpus != 0) << "Must have at least one device";
}

void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
                              DMatrix& fmat,  // NOLINT
                              const RegTree& tree) {
  dh::Timer time1;

  // set member num_rows and n_devices for rest of GPUHistBuilder members
  info = &fmat.info();
  num_rows = info->num_row;
  n_devices = dh::n_devices(param.n_gpus, num_rows);

  if (!initialised) {
    // reset static timers used across iterations
    cpu_init_time = 0;
    gpu_init_time = 0;
    cpu_time.reset();
    gpu_time = 0;

    // set dList member
    dList.resize(n_devices);
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
      dList[d_idx] = device_idx;
    }

    // initialize nccl
    comms.resize(n_devices);
    streams.resize(n_devices);
    // initialize communicator (One communicator per process)
    dh::safe_nccl(ncclCommInitAll(comms.data(), n_devices, dList.data()));
    // printf("# NCCL: Using devices\n");
    for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
      streams[d_idx] =
          reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
      dh::safe_cuda(cudaSetDevice(dList[d_idx]));
      dh::safe_cuda(cudaStreamCreate(streams[d_idx]));

      int cudaDev;
      int rank;
      cudaDeviceProp prop;
      dh::safe_nccl(ncclCommCuDevice(comms[d_idx], &cudaDev));
      dh::safe_nccl(ncclCommUserRank(comms[d_idx], &rank));
      dh::safe_cuda(cudaGetDeviceProperties(&prop, cudaDev));
      // printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
      //        prop.pciBusID, prop.name);
      // cudaDriverGetVersion(&driverVersion);
      // cudaRuntimeGetVersion(&runtimeVersion);
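      // Reject devices below compute capability 3.5: the split-finding kernel
      // further down assumes sm_35 or newer (see the MAX_BLOCK_THREADS note).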
      std::ostringstream oss;
      oss << "CUDA Capability Major/Minor version number: " << prop.major
          << "." << prop.minor << " is insufficient. Need >=3.5.";
      int failed = prop.major < 3 || (prop.major == 3 && prop.minor < 5);
      CHECK(failed == 0) << oss.str();
    }

    // local find_split group of comms for each case of reduced number of GPUs
    // to use
    // TODO(JCM): Excessive, but ok, and best to do here instead of repeatedly
    find_split_comms.resize(n_devices, std::vector<ncclComm_t>(n_devices));
    // loop over number of devices used
    for (int num_d = 1; num_d <= n_devices; ++num_d) {
      // initialize communicator (One communicator per process)
      dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
                                    dList.data()));
    }

    is_dense = info->num_nonzero == info->num_col * info->num_row;

    dh::Timer time0;
    hmat_.Init(&fmat, param.max_bin);
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for hmat_.Init "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    gmat_.cut = &hmat_;
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.cut "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    gmat_.Init(&fmat);
    cpu_init_time += time0.elapsedSeconds();
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] CPU Time for gmat_.Init() "
                   << time0.elapsedSeconds() << " sec";
      fflush(stdout);
    }
    time0.reset();

    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE)
          << "[GPU Plug-in] CPU Time for hmat_.Init, gmat_.cut, gmat_.Init "
          << cpu_init_time << " sec";
      fflush(stdout);
    }

    int n_bins = hmat_.row_ptr.back();
    int n_features = hmat_.row_ptr.size() - 1;

    // delineate data onto multiple gpus
    device_row_segments.push_back(0);
    device_element_segments.push_back(0);
    bst_uint offset = 0;
    bst_uint shard_size = std::ceil(static_cast<double>(num_rows) / n_devices);
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      offset += shard_size;
      offset = std::min(offset, num_rows);
      device_row_segments.push_back(offset);
      device_element_segments.push_back(gmat_.row_ptr[offset]);
    }

    // Build feature segments
    std::vector<int> h_feature_segments;
    for (int node = 0; node < n_nodes_level(param.max_depth - 1); node++) {
      for (int fidx = 0; fidx < n_features; fidx++) {
        h_feature_segments.push_back(hmat_.row_ptr[fidx] + node * n_bins);
      }
    }
    h_feature_segments.push_back(n_nodes_level(param.max_depth - 1) * n_bins);

    // Construct feature map
    std::vector<int> h_gidx_feature_map(n_bins);
    for (int fidx = 0; fidx < n_features; fidx++) {
      for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
        h_gidx_feature_map[i] = fidx;
      }
    }

    int level_max_bins = n_nodes_level(param.max_depth - 1) * n_bins;

    // allocate unique common data that reside on master device (NOTE: None
    // currently)
    // int master_device=dList[0];
    // ba.allocate(master_device, );

    // allocate vectors across all devices
    temp_memory.resize(n_devices);
    hist_vec.resize(n_devices);
    nodes.resize(n_devices);
    nodes_temp.resize(n_devices);
    nodes_child_temp.resize(n_devices);
    left_child_smallest.resize(n_devices);
    left_child_smallest_temp.resize(n_devices);
    feature_flags.resize(n_devices);
    fidx_min_map.resize(n_devices);
    feature_segments.resize(n_devices);
    prediction_cache.resize(n_devices);
    position.resize(n_devices);
    position_tmp.resize(n_devices);
    device_matrix.resize(n_devices);
    device_gpair.resize(n_devices);
    gidx_feature_map.resize(n_devices);
    gidx_fvalue_map.resize(n_devices);
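    // Split finding can optionally be sharded over a power-of-two subset of
    // the devices (see findsplit_shardongpus in LaunchFindSplit). The sizes
    // computed below give, roughly, how many of the deepest level's nodes
    // each such device would own, e.g. 8 nodes over 4 devices -> 2 per device.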
    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices =
        std::min(n_nodes_level(param.max_depth), find_split_n_devices);
    int max_num_nodes_device =
        n_nodes_level(param.max_depth) / find_split_n_devices;

    // num_rows_segment: for sharding rows onto gpus for splitting data
    // num_elements_segment: for sharding rows (of elements) onto gpus for
    // splitting data
    // max_num_nodes_device: for sharding nodes onto gpus for split finding
    // All other variables have full copy on gpu, with copy either being
    // identical or just current portion (like for histogram) before AllReduce
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      bst_uint num_rows_segment =
          device_row_segments[d_idx + 1] - device_row_segments[d_idx];
      bst_ulong num_elements_segment =
          device_element_segments[d_idx + 1] - device_element_segments[d_idx];
      ba.allocate(
          device_idx, param.silent,
          &(hist_vec[d_idx].data), n_nodes(param.max_depth - 1) * n_bins,
          &nodes[d_idx], n_nodes(param.max_depth),
          &nodes_temp[d_idx], max_num_nodes_device,
          &nodes_child_temp[d_idx], max_num_nodes_device,
          &left_child_smallest[d_idx], n_nodes(param.max_depth),
          &left_child_smallest_temp[d_idx], max_num_nodes_device,
          &feature_flags[d_idx],
          n_features,  // may change but same on all devices
          &fidx_min_map[d_idx],
          hmat_.min_val.size(),  // constant and same on all devices
          &feature_segments[d_idx],
          h_feature_segments.size(),  // constant and same on all devices
          &prediction_cache[d_idx], num_rows_segment,
          &position[d_idx], num_rows_segment,
          &position_tmp[d_idx], num_rows_segment,
          &device_gpair[d_idx], num_rows_segment,
          &device_matrix[d_idx].gidx_buffer,
          common::CompressedBufferWriter::CalculateBufferSize(
              num_elements_segment,
              n_bins),  // constant and same on all devices
          &device_matrix[d_idx].row_ptr, num_rows_segment + 1,
          &gidx_feature_map[d_idx],
          n_bins,  // constant and same on all devices
          &gidx_fvalue_map[d_idx],
          hmat_.cut.size());  // constant and same on all devices

      // Copy Host to Device (assumes comes after ba.allocate that sets device)
      device_matrix[d_idx].Init(
          device_idx, gmat_, device_element_segments[d_idx],
          device_element_segments[d_idx + 1], device_row_segments[d_idx],
          device_row_segments[d_idx + 1], n_bins);
      gidx_feature_map[d_idx] = h_gidx_feature_map;
      gidx_fvalue_map[d_idx] = hmat_.cut;
      feature_segments[d_idx] = h_feature_segments;
      fidx_min_map[d_idx] = hmat_.min_val;

      // Initialize, no copy
      hist_vec[d_idx].Init(n_bins);     // init host object
      prediction_cache[d_idx].fill(0);  // init device object (assumes comes
                                        // after ba.allocate that sets device)
      feature_flags[d_idx].fill(1);     // init device object (assumes comes
                                        // after ba.allocate that sets device)
    }
  }

  // copy or init to do every iteration
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    nodes[d_idx].fill(Node());
    nodes_temp[d_idx].fill(Node());
    nodes_child_temp[d_idx].fill(Node());
    position[d_idx].fill(0);
    device_gpair[d_idx].copy(gpair.begin() + device_row_segments[d_idx],
                             gpair.begin() + device_row_segments[d_idx + 1]);
    subsample_gpair(&device_gpair[d_idx], param.subsample,
                    device_row_segments[d_idx]);
    hist_vec[d_idx].Reset(device_idx);
    // left_child_smallest and left_child_smallest_temp don't need to be
    // initialized
  }
  dh::synchronize_n_devices(n_devices, dList);
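  // First-call timing bookkeeping: gpu_init_time is this InitData() call's
  // wall time minus the CPU-side cut/matrix construction accumulated in
  // cpu_init_time, and gpu_time starts at -cpu_init_time so the cumulative
  // totals reported in Update() roughly exclude that CPU work.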
  if (!initialised) {
    gpu_init_time = time1.elapsedSeconds() - cpu_init_time;
    gpu_time = -cpu_init_time;
    if (param.debug_verbose) {  // Only done once for each training session
      LOG(CONSOLE) << "[GPU Plug-in] Time for GPU operations during First "
                      "Call to InitData() "
                   << gpu_init_time << " sec";
      fflush(stdout);
    }
  }

  p_last_fmat_ = &fmat;
  initialised = true;
}

void GPUHistBuilder::BuildHist(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    size_t begin = device_element_segments[d_idx];
    size_t end = device_element_segments[d_idx + 1];
    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];

    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();
    auto d_position = position[d_idx].data();
    auto d_gpair = device_gpair[d_idx].data();
    auto d_left_child_smallest = left_child_smallest[d_idx].data();
    auto hist_builder = hist_vec[d_idx].GetBuilder();

    dh::TransformLbs(
        device_idx, &temp_memory[d_idx], end - begin, d_row_ptr,
        row_end - row_begin, is_dense,
        [=] __device__(size_t local_idx, int local_ridx) {
          int nidx = d_position[local_ridx];  // OPTMARK: latency
          if (!is_active(nidx, depth)) return;

          // Only increment smallest node
          bool is_smallest =
              (d_left_child_smallest[parent_nidx(nidx)] &&
               is_left_child(nidx)) ||
              (!d_left_child_smallest[parent_nidx(nidx)] &&
               !is_left_child(nidx));
          if (!is_smallest && depth > 0) return;

          int gidx = d_gidx[local_idx];
          bst_gpair gpair = d_gpair[local_ridx];

          // OPTMARK: This is slow, could use shared memory or cache results
          // instead of writing to global memory every time in atomic way.
          hist_builder.Add(gpair, gidx, nidx);
        });
  }

  dh::synchronize_n_devices(n_devices, dList);
  // time.printElapsed("Add Time");

  // (in-place) reduce each element of histogram (for only current level)
  // across multiple gpus
  // TODO(JCM): use out of place with pre-allocated buffer, but then have to
  // copy back on device
  // fprintf(stderr,"sizeof(bst_gpair)/sizeof(float)=%d\n",sizeof(bst_gpair)/sizeof(float));
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_nccl(ncclAllReduce(
        reinterpret_cast<const void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        reinterpret_cast<void*>(hist_vec[d_idx].GetLevelPtr(depth)),
        hist_vec[d_idx].LevelSize(depth) * sizeof(bst_gpair_precise) /
            sizeof(double),
        ncclDouble, ncclSum, comms[d_idx], *(streams[d_idx])));
  }

  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
  }
  // if no NCCL, then presume only 1 GPU, then already correct
  // time.printElapsed("Reduce-Add Time");

  // Subtraction trick (applied to all devices in same way -- to avoid doing
  // on master and then Bcast)
  if (depth > 0) {
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      auto hist_builder = hist_vec[d_idx].GetBuilder();
      auto d_left_child_smallest = left_child_smallest[d_idx].data();
      int n_sub_bins = (n_nodes_level(depth) / 2) * hist_builder.n_bins;
      dh::launch_n(device_idx, n_sub_bins, [=] __device__(int idx) {
        int nidx = n_nodes(depth - 1) + ((idx / hist_builder.n_bins) * 2);
        bool left_smallest = d_left_child_smallest[parent_nidx(nidx)];
        if (left_smallest) {
          nidx++;  // If left is smallest switch to right child
        }
        int gidx = idx % hist_builder.n_bins;
        bst_gpair_precise parent = hist_builder.Get(gidx, parent_nidx(nidx));
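        // Subtraction trick: BuildHist only accumulated the smaller sibling,
        // so the larger sibling's bin is recovered as parent_bin - sibling_bin
        // rather than being re-accumulated from the rows.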
        int other_nidx = left_smallest ? nidx - 1 : nidx + 1;
        bst_gpair_precise other = hist_builder.Get(gidx, other_nidx);
        // OPTMARK: This is slow, could use shared memory or cache results
        // instead of writing to global memory every time in atomic way.
        hist_builder.Add(parent - other, gidx, nidx);
      });
    }
    dh::synchronize_n_devices(n_devices, dList);
  }
}

template <int BLOCK_THREADS>
__global__ void find_split_kernel(
    const bst_gpair_precise* d_level_hist, int* d_feature_segments, int depth,
    int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp,
    Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map,
    float* d_gidx_fvalue_map, GPUTrainingParam gpu_param,
    bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) {
  typedef cub::KeyValuePair<int, float> ArgMaxT;
  typedef cub::BlockScan<bst_gpair_precise, BLOCK_THREADS> BlockScanT;
  typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
  typedef cub::BlockReduce<bst_gpair_precise, BLOCK_THREADS> SumReduceT;

  union TempStorage {
    typename BlockScanT::TempStorage scan;
    typename MaxReduceT::TempStorage max_reduce;
    typename SumReduceT::TempStorage sum_reduce;
  };

  struct UninitializedSplit : cub::Uninitialized<Split> {};
  struct UninitializedGpair : cub::Uninitialized<bst_gpair_precise> {};

  __shared__ UninitializedSplit uninitialized_split;
  Split& split = uninitialized_split.Alias();
  __shared__ UninitializedGpair uninitialized_sum;
  bst_gpair_precise& shared_sum = uninitialized_sum.Alias();
  __shared__ ArgMaxT block_max;
  __shared__ TempStorage temp_storage;

  if (threadIdx.x == 0) {
    split = Split();
  }

  __syncthreads();

  // below two are for accessing full-sized node list stored on each device
  // always one block per node, BLOCK_THREADS threads per block
  int level_node_idx = blockIdx.x + nodes_offset_device;
  int node_idx = n_nodes(depth - 1) + level_node_idx;

  for (int fidx = 0; fidx < n_features; fidx++) {
    if (colsample && d_feature_flags[fidx] == 0) continue;

    int begin = d_feature_segments[level_node_idx * n_features + fidx];
    int end = d_feature_segments[level_node_idx * n_features + fidx + 1];

    bst_gpair_precise feature_sum = bst_gpair_precise();
    for (int reduce_begin = begin; reduce_begin < end;
         reduce_begin += BLOCK_THREADS) {
      bool thread_active = reduce_begin + threadIdx.x < end;
      // Scan histogram
      bst_gpair_precise bin = thread_active
                                  ? d_level_hist[reduce_begin + threadIdx.x]
                                  : bst_gpair_precise();
      feature_sum +=
          SumReduceT(temp_storage.sum_reduce).Reduce(bin, cub::Sum());
    }

    if (threadIdx.x == 0) {
      shared_sum = feature_sum;
    }
    // __syncthreads(); // no need to synch because below there is a Scan

    GpairCallbackOp prefix_op = GpairCallbackOp();
    for (int scan_begin = begin; scan_begin < end;
         scan_begin += BLOCK_THREADS) {
      bool thread_active = scan_begin + threadIdx.x < end;
      bst_gpair_precise bin = thread_active
                                  ? d_level_hist[scan_begin + threadIdx.x]
                                  : bst_gpair_precise();
      BlockScanT(temp_storage.scan)
          .ExclusiveScan(bin, bin, cub::Sum(), prefix_op);

      // Calculate gain
      bst_gpair_precise parent_sum = d_nodes[node_idx].sum_gradients;
      float parent_gain = d_nodes[node_idx].root_gain;

      bst_gpair_precise missing = parent_sum - shared_sum;

      bool missing_left;
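      // loss_chg_missing() computes the split gain given the missing-value
      // mass (parent_sum - shared_sum), choosing whichever direction for the
      // missing values yields the higher gain and reporting it via
      // missing_left.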
      float gain = thread_active
                       ? loss_chg_missing(bin, missing, parent_sum, parent_gain,
                                          gpu_param, missing_left)
                       : -FLT_MAX;
      __syncthreads();

      // Find thread with best gain
      ArgMaxT tuple(threadIdx.x, gain);
      ArgMaxT best =
          MaxReduceT(temp_storage.max_reduce).Reduce(tuple, cub::ArgMax());

      if (threadIdx.x == 0) {
        block_max = best;
      }

      __syncthreads();

      // Best thread updates split
      if (threadIdx.x == block_max.key) {
        float fvalue;
        int gidx = (scan_begin - (level_node_idx * n_bins)) + threadIdx.x;
        if (threadIdx.x == 0 &&
            begin == scan_begin) {  // check at start of first tile
          fvalue = d_fidx_min_map[fidx];
        } else {
          fvalue = d_gidx_fvalue_map[gidx - 1];
        }
        bst_gpair_precise left = missing_left ? bin + missing : bin;
        bst_gpair_precise right = parent_sum - left;
        split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param);
      }
      __syncthreads();
    }  // end scan
  }    // end over features

  // Create node
  if (threadIdx.x == 0) {
    if (d_nodes_temp == NULL) {
      d_nodes[node_idx].split = split;
    } else {
      d_nodes_temp[blockIdx.x] = d_nodes[node_idx];  // first copy node values
      d_nodes_temp[blockIdx.x].split = split;        // now assign split
    }
    // if (depth == 0) {
    //   split.Print();
    // }

    Node *Nodeleft, *Noderight;
    bool* left_child_smallest;
    if (d_nodes_temp == NULL) {
      Nodeleft = &d_nodes[left_child_nidx(node_idx)];
      Noderight = &d_nodes[right_child_nidx(node_idx)];
      // NOTE: not per level, even though _temp variable name
      left_child_smallest = &d_left_child_smallest_temp[node_idx];
    } else {
      Nodeleft = &d_nodes_child_temp[blockIdx.x * 2 + 0];
      Noderight = &d_nodes_child_temp[blockIdx.x * 2 + 1];
      left_child_smallest = &d_left_child_smallest_temp[blockIdx.x];
    }

    *Nodeleft =
        Node(split.left_sum,
             CalcGain(gpu_param, split.left_sum.grad, split.left_sum.hess),
             CalcWeight(gpu_param, split.left_sum.grad, split.left_sum.hess));

    *Noderight =
        Node(split.right_sum,
             CalcGain(gpu_param, split.right_sum.grad, split.right_sum.hess),
             CalcWeight(gpu_param, split.right_sum.grad, split.right_sum.hess));

    // Record smallest node
    if (split.left_sum.hess <= split.right_sum.hess) {
      *left_child_smallest = true;
    } else {
      *left_child_smallest = false;
    }
  }
}

#define MIN_BLOCK_THREADS 32
#define CHUNK_BLOCK_THREADS 32
// MAX_BLOCK_THREADS of 1024 is hard-coded maximum block size due
// to CUDA capability 35 and above requirement
// for Maximum number of threads per block
#define MAX_BLOCK_THREADS 1024

void GPUHistBuilder::FindSplit(int depth) {
  // Specialised based on max_bins
  this->FindSplitSpecialize<MIN_BLOCK_THREADS>(depth);
}

template <>
void GPUHistBuilder::FindSplitSpecialize<MAX_BLOCK_THREADS>(int depth) {
  LaunchFindSplit<MAX_BLOCK_THREADS>(depth);
}

template <int BLOCK_THREADS>
void GPUHistBuilder::FindSplitSpecialize(int depth) {
  if (param.max_bin <= BLOCK_THREADS) {
    LaunchFindSplit<BLOCK_THREADS>(depth);
  } else {
    this->FindSplitSpecialize<BLOCK_THREADS + CHUNK_BLOCK_THREADS>(depth);
  }
}

template <int BLOCK_THREADS>
void GPUHistBuilder::LaunchFindSplit(int depth) {
  bool colsample =
      param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;

  int dosimuljob = 1;
  int simuljob = 1;  // whether to do job on single GPU and broadcast (0) or to
                     // do same job on each GPU (1) (could make user parameter,
                     // but too fine-grained maybe)
  int findsplit_shardongpus = 0;  // too expensive generally, disable for now

  if (findsplit_shardongpus) {
    dosimuljob = 0;
    // use power of 2 for split finder because nodes are power of 2 (broadcast
    // result to remaining devices)
    int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
    find_split_n_devices =
        std::min(n_nodes_level(depth), find_split_n_devices);
    int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
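    // Each participating device finds splits for a contiguous block of this
    // level's nodes and also produces the corresponding children, so the
    // child count is sharded with the same device count below.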
    int num_nodes_child_device =
        n_nodes_level(depth + 1) / find_split_n_devices;
    const int GRID_SIZE = num_nodes_device;

    // NOTE: No need to scatter before gather as all devices have same copy of
    // nodes, and within find_split_kernel() nodes_temp is given values from
    // nodes

    // for all nodes (split among devices) find best split per node
    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = d_idx * num_nodes_device;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(),
          nodes_temp[d_idx].data(), nodes_child_temp[d_idx].data(),
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest_temp[d_idx].data(), colsample,
          feature_flags[d_idx].data());
    }

    // nccl only on devices that did split
    dh::synchronize_n_devices(find_split_n_devices, dList);

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
          num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));

      if (depth != param.max_depth) {
        // don't copy over children nodes if no more nodes
        dh::safe_nccl(ncclAllGather(
            reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
            num_nodes_child_device * sizeof(Node) / sizeof(char), ncclChar,
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            find_split_comms[find_split_n_devices - 1][d_idx],
            *(streams[d_idx])));  // Note offset by n_nodes(depth) for
                                  // recvbuff for child nodes
      }

      dh::safe_nccl(ncclAllGather(
          reinterpret_cast<const void*>(
              left_child_smallest_temp[d_idx].data()),
          num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
          reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                  n_nodes(depth - 1)),
          find_split_comms[find_split_n_devices - 1][d_idx],
          *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }

    if (n_devices > find_split_n_devices && n_devices > 1) {
      // if n_devices==1, no need to Bcast
      // if find_split_n_devices==1, this is just a copy operation, else it
      // copies from master to all nodes in case extra devices not involved in
      // split
      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));

        int master_device = dList[0];
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));

        if (depth != param.max_depth) {
          // don't copy over children nodes if no more nodes
          dh::safe_nccl(ncclBcast(
              reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
              n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
              master_device, comms[d_idx], *(streams[d_idx])));
        }

        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                    n_nodes(depth - 1)),
            n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      for (int d_idx = 0; d_idx < n_devices; d_idx++) {
        int device_idx = dList[d_idx];
        dh::safe_cuda(cudaSetDevice(device_idx));
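        // Block on this device's NCCL stream so the broadcast nodes and
        // left_child_smallest flags are visible before later kernels use them.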
        dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
      }
    }
  } else if (simuljob == 0) {
    dosimuljob = 0;
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    int d_idx = 0;
    int master_device = dList[d_idx];
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));

    int nodes_offset_device = d_idx * num_nodes_device;
    find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
        (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
        feature_segments[d_idx].data(), depth, (info->num_col),
        (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
        nodes_offset_device, fidx_min_map[d_idx].data(),
        gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
        left_child_smallest[d_idx].data(), colsample,
        feature_flags[d_idx].data());

    // broadcast result
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      dh::safe_nccl(ncclBcast(
          reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
          n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
          master_device, comms[d_idx], *(streams[d_idx])));

      if (depth != param.max_depth) {
        // don't copy over children nodes if no more nodes
        dh::safe_nccl(ncclBcast(
            reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
            n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
            master_device, comms[d_idx], *(streams[d_idx])));
      }

      dh::safe_nccl(ncclBcast(
          reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
                                  n_nodes(depth - 1)),
          n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
          master_device, comms[d_idx], *(streams[d_idx])));
    }

    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));
      dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
    }
  } else {
    dosimuljob = 1;
  }

  if (dosimuljob) {  // if no NCCL or simuljob==1, do this
    int num_nodes_device = n_nodes_level(depth);
    const int GRID_SIZE = num_nodes_device;

    // all GPUs do same work
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      dh::safe_cuda(cudaSetDevice(device_idx));

      int nodes_offset_device = 0;
      find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
          (const bst_gpair_precise*)(hist_vec[d_idx].GetLevelPtr(depth)),
          feature_segments[d_idx].data(), depth, (info->num_col),
          (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
          nodes_offset_device, fidx_min_map[d_idx].data(),
          gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
          left_child_smallest[d_idx].data(), colsample,
          feature_flags[d_idx].data());
    }
  }

  // NOTE: No need to synchronize with host as all above pure P2P ops or
  // on-device ops
}

void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
  // Perform asynchronous reduction on each gpu
  std::vector<bst_gpair> device_sums(n_devices);
#pragma omp parallel for num_threads(n_devices)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    auto begin = device_gpair[d_idx].tbegin();
    auto end = device_gpair[d_idx].tend();
    bst_gpair init = bst_gpair();
    auto binary_op = thrust::plus<bst_gpair>();
    device_sums[d_idx] = thrust::reduce(begin, end, init, binary_op);
  }

  bst_gpair sum = bst_gpair();
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    sum += device_sums[d_idx];
  }

  // Setup first node so all devices have same first node (here done same on
  // all devices, or could have done one device and Bcast if worried about
  // exact precision issues)
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    auto d_nodes = nodes[d_idx].data();
    auto gpu_param = GPUTrainingParam(param);
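    // One-thread kernel per device: every device writes an identical root
    // node from the globally reduced gradient sum, so no broadcast is needed.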
    dh::launch_n(device_idx, 1, [=] __device__(int idx) {
      bst_gpair sum_gradients = sum;
      d_nodes[idx] =
          Node(sum_gradients,
               CalcGain(gpu_param, sum_gradients.grad, sum_gradients.hess),
               CalcWeight(gpu_param, sum_gradients.grad, sum_gradients.hess));
    });
  }
  // synch all devices to host before moving on (No, can avoid because
  // BuildHist calls another kernel in default stream)
  // dh::synchronize_n_devices(n_devices, dList);
}

void GPUHistBuilder::UpdatePosition(int depth) {
  if (is_dense) {
    this->UpdatePositionDense(depth);
  } else {
    this->UpdatePositionSparse(depth);
  }
}

void GPUHistBuilder::UpdatePositionDense(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    int n_columns = info->num_col;
    size_t begin = device_row_segments[d_idx];
    size_t end = device_row_segments[d_idx + 1];

    dh::launch_n(device_idx, end - begin, [=] __device__(size_t local_idx) {
      int pos = d_position[local_idx];
      if (!is_active(pos, depth)) {
        return;
      }
      Node node = d_nodes[pos];

      if (node.IsLeaf()) {
        return;
      }

      int gidx = d_gidx[local_idx * static_cast<size_t>(n_columns) +
                        static_cast<size_t>(node.split.findex)];

      float fvalue = d_gidx_fvalue_map[gidx];

      if (fvalue <= node.split.fvalue) {
        d_position[local_idx] = left_child_nidx(pos);
      } else {
        d_position[local_idx] = right_child_nidx(pos);
      }
    });
  }
  dh::synchronize_n_devices(n_devices, dList);
  // dh::safe_cuda(cudaDeviceSynchronize());
}

void GPUHistBuilder::UpdatePositionSparse(int depth) {
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];

    auto d_position = position[d_idx].data();
    auto d_position_tmp = position_tmp[d_idx].data();
    Node* d_nodes = nodes[d_idx].data();
    auto d_gidx_feature_map = gidx_feature_map[d_idx].data();
    auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
    auto d_gidx = device_matrix[d_idx].gidx;
    auto d_row_ptr = device_matrix[d_idx].row_ptr.tbegin();

    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];
    size_t element_begin = device_element_segments[d_idx];
    size_t element_end = device_element_segments[d_idx + 1];

    // Update missing direction
    dh::launch_n(device_idx, row_end - row_begin,
                 [=] __device__(int local_idx) {
                   int pos = d_position[local_idx];
                   if (!is_active(pos, depth)) {
                     d_position_tmp[local_idx] = pos;
                     return;
                   }

                   Node node = d_nodes[pos];

                   if (node.IsLeaf()) {
                     d_position_tmp[local_idx] = pos;
                     return;
                   } else if (node.split.missing_left) {
                     d_position_tmp[local_idx] = pos * 2 + 1;
                   } else {
                     d_position_tmp[local_idx] = pos * 2 + 2;
                   }
                 });

    // Update node based on fvalue where exists
    // OPTMARK: This kernel is very inefficient for both compute and memory,
    // dominated by memory dependency / access patterns
    dh::TransformLbs(
        device_idx, &temp_memory[d_idx], element_end - element_begin,
        d_row_ptr, row_end - row_begin, is_dense,
        [=] __device__(size_t local_idx, int local_ridx) {
          int pos = d_position[local_ridx];
          if (!is_active(pos, depth)) {
            return;
          }

          Node node = d_nodes[pos];

          if (node.IsLeaf()) {
            return;
          }

          int gidx = d_gidx[local_idx];
          int findex = d_gidx_feature_map[gidx];  // OPTMARK: slowest global
                                                  // memory access, maybe setup
                                                  // position, gidx, etc. as
                                                  // combined structure?
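          // Only entries belonging to the split feature may override the
          // default (missing) direction written by the kernel above; rows
          // with no stored value for that feature keep it.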
          if (findex == node.split.findex) {
            float fvalue = d_gidx_fvalue_map[gidx];

            if (fvalue <= node.split.fvalue) {
              d_position_tmp[local_ridx] = left_child_nidx(pos);
            } else {
              d_position_tmp[local_ridx] = right_child_nidx(pos);
            }
          }
        });
    position[d_idx] = position_tmp[d_idx];
  }
  dh::synchronize_n_devices(n_devices, dList);
}

void GPUHistBuilder::ColSampleTree() {
  if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;

  feature_set_tree.resize(info->num_col);
  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
  feature_set_tree = col_sample(feature_set_tree, param.colsample_bytree);
}

void GPUHistBuilder::ColSampleLevel() {
  if (param.colsample_bylevel == 1.0 && param.colsample_bytree == 1.0) return;

  feature_set_level.resize(feature_set_tree.size());
  feature_set_level = col_sample(feature_set_tree, param.colsample_bylevel);
  std::vector<int> h_feature_flags(info->num_col, 0);
  for (auto fidx : feature_set_level) {
    h_feature_flags[fidx] = 1;
  }

  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    dh::safe_cuda(cudaSetDevice(device_idx));
    feature_flags[d_idx] = h_feature_flags;
  }
  dh::synchronize_n_devices(n_devices, dList);
}

bool GPUHistBuilder::UpdatePredictionCache(
    const DMatrix* data, std::vector<bst_float>* p_out_preds) {
  std::vector<bst_float>& out_preds = *p_out_preds;

  if (nodes.empty() || !p_last_fmat_ || data != p_last_fmat_) {
    return false;
  }

  if (!prediction_cache_initialised) {
    for (int d_idx = 0; d_idx < n_devices; d_idx++) {
      int device_idx = dList[d_idx];
      size_t row_begin = device_row_segments[d_idx];
      size_t row_end = device_row_segments[d_idx + 1];
      prediction_cache[d_idx].copy(out_preds.begin() + row_begin,
                                   out_preds.begin() + row_end);
    }
    prediction_cache_initialised = true;
  }
  dh::synchronize_n_devices(n_devices, dList);

  float eps = param.learning_rate;
  for (int d_idx = 0; d_idx < n_devices; d_idx++) {
    int device_idx = dList[d_idx];
    size_t row_begin = device_row_segments[d_idx];
    size_t row_end = device_row_segments[d_idx + 1];

    auto d_nodes = nodes[d_idx].data();
    auto d_position = position[d_idx].data();
    auto d_prediction_cache = prediction_cache[d_idx].data();

    dh::launch_n(device_idx, prediction_cache[d_idx].size(),
                 [=] __device__(int local_idx) {
                   int pos = d_position[local_idx];
                   d_prediction_cache[local_idx] += d_nodes[pos].weight * eps;
                 });

    thrust::copy(prediction_cache[d_idx].tbegin(),
                 prediction_cache[d_idx].tend(), &out_preds[row_begin]);
  }
  dh::synchronize_n_devices(n_devices, dList);

  return true;
}

void GPUHistBuilder::Update(const std::vector<bst_gpair>& gpair,
                            DMatrix* p_fmat, RegTree* p_tree) {
  dh::Timer time0;

  this->InitData(gpair, *p_fmat, *p_tree);
  this->InitFirstNode(gpair);
  this->ColSampleTree();

  for (int depth = 0; depth < param.max_depth; depth++) {
    this->ColSampleLevel();
    this->BuildHist(depth);
    this->FindSplit(depth);
    this->UpdatePosition(depth);
  }

  // done with multi-GPU, pass back result from master to tree on host
  int master_device = dList[0];
  dh::safe_cuda(cudaSetDevice(master_device));
  dense2sparse_tree(p_tree, nodes[0].tbegin(), nodes[0].tend(), param);

  gpu_time += time0.elapsedSeconds();

  if (param.debug_verbose) {
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative GPU Time excluding initial time "
                 << (gpu_time - gpu_init_time) << " sec";
    fflush(stdout);
  }

  if (param.debug_verbose) {
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time "
                 << cpu_time.elapsedSeconds() << " sec";
    LOG(CONSOLE) << "[GPU Plug-in] Cumulative CPU Time excluding initial time "
                 << (cpu_time.elapsedSeconds() - cpu_init_time - gpu_time)
                 << " sec";
    fflush(stdout);
  }
}

}  // namespace tree
} // namespace xgboost