/*!
 * Copyright 2017-2018 XGBoost contributors
 */
#include <xgboost/tree_updater.h>
#include <utility>
#include <vector>
#include <limits>
#include <string>
#include "../common/common.h"
#include "param.h"
#include "updater_gpu_common.cuh"

namespace xgboost {
namespace tree {

DMLC_REGISTRY_FILE_TAG(updater_gpu);

template <typename GradientPairT>
XGBOOST_DEVICE float inline LossChangeMissing(const GradientPairT& scan,
                                              const GradientPairT& missing,
                                              const GradientPairT& parent_sum,
                                              const float& parent_gain,
                                              const GPUTrainingParam& param,
                                              bool& missing_left_out) {  // NOLINT
  // Put gradients of missing values to left
  float missing_left_loss =
      DeviceCalcLossChange(param, scan + missing, parent_sum, parent_gain);
  float missing_right_loss =
      DeviceCalcLossChange(param, scan, parent_sum, parent_gain);
  if (missing_left_loss >= missing_right_loss) {
    missing_left_out = true;
    return missing_left_loss;
  } else {
    missing_left_out = false;
    return missing_right_loss;
  }
}

/**
 * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
 * @param tid the index of the element that this thread should access
 * @param abs the array of absolute IDs
 * @param colIds the array of column IDs for each element
 * @param nodeStart the start of the node ID at this level
 * @param nKeys number of nodes at this level.
 * @return the unique key
 */
static HOST_DEV_INLINE NodeIdT Abs2UniqueKey(int tid,
                                             common::Span<const NodeIdT> abs,
                                             common::Span<const int> colIds,
                                             NodeIdT nodeStart, int nKeys) {
  int a = abs[tid];
  if (a == kUnusedNode) return a;
  return ((a - nodeStart) + (colIds[tid] * nKeys));
}

/**
 * @struct Pair
 * @brief Pair used for key-based scan operations on GradientPair
 */
struct Pair {
  int key;
  GradientPair value;
};

/** define a key that's not used at all in the entire boosting process */
static const int kNoneKey = -100;

/**
 * @brief Allocate temporary buffers needed for scan operations
 * @param tmpScans gradient buffer
 * @param tmpKeys keys buffer
 * @param size number of elements that will be scanned
 */
template <int BLKDIM_L1L3 = 256>
int ScanTempBufferSize(int size) {
  int num_blocks = dh::DivRoundUp(size, BLKDIM_L1L3);
  return num_blocks;
}

struct AddByKey {
  template <typename T>
  HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
    T result;
    if (first.key == second.key) {
      result.key = first.key;
      result.value = first.value + second.value;
    } else {
      result.key = second.key;
      result.value = second.value;
    }
    return result;
  }
};

/**
 * @brief Gradient value getter function
 * @param id the index into the vals or instIds array to which to fetch
 * @param vals the gradient value buffer
 * @param instIds instance index buffer
 * @return the expected gradient value
 */
HOST_DEV_INLINE GradientPair Get(int id,
                                 common::Span<const GradientPair> vals,
                                 common::Span<const int> instIds) {
  id = instIds[id];
  return vals[id];
}

template <int BLKDIM_L1L3>
__global__ void CubScanByKeyL1(common::Span<GradientPair> scans,
                               common::Span<const GradientPair> vals,
                               common::Span<const int> instIds,
                               common::Span<GradientPair> mScans,
                               common::Span<int> mKeys,
                               common::Span<const NodeIdT> keys,
                               int nUniqKeys,
                               common::Span<const int> colIds,
                               NodeIdT nodeStart, const int size) {
  Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
  int myKey;
  GradientPair myValue;
  using BlockScan = cub::BlockScan<Pair, BLKDIM_L1L3>;
  __shared__ typename BlockScan::TempStorage temp_storage;
  Pair threadData;
  int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
  if (tid < size) {
    myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
    myValue = Get(tid, vals, instIds);
  } else {
    myKey = kNoneKey;
    myValue = {};
  }
  threadData.key = myKey;
  threadData.value = myValue;
  // get previous key, especially needed for the last thread in this block
  // in order to pass on the partial scan values.
  // this statement MUST appear before the checks below!
  // else, the result of this shuffle operation will be undefined
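  // Note: __shfl_up_sync (CUDA 9+) / __shfl_up return the key held by the
  // lane one position below within the warp (lanes with no lower neighbour
  // keep their own key). Only the last thread of the block consumes
  // previousKey: it decides whether its exclusive-scan partial belongs to the
  // same key segment as its own element and may be carried to the next level.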
#if (__CUDACC_VER_MAJOR__ >= 9)
  int previousKey = __shfl_up_sync(0xFFFFFFFF, myKey, 1);
#else
  int previousKey = __shfl_up(myKey, 1);
#endif
  // Collectively compute the block-wide exclusive prefix sum
  BlockScan(temp_storage)
      .ExclusiveScan(threadData, threadData, rootPair, AddByKey());
  if (tid < size) {
    scans[tid] = threadData.value;
  } else {
    return;
  }
  if (threadIdx.x == BLKDIM_L1L3 - 1) {
    threadData.value =
        (myKey == previousKey) ? threadData.value : GradientPair(0.0f, 0.0f);
    mKeys[blockIdx.x] = myKey;
    mScans[blockIdx.x] = threadData.value + myValue;
  }
}

template <int BLKSIZE>
__global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
                               common::Span<int> mKeys, int mLength) {
  using BlockScan = cub::BlockScan<Pair, BLKSIZE>;
  Pair threadData;
  __shared__ typename BlockScan::TempStorage temp_storage;
  for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
    threadData.key = mKeys[i];
    threadData.value = mScans[i];
    BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
    mScans[i] = threadData.value;
    __syncthreads();
  }
}

template <int BLKDIM_L1L3>
__global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
                               common::Span<GradientPair> scans,
                               common::Span<const GradientPair> vals,
                               common::Span<const int> instIds,
                               common::Span<GradientPair> mScans,
                               common::Span<int> mKeys,
                               common::Span<const NodeIdT> keys,
                               int nUniqKeys,
                               common::Span<const int> colIds,
                               NodeIdT nodeStart, const int size) {
  int relId = threadIdx.x;
  int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
  // to avoid the following warning from nvcc:
  //   __shared__ memory variable with non-empty constructor or destructor
  //   (potential race between threads)
  __shared__ char gradBuff[sizeof(GradientPair)];
  __shared__ int s_mKeys;
  GradientPair* s_mScans = reinterpret_cast<GradientPair*>(gradBuff);
  if (tid >= size) return;
  // cache block-wide partial scan info
  if (relId == 0) {
    s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
    s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
  }
  int myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
  int previousKey =
      tid == 0 ? kNoneKey
               : Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
  GradientPair my_value = scans[tid];
  __syncthreads();
  if (blockIdx.x > 0 && s_mKeys == previousKey) {
    my_value += s_mScans[0];
  }
  if (tid == size - 1) {
    sums[previousKey] = my_value + Get(tid, vals, instIds);
  }
  if ((previousKey != myKey) && (previousKey >= 0)) {
    sums[previousKey] = my_value;
    my_value = GradientPair(0.0f, 0.0f);
  }
  scans[tid] = my_value;
}
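// The three CubScanByKey kernels above implement a segmented scan as a
// "cub-pyramid":
//   L1: each block computes a block-local exclusive scan by key and writes
//       its per-block carry (last key + partial sum) into mKeys/mScans.
//   L2: a single block scans those per-block carries so every block knows the
//       prefix contributed by earlier blocks that share its leading key.
//   L3: each block folds its carry-in back into the local scan, and the last
//       element of every key segment writes the segment total into `sums`.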
/**
 * @brief Performs fused reduce and scan by key functionality. It is assumed
 * that the keys occur contiguously!
 * @param sums the output gradient reductions for each element performed
 * key-wise
 * @param scans the output gradient scans for each element performed key-wise
 * @param vals the gradients evaluated for each observation.
 * @param instIds instance ids for each element
 * @param keys keys to be used to segment the reductions. They need not occur
 * contiguously in contrast to scan_by_key. Currently, we need one key per
 * value in the 'vals' array.
 * @param size number of elements in the 'vals' array
 * @param nUniqKeys max number of unique keys found per column
 * @param nCols number of columns
 * @param tmpScans temporary scan buffer needed for cub-pyramid algo
 * @param tmpKeys temporary key buffer needed for cub-pyramid algo
 * @param colIds column indices for each element in the array
 * @param nodeStart index of the leftmost node in the current level
 */
template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
void ReduceScanByKey(common::Span<GradientPair> sums,
                     common::Span<GradientPair> scans,
                     common::Span<GradientPair> vals,
                     common::Span<int> instIds,
                     common::Span<NodeIdT> keys, int size, int nUniqKeys,
                     int nCols, common::Span<GradientPair> tmpScans,
                     common::Span<int> tmpKeys,
                     common::Span<int> colIds, NodeIdT nodeStart) {
  int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
  cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
  CubScanByKeyL1<BLKDIM_L1L3>
      <<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
                               nUniqKeys, colIds, nodeStart, size);
  CubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
  CubScanByKeyL3<BLKDIM_L1L3>
      <<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
                               keys, nUniqKeys, colIds, nodeStart, size);
}

/**
 * @struct ExactSplitCandidate
 * @brief Abstraction of a possible split in the decision tree
 */
struct ExactSplitCandidate {
  /** the optimal gain score for this node */
  float score;
  /** index where to split in the DMatrix */
  int index;

  HOST_DEV_INLINE ExactSplitCandidate() : score{-FLT_MAX}, index{INT_MAX} {}

  /**
   * @brief Whether the split info is valid to be used to create a new child
   * @param minSplitLoss minimum score above which decision to split is made
   * @return true if splittable, else false
   */
  HOST_DEV_INLINE bool IsSplittable(float minSplitLoss) const {
    return ((score >= minSplitLoss) && (index != INT_MAX));
  }
};

/**
 * @enum ArgMaxByKeyAlgo best_split_evaluation.cuh
 * @brief Help decide which algorithm to use for multi-argmax operation
 */
enum ArgMaxByKeyAlgo {
  /** simplest, use gmem-atomics for all updates */
  kAbkGmem = 0,
  /** use smem-atomics for updates (when number of keys are less) */
  kAbkSmem
};

/** max depth until which to use shared mem based atomics for argmax */
static const int kMaxAbkLevels = 3;

HOST_DEV_INLINE ExactSplitCandidate MaxSplit(ExactSplitCandidate a,
                                             ExactSplitCandidate b) {
  ExactSplitCandidate out;
  if (a.score < b.score) {
    out.score = b.score;
    out.index = b.index;
  } else if (a.score == b.score) {
    out.score = a.score;
    out.index = (a.index < b.index) ? a.index : b.index;
  } else {
    out.score = a.score;
    out.index = a.index;
  }
  return out;
}

DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
                             ExactSplitCandidate val) {
  unsigned long long* intAddress =
      reinterpret_cast<unsigned long long*>(address);  // NOLINT
  unsigned long long old = *intAddress;                // NOLINT
  unsigned long long assumed = old;                    // NOLINT
  do {
    assumed = old;
    ExactSplitCandidate res =
        MaxSplit(val, *reinterpret_cast<ExactSplitCandidate*>(&assumed));
    old = atomicCAS(intAddress, assumed,
                    *reinterpret_cast<unsigned long long*>(&res));
  } while (assumed != old);
}

DEV_INLINE void ArgMaxWithAtomics(
    int id, common::Span<ExactSplitCandidate> nodeSplits,
    common::Span<const GradientPair> gradScans,
    common::Span<const GradientPair> gradSums,
    common::Span<const float> vals, common::Span<const int> colIds,
    common::Span<const NodeIdT> nodeAssigns,
    common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
    NodeIdT nodeStart, int len, const GPUTrainingParam& param) {
  int nodeId = nodeAssigns[id];
  // @todo: this is really a bad check! but will be fixed when we move
  // to key-based reduction
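  // Only the first element of a run of identical (node, column, feature value)
  // triples evaluates a split candidate: a split cannot separate equal feature
  // values, so later duplicates would only propose redundant thresholds.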
  if ((id == 0) ||
      !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
        (vals[id] == vals[id - 1]))) {
    if (nodeId != kUnusedNode) {
      int sumId = Abs2UniqueKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
      GradientPair colSum = gradSums[sumId];
      int uid = nodeId - nodeStart;
      DeviceNodeStats node_stat = nodes[nodeId];
      GradientPair parentSum = node_stat.sum_gradients;
      float parentGain = node_stat.root_gain;
      bool tmp;
      ExactSplitCandidate s;
      GradientPair missing = parentSum - colSum;
      s.score = LossChangeMissing(gradScans[id], missing, parentSum,
                                  parentGain, param, tmp);
      s.index = id;
      AtomicArgMax(&nodeSplits[uid], s);
    }  // end if nodeId != UNUSED_NODE
  }  // end if id == 0 ...
}

__global__ void AtomicArgMaxByKeyGmem(
    common::Span<ExactSplitCandidate> nodeSplits,
    common::Span<const GradientPair> gradScans,
    common::Span<const GradientPair> gradSums,
    common::Span<const float> vals, common::Span<const int> colIds,
    common::Span<const NodeIdT> nodeAssigns,
    common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
    NodeIdT nodeStart, int len, const TrainParam param) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < len; id += stride) {
    ArgMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
                      nodeAssigns, nodes, nUniqKeys, nodeStart, len,
                      GPUTrainingParam(param));
  }
}

__global__ void AtomicArgMaxByKeySmem(
    common::Span<ExactSplitCandidate> nodeSplits,
    common::Span<const GradientPair> gradScans,
    common::Span<const GradientPair> gradSums,
    common::Span<const float> vals, common::Span<const int> colIds,
    common::Span<const NodeIdT> nodeAssigns,
    common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
    NodeIdT nodeStart, int len, const GPUTrainingParam param) {
  extern __shared__ char sArr[];
  common::Span<ExactSplitCandidate> sNodeSplits =
      common::Span<ExactSplitCandidate>(
          reinterpret_cast<ExactSplitCandidate*>(sArr),
          static_cast<common::Span<ExactSplitCandidate>::index_type>(
              nUniqKeys * sizeof(ExactSplitCandidate)));
  int tid = threadIdx.x;
  ExactSplitCandidate defVal;
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    sNodeSplits[i] = defVal;
  }
  __syncthreads();
  int id = tid + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < len; id += stride) {
    ArgMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
  }
  __syncthreads();
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    ExactSplitCandidate s = sNodeSplits[i];
    AtomicArgMax(&nodeSplits[i], s);
  }
}

/**
 * @brief Performs argmax_by_key functionality but for cases when keys need not
 * occur contiguously
 * @param nodeSplits will contain information on best split for each node
 * @param gradScans exclusive sum on sorted segments for each col
 * @param gradSums gradient sum for each column in DMatrix based on node-ids
 * @param vals feature values
 * @param colIds column index for each element in the feature values array
 * @param nodeAssigns node-id assignments to each element in DMatrix
 * @param nodes pointer to all nodes for this tree in BFS order
 * @param nUniqKeys number of unique node-ids in this level
 * @param nodeStart start index of the node-ids in this level
 * @param len number of elements
 * @param param training parameters
 * @param algo which algorithm to use for argmax_by_key
 */
template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
                 common::Span<const GradientPair> gradScans,
                 common::Span<const GradientPair> gradSums,
                 common::Span<const float> vals,
                 common::Span<const int> colIds,
                 common::Span<const NodeIdT> nodeAssigns,
                 common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
                 NodeIdT nodeStart, int len, const TrainParam param,
                 ArgMaxByKeyAlgo algo, GPUSet const& devices) {
  dh::FillConst(*(devices.begin()), nodeSplits.data(), nUniqKeys,
                ExactSplitCandidate());
  int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
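  // kAbkSmem first reduces into a per-block shared-memory copy of the split
  // candidates and only then merges into global memory, which pays off while
  // the number of node keys per level (nUniqKeys) is small; kAbkGmem updates
  // the global candidates directly. The caller switches to kAbkGmem beyond
  // kMaxAbkLevels.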
  switch (algo) {
    case kAbkGmem:
      AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
          nUniqKeys, nodeStart, len, param);
      break;
    case kAbkSmem:
      AtomicArgMaxByKeySmem<<<nBlks, BLKDIM,
                              sizeof(ExactSplitCandidate) * nUniqKeys>>>(
          nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
          nUniqKeys, nodeStart, len, GPUTrainingParam(param));
      break;
    default:
      throw std::runtime_error("argMaxByKey: Bad algo passed!");
  }
}

__global__ void AssignColIds(int* colIds, const int* colOffsets) {
  int myId = blockIdx.x;
  int start = colOffsets[myId];
  int end = colOffsets[myId + 1];
  for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
    colIds[id] = myId;
  }
}

__global__ void FillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
                                   const DeviceNodeStats* nodes, int n_rows) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  if (id >= n_rows) {
    return;
  }
  // if this element belongs to none of the currently active node-id's
  NodeIdT nId = nodeIdsPerInst[id];
  if (nId == kUnusedNode) {
    return;
  }
  const DeviceNodeStats n = nodes[nId];
  NodeIdT result;
  if (n.IsLeaf() || n.IsUnused()) {
    result = kUnusedNode;
  } else if (n.dir == kLeftDir) {
    result = (2 * n.idx) + 1;
  } else {
    result = (2 * n.idx) + 2;
  }
  nodeIdsPerInst[id] = result;
}

__global__ void AssignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
                              const NodeIdT* nodeIds, const int* instId,
                              const DeviceNodeStats* nodes,
                              const int* colOffsets, const float* vals,
                              int nVals, int nCols) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < nVals; id += stride) {
    // fusing generation of indices for node locations
    nodeLocations[id] = id;
    // using nodeIds here since the previous kernel would have updated
    // the nodeIdsPerInst with all default assignments
    int nId = nodeIds[id];
    // if this element belongs to none of the currently active node-id's
    if (nId != kUnusedNode) {
      const DeviceNodeStats n = nodes[nId];
      int colId = n.fidx;
      // printf("nid=%d colId=%d id=%d\n", nId, colId, id);
      int start = colOffsets[colId];
      int end = colOffsets[colId + 1];
      // @todo: too many wasteful threads!!
      if ((id >= start) && (id < end) && !(n.IsLeaf() || n.IsUnused())) {
        NodeIdT result = (2 * n.idx) + 1 + (vals[id] >= n.fvalue);
        nodeIdsPerInst[instId[id]] = result;
      }
    }
  }
}

__global__ void MarkLeavesKernel(DeviceNodeStats* nodes, int len) {
  int id = (blockIdx.x * blockDim.x) + threadIdx.x;
  if ((id < len) && !nodes[id].IsUnused()) {
    int lid = (id << 1) + 1;
    int rid = (id << 1) + 2;
    if ((lid >= len) || (rid >= len)) {
      nodes[id].root_gain = -FLT_MAX;  // bottom-most nodes
    } else if (nodes[lid].IsUnused() && nodes[rid].IsUnused()) {
      nodes[id].root_gain = -FLT_MAX;  // unused child nodes
    }
  }
}
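// The kernels above and GPUMaker below operate on a dense, BFS-ordered array
// of nodes: the children of node i live at indices 2*i+1 and 2*i+2, so a tree
// of depth max_depth occupies 2^(max_depth+1) - 1 slots whether or not every
// node is actually used. Dense2SparseTree converts this layout back into a
// RegTree at the end of UpdateTree.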
class GPUMaker : public TreeUpdater {
 protected:
  TrainParam param_;
  /** whether we have initialized memory already (so as not to repeat!) */
  bool allocated_;
  /** feature values stored in column-major compressed format */
  dh::DoubleBuffer<float> vals_;
  common::Span<float> vals_cached_;
  /** corresponding instance id's of these feature values */
  dh::DoubleBuffer<int> instIds_;
  common::Span<int> inst_ids_cached_;
  /** column offsets for these feature values */
  common::Span<int> colOffsets_;
  common::Span<GradientPair> gradsInst_;
  dh::DoubleBuffer<NodeIdT> nodeAssigns_;
  dh::DoubleBuffer<int> nodeLocations_;
  common::Span<DeviceNodeStats> nodes_;
  common::Span<NodeIdT> node_assigns_per_inst_;
  common::Span<GradientPair> gradsums_;
  common::Span<GradientPair> gradscans_;
  common::Span<ExactSplitCandidate> nodeSplits_;
  int n_vals_;
  int n_rows_;
  int n_cols_;
  int maxNodes_;
  int maxLeaves_;

  // devices are only used for sharding the HostDeviceVector passed as a
  // parameter; the algorithm works with a single GPU only
  GPUSet devices_;

  dh::CubMemory tmp_mem_;
  common::Span<GradientPair> tmpScanGradBuff_;
  common::Span<int> tmp_scan_key_buff_;
  common::Span<int> colIds_;
  dh::BulkAllocator ba_;

 public:
  GPUMaker() : allocated_{false} {}
  ~GPUMaker() override = default;

  void Init(
      const std::vector<std::pair<std::string, std::string>>& args) override {
    param_.InitAllowUnknown(args);
    maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
    maxLeaves_ = 1 << param_.max_depth;
    devices_ = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
  }

  void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
              const std::vector<RegTree*>& trees) override {
    // rescale learning rate according to size of trees
    float lr = param_.learning_rate;
    param_.learning_rate = lr / trees.size();
    gpair->Shard(devices_);
    try {
      // build tree
      for (auto tree : trees) {
        UpdateTree(gpair, dmat, tree);
      }
    } catch (const std::exception& e) {
      LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
    }
    param_.learning_rate = lr;
  }

  /// @note: Update should be only after Init!!
  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                  RegTree* hTree) {
    if (!allocated_) {
      SetupOneTimeData(dmat);
    }
    for (int i = 0; i < param_.max_depth; ++i) {
      if (i == 0) {
        // make sure to start on a fresh tree with sorted values!
        dh::CopyDeviceSpan(vals_.CurrentSpan(), vals_cached_);
        dh::CopyDeviceSpan(instIds_.CurrentSpan(), inst_ids_cached_);
        TransferGrads(gpair);
      }
      int nNodes = 1 << i;
      NodeIdT nodeStart = nNodes - 1;
      InitNodeData(i, nodeStart, nNodes);
      FindSplit(i, nodeStart, nNodes);
    }
    // mark all the used nodes with unused children as leaf nodes
    MarkLeaves();
    Dense2SparseTree(hTree, nodes_, param_);
  }

  void Split2Node(int nNodes, NodeIdT nodeStart) {
    auto d_nodes = nodes_;
    auto d_gradScans = gradscans_;
    auto d_gradsums = gradsums_;
    auto d_nodeAssigns = nodeAssigns_.CurrentSpan();
    auto d_colIds = colIds_;
    auto d_vals = vals_.Current();
    auto d_nodeSplits = nodeSplits_.data();
    int nUniqKeys = nNodes;
    float min_split_loss = param_.min_split_loss;
    auto gpu_param = GPUTrainingParam(param_);

    dh::LaunchN(*(devices_.begin()), nNodes, [=] __device__(int uid) {
      int absNodeId = uid + nodeStart;
      ExactSplitCandidate s = d_nodeSplits[uid];
      if (s.IsSplittable(min_split_loss)) {
        int idx = s.index;
        int nodeInstId =
            Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
        bool missingLeft = true;
        const DeviceNodeStats& n = d_nodes[absNodeId];
        GradientPair gradScan = d_gradScans[idx];
        GradientPair gradSum = d_gradsums[nodeInstId];
        float thresh = d_vals[idx];
        int colId = d_colIds[idx];
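        // ExactSplitCandidate only records the gain and the element index, so
        // the default direction for missing values is recovered here by
        // re-evaluating LossChangeMissing; only its missingLeft output is
        // used, the returned gain is discarded.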
        // get the default direction for the current node
        GradientPair missing = n.sum_gradients - gradSum;
        LossChangeMissing(gradScan, missing, n.sum_gradients, n.root_gain,
                          gpu_param, missingLeft);
        // get the score/weight/id/gradSum for left and right child nodes
        GradientPair lGradSum = missingLeft ? gradScan + missing : gradScan;
        GradientPair rGradSum = n.sum_gradients - lGradSum;

        // Create children
        d_nodes[LeftChildNodeIdx(absNodeId)] =
            DeviceNodeStats(lGradSum, LeftChildNodeIdx(absNodeId), gpu_param);
        d_nodes[RightChildNodeIdx(absNodeId)] =
            DeviceNodeStats(rGradSum, RightChildNodeIdx(absNodeId), gpu_param);
        // Set split for parent
        d_nodes[absNodeId].SetSplit(thresh, colId,
                                    missingLeft ? kLeftDir : kRightDir,
                                    lGradSum, rGradSum);
      } else {
        // cannot be split further, so this node is a leaf!
        d_nodes[absNodeId].root_gain = -FLT_MAX;
      }
    });
  }

  void FindSplit(int level, NodeIdT nodeStart, int nNodes) {
    ReduceScanByKey(gradsums_, gradscans_, gradsInst_,
                    instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(),
                    n_vals_, nNodes, n_cols_, tmpScanGradBuff_,
                    tmp_scan_key_buff_, colIds_, nodeStart);
    auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
    ArgMaxByKey(nodeSplits_, gradscans_, gradsums_, vals_.CurrentSpan(),
                colIds_, nodeAssigns_.CurrentSpan(), nodes_, nNodes,
                nodeStart, n_vals_, param_,
                level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem, devices);
    Split2Node(nNodes, nodeStart);
  }

  void AllocateAllData(int offsetSize) {
    int tmpBuffSize = ScanTempBufferSize(n_vals_);
    ba_.Allocate(*(devices_.begin()),
                 &vals_, n_vals_,
                 &vals_cached_, n_vals_,
                 &instIds_, n_vals_,
                 &inst_ids_cached_, n_vals_,
                 &colOffsets_, offsetSize,
                 &gradsInst_, n_rows_,
                 &nodeAssigns_, n_vals_,
                 &nodeLocations_, n_vals_,
                 &nodes_, maxNodes_,
                 &node_assigns_per_inst_, n_rows_,
                 &gradsums_, maxLeaves_ * n_cols_,
                 &gradscans_, n_vals_,
                 &nodeSplits_, maxLeaves_,
                 &tmpScanGradBuff_, tmpBuffSize,
                 &tmp_scan_key_buff_, tmpBuffSize,
                 &colIds_, n_vals_);
  }

  void SetupOneTimeData(DMatrix* dmat) {
    if (!dmat->SingleColBlock()) {
      LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
    }
    std::vector<float> fval;
    std::vector<int> fId;
    std::vector<int> offset;
    ConvertToCsc(dmat, &fval, &fId, &offset);
    AllocateAllData(static_cast<int>(offset.size()));
    TransferAndSortData(fval, fId, offset);
    allocated_ = true;
  }

  void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval,
                    std::vector<int>* fId, std::vector<int>* offset) {
    const MetaInfo& info = dmat->Info();
    CHECK(info.num_col_ < std::numeric_limits<int>::max());
    CHECK(info.num_row_ < std::numeric_limits<int>::max());
    n_rows_ = static_cast<int>(info.num_row_);
    n_cols_ = static_cast<int>(info.num_col_);
    offset->reserve(n_cols_ + 1);
    offset->push_back(0);
    fval->reserve(n_cols_ * n_rows_);
    fId->reserve(n_cols_ * n_rows_);
    // in case you end up with a DMatrix having no column access
    // then make sure to enable that before copying the data!
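    // Illustrative layout (hypothetical 2-column DMatrix with 3 and 2
    // non-missing entries):
    //   fval   = {v00, v01, v02, v10, v11}   feature values, column-major
    //   fId    = {r00, r01, r02, r10, r11}   row index of each value
    //   offset = {0, 3, 5}                   per-column bounds into fval/fId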
    for (const auto& batch : dmat->GetSortedColumnBatches()) {
      for (int i = 0; i < batch.Size(); i++) {
        auto col = batch[i];
        for (const Entry& e : col) {
          int inst_id = static_cast<int>(e.index);
          fval->push_back(e.fvalue);
          fId->push_back(inst_id);
        }
        offset->push_back(static_cast<int>(fval->size()));
      }
    }
    CHECK(fval->size() < std::numeric_limits<int>::max());
    n_vals_ = static_cast<int>(fval->size());
  }

  void TransferAndSortData(const std::vector<float>& fval,
                           const std::vector<int>& fId,
                           const std::vector<int>& offset) {
    dh::CopyVectorToDeviceSpan(vals_.CurrentSpan(), fval);
    dh::CopyVectorToDeviceSpan(instIds_.CurrentSpan(), fId);
    dh::CopyVectorToDeviceSpan(colOffsets_, offset);
    dh::SegmentedSort(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_,
                      colOffsets_);
    dh::CopyDeviceSpan(vals_cached_, vals_.CurrentSpan());
    dh::CopyDeviceSpan(inst_ids_cached_, instIds_.CurrentSpan());
    AssignColIds<<<n_cols_, 512>>>(colIds_.data(), colOffsets_.data());
  }

  void TransferGrads(HostDeviceVector<GradientPair>* gpair) {
    gpair->GatherTo(
        thrust::device_pointer_cast(gradsInst_.data()),
        thrust::device_pointer_cast(gradsInst_.data() + gradsInst_.size()));
    // evaluate the full-grad reduction for the root node
    dh::SumReduction(tmp_mem_, gradsInst_, gradsums_, n_rows_);
  }

  void InitNodeData(int level, NodeIdT nodeStart, int nNodes) {
    // all instances belong to root node at the beginning!
    if (level == 0) {
      thrust::fill(thrust::device_pointer_cast(nodes_.data()),
                   thrust::device_pointer_cast(nodes_.data() + nodes_.size()),
                   DeviceNodeStats());
      thrust::fill(thrust::device_pointer_cast(nodeAssigns_.Current()),
                   thrust::device_pointer_cast(nodeAssigns_.Current() +
                                               nodeAssigns_.Size()),
                   0);
      thrust::fill(
          thrust::device_pointer_cast(node_assigns_per_inst_.data()),
          thrust::device_pointer_cast(node_assigns_per_inst_.data() +
                                      node_assigns_per_inst_.size()),
          0);
      // for root node, just update the gradient/score/weight/id info
      // before splitting it! Currently all data is on GPU, hence this
      // stupid little kernel
      auto d_nodes = nodes_;
      auto d_sums = gradsums_;
      auto gpu_params = GPUTrainingParam(param_);
      dh::LaunchN(*(devices_.begin()), 1, [=] __device__(int idx) {
        d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
      });
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
      int nBlks = dh::DivRoundUp(n_rows_, BlkDim);
      FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.data(),
                                            nodes_.data(), n_rows_);
      // evaluate the correct child indices of non-missing values next
      nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
      AssignNodeIds<<<nBlks, BlkDim>>>(
          node_assigns_per_inst_.data(), nodeLocations_.Current(),
          nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
          colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
      // gather the node assignments across all other columns too
      dh::Gather(*(devices_.begin()), nodeAssigns_.Current(),
                 node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
      SortKeys(level);
    }
  }

  void SortKeys(int level) {
    // segmented-sort the arrays based on node-id's
    // but we don't need more than level+1 bits for sorting!
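    // node ids at level L fall in [2^L - 1, 2^(L+1) - 2], so the sort keys fit
    // into the lowest level+1 bits and the radix sort can be restricted to
    // that bit range.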
    SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
                  colOffsets_, 0, level + 1);
    dh::Gather(*(devices_.begin()), vals_.other(), vals_.Current(),
               instIds_.other(), instIds_.Current(), nodeLocations_.Current(),
               n_vals_);
    vals_.buff.selector ^= 1;
    instIds_.buff.selector ^= 1;
  }

  void MarkLeaves() {
    const int BlkDim = 128;
    int nBlks = dh::DivRoundUp(maxNodes_, BlkDim);
    MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.data(), maxNodes_);
  }
};

XGBOOST_REGISTER_TREE_UPDATER(GPUMaker, "grow_gpu")
    .describe("Grow tree with GPU.")
    .set_body([]() {
      LOG(WARNING) << "The gpu_exact tree method is deprecated and may be "
                      "removed in a future version.";
      return new GPUMaker();
    });

}  // namespace tree
}  // namespace xgboost