Use Span in GPU exact updater. (#4020)

* Use Span in GPU exact updater.

* Add a small test.
Jiaming Yuan 2018-12-26 12:44:46 +08:00 committed by GitHub
parent 7735252925
commit 9897b5042f
4 changed files with 180 additions and 84 deletions
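
For context: `xgboost::common::Span` (modeled on `gsl::span`) is a non-owning view pairing a raw pointer with an element count, so the kernels below receive each buffer's extent together with its address instead of a bare pointer plus a separately threaded length. A minimal sketch of the idea — illustrative only, not xgboost's actual implementation:

    #include <cstddef>

    // SpanSketch is a stand-in for xgboost::common::Span: a pointer plus an
    // element count, copyable by value into CUDA kernels. All names here are
    // illustrative.
    template <typename T>
    class SpanSketch {
     public:
      using index_type = std::size_t;

      __host__ __device__ SpanSketch(T* data, index_type size)
          : data_{data}, size_{size} {}

      __host__ __device__ T& operator[](index_type i) const { return data_[i]; }
      __host__ __device__ T* data() const { return data_; }
      __host__ __device__ index_type size() const { return size_; }

     private:
      T* data_;          // not owned
      index_type size_;  // element count, not bytes
    };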

src/common/device_helpers.cuh

@@ -378,6 +378,11 @@ class DVec2 {
   DVec<T> &D2() { return d2_; }

   T *Current() { return buff_.Current(); }
+
+  xgboost::common::Span<T> CurrentSpan() {
+    return xgboost::common::Span<T>{
+        buff_.Current(),
+        static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
+  }

   DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
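
A hedged usage sketch of the new accessor pattern (toy kernel building on the SpanSketch above; `DVec2` itself is not reproduced here):

    #include <cuda_runtime.h>

    // The span is passed to the kernel by value; the kernel recovers the
    // length from the span itself rather than from a separate argument.
    __global__ void FillKernel(SpanSketch<int> out, int value) {
      std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < out.size()) out[i] = value;
    }

    int main() {
      std::size_t n = 256;
      int* d_buf = nullptr;
      cudaMalloc(&d_buf, n * sizeof(int));
      SpanSketch<int> span{d_buf, n};   // the CurrentSpan() analogue
      FillKernel<<<1, 256>>>(span, 7);
      cudaDeviceSynchronize();
      cudaFree(d_buf);
      return 0;
    }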
@@ -791,7 +796,7 @@ typename std::iterator_traits<T>::value_type SumReduction(
 template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
 void FillConst(int device_idx, T *out, int len, T def) {
   dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, len,
                                       [=] __device__(int i) { out[i] = def; });
 }

 /**

src/tree/updater_gpu.cu

@@ -1,9 +1,12 @@
 /*!
- * Copyright 2017 XGBoost contributors
+ * Copyright 2017-2018 XGBoost contributors
  */
 #include <xgboost/tree_updater.h>
 #include <utility>
 #include <vector>
+#include <limits>
+#include <string>
 #include "../common/common.h"
 #include "param.h"
 #include "updater_gpu_common.cuh"
@@ -22,9 +25,9 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu);
  * @param nKeys number of nodes at this level.
  * @return the uniq key
  */
-static HOST_DEV_INLINE NodeIdT abs2uniqKey(int tid, const NodeIdT* abs,
-                                           const int* colIds,
+static HOST_DEV_INLINE NodeIdT Abs2UniqueKey(int tid,
+                                             common::Span<const NodeIdT> abs,
+                                             common::Span<const int> colIds,
                                              NodeIdT nodeStart, int nKeys) {
   int a = abs[tid];
   if (a == kUnusedNode) return a;
@@ -77,18 +80,24 @@ struct AddByKey {
  * @param instIds instance index buffer
  * @return the expected gradient value
  */
-HOST_DEV_INLINE GradientPair get(int id, const GradientPair* vals,
-                                 const int* instIds) {
+HOST_DEV_INLINE GradientPair get(int id,
+                                 common::Span<const GradientPair> vals,
+                                 common::Span<const int> instIds) {
   id = instIds[id];
   return vals[id];
 }

 template <int BLKDIM_L1L3>
-__global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
-                               const int* instIds, GradientPair* mScans,
-                               int* mKeys, const NodeIdT* keys, int nUniqKeys,
-                               const int* colIds, NodeIdT nodeStart,
-                               const int size) {
+__global__ void CubScanByKeyL1(
+    common::Span<GradientPair> scans,
+    common::Span<const GradientPair> vals,
+    common::Span<const int> instIds,
+    common::Span<GradientPair> mScans,
+    common::Span<int> mKeys,
+    common::Span<const NodeIdT> keys,
+    int nUniqKeys,
+    common::Span<const int> colIds, NodeIdT nodeStart,
+    const int size) {
   Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
   int myKey;
   GradientPair myValue;
@@ -97,7 +106,7 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
   Pair threadData;
   int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
   if (tid < size) {
-    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+    myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
     myValue = get(tid, vals, instIds);
   } else {
     myKey = kNoneKey;
@@ -127,7 +136,8 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
 }

 template <int BLKSIZE>
-__global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
+__global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
+                               common::Span<int> mKeys, int mLength) {
   typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
   Pair threadData;
   __shared__ typename BlockScan::TempStorage temp_storage;
@@ -141,11 +151,15 @@ __global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
 }

 template <int BLKDIM_L1L3>
-__global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
-                               const GradientPair* vals, const int* instIds,
-                               const GradientPair* mScans, const int* mKeys,
-                               const NodeIdT* keys, int nUniqKeys,
-                               const int* colIds, NodeIdT nodeStart,
+__global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
+                               common::Span<GradientPair> scans,
+                               common::Span<const GradientPair> vals,
+                               common::Span<const int> instIds,
+                               common::Span<const GradientPair> mScans,
+                               common::Span<const int> mKeys,
+                               common::Span<const NodeIdT> keys,
+                               int nUniqKeys,
+                               common::Span<const int> colIds, NodeIdT nodeStart,
                                const int size) {
   int relId = threadIdx.x;
   int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
@@ -161,10 +175,10 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
     s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
     s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
   }
-  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+  int myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
   int previousKey =
       tid == 0 ? kNoneKey
-               : abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
+               : Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
   GradientPair myValue = scans[tid];
   __syncthreads();
   if (blockIdx.x > 0 && s_mKeys == previousKey) {
@@ -201,17 +215,22 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
  * @param nodeStart index of the leftmost node in the current level
  */
 template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
-void reduceScanByKey(GradientPair* sums, GradientPair* scans, const GradientPair* vals,
-                     const int* instIds, const NodeIdT* keys, int size,
-                     int nUniqKeys, int nCols, GradientPair* tmpScans,
-                     int* tmpKeys, const int* colIds, NodeIdT nodeStart) {
+void ReduceScanByKey(common::Span<GradientPair> sums,
+                     common::Span<GradientPair> scans,
+                     common::Span<GradientPair> vals,
+                     common::Span<const int> instIds,
+                     common::Span<const NodeIdT> keys,
+                     int size, int nUniqKeys, int nCols,
+                     common::Span<GradientPair> tmpScans,
+                     common::Span<int> tmpKeys,
+                     common::Span<const int> colIds, NodeIdT nodeStart) {
   int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
-  cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(GradientPair));
-  cubScanByKeyL1<BLKDIM_L1L3>
+  cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
+  CubScanByKeyL1<BLKDIM_L1L3>
       <<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
                                nUniqKeys, colIds, nodeStart, size);
-  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
-  cubScanByKeyL3<BLKDIM_L1L3>
+  CubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
+  CubScanByKeyL3<BLKDIM_L1L3>
       <<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
                                keys, nUniqKeys, colIds, nodeStart, size);
 }
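
The renamed `ReduceScanByKey` keeps the original three-kernel pipeline: `CubScanByKeyL1` scans each block locally and emits one carry per block (`mScans`/`mKeys`), `CubScanByKeyL2` scans those carries in a single block, and `CubScanByKeyL3` folds the carry back in and accumulates the per-key totals. A sequential reference for the combined semantics — plain C++ with illustrative names; the prefix is shown exclusive, while the kernels' exact inclusivity follows their cub::BlockScan configuration:

    #include <cstddef>
    #include <vector>

    // Sequential reference: segmented (by-key) scan plus per-key sums.
    // keys[i] is a small integer segment id indexing sums; a new segment
    // resets the running prefix. scans and sums are pre-sized by the caller.
    void ScanByKeyReference(const std::vector<int>& keys,
                            const std::vector<float>& vals,
                            std::vector<float>* scans,   // prefix within a key run
                            std::vector<float>* sums) {  // total per key
      float running = 0.0f;
      for (std::size_t i = 0; i < vals.size(); ++i) {
        if (i == 0 || keys[i] != keys[i - 1]) running = 0.0f;  // new segment
        (*scans)[i] = running;                                 // exclusive scan
        running += vals[i];
        (*sums)[keys[i]] += vals[i];                           // per-key reduction
      }
    }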
@@ -268,7 +287,7 @@ HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
   return out;
 }

-DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
+DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
                              ExactSplitCandidate val) {
   unsigned long long* intAddress = (unsigned long long*)address;  // NOLINT
   unsigned long long old = *intAddress;                           // NOLINT
@@ -281,11 +300,17 @@ DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
   } while (assumed != old);
 }

-DEV_INLINE void argMaxWithAtomics(
-    int id, ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const GPUTrainingParam& param) {
+DEV_INLINE void ArgMaxWithAtomics(
+    int id,
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
+    NodeIdT nodeStart, int len,
+    const GPUTrainingParam& param) {
   int nodeId = nodeAssigns[id];
   // @todo: this is really a bad check! but will be fixed when we move
   // to key-based reduction
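
`AtomicArgMax` (whose retry loop appears above) is the standard CUDA compare-and-swap idiom for "atomically keep the better candidate": reinterpret the 8-byte candidate as an `unsigned long long` and retry `atomicCAS` until the stored value either already wins or ours is installed. A standalone sketch under the assumption that the candidate packs into an 8-byte-aligned slot; the `Candidate` type here is illustrative, not `ExactSplitCandidate`:

    #include <cuda_runtime.h>
    #include <cstring>

    struct Candidate {
      float score;
      int index;
    };  // 8 bytes: the width atomicCAS can swap in one shot

    __device__ void AtomicArgMaxSketch(Candidate* address, Candidate val) {
      // Assumes *address is 8-byte aligned so the reinterpretation is valid.
      auto* as_ull = reinterpret_cast<unsigned long long*>(address);
      unsigned long long old = *as_ull;
      unsigned long long assumed;
      do {
        assumed = old;
        Candidate current;
        memcpy(&current, &assumed, sizeof(current));
        if (current.score >= val.score) return;  // stored candidate already wins
        unsigned long long desired;
        memcpy(&desired, &val, sizeof(desired));
        // If another thread changed the slot since we read it, loop and retry.
        old = atomicCAS(as_ull, assumed, desired);
      } while (assumed != old);
    }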
@@ -293,45 +318,59 @@ DEV_INLINE void argMaxWithAtomics(
       !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
         (vals[id] == vals[id - 1]))) {
     if (nodeId != kUnusedNode) {
-      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
+      int sumId = Abs2UniqueKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
       GradientPair colSum = gradSums[sumId];
       int uid = nodeId - nodeStart;
-      DeviceNodeStats n = nodes[nodeId];
-      GradientPair parentSum = n.sum_gradients;
-      float parentGain = n.root_gain;
+      DeviceNodeStats node_stat = nodes[nodeId];
+      GradientPair parentSum = node_stat.sum_gradients;
+      float parentGain = node_stat.root_gain;
       bool tmp;
       ExactSplitCandidate s;
       GradientPair missing = parentSum - colSum;
       s.score = LossChangeMissing(gradScans[id], missing, parentSum, parentGain,
                                   param, tmp);
       s.index = id;
-      atomicArgMax(nodeSplits + uid, s);
+      AtomicArgMax(&nodeSplits[uid], s);
     }  // end if nodeId != UNUSED_NODE
   }    // end if id == 0 ...
 }

-__global__ void atomicArgMaxByKeyGmem(
-    ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const TrainParam param) {
+__global__ void AtomicArgMaxByKeyGmem(
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes,
+    int nUniqKeys,
+    NodeIdT nodeStart,
+    int len,
+    const TrainParam param) {
   int id = threadIdx.x + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < len; id += stride) {
-    argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
+    ArgMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
                       nodeAssigns, nodes, nUniqKeys, nodeStart, len,
                       GPUTrainingParam(param));
   }
 }

-__global__ void atomicArgMaxByKeySmem(
-    ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const GPUTrainingParam param) {
+__global__ void AtomicArgMaxByKeySmem(
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes,
+    int nUniqKeys, NodeIdT nodeStart, int len, const GPUTrainingParam param) {
   extern __shared__ char sArr[];
-  ExactSplitCandidate* sNodeSplits =
-      reinterpret_cast<ExactSplitCandidate*>(sArr);
+  common::Span<ExactSplitCandidate> sNodeSplits =
+      common::Span<ExactSplitCandidate>(
+          reinterpret_cast<ExactSplitCandidate*>(sArr),
+          static_cast<typename common::Span<ExactSplitCandidate>::index_type>(
+              nUniqKeys * sizeof(ExactSplitCandidate)));
   int tid = threadIdx.x;
   ExactSplitCandidate defVal;
 #pragma unroll 1
@@ -342,13 +381,13 @@ __global__ void atomicArgMaxByKeySmem(
   int id = tid + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < len; id += stride) {
-    argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
+    ArgMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
                       nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
   }
   __syncthreads();
   for (int i = tid; i < nUniqKeys; i += blockDim.x) {
     ExactSplitCandidate s = sNodeSplits[i];
-    atomicArgMax(nodeSplits + i, s);
+    AtomicArgMax(&nodeSplits[i], s);
   }
 }
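
One detail worth flagging in `AtomicArgMaxByKeySmem`: a Span extent counts elements, not bytes, so constructing the shared-memory span with `nUniqKeys * sizeof(ExactSplitCandidate)` overstates its length. It is benign here, since the kernel never indexes past `nUniqKeys - 1`, but the element-count form would be (fragment in the same kernel context as above; only the extent argument differs):

    // The <<<...>>> dynamic shared-memory size is given in bytes, but a
    // span's extent is in elements; nUniqKeys alone matches actual usage.
    extern __shared__ char sArr[];
    common::Span<ExactSplitCandidate> sNodeSplits(
        reinterpret_cast<ExactSplitCandidate*>(sArr),
        static_cast<typename common::Span<ExactSplitCandidate>::index_type>(
            nUniqKeys));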
@@ -369,24 +408,28 @@ __global__ void atomicArgMaxByKeySmem(
  * @param algo which algorithm to use for argmax_by_key
  */
 template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
-void argMaxByKey(ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-                 const GradientPair* gradSums, const float* vals,
-                 const int* colIds, const NodeIdT* nodeAssigns,
-                 const DeviceNodeStats* nodes, int nUniqKeys,
+void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
+                 common::Span<const GradientPair> gradScans,
+                 common::Span<const GradientPair> gradSums,
+                 common::Span<const float> vals,
+                 common::Span<const int> colIds,
+                 common::Span<const NodeIdT> nodeAssigns,
+                 common::Span<const DeviceNodeStats> nodes,
+                 int nUniqKeys,
                  NodeIdT nodeStart, int len, const TrainParam param,
                  ArgMaxByKeyAlgo algo) {
   dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
-      param.gpu_id, nodeSplits, nUniqKeys,
+      param.gpu_id, nodeSplits.data(), nUniqKeys,
       ExactSplitCandidate());
   int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
   switch (algo) {
     case kAbkGmem:
-      atomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
+      AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
           nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, param);
       break;
     case kAbkSmem:
-      atomicArgMaxByKeySmem<<<nBlks, BLKDIM,
+      AtomicArgMaxByKeySmem<<<nBlks, BLKDIM,
                               sizeof(ExactSplitCandidate) * nUniqKeys>>>(
           nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, GPUTrainingParam(param));
@@ -512,7 +555,7 @@ class GPUMaker : public TreeUpdater {
   ~GPUMaker() {}

   void Init(
-      const std::vector<std::pair<std::string, std::string>>& args) override {
+      const std::vector<std::pair<std::string, std::string>>& args) {
     param.InitAllowUnknown(args);
     maxNodes = (1 << (param.max_depth + 1)) - 1;
     maxLeaves = 1 << param.max_depth;
@@ -521,7 +564,7 @@ class GPUMaker : public TreeUpdater {
   }

   void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
+              const std::vector<RegTree*>& trees) {
     GradStats::CheckInfo(dmat->Info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
@@ -535,7 +578,7 @@ class GPUMaker : public TreeUpdater {
         UpdateTree(gpair, dmat, trees[i]);
       }
     } catch (const std::exception& e) {
-      LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
+      LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
     }
     param.learning_rate = lr;
   }
@@ -543,7 +586,7 @@ class GPUMaker : public TreeUpdater {
   void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                   RegTree* hTree) {
     if (!allocated) {
-      setupOneTimeData(dmat);
+      SetupOneTimeData(dmat);
     }
     for (int i = 0; i < param.max_depth; ++i) {
       if (i == 0) {
@@ -563,11 +606,11 @@ class GPUMaker : public TreeUpdater {
   }

   void split2node(int nNodes, NodeIdT nodeStart) {
-    auto d_nodes = nodes.Data();
-    auto d_gradScans = gradScans.Data();
-    auto d_gradSums = gradSums.Data();
-    auto d_nodeAssigns = nodeAssigns.Current();
-    auto d_colIds = colIds.Data();
+    auto d_nodes = nodes.GetSpan();
+    auto d_gradScans = gradScans.GetSpan();
+    auto d_gradSums = gradSums.GetSpan();
+    auto d_nodeAssigns = nodeAssigns.CurrentSpan();
+    auto d_colIds = colIds.GetSpan();
     auto d_vals = vals.Current();
     auto d_nodeSplits = nodeSplits.Data();
     int nUniqKeys = nNodes;
@@ -580,7 +623,7 @@ class GPUMaker : public TreeUpdater {
       if (s.isSplittable(min_split_loss)) {
         int idx = s.index;
         int nodeInstId =
-            abs2uniqKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
+            Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
         bool missingLeft = true;
         const DeviceNodeStats& n = d_nodes[absNodeId];
         GradientPair gradScan = d_gradScans[idx];
@@ -612,13 +655,13 @@ class GPUMaker : public TreeUpdater {
   }

   void findSplit(int level, NodeIdT nodeStart, int nNodes) {
-    reduceScanByKey(gradSums.Data(), gradScans.Data(), gradsInst.Data(),
-                    instIds.Current(), nodeAssigns.Current(), nVals, nNodes,
-                    nCols, tmpScanGradBuff.Data(), tmpScanKeyBuff.Data(),
-                    colIds.Data(), nodeStart);
-    argMaxByKey(nodeSplits.Data(), gradScans.Data(), gradSums.Data(),
-                vals.Current(), colIds.Data(), nodeAssigns.Current(),
-                nodes.Data(), nNodes, nodeStart, nVals, param,
+    ReduceScanByKey(gradSums.GetSpan(), gradScans.GetSpan(), gradsInst.GetSpan(),
+                    instIds.CurrentSpan(), nodeAssigns.CurrentSpan(), nVals, nNodes,
+                    nCols, tmpScanGradBuff.GetSpan(), tmpScanKeyBuff.GetSpan(),
+                    colIds.GetSpan(), nodeStart);
+    ArgMaxByKey(nodeSplits.GetSpan(), gradScans.GetSpan(), gradSums.GetSpan(),
+                vals.CurrentSpan(), colIds.GetSpan(), nodeAssigns.CurrentSpan(),
+                nodes.GetSpan(), nNodes, nodeStart, nVals, param,
                 level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
     split2node(nNodes, nodeStart);
   }
@@ -634,7 +677,7 @@ class GPUMaker : public TreeUpdater {
                   &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
   }

-  void setupOneTimeData(DMatrix* dmat) {
+  void SetupOneTimeData(DMatrix* dmat) {
     size_t free_memory = dh::AvailableMemory(param.gpu_id);
     if (!dmat->SingleColBlock()) {
       LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
@@ -726,11 +769,11 @@ class GPUMaker : public TreeUpdater {
       // gather the node assignments across all other columns too
       dh::Gather(param.gpu_id, nodeAssigns.Current(),
                  nodeAssignsPerInst.Data(), instIds.Current(), nVals);
-      sortKeys(level);
+      SortKeys(level);
     }
   }

-  void sortKeys(int level) {
+  void SortKeys(int level) {
     // segmented-sort the arrays based on node-id's
     // but we don't need more than level+1 bits for sorting!
     SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,
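
The comment in `SortKeys` leans on a radix-sort property: node ids at tree level L fit in L+1 bits, and segmented radix sort lets the caller restrict the sorted bit range, cutting the number of radix passes proportionally. A hedged sketch of that knob, assuming cub's `DeviceSegmentedRadixSort::SortPairs` interface (xgboost's `SegmentedSort` wrapper is not reproduced here; buffer setup and the temp-storage query pass are elided):

    #include <cub/cub.cuh>

    // Only the low (level + 1) bits of the node-id keys can differ, so
    // restrict the sort to that range via begin_bit/end_bit. All pointers
    // are device-resident; temp storage is assumed already sized.
    void SortNodeKeysSketch(void* d_temp, size_t& temp_bytes,
                            const int* d_keys_in, int* d_keys_out,
                            const int* d_vals_in, int* d_vals_out,
                            int num_items, int num_segments,
                            const int* d_seg_begin, const int* d_seg_end,
                            int level) {
      cub::DeviceSegmentedRadixSort::SortPairs(
          d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
          num_items, num_segments, d_seg_begin, d_seg_end,
          /*begin_bit=*/0, /*end_bit=*/level + 1);
    }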

src/tree/updater_gpu_hist.cu

@@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
  */
 template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT, typename GradientSumT>
 __device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_histogram,
                                       TempStorageT* temp_storage) {
   __shared__ cub::Uninitialized<GradientSumT> uninitialized_sum;
   GradientSumT& shared_sum = uninitialized_sum.Alias();

tests/cpp/tree/test_gpu_exact.cu

@@ -0,0 +1,48 @@
+#include <gtest/gtest.h>
+#include <xgboost/tree_updater.h>
+
+#include <vector>
+#include <string>
+#include <utility>
+
+#include "../helpers.h"
+
+namespace xgboost {
+namespace tree {
+
+TEST(GPUExact, Update) {
+  using Arg = std::pair<std::string, std::string>;
+  std::vector<Arg> args{
+      {"n_gpus", "1"},
+      {"gpu_id", "0"},
+      {"max_depth", "1"}};
+  auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu");
+  p_gpuexact_maker->Init(args);
+
+  size_t constexpr n_rows = 4;
+  size_t constexpr n_cols = 8;
+  bst_float constexpr sparsity = 0.0f;
+
+  auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3);
+  std::vector<GradientPair> h_gpair(n_rows);
+  for (size_t i = 0; i < n_rows; ++i) {
+    h_gpair[i] = GradientPair(i % 2, 1);
+  }
+  HostDeviceVector<GradientPair> gpair(h_gpair);
+  RegTree tree;
+
+  p_gpuexact_maker->Update(&gpair, (*dmat).get(), {&tree});
+
+  auto const& nodes = tree.GetNodes();
+  ASSERT_EQ(nodes.size(), 3);
+
+  float constexpr kRtEps = 1e-6;
+  ASSERT_NEAR(tree.Stat(0).sum_hess, 4, kRtEps);
+  ASSERT_NEAR(tree.Stat(1).sum_hess, 2, kRtEps);
+  ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
+  ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
+}
+
+}  // namespace tree
+}  // namespace xgboost
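
The hessian assertions follow directly from the constructed gradients: every row is given a hessian of 1, the root covers all four rows, and the asserted split sends two rows to each child:

\[
\texttt{sum\_hess}(\text{root}) = \sum_{i=1}^{4} h_i = 4,
\qquad
\texttt{sum\_hess}(\text{left}) = \texttt{sum\_hess}(\text{right}) = 2 .
\]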