Use Span in GPU exact updater. (#4020)

* Use Span in GPU exact updater.
* Add a small test.

parent 7735252925
commit 9897b5042f
@@ -378,6 +378,11 @@ class DVec2 {
   DVec<T> &D2() { return d2_; }

   T *Current() { return buff_.Current(); }
+  xgboost::common::Span<T> CurrentSpan() {
+    return xgboost::common::Span<T>{
+        buff_.Current(),
+        static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
+  }

   DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }

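A note on the new accessor for readers skimming the diff: CurrentSpan() pairs the active half of the double buffer with its length, so device code can take a bounds-aware view instead of a bare T*. A minimal sketch of a consumer follows (not part of the commit; the kernel name and the scaling operation are illustrative only, assuming Span behaves like a gsl-style span):

    // Span is trivially copyable, so it can be passed to a kernel by value;
    // size() travels with the pointer and operator[] can be range-checked.
    template <typename T>
    __global__ void ScaleKernel(xgboost::common::Span<T> data, T factor) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < static_cast<int>(data.size())) {
        data[i] *= factor;
      }
    }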
@@ -791,7 +796,7 @@ typename std::iterator_traits<T>::value_type SumReduction(
 template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
 void FillConst(int device_idx, T *out, int len, T def) {
   dh::LaunchN<ItemsPerThread, BlkDim>(device_idx, len,
                                       [=] __device__(int i) { out[i] = def; });
 }

 /**
@@ -1,9 +1,12 @@
 /*!
- * Copyright 2017 XGBoost contributors
+ * Copyright 2017-2018 XGBoost contributors
  */
 #include <xgboost/tree_updater.h>
 #include <utility>
 #include <vector>
+#include <limits>
+#include <string>
+
 #include "../common/common.h"
 #include "param.h"
 #include "updater_gpu_common.cuh"
@@ -22,9 +25,9 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu);
  * @param nKeys number of nodes at this level.
  * @return the uniq key
  */
-static HOST_DEV_INLINE NodeIdT abs2uniqKey(int tid, const NodeIdT* abs,
-                                           const int* colIds,
-                                           NodeIdT nodeStart, int nKeys) {
+static HOST_DEV_INLINE NodeIdT Abs2UniqueKey(int tid,
+                                             common::Span<const NodeIdT> abs,
+                                             common::Span<const int> colIds,
+                                             NodeIdT nodeStart, int nKeys) {
   int a = abs[tid];
   if (a == kUnusedNode) return a;
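The tail of Abs2UniqueKey falls outside this hunk. As a hedged sketch of the flattening such a key function performs (assuming the conventional (node, column) keying; the helper name below is illustrative, not from the commit):

    // Each (node, column) pair at the current level gets its own bin in
    // [0, nUniqKeys * nCols): subtract nodeStart for a level-relative node
    // index, then offset by the column's block of nKeys slots.
    HOST_DEV_INLINE NodeIdT UniqueKeySketch(NodeIdT a, int colId,
                                            NodeIdT nodeStart, int nKeys) {
      if (a == kUnusedNode) return a;          // unused rows keep the sentinel
      return (a - nodeStart) + colId * nKeys;  // one bin per (node, column)
    }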
@@ -77,18 +80,24 @@ struct AddByKey {
  * @param instIds instance index buffer
  * @return the expected gradient value
  */
-HOST_DEV_INLINE GradientPair get(int id, const GradientPair* vals,
-                                 const int* instIds) {
+HOST_DEV_INLINE GradientPair get(int id,
+                                 common::Span<const GradientPair> vals,
+                                 common::Span<const int> instIds) {
   id = instIds[id];
   return vals[id];
 }

 template <int BLKDIM_L1L3>
-__global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
-                               const int* instIds, GradientPair* mScans,
-                               int* mKeys, const NodeIdT* keys, int nUniqKeys,
-                               const int* colIds, NodeIdT nodeStart,
-                               const int size) {
+__global__ void CubScanByKeyL1(
+    common::Span<GradientPair> scans,
+    common::Span<const GradientPair> vals,
+    common::Span<const int> instIds,
+    common::Span<GradientPair> mScans,
+    common::Span<int> mKeys,
+    common::Span<const NodeIdT> keys,
+    int nUniqKeys,
+    common::Span<const int> colIds, NodeIdT nodeStart,
+    const int size) {
   Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
   int myKey;
   GradientPair myValue;
@@ -97,7 +106,7 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
   Pair threadData;
   int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
   if (tid < size) {
-    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+    myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
     myValue = get(tid, vals, instIds);
   } else {
     myKey = kNoneKey;
@@ -127,7 +136,8 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
 }

 template <int BLKSIZE>
-__global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
+__global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
+                               common::Span<int> mKeys, int mLength) {
   typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
   Pair threadData;
   __shared__ typename BlockScan::TempStorage temp_storage;
@@ -141,11 +151,15 @@ __global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
 }

 template <int BLKDIM_L1L3>
-__global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
-                               const GradientPair* vals, const int* instIds,
-                               const GradientPair* mScans, const int* mKeys,
-                               const NodeIdT* keys, int nUniqKeys,
-                               const int* colIds, NodeIdT nodeStart,
+__global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
+                               common::Span<GradientPair> scans,
+                               common::Span<const GradientPair> vals,
+                               common::Span<const int> instIds,
+                               common::Span<const GradientPair> mScans,
+                               common::Span<const int> mKeys,
+                               common::Span<const NodeIdT> keys,
+                               int nUniqKeys,
+                               common::Span<const int> colIds, NodeIdT nodeStart,
                                const int size) {
   int relId = threadIdx.x;
   int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
@@ -161,10 +175,10 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
     s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
     s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
   }
-  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
+  int myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
   int previousKey =
       tid == 0 ? kNoneKey
-               : abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
+               : Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
   GradientPair myValue = scans[tid];
   __syncthreads();
   if (blockIdx.x > 0 && s_mKeys == previousKey) {
@@ -201,17 +215,22 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
  * @param nodeStart index of the leftmost node in the current level
  */
 template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
-void reduceScanByKey(GradientPair* sums, GradientPair* scans, const GradientPair* vals,
-                     const int* instIds, const NodeIdT* keys, int size,
-                     int nUniqKeys, int nCols, GradientPair* tmpScans,
-                     int* tmpKeys, const int* colIds, NodeIdT nodeStart) {
+void ReduceScanByKey(common::Span<GradientPair> sums,
+                     common::Span<GradientPair> scans,
+                     common::Span<GradientPair> vals,
+                     common::Span<const int> instIds,
+                     common::Span<const NodeIdT> keys,
+                     int size, int nUniqKeys, int nCols,
+                     common::Span<GradientPair> tmpScans,
+                     common::Span<int> tmpKeys,
+                     common::Span<const int> colIds, NodeIdT nodeStart) {
   int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
-  cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(GradientPair));
+  cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
-  cubScanByKeyL1<BLKDIM_L1L3>
+  CubScanByKeyL1<BLKDIM_L1L3>
       <<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
                                nUniqKeys, colIds, nodeStart, size);
-  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
+  CubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
-  cubScanByKeyL3<BLKDIM_L1L3>
+  CubScanByKeyL3<BLKDIM_L1L3>
       <<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
                                keys, nUniqKeys, colIds, nodeStart, size);
 }
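ReduceScanByKey drives a three-phase segmented scan: CubScanByKeyL1 scans within blocks, CubScanByKeyL2 scans the per-block partials in a single block, and CubScanByKeyL3 folds the carry back in and emits per-key sums. A serial host-side sketch of the semantics being computed (illustrative only, assuming keys arrive segment-sorted, which SegmentedSort arranges elsewhere in this file; sums is assumed pre-sized and zeroed):

    #include <vector>
    // For each contiguous run of equal keys: scans[i] holds the exclusive
    // prefix of vals within the run, and sums[key] ends up as the run total.
    void SegmentedScanByKey(const std::vector<int>& keys,
                            const std::vector<float>& vals,
                            std::vector<float>* scans,
                            std::vector<float>* sums) {
      float running = 0.0f;
      for (size_t i = 0; i < vals.size(); ++i) {
        if (i == 0 || keys[i] != keys[i - 1]) running = 0.0f;  // key boundary
        (*scans)[i] = running;       // exclusive prefix within the segment
        running += vals[i];
        (*sums)[keys[i]] = running;  // last write per key is the total
      }
    }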
@@ -268,7 +287,7 @@ HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
   return out;
 }

-DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
+DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
                              ExactSplitCandidate val) {
   unsigned long long* intAddress = (unsigned long long*)address;  // NOLINT
   unsigned long long old = *intAddress;  // NOLINT
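The middle of this CAS loop sits between the two hunks shown here. For orientation, a hedged sketch of the conventional 64-bit compare-and-swap argmax the visible head and tail imply, reusing the maxSplit helper named in the hunk header (it assumes ExactSplitCandidate occupies exactly 8 bytes so it can be punned into an unsigned long long; the actual loop body may differ):

    unsigned long long assumed;  // NOLINT
    do {
      assumed = old;
      ExactSplitCandidate res =
          *reinterpret_cast<ExactSplitCandidate*>(&assumed);
      ExactSplitCandidate best = maxSplit(res, val);  // keep the higher score
      old = atomicCAS(intAddress, assumed,
                      *reinterpret_cast<unsigned long long*>(&best));
    } while (assumed != old);  // another thread raced us: retry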
@@ -281,11 +300,17 @@ DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
   } while (assumed != old);
 }

-DEV_INLINE void argMaxWithAtomics(
-    int id, ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const GPUTrainingParam& param) {
+DEV_INLINE void ArgMaxWithAtomics(
+    int id,
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
+    NodeIdT nodeStart, int len,
+    const GPUTrainingParam& param) {
   int nodeId = nodeAssigns[id];
   // @todo: this is really a bad check! but will be fixed when we move
   // to key-based reduction
@@ -293,45 +318,59 @@ DEV_INLINE void argMaxWithAtomics(
       !((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
         (vals[id] == vals[id - 1]))) {
     if (nodeId != kUnusedNode) {
-      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
+      int sumId = Abs2UniqueKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
       GradientPair colSum = gradSums[sumId];
       int uid = nodeId - nodeStart;
-      DeviceNodeStats n = nodes[nodeId];
-      GradientPair parentSum = n.sum_gradients;
-      float parentGain = n.root_gain;
+      DeviceNodeStats node_stat = nodes[nodeId];
+      GradientPair parentSum = node_stat.sum_gradients;
+      float parentGain = node_stat.root_gain;
       bool tmp;
       ExactSplitCandidate s;
       GradientPair missing = parentSum - colSum;
       s.score = LossChangeMissing(gradScans[id], missing, parentSum, parentGain,
                                   param, tmp);
       s.index = id;
-      atomicArgMax(nodeSplits + uid, s);
+      AtomicArgMax(&nodeSplits[uid], s);
     }  // end if nodeId != UNUSED_NODE
   }  // end if id == 0 ...
 }

-__global__ void atomicArgMaxByKeyGmem(
-    ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const TrainParam param) {
+__global__ void AtomicArgMaxByKeyGmem(
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes,
+    int nUniqKeys,
+    NodeIdT nodeStart,
+    int len,
+    const TrainParam param) {
   int id = threadIdx.x + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < len; id += stride) {
-    argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
+    ArgMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
                       nodeAssigns, nodes, nUniqKeys, nodeStart, len,
                       GPUTrainingParam(param));
   }
 }

-__global__ void atomicArgMaxByKeySmem(
-    ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-    const GradientPair* gradSums, const float* vals, const int* colIds,
-    const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
-    NodeIdT nodeStart, int len, const GPUTrainingParam param) {
+__global__ void AtomicArgMaxByKeySmem(
+    common::Span<ExactSplitCandidate> nodeSplits,
+    common::Span<const GradientPair> gradScans,
+    common::Span<const GradientPair> gradSums,
+    common::Span<const float> vals,
+    common::Span<const int> colIds,
+    common::Span<const NodeIdT> nodeAssigns,
+    common::Span<const DeviceNodeStats> nodes,
+    int nUniqKeys, NodeIdT nodeStart, int len, const GPUTrainingParam param) {
   extern __shared__ char sArr[];
-  ExactSplitCandidate* sNodeSplits =
-      reinterpret_cast<ExactSplitCandidate*>(sArr);
+  common::Span<ExactSplitCandidate> sNodeSplits =
+      common::Span<ExactSplitCandidate>(
+          reinterpret_cast<ExactSplitCandidate*>(sArr),
+          static_cast<typename common::Span<ExactSplitCandidate>::index_type>(
+              nUniqKeys * sizeof(ExactSplitCandidate)));
   int tid = threadIdx.x;
   ExactSplitCandidate defVal;
 #pragma unroll 1
@@ -342,13 +381,13 @@ __global__ void atomicArgMaxByKeySmem(
   int id = tid + (blockIdx.x * blockDim.x);
   const int stride = blockDim.x * gridDim.x;
   for (; id < len; id += stride) {
-    argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
+    ArgMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
                       nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
   }
   __syncthreads();
   for (int i = tid; i < nUniqKeys; i += blockDim.x) {
     ExactSplitCandidate s = sNodeSplits[i];
-    atomicArgMax(nodeSplits + i, s);
+    AtomicArgMax(&nodeSplits[i], s);
   }
 }

@@ -369,24 +408,28 @@ __global__ void atomicArgMaxByKeySmem(
  * @param algo which algorithm to use for argmax_by_key
  */
 template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
-void argMaxByKey(ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
-                 const GradientPair* gradSums, const float* vals,
-                 const int* colIds, const NodeIdT* nodeAssigns,
-                 const DeviceNodeStats* nodes, int nUniqKeys,
+void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
+                 common::Span<const GradientPair> gradScans,
+                 common::Span<const GradientPair> gradSums,
+                 common::Span<const float> vals,
+                 common::Span<const int> colIds,
+                 common::Span<const NodeIdT> nodeAssigns,
+                 common::Span<const DeviceNodeStats> nodes,
+                 int nUniqKeys,
                  NodeIdT nodeStart, int len, const TrainParam param,
                  ArgMaxByKeyAlgo algo) {
   dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
-      param.gpu_id, nodeSplits, nUniqKeys,
+      param.gpu_id, nodeSplits.data(), nUniqKeys,
       ExactSplitCandidate());
   int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
   switch (algo) {
     case kAbkGmem:
-      atomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
+      AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
           nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, param);
       break;
     case kAbkSmem:
-      atomicArgMaxByKeySmem<<<nBlks, BLKDIM,
+      AtomicArgMaxByKeySmem<<<nBlks, BLKDIM,
                               sizeof(ExactSplitCandidate) * nUniqKeys>>>(
           nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, GPUTrainingParam(param));
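The two variants trade atomic traffic for shared memory: the gmem path issues a global atomic per candidate, while the smem path first argmax-es block-locally in shared memory and then merges just one candidate per node into global memory. A short sketch of how the dynamic shared-memory launch ties together (restating the smem call above, nothing new is introduced):

    // The third <<<>>> argument reserves the per-block bytes that back
    // "extern __shared__ char sArr[]" inside AtomicArgMaxByKeySmem:
    // one ExactSplitCandidate slot per node at this level.
    size_t shmem_bytes = sizeof(ExactSplitCandidate) * nUniqKeys;
    AtomicArgMaxByKeySmem<<<nBlks, BLKDIM, shmem_bytes>>>(
        nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
        nUniqKeys, nodeStart, len, GPUTrainingParam(param));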
@@ -512,7 +555,7 @@ class GPUMaker : public TreeUpdater {
   ~GPUMaker() {}

   void Init(
-      const std::vector<std::pair<std::string, std::string>>& args) override {
+      const std::vector<std::pair<std::string, std::string>>& args) {
     param.InitAllowUnknown(args);
     maxNodes = (1 << (param.max_depth + 1)) - 1;
     maxLeaves = 1 << param.max_depth;
@@ -521,7 +564,7 @@ class GPUMaker : public TreeUpdater {
   }

   void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-              const std::vector<RegTree*>& trees) override {
+              const std::vector<RegTree*>& trees) {
     GradStats::CheckInfo(dmat->Info());
     // rescale learning rate according to size of trees
     float lr = param.learning_rate;
@@ -535,7 +578,7 @@ class GPUMaker : public TreeUpdater {
         UpdateTree(gpair, dmat, trees[i]);
       }
     } catch (const std::exception& e) {
-      LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
+      LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
     }
     param.learning_rate = lr;
   }
@@ -543,7 +586,7 @@ class GPUMaker : public TreeUpdater {
   void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                   RegTree* hTree) {
     if (!allocated) {
-      setupOneTimeData(dmat);
+      SetupOneTimeData(dmat);
     }
     for (int i = 0; i < param.max_depth; ++i) {
       if (i == 0) {
@@ -563,11 +606,11 @@ class GPUMaker : public TreeUpdater {
   }

   void split2node(int nNodes, NodeIdT nodeStart) {
-    auto d_nodes = nodes.Data();
-    auto d_gradScans = gradScans.Data();
-    auto d_gradSums = gradSums.Data();
-    auto d_nodeAssigns = nodeAssigns.Current();
-    auto d_colIds = colIds.Data();
+    auto d_nodes = nodes.GetSpan();
+    auto d_gradScans = gradScans.GetSpan();
+    auto d_gradSums = gradSums.GetSpan();
+    auto d_nodeAssigns = nodeAssigns.CurrentSpan();
+    auto d_colIds = colIds.GetSpan();
     auto d_vals = vals.Current();
     auto d_nodeSplits = nodeSplits.Data();
     int nUniqKeys = nNodes;
@@ -580,7 +623,7 @@ class GPUMaker : public TreeUpdater {
       if (s.isSplittable(min_split_loss)) {
         int idx = s.index;
         int nodeInstId =
-            abs2uniqKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
+            Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
         bool missingLeft = true;
         const DeviceNodeStats& n = d_nodes[absNodeId];
         GradientPair gradScan = d_gradScans[idx];
@@ -612,13 +655,13 @@ class GPUMaker : public TreeUpdater {
   }

   void findSplit(int level, NodeIdT nodeStart, int nNodes) {
-    reduceScanByKey(gradSums.Data(), gradScans.Data(), gradsInst.Data(),
-                    instIds.Current(), nodeAssigns.Current(), nVals, nNodes,
-                    nCols, tmpScanGradBuff.Data(), tmpScanKeyBuff.Data(),
-                    colIds.Data(), nodeStart);
-    argMaxByKey(nodeSplits.Data(), gradScans.Data(), gradSums.Data(),
-                vals.Current(), colIds.Data(), nodeAssigns.Current(),
-                nodes.Data(), nNodes, nodeStart, nVals, param,
+    ReduceScanByKey(gradSums.GetSpan(), gradScans.GetSpan(), gradsInst.GetSpan(),
+                    instIds.CurrentSpan(), nodeAssigns.CurrentSpan(), nVals, nNodes,
+                    nCols, tmpScanGradBuff.GetSpan(), tmpScanKeyBuff.GetSpan(),
+                    colIds.GetSpan(), nodeStart);
+    ArgMaxByKey(nodeSplits.GetSpan(), gradScans.GetSpan(), gradSums.GetSpan(),
+                vals.CurrentSpan(), colIds.GetSpan(), nodeAssigns.CurrentSpan(),
+                nodes.GetSpan(), nNodes, nodeStart, nVals, param,
                 level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
     split2node(nNodes, nodeStart);
   }
@@ -634,7 +677,7 @@ class GPUMaker : public TreeUpdater {
                   &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
   }

-  void setupOneTimeData(DMatrix* dmat) {
+  void SetupOneTimeData(DMatrix* dmat) {
     size_t free_memory = dh::AvailableMemory(param.gpu_id);
     if (!dmat->SingleColBlock()) {
       LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
@@ -726,11 +769,11 @@ class GPUMaker : public TreeUpdater {
       // gather the node assignments across all other columns too
       dh::Gather(param.gpu_id, nodeAssigns.Current(),
                  nodeAssignsPerInst.Data(), instIds.Current(), nVals);
-      sortKeys(level);
+      SortKeys(level);
     }
   }

-  void sortKeys(int level) {
+  void SortKeys(int level) {
    // segmented-sort the arrays based on node-id's
    // but we don't need more than level+1 bits for sorting!
    SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,

@@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
  */
 template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT, typename GradientSumT>
 __device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_histogram,
                                       TempStorageT* temp_storage) {
   __shared__ cub::Uninitialized<GradientSumT> uninitialized_sum;
   GradientSumT& shared_sum = uninitialized_sum.Alias();

tests/cpp/tree/test_gpu_exact.cu (new file, 48 lines)
@@ -0,0 +1,48 @@
+#include <gtest/gtest.h>
+#include <xgboost/tree_updater.h>
+
+#include <vector>
+#include <string>
+#include <utility>
+
+#include "../helpers.h"
+
+namespace xgboost {
+namespace tree {
+
+TEST(GPUExact, Update) {
+  using Arg = std::pair<std::string, std::string>;
+  std::vector<Arg> args{
+      {"n_gpus", "1"},
+      {"gpu_id", "0"},
+      {"max_depth", "1"}};
+
+  auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu");
+  p_gpuexact_maker->Init(args);
+
+  size_t constexpr n_rows = 4;
+  size_t constexpr n_cols = 8;
+  bst_float constexpr sparsity = 0.0f;
+
+  auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3);
+  std::vector<GradientPair> h_gpair(n_rows);
+  for (size_t i = 0; i < n_rows; ++i) {
+    h_gpair[i] = GradientPair(i % 2, 1);
+  }
+  HostDeviceVector<GradientPair> gpair(h_gpair);
+  RegTree tree;
+
+  p_gpuexact_maker->Update(&gpair, (*dmat).get(), {&tree});
+  auto const& nodes = tree.GetNodes();
+  ASSERT_EQ(nodes.size(), 3);
+
+  float constexpr kRtEps = 1e-6;
+  ASSERT_NEAR(tree.Stat(0).sum_hess, 4, kRtEps);
+  ASSERT_NEAR(tree.Stat(1).sum_hess, 2, kRtEps);
+  ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
+
+  ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
+}
+
+}  // namespace tree
+}  // namespace xgboost
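A hand check of the asserted statistics (not part of the commit): each of the four rows carries GradientPair(i % 2, 1), so every hessian is 1 and the root's sum_hess is 4; with max_depth = 1 the tree holds exactly three nodes (root plus two leaves), and the rows split evenly by gradient value, leaving sum_hess = 2 in each child. With the test binary built for CUDA, the case can be run in isolation through the standard gtest filter flag, --gtest_filter=GPUExact.Update.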