Use Span in GPU exact updater. (#4020)

* Use Span in GPU exact updater.

* Add a small test.
This commit is contained in:
Jiaming Yuan 2018-12-26 12:44:46 +08:00 committed by GitHub
parent 7735252925
commit 9897b5042f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 180 additions and 84 deletions

View File

@ -378,6 +378,11 @@ class DVec2 {
DVec<T> &D2() { return d2_; }
T *Current() { return buff_.Current(); }
// View of the currently-selected half of the double buffer as a Span
// covering Size() elements.
xgboost::common::Span<T> CurrentSpan() {
  using SpanT = xgboost::common::Span<T>;
  return SpanT{buff_.Current(),
               static_cast<typename SpanT::index_type>(Size())};
}
DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
@ -791,7 +796,7 @@ typename std::iterator_traits<T>::value_type SumReduction(
/**
 * Fill `len` elements of the device buffer `out` with the constant `def`.
 *
 * \param device_idx CUDA device ordinal the launch targets.
 * \param out        Device pointer to the destination buffer.
 * \param len        Number of elements to fill.
 * \param def        Value written to every element.
 *
 * NOTE: the diff rendering duplicated the lambda line; this is the single
 * well-formed launch.
 */
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
void FillConst(int device_idx, T *out, int len, T def) {
  dh::LaunchN<ItemsPerThread, BlkDim>(
      device_idx, len, [=] __device__(int i) { out[i] = def; });
}
/**

View File

@ -1,9 +1,12 @@
/*!
* Copyright 2017 XGBoost contributors
* Copyright 2017-2018 XGBoost contributors
*/
#include <xgboost/tree_updater.h>
#include <utility>
#include <vector>
#include <limits>
#include <string>
#include "../common/common.h"
#include "param.h"
#include "updater_gpu_common.cuh"
@ -22,9 +25,9 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu);
* @param nKeys number of nodes at this level.
* @return the uniq key
*/
static HOST_DEV_INLINE NodeIdT abs2uniqKey(int tid, const NodeIdT* abs,
const int* colIds,
static HOST_DEV_INLINE NodeIdT Abs2UniqueKey(int tid,
common::Span<const NodeIdT> abs,
common::Span<const int> colIds,
NodeIdT nodeStart, int nKeys) {
int a = abs[tid];
if (a == kUnusedNode) return a;
@ -77,18 +80,24 @@ struct AddByKey {
* @param instIds instance index buffer
* @return the expected gradient value
*/
// Fetch the gradient pair for the instance referenced by sorted position
// `id`: `instIds` maps the position back to the original instance index.
HOST_DEV_INLINE GradientPair get(int id,
                                 common::Span<const GradientPair> vals,
                                 common::Span<const int> instIds) {
  return vals[instIds[id]];
}
template <int BLKDIM_L1L3>
__global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
const int* instIds, GradientPair* mScans,
int* mKeys, const NodeIdT* keys, int nUniqKeys,
const int* colIds, NodeIdT nodeStart,
const int size) {
__global__ void CubScanByKeyL1(
common::Span<GradientPair> scans,
common::Span<const GradientPair> vals,
common::Span<const int> instIds,
common::Span<GradientPair> mScans,
common::Span<int> mKeys,
common::Span<const NodeIdT> keys,
int nUniqKeys,
common::Span<const int> colIds, NodeIdT nodeStart,
const int size) {
Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
int myKey;
GradientPair myValue;
@ -97,7 +106,7 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
Pair threadData;
int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
if (tid < size) {
myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
myValue = get(tid, vals, instIds);
} else {
myKey = kNoneKey;
@ -127,7 +136,8 @@ __global__ void cubScanByKeyL1(GradientPair* scans, const GradientPair* vals,
}
template <int BLKSIZE>
__global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
__global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
common::Span<int> mKeys, int mLength) {
typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
Pair threadData;
__shared__ typename BlockScan::TempStorage temp_storage;
@ -141,11 +151,15 @@ __global__ void cubScanByKeyL2(GradientPair* mScans, int* mKeys, int mLength) {
}
template <int BLKDIM_L1L3>
__global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
const GradientPair* vals, const int* instIds,
const GradientPair* mScans, const int* mKeys,
const NodeIdT* keys, int nUniqKeys,
const int* colIds, NodeIdT nodeStart,
__global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
common::Span<GradientPair> scans,
common::Span<const GradientPair> vals,
common::Span<const int> instIds,
common::Span<const GradientPair> mScans,
common::Span<const int> mKeys,
common::Span<const NodeIdT> keys,
int nUniqKeys,
common::Span<const int> colIds, NodeIdT nodeStart,
const int size) {
int relId = threadIdx.x;
int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
@ -161,10 +175,10 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
}
int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
int myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
int previousKey =
tid == 0 ? kNoneKey
: abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
: Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
GradientPair myValue = scans[tid];
__syncthreads();
if (blockIdx.x > 0 && s_mKeys == previousKey) {
@ -201,17 +215,22 @@ __global__ void cubScanByKeyL3(GradientPair* sums, GradientPair* scans,
* @param nodeStart index of the leftmost node in the current level
*/
template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
/**
 * Segmented scan + reduction of gradients keyed by (node, column).
 *
 * \param sums     [out] per-key gradient totals; one slot per (node, column)
 *                 pair, i.e. nUniqKeys * nCols entries.
 * \param scans    [out] per-key exclusive prefix sums of the gradients.
 * \param vals     [in]  gradient pairs, one per instance occurrence.
 * \param instIds  [in]  instance index for each position.
 * \param keys     [in]  node assignment for each position.
 * \param size     number of positions to process.
 * \param nUniqKeys number of distinct nodes at this level.
 * \param nCols    number of feature columns.
 * \param tmpScans / tmpKeys  scratch buffers for the inter-block pass.
 * \param colIds   [in]  column id for each position.
 * \param nodeStart index of the leftmost node in the current level.
 *
 * `vals` is only read here and the kernels below take
 * Span<const GradientPair>, so accept a const span — this also matches the
 * pre-Span signature, which took `const GradientPair*`.
 */
void ReduceScanByKey(common::Span<GradientPair> sums,
                     common::Span<GradientPair> scans,
                     common::Span<const GradientPair> vals,
                     common::Span<const int> instIds,
                     common::Span<const NodeIdT> keys,
                     int size, int nUniqKeys, int nCols,
                     common::Span<GradientPair> tmpScans,
                     common::Span<int> tmpKeys,
                     common::Span<const int> colIds, NodeIdT nodeStart) {
  int nBlks = dh::DivRoundUp(size, BLKDIM_L1L3);
  // Zero the accumulators before the three-pass scan-by-key.
  cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
  CubScanByKeyL1<BLKDIM_L1L3>
      <<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
                               nUniqKeys, colIds, nodeStart, size);
  CubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
  CubScanByKeyL3<BLKDIM_L1L3>
      <<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
                               keys, nUniqKeys, colIds, nodeStart, size);
}
@ -268,7 +287,7 @@ HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
return out;
}
DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
ExactSplitCandidate val) {
unsigned long long* intAddress = (unsigned long long*)address; // NOLINT
unsigned long long old = *intAddress; // NOLINT
@ -281,11 +300,17 @@ DEV_INLINE void atomicArgMax(ExactSplitCandidate* address,
} while (assumed != old);
}
DEV_INLINE void argMaxWithAtomics(
int id, ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
const GradientPair* gradSums, const float* vals, const int* colIds,
const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
NodeIdT nodeStart, int len, const GPUTrainingParam& param) {
DEV_INLINE void ArgMaxWithAtomics(
int id,
common::Span<ExactSplitCandidate> nodeSplits,
common::Span<const GradientPair> gradScans,
common::Span<const GradientPair> gradSums,
common::Span<const float> vals,
common::Span<const int> colIds,
common::Span<const NodeIdT> nodeAssigns,
common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
NodeIdT nodeStart, int len,
const GPUTrainingParam& param) {
int nodeId = nodeAssigns[id];
// @todo: this is really a bad check! but will be fixed when we move
// to key-based reduction
@ -293,45 +318,59 @@ DEV_INLINE void argMaxWithAtomics(
!((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
(vals[id] == vals[id - 1]))) {
if (nodeId != kUnusedNode) {
int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
int sumId = Abs2UniqueKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
GradientPair colSum = gradSums[sumId];
int uid = nodeId - nodeStart;
DeviceNodeStats n = nodes[nodeId];
GradientPair parentSum = n.sum_gradients;
float parentGain = n.root_gain;
DeviceNodeStats node_stat = nodes[nodeId];
GradientPair parentSum = node_stat.sum_gradients;
float parentGain = node_stat.root_gain;
bool tmp;
ExactSplitCandidate s;
GradientPair missing = parentSum - colSum;
s.score = LossChangeMissing(gradScans[id], missing, parentSum, parentGain,
param, tmp);
param, tmp);
s.index = id;
atomicArgMax(nodeSplits + uid, s);
AtomicArgMax(&nodeSplits[uid], s);
} // end if nodeId != UNUSED_NODE
} // end if id == 0 ...
}
// Global-memory variant of the per-node split argmax: each thread walks the
// value range with a grid-stride loop and competes for the best candidate
// via atomics on the global `nodeSplits` array.
__global__ void AtomicArgMaxByKeyGmem(
    common::Span<ExactSplitCandidate> nodeSplits,
    common::Span<const GradientPair> gradScans,
    common::Span<const GradientPair> gradSums,
    common::Span<const float> vals,
    common::Span<const int> colIds,
    common::Span<const NodeIdT> nodeAssigns,
    common::Span<const DeviceNodeStats> nodes,
    int nUniqKeys,
    NodeIdT nodeStart,
    int len,
    const TrainParam param) {
  // Grid-stride loop: correct for any launch configuration and any `len`.
  const int stride = blockDim.x * gridDim.x;
  for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < len;
       idx += stride) {
    ArgMaxWithAtomics(idx, nodeSplits, gradScans, gradSums, vals, colIds,
                      nodeAssigns, nodes, nUniqKeys, nodeStart, len,
                      GPUTrainingParam(param));
  }
}
__global__ void atomicArgMaxByKeySmem(
ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
const GradientPair* gradSums, const float* vals, const int* colIds,
const NodeIdT* nodeAssigns, const DeviceNodeStats* nodes, int nUniqKeys,
NodeIdT nodeStart, int len, const GPUTrainingParam param) {
__global__ void AtomicArgMaxByKeySmem(
common::Span<ExactSplitCandidate> nodeSplits,
common::Span<const GradientPair> gradScans,
common::Span<const GradientPair> gradSums,
common::Span<const float> vals,
common::Span<const int> colIds,
common::Span<const NodeIdT> nodeAssigns,
common::Span<const DeviceNodeStats> nodes,
int nUniqKeys, NodeIdT nodeStart, int len, const GPUTrainingParam param) {
extern __shared__ char sArr[];
ExactSplitCandidate* sNodeSplits =
reinterpret_cast<ExactSplitCandidate*>(sArr);
common::Span<ExactSplitCandidate> sNodeSplits =
common::Span<ExactSplitCandidate>(
reinterpret_cast<ExactSplitCandidate*>(sArr),
static_cast<typename common::Span<ExactSplitCandidate>::index_type>(
nUniqKeys * sizeof(ExactSplitCandidate)));
int tid = threadIdx.x;
ExactSplitCandidate defVal;
#pragma unroll 1
@ -342,13 +381,13 @@ __global__ void atomicArgMaxByKeySmem(
int id = tid + (blockIdx.x * blockDim.x);
const int stride = blockDim.x * gridDim.x;
for (; id < len; id += stride) {
argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
ArgMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
}
__syncthreads();
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
ExactSplitCandidate s = sNodeSplits[i];
atomicArgMax(nodeSplits + i, s);
AtomicArgMax(&nodeSplits[i], s);
}
}
@ -369,24 +408,28 @@ __global__ void atomicArgMaxByKeySmem(
* @param algo which algorithm to use for argmax_by_key
*/
template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
void argMaxByKey(ExactSplitCandidate* nodeSplits, const GradientPair* gradScans,
const GradientPair* gradSums, const float* vals,
const int* colIds, const NodeIdT* nodeAssigns,
const DeviceNodeStats* nodes, int nUniqKeys,
void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
common::Span<const GradientPair> gradScans,
common::Span<const GradientPair> gradSums,
common::Span<const float> vals,
common::Span<const int> colIds,
common::Span<const NodeIdT> nodeAssigns,
common::Span<const DeviceNodeStats> nodes,
int nUniqKeys,
NodeIdT nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
param.gpu_id, nodeSplits, nUniqKeys,
param.gpu_id, nodeSplits.data(), nUniqKeys,
ExactSplitCandidate());
int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
switch (algo) {
case kAbkGmem:
atomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
nUniqKeys, nodeStart, len, param);
break;
case kAbkSmem:
atomicArgMaxByKeySmem<<<nBlks, BLKDIM,
AtomicArgMaxByKeySmem<<<nBlks, BLKDIM,
sizeof(ExactSplitCandidate) * nUniqKeys>>>(
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
nUniqKeys, nodeStart, len, GPUTrainingParam(param));
@ -512,7 +555,7 @@ class GPUMaker : public TreeUpdater {
~GPUMaker() {}
void Init(
const std::vector<std::pair<std::string, std::string>>& args) override {
const std::vector<std::pair<std::string, std::string>>& args) {
param.InitAllowUnknown(args);
maxNodes = (1 << (param.max_depth + 1)) - 1;
maxLeaves = 1 << param.max_depth;
@ -521,7 +564,7 @@ class GPUMaker : public TreeUpdater {
}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) override {
const std::vector<RegTree*>& trees) {
GradStats::CheckInfo(dmat->Info());
// rescale learning rate according to size of trees
float lr = param.learning_rate;
@ -535,7 +578,7 @@ class GPUMaker : public TreeUpdater {
UpdateTree(gpair, dmat, trees[i]);
}
} catch (const std::exception& e) {
LOG(FATAL) << "GPU plugin exception: " << e.what() << std::endl;
LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
}
param.learning_rate = lr;
}
@ -543,7 +586,7 @@ class GPUMaker : public TreeUpdater {
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
RegTree* hTree) {
if (!allocated) {
setupOneTimeData(dmat);
SetupOneTimeData(dmat);
}
for (int i = 0; i < param.max_depth; ++i) {
if (i == 0) {
@ -563,11 +606,11 @@ class GPUMaker : public TreeUpdater {
}
void split2node(int nNodes, NodeIdT nodeStart) {
auto d_nodes = nodes.Data();
auto d_gradScans = gradScans.Data();
auto d_gradSums = gradSums.Data();
auto d_nodeAssigns = nodeAssigns.Current();
auto d_colIds = colIds.Data();
auto d_nodes = nodes.GetSpan();
auto d_gradScans = gradScans.GetSpan();
auto d_gradSums = gradSums.GetSpan();
auto d_nodeAssigns = nodeAssigns.CurrentSpan();
auto d_colIds = colIds.GetSpan();
auto d_vals = vals.Current();
auto d_nodeSplits = nodeSplits.Data();
int nUniqKeys = nNodes;
@ -580,7 +623,7 @@ class GPUMaker : public TreeUpdater {
if (s.isSplittable(min_split_loss)) {
int idx = s.index;
int nodeInstId =
abs2uniqKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
bool missingLeft = true;
const DeviceNodeStats& n = d_nodes[absNodeId];
GradientPair gradScan = d_gradScans[idx];
@ -612,13 +655,13 @@ class GPUMaker : public TreeUpdater {
}
// Find the best split for every node at `level`: scan/reduce gradients by
// (node, column) key, arg-max the loss change per node, then apply the
// winning splits. The diff rendering interleaved the stale pre-Span calls
// with the Span versions; this is the clean Span form.
void findSplit(int level, NodeIdT nodeStart, int nNodes) {
  ReduceScanByKey(gradSums.GetSpan(), gradScans.GetSpan(),
                  gradsInst.GetSpan(), instIds.CurrentSpan(),
                  nodeAssigns.CurrentSpan(), nVals, nNodes, nCols,
                  tmpScanGradBuff.GetSpan(), tmpScanKeyBuff.GetSpan(),
                  colIds.GetSpan(), nodeStart);
  // Shared-memory argmax is only worthwhile for shallow levels (few nodes);
  // fall back to the global-memory kernel deeper in the tree.
  ArgMaxByKey(nodeSplits.GetSpan(), gradScans.GetSpan(), gradSums.GetSpan(),
              vals.CurrentSpan(), colIds.GetSpan(), nodeAssigns.CurrentSpan(),
              nodes.GetSpan(), nNodes, nodeStart, nVals, param,
              level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
  split2node(nNodes, nodeStart);
}
@ -634,7 +677,7 @@ class GPUMaker : public TreeUpdater {
&tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
}
void setupOneTimeData(DMatrix* dmat) {
void SetupOneTimeData(DMatrix* dmat) {
size_t free_memory = dh::AvailableMemory(param.gpu_id);
if (!dmat->SingleColBlock()) {
LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
@ -726,11 +769,11 @@ class GPUMaker : public TreeUpdater {
// gather the node assignments across all other columns too
dh::Gather(param.gpu_id, nodeAssigns.Current(),
nodeAssignsPerInst.Data(), instIds.Current(), nVals);
sortKeys(level);
SortKeys(level);
}
}
void sortKeys(int level) {
void SortKeys(int level) {
// segmented-sort the arrays based on node-id's
// but we don't need more than level+1 bits for sorting!
SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,

View File

@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
*/
template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT, typename GradientSumT>
__device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_histogram,
TempStorageT* temp_storage) {
TempStorageT* temp_storage) {
__shared__ cub::Uninitialized<GradientSumT> uninitialized_sum;
GradientSumT& shared_sum = uninitialized_sum.Alias();

View File

@ -0,0 +1,48 @@
#include <gtest/gtest.h>
#include <xgboost/tree_updater.h>
#include <vector>
#include <string>
#include <utility>
#include "../helpers.h"
namespace xgboost {
namespace tree {
TEST(GPUExact, Update) {
  using Arg = std::pair<std::string, std::string>;
  std::vector<Arg> args{
      {"n_gpus", "1"},
      {"gpu_id", "0"},
      {"max_depth", "1"}};

  auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu");
  p_gpuexact_maker->Init(args);

  size_t constexpr n_rows = 4;
  size_t constexpr n_cols = 8;
  bst_float constexpr sparsity = 0.0f;

  auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3);
  std::vector<GradientPair> h_gpair(n_rows);
  // Alternating 0/1 gradients, unit hessians.
  for (size_t i = 0; i < n_rows; ++i) {
    h_gpair[i] = GradientPair(i % 2, 1);
  }
  HostDeviceVector<GradientPair> gpair(h_gpair);
  RegTree tree;

  p_gpuexact_maker->Update(&gpair, (*dmat).get(), {&tree});

  // max_depth == 1 => exactly one split: root plus two leaves.
  auto const& nodes = tree.GetNodes();
  ASSERT_EQ(nodes.size(), 3);

  float constexpr kRtEps = 1e-6;
  // Hessians were all 1, so sum_hess counts rows: 4 at the root, 2 per leaf.
  ASSERT_NEAR(tree.Stat(0).sum_hess, 4, kRtEps);
  ASSERT_NEAR(tree.Stat(1).sum_hess, 2, kRtEps);
  ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
  ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);

  // TreeUpdater::Create and CreateDMatrix both hand back heap allocations
  // ((*dmat).get() shows dmat is a pointer to a shared_ptr); free them so
  // the test does not leak.
  delete p_gpuexact_maker;
  delete dmat;
}
} // namespace tree
} // namespace xgboost