/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <cub/cub.cuh>
#include <xgboost/base.h>
#include "device_helpers.cuh"
#include "gpu_data.cuh"
#include "types.cuh"
#include "common.cuh"

namespace xgboost {
namespace tree {

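// BitFlagSet packs two kinds of flags into a single 64 bit word: bits
// 0..N_NODES-1 mark which node a row belongs to, and the MSB (bit 63) marks
// rows whose feature value differs from the value to their left. See
// CheckSplitValid below for how both are used together.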
typedef uint64_t BitFlagSet;

__device__ __inline__ void set_bit(BitFlagSet &bf, int index) { // NOLINT
  // Shift a 64-bit one: '1 << index' would be undefined for index >= 32,
  // and bit 63 is used as a flag below
  bf |= static_cast<BitFlagSet>(1) << index;
}

__device__ __inline__ bool check_bit(BitFlagSet bf, int index) {
  return (bf >> index) & 1;
}

// Carryover prefix for scanning multiple tiles of bit flags
struct FlagPrefixCallbackOp {
  BitFlagSet tile_carry;

  __device__ FlagPrefixCallbackOp() : tile_carry(0) {}

  __device__ BitFlagSet operator()(BitFlagSet block_aggregate) {
    BitFlagSet old_prefix = tile_carry;
    tile_carry |= block_aggregate;
    return old_prefix;
  }
};

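// Combined with the reset-on-MSB behaviour below, an exclusive scan with
// this operator acts as a segmented OR-scan: each run of identical feature
// values starts a fresh segment, so the scanned flags tell a thread which
// nodes have already appeared earlier within its run.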
// Scan op for bit flags that resets if the final bit is set
struct FlagScanOp {
  __device__ __forceinline__ BitFlagSet operator()(const BitFlagSet &a,
                                                   const BitFlagSet &b) {
    if (check_bit(b, 63)) {
      return b;
    } else {
      return a | b;
    }
  }
};

template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
struct FindSplitParamsMultiscan {
  enum {
    BLOCK_THREADS = _BLOCK_THREADS,
    TILE_ITEMS = BLOCK_THREADS,
    N_NODES = _N_NODES,
    N_WARPS = _BLOCK_THREADS / 32,
    DEBUG_VALIDATE = _DEBUG_VALIDATE,
    ITEMS_PER_THREAD = 1
  };
};

template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
struct ReduceParamsMultiscan {
  enum {
    BLOCK_THREADS = _BLOCK_THREADS,
    ITEMS_PER_THREAD = 1,
    TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    N_NODES = _N_NODES,
    N_WARPS = _BLOCK_THREADS / 32,
    DEBUG_VALIDATE = _DEBUG_VALIDATE
  };
};

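// First pass of the multiscan: one thread block per feature reduces the
// gradient pairs of its feature segment into a per-node total, for all
// N_NODES (= 2^level) nodes of the current tree level at once.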
template <typename ParamsT> struct ReduceEnactorMultiscan {
  typedef cub::WarpReduce<gpu_gpair> WarpReduceT;

  struct _TempStorage {
    typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
    gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  struct _Reduction {
    gpu_gpair node_sums[ParamsT::N_NODES];
  };

  struct Reduction : cub::Uninitialized<_Reduction> {};

  // Thread local member variables
  const ItemIter item_iter;
  _TempStorage &temp_storage;
  _Reduction &reduction;
  gpu_gpair gpair;
  NodeIdT node_id;
  NodeIdT node_id_adjusted;
  const int node_begin;

  __device__ __forceinline__
  ReduceEnactorMultiscan(TempStorage &temp_storage, // NOLINT
                         Reduction &reduction,      // NOLINT
                         const ItemIter item_iter, const int node_begin)
      : item_iter(item_iter), temp_storage(temp_storage.Alias()),
        reduction(reduction.Alias()), node_begin(node_begin) {}

  __device__ __forceinline__ void ResetPartials() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
      }
    }
  }

  __device__ __forceinline__ void ResetReduction() {
    if (threadIdx.x < ParamsT::N_NODES) {
      reduction.node_sums[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void LoadTile(const bst_uint &offset,
                                           const bst_uint &num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      node_id_adjusted = node_id - node_begin;
    } else {
      gpair = gpu_gpair();
      node_id = -1;
      node_id_adjusted = -1;
    }
  }

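  // Per-node accumulation for one tile. For each node the warp votes on
  // which lanes hold rows of that node: if none, the node is skipped; if
  // exactly one, that lane adds its gpair directly; otherwise a warp
  // reduction sums the active lanes. This avoids a full warp reduction per
  // node in the common case where a warp spans only a few nodes.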
  __device__ __forceinline__ void ProcessTile(const bst_uint &offset,
                                              const bst_uint &num_remaining) {
    LoadTile(offset, num_remaining);

    // Warp synchronous reduction
    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(active);

      int warp_id = threadIdx.x / 32;
      int lane_id = threadIdx.x % 32;

      if (ballot == 0) {
        continue;
      } else if (__popc(ballot) == 1) {
        if (active) {
          temp_storage.partial_sums[NODE][warp_id] += gpair;
        }
      } else {
        gpu_gpair sum = WarpReduceT(temp_storage.warp_reduce[warp_id])
                            .Sum(active ? gpair : gpu_gpair());
        if (lane_id == 0) {
          temp_storage.partial_sums[NODE][warp_id] += sum;
        }
      }
    }
  }

  __device__ __forceinline__ void ReducePartials() {
    // Use single warp to reduce partials
    if (threadIdx.x < 32) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        gpu_gpair sum =
            WarpReduceT(temp_storage.warp_reduce[0])
                .Sum(threadIdx.x < ParamsT::N_WARPS
                         ? temp_storage.partial_sums[NODE][threadIdx.x]
                         : gpu_gpair());

        if (threadIdx.x == 0) {
          reduction.node_sums[NODE] = sum;
        }
      }
    }
  }

  __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                const bst_uint &segment_end) {
    // Current position
    bst_uint offset = segment_begin;

    ResetReduction();
    ResetPartials();

    __syncthreads();

    // Process all tiles; the final tile may be partial
    while (offset < segment_end) {
      ProcessTile(offset, segment_end - offset);
      offset += ParamsT::TILE_ITEMS;
    }

    __syncthreads();

    ReducePartials();

    __syncthreads();
  }
};

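// Second pass of the multiscan: exclusive-scan the gradient pairs of each
// node along the feature segment and evaluate a candidate split at every
// position. The per-node totals from ReduceEnactorMultiscan supply the
// 'missing' statistics (rows of the node that lack this feature).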
template <typename ParamsT, typename ReductionT>
struct FindSplitEnactorMultiscan {
  typedef cub::BlockScan<BitFlagSet, ParamsT::BLOCK_THREADS> FlagsBlockScanT;
  typedef cub::WarpReduce<Split> WarpSplitReduceT;
  typedef cub::WarpReduce<float> WarpReduceT;
  typedef cub::WarpScan<gpu_gpair> WarpScanT;

  struct _TempStorage {
    union {
      typename WarpSplitReduceT::TempStorage warp_split_reduce;
      typename FlagsBlockScanT::TempStorage flags_scan;
      typename WarpScanT::TempStorage warp_gpair_scan[ParamsT::N_WARPS];
      typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
    };

    Split warp_best_splits[ParamsT::N_NODES][ParamsT::N_WARPS];
    gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
    gpu_gpair top_level_sum[ParamsT::N_NODES]; // Sum of current partial sums
    gpu_gpair tile_carry[ParamsT::N_NODES];    // Contains top-level sums from
                                               // previous tiles
    Split best_splits[ParamsT::N_NODES];
    // Cache current level nodes into shared memory
    float node_root_gain[ParamsT::N_NODES];
    gpu_gpair node_parent_sum[ParamsT::N_NODES];
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  // Thread local member variables
  const ItemIter item_iter;
  Split *d_split_candidates_out;
  const Node *d_nodes;
  _TempStorage &temp_storage;
  gpu_gpair gpair;
  float fvalue;
  NodeIdT node_id;
  NodeIdT node_id_adjusted;
  const NodeIdT node_begin;
  const GPUTrainingParam &param;
  const ReductionT &reduction;
  const int level;
  FlagPrefixCallbackOp flag_prefix_op;

  __device__ __forceinline__ FindSplitEnactorMultiscan(
      TempStorage &temp_storage, const ItemIter item_iter, // NOLINT
      Split *d_split_candidates_out, const Node *d_nodes,
      const NodeIdT node_begin, const GPUTrainingParam &param,
      // Take the reduction by reference: the 'reduction' member is a
      // reference, and binding it to a by-value parameter would leave it
      // dangling once the constructor returns
      const ReductionT &reduction, const int level)
      : item_iter(item_iter), d_split_candidates_out(d_split_candidates_out),
        d_nodes(d_nodes), temp_storage(temp_storage.Alias()),
        node_begin(node_begin), param(param), reduction(reduction),
        level(level), flag_prefix_op() {}

  __device__ __forceinline__ void UpdateTileCarry() {
    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.tile_carry[threadIdx.x] +=
          temp_storage.top_level_sum[threadIdx.x];
    }
  }

  __device__ __forceinline__ void ResetTileCarry() {
    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.tile_carry[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void ResetPartials() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
      }
    }

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.top_level_sum[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void ResetSplits() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.warp_best_splits[NODE][threadIdx.x] = Split();
      }
    }

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.best_splits[threadIdx.x] = Split();
    }
  }

  // Cache d_nodes array for this level into shared memory
  __device__ __forceinline__ void CacheNodes() {
    // Get pointer to nodes on the current level
    const Node *d_nodes_level = d_nodes + node_begin;

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.node_root_gain[threadIdx.x] =
          d_nodes_level[threadIdx.x].root_gain;
      temp_storage.node_parent_sum[threadIdx.x] =
          d_nodes_level[threadIdx.x].sum_gradients;
    }
  }

  __device__ __forceinline__ void LoadTile(bst_uint offset,
                                           bst_uint num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      fvalue = thrust::get<1>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      node_id_adjusted = node_id - node_begin;
    } else {
      node_id = -1;
      node_id_adjusted = -1;
      fvalue = -FLT_MAX;
      gpair = gpu_gpair();
    }
  }

  // Is this node being processed by the current kernel iteration?
  __device__ __forceinline__ bool NodeActive() {
    return node_id_adjusted < ParamsT::N_NODES && node_id_adjusted >= 0;
  }

  // Is the current fvalue different from its left neighbour's fvalue?
  __device__ __forceinline__ bool
  LeftMostFvalue(const bst_uint &segment_begin, const bst_uint &offset,
                 const bst_uint &num_remaining) {
    int left_index = offset + threadIdx.x - 1;
    float left_fvalue = left_index >= static_cast<int>(segment_begin) &&
                                threadIdx.x < num_remaining
                            ? thrust::get<1>(item_iter[left_index])
                            : -FLT_MAX;

    return left_fvalue != fvalue;
  }

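  // A split at a thread's position is valid if its fvalue starts a new run
  // of feature values, or if no earlier row of the same node occurs within
  // the current run. The segmented flag scan (see FlagScanOp) computes, for
  // each thread, which nodes have already appeared since its run began.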
  // Prevent splitting in the middle of same-valued instances
  __device__ __forceinline__ bool
  CheckSplitValid(const bst_uint &segment_begin, const bst_uint &offset,
                  const bst_uint &num_remaining) {
    BitFlagSet bit_flag = 0;

    bool valid_split = false;

    if (LeftMostFvalue(segment_begin, offset, num_remaining)) {
      valid_split = true;
      // Use MSB bit to flag if fvalue is leftmost
      set_bit(bit_flag, 63);
    }

    // Flag nodeid
    if (NodeActive()) {
      set_bit(bit_flag, node_id_adjusted);
    }

    FlagsBlockScanT(temp_storage.flags_scan)
        .ExclusiveScan(bit_flag, bit_flag, FlagScanOp(), flag_prefix_op);
    __syncthreads();

    if (!valid_split && NodeActive()) {
      if (!check_bit(bit_flag, node_id_adjusted)) {
        valid_split = true;
      }
    }

    return valid_split;
  }

  // Perform warp reduction to find if this lane contains the best loss_chg
  // in the warp
  __device__ __forceinline__ bool QueryLaneBestLoss(const float &loss_chg) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;

    // Possible source of bugs. Not all threads in warp are active here. Not
    // sure if reduce function will behave correctly
    float best = WarpReduceT(temp_storage.warp_reduce[warp_id])
                     .Reduce(loss_chg, cub::Max());

    // It's possible for more than one lane to contain the best value, so
    // make sure only one lane returns true
    unsigned int ballot = __ballot(loss_chg == best);

    if (lane_id == (__ffs(ballot) - 1)) {
      return true;
    } else {
      return false;
    }
  }

  // Which thread in this warp should update the current best split, if any.
  // Returns true for one thread or none.
  __device__ __forceinline__ bool
  QueryUpdateWarpSplit(const float &loss_chg,
                       volatile const float &warp_best_loss) {
    bool update = false;

    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(loss_chg > warp_best_loss && active);

      // No lane has improved loss_chg
      if (__popc(ballot) == 0) {
        continue;
      } else if (__popc(ballot) == 1) {
        // A single lane has improved loss_chg, set true for this lane
        int lane_id = threadIdx.x % 32;

        if (lane_id == __ffs(ballot) - 1) {
          update = true;
        }
      } else {
        // More than one lane has improved loss_chg, perform a reduction.
        if (QueryLaneBestLoss(active ? loss_chg : -FLT_MAX)) {
          update = true;
        }
      }
    }

    return update;
  }

  // Debugging helper: print the tile scan state for a given block
  __device__ void PrintTileScan(int block_id, bool thread_active,
                                float loss_chg, gpu_gpair missing) {
    if (blockIdx.x != block_id) {
      return;
    }

    for (int warp = 0; warp < ParamsT::N_WARPS; warp++) {
      if (threadIdx.x / 32 == warp) {
        for (int lane = 0; lane < 32; lane++) {
          gpu_gpair g = cub::ShuffleIndex(gpair, lane);
          gpu_gpair missing_broadcast = cub::ShuffleIndex(missing, lane);
          float fvalue_broadcast = __shfl(fvalue, lane);
          bool thread_active_broadcast = __shfl(thread_active, lane);
          float loss_chg_broadcast = __shfl(loss_chg, lane);
          NodeIdT node_id_broadcast = cub::ShuffleIndex(node_id, lane);
          if (threadIdx.x == 32 * warp) {
            printf("tid %d, nid %d, fvalue %1.2f, active %c, loss %1.2f, scan ",
                   threadIdx.x + lane, node_id_broadcast, fvalue_broadcast,
                   thread_active_broadcast ? 'y' : 'n',
                   loss_chg_broadcast < 0.0f ? 0 : loss_chg_broadcast);
            g.print();
          }
        }
      }

      __syncthreads();
    }
  }

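  // Candidate split statistics at each position: gpair holds the exclusive
  // scan of gradients to the left within the node, the reduction holds the
  // node's total over this feature, and 'missing' (parent sum minus feature
  // sum) is the mass of rows that lack this feature entirely.
  // loss_chg_missing evaluates the gain with the missing mass sent either
  // left or right and reports the chosen direction through 'missing_left'.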
  __device__ __forceinline__ void
  EvaluateSplits(const bst_uint &segment_begin, const bst_uint &offset,
                 const bst_uint &num_remaining) {
    bool valid_split = CheckSplitValid(segment_begin, offset, num_remaining);

    bool thread_active =
        NodeActive() && valid_split && threadIdx.x < num_remaining;

    const int warp_id = threadIdx.x / 32;

    gpu_gpair parent_sum = thread_active
                               ? temp_storage.node_parent_sum[node_id_adjusted]
                               : gpu_gpair();
    float parent_gain =
        thread_active ? temp_storage.node_root_gain[node_id_adjusted] : 0.0f;
    gpu_gpair missing =
        thread_active ? parent_sum - reduction.node_sums[node_id_adjusted]
                      : gpu_gpair();

    bool missing_left;

    float loss_chg = thread_active
                         ? loss_chg_missing(gpair, missing, parent_sum,
                                            parent_gain, param, missing_left)
                         : -FLT_MAX;

    // PrintTileScan(64, thread_active, loss_chg, missing);

    float warp_best_loss =
        thread_active
            ? temp_storage.warp_best_splits[node_id_adjusted][warp_id].loss_chg
            : 0.0f;

    if (QueryUpdateWarpSplit(loss_chg, warp_best_loss)) {
      float fvalue_split = fvalue - FVALUE_EPS;

      // The left sum includes the missing mass only if it is sent left;
      // the update is otherwise identical for both directions
      gpu_gpair left_sum = missing_left ? missing + gpair : gpair;
      gpu_gpair right_sum = parent_sum - left_sum;
      temp_storage.warp_best_splits[node_id_adjusted][warp_id].Update(
          loss_chg, missing_left, fvalue_split, blockIdx.x, left_sum,
          right_sum, param);
    }
  }

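  // Block-wide exclusive segmented scan of gradient pairs, one segment per
  // node: each warp scans its own lanes, the first warp then scans the
  // per-warp sums, and each thread finally adds its warp's prefix plus the
  // carry from previous tiles.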
  __device__ __forceinline__ void BlockExclusiveScan() {
    ResetPartials();

    __syncthreads();
    int warp_id = threadIdx.x / 32;
    int lane_id = threadIdx.x % 32;

    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool node_active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(node_active);

      gpu_gpair warp_sum = gpu_gpair();
      gpu_gpair scan_result = gpu_gpair();

      if (ballot > 0) {
        WarpScanT(temp_storage.warp_gpair_scan[warp_id])
            .InclusiveScan(node_active ? gpair : gpu_gpair(), scan_result,
                           cub::Sum(), warp_sum);
      }

      // Convert the inclusive scan into an exclusive scan
      if (node_active) {
        gpair = scan_result - gpair;
      }

      if (lane_id == 0) {
        temp_storage.partial_sums[NODE][warp_id] = warp_sum;
      }
    }

    __syncthreads();

    if (threadIdx.x < 32) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        gpu_gpair top_level_sum;
        bool warp_active = threadIdx.x < ParamsT::N_WARPS;
        gpu_gpair scan_result;
        WarpScanT(temp_storage.warp_gpair_scan[warp_id])
            .InclusiveScan(warp_active
                               ? temp_storage.partial_sums[NODE][threadIdx.x]
                               : gpu_gpair(),
                           scan_result, cub::Sum(), top_level_sum);

        if (warp_active) {
          temp_storage.partial_sums[NODE][threadIdx.x] =
              scan_result - temp_storage.partial_sums[NODE][threadIdx.x];
        }

        if (threadIdx.x == 0) {
          temp_storage.top_level_sum[NODE] = top_level_sum;
        }
      }
    }

    __syncthreads();

    if (NodeActive()) {
      gpair += temp_storage.partial_sums[node_id_adjusted][warp_id] +
               temp_storage.tile_carry[node_id_adjusted];
    }

    __syncthreads();

    UpdateTileCarry();

    __syncthreads();
  }

  __device__ __forceinline__ void ProcessTile(const bst_uint &segment_begin,
                                              const bst_uint &offset,
                                              const bst_uint &num_remaining) {
    LoadTile(offset, num_remaining);
    BlockExclusiveScan();
    EvaluateSplits(segment_begin, offset, num_remaining);
  }

  __device__ __forceinline__ void ReduceSplits() {
    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      if (threadIdx.x < 32) {
        Split s = Split();
        if (threadIdx.x < ParamsT::N_WARPS) {
          s = temp_storage.warp_best_splits[NODE][threadIdx.x];
        }
        Split best = WarpSplitReduceT(temp_storage.warp_split_reduce)
                         .Reduce(s, split_reduce_op());
        if (threadIdx.x == 0) {
          temp_storage.best_splits[NODE] = best;
        }
      }
    }
  }

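  // Candidate splits are written feature-major: the Split for node n found
  // by feature f lands at d_split_candidates_out[f * 2^level + n]. The best
  // candidate across features is presumably selected in a later step.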
  __device__ __forceinline__ void WriteBestSplits() {
    const int nodes_level = 1 << level;

    if (threadIdx.x < ParamsT::N_NODES) {
      d_split_candidates_out[blockIdx.x * nodes_level + threadIdx.x] =
          temp_storage.best_splits[threadIdx.x];
    }
  }

  __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                const bst_uint &segment_end) {
    // Current position
    bst_uint offset = segment_begin;

    ResetTileCarry();
    ResetSplits();
    CacheNodes();
    __syncthreads();

    // Process all tiles; the final tile may be partial
    while (offset < segment_end) {
      ProcessTile(segment_begin, offset, segment_end - offset);
      __syncthreads();
      offset += ParamsT::TILE_ITEMS;
    }

    __syncthreads();
    ReduceSplits();

    __syncthreads();
    WriteBestSplits();
  }
};

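// One thread block per feature; blocks whose feature is disabled via
// d_feature_flags exit immediately. The reduce and find-split phases run
// back to back, sharing one block of shared memory through a union.
// __launch_bounds__(1024, 2) caps register usage so that two such blocks
// can be resident per SM on older (<= sm_53) architectures.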
template <typename FindSplitParamsT, typename ReduceParamsT>
__global__ void
#if __CUDA_ARCH__ <= 530
__launch_bounds__(1024, 2)
#endif
    find_split_candidates_multiscan_kernel(
        const ItemIter items_iter, Split *d_split_candidates_out,
        const Node *d_nodes, const int node_begin, bst_uint num_items,
        int num_features, const int *d_feature_offsets,
        const GPUTrainingParam param, const int *d_feature_flags,
        const int level) {
  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
    return;
  }

  int segment_begin = d_feature_offsets[blockIdx.x];
  int segment_end = d_feature_offsets[blockIdx.x + 1];

  typedef ReduceEnactorMultiscan<ReduceParamsT> ReduceT;
  typedef FindSplitEnactorMultiscan<FindSplitParamsT,
                                    typename ReduceT::_Reduction>
      FindSplitT;

  __shared__ union {
    typename ReduceT::TempStorage reduce;
    typename FindSplitT::TempStorage find_split;
  } temp_storage;

  __shared__ typename ReduceT::Reduction reduction;

  ReduceT(temp_storage.reduce, reduction, items_iter, node_begin)
      .ProcessRegion(segment_begin, segment_end);
  __syncthreads();

  FindSplitT find_split(temp_storage.find_split, items_iter,
                        d_split_candidates_out, d_nodes, node_begin, param,
                        reduction.Alias(), level);
  find_split.ProcessRegion(segment_begin, segment_end);
}

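// Nodes are laid out in breadth-first (heap) order, so the current level
// occupies indices [2^level - 1, 2^(level+1) - 1) and contains
// N_NODES = 2^level nodes; hence node_begin = (1 << level) - 1 below.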
template <int N_NODES>
void find_split_candidates_multiscan_variation(GPUData *data, const int level) {
  const int node_begin = (1 << level) - 1;
  const int BLOCK_THREADS = 512;

  CHECK(BLOCK_THREADS / 32 < 32)
      << "Too many active warps. See FindSplitEnactor - ReduceSplits.";

  typedef FindSplitParamsMultiscan<BLOCK_THREADS, N_NODES, false>
      find_split_params;
  typedef ReduceParamsMultiscan<BLOCK_THREADS, N_NODES, false> reduce_params;
  int grid_size = data->n_features;

  find_split_candidates_multiscan_kernel<
      find_split_params,
      reduce_params><<<grid_size, find_split_params::BLOCK_THREADS>>>(
      data->items_iter, data->split_candidates.data(), data->nodes.data(),
      node_begin, data->fvalues.size(), data->n_features, data->foffsets.data(),
      data->param, data->feature_flags.data(), level);

  dh::safe_cuda(cudaDeviceSynchronize());
}

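// Dispatching on 'level' keeps N_NODES a compile-time constant, so the
// shared memory arrays above can be statically sized. Only the first five
// tree levels (up to 16 nodes per level) are handled by this multiscan
// strategy.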
void find_split_candidates_multiscan(GPUData *data, const int level) {
  // Select templated variation of split finding algorithm
  switch (level) {
    case 0:
      find_split_candidates_multiscan_variation<1>(data, level);
      break;
    case 1:
      find_split_candidates_multiscan_variation<2>(data, level);
      break;
    case 2:
      find_split_candidates_multiscan_variation<4>(data, level);
      break;
    case 3:
      find_split_candidates_multiscan_variation<8>(data, level);
      break;
    case 4:
      find_split_candidates_multiscan_variation<16>(data, level);
      break;
  }
}
}  // namespace tree
}  // namespace xgboost