Further optimisations for gpu_hist. (#4283)

- Fuse final update position functions into a single more efficient kernel

- Refactor gpu_hist with a more explicit ELLPACK matrix representation (see the layout sketch below)
Rory Mitchell 2019-03-24 17:17:22 +13:00 committed by GitHub
parent 5aa42b5f11
commit 6d5b34d824
5 changed files with 345 additions and 297 deletions
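
The refactor below replaces loose row_stride/gidx/null_gidx_value members with a single ELLPackMatrix view. As a quick orientation, here is a minimal host-side sketch of the ELLPACK layout it implements; the names in this sketch are illustrative and not part of the commit, but the indexing (ridx * row_stride + k, with a padding bin for absent entries) mirrors what the device code below does.

// Illustrative ELLPACK layout sketch (not code from this commit).
#include <cstddef>
#include <vector>

struct EllpackSketch {
  std::size_t row_stride;      // widest row, in stored entries
  int null_gidx_value;         // padding bin id for absent entries
  std::vector<int> gidx;       // n_rows * row_stride bin ids, row-major

  // Bin id of the k-th stored entry of row ridx (may be the padding value).
  int Get(std::size_t ridx, std::size_t k) const {
    return gidx[ridx * row_stride + k];
  }
  bool IsMissing(std::size_t ridx, std::size_t k) const {
    return Get(ridx, k) == null_gidx_value;
  }
};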

include/xgboost/tree_model.h

@ -93,65 +93,63 @@ class RegTree {
"Node: 64 bit align"); "Node: 64 bit align");
} }
/*! \brief index of left child */ /*! \brief index of left child */
int LeftChild() const { XGBOOST_DEVICE int LeftChild() const {
return this->cleft_; return this->cleft_;
} }
/*! \brief index of right child */ /*! \brief index of right child */
int RightChild() const { XGBOOST_DEVICE int RightChild() const {
return this->cright_; return this->cright_;
} }
/*! \brief index of default child when feature is missing */ /*! \brief index of default child when feature is missing */
int DefaultChild() const { XGBOOST_DEVICE int DefaultChild() const {
return this->DefaultLeft() ? this->LeftChild() : this->RightChild(); return this->DefaultLeft() ? this->LeftChild() : this->RightChild();
} }
/*! \brief feature index of split condition */ /*! \brief feature index of split condition */
unsigned SplitIndex() const { XGBOOST_DEVICE unsigned SplitIndex() const {
return sindex_ & ((1U << 31) - 1U); return sindex_ & ((1U << 31) - 1U);
} }
/*! \brief when feature is unknown, whether goes to left child */ /*! \brief when feature is unknown, whether goes to left child */
bool DefaultLeft() const { XGBOOST_DEVICE bool DefaultLeft() const {
return (sindex_ >> 31) != 0; return (sindex_ >> 31) != 0;
} }
/*! \brief whether current node is leaf node */ /*! \brief whether current node is leaf node */
bool IsLeaf() const { XGBOOST_DEVICE bool IsLeaf() const {
return cleft_ == -1; return cleft_ == -1;
} }
/*! \return get leaf value of leaf node */ /*! \return get leaf value of leaf node */
bst_float LeafValue() const { XGBOOST_DEVICE bst_float LeafValue() const {
return (this->info_).leaf_value; return (this->info_).leaf_value;
} }
/*! \return get split condition of the node */ /*! \return get split condition of the node */
SplitCondT SplitCond() const { XGBOOST_DEVICE SplitCondT SplitCond() const {
return (this->info_).split_cond; return (this->info_).split_cond;
} }
/*! \brief get parent of the node */ /*! \brief get parent of the node */
int Parent() const { XGBOOST_DEVICE int Parent() const {
return parent_ & ((1U << 31) - 1); return parent_ & ((1U << 31) - 1);
} }
/*! \brief whether current node is left child */ /*! \brief whether current node is left child */
bool IsLeftChild() const { XGBOOST_DEVICE bool IsLeftChild() const {
return (parent_ & (1U << 31)) != 0; return (parent_ & (1U << 31)) != 0;
} }
/*! \brief whether this node is deleted */ /*! \brief whether this node is deleted */
bool IsDeleted() const { XGBOOST_DEVICE bool IsDeleted() const {
return sindex_ == std::numeric_limits<unsigned>::max(); return sindex_ == std::numeric_limits<unsigned>::max();
} }
/*! \brief whether current node is root */ /*! \brief whether current node is root */
bool IsRoot() const { XGBOOST_DEVICE bool IsRoot() const { return parent_ == -1; }
return parent_ == -1;
}
/*! /*!
* \brief set the left child * \brief set the left child
* \param nid node id to right child * \param nid node id to right child
*/ */
void SetLeftChild(int nid) { XGBOOST_DEVICE void SetLeftChild(int nid) {
this->cleft_ = nid; this->cleft_ = nid;
} }
/*! /*!
* \brief set the right child * \brief set the right child
* \param nid node id to right child * \param nid node id to right child
*/ */
void SetRightChild(int nid) { XGBOOST_DEVICE void SetRightChild(int nid) {
this->cright_ = nid; this->cright_ = nid;
} }
/*! /*!
@ -160,7 +158,7 @@ class RegTree {
* \param split_cond split condition * \param split_cond split condition
* \param default_left the default direction when feature is unknown * \param default_left the default direction when feature is unknown
*/ */
void SetSplit(unsigned split_index, SplitCondT split_cond, XGBOOST_DEVICE void SetSplit(unsigned split_index, SplitCondT split_cond,
bool default_left = false) { bool default_left = false) {
if (default_left) split_index |= (1U << 31); if (default_left) split_index |= (1U << 31);
this->sindex_ = split_index; this->sindex_ = split_index;
@ -172,17 +170,17 @@ class RegTree {
* \param right right index, could be used to store * \param right right index, could be used to store
* additional information * additional information
*/ */
void SetLeaf(bst_float value, int right = -1) { XGBOOST_DEVICE void SetLeaf(bst_float value, int right = -1) {
(this->info_).leaf_value = value; (this->info_).leaf_value = value;
this->cleft_ = -1; this->cleft_ = -1;
this->cright_ = right; this->cright_ = right;
} }
/*! \brief mark that this node is deleted */ /*! \brief mark that this node is deleted */
void MarkDelete() { XGBOOST_DEVICE void MarkDelete() {
this->sindex_ = std::numeric_limits<unsigned>::max(); this->sindex_ = std::numeric_limits<unsigned>::max();
} }
// set parent // set parent
void SetParent(int pidx, bool is_left_child = true) { XGBOOST_DEVICE void SetParent(int pidx, bool is_left_child = true) {
if (is_left_child) pidx |= (1U << 31); if (is_left_child) pidx |= (1U << 31);
this->parent_ = pidx; this->parent_ = pidx;
} }
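
Every RegTree::Node accessor above gains the XGBOOST_DEVICE qualifier so that the new UpdatePosition/FinalisePosition kernels can copy nodes to the device and traverse them directly. For reference, a sketch of the usual definition of this macro (the exact definition lives in the xgboost headers; this is an assumption for illustration): it marks a function callable from both host and device when the file is compiled by nvcc.

#if defined(__CUDACC__)
#define XGBOOST_DEVICE __host__ __device__
#else
#define XGBOOST_DEVICE
#endif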

src/predictor/gpu_predictor.cu

@ -303,6 +303,7 @@ class GPUPredictor : public xgboost::Predictor {
const gbm::GBTreeModel& model, size_t tree_begin, const gbm::GBTreeModel& model, size_t tree_begin,
size_t tree_end) { size_t tree_end) {
if (tree_end - tree_begin == 0) { return; } if (tree_end - tree_begin == 0) { return; }
monitor_.StartCuda("DevicePredictInternal");
CHECK_EQ(model.param.size_leaf_vector, 0); CHECK_EQ(model.param.size_leaf_vector, 0);
// Copy decision trees to device // Copy decision trees to device
@ -337,6 +338,7 @@ class GPUPredictor : public xgboost::Predictor {
}); });
i_batch++; i_batch++;
} }
monitor_.StopCuda("DevicePredictInternal");
} }
public: public:
@ -388,9 +390,11 @@ class GPUPredictor : public xgboost::Predictor {
if (it != cache_.end()) { if (it != cache_.end()) {
const HostDeviceVector<bst_float>& y = it->second.predictions; const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) { if (y.Size() != 0) {
monitor_.StartCuda("PredictFromCache");
out_preds->Reshard(y.Distribution()); out_preds->Reshard(y.Distribution());
out_preds->Resize(y.Size()); out_preds->Resize(y.Size());
out_preds->Copy(y); out_preds->Copy(y);
monitor_.StopCuda("PredictFromCache");
return true; return true;
} }
} }
@ -481,6 +485,7 @@ class GPUPredictor : public xgboost::Predictor {
std::unique_ptr<Predictor> cpu_predictor_; std::unique_ptr<Predictor> cpu_predictor_;
std::vector<DeviceShard> shards_; std::vector<DeviceShard> shards_;
GPUSet devices_; GPUSet devices_;
common::Monitor monitor_;
}; };
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor") XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")

src/tree/updater_gpu_hist.cu

@ -50,6 +50,133 @@ struct GPUHistMakerTrainParam
DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);
struct ExpandEntry {
int nid;
int depth;
DeviceSplitCandidate split;
uint64_t timestamp;
ExpandEntry() = default;
ExpandEntry(int nid, int depth, DeviceSplitCandidate split,
uint64_t timestamp)
: nid(nid), depth(depth), split(std::move(split)), timestamp(timestamp) {}
bool IsValid(const TrainParam& param, int num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
return false;
}
if (param.max_depth > 0 && depth == param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false;
return true;
}
static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
}
friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
os << "ExpandEntry: \n";
os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n";
os << "loss: " << e.split.loss_chg << "\n";
os << "left_sum: " << e.split.left_sum << "\n";
os << "right_sum: " << e.split.right_sum << "\n";
return os;
}
};
inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.depth == rhs.depth) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.depth > rhs.depth; // favor small depth
}
}
inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.split.loss_chg == rhs.split.loss_chg) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.split.loss_chg < rhs.split.loss_chg; // favor large loss_chg
}
}
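These two comparators order the tree-expansion priority queue (the ExpandQueue alias appears later in this file). A hedged sketch of how they would be selected, assuming a depthwise-vs-lossguide policy flag; the function and flag names here are illustrative, not from this commit:

#include <functional>
#include <queue>
#include <vector>

using ExpandQueue =
    std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                        std::function<bool(ExpandEntry, ExpandEntry)>>;

ExpandQueue MakeExpandQueue(bool depth_wise_policy) {
  // Depthwise growth expands shallow nodes first; lossguide expands the
  // candidate with the largest loss reduction first.
  return ExpandQueue(depth_wise_policy ? DepthWise : LossGuide);
}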
// Find a gidx value for a given feature otherwise return -1 if not found
__device__ int BinarySearchRow(bst_uint begin, bst_uint end,
common::CompressedIterator<uint32_t> data,
int const fidx_begin, int const fidx_end) {
bst_uint previous_middle = UINT32_MAX;
while (end != begin) {
auto middle = begin + (end - begin) / 2;
if (middle == previous_middle) {
break;
}
previous_middle = middle;
auto gidx = data[middle];
if (gidx >= fidx_begin && gidx < fidx_end) {
return gidx;
} else if (gidx < fidx_begin) {
begin = middle;
} else {
end = middle;
}
}
// Value is missing
return -1;
}
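The binary search works because, within a row of the compressed matrix, gidx values are unique and increasing: entries are stored in feature order and bin ids are allocated cumulatively per feature, so at most one entry falls in [fidx_begin, fidx_end). A simplified host-side sketch of the same invariant (illustration only; it uses the standard begin = middle + 1 step instead of the previous_middle guard above):

#include <cstdint>
#include <vector>

int BinarySearchRowHost(const std::vector<uint32_t>& row,
                        uint32_t fidx_begin, uint32_t fidx_end) {
  std::size_t begin = 0, end = row.size();
  while (begin < end) {
    std::size_t middle = begin + (end - begin) / 2;
    uint32_t gidx = row[middle];
    if (gidx >= fidx_begin && gidx < fidx_end) return static_cast<int>(gidx);
    if (gidx < fidx_begin) begin = middle + 1; else end = middle;
  }
  return -1;  // feature not present in this row
}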
/** \brief Struct for accessing and manipulating an ellpack matrix on the
* device. Does not own underlying memory and may be trivially copied into
* kernels.*/
struct ELLPackMatrix {
common::Span<uint32_t> feature_segments;
/*! \brief minimum value for each feature. */
common::Span<bst_float> min_fvalue;
/*! \brief Cut. */
common::Span<bst_float> gidx_fvalue_map;
/*! \brief row length for ELLPack. */
size_t row_stride{0};
common::CompressedIterator<uint32_t> gidx_iter;
bool is_dense;
int null_gidx_value;
XGBOOST_DEVICE size_t BinCount() const { return gidx_fvalue_map.size(); }
// Get a matrix element, uses binary search for look up
// Return NaN if missing
__device__ bst_float GetElement(size_t ridx, size_t fidx) const {
auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride;
auto gidx = -1;
if (is_dense) {
gidx = gidx_iter[row_begin + fidx];
} else {
gidx =
BinarySearchRow(row_begin, row_end, gidx_iter, feature_segments[fidx],
feature_segments[fidx + 1]);
}
if (gidx == -1) {
return nan("");
}
return gidx_fvalue_map[gidx];
}
void Init(common::Span<uint32_t> feature_segments,
common::Span<bst_float> min_fvalue,
common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
int null_gidx_value) {
this->feature_segments = feature_segments;
this->min_fvalue = min_fvalue;
this->gidx_fvalue_map = gidx_fvalue_map;
this->row_stride = row_stride;
this->gidx_iter = gidx_iter;
this->is_dense = is_dense;
this->null_gidx_value = null_gidx_value;
}
};
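Because ELLPackMatrix owns no memory, it can be captured by value inside a kernel, which is exactly how the position-update kernels below use it. A small hedged usage sketch (not part of the diff; kernel and buffer names are illustrative), showing that GetElement returns NaN for missing entries and the caller handles that case explicitly:

__global__ void ExampleLookupKernel(ELLPackMatrix matrix,
                                    const bst_uint* d_ridx, int fidx,
                                    bst_float* d_out, size_t n) {
  size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) { return; }
  bst_float fvalue = matrix.GetElement(d_ridx[idx], fidx);
  d_out[idx] = isnan(fvalue) ? 0.0f : fvalue;  // treat missing as zero here
}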
// With constraints // With constraints
template <typename GradientPairT> template <typename GradientPairT>
XGBOOST_DEVICE float inline LossChangeMissing( XGBOOST_DEVICE float inline LossChangeMissing(
@ -111,19 +238,17 @@ __device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_h
template <int BLOCK_THREADS, typename ReduceT, typename ScanT, template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
typename MaxReduceT, typename TempStorageT, typename GradientSumT> typename MaxReduceT, typename TempStorageT, typename GradientSumT>
__device__ void EvaluateFeature( __device__ void EvaluateFeature(
int fidx, int fidx, common::Span<const GradientSumT> node_histogram,
common::Span<const GradientSumT> node_histogram, const ELLPackMatrix& matrix,
common::Span<const uint32_t> feature_segments, // cut.row_ptr
float min_fvalue, // cut.min_value
common::Span<const float> gidx_fvalue_map, // cut.cut
DeviceSplitCandidate* best_split, // shared memory storing best split DeviceSplitCandidate* best_split, // shared memory storing best split
const DeviceNodeStats& node, const GPUTrainingParam& param, const DeviceNodeStats& node, const GPUTrainingParam& param,
TempStorageT* temp_storage, // temp memory for cub operations TempStorageT* temp_storage, // temp memory for cub operations
int constraint, // monotonic_constraints int constraint, // monotonic_constraints
const ValueConstraint& value_constraint) { const ValueConstraint& value_constraint) {
// Use pointer from cut to indicate begin and end of bins for each feature. // Use pointer from cut to indicate begin and end of bins for each feature.
uint32_t gidx_begin = feature_segments[fidx]; // begining bin uint32_t gidx_begin = matrix.feature_segments[fidx]; // begining bin
uint32_t gidx_end = feature_segments[fidx + 1]; // end bin for i^th feature uint32_t gidx_end =
matrix.feature_segments[fidx + 1]; // end bin for i^th feature
// Sum histogram bins for current feature // Sum histogram bins for current feature
GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>( GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
@ -168,16 +293,17 @@ __device__ void EvaluateFeature(
// Best thread updates split // Best thread updates split
if (threadIdx.x == block_max.key) { if (threadIdx.x == block_max.key) {
int gidx = scan_begin + threadIdx.x; int split_gidx = (scan_begin + threadIdx.x) - 1;
float fvalue = float fvalue;
gidx == gidx_begin ? min_fvalue : gidx_fvalue_map[gidx - 1]; if (split_gidx < static_cast<int>(gidx_begin)) {
fvalue = matrix.min_fvalue[fidx];
} else {
fvalue = matrix.gidx_fvalue_map[split_gidx];
}
GradientSumT left = missing_left ? bin + missing : bin; GradientSumT left = missing_left ? bin + missing : bin;
GradientSumT right = parent_sum - left; GradientSumT right = parent_sum - left;
best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue,
fvalue, fidx, fidx, GradientPair(left), GradientPair(right), param);
GradientPair(left),
GradientPair(right),
param);
} }
__syncthreads(); __syncthreads();
} }
@ -189,10 +315,7 @@ __global__ void EvaluateSplitKernel(
node_histogram, // histogram for gradients node_histogram, // histogram for gradients
common::Span<const int> feature_set, // Selected features common::Span<const int> feature_set, // Selected features
DeviceNodeStats node, DeviceNodeStats node,
common::Span<const uint32_t> ELLPackMatrix matrix,
d_feature_segments, // row_ptr form HistCutMatrix
common::Span<const float> d_fidx_min_map, // min_value
common::Span<const float> d_gidx_fvalue_map, // cut
GPUTrainingParam gpu_param, GPUTrainingParam gpu_param,
common::Span<DeviceSplitCandidate> split_candidates, // resulting split common::Span<DeviceSplitCandidate> split_candidates, // resulting split
ValueConstraint value_constraint, ValueConstraint value_constraint,
@ -226,10 +349,8 @@ __global__ void EvaluateSplitKernel(
int fidx = feature_set[blockIdx.x]; int fidx = feature_set[blockIdx.x];
int constraint = d_monotonic_constraints[fidx]; int constraint = d_monotonic_constraints[fidx];
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>( EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>(
fidx, node_histogram, fidx, node_histogram, matrix, &best_split, node, gpu_param, &temp_storage,
d_feature_segments, d_fidx_min_map[fidx], d_gidx_fvalue_map, constraint, value_constraint);
&best_split, node, gpu_param, &temp_storage, constraint,
value_constraint);
__syncthreads(); __syncthreads();
@ -239,32 +360,6 @@ __global__ void EvaluateSplitKernel(
} }
} }
// Find a gidx value for a given feature otherwise return -1 if not found
template <typename GidxIterT>
__device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data,
int const fidx_begin, int const fidx_end) {
bst_uint previous_middle = UINT32_MAX;
while (end != begin) {
auto middle = begin + (end - begin) / 2;
if (middle == previous_middle) {
break;
}
previous_middle = middle;
auto gidx = data[middle];
if (gidx >= fidx_begin && gidx < fidx_end) {
return gidx;
} else if (gidx < fidx_begin) {
begin = middle;
} else {
end = middle;
}
}
// Value is missing
return -1;
}
/** /**
* \struct DeviceHistogram * \struct DeviceHistogram
* *
@ -290,7 +385,6 @@ class DeviceHistogram {
} }
void Reset() { void Reset() {
dh::safe_cuda(cudaSetDevice(device_id_));
dh::safe_cuda(cudaMemsetAsync( dh::safe_cuda(cudaMemsetAsync(
data_.data().get(), 0, data_.data().get(), 0,
data_.size() * sizeof(typename decltype(data_)::value_type))); data_.size() * sizeof(typename decltype(data_)::value_type)));
@ -397,27 +491,27 @@ __global__ void CompressBinEllpackKernel(
} }
template <typename GradientSumT> template <typename GradientSumT>
__global__ void SharedMemHistKernel(size_t row_stride, const bst_uint* d_ridx, __global__ void SharedMemHistKernel(ELLPackMatrix matrix, const bst_uint* d_ridx,
common::CompressedIterator<uint32_t> d_gidx,
int null_gidx_value,
GradientSumT* d_node_hist, GradientSumT* d_node_hist,
const GradientPair* d_gpair, const GradientPair* d_gpair,
size_t segment_begin, size_t n_elements) { size_t segment_begin, size_t n_elements) {
extern __shared__ char smem[]; extern __shared__ char smem[];
GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem); // NOLINT
for (auto i : dh::BlockStrideRange(0, null_gidx_value)) { for (auto i :
dh::BlockStrideRange(static_cast<size_t>(0), matrix.BinCount())) {
smem_arr[i] = GradientSumT(); smem_arr[i] = GradientSumT();
} }
__syncthreads(); __syncthreads();
for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) { for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
int ridx = d_ridx[idx / row_stride + segment_begin]; int ridx = d_ridx[idx / matrix.row_stride + segment_begin];
int gidx = d_gidx[ridx * row_stride + idx % row_stride]; int gidx = matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
if (gidx != null_gidx_value) { if (gidx != matrix.null_gidx_value) {
AtomicAddGpair(smem_arr + gidx, d_gpair[ridx]); AtomicAddGpair(smem_arr + gidx, d_gpair[ridx]);
} }
} }
__syncthreads(); __syncthreads();
for (auto i : dh::BlockStrideRange(0, null_gidx_value)) { for (auto i :
dh::BlockStrideRange(static_cast<size_t>(0), matrix.BinCount())) {
AtomicAddGpair(d_node_hist + i, smem_arr[i]); AtomicAddGpair(d_node_hist + i, smem_arr[i]);
} }
} }
@ -509,32 +603,26 @@ struct DeviceShard {
dh::BulkAllocator<dh::MemoryType::kDevice> ba; dh::BulkAllocator<dh::MemoryType::kDevice> ba;
/*! \brief HistCutMatrix stored in device. */ ELLPackMatrix ellpack_matrix;
struct DeviceHistCutMatrix {
/*! \brief row_ptr form HistCutMatrix. */
dh::DVec<uint32_t> feature_segments;
/*! \brief minimum value for each feature. */
dh::DVec<bst_float> min_fvalue;
/*! \brief Cut. */
dh::DVec<bst_float> gidx_fvalue_map;
} d_cut;
/*! \brief Range of rows for each node. */ /*! \brief Range of rows for each node. */
std::vector<Segment> ridx_segments; std::vector<Segment> ridx_segments;
DeviceHistogram<GradientSumT> hist; DeviceHistogram<GradientSumT> hist;
/*! \brief row length for ELLPack. */ /*! \brief row_ptr form HistCutMatrix. */
size_t row_stride; dh::DVec<uint32_t> feature_segments;
common::CompressedIterator<uint32_t> gidx; /*! \brief minimum value for each feature. */
dh::DVec<bst_float> min_fvalue;
/*! \brief Cut. */
dh::DVec<bst_float> gidx_fvalue_map;
/*! \brief global index of histogram, which is stored in ELLPack format. */
dh::DVec<common::CompressedByteT> gidx_buffer;
/*! \brief Row indices relative to this shard, necessary for sorting rows. */ /*! \brief Row indices relative to this shard, necessary for sorting rows. */
dh::DVec2<bst_uint> ridx; dh::DVec2<bst_uint> ridx;
/*! \brief Gradient pair for each row. */ /*! \brief Gradient pair for each row. */
dh::DVec<GradientPair> gpair; dh::DVec<GradientPair> gpair;
/*! \brief The last histogram index. */
int null_gidx_value;
dh::DVec2<int> position; dh::DVec2<int> position;
dh::DVec<int> monotone_constraints; dh::DVec<int> monotone_constraints;
@ -543,8 +631,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */ /*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients; std::vector<GradientPair> node_sum_gradients;
dh::DVec<GradientPair> node_sum_gradients_d; dh::DVec<GradientPair> node_sum_gradients_d;
/*! \brief global index of histogram, which is stored in ELLPack format. */
dh::DVec<common::CompressedByteT> gidx_buffer;
/*! \brief row offset in SparsePage (the input data). */ /*! \brief row offset in SparsePage (the input data). */
thrust::device_vector<size_t> row_ptrs; thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */ /*! \brief On-device feature set, only actually used on one of the devices */
@ -572,16 +658,13 @@ struct DeviceShard {
: device_id(_device_id), : device_id(_device_id),
row_begin_idx(row_begin), row_begin_idx(row_begin),
row_end_idx(row_end), row_end_idx(row_end),
row_stride(0),
n_rows(row_end - row_begin), n_rows(row_end - row_begin),
n_bins{0}, n_bins(0),
null_gidx_value(0),
param(std::move(_param)), param(std::move(_param)),
prediction_cache_initialised(false) {} prediction_cache_initialised(false) {}
/* Init row_ptrs and row_stride */ /* Init row_ptrs and row_stride */
void InitRowPtrs(const SparsePage& row_batch) { size_t InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_id));
const auto& offset_vec = row_batch.offset.HostVector(); const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs.resize(n_rows + 1); row_ptrs.resize(n_rows + 1);
thrust::copy(offset_vec.data() + row_begin_idx, thrust::copy(offset_vec.data() + row_begin_idx,
@ -595,22 +678,17 @@ struct DeviceShard {
auto counting = thrust::make_counting_iterator(size_t(0)); auto counting = thrust::make_counting_iterator(size_t(0));
using TransformT = thrust::transform_iterator<decltype(get_size), using TransformT = thrust::transform_iterator<decltype(get_size),
decltype(counting), size_t>; decltype(counting), size_t>;
TransformT row_size_iter = TransformT(counting, get_size); TransformT row_size_iter = TransformT(counting, get_size);
row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0, size_t row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
thrust::maximum<size_t>()); thrust::maximum<size_t>());
return row_stride;
} }
/*
Init:
n_bins, null_gidx_value, gidx_buffer, row_ptrs, gidx, gidx_fvalue_map,
min_fvalue, feature_segments, node_sum_gradients, ridx_segments,
hist
*/
void InitCompressedData( void InitCompressedData(
const common::HistCutMatrix& hmat, const SparsePage& row_batch); const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense);
void CreateHistIndices(const SparsePage& row_batch); void CreateHistIndices(const SparsePage& row_batch, size_t row_stride, int null_gidx_value);
~DeviceShard() { ~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device_id));
@ -708,10 +786,9 @@ struct DeviceShard {
int constexpr kBlockThreads = 256; int constexpr kBlockThreads = 256;
EvaluateSplitKernel<kBlockThreads, GradientSumT> EvaluateSplitKernel<kBlockThreads, GradientSumT>
<<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>( <<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
hist.GetNodeHistogram(nidx), d_feature_set, node, hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
d_cut.feature_segments.GetSpan(), d_cut.min_fvalue.GetSpan(), gpu_param, d_split_candidates, value_constraints[nidx],
d_cut.gidx_fvalue_map.GetSpan(), gpu_param, d_split_candidates, monotone_constraints.GetSpan());
value_constraints[nidx], monotone_constraints.GetSpan());
// Reduce over features to find best feature // Reduce over features to find best feature
auto d_result = d_result_all.subspan(i, 1); auto d_result = d_result_all.subspan(i, 1);
@ -756,47 +833,35 @@ struct DeviceShard {
hist.HistogramExists(nidx_parent); hist.HistogramExists(nidx_parent);
} }
void UpdatePosition(int nidx, int left_nidx, int right_nidx, int fidx, void UpdatePosition(int nidx, RegTree::Node split_node) {
int64_t split_gidx, bool default_dir_left, bool is_dense, CHECK(!split_node.IsLeaf()) <<"Node must not be leaf";
int fidx_begin, // cut.row_ptr[fidx]
int fidx_end) { // cut.row_ptr[fidx + 1]
dh::safe_cuda(cudaSetDevice(device_id));
Segment segment = ridx_segments[nidx]; Segment segment = ridx_segments[nidx];
bst_uint* d_ridx = ridx.Current(); bst_uint* d_ridx = ridx.Current();
int* d_position = position.Current(); int* d_position = position.Current();
common::CompressedIterator<uint32_t> d_gidx = gidx;
size_t row_stride = this->row_stride;
if (left_counts.size() <= nidx) { if (left_counts.size() <= nidx) {
left_counts.resize((nidx * 2) + 1); left_counts.resize((nidx * 2) + 1);
} }
int64_t* d_left_count = left_counts.data().get() + nidx; int64_t* d_left_count = left_counts.data().get() + nidx;
auto d_matrix = this->ellpack_matrix;
// Launch 1 thread for each row // Launch 1 thread for each row
dh::LaunchN<1, 128>( dh::LaunchN<1, 128>(
device_id, segment.Size(), [=] __device__(bst_uint idx) { device_id, segment.Size(), [=] __device__(bst_uint idx) {
idx += segment.begin; idx += segment.begin;
bst_uint ridx = d_ridx[idx]; bst_uint ridx = d_ridx[idx];
auto row_begin = row_stride * ridx; bst_float element = d_matrix.GetElement(ridx, split_node.SplitIndex());
auto row_end = row_begin + row_stride; // Missing value
auto gidx = -1; int new_position = 0;
if (is_dense) { if (isnan(element)) {
// FIXME: Maybe just search the cuts again. new_position = split_node.DefaultChild();
gidx = d_gidx[row_begin + fidx];
} else { } else {
gidx = BinarySearchRow(row_begin, row_end, d_gidx, fidx_begin, if (element <= split_node.SplitCond()) {
fidx_end); new_position = split_node.LeftChild();
} else {
new_position = split_node.RightChild();
}
} }
CountLeft(d_left_count, new_position, split_node.LeftChild());
// belong to left node or right node. d_position[idx] = new_position;
int position;
if (gidx >= 0) {
// Feature is found
position = gidx <= split_gidx ? left_nidx : right_nidx;
} else {
// Feature is missing
position = default_dir_left ? left_nidx : right_nidx;
}
CountLeft(d_left_count, position, left_nidx);
d_position[idx] = position;
}); });
// Overlap device to host memory copy (left_count) with sort // Overlap device to host memory copy (left_count) with sort
@ -805,16 +870,16 @@ struct DeviceShard {
dh::safe_cuda(cudaMemcpyAsync(tmp_pinned.data(), d_left_count, sizeof(int64_t), dh::safe_cuda(cudaMemcpyAsync(tmp_pinned.data(), d_left_count, sizeof(int64_t),
cudaMemcpyDeviceToHost, streams[0])); cudaMemcpyDeviceToHost, streams[0]));
SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, SortPositionAndCopy(segment, split_node.LeftChild(), split_node.RightChild(), d_left_count,
streams[1]); streams[1]);
dh::safe_cuda(cudaStreamSynchronize(streams[0])); dh::safe_cuda(cudaStreamSynchronize(streams[0]));
int64_t left_count = tmp_pinned[0]; int64_t left_count = tmp_pinned[0];
CHECK_LE(left_count, segment.Size()); CHECK_LE(left_count, segment.Size());
CHECK_GE(left_count, 0); CHECK_GE(left_count, 0);
ridx_segments[left_nidx] = ridx_segments[split_node.LeftChild()] =
Segment(segment.begin, segment.begin + left_count); Segment(segment.begin, segment.begin + left_count);
ridx_segments[right_nidx] = ridx_segments[split_node.RightChild()] =
Segment(segment.begin + left_count, segment.end); Segment(segment.begin + left_count, segment.end);
} }
@ -840,6 +905,41 @@ struct DeviceShard {
}); });
} }
// After tree update is finished, update the position of all training
// instances to their final leaf. This information is used later to update the
// prediction cache
void FinalisePosition(RegTree* p_tree) {
const auto d_nodes =
temp_memory.GetSpan<RegTree::Node>(p_tree->GetNodes().size());
dh::safe_cuda(cudaMemcpy(d_nodes.data(), p_tree->GetNodes().data(),
d_nodes.size() * sizeof(RegTree::Node),
cudaMemcpyHostToDevice));
auto d_position = position.Current();
const auto d_ridx = ridx.Current();
auto d_matrix = this->ellpack_matrix;
dh::LaunchN(device_id, position.Size(), [=] __device__(size_t idx) {
auto position = d_position[idx];
auto node = d_nodes[position];
bst_uint ridx = d_ridx[idx];
while (!node.IsLeaf()) {
bst_float element = d_matrix.GetElement(ridx, node.SplitIndex());
// Missing value
if (isnan(element)) {
position = node.DefaultChild();
} else {
if (element <= node.SplitCond()) {
position = node.LeftChild();
} else {
position = node.RightChild();
}
}
node = d_nodes[position];
}
d_position[idx] = position;
});
}
void UpdatePredictionCache(bst_float* out_preds_d) { void UpdatePredictionCache(bst_float* out_preds_d) {
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device_id));
if (!prediction_cache_initialised) { if (!prediction_cache_initialised) {
@ -880,14 +980,12 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
auto segment = shard->ridx_segments[nidx]; auto segment = shard->ridx_segments[nidx];
auto segment_begin = segment.begin; auto segment_begin = segment.begin;
auto d_node_hist = shard->hist.GetNodeHistogram(nidx); auto d_node_hist = shard->hist.GetNodeHistogram(nidx);
auto d_gidx = shard->gidx;
auto d_ridx = shard->ridx.Current(); auto d_ridx = shard->ridx.Current();
auto d_gpair = shard->gpair.Data(); auto d_gpair = shard->gpair.Data();
int null_gidx_value = shard->null_gidx_value; auto n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
auto n_elements = segment.Size() * shard->row_stride;
const size_t smem_size = sizeof(GradientSumT) * shard->null_gidx_value; const size_t smem_size = sizeof(GradientSumT) * shard->ellpack_matrix.BinCount();
const int items_per_thread = 8; const int items_per_thread = 8;
const int block_threads = 256; const int block_threads = 256;
const int grid_size = const int grid_size =
@ -896,10 +994,9 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
if (grid_size <= 0) { if (grid_size <= 0) {
return; return;
} }
dh::safe_cuda(cudaSetDevice(shard->device_id)); SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>(
SharedMemHistKernel<<<grid_size, block_threads, smem_size>>> shard->ellpack_matrix, d_ridx, d_node_hist.data(), d_gpair,
(shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist.data(), d_gpair, segment_begin, n_elements);
segment_begin, n_elements);
} }
}; };
@ -908,20 +1005,18 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
void Build(DeviceShard<GradientSumT>* shard, int nidx) override { void Build(DeviceShard<GradientSumT>* shard, int nidx) override {
Segment segment = shard->ridx_segments[nidx]; Segment segment = shard->ridx_segments[nidx];
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
common::CompressedIterator<uint32_t> d_gidx = shard->gidx;
bst_uint* d_ridx = shard->ridx.Current(); bst_uint* d_ridx = shard->ridx.Current();
GradientPair* d_gpair = shard->gpair.Data(); GradientPair* d_gpair = shard->gpair.Data();
size_t const n_elements = segment.Size() * shard->row_stride; size_t const n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
size_t const row_stride = shard->row_stride; auto d_matrix = shard->ellpack_matrix;
int const null_gidx_value = shard->null_gidx_value;
dh::LaunchN(shard->device_id, n_elements, [=] __device__(size_t idx) { dh::LaunchN(shard->device_id, n_elements, [=] __device__(size_t idx) {
int ridx = d_ridx[(idx / row_stride) + segment.begin]; int ridx = d_ridx[(idx / d_matrix.row_stride) + segment.begin];
// lookup the index (bin) of histogram. // lookup the index (bin) of histogram.
int gidx = d_gidx[ridx * row_stride + idx % row_stride]; int gidx = d_matrix.gidx_iter[ridx * d_matrix.row_stride + idx % d_matrix.row_stride];
if (gidx != null_gidx_value) { if (gidx != d_matrix.null_gidx_value) {
AtomicAddGpair(d_node_hist + gidx, d_gpair[ridx]); AtomicAddGpair(d_node_hist + gidx, d_gpair[ridx]);
} }
}); });
@ -930,10 +1025,10 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
template <typename GradientSumT> template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::InitCompressedData( inline void DeviceShard<GradientSumT>::InitCompressedData(
const common::HistCutMatrix& hmat, const SparsePage& row_batch) { const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense) {
dh::safe_cuda(cudaSetDevice(device_id)); size_t row_stride = this->InitRowPtrs(row_batch);
n_bins = hmat.NumBins(); n_bins = hmat.row_ptr.back();
null_gidx_value = hmat.NumBins(); int null_gidx_value = hmat.row_ptr.back();
int max_nodes = int max_nodes =
param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth); param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
@ -944,13 +1039,13 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
&position, n_rows, &position, n_rows,
&prediction_cache, n_rows, &prediction_cache, n_rows,
&node_sum_gradients_d, max_nodes, &node_sum_gradients_d, max_nodes,
&d_cut.feature_segments, hmat.row_ptr.size(), &feature_segments, hmat.row_ptr.size(),
&d_cut.gidx_fvalue_map, hmat.cut.size(), &gidx_fvalue_map, hmat.cut.size(),
&d_cut.min_fvalue, hmat.min_val.size(), &min_fvalue, hmat.min_val.size(),
&monotone_constraints, param.monotone_constraints.size()); &monotone_constraints, param.monotone_constraints.size());
d_cut.gidx_fvalue_map = hmat.cut; gidx_fvalue_map = hmat.cut;
d_cut.min_fvalue = hmat.min_val; min_fvalue = hmat.min_val;
d_cut.feature_segments = hmat.row_ptr; feature_segments = hmat.row_ptr;
monotone_constraints = param.monotone_constraints; monotone_constraints = param.monotone_constraints;
node_sum_gradients.resize(max_nodes); node_sum_gradients.resize(max_nodes);
@ -970,15 +1065,18 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes); ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
gidx_buffer.Fill(0); gidx_buffer.Fill(0);
int nbits = common::detail::SymbolBits(num_symbols); this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
CreateHistIndices(row_batch); ellpack_matrix.Init(
feature_segments.GetSpan(), min_fvalue.GetSpan(),
gidx = common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols); gidx_fvalue_map.GetSpan(), row_stride,
common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols),
is_dense, null_gidx_value);
// check if we can use shared memory for building histograms // check if we can use shared memory for building histograms
// (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding) // (assuming atleast we need 2 CTAs per SM to maintain decent latency
auto histogram_size = sizeof(GradientSumT) * null_gidx_value; // hiding)
auto histogram_size = sizeof(GradientSumT) * hmat.row_ptr.back();
auto max_smem = dh::MaxSharedMemory(device_id); auto max_smem = dh::MaxSharedMemory(device_id);
if (histogram_size <= max_smem) { if (histogram_size <= max_smem) {
hist_builder.reset(new SharedMemHistBuilder<GradientSumT>); hist_builder.reset(new SharedMemHistBuilder<GradientSumT>);
@ -990,9 +1088,9 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
hist.Init(device_id, hmat.NumBins()); hist.Init(device_id, hmat.NumBins());
} }
template <typename GradientSumT> template <typename GradientSumT>
inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_batch) { inline void DeviceShard<GradientSumT>::CreateHistIndices(
const SparsePage& row_batch, size_t row_stride, int null_gidx_value) {
int num_symbols = n_bins + 1; int num_symbols = n_bins + 1;
// bin and compress entries in batches of rows // bin and compress entries in batches of rows
size_t gpu_batch_nrows = size_t gpu_batch_nrows =
@ -1026,7 +1124,7 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
gidx_buffer.Data(), gidx_buffer.Data(),
row_ptrs.data().get() + batch_row_begin, row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(), entries_d.data().get(),
d_cut.gidx_fvalue_map.Data(), d_cut.feature_segments.Data(), gidx_fvalue_map.Data(), feature_segments.Data(),
batch_row_begin, batch_nrows, batch_row_begin, batch_nrows,
row_ptrs[batch_row_begin], row_ptrs[batch_row_begin],
row_stride, null_gidx_value); row_stride, null_gidx_value);
@ -1039,12 +1137,9 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
entries_d.shrink_to_fit(); entries_d.shrink_to_fit();
} }
template <typename GradientSumT> template <typename GradientSumT>
class GPUHistMakerSpecialised{ class GPUHistMakerSpecialised{
public: public:
struct ExpandEntry;
GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {} GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
void Init( void Init(
const std::vector<std::pair<std::string, std::string>>& args) { const std::vector<std::pair<std::string, std::string>>& args) {
@ -1111,7 +1206,6 @@ class GPUHistMakerSpecialised{
shard = std::unique_ptr<DeviceShard<GradientSumT>>( shard = std::unique_ptr<DeviceShard<GradientSumT>>(
new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(i), start, new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(i), start,
start + size, param_)); start + size, param_));
shard->InitRowPtrs(batch);
}); });
// Find the cuts. // Find the cuts.
@ -1119,13 +1213,14 @@ class GPUHistMakerSpecialised{
common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows); common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
n_bins_ = hmat_.row_ptr.back(); n_bins_ = hmat_.row_ptr.back();
monitor_.StopCuda("Quantiles"); monitor_.StopCuda("Quantiles");
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
monitor_.StartCuda("BinningCompression"); monitor_.StartCuda("BinningCompression");
dh::ExecuteIndexShards( dh::ExecuteIndexShards(
&shards_, &shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(shard->device_id)); dh::safe_cuda(cudaSetDevice(shard->device_id));
shard->InitCompressedData(hmat_, batch); shard->InitCompressedData(hmat_, batch, is_dense);
}); });
monitor_.StopCuda("BinningCompression"); monitor_.StopCuda("BinningCompression");
++batch_iter; ++batch_iter;
@ -1300,32 +1395,19 @@ class GPUHistMakerSpecialised{
} }
void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) { void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) {
int nidx = candidate.nid;
int left_nidx = (*p_tree)[nidx].LeftChild();
int right_nidx = (*p_tree)[nidx].RightChild();
// convert floating-point split_pt into corresponding bin_id
// split_cond = -1 indicates that split_pt is less than all known cut points
int64_t split_gidx = -1;
int64_t fidx = candidate.split.findex;
bool default_dir_left = candidate.split.dir == kLeftDir;
uint32_t fidx_begin = hmat_.row_ptr[fidx];
uint32_t fidx_end = hmat_.row_ptr[fidx + 1];
// split_gidx = i where i is the i^th bin containing split value.
for (auto i = fidx_begin; i < fidx_end; ++i) {
if (candidate.split.fvalue == hmat_.cut[i]) {
split_gidx = static_cast<int64_t>(i);
}
}
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
dh::ExecuteIndexShards( dh::ExecuteIndexShards(
&shards_, &shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(shard->device_id)); dh::safe_cuda(cudaSetDevice(shard->device_id));
shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx, split_gidx, shard->UpdatePosition(candidate.nid,
default_dir_left, is_dense, fidx_begin, p_tree->GetNodes()[candidate.nid]);
fidx_end); });
}
void FinalisePosition(RegTree* p_tree) {
dh::ExecuteIndexShards(
&shards_,
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
shard->FinalisePosition(p_tree);
}); });
} }
@ -1380,20 +1462,22 @@ class GPUHistMakerSpecialised{
while (!qexpand_->empty()) { while (!qexpand_->empty()) {
ExpandEntry candidate = qexpand_->top(); ExpandEntry candidate = qexpand_->top();
qexpand_->pop(); qexpand_->pop();
if (!candidate.IsValid(param_, num_leaves)) continue; if (!candidate.IsValid(param_, num_leaves)) {
continue;
}
this->ApplySplit(candidate, p_tree); this->ApplySplit(candidate, p_tree);
monitor_.StartCuda("UpdatePosition");
this->UpdatePosition(candidate, p_tree);
monitor_.StopCuda("UpdatePosition");
num_leaves++; num_leaves++;
int left_child_nidx = tree[candidate.nid].LeftChild(); int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild(); int right_child_nidx = tree[candidate.nid].RightChild();
// Only create child entries if needed // Only create child entries if needed
if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx), if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx),
num_leaves)) { num_leaves)) {
monitor_.StartCuda("UpdatePosition");
this->UpdatePosition(candidate, p_tree);
monitor_.StopCuda("UpdatePosition");
monitor_.StartCuda("BuildHist"); monitor_.StartCuda("BuildHist");
this->BuildHistLeftRight(candidate.nid, left_child_nidx, this->BuildHistLeftRight(candidate.nid, left_child_nidx,
right_child_nidx); right_child_nidx);
@ -1407,10 +1491,14 @@ class GPUHistMakerSpecialised{
timestamp++)); timestamp++));
qexpand_->push(ExpandEntry(right_child_nidx, qexpand_->push(ExpandEntry(right_child_nidx,
tree.GetDepth(right_child_nidx), tree.GetDepth(right_child_nidx),
splits.at(1), timestamp++)); splits.at(1), timestamp++));
monitor_.StopCuda("EvaluateSplits"); monitor_.StopCuda("EvaluateSplits");
} }
} }
monitor_.StartCuda("FinalisePosition");
this->FinalisePosition(p_tree);
monitor_.StopCuda("FinalisePosition");
} }
bool UpdatePredictionCache( bool UpdatePredictionCache(
@ -1431,64 +1519,6 @@ class GPUHistMakerSpecialised{
return true; return true;
} }
struct ExpandEntry {
int nid;
int depth;
DeviceSplitCandidate split;
uint64_t timestamp;
ExpandEntry(int _nid, int _depth, const DeviceSplitCandidate _split,
uint64_t _timestamp) :
nid{_nid}, depth{_depth}, split(std::move(_split)),
timestamp{_timestamp} {}
bool IsValid(const TrainParam& param, int num_leaves) const {
if (split.loss_chg <= kRtEps) {
return false;
}
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
return false;
}
if (param.max_depth > 0 && depth == param.max_depth) {
return false;
}
if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
return false;
}
return true;
}
static bool ChildIsValid(const TrainParam& param, int depth,
int num_leaves) {
if (param.max_depth > 0 && depth >= param.max_depth) return false;
if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
return true;
}
friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
os << "ExpandEntry: \n";
os << "nidx: " << e.nid << "\n";
os << "depth: " << e.depth << "\n";
os << "loss: " << e.split.loss_chg << "\n";
os << "left_sum: " << e.split.left_sum << "\n";
os << "right_sum: " << e.split.right_sum << "\n";
return os;
}
};
inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.depth == rhs.depth) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.depth > rhs.depth; // favor small depth
}
}
inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
if (lhs.split.loss_chg == rhs.split.loss_chg) {
return lhs.timestamp > rhs.timestamp; // favor small timestamp
} else {
return lhs.split.loss_chg < rhs.split.loss_chg; // favor large loss_chg
}
}
TrainParam param_; // NOLINT TrainParam param_; // NOLINT
common::HistCutMatrix hmat_; // NOLINT common::HistCutMatrix hmat_; // NOLINT
MetaInfo* info_; // NOLINT MetaInfo* info_; // NOLINT
@ -1507,8 +1537,9 @@ class GPUHistMakerSpecialised{
GPUHistMakerTrainParam hist_maker_param_; GPUHistMakerTrainParam hist_maker_param_;
common::GHistIndexMatrix gmat_; common::GHistIndexMatrix gmat_;
using ExpandQueue = std::priority_queue<ExpandEntry, std::vector<ExpandEntry>, using ExpandQueue =
std::function<bool(ExpandEntry, ExpandEntry)>>; std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
std::function<bool(ExpandEntry, ExpandEntry)>>;
std::unique_ptr<ExpandQueue> qexpand_; std::unique_ptr<ExpandQueue> qexpand_;
dh::AllReducer reducer_; dh::AllReducer reducer_;

tests/cpp/helpers.h

@ -26,7 +26,7 @@
bool FileExists(const std::string& filename); bool FileExists(const std::string& filename);
long GetFileSize(const std::string& filename); int64_t GetFileSize(const std::string& filename);
void CreateSimpleTestData(const std::string& filename); void CreateSimpleTestData(const std::string& filename);

tests/cpp/tree/test_gpu_hist.cu

@ -39,8 +39,9 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
0.26f, 0.74f, 1.98f, 0.26f, 0.74f, 1.98f,
0.26f, 0.71f, 1.83f}; 0.26f, 0.71f, 1.83f};
shard->InitRowPtrs(batch); auto is_dense = (*dmat)->Info().num_nonzero_ ==
shard->InitCompressedData(cmat, batch); (*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
shard->InitCompressedData(cmat, batch, is_dense);
delete dmat; delete dmat;
} }
@ -59,7 +60,7 @@ TEST(GpuHist, BuildGidxDense) {
h_gidx_buffer = shard.gidx_buffer.AsVector(); h_gidx_buffer = shard.gidx_buffer.AsVector();
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25); common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_EQ(shard.row_stride, kNCols); ASSERT_EQ(shard.ellpack_matrix.row_stride, kNCols);
std::vector<uint32_t> solution = { std::vector<uint32_t> solution = {
0, 3, 8, 9, 14, 17, 20, 21, 0, 3, 8, 9, 14, 17, 20, 21,
@ -98,7 +99,7 @@ TEST(GpuHist, BuildGidxSparse) {
h_gidx_buffer = shard.gidx_buffer.AsVector(); h_gidx_buffer = shard.gidx_buffer.AsVector();
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25); common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_LE(shard.row_stride, 3); ASSERT_LE(shard.ellpack_matrix.row_stride, 3);
// row_stride = 3, 16 rows, 48 entries for ELLPack // row_stride = 3, 16 rows, 48 entries for ELLPack
std::vector<uint32_t> solution = { std::vector<uint32_t> solution = {
@ -106,7 +107,7 @@ TEST(GpuHist, BuildGidxSparse) {
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
}; };
for (size_t i = 0; i < kNRows * shard.row_stride; ++i) { for (size_t i = 0; i < kNRows * shard.ellpack_matrix.row_stride; ++i) {
ASSERT_EQ(solution[i], gidx[i]); ASSERT_EQ(solution[i], gidx[i]);
} }
} }
@ -256,16 +257,19 @@ TEST(GpuHist, EvaluateSplits) {
common::HistCutMatrix cmat = GetHostCutMatrix(); common::HistCutMatrix cmat = GetHostCutMatrix();
// Copy cut matrix to device. // Copy cut matrix to device.
DeviceShard<GradientPairPrecise>::DeviceHistCutMatrix cut;
shard->ba.Allocate(0, shard->ba.Allocate(0,
&(shard->d_cut.feature_segments), cmat.row_ptr.size(), &(shard->feature_segments), cmat.row_ptr.size(),
&(shard->d_cut.min_fvalue), cmat.min_val.size(), &(shard->min_fvalue), cmat.min_val.size(),
&(shard->d_cut.gidx_fvalue_map), 24, &(shard->gidx_fvalue_map), 24,
&(shard->monotone_constraints), kNCols); &(shard->monotone_constraints), kNCols);
shard->d_cut.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
shard->d_cut.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
shard->monotone_constraints.copy(param.monotone_constraints.begin(), shard->monotone_constraints.copy(param.monotone_constraints.begin(),
param.monotone_constraints.end()); param.monotone_constraints.end());
shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan();
shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan();
shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end());
shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan();
// Initialize DeviceShard::hist // Initialize DeviceShard::hist
shard->hist.Init(0, (max_bins - 1) * kNCols); shard->hist.Init(0, (max_bins - 1) * kNCols);
@ -339,7 +343,7 @@ TEST(GpuHist, ApplySplit) {
shard->ridx_segments[0] = Segment(0, kNRows); shard->ridx_segments[0] = Segment(0, kNRows);
shard->ba.Allocate(0, &(shard->ridx), kNRows, shard->ba.Allocate(0, &(shard->ridx), kNRows,
&(shard->position), kNRows); &(shard->position), kNRows);
shard->row_stride = kNCols; shard->ellpack_matrix.row_stride = kNCols;
thrust::sequence(shard->ridx.CurrentDVec().tbegin(), thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
shard->ridx.CurrentDVec().tend()); shard->ridx.CurrentDVec().tend());
// Initialize GPUHistMaker // Initialize GPUHistMaker
@ -351,11 +355,9 @@ TEST(GpuHist, ApplySplit) {
0.59, 4, // fvalue has to be equal to one of the cut field 0.59, 4, // fvalue has to be equal to one of the cut field
GradientPair(8.2, 2.8), GradientPair(6.3, 3.6), GradientPair(8.2, 2.8), GradientPair(6.3, 3.6),
GPUTrainingParam(param)); GPUTrainingParam(param));
GPUHistMakerSpecialised<GradientPairPrecise>::ExpandEntry candidate_entry {0, 0, candidate, 0}; ExpandEntry candidate_entry {0, 0, candidate, 0};
candidate_entry.nid = kNId; candidate_entry.nid = kNId;
auto const& nodes = tree.GetNodes();
// Used to get bin_id in update position. // Used to get bin_id in update position.
common::HistCutMatrix cmat = GetHostCutMatrix(); common::HistCutMatrix cmat = GetHostCutMatrix();
hist_maker.hmat_ = cmat; hist_maker.hmat_ = cmat;
@ -370,19 +372,31 @@ TEST(GpuHist, ApplySplit) {
int row_stride = kNCols; int row_stride = kNCols;
int num_symbols = n_bins + 1; int num_symbols = n_bins + 1;
size_t compressed_size_bytes = size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize( common::CompressedBufferWriter::CalculateBufferSize(row_stride * kNRows,
row_stride * kNRows, num_symbols); num_symbols);
shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes); shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes,
&(shard->feature_segments), cmat.row_ptr.size(),
&(shard->min_fvalue), cmat.min_val.size(),
&(shard->gidx_fvalue_map), 24);
shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan();
shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan();
shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end());
shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan();
shard->ellpack_matrix.is_dense = true;
common::CompressedBufferWriter wr(num_symbols); common::CompressedBufferWriter wr(num_symbols);
std::vector<int> h_gidx (kNRows * row_stride); // gidx 14 should go right, 12 goes left
std::iota(h_gidx.begin(), h_gidx.end(), 0); std::vector<int> h_gidx (kNRows * row_stride, 14);
h_gidx[4] = 12;
h_gidx[12] = 12;
std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes); std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes);
wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end()); wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end());
shard->gidx_buffer.copy(h_gidx_compressed.begin(), h_gidx_compressed.end()); shard->gidx_buffer.copy(h_gidx_compressed.begin(), h_gidx_compressed.end());
shard->gidx = common::CompressedIterator<uint32_t>( shard->ellpack_matrix.gidx_iter = common::CompressedIterator<uint32_t>(
shard->gidx_buffer.Data(), num_symbols); shard->gidx_buffer.Data(), num_symbols);
hist_maker.info_ = &info; hist_maker.info_ = &info;
@ -395,8 +409,8 @@ TEST(GpuHist, ApplySplit) {
int right_nidx = tree[kNId].RightChild(); int right_nidx = tree[kNId].RightChild();
ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0); ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0);
ASSERT_EQ(shard->ridx_segments[left_nidx].end, 6); ASSERT_EQ(shard->ridx_segments[left_nidx].end, 2);
ASSERT_EQ(shard->ridx_segments[right_nidx].begin, 6); ASSERT_EQ(shard->ridx_segments[right_nidx].begin, 2);
ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16); ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16);
} }
@ -417,7 +431,7 @@ void TestSortPosition(const std::vector<int>& position_in, int left_idx,
common::Span<int>(position_out.data().get(), position_out.size()), common::Span<int>(position_out.data().get(), position_out.size()),
common::Span<bst_uint>(ridx.data().get(), ridx.size()), common::Span<bst_uint>(ridx.data().get(), ridx.size()),
common::Span<bst_uint>(ridx_out.data().get(), ridx_out.size()), left_idx, common::Span<bst_uint>(ridx_out.data().get(), ridx_out.size()), left_idx,
right_idx, d_left_count.data().get()); right_idx, d_left_count.data().get(), nullptr);
thrust::host_vector<int> position_result = position_out; thrust::host_vector<int> position_result = position_out;
thrust::host_vector<int> ridx_result = ridx_out; thrust::host_vector<int> ridx_result = ridx_out;