Further optimisations for gpu_hist. (#4283)

- Fuse final update position functions into a single more efficient kernel
- Refactor gpu_hist with a more explicit ellpack matrix representation
commit 6d5b34d824
parent 5aa42b5f11
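
The second bullet is the core of the refactor: the binned feature matrix is handled through an explicit ELLPACK view (the new ELLPackMatrix struct in the diff below), where every row occupies exactly row_stride compressed bin slots and a sparse row recovers a feature's value by binary-searching its slots for a bin inside that feature's segment. As orientation before the diff, here is a minimal host-side sketch of that lookup; it is an illustration under assumed field names mirroring the struct below, not the XGBoost device code.

// Hypothetical CPU sketch of the ELLPACK element lookup idea; the real
// device-side struct (ELLPackMatrix) appears in the diff below.
#include <cmath>
#include <cstdint>
#include <vector>

struct EllpackSketch {
  std::vector<uint32_t> feature_segments;  // bin range [seg[f], seg[f+1]) per feature
  std::vector<float> gidx_fvalue_map;      // cut value for each bin index
  std::vector<uint32_t> gidx;              // row-major, row_stride entries per row
  size_t row_stride{0};
  bool is_dense{false};

  // Returns the binned value of feature fidx in row ridx, or NaN if missing.
  float GetElement(size_t ridx, size_t fidx) const {
    size_t begin = ridx * row_stride;
    size_t end = begin + row_stride;
    if (is_dense) {
      // Dense rows store one bin per feature, in feature order.
      return gidx_fvalue_map[gidx[begin + fidx]];
    }
    // Sparse rows store bins in ascending order, so binary search for a bin
    // inside this feature's segment; finding none means the value is missing.
    uint32_t lo = feature_segments[fidx];
    uint32_t hi = feature_segments[fidx + 1];
    while (begin < end) {
      size_t mid = begin + (end - begin) / 2;
      uint32_t g = gidx[mid];
      if (g >= lo && g < hi) {
        return gidx_fvalue_map[g];
      }
      if (g < lo) {
        begin = mid + 1;
      } else {
        end = mid;
      }
    }
    return std::nanf("");
  }
};

Keeping the view non-owning and trivially copyable is what lets the same struct be passed by value straight into CUDA kernels.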
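
The first bullet refers to the new FinalisePosition path in the diff: after the tree is built, one kernel walks every training row from its current node down to its final leaf, following the default child whenever the feature value is missing, so the prediction cache can be updated in a single pass. A host-side equivalent of that walk, reusing EllpackSketch from the sketch above (NodeSketch is a hypothetical stand-in for RegTree::Node):

// Host-side sketch of the fused final position update (FinalisePosition in
// the diff); NodeSketch is a hypothetical stand-in for RegTree::Node.
#include <cmath>
#include <vector>

struct NodeSketch {
  int left{-1}, right{-1}, default_child{-1};
  unsigned split_index{0};
  float split_cond{0.0f};
  bool IsLeaf() const { return left == -1; }
};

// Walk row ridx from node pos down to its final leaf, taking the default
// branch whenever the feature value is missing (NaN).
int FinalPosition(const EllpackSketch& m, const std::vector<NodeSketch>& nodes,
                  size_t ridx, int pos) {
  while (!nodes[pos].IsLeaf()) {
    const NodeSketch& n = nodes[pos];
    float v = m.GetElement(ridx, n.split_index);
    if (std::isnan(v)) {
      pos = n.default_child;
    } else {
      pos = (v <= n.split_cond) ? n.left : n.right;
    }
  }
  return pos;  // index of the leaf this row ends up in
}
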
@@ -93,65 +93,63 @@ class RegTree {
                   "Node: 64 bit align");
   }
   /*! \brief index of left child */
-  int LeftChild() const {
+  XGBOOST_DEVICE int LeftChild() const {
     return this->cleft_;
   }
   /*! \brief index of right child */
-  int RightChild() const {
+  XGBOOST_DEVICE int RightChild() const {
     return this->cright_;
   }
   /*! \brief index of default child when feature is missing */
-  int DefaultChild() const {
+  XGBOOST_DEVICE int DefaultChild() const {
     return this->DefaultLeft() ? this->LeftChild() : this->RightChild();
   }
   /*! \brief feature index of split condition */
-  unsigned SplitIndex() const {
+  XGBOOST_DEVICE unsigned SplitIndex() const {
     return sindex_ & ((1U << 31) - 1U);
   }
   /*! \brief when feature is unknown, whether goes to left child */
-  bool DefaultLeft() const {
+  XGBOOST_DEVICE bool DefaultLeft() const {
     return (sindex_ >> 31) != 0;
   }
   /*! \brief whether current node is leaf node */
-  bool IsLeaf() const {
+  XGBOOST_DEVICE bool IsLeaf() const {
     return cleft_ == -1;
   }
   /*! \return get leaf value of leaf node */
-  bst_float LeafValue() const {
+  XGBOOST_DEVICE bst_float LeafValue() const {
     return (this->info_).leaf_value;
   }
   /*! \return get split condition of the node */
-  SplitCondT SplitCond() const {
+  XGBOOST_DEVICE SplitCondT SplitCond() const {
     return (this->info_).split_cond;
   }
   /*! \brief get parent of the node */
-  int Parent() const {
+  XGBOOST_DEVICE int Parent() const {
     return parent_ & ((1U << 31) - 1);
   }
   /*! \brief whether current node is left child */
-  bool IsLeftChild() const {
+  XGBOOST_DEVICE bool IsLeftChild() const {
     return (parent_ & (1U << 31)) != 0;
   }
   /*! \brief whether this node is deleted */
-  bool IsDeleted() const {
+  XGBOOST_DEVICE bool IsDeleted() const {
     return sindex_ == std::numeric_limits<unsigned>::max();
   }
   /*! \brief whether current node is root */
-  bool IsRoot() const {
-    return parent_ == -1;
-  }
+  XGBOOST_DEVICE bool IsRoot() const { return parent_ == -1; }
   /*!
    * \brief set the left child
    * \param nid node id to right child
    */
-  void SetLeftChild(int nid) {
+  XGBOOST_DEVICE void SetLeftChild(int nid) {
     this->cleft_ = nid;
   }
   /*!
    * \brief set the right child
    * \param nid node id to right child
    */
-  void SetRightChild(int nid) {
+  XGBOOST_DEVICE void SetRightChild(int nid) {
     this->cright_ = nid;
   }
   /*!
@@ -160,7 +158,7 @@ class RegTree {
    * \param split_cond split condition
    * \param default_left the default direction when feature is unknown
    */
-  void SetSplit(unsigned split_index, SplitCondT split_cond,
+  XGBOOST_DEVICE void SetSplit(unsigned split_index, SplitCondT split_cond,
                 bool default_left = false) {
     if (default_left) split_index |= (1U << 31);
     this->sindex_ = split_index;
@@ -172,17 +170,17 @@ class RegTree {
    * \param right right index, could be used to store
    *              additional information
    */
-  void SetLeaf(bst_float value, int right = -1) {
+  XGBOOST_DEVICE void SetLeaf(bst_float value, int right = -1) {
     (this->info_).leaf_value = value;
     this->cleft_ = -1;
     this->cright_ = right;
   }
   /*! \brief mark that this node is deleted */
-  void MarkDelete() {
+  XGBOOST_DEVICE void MarkDelete() {
     this->sindex_ = std::numeric_limits<unsigned>::max();
   }
   // set parent
-  void SetParent(int pidx, bool is_left_child = true) {
+  XGBOOST_DEVICE void SetParent(int pidx, bool is_left_child = true) {
     if (is_left_child) pidx |= (1U << 31);
     this->parent_ = pidx;
   }
@@ -303,6 +303,7 @@ class GPUPredictor : public xgboost::Predictor {
                              const gbm::GBTreeModel& model, size_t tree_begin,
                              size_t tree_end) {
     if (tree_end - tree_begin == 0) { return; }
+    monitor_.StartCuda("DevicePredictInternal");

     CHECK_EQ(model.param.size_leaf_vector, 0);
     // Copy decision trees to device
@@ -337,6 +338,7 @@ class GPUPredictor : public xgboost::Predictor {
       });
       i_batch++;
     }
+    monitor_.StopCuda("DevicePredictInternal");
   }

  public:
@@ -388,9 +390,11 @@ class GPUPredictor : public xgboost::Predictor {
     if (it != cache_.end()) {
       const HostDeviceVector<bst_float>& y = it->second.predictions;
       if (y.Size() != 0) {
+        monitor_.StartCuda("PredictFromCache");
         out_preds->Reshard(y.Distribution());
         out_preds->Resize(y.Size());
         out_preds->Copy(y);
+        monitor_.StopCuda("PredictFromCache");
         return true;
       }
     }
@@ -481,6 +485,7 @@ class GPUPredictor : public xgboost::Predictor {
   std::unique_ptr<Predictor> cpu_predictor_;
   std::vector<DeviceShard> shards_;
   GPUSet devices_;
+  common::Monitor monitor_;
 };

 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
@@ -50,6 +50,133 @@ struct GPUHistMakerTrainParam

 DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam);

+struct ExpandEntry {
+  int nid;
+  int depth;
+  DeviceSplitCandidate split;
+  uint64_t timestamp;
+  ExpandEntry() = default;
+  ExpandEntry(int nid, int depth, DeviceSplitCandidate split,
+              uint64_t timestamp)
+      : nid(nid), depth(depth), split(std::move(split)), timestamp(timestamp) {}
+  bool IsValid(const TrainParam& param, int num_leaves) const {
+    if (split.loss_chg <= kRtEps) return false;
+    if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
+      return false;
+    }
+    if (param.max_depth > 0 && depth == param.max_depth) return false;
+    if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false;
+    return true;
+  }
+
+  static bool ChildIsValid(const TrainParam& param, int depth, int num_leaves) {
+    if (param.max_depth > 0 && depth >= param.max_depth) return false;
+    if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
+    return true;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
+    os << "ExpandEntry: \n";
+    os << "nidx: " << e.nid << "\n";
+    os << "depth: " << e.depth << "\n";
+    os << "loss: " << e.split.loss_chg << "\n";
+    os << "left_sum: " << e.split.left_sum << "\n";
+    os << "right_sum: " << e.split.right_sum << "\n";
+    return os;
+  }
+};
+
+inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
+  if (lhs.depth == rhs.depth) {
+    return lhs.timestamp > rhs.timestamp;  // favor small timestamp
+  } else {
+    return lhs.depth > rhs.depth;  // favor small depth
+  }
+}
+inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
+  if (lhs.split.loss_chg == rhs.split.loss_chg) {
+    return lhs.timestamp > rhs.timestamp;  // favor small timestamp
+  } else {
+    return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
+  }
+}
+
+// Find a gidx value for a given feature otherwise return -1 if not found
+__device__ int BinarySearchRow(bst_uint begin, bst_uint end,
+                               common::CompressedIterator<uint32_t> data,
+                               int const fidx_begin, int const fidx_end) {
+  bst_uint previous_middle = UINT32_MAX;
+  while (end != begin) {
+    auto middle = begin + (end - begin) / 2;
+    if (middle == previous_middle) {
+      break;
+    }
+    previous_middle = middle;
+
+    auto gidx = data[middle];
+
+    if (gidx >= fidx_begin && gidx < fidx_end) {
+      return gidx;
+    } else if (gidx < fidx_begin) {
+      begin = middle;
+    } else {
+      end = middle;
+    }
+  }
+  // Value is missing
+  return -1;
+}
+
+/** \brief Struct for accessing and manipulating an ellpack matrix on the
+ * device. Does not own underlying memory and may be trivially copied into
+ * kernels.*/
+struct ELLPackMatrix {
+  common::Span<uint32_t> feature_segments;
+  /*! \brief minimum value for each feature. */
+  common::Span<bst_float> min_fvalue;
+  /*! \brief Cut. */
+  common::Span<bst_float> gidx_fvalue_map;
+  /*! \brief row length for ELLPack. */
+  size_t row_stride{0};
+  common::CompressedIterator<uint32_t> gidx_iter;
+  bool is_dense;
+  int null_gidx_value;
+
+  XGBOOST_DEVICE size_t BinCount() const { return gidx_fvalue_map.size(); }
+
+  // Get a matrix element, uses binary search for look up
+  // Return NaN if missing
+  __device__ bst_float GetElement(size_t ridx, size_t fidx) const {
+    auto row_begin = row_stride * ridx;
+    auto row_end = row_begin + row_stride;
+    auto gidx = -1;
+    if (is_dense) {
+      gidx = gidx_iter[row_begin + fidx];
+    } else {
+      gidx =
+          BinarySearchRow(row_begin, row_end, gidx_iter, feature_segments[fidx],
+                          feature_segments[fidx + 1]);
+    }
+    if (gidx == -1) {
+      return nan("");
+    }
+    return gidx_fvalue_map[gidx];
+  }
+  void Init(common::Span<uint32_t> feature_segments,
+            common::Span<bst_float> min_fvalue,
+            common::Span<bst_float> gidx_fvalue_map, size_t row_stride,
+            common::CompressedIterator<uint32_t> gidx_iter, bool is_dense,
+            int null_gidx_value) {
+    this->feature_segments = feature_segments;
+    this->min_fvalue = min_fvalue;
+    this->gidx_fvalue_map = gidx_fvalue_map;
+    this->row_stride = row_stride;
+    this->gidx_iter = gidx_iter;
+    this->is_dense = is_dense;
+    this->null_gidx_value = null_gidx_value;
+  }
+};
+
 // With constraints
 template <typename GradientPairT>
 XGBOOST_DEVICE float inline LossChangeMissing(
@@ -111,19 +238,17 @@ __device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_h
 template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
           typename MaxReduceT, typename TempStorageT, typename GradientSumT>
 __device__ void EvaluateFeature(
-    int fidx,
-    common::Span<const GradientSumT> node_histogram,
-    common::Span<const uint32_t> feature_segments,  // cut.row_ptr
-    float min_fvalue,                               // cut.min_value
-    common::Span<const float> gidx_fvalue_map,      // cut.cut
+    int fidx, common::Span<const GradientSumT> node_histogram,
+    const ELLPackMatrix& matrix,
     DeviceSplitCandidate* best_split,  // shared memory storing best split
     const DeviceNodeStats& node, const GPUTrainingParam& param,
     TempStorageT* temp_storage,  // temp memory for cub operations
     int constraint,              // monotonic_constraints
     const ValueConstraint& value_constraint) {
   // Use pointer from cut to indicate begin and end of bins for each feature.
-  uint32_t gidx_begin = feature_segments[fidx];  // begining bin
-  uint32_t gidx_end = feature_segments[fidx + 1];  // end bin for i^th feature
+  uint32_t gidx_begin = matrix.feature_segments[fidx];  // begining bin
+  uint32_t gidx_end =
+      matrix.feature_segments[fidx + 1];  // end bin for i^th feature

   // Sum histogram bins for current feature
   GradientSumT const feature_sum = ReduceFeature<BLOCK_THREADS, ReduceT>(
@@ -168,16 +293,17 @@ __device__ void EvaluateFeature(

     // Best thread updates split
     if (threadIdx.x == block_max.key) {
-      int gidx = scan_begin + threadIdx.x;
-      float fvalue =
-          gidx == gidx_begin ? min_fvalue : gidx_fvalue_map[gidx - 1];
+      int split_gidx = (scan_begin + threadIdx.x) - 1;
+      float fvalue;
+      if (split_gidx < static_cast<int>(gidx_begin)) {
+        fvalue = matrix.min_fvalue[fidx];
+      } else {
+        fvalue = matrix.gidx_fvalue_map[split_gidx];
+      }
       GradientSumT left = missing_left ? bin + missing : bin;
       GradientSumT right = parent_sum - left;
-      best_split->Update(gain, missing_left ? kLeftDir : kRightDir,
-                         fvalue, fidx,
-                         GradientPair(left),
-                         GradientPair(right),
-                         param);
+      best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue,
+                         fidx, GradientPair(left), GradientPair(right), param);
     }
     __syncthreads();
   }
@@ -189,10 +315,7 @@ __global__ void EvaluateSplitKernel(
         node_histogram,                   // histogram for gradients
     common::Span<const int> feature_set,  // Selected features
     DeviceNodeStats node,
-    common::Span<const uint32_t>
-        d_feature_segments,                        // row_ptr form HistCutMatrix
-    common::Span<const float> d_fidx_min_map,      // min_value
-    common::Span<const float> d_gidx_fvalue_map,   // cut
+    ELLPackMatrix matrix,
     GPUTrainingParam gpu_param,
     common::Span<DeviceSplitCandidate> split_candidates,  // resulting split
     ValueConstraint value_constraint,
@@ -226,10 +349,8 @@ __global__ void EvaluateSplitKernel(
   int fidx = feature_set[blockIdx.x];
   int constraint = d_monotonic_constraints[fidx];
   EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>(
-      fidx, node_histogram,
-      d_feature_segments, d_fidx_min_map[fidx], d_gidx_fvalue_map,
-      &best_split, node, gpu_param, &temp_storage, constraint,
-      value_constraint);
+      fidx, node_histogram, matrix, &best_split, node, gpu_param, &temp_storage,
+      constraint, value_constraint);

   __syncthreads();

@@ -239,32 +360,6 @@ __global__ void EvaluateSplitKernel(
   }
 }

-// Find a gidx value for a given feature otherwise return -1 if not found
-template <typename GidxIterT>
-__device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data,
-                               int const fidx_begin, int const fidx_end) {
-  bst_uint previous_middle = UINT32_MAX;
-  while (end != begin) {
-    auto middle = begin + (end - begin) / 2;
-    if (middle == previous_middle) {
-      break;
-    }
-    previous_middle = middle;
-
-    auto gidx = data[middle];
-
-    if (gidx >= fidx_begin && gidx < fidx_end) {
-      return gidx;
-    } else if (gidx < fidx_begin) {
-      begin = middle;
-    } else {
-      end = middle;
-    }
-  }
-  // Value is missing
-  return -1;
-}
-
 /**
  * \struct DeviceHistogram
  *
@@ -290,7 +385,6 @@ class DeviceHistogram {
   }

   void Reset() {
-    dh::safe_cuda(cudaSetDevice(device_id_));
     dh::safe_cuda(cudaMemsetAsync(
         data_.data().get(), 0,
         data_.size() * sizeof(typename decltype(data_)::value_type)));
@@ -397,27 +491,27 @@ __global__ void CompressBinEllpackKernel(
 }

 template <typename GradientSumT>
-__global__ void SharedMemHistKernel(size_t row_stride, const bst_uint* d_ridx,
-                                    common::CompressedIterator<uint32_t> d_gidx,
-                                    int null_gidx_value,
+__global__ void SharedMemHistKernel(ELLPackMatrix matrix, const bst_uint* d_ridx,
                                     GradientSumT* d_node_hist,
                                     const GradientPair* d_gpair,
                                     size_t segment_begin, size_t n_elements) {
   extern __shared__ char smem[];
   GradientSumT* smem_arr = reinterpret_cast<GradientSumT*>(smem);  // NOLINT
-  for (auto i : dh::BlockStrideRange(0, null_gidx_value)) {
+  for (auto i :
+       dh::BlockStrideRange(static_cast<size_t>(0), matrix.BinCount())) {
     smem_arr[i] = GradientSumT();
   }
   __syncthreads();
   for (auto idx : dh::GridStrideRange(static_cast<size_t>(0), n_elements)) {
-    int ridx = d_ridx[idx / row_stride + segment_begin];
-    int gidx = d_gidx[ridx * row_stride + idx % row_stride];
-    if (gidx != null_gidx_value) {
+    int ridx = d_ridx[idx / matrix.row_stride + segment_begin];
+    int gidx = matrix.gidx_iter[ridx * matrix.row_stride + idx % matrix.row_stride];
+    if (gidx != matrix.null_gidx_value) {
       AtomicAddGpair(smem_arr + gidx, d_gpair[ridx]);
     }
   }
   __syncthreads();
-  for (auto i : dh::BlockStrideRange(0, null_gidx_value)) {
+  for (auto i :
+       dh::BlockStrideRange(static_cast<size_t>(0), matrix.BinCount())) {
     AtomicAddGpair(d_node_hist + i, smem_arr[i]);
   }
 }
@@ -509,32 +603,26 @@ struct DeviceShard {

   dh::BulkAllocator<dh::MemoryType::kDevice> ba;

-  /*! \brief HistCutMatrix stored in device. */
-  struct DeviceHistCutMatrix {
+  ELLPackMatrix ellpack_matrix;
+  /*! \brief Range of rows for each node. */
+  std::vector<Segment> ridx_segments;
+  DeviceHistogram<GradientSumT> hist;
+
   /*! \brief row_ptr form HistCutMatrix. */
   dh::DVec<uint32_t> feature_segments;
   /*! \brief minimum value for each feature. */
   dh::DVec<bst_float> min_fvalue;
   /*! \brief Cut. */
   dh::DVec<bst_float> gidx_fvalue_map;
-  } d_cut;
-  /*! \brief Range of rows for each node. */
-  std::vector<Segment> ridx_segments;
-  DeviceHistogram<GradientSumT> hist;
-
-  /*! \brief row length for ELLPack. */
-  size_t row_stride;
-  common::CompressedIterator<uint32_t> gidx;
+  /*! \brief global index of histogram, which is stored in ELLPack format. */
+  dh::DVec<common::CompressedByteT> gidx_buffer;

   /*! \brief Row indices relative to this shard, necessary for sorting rows. */
   dh::DVec2<bst_uint> ridx;
   /*! \brief Gradient pair for each row. */
   dh::DVec<GradientPair> gpair;

-  /*! \brief The last histogram index. */
-  int null_gidx_value;
-
   dh::DVec2<int> position;

   dh::DVec<int> monotone_constraints;
@@ -543,8 +631,6 @@ struct DeviceShard {
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPair> node_sum_gradients;
   dh::DVec<GradientPair> node_sum_gradients_d;
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  dh::DVec<common::CompressedByteT> gidx_buffer;
   /*! \brief row offset in SparsePage (the input data). */
   thrust::device_vector<size_t> row_ptrs;
   /*! \brief On-device feature set, only actually used on one of the devices */
@@ -572,16 +658,13 @@ struct DeviceShard {
       : device_id(_device_id),
         row_begin_idx(row_begin),
         row_end_idx(row_end),
-        row_stride(0),
         n_rows(row_end - row_begin),
-        n_bins{0},
-        null_gidx_value(0),
+        n_bins(0),
         param(std::move(_param)),
         prediction_cache_initialised(false) {}

   /* Init row_ptrs and row_stride */
-  void InitRowPtrs(const SparsePage& row_batch) {
-    dh::safe_cuda(cudaSetDevice(device_id));
+  size_t InitRowPtrs(const SparsePage& row_batch) {
     const auto& offset_vec = row_batch.offset.HostVector();
     row_ptrs.resize(n_rows + 1);
     thrust::copy(offset_vec.data() + row_begin_idx,
@@ -597,20 +680,15 @@ struct DeviceShard {
     using TransformT = thrust::transform_iterator<decltype(get_size),
                                                   decltype(counting), size_t>;
     TransformT row_size_iter = TransformT(counting, get_size);
-    row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
-                                thrust::maximum<size_t>());
+    size_t row_stride = thrust::reduce(row_size_iter, row_size_iter + n_rows, 0,
+                                       thrust::maximum<size_t>());
+    return row_stride;
   }

-  /*
-    Init:
-    n_bins, null_gidx_value, gidx_buffer, row_ptrs, gidx, gidx_fvalue_map,
-    min_fvalue, feature_segments, node_sum_gradients, ridx_segments,
-    hist
-  */
   void InitCompressedData(
-      const common::HistCutMatrix& hmat, const SparsePage& row_batch);
+      const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense);

-  void CreateHistIndices(const SparsePage& row_batch);
+  void CreateHistIndices(const SparsePage& row_batch, size_t row_stride, int null_gidx_value);

   ~DeviceShard() {
     dh::safe_cuda(cudaSetDevice(device_id));
@@ -708,10 +786,9 @@ struct DeviceShard {
       int constexpr kBlockThreads = 256;
       EvaluateSplitKernel<kBlockThreads, GradientSumT>
           <<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
-              hist.GetNodeHistogram(nidx), d_feature_set, node,
-              d_cut.feature_segments.GetSpan(), d_cut.min_fvalue.GetSpan(),
-              d_cut.gidx_fvalue_map.GetSpan(), gpu_param, d_split_candidates,
-              value_constraints[nidx], monotone_constraints.GetSpan());
+              hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
+              gpu_param, d_split_candidates, value_constraints[nidx],
+              monotone_constraints.GetSpan());

       // Reduce over features to find best feature
       auto d_result = d_result_all.subspan(i, 1);
@@ -756,47 +833,35 @@ struct DeviceShard {
            hist.HistogramExists(nidx_parent);
   }

-  void UpdatePosition(int nidx, int left_nidx, int right_nidx, int fidx,
-                      int64_t split_gidx, bool default_dir_left, bool is_dense,
-                      int fidx_begin,  // cut.row_ptr[fidx]
-                      int fidx_end) {  // cut.row_ptr[fidx + 1]
-    dh::safe_cuda(cudaSetDevice(device_id));
+  void UpdatePosition(int nidx, RegTree::Node split_node) {
+    CHECK(!split_node.IsLeaf()) <<"Node must not be leaf";
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
-    common::CompressedIterator<uint32_t> d_gidx = gidx;
-    size_t row_stride = this->row_stride;
     if (left_counts.size() <= nidx) {
       left_counts.resize((nidx * 2) + 1);
     }
     int64_t* d_left_count = left_counts.data().get() + nidx;
+    auto d_matrix = this->ellpack_matrix;
     // Launch 1 thread for each row
     dh::LaunchN<1, 128>(
         device_id, segment.Size(), [=] __device__(bst_uint idx) {
           idx += segment.begin;
           bst_uint ridx = d_ridx[idx];
-          auto row_begin = row_stride * ridx;
-          auto row_end = row_begin + row_stride;
-          auto gidx = -1;
-          if (is_dense) {
-            // FIXME: Maybe just search the cuts again.
-            gidx = d_gidx[row_begin + fidx];
-          } else {
-            gidx = BinarySearchRow(row_begin, row_end, d_gidx, fidx_begin,
-                                   fidx_end);
-          }
-
-          // belong to left node or right node.
-          int position;
-          if (gidx >= 0) {
-            // Feature is found
-            position = gidx <= split_gidx ? left_nidx : right_nidx;
-          } else {
-            // Feature is missing
-            position = default_dir_left ? left_nidx : right_nidx;
-          }
-          CountLeft(d_left_count, position, left_nidx);
-          d_position[idx] = position;
+          bst_float element = d_matrix.GetElement(ridx, split_node.SplitIndex());
+          // Missing value
+          int new_position = 0;
+          if (isnan(element)) {
+            new_position = split_node.DefaultChild();
+          } else {
+            if (element <= split_node.SplitCond()) {
+              new_position = split_node.LeftChild();
+            } else {
+              new_position = split_node.RightChild();
+            }
+          }
+          CountLeft(d_left_count, new_position, split_node.LeftChild());
+          d_position[idx] = new_position;
         });

     // Overlap device to host memory copy (left_count) with sort
@@ -805,16 +870,16 @@ struct DeviceShard {
     dh::safe_cuda(cudaMemcpyAsync(tmp_pinned.data(), d_left_count, sizeof(int64_t),
                                   cudaMemcpyDeviceToHost, streams[0]));

-    SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count,
+    SortPositionAndCopy(segment, split_node.LeftChild(), split_node.RightChild(), d_left_count,
                         streams[1]);

     dh::safe_cuda(cudaStreamSynchronize(streams[0]));
     int64_t left_count = tmp_pinned[0];
     CHECK_LE(left_count, segment.Size());
     CHECK_GE(left_count, 0);
-    ridx_segments[left_nidx] =
+    ridx_segments[split_node.LeftChild()] =
         Segment(segment.begin, segment.begin + left_count);
-    ridx_segments[right_nidx] =
+    ridx_segments[split_node.RightChild()] =
         Segment(segment.begin + left_count, segment.end);
   }

@@ -840,6 +905,41 @@ struct DeviceShard {
         });
   }

+  // After tree update is finished, update the position of all training
+  // instances to their final leaf This information is used later to update the
+  // prediction cache
+  void FinalisePosition(RegTree* p_tree) {
+    const auto d_nodes =
+        temp_memory.GetSpan<RegTree::Node>(p_tree->GetNodes().size());
+    dh::safe_cuda(cudaMemcpy(d_nodes.data(), p_tree->GetNodes().data(),
+                             d_nodes.size() * sizeof(RegTree::Node),
+                             cudaMemcpyHostToDevice));
+    auto d_position = position.Current();
+    const auto d_ridx = ridx.Current();
+    auto d_matrix = this->ellpack_matrix;
+    dh::LaunchN(device_id, position.Size(), [=] __device__(size_t idx) {
+      auto position = d_position[idx];
+      auto node = d_nodes[position];
+      bst_uint ridx = d_ridx[idx];
+
+      while (!node.IsLeaf()) {
+        bst_float element = d_matrix.GetElement(ridx, node.SplitIndex());
+        // Missing value
+        if (isnan(element)) {
+          position = node.DefaultChild();
+        } else {
+          if (element <= node.SplitCond()) {
+            position = node.LeftChild();
+          } else {
+            position = node.RightChild();
+          }
+        }
+        node = d_nodes[position];
+      }
+      d_position[idx] = position;
+    });
+  }
+
   void UpdatePredictionCache(bst_float* out_preds_d) {
     dh::safe_cuda(cudaSetDevice(device_id));
     if (!prediction_cache_initialised) {
@@ -880,14 +980,12 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     auto segment = shard->ridx_segments[nidx];
     auto segment_begin = segment.begin;
     auto d_node_hist = shard->hist.GetNodeHistogram(nidx);
-    auto d_gidx = shard->gidx;
     auto d_ridx = shard->ridx.Current();
     auto d_gpair = shard->gpair.Data();

-    int null_gidx_value = shard->null_gidx_value;
-    auto n_elements = segment.Size() * shard->row_stride;
+    auto n_elements = segment.Size() * shard->ellpack_matrix.row_stride;

-    const size_t smem_size = sizeof(GradientSumT) * shard->null_gidx_value;
+    const size_t smem_size = sizeof(GradientSumT) * shard->ellpack_matrix.BinCount();
     const int items_per_thread = 8;
     const int block_threads = 256;
     const int grid_size =
@@ -896,9 +994,8 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     if (grid_size <= 0) {
       return;
     }
-    dh::safe_cuda(cudaSetDevice(shard->device_id));
-    SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>
-        (shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist.data(), d_gpair,
+    SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>(
+        shard->ellpack_matrix, d_ridx, d_node_hist.data(), d_gpair,
         segment_begin, n_elements);
   }
 };
@@ -908,20 +1005,18 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
   void Build(DeviceShard<GradientSumT>* shard, int nidx) override {
     Segment segment = shard->ridx_segments[nidx];
     auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
-    common::CompressedIterator<uint32_t> d_gidx = shard->gidx;
     bst_uint* d_ridx = shard->ridx.Current();
     GradientPair* d_gpair = shard->gpair.Data();

-    size_t const n_elements = segment.Size() * shard->row_stride;
-    size_t const row_stride = shard->row_stride;
-    int const null_gidx_value = shard->null_gidx_value;
+    size_t const n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
+    auto d_matrix = shard->ellpack_matrix;

     dh::LaunchN(shard->device_id, n_elements, [=] __device__(size_t idx) {
-      int ridx = d_ridx[(idx / row_stride) + segment.begin];
+      int ridx = d_ridx[(idx / d_matrix.row_stride) + segment.begin];
       // lookup the index (bin) of histogram.
-      int gidx = d_gidx[ridx * row_stride + idx % row_stride];
+      int gidx = d_matrix.gidx_iter[ridx * d_matrix.row_stride + idx % d_matrix.row_stride];

-      if (gidx != null_gidx_value) {
+      if (gidx != d_matrix.null_gidx_value) {
         AtomicAddGpair(d_node_hist + gidx, d_gpair[ridx]);
       }
     });
@@ -930,10 +1025,10 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {

 template <typename GradientSumT>
 inline void DeviceShard<GradientSumT>::InitCompressedData(
-    const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
-  dh::safe_cuda(cudaSetDevice(device_id));
-  n_bins = hmat.NumBins();
-  null_gidx_value = hmat.NumBins();
+    const common::HistCutMatrix& hmat, const SparsePage& row_batch, bool is_dense) {
+  size_t row_stride = this->InitRowPtrs(row_batch);
+  n_bins = hmat.row_ptr.back();
+  int null_gidx_value = hmat.row_ptr.back();

   int max_nodes =
       param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
@@ -944,13 +1039,13 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
               &position, n_rows,
               &prediction_cache, n_rows,
               &node_sum_gradients_d, max_nodes,
-              &d_cut.feature_segments, hmat.row_ptr.size(),
-              &d_cut.gidx_fvalue_map, hmat.cut.size(),
-              &d_cut.min_fvalue, hmat.min_val.size(),
+              &feature_segments, hmat.row_ptr.size(),
+              &gidx_fvalue_map, hmat.cut.size(),
+              &min_fvalue, hmat.min_val.size(),
               &monotone_constraints, param.monotone_constraints.size());
-  d_cut.gidx_fvalue_map = hmat.cut;
-  d_cut.min_fvalue = hmat.min_val;
-  d_cut.feature_segments = hmat.row_ptr;
+  gidx_fvalue_map = hmat.cut;
+  min_fvalue = hmat.min_val;
+  feature_segments = hmat.row_ptr;
   monotone_constraints = param.monotone_constraints;

   node_sum_gradients.resize(max_nodes);
@@ -970,15 +1065,18 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
   ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
   gidx_buffer.Fill(0);

-  int nbits = common::detail::SymbolBits(num_symbols);
-
-  CreateHistIndices(row_batch);
-
-  gidx = common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols);
+  this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
+
+  ellpack_matrix.Init(
+      feature_segments.GetSpan(), min_fvalue.GetSpan(),
+      gidx_fvalue_map.GetSpan(), row_stride,
+      common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols),
+      is_dense, null_gidx_value);

   // check if we can use shared memory for building histograms
-  // (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding)
-  auto histogram_size = sizeof(GradientSumT) * null_gidx_value;
+  // (assuming atleast we need 2 CTAs per SM to maintain decent latency
+  // hiding)
+  auto histogram_size = sizeof(GradientSumT) * hmat.row_ptr.back();
   auto max_smem = dh::MaxSharedMemory(device_id);
   if (histogram_size <= max_smem) {
     hist_builder.reset(new SharedMemHistBuilder<GradientSumT>);
@@ -990,9 +1088,9 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
   hist.Init(device_id, hmat.NumBins());
 }

 template <typename GradientSumT>
-inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_batch) {
+inline void DeviceShard<GradientSumT>::CreateHistIndices(
+    const SparsePage& row_batch, size_t row_stride, int null_gidx_value) {
   int num_symbols = n_bins + 1;
   // bin and compress entries in batches of rows
   size_t gpu_batch_nrows =
@@ -1026,7 +1124,7 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
         gidx_buffer.Data(),
         row_ptrs.data().get() + batch_row_begin,
         entries_d.data().get(),
-        d_cut.gidx_fvalue_map.Data(), d_cut.feature_segments.Data(),
+        gidx_fvalue_map.Data(), feature_segments.Data(),
         batch_row_begin, batch_nrows,
         row_ptrs[batch_row_begin],
         row_stride, null_gidx_value);
@@ -1039,12 +1137,9 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
     entries_d.shrink_to_fit();
   }
 }

 template <typename GradientSumT>
 class GPUHistMakerSpecialised{
  public:
-  struct ExpandEntry;
-
   GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
   void Init(
       const std::vector<std::pair<std::string, std::string>>& args) {
@@ -1111,7 +1206,6 @@ class GPUHistMakerSpecialised{
           shard = std::unique_ptr<DeviceShard<GradientSumT>>(
               new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(i), start,
                                             start + size, param_));
-          shard->InitRowPtrs(batch);
         });

     // Find the cuts.
@@ -1119,13 +1213,14 @@ class GPUHistMakerSpecialised{
     common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
     n_bins_ = hmat_.row_ptr.back();
     monitor_.StopCuda("Quantiles");
+    auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;

     monitor_.StartCuda("BinningCompression");
     dh::ExecuteIndexShards(
         &shards_,
         [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
           dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->InitCompressedData(hmat_, batch);
+          shard->InitCompressedData(hmat_, batch, is_dense);
         });
     monitor_.StopCuda("BinningCompression");
     ++batch_iter;
@@ -1300,32 +1395,19 @@ class GPUHistMakerSpecialised{
   }

   void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) {
-    int nidx = candidate.nid;
-    int left_nidx = (*p_tree)[nidx].LeftChild();
-    int right_nidx = (*p_tree)[nidx].RightChild();
-
-    // convert floating-point split_pt into corresponding bin_id
-    // split_cond = -1 indicates that split_pt is less than all known cut points
-    int64_t split_gidx = -1;
-    int64_t fidx = candidate.split.findex;
-    bool default_dir_left = candidate.split.dir == kLeftDir;
-    uint32_t fidx_begin = hmat_.row_ptr[fidx];
-    uint32_t fidx_end = hmat_.row_ptr[fidx + 1];
-    // split_gidx = i where i is the i^th bin containing split value.
-    for (auto i = fidx_begin; i < fidx_end; ++i) {
-      if (candidate.split.fvalue == hmat_.cut[i]) {
-        split_gidx = static_cast<int64_t>(i);
-      }
-    }
-    auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
-
     dh::ExecuteIndexShards(
         &shards_,
         [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
           dh::safe_cuda(cudaSetDevice(shard->device_id));
-          shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx, split_gidx,
-                                default_dir_left, is_dense, fidx_begin,
-                                fidx_end);
+          shard->UpdatePosition(candidate.nid,
+                                p_tree->GetNodes()[candidate.nid]);
+        });
+  }
+
+  void FinalisePosition(RegTree* p_tree) {
+    dh::ExecuteIndexShards(
+        &shards_,
+        [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
+          shard->FinalisePosition(p_tree);
         });
   }

@@ -1380,20 +1462,22 @@ class GPUHistMakerSpecialised{
     while (!qexpand_->empty()) {
       ExpandEntry candidate = qexpand_->top();
       qexpand_->pop();
-      if (!candidate.IsValid(param_, num_leaves)) continue;
+      if (!candidate.IsValid(param_, num_leaves)) {
+        continue;
+      }

       this->ApplySplit(candidate, p_tree);
-      monitor_.StartCuda("UpdatePosition");
-      this->UpdatePosition(candidate, p_tree);
-      monitor_.StopCuda("UpdatePosition");
       num_leaves++;

       int left_child_nidx = tree[candidate.nid].LeftChild();
       int right_child_nidx = tree[candidate.nid].RightChild();

       // Only create child entries if needed
       if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx),
                                     num_leaves)) {
+        monitor_.StartCuda("UpdatePosition");
+        this->UpdatePosition(candidate, p_tree);
+        monitor_.StopCuda("UpdatePosition");
+
         monitor_.StartCuda("BuildHist");
         this->BuildHistLeftRight(candidate.nid, left_child_nidx,
                                  right_child_nidx);
@@ -1411,6 +1495,10 @@ class GPUHistMakerSpecialised{
         monitor_.StopCuda("EvaluateSplits");
       }
     }
+
+    monitor_.StartCuda("FinalisePosition");
+    this->FinalisePosition(p_tree);
+    monitor_.StopCuda("FinalisePosition");
   }

   bool UpdatePredictionCache(
@@ -1431,64 +1519,6 @@ class GPUHistMakerSpecialised{
     return true;
   }

-  struct ExpandEntry {
-    int nid;
-    int depth;
-    DeviceSplitCandidate split;
-    uint64_t timestamp;
-    ExpandEntry(int _nid, int _depth, const DeviceSplitCandidate _split,
-                uint64_t _timestamp) :
-        nid{_nid}, depth{_depth}, split(std::move(_split)),
-        timestamp{_timestamp} {}
-    bool IsValid(const TrainParam& param, int num_leaves) const {
-      if (split.loss_chg <= kRtEps) {
-        return false;
-      }
-      if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
-        return false;
-      }
-      if (param.max_depth > 0 && depth == param.max_depth) {
-        return false;
-      }
-      if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
-        return false;
-      }
-      return true;
-    }
-
-    static bool ChildIsValid(const TrainParam& param, int depth,
-                             int num_leaves) {
-      if (param.max_depth > 0 && depth >= param.max_depth) return false;
-      if (param.max_leaves > 0 && num_leaves >= param.max_leaves) return false;
-      return true;
-    }
-
-    friend std::ostream& operator<<(std::ostream& os, const ExpandEntry& e) {
-      os << "ExpandEntry: \n";
-      os << "nidx: " << e.nid << "\n";
-      os << "depth: " << e.depth << "\n";
-      os << "loss: " << e.split.loss_chg << "\n";
-      os << "left_sum: " << e.split.left_sum << "\n";
-      os << "right_sum: " << e.split.right_sum << "\n";
-      return os;
-    }
-  };
-
-  inline static bool DepthWise(ExpandEntry lhs, ExpandEntry rhs) {
-    if (lhs.depth == rhs.depth) {
-      return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-    } else {
-      return lhs.depth > rhs.depth;  // favor small depth
-    }
-  }
-  inline static bool LossGuide(ExpandEntry lhs, ExpandEntry rhs) {
-    if (lhs.split.loss_chg == rhs.split.loss_chg) {
-      return lhs.timestamp > rhs.timestamp;  // favor small timestamp
-    } else {
-      return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
-    }
-  }
-
   TrainParam param_;            // NOLINT
   common::HistCutMatrix hmat_;  // NOLINT
   MetaInfo* info_;              // NOLINT
@@ -1507,7 +1537,8 @@ class GPUHistMakerSpecialised{
   GPUHistMakerTrainParam hist_maker_param_;
   common::GHistIndexMatrix gmat_;

-  using ExpandQueue = std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
+  using ExpandQueue =
+      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                           std::function<bool(ExpandEntry, ExpandEntry)>>;
   std::unique_ptr<ExpandQueue> qexpand_;
   dh::AllReducer reducer_;
@@ -26,7 +26,7 @@

 bool FileExists(const std::string& filename);

-long GetFileSize(const std::string& filename);
+int64_t GetFileSize(const std::string& filename);

 void CreateSimpleTestData(const std::string& filename);

@ -39,8 +39,9 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
|
|||||||
0.26f, 0.74f, 1.98f,
|
0.26f, 0.74f, 1.98f,
|
||||||
0.26f, 0.71f, 1.83f};
|
0.26f, 0.71f, 1.83f};
|
||||||
|
|
||||||
shard->InitRowPtrs(batch);
|
auto is_dense = (*dmat)->Info().num_nonzero_ ==
|
||||||
shard->InitCompressedData(cmat, batch);
|
(*dmat)->Info().num_row_ * (*dmat)->Info().num_col_;
|
||||||
|
shard->InitCompressedData(cmat, batch, is_dense);
|
||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
}
|
}
|
||||||
@ -59,7 +60,7 @@ TEST(GpuHist, BuildGidxDense) {
|
|||||||
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||||
|
|
||||||
ASSERT_EQ(shard.row_stride, kNCols);
|
ASSERT_EQ(shard.ellpack_matrix.row_stride, kNCols);
|
||||||
|
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
0, 3, 8, 9, 14, 17, 20, 21,
|
0, 3, 8, 9, 14, 17, 20, 21,
|
||||||
@ -98,7 +99,7 @@ TEST(GpuHist, BuildGidxSparse) {
|
|||||||
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||||
|
|
||||||
ASSERT_LE(shard.row_stride, 3);
|
ASSERT_LE(shard.ellpack_matrix.row_stride, 3);
|
||||||
|
|
||||||
// row_stride = 3, 16 rows, 48 entries for ELLPack
|
// row_stride = 3, 16 rows, 48 entries for ELLPack
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
@ -106,7 +107,7 @@ TEST(GpuHist, BuildGidxSparse) {
|
|||||||
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
||||||
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < kNRows * shard.row_stride; ++i) {
|
for (size_t i = 0; i < kNRows * shard.ellpack_matrix.row_stride; ++i) {
|
||||||
ASSERT_EQ(solution[i], gidx[i]);
|
ASSERT_EQ(solution[i], gidx[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -256,16 +257,19 @@ TEST(GpuHist, EvaluateSplits) {
   common::HistCutMatrix cmat = GetHostCutMatrix();
 
   // Copy cut matrix to device.
-  DeviceShard<GradientPairPrecise>::DeviceHistCutMatrix cut;
   shard->ba.Allocate(0,
-                     &(shard->d_cut.feature_segments), cmat.row_ptr.size(),
-                     &(shard->d_cut.min_fvalue), cmat.min_val.size(),
-                     &(shard->d_cut.gidx_fvalue_map), 24,
+                     &(shard->feature_segments), cmat.row_ptr.size(),
+                     &(shard->min_fvalue), cmat.min_val.size(),
+                     &(shard->gidx_fvalue_map), 24,
                      &(shard->monotone_constraints), kNCols);
-  shard->d_cut.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
-  shard->d_cut.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
+  shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
+  shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
   shard->monotone_constraints.copy(param.monotone_constraints.begin(),
                                    param.monotone_constraints.end());
+  shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan();
+  shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan();
+  shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end());
+  shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan();
 
   // Initialize DeviceShard::hist
   shard->hist.Init(0, (max_bins - 1) * kNCols);
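This hunk captures the shape of the refactor: the nested DeviceHistCutMatrix wrapper is gone, the shard owns the device buffers directly, and ellpack_matrix holds only non-owning spans over them. A hedged sketch of that idea (illustrative names, not the real XGBoost types) is that the view is a plain struct of pointers and sizes, cheap to pass to CUDA kernels by value:

#include <cstddef>
#include <cstdint>

template <typename T>
struct Span { T* data; std::size_t size; };  // non-owning view

struct EllpackViewSketch {
  Span<const uint32_t> feature_segments;  // cut-value range per feature
  Span<const float> min_fvalue;           // per-feature minimum value
  Span<const float> gidx_fvalue_map;      // bin index -> cut value
  std::size_t row_stride;                 // padded entries per row
  bool is_dense;                          // dense rows skip null-bin checks
};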
@@ -339,7 +343,7 @@ TEST(GpuHist, ApplySplit) {
   shard->ridx_segments[0] = Segment(0, kNRows);
   shard->ba.Allocate(0, &(shard->ridx), kNRows,
                      &(shard->position), kNRows);
-  shard->row_stride = kNCols;
+  shard->ellpack_matrix.row_stride = kNCols;
   thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
                    shard->ridx.CurrentDVec().tend());
   // Initialize GPUHistMaker
@@ -351,11 +355,9 @@ TEST(GpuHist, ApplySplit) {
       0.59, 4,  // fvalue has to be equal to one of the cut field
       GradientPair(8.2, 2.8), GradientPair(6.3, 3.6),
       GPUTrainingParam(param));
-  GPUHistMakerSpecialised<GradientPairPrecise>::ExpandEntry candidate_entry {0, 0, candidate, 0};
+  ExpandEntry candidate_entry {0, 0, candidate, 0};
   candidate_entry.nid = kNId;
 
-  auto const& nodes = tree.GetNodes();
-
   // Used to get bin_id in update position.
   common::HistCutMatrix cmat = GetHostCutMatrix();
   hist_maker.hmat_ = cmat;
@@ -370,19 +372,31 @@ TEST(GpuHist, ApplySplit) {
   int row_stride = kNCols;
   int num_symbols = n_bins + 1;
   size_t compressed_size_bytes =
-      common::CompressedBufferWriter::CalculateBufferSize(
-          row_stride * kNRows, num_symbols);
-  shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes);
+      common::CompressedBufferWriter::CalculateBufferSize(row_stride * kNRows,
+                                                          num_symbols);
+  shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes,
+                     &(shard->feature_segments), cmat.row_ptr.size(),
+                     &(shard->min_fvalue), cmat.min_val.size(),
+                     &(shard->gidx_fvalue_map), 24);
+  shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
+  shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
+  shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan();
+  shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan();
+  shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end());
+  shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan();
+  shard->ellpack_matrix.is_dense = true;
 
   common::CompressedBufferWriter wr(num_symbols);
-  std::vector<int> h_gidx (kNRows * row_stride);
-  std::iota(h_gidx.begin(), h_gidx.end(), 0);
+  // gidx 14 should go right, 12 goes left
+  std::vector<int> h_gidx (kNRows * row_stride, 14);
+  h_gidx[4] = 12;
+  h_gidx[12] = 12;
   std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes);
 
   wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end());
   shard->gidx_buffer.copy(h_gidx_compressed.begin(), h_gidx_compressed.end());
 
-  shard->gidx = common::CompressedIterator<uint32_t>(
+  shard->ellpack_matrix.gidx_iter = common::CompressedIterator<uint32_t>(
       shard->gidx_buffer.Data(), num_symbols);
 
   hist_maker.info_ = &info;
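Two details in this hunk deserve a gloss. First, num_symbols is n_bins + 1 because the null/missing bin needs its own symbol in the compressed stream. Second, the gidx buffer is bit-packed, which is what CalculateBufferSize accounts for. A back-of-the-envelope version of that sizing (an approximation, not the exact CompressedBufferWriter formula):

#include <cmath>
#include <cstddef>

// Each of `n` bin indices is stored in ceil(log2(num_symbols)) bits,
// rounded up to whole bytes.
std::size_t ApproxCompressedBytes(std::size_t n, std::size_t num_symbols) {
  auto bits = static_cast<std::size_t>(std::ceil(std::log2(num_symbols)));
  return (n * bits + 7) / 8;
}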
@@ -395,8 +409,8 @@ TEST(GpuHist, ApplySplit) {
   int right_nidx = tree[kNId].RightChild();
 
   ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0);
-  ASSERT_EQ(shard->ridx_segments[left_nidx].end, 6);
-  ASSERT_EQ(shard->ridx_segments[right_nidx].begin, 6);
+  ASSERT_EQ(shard->ridx_segments[left_nidx].end, 2);
+  ASSERT_EQ(shard->ridx_segments[right_nidx].begin, 2);
   ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16);
 }
 
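The new expectations follow directly from the rewritten h_gidx above: exactly two rows carry bin 12, which falls left of the split, and the remaining fourteen carry bin 14, which falls right, so after ApplySplit the left segment is [0, 2) and the right segment is [2, 16).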
@@ -417,7 +431,7 @@ void TestSortPosition(const std::vector<int>& position_in, int left_idx,
       common::Span<int>(position_out.data().get(), position_out.size()),
       common::Span<bst_uint>(ridx.data().get(), ridx.size()),
       common::Span<bst_uint>(ridx_out.data().get(), ridx_out.size()), left_idx,
-      right_idx, d_left_count.data().get());
+      right_idx, d_left_count.data().get(), nullptr);
   thrust::host_vector<int> position_result = position_out;
   thrust::host_vector<int> ridx_result = ridx_out;
 
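For orientation, here is a host-side analogue of what the device SortPosition is being tested for (a sketch, not the CUDA kernel): stably gather rows labelled left_idx in front of rows labelled right_idx and count how many went left. The extra trailing nullptr in the diff is presumably an optional CUDA stream handle added by this commit; that reading is an assumption, not confirmed by the diff.

#include <algorithm>
#include <cstddef>
#include <vector>

// Stable partition of row indices by their node assignment.
std::size_t SortPositionHost(const std::vector<int>& position, int left_idx,
                             std::vector<unsigned>* ridx) {
  std::stable_partition(ridx->begin(), ridx->end(),
                        [&](unsigned r) { return position[r] == left_idx; });
  return static_cast<std::size_t>(
      std::count(position.begin(), position.end(), left_idx));
}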