From f90e7f9aa8323102f544854f19cc5f92d66db8c6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 6 Sep 2019 03:01:42 -0400
Subject: [PATCH] Some comments for row partitioner. (#4832)

---
 src/tree/gpu_hist/row_partitioner.cu  | 22 +++++++++++++++++++---
 src/tree/gpu_hist/row_partitioner.cuh | 25 ++++++++++++++++++++-----
 src/tree/updater_gpu_hist.cu          | 11 ++++++-----
 3 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
index b509822b8..198c714b4 100644
--- a/src/tree/gpu_hist/row_partitioner.cu
+++ b/src/tree/gpu_hist/row_partitioner.cu
@@ -19,7 +19,9 @@ struct IndicateLeftTransform {
     return x == left_nidx ? 1 : 0;
   }
 };
-
+/*
+ * position: Positions of rows belonging to the current split node.
+ */
 void RowPartitioner::SortPosition(common::Span<TreePositionT> position,
                                   common::Span<TreePositionT> position_out,
                                   common::Span<RowIndexT> ridx,
@@ -27,27 +29,37 @@ void RowPartitioner::SortPosition(common::Span<TreePositionT> position,
                                   TreePositionT left_nidx,
                                   TreePositionT right_nidx,
                                   int64_t* d_left_count, cudaStream_t stream) {
+  // radix sort over 1 bit, see:
+  // https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html
   auto d_position_out = position_out.data();
   auto d_position_in = position.data();
   auto d_ridx_out = ridx_out.data();
   auto d_ridx_in = ridx.data();
   auto write_results = [=] __device__(size_t idx, int ex_scan_result) {
+    // ex_scan_result represents how many rows have been assigned to the left
+    // node so far during the scan.
     int scatter_address;
     if (d_position_in[idx] == left_nidx) {
       scatter_address = ex_scan_result;
     } else {
+      // current number of rows belonging to the right node + total number of
+      // rows belonging to the left node
      scatter_address = (idx - ex_scan_result) + *d_left_count;
     }
+    // copy the node id to output
     d_position_out[scatter_address] = d_position_in[idx];
     d_ridx_out[scatter_address] = d_ridx_in[idx];
   };  // NOLINT

-  IndicateLeftTransform conversion_op(left_nidx);
+  IndicateLeftTransform is_left(left_nidx);
+  // an iterator that, given an old position, returns whether it belongs to the
+  // left or the right node.
   cub::TransformInputIterator<int, IndicateLeftTransform, TreePositionT*>
-      in_itr(d_position_in, conversion_op);
+      in_itr(d_position_in, is_left);
   dh::DiscardLambdaItr<decltype(write_results)> out_itr(write_results);
   size_t temp_storage_bytes = 0;
+  // position has the same size as the current split node's row segment
   cub::DeviceScan::ExclusiveSum(nullptr, temp_storage_bytes, in_itr, out_itr,
                                 position.size(), stream);
   dh::caching_device_vector<int8_t> temp_storage(temp_storage_bytes);
@@ -125,11 +137,15 @@ void RowPartitioner::SortPositionAndCopy(const Segment& segment,
                                          int64_t* d_left_count,
                                          cudaStream_t stream) {
   SortPosition(
+      // position_in
       common::Span<TreePositionT>(position.Current() + segment.begin,
                                   segment.Size()),
+      // position_out
       common::Span<TreePositionT>(position.other() + segment.begin,
                                   segment.Size()),
+      // row index in
       common::Span<RowIndexT>(ridx.Current() + segment.begin, segment.Size()),
+      // row index out
       common::Span<RowIndexT>(ridx.other() + segment.begin, segment.Size()),
       left_nidx, right_nidx, d_left_count, stream);
   // Copy back key/value
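Note on the scatter above: write_results is driven by an exclusive prefix sum over the is-left flags, so ex_scan_result is each row's rank among the rows going left. The following is a minimal host-side C++ sketch of the same 1-bit radix sort; it is not part of the patch, and all names and sample data are illustrative:

    // 1-bit radix sort: stable partition of (position, row index) pairs by
    // is-left flag, mirroring RowPartitioner::SortPosition.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      const int32_t left_nidx = 1;  // node ids: 1 = left child, 2 = right child
      std::vector<int32_t> positions{2, 1, 2, 1, 1};   // row -> node id
      std::vector<uint32_t> rows{10, 11, 12, 13, 14};  // row indices into X

      // Exclusive scan of the is-left flags: how many rows before idx go left.
      std::vector<int64_t> ex_scan(positions.size(), 0);
      for (size_t i = 1; i < positions.size(); ++i) {
        ex_scan[i] = ex_scan[i - 1] + (positions[i - 1] == left_nidx ? 1 : 0);
      }
      int64_t left_count = ex_scan.back() + (positions.back() == left_nidx ? 1 : 0);

      // Scatter: left rows take their scan rank; right rows land after all
      // left rows, at (idx - ex_scan[idx]) + left_count.
      std::vector<int32_t> positions_out(positions.size());
      std::vector<uint32_t> rows_out(rows.size());
      for (size_t idx = 0; idx < positions.size(); ++idx) {
        size_t scatter_address =
            positions[idx] == left_nidx
                ? static_cast<size_t>(ex_scan[idx])
                : (idx - static_cast<size_t>(ex_scan[idx])) + left_count;
        positions_out[scatter_address] = positions[idx];
        rows_out[scatter_address] = rows[idx];
      }

      for (auto r : rows_out) std::cout << r << ' ';  // prints: 11 13 14 10 12
      std::cout << '\n';
      return 0;
    }

Because the scan result is non-decreasing, rows keep their relative order within each side, which is what makes the partition stable.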
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 5ff45bd2b..7cfe04d51 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -30,19 +30,32 @@ __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment
  * partition training rows into different leaf nodes. */
 class RowPartitioner {
  public:
-  using TreePositionT = int;
+  using TreePositionT = int32_t;
   using RowIndexT = bst_uint;
   struct Segment;

  private:
   int device_idx;
-  /*! \brief Range of rows for each node. */
+  /*! \brief To find the rows belonging to a node nid, first fetch the index
+   * segment from ridx_segments[nid], then use it to index into ridx, which
+   * stores the positions of the rows in the input data X.
+   * `RowPartitioner::GetRows` is a good starting place to see what these vectors store.
+   *
+   * node id -> segment -> indices of rows belonging to node
+   */
+  /*! \brief Range of row indices for each node, pointers into ridx below. */
   std::vector<Segment> ridx_segments;
   dh::caching_device_vector<RowIndexT> ridx_a;
   dh::caching_device_vector<RowIndexT> ridx_b;
   dh::caching_device_vector<TreePositionT> position_a;
   dh::caching_device_vector<TreePositionT> position_b;
+  /*! \brief mapping of node id -> rows.
+   * This looks like:
+   * node id  |    1    |    2   |
+   * rows idx | 3, 5, 1 | 13, 31 |
+   */
   dh::DoubleBuffer<RowIndexT> ridx;
+  /*! \brief mapping of row -> node id. */
   dh::DoubleBuffer<TreePositionT> position;
   dh::caching_device_vector<int64_t> left_counts;  // Useful to keep a bunch of
                                                    // zeroed memory for sort position
@@ -95,20 +108,22 @@ class RowPartitioner {
   void UpdatePosition(TreePositionT nidx, TreePositionT left_nidx,
                       TreePositionT right_nidx, UpdatePositionOpT op) {
     dh::safe_cuda(cudaSetDevice(device_idx));
-    Segment segment = ridx_segments.at(nidx);
+    Segment segment = ridx_segments.at(nidx);  // rows belonging to node nidx
     auto d_ridx = ridx.CurrentSpan();
     auto d_position = position.CurrentSpan();
     if (left_counts.size() <= nidx) {
       left_counts.resize((nidx * 2) + 1);
       thrust::fill(left_counts.begin(), left_counts.end(), 0);
     }
+    // Now we divide the row segment into left and right nodes.
+
     int64_t* d_left_count = left_counts.data().get() + nidx;
     // Launch 1 thread for each row
     dh::LaunchN<1, 128>(device_idx, segment.Size(), [=] __device__(size_t idx) {
+      // LaunchN starts from zero, so we restore the row index by adding segment.begin
       idx += segment.begin;
       RowIndexT ridx = d_ridx[idx];
-      // Missing value
-      TreePositionT new_position = op(ridx);
+      TreePositionT new_position = op(ridx);  // new node id
       KERNEL_CHECK(new_position == left_nidx || new_position == right_nidx);
       AtomicIncrement(d_left_count, new_position == left_nidx);
       d_position[idx] = new_position;
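To make the node id -> segment -> row indices layout described in the new comments concrete, here is a small host-side C++ sketch; it is not part of the patch, and the Segment fields (begin, n) and the sample data are illustrative stand-ins:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Segment {
      std::size_t begin{0};
      std::size_t n{0};
    };

    int main() {
      // ridx stores row indices grouped by node, e.g.:
      // node id  |    1    |    2   |
      // rows idx | 3, 5, 1 | 13, 31 |
      std::vector<unsigned> ridx{3, 5, 1, 13, 31};
      // One segment per node; node 0 is left empty for brevity.
      std::vector<Segment> ridx_segments{{0, 0}, {0, 3}, {3, 2}};

      // Rows belonging to node nid are ridx[s.begin .. s.begin + s.n).
      int nid = 2;
      Segment s = ridx_segments[nid];
      for (std::size_t i = s.begin; i < s.begin + s.n; ++i) {
        std::cout << ridx[i] << ' ';  // prints: 13 31
      }
      std::cout << '\n';
      return 0;
    }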
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 94ad15e51..0a06daa2a 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -152,8 +152,8 @@ struct ELLPackMatrix {

   XGBOOST_DEVICE size_t BinCount() const { return gidx_fvalue_map.size(); }

-  // Get a matrix element, uses binary search for look up
-  // Return NaN if missing
+  // Get a matrix element, using binary search for the lookup.  Returns NaN if missing.
+  // Given a row index and a feature index, returns the corresponding cut value.
   __device__ bst_float GetElement(size_t ridx, size_t fidx) const {
     auto row_begin = row_stride * ridx;
     auto row_end = row_begin + row_stride;
@@ -832,14 +832,15 @@ struct DeviceShard {
     row_partitioner->UpdatePosition(
         nidx, split_node.LeftChild(), split_node.RightChild(),
         [=] __device__(bst_uint ridx) {
-          bst_float element =
+          // given a row index, the lambda returns the node id the row belongs to
+          bst_float cut_value =
               d_matrix.GetElement(ridx, split_node.SplitIndex());
           // Missing value
           int new_position = 0;
-          if (isnan(element)) {
+          if (isnan(cut_value)) {
             new_position = split_node.DefaultChild();
           } else {
-            if (element <= split_node.SplitCond()) {
+            if (cut_value <= split_node.SplitCond()) {
               new_position = split_node.LeftChild();
             } else {
               new_position = split_node.RightChild();
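The routing rule in the UpdatePosition lambda above reduces to three cases: missing value -> default child, value within the split condition -> left child, otherwise -> right child. A minimal host-side C++ sketch follows; it is not part of the patch, and the node ids and split values are made up:

    #include <cmath>
    #include <iostream>

    int main() {
      const int left_child = 1;
      const int right_child = 2;
      const int default_child = left_child;  // learned default direction for missing values
      const float split_cond = 0.5f;

      const float feature_values[] = {0.3f, 0.7f, NAN};
      for (float v : feature_values) {
        int new_position;
        if (std::isnan(v)) {
          new_position = default_child;  // missing value: follow the default direction
        } else if (v <= split_cond) {
          new_position = left_child;     // value satisfies the split condition
        } else {
          new_position = right_child;
        }
        std::cout << v << " -> node " << new_position << '\n';
      }
      return 0;
    }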