diff --git a/include/xgboost/data.h b/include/xgboost/data.h index fbc93def0..32ca6f314 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -151,10 +151,11 @@ struct Entry { }; /*! - * \brief in-memory storage unit of sparse batch + * \brief In-memory storage unit of sparse batch, stored in CSR format. */ class SparsePage { public: + // Offset for each row. HostDeviceVector offset; /*! \brief the data of the segments */ HostDeviceVector data; diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 507325e22..874fd311a 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -204,7 +204,7 @@ inline void LaunchN(int device_idx, size_t n, L lambda) { const int GRID_SIZE = static_cast(DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS)); LaunchNKernel<<>>(static_cast(0), n, - lambda); + lambda); } /* @@ -365,6 +365,7 @@ class DVec2 { T *other() { return buff_.Alternate(); } }; +/*! \brief Helper for allocating large block of memory. */ template class BulkAllocator { std::vector d_ptr_; diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 4c8f4a382..1d5f3c8f1 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -53,34 +53,15 @@ struct GHistEntry { } }; - -/*! \brief Cut configuration for one feature */ -struct HistCutUnit { - /*! \brief the index pointer of each histunit */ - const bst_float* cut; - /*! \brief number of cutting point, containing the maximum point */ - uint32_t size; - // default constructor - HistCutUnit() = default; - // constructor - HistCutUnit(const bst_float* cut, uint32_t size) - : cut(cut), size(size) {} -}; - -/*! \brief cut configuration for all the features. */ +/*! \brief Cut configuration for all the features. */ struct HistCutMatrix { - /*! \brief unit pointer to rows by element position */ + /*! \brief Unit pointer to rows by element position */ std::vector row_ptr; /*! \brief minimum value of each feature */ std::vector min_val; /*! \brief the cut field */ std::vector cut; uint32_t GetBinIdx(const Entry &e); - /*! \brief Get histogram bound for fid */ - inline HistCutUnit operator[](bst_uint fid) const { - return {dmlc::BeginPtr(cut) + row_ptr[fid], - row_ptr[fid + 1] - row_ptr[fid]}; - } using WXQSketch = common::WXQuantileSketch; @@ -189,7 +170,7 @@ class GHistIndexBlockMatrix { /*! * \brief histogram of graident statistics for a single node. - * Consists of multiple GHistEntry's, each entry showing total graident statistics + * Consists of multiple GHistEntry's, each entry showing total graident statistics * for that particular bin * Uses global bin id so as to represent all features simultaneously */ diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index e3e9fade8..297b40e39 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -75,7 +75,8 @@ inline void CheckGradientMax(const std::vector& gpair) { auto* ptr = reinterpret_cast(gpair.data()); float abs_max = std::accumulate(ptr, ptr + (gpair.size() * 2), 0.f, - [=](float a, float b) { return max(abs(a), abs(b)); }); + [=](float a, float b) { + return std::max(abs(a), abs(b)); }); CHECK_LT(abs_max, std::pow(2.0f, 16.0f)) << "Labels are too large for this algorithm. 
Rescale to less than 2^16."; @@ -254,6 +255,7 @@ XGBOOST_DEVICE float inline LossChangeMissing(const GradientPairT& scan, const float& parent_gain, const GPUTrainingParam& param, bool& missing_left_out) { // NOLINT + // Put gradients of missing values to left float missing_left_loss = DeviceCalcLossChange(param, scan + missing, parent_sum, parent_gain); float missing_right_loss = diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 20b743466..45d307078 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2017 XGBoost contributors + * Copyright 2017-2018 XGBoost contributors */ #include #include @@ -11,15 +11,17 @@ #include #include #include +#include #include #include #include +#include "../common/common.h" #include "../common/compressed_iterator.h" #include "../common/device_helpers.cuh" #include "../common/hist_util.h" #include "../common/host_device_vector.h" #include "../common/timer.h" -#include "../common/common.h" +#include "../common/span.h" #include "param.h" #include "updater_gpu_common.cuh" @@ -30,14 +32,25 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); using GradientPairSumT = GradientPairPrecise; +/*! + * \brief + * + * \tparam ReduceT BlockReduce Type. + * \tparam TempStorage Cub Shared memory + * + * \param begin + * \param end + * \param temp_storage Shared memory for intermediate result. + */ template __device__ GradientPairSumT ReduceFeature(const GradientPairSumT* begin, - const GradientPairSumT* end, - TempStorageT* temp_storage) { + const GradientPairSumT* end, + TempStorageT* temp_storage) { __shared__ cub::Uninitialized uninitialized_sum; GradientPairSumT& shared_sum = uninitialized_sum.Alias(); GradientPairSumT local_sum = GradientPairSumT(); + // For loop sums features into one block size for (auto itr = begin; itr < end; itr += BLOCK_THREADS) { bool thread_active = itr + threadIdx.x < end; // Scan histogram @@ -45,51 +58,60 @@ __device__ GradientPairSumT ReduceFeature(const GradientPairSumT* begin, local_sum += bin; } local_sum = ReduceT(temp_storage->sum_reduce).Reduce(local_sum, cub::Sum()); - + // Reduction result is stored in thread 0. if (threadIdx.x == 0) { shared_sum = local_sum; } __syncthreads(); - return shared_sum; } +/*! \brief Find the thread with best gain. */ template -__device__ void EvaluateFeature(int fidx, const GradientPairSumT* hist, - const int* feature_segments, float min_fvalue, - const float* gidx_fvalue_map, - DeviceSplitCandidate* best_split, - const DeviceNodeStats& node, - const GPUTrainingParam& param, - TempStorageT* temp_storage, int constraint, - const ValueConstraint& value_constraint) { - int gidx_begin = feature_segments[fidx]; - int gidx_end = feature_segments[fidx + 1]; +__device__ void EvaluateFeature( + int fidx, + const GradientPairSumT* hist, - GradientPairSumT feature_sum = ReduceFeature( + const uint32_t* feature_segments, // cut.row_ptr + float min_fvalue, // cut.min_value + const float* gidx_fvalue_map, // cut.cut + + DeviceSplitCandidate* best_split, // shared memory storing best split + const DeviceNodeStats& node, + const GPUTrainingParam& param, + TempStorageT* temp_storage, // temp memory for cub operations + int constraint, // monotonic_constraints + const ValueConstraint& value_constraint) { + // Use pointer from cut to indicate begin and end of bins for each feature. 
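// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a sequential, host-side version
// of the split search performed below.  It uses the plain L2-regularised gain
// G^2 / (H + lambda) in place of DeviceCalcLossChange / LossChangeMissing and
// ignores monotonic constraints; `GPair` and `EvaluateFeatureHost` are
// hypothetical names.
#include <cstdint>
#include <limits>
#include <utility>
#include <vector>

using GPair = std::pair<double, double>;  // (sum of gradients, sum of hessians)

inline double Gain(const GPair& s, double lambda) {
  return s.first * s.first / (s.second + lambda);
}

// hist holds the bins [gidx_begin, gidx_end) of one feature; parent is the
// node's total gradient sum (node.sum_gradients).
inline void EvaluateFeatureHost(const std::vector<GPair>& hist, GPair parent,
                                float min_fvalue,
                                const std::vector<float>& gidx_fvalue_map,
                                uint32_t gidx_begin, uint32_t gidx_end,
                                double lambda, double* best_gain,
                                float* best_fvalue, bool* best_missing_left) {
  GPair feature_sum(0.0, 0.0);
  for (uint32_t i = gidx_begin; i < gidx_end; ++i) {
    feature_sum.first += hist[i].first;
    feature_sum.second += hist[i].second;
  }
  // Gradients of rows where this feature is missing.
  GPair missing(parent.first - feature_sum.first,
                parent.second - feature_sum.second);
  double parent_gain = Gain(parent, lambda);

  *best_gain = -std::numeric_limits<double>::infinity();
  GPair scan(0.0, 0.0);  // exclusive prefix sum over bins, like the block scan
  for (uint32_t gidx = gidx_begin; gidx < gidx_end; ++gidx) {
    // Try sending the missing values left, then right; keep the better split.
    for (int missing_left = 1; missing_left >= 0; --missing_left) {
      GPair left = scan;
      if (missing_left) {
        left.first += missing.first;
        left.second += missing.second;
      }
      GPair right(parent.first - left.first, parent.second - left.second);
      double gain = Gain(left, lambda) + Gain(right, lambda) - parent_gain;
      if (gain > *best_gain) {
        *best_gain = gain;
        // Split value: the lower bound of this bin, exactly as below.
        *best_fvalue = gidx == gidx_begin ? min_fvalue
                                          : gidx_fvalue_map[gidx - 1];
        *best_missing_left = missing_left != 0;
      }
    }
    scan.first += hist[gidx].first;
    scan.second += hist[gidx].second;
  }
}
// ---------------------------------------------------------------------------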
+ uint32_t gidx_begin = feature_segments[fidx]; // begining bin + uint32_t gidx_end = feature_segments[fidx + 1]; // end bin for i^th feature + + // Sum histogram bins for current feature + GradientPairSumT const feature_sum = ReduceFeature( hist + gidx_begin, hist + gidx_end, temp_storage); - auto prefix_op = SumCallbackOp(); + GradientPairSumT const parent_sum = GradientPairSumT(node.sum_gradients); + GradientPairSumT const missing = parent_sum - feature_sum; + float const null_gain = -std::numeric_limits::infinity(); + + SumCallbackOp prefix_op = + SumCallbackOp(); for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { - bool thread_active = scan_begin + threadIdx.x < gidx_end; + bool thread_active = (scan_begin + threadIdx.x) < gidx_end; + // Gradient value for current bin. GradientPairSumT bin = thread_active ? hist[scan_begin + threadIdx.x] : GradientPairSumT(); scan_t(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); - // Calculate gain - GradientPairSumT parent_sum = GradientPairSumT(node.sum_gradients); - - GradientPairSumT missing = parent_sum - feature_sum; - + // Whether the gradient of missing values is put to the left side. bool missing_left = true; - const float null_gain = -FLT_MAX; float gain = null_gain; if (thread_active) { gain = LossChangeMissing(bin, missing, parent_sum, node.root_gain, param, - constraint, value_constraint, missing_left); + constraint, value_constraint, missing_left); } __syncthreads(); @@ -111,27 +133,37 @@ __device__ void EvaluateFeature(int fidx, const GradientPairSumT* hist, int gidx = scan_begin + threadIdx.x; float fvalue = gidx == gidx_begin ? min_fvalue : gidx_fvalue_map[gidx - 1]; - GradientPairSumT left = missing_left ? bin + missing : bin; GradientPairSumT right = parent_sum - left; - - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, - GradientPair(left), GradientPair(right), param); + best_split->Update(gain, missing_left ? 
kLeftDir : kRightDir, + fvalue, fidx, + GradientPair(left), + GradientPair(right), + param); } __syncthreads(); } } template -__global__ void evaluate_split_kernel( - const GradientPairSumT* d_hist, int nidx, uint64_t n_features, - int* feature_set, DeviceNodeStats nodes, const int* d_feature_segments, - const float* d_fidx_min_map, const float* d_gidx_fvalue_map, - GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split, - ValueConstraint value_constraint, int* d_monotonic_constraints) { +__global__ void EvaluateSplitKernel( + const GradientPairSumT* d_hist, // histogram for gradients + uint64_t n_features, + int* feature_set, // Selected features + DeviceNodeStats node, + + const uint32_t* d_feature_segments, // row_ptr form HistCutMatrix + const float* d_fidx_min_map, // min_value + const float* d_gidx_fvalue_map, // cut + + GPUTrainingParam gpu_param, + DeviceSplitCandidate* d_split, // resulting split + ValueConstraint value_constraint, + int* d_monotonic_constraints) { + // KeyValuePair here used as threadIdx.x -> gain_value typedef cub::KeyValuePair ArgMaxT; - typedef cub::BlockScan - BlockScanT; + typedef cub::BlockScan< + GradientPairSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT; typedef cub::BlockReduce MaxReduceT; typedef cub::BlockReduce SumReduceT; @@ -142,6 +174,7 @@ __global__ void evaluate_split_kernel( typename SumReduceT::TempStorage sum_reduce; }; + // Aligned && shared storage for best_split __shared__ cub::Uninitialized uninitialized_split; DeviceSplitCandidate& best_split = uninitialized_split.Alias(); __shared__ TempStorage temp_storage; @@ -152,17 +185,28 @@ __global__ void evaluate_split_kernel( __syncthreads(); - auto fidx = feature_set[blockIdx.x]; - auto constraint = d_monotonic_constraints[fidx]; + // One block for each feature. Features are sampled, so fidx != blockIdx.x + int fidx = feature_set[blockIdx.x]; + int constraint = d_monotonic_constraints[fidx]; EvaluateFeature( - fidx, d_hist, d_feature_segments, d_fidx_min_map[fidx], d_gidx_fvalue_map, - &best_split, nodes, gpu_param, &temp_storage, constraint, + fidx, + d_hist, + + d_feature_segments, + d_fidx_min_map[fidx], + d_gidx_fvalue_map, + + &best_split, + node, + gpu_param, + &temp_storage, + constraint, value_constraint); __syncthreads(); if (threadIdx.x == 0) { - // Record best loss + // Record best loss for each feature d_split[fidx] = best_split; } } @@ -170,7 +214,7 @@ __global__ void evaluate_split_kernel( // Find a gidx value for a given feature otherwise return -1 if not found template __device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data, - int fidx_begin, int fidx_end) { + int const fidx_begin, int const fidx_end) { bst_uint previous_middle = UINT32_MAX; while (end != begin) { auto middle = begin + (end - begin) / 2; @@ -201,14 +245,14 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data, * \author Rory * \date 28/07/2018 */ - struct DeviceHistogram { - std::map - nidx_map; // Map nidx to starting index of its histogram + /*! \brief Map nidx to starting index of its histogram. 
*/ + std::map nidx_map; thrust::device_vector data; const size_t kStopGrowingSize = 1 << 26; // Do not grow beyond this size int n_bins; int device_idx; + void Init(int device_idx, int n_bins) { this->n_bins = n_bins; this->device_idx = device_idx; @@ -229,7 +273,7 @@ struct DeviceHistogram { if (data.size() > kStopGrowingSize) { // Recycle histogram memory - auto old_entry = *nidx_map.begin(); + std::pair old_entry = *nidx_map.begin(); nidx_map.erase(old_entry.first); dh::safe_cuda(cudaMemset(data.data().get() + old_entry.second, 0, n_bins * sizeof(GradientPairSumT))); @@ -238,6 +282,7 @@ struct DeviceHistogram { // Append new node histogram nidx_map[nidx] = data.size(); dh::safe_cuda(cudaSetDevice(device_idx)); + // x 2: Hess and Grad. data.resize(data.size() + (n_bins * 2)); } } @@ -247,7 +292,6 @@ struct DeviceHistogram { * \param nidx Tree node index. * \return hist pointer. */ - GradientPairSumT* GetHistPtr(int nidx) { CHECK(this->HistogramExists(nidx)); auto ptr = data.data().get() + nidx_map[nidx]; @@ -269,30 +313,42 @@ struct CalcWeightTrainParam { learning_rate(p.learning_rate) {} }; -__global__ void compress_bin_ellpack_k -(common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, - const size_t* __restrict__ row_ptrs, - const Entry* __restrict__ entries, - const float* __restrict__ cuts, const size_t* __restrict__ cut_rows, - size_t base_row, size_t n_rows, size_t row_ptr_begin, size_t row_stride, - unsigned int null_gidx_value) { - size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x; +// Bin each input data entry, store the bin indices in compressed form. +__global__ void compress_bin_ellpack_k( + common::CompressedBufferWriter wr, + common::CompressedByteT* __restrict__ buffer, // gidx_buffer + const size_t* __restrict__ row_ptrs, // row offset of input data + const Entry* __restrict__ entries, // One batch of input data + const float* __restrict__ cuts, // HistCutMatrix::cut + const uint32_t* __restrict__ cut_rows, // HistCutMatrix::row_ptrs + size_t base_row, // batch_row_begin + size_t n_rows, + // row_ptr_begin: row_offset[base_row], the start position of base_row + size_t row_ptr_begin, + size_t row_stride, + unsigned int null_gidx_value) { + size_t irow = threadIdx.x + blockIdx.x * blockDim.x; int ifeature = threadIdx.y + blockIdx.y * blockDim.y; if (irow >= n_rows || ifeature >= row_stride) return; - int row_size = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); + int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); unsigned int bin = null_gidx_value; - if (ifeature < row_size) { + if (ifeature < row_length) { Entry entry = entries[row_ptrs[irow] - row_ptr_begin + ifeature]; int feature = entry.index; float fvalue = entry.fvalue; + // {feature_cuts, ncuts} forms the array of cuts of `feature'. const float *feature_cuts = &cuts[cut_rows[feature]]; int ncuts = cut_rows[feature + 1] - cut_rows[feature]; + // Assigning the bin in current entry. + // S.t.: fvalue < feature_cuts[bin] bin = dh::UpperBound(feature_cuts, ncuts, fvalue); if (bin >= ncuts) bin = ncuts - 1; + // Add the number of bins in previous features. bin += cut_rows[feature]; } + // Write to gidx buffer. 
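// ---------------------------------------------------------------------------
// Illustrative host-side sketch (not part of the patch) of the binning and
// ELLPACK layout performed by this kernel.  It assumes `cuts` / `cut_rows`
// mirror HistCutMatrix::cut / HistCutMatrix::row_ptr and that dh::UpperBound
// behaves like std::upper_bound; `GlobalBinIdx` and `RowToEllpack` are
// hypothetical helper names.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Map one (feature, value) entry to its global bin index.
inline uint32_t GlobalBinIdx(int feature, float fvalue,
                             const std::vector<float>& cuts,
                             const std::vector<uint32_t>& cut_rows) {
  auto first = cuts.begin() + cut_rows[feature];
  auto last = cuts.begin() + cut_rows[feature + 1];
  auto it = std::upper_bound(first, last, fvalue);  // fvalue < *it
  if (it == last) --it;                             // clamp to the last bin
  // The distance from cuts.begin() already adds the bins of previous features.
  return static_cast<uint32_t>(it - cuts.begin());
}

// Convert one CSR row into a fixed-width ELLPACK row of length row_stride,
// padding with null_gidx_value (== total number of bins) past the row's end.
inline void RowToEllpack(const std::vector<std::pair<int, float>>& row,
                         size_t row_stride, uint32_t null_gidx_value,
                         const std::vector<float>& cuts,
                         const std::vector<uint32_t>& cut_rows,
                         std::vector<uint32_t>* out) {
  for (size_t k = 0; k < row_stride; ++k) {
    out->push_back(
        k < row.size()
            ? GlobalBinIdx(row[k].first, row[k].second, cuts, cut_rows)
            : null_gidx_value);
  }
}
// ---------------------------------------------------------------------------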
wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature); } @@ -323,57 +379,92 @@ __global__ void sharedMemHistKernel(size_t row_stride, } } +struct Segment { + size_t begin; + size_t end; + + Segment() : begin(0), end(0) {} + + Segment(size_t begin, size_t end) : begin(begin), end(end) { + CHECK_GE(end, begin); + } + size_t Size() const { return end - begin; } +}; + +struct DeviceShard; + +struct GPUHistBuilderBase { + public: + virtual void Build(DeviceShard* shard, int idx) = 0; +}; + // Manage memory for a single GPU struct DeviceShard { - struct Segment { - size_t begin; - size_t end; - - Segment() : begin(0), end(0) {} - - Segment(size_t begin, size_t end) : begin(begin), end(end) { - CHECK_GE(end, begin); - } - size_t Size() const { return end - begin; } - }; - int device_idx; - int normalised_device_idx; // Device index counting from param.gpu_id + /*! \brief Device index counting from param.gpu_id */ + int normalised_device_idx; dh::BulkAllocator ba; - dh::DVec gidx_buffer; - dh::DVec gpair; - dh::DVec2 ridx; // Row index relative to this shard - dh::DVec2 position; + + /*! \brief HistCutMatrix stored in device. */ + struct DeviceHistCutMatrix { + /*! \brief row_ptr form HistCutMatrix. */ + dh::DVec feature_segments; + /*! \brief minimum value for each feature. */ + dh::DVec min_fvalue; + /*! \brief Cut. */ + dh::DVec gidx_fvalue_map; + } cut_; + + /*! \brief Range of rows for each node. */ std::vector ridx_segments; - dh::DVec feature_segments; - dh::DVec gidx_fvalue_map; - dh::DVec min_fvalue; + DeviceHistogram hist; + + /*! \brief global index of histogram, which is stored in ELLPack format. */ + dh::DVec gidx_buffer; + /*! \brief row length for ELLPack. */ + size_t row_stride; + common::CompressedIterator gidx; + + /*! \brief Row indices relative to this shard, necessary for sorting rows. */ + dh::DVec2 ridx; + /*! \brief Gradient pair for each row. */ + dh::DVec gpair; + + /*! \brief The last histogram index. */ + int null_gidx_value; + + dh::DVec2 position; + dh::DVec monotone_constraints; dh::DVec prediction_cache; + + /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; dh::DVec node_sum_gradients_d; + /*! \brief row offset in SparsePage (the input data). */ thrust::device_vector row_ptrs; - common::CompressedIterator gidx; - size_t row_stride; - bst_uint row_begin_idx; // The row offset for this shard + /*! The row offset for this shard. 
*/ + bst_uint row_begin_idx; bst_uint row_end_idx; bst_uint n_rows; int n_bins; - int null_gidx_value; - DeviceHistogram hist; + TrainParam param; bool prediction_cache_initialised; - bool can_use_smem_atomics; + // FIXME: Remove this int64_t* tmp_pinned; // Small amount of staging memory + // Used to process nodes concurrently std::vector streams; dh::CubMemory temp_memory; + std::unique_ptr hist_builder; + // TODO(canonizer): do add support multi-batch DMatrix here DeviceShard(int device_idx, int normalised_device_idx, - bst_uint row_begin, bst_uint row_end, TrainParam param) : + bst_uint row_begin, bst_uint row_end, TrainParam _param) : device_idx(device_idx), normalised_device_idx(normalised_device_idx), row_begin_idx(row_begin), @@ -382,11 +473,11 @@ struct DeviceShard { n_rows(row_end - row_begin), n_bins(0), null_gidx_value(0), - param(param), + param(_param), prediction_cache_initialised(false), - can_use_smem_atomics(false), tmp_pinned(nullptr) {} + /* Init row_ptrs and row_stride */ void InitRowPtrs(const SparsePage& row_batch) { dh::safe_cuda(cudaSetDevice(device_idx)); const auto& offset_vec = row_batch.offset.HostVector(); @@ -395,7 +486,7 @@ struct DeviceShard { offset_vec.data() + row_end_idx + 1, row_ptrs.begin()); auto row_iter = row_ptrs.begin(); - // find the maximum row size + // find the maximum row size for converting to ELLPack auto get_size = [=] __device__(size_t row) { return row_iter[row + 1] - row_iter[row]; }; // NOLINT @@ -408,106 +499,16 @@ struct DeviceShard { thrust::maximum()); } - void InitCompressedData(const common::HistCutMatrix& hmat, const SparsePage& row_batch) { - n_bins = hmat.row_ptr.back(); - null_gidx_value = hmat.row_ptr.back(); + /* + Init: + n_bins, null_gidx_value, gidx_buffer, row_ptrs, gidx, gidx_fvalue_map, + min_fvalue, feature_segments, node_sum_gradients, ridx_segments, + hist + */ + void InitCompressedData( + const common::HistCutMatrix& hmat, const SparsePage& row_batch); - // copy cuts to the GPU - dh::safe_cuda(cudaSetDevice(device_idx)); - thrust::device_vector cuts_d(hmat.cut); - thrust::device_vector cut_row_ptrs_d(hmat.row_ptr); - - // allocate compressed bin data - int num_symbols = n_bins + 1; - size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, - num_symbols); - - CHECK(!(param.max_leaves == 0 && param.max_depth == 0)) - << "Max leaves and max depth cannot both be unconstrained for " - "gpu_hist."; - ba.Allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes); - gidx_buffer.Fill(0); - - int nbits = common::detail::SymbolBits(num_symbols); - - // bin and compress entries in batches of rows - size_t gpu_batch_nrows = std::min - (dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)), - static_cast(n_rows)); - - const auto& offset_vec = row_batch.offset.HostVector(); - const auto& data_vec = row_batch.data.HostVector(); - - thrust::device_vector entries_d(gpu_batch_nrows * row_stride); - size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows); - - for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { - size_t batch_row_begin = gpu_batch * gpu_batch_nrows; - size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows; - if (batch_row_end > n_rows) { - batch_row_end = n_rows; - } - size_t batch_nrows = batch_row_end - batch_row_begin; - size_t n_entries = - offset_vec[row_begin_idx + batch_row_end] - - offset_vec[row_begin_idx + batch_row_begin]; - dh::safe_cuda - (cudaMemcpy - (entries_d.data().get(), - data_vec.data() + 
offset_vec[row_begin_idx + batch_row_begin], - n_entries * sizeof(Entry), cudaMemcpyDefault)); - dim3 block3(32, 8, 1); - dim3 grid3(dh::DivRoundUp(n_rows, block3.x), - dh::DivRoundUp(row_stride, block3.y), 1); - compress_bin_ellpack_k<<>> - (common::CompressedBufferWriter(num_symbols), gidx_buffer.Data(), - row_ptrs.data().get() + batch_row_begin, - entries_d.data().get(), cuts_d.data().get(), cut_row_ptrs_d.data().get(), - batch_row_begin, batch_nrows, - offset_vec[row_begin_idx + batch_row_begin], - row_stride, null_gidx_value); - - dh::safe_cuda(cudaGetLastError()); - dh::safe_cuda(cudaDeviceSynchronize()); - } - - // free the memory that is no longer needed - row_ptrs.resize(0); - row_ptrs.shrink_to_fit(); - entries_d.resize(0); - entries_d.shrink_to_fit(); - - gidx = common::CompressedIterator(gidx_buffer.Data(), num_symbols); - - // allocate the rest - int max_nodes = - param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth); - ba.Allocate(device_idx, param.silent, - &gpair, n_rows, &ridx, n_rows, &position, n_rows, - &prediction_cache, n_rows, &node_sum_gradients_d, max_nodes, - &feature_segments, hmat.row_ptr.size(), &gidx_fvalue_map, - hmat.cut.size(), &min_fvalue, hmat.min_val.size(), - &monotone_constraints, param.monotone_constraints.size()); - gidx_fvalue_map = hmat.cut; - min_fvalue = hmat.min_val; - feature_segments = hmat.row_ptr; - monotone_constraints = param.monotone_constraints; - - node_sum_gradients.resize(max_nodes); - ridx_segments.resize(max_nodes); - - // check if we can use shared memory for building histograms - // (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding) - auto histogram_size = sizeof(GradientPairSumT) * null_gidx_value; - auto max_smem = dh::MaxSharedMemory(device_idx); - can_use_smem_atomics = histogram_size <= max_smem; - - // Init histogram - hist.Init(device_idx, hmat.row_ptr.back()); - - dh::safe_cuda(cudaMallocHost(&tmp_pinned, sizeof(int64_t))); - } + void CreateHistIndices(const SparsePage& row_batch); ~DeviceShard() { for (auto& stream : streams) { @@ -550,59 +551,9 @@ struct DeviceShard { hist.Reset(); } - void BuildHistUsingGlobalMem(int nidx) { - auto segment = ridx_segments[nidx]; - auto d_node_hist = hist.GetHistPtr(nidx); - auto d_gidx = gidx; - auto d_ridx = ridx.Current(); - auto d_gpair = gpair.Data(); - auto row_stride = this->row_stride; - auto null_gidx_value = this->null_gidx_value; - auto n_elements = segment.Size() * row_stride; - - dh::LaunchN(device_idx, n_elements, [=] __device__(size_t idx) { - int ridx = d_ridx[(idx / row_stride) + segment.begin]; - int gidx = d_gidx[ridx * row_stride + idx % row_stride]; - - if (gidx != null_gidx_value) { - AtomicAddGpair(d_node_hist + gidx, d_gpair[ridx]); - } - }); - } - - void BuildHistUsingSharedMem(int nidx) { - auto segment = ridx_segments[nidx]; - auto segment_begin = segment.begin; - auto d_node_hist = hist.GetHistPtr(nidx); - auto d_gidx = gidx; - auto d_ridx = ridx.Current(); - auto d_gpair = gpair.Data(); - auto row_stride = this->row_stride; - auto null_gidx_value = this->null_gidx_value; - auto n_elements = segment.Size() * row_stride; - - const size_t smem_size = sizeof(GradientPairSumT) * null_gidx_value; - const int items_per_thread = 8; - const int block_threads = 256; - const int grid_size = - static_cast(dh::DivRoundUp(n_elements, - items_per_thread * block_threads)); - if (grid_size <= 0) { - return; - } - dh::safe_cuda(cudaSetDevice(device_idx)); - sharedMemHistKernel<<>> - (row_stride, d_ridx, d_gidx, null_gidx_value, 
d_node_hist, d_gpair, - segment_begin, n_elements); - } - void BuildHist(int nidx) { hist.AllocateHistogram(nidx); - if (can_use_smem_atomics) { - BuildHistUsingSharedMem(nidx); - } else { - BuildHistUsingGlobalMem(nidx); - } + hist_builder->Build(this, nidx); } void SubtractionTrick(int nidx_parent, int nidx_histogram, @@ -625,6 +576,7 @@ struct DeviceShard { hist.HistogramExists(nidx_parent); } + /*! \brief Count how many rows are assigned to left node. */ __device__ void CountLeft(int64_t* d_count, int val, int left_nidx) { unsigned ballot = __ballot(val == left_nidx); if (threadIdx.x % 32 == 0) { @@ -634,31 +586,35 @@ struct DeviceShard { } void UpdatePosition(int nidx, int left_nidx, int right_nidx, int fidx, - int split_gidx, bool default_dir_left, bool is_dense, - int fidx_begin, int fidx_end) { + int64_t split_gidx, bool default_dir_left, bool is_dense, + int fidx_begin, // cut.row_ptr[fidx] + int fidx_end) { // cut.row_ptr[fidx + 1] dh::safe_cuda(cudaSetDevice(device_idx)); temp_memory.LazyAllocate(sizeof(int64_t)); - auto d_left_count = temp_memory.Pointer(); + int64_t* d_left_count = temp_memory.Pointer(); dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t))); - auto segment = ridx_segments[nidx]; - auto d_ridx = ridx.Current(); - auto d_position = position.Current(); - auto d_gidx = gidx; - auto row_stride = this->row_stride; + Segment segment = ridx_segments[nidx]; + bst_uint* d_ridx = ridx.Current(); + int* d_position = position.Current(); + common::CompressedIterator d_gidx = gidx; + size_t row_stride = this->row_stride; + // Launch 1 thread for each row dh::LaunchN<1, 512>( device_idx, segment.Size(), [=] __device__(bst_uint idx) { idx += segment.begin; - auto ridx = d_ridx[idx]; + bst_uint ridx = d_ridx[idx]; auto row_begin = row_stride * ridx; auto row_end = row_begin + row_stride; auto gidx = -1; if (is_dense) { + // FIXME: Maybe just search the cuts again. gidx = d_gidx[row_begin + fidx]; } else { gidx = BinarySearchRow(row_begin, row_end, d_gidx, fidx_begin, fidx_end); } + // belong to left node or right node. int position; if (gidx >= 0) { // Feature is found @@ -671,19 +627,18 @@ struct DeviceShard { CountLeft(d_left_count, position, left_nidx); d_position[idx] = position; }); - dh::safe_cuda(cudaMemcpy(tmp_pinned, d_left_count, sizeof(int64_t), cudaMemcpyDeviceToHost)); auto left_count = *tmp_pinned; - SortPosition(segment, left_nidx, right_nidx); - // dh::safe_cuda(cudaStreamSynchronize(stream)); + ridx_segments[left_nidx] = Segment(segment.begin, segment.begin + left_count); ridx_segments[right_nidx] = Segment(segment.begin + left_count, segment.end); } + /*! \brief Sort row indices according to position. 
*/ void SortPosition(const Segment& segment, int left_nidx, int right_nidx) { int min_bits = 0; int max_bits = static_cast( @@ -691,9 +646,10 @@ struct DeviceShard { size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs( - nullptr, temp_storage_bytes, position.Current() + segment.begin, - position.other() + segment.begin, ridx.Current() + segment.begin, - ridx.other() + segment.begin, segment.Size(), min_bits, max_bits); + nullptr, temp_storage_bytes, + position.Current() + segment.begin, position.other() + segment.begin, + ridx.Current() + segment.begin, ridx.other() + segment.begin, + segment.Size(), min_bits, max_bits); temp_memory.LazyAllocate(temp_storage_bytes); @@ -702,9 +658,11 @@ struct DeviceShard { position.Current() + segment.begin, position.other() + segment.begin, ridx.Current() + segment.begin, ridx.other() + segment.begin, segment.Size(), min_bits, max_bits); + // Copy back key dh::safe_cuda(cudaMemcpy( position.Current() + segment.begin, position.other() + segment.begin, segment.Size() * sizeof(int), cudaMemcpyDeviceToDevice)); + // Copy back value dh::safe_cuda(cudaMemcpy( ridx.Current() + segment.begin, ridx.other() + segment.begin, segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice)); @@ -744,20 +702,182 @@ struct DeviceShard { } }; +struct SharedMemHistBuilder : public GPUHistBuilderBase { + void Build(DeviceShard* shard, int nidx) override { + auto segment = shard->ridx_segments[nidx]; + auto segment_begin = segment.begin; + auto d_node_hist = shard->hist.GetHistPtr(nidx); + auto d_gidx = shard->gidx; + auto d_ridx = shard->ridx.Current(); + auto d_gpair = shard->gpair.Data(); + + int null_gidx_value = shard->null_gidx_value; + auto n_elements = segment.Size() * shard->row_stride; + + const size_t smem_size = sizeof(GradientPairSumT) * shard->null_gidx_value; + const int items_per_thread = 8; + const int block_threads = 256; + const int grid_size = + static_cast(dh::DivRoundUp(n_elements, + items_per_thread * block_threads)); + if (grid_size <= 0) { + return; + } + dh::safe_cuda(cudaSetDevice(shard->device_idx)); + sharedMemHistKernel<<>> + (shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist, d_gpair, + segment_begin, n_elements); + } +}; + +struct GlobalMemHistBuilder : public GPUHistBuilderBase { + void Build(DeviceShard* shard, int nidx) override { + Segment segment = shard->ridx_segments[nidx]; + GradientPairSumT* d_node_hist = shard->hist.GetHistPtr(nidx); + common::CompressedIterator d_gidx = shard->gidx; + bst_uint* d_ridx = shard->ridx.Current(); + GradientPair* d_gpair = shard->gpair.Data(); + + size_t const n_elements = segment.Size() * shard->row_stride; + size_t const row_stride = shard->row_stride; + int const null_gidx_value = shard->null_gidx_value; + + dh::LaunchN(shard->device_idx, n_elements, [=] __device__(size_t idx) { + int ridx = d_ridx[(idx / row_stride) + segment.begin]; + // lookup the index (bin) of histogram. + int gidx = d_gidx[ridx * row_stride + idx % row_stride]; + + if (gidx != null_gidx_value) { + AtomicAddGpair(d_node_hist + gidx, d_gpair[ridx]); + } + }); + } +}; + +inline void DeviceShard::InitCompressedData( + const common::HistCutMatrix& hmat, const SparsePage& row_batch) { + n_bins = hmat.row_ptr.back(); + null_gidx_value = hmat.row_ptr.back(); + + int max_nodes = + param.max_leaves > 0 ? 
param.max_leaves * 2 : MaxNodesDepth(param.max_depth); + + ba.Allocate(device_idx, param.silent, + &gpair, n_rows, + &ridx, n_rows, + &position, n_rows, + &prediction_cache, n_rows, + &node_sum_gradients_d, max_nodes, + &cut_.feature_segments, hmat.row_ptr.size(), + &cut_.gidx_fvalue_map, hmat.cut.size(), + &cut_.min_fvalue, hmat.min_val.size(), + &monotone_constraints, param.monotone_constraints.size()); + cut_.gidx_fvalue_map = hmat.cut; + cut_.min_fvalue = hmat.min_val; + cut_.feature_segments = hmat.row_ptr; + monotone_constraints = param.monotone_constraints; + + node_sum_gradients.resize(max_nodes); + ridx_segments.resize(max_nodes); + + dh::safe_cuda(cudaSetDevice(device_idx)); + + // allocate compressed bin data + int num_symbols = n_bins + 1; + // Required buffer size for storing data matrix in ELLPack format. + size_t compressed_size_bytes = + common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, + num_symbols); + + CHECK(!(param.max_leaves == 0 && param.max_depth == 0)) + << "Max leaves and max depth cannot both be unconstrained for " + "gpu_hist."; + ba.Allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes); + gidx_buffer.Fill(0); + + int nbits = common::detail::SymbolBits(num_symbols); + + CreateHistIndices(row_batch); + + gidx = common::CompressedIterator(gidx_buffer.Data(), num_symbols); + + // check if we can use shared memory for building histograms + // (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding) + auto histogram_size = sizeof(GradientPairSumT) * null_gidx_value; + auto max_smem = dh::MaxSharedMemory(device_idx); + if (histogram_size <= max_smem) { + hist_builder.reset(new SharedMemHistBuilder); + } else { + hist_builder.reset(new GlobalMemHistBuilder); + } + + // Init histogram + hist.Init(device_idx, hmat.row_ptr.back()); + + dh::safe_cuda(cudaMallocHost(&tmp_pinned, sizeof(int64_t))); +} + +inline void DeviceShard::CreateHistIndices(const SparsePage& row_batch) { + int num_symbols = n_bins + 1; + // bin and compress entries in batches of rows + size_t gpu_batch_nrows = std::min + (dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)), + static_cast(n_rows)); + const std::vector& data_vec = row_batch.data.HostVector(); + + thrust::device_vector entries_d(gpu_batch_nrows * row_stride); + size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows); + + for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { + size_t batch_row_begin = gpu_batch * gpu_batch_nrows; + size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows; + if (batch_row_end > n_rows) { + batch_row_end = n_rows; + } + size_t batch_nrows = batch_row_end - batch_row_begin; + // number of entries in this batch. + size_t n_entries = row_ptrs[batch_row_end] - row_ptrs[batch_row_begin]; + // copy data entries to device. 
+ dh::safe_cuda + (cudaMemcpy + (entries_d.data().get(), data_vec.data() + row_ptrs[batch_row_begin], + n_entries * sizeof(Entry), cudaMemcpyDefault)); + const dim3 block3(32, 8, 1); // 256 threads + const dim3 grid3(dh::DivRoundUp(n_rows, block3.x), + dh::DivRoundUp(row_stride, block3.y), 1); + compress_bin_ellpack_k<<>> + (common::CompressedBufferWriter(num_symbols), + gidx_buffer.Data(), + row_ptrs.data().get() + batch_row_begin, + entries_d.data().get(), + cut_.gidx_fvalue_map.Data(), cut_.feature_segments.Data(), + batch_row_begin, batch_nrows, + row_ptrs[batch_row_begin], + row_stride, null_gidx_value); + + dh::safe_cuda(cudaGetLastError()); + dh::safe_cuda(cudaDeviceSynchronize()); + } + + // free the memory that is no longer needed + row_ptrs.resize(0); + row_ptrs.shrink_to_fit(); + entries_d.resize(0); + entries_d.shrink_to_fit(); +} + class GPUHistMaker : public TreeUpdater { public: struct ExpandEntry; GPUHistMaker() : initialised_(false), p_last_fmat_(nullptr) {} - void Init( const std::vector>& args) override { param_.InitAllowUnknown(args); CHECK(param_.n_gpus != 0) << "Must have at least one device"; n_devices_ = param_.n_gpus; - dist_ = - GPUDistribution::Block(GPUSet::All(param_.n_gpus) - .Normalised(param_.gpu_id)); + dist_ = GPUDistribution::Block(GPUSet::All(param_.n_gpus) + .Normalised(param_.gpu_id)); dh::CheckComputeCapability(); @@ -817,6 +937,7 @@ class GPUHistMaker : public TreeUpdater { shard->InitRowPtrs(batch); }); + // Find the cuts. monitor_.Start("Quantiles", dist_.Devices()); common::DeviceSketch(batch, *info_, param_, &hmat_); n_bins_ = hmat_.row_ptr.back(); @@ -834,8 +955,7 @@ class GPUHistMaker : public TreeUpdater { initialised_ = true; } - void InitData(HostDeviceVector* gpair, DMatrix* dmat, - const RegTree& tree) { + void InitData(HostDeviceVector* gpair, DMatrix* dmat) { monitor_.Start("InitDataOnce", dist_.Devices()); if (!initialised_) { this->InitDataOnce(dmat); @@ -920,14 +1040,18 @@ class GPUHistMaker : public TreeUpdater { // Returns best loss std::vector EvaluateSplits( const std::vector& nidx_set, RegTree* p_tree) { - auto columns = info_->num_col_; + size_t const columns = info_->num_col_; std::vector best_splits(nidx_set.size()); + // Every feature is a candidate + size_t const candidates_size_bytes = + nidx_set.size() * columns * sizeof(DeviceSplitCandidate); + // Storage for all candidates from all nodes. std::vector candidate_splits(nidx_set.size() * columns); + // FIXME: Multi-gpu support? 
// Use first device auto& shard = shards_.front(); dh::safe_cuda(cudaSetDevice(shard->device_idx)); - shard->temp_memory.LazyAllocate(sizeof(DeviceSplitCandidate) * columns * - nidx_set.size()); + shard->temp_memory.LazyAllocate(candidates_size_bytes); auto d_split = shard->temp_memory.Pointer(); auto& streams = shard->GetStreams(static_cast(nidx_set.size())); @@ -936,32 +1060,38 @@ class GPUHistMaker : public TreeUpdater { for (auto i = 0; i < nidx_set.size(); i++) { auto nidx = nidx_set[i]; DeviceNodeStats node(shard->node_sum_gradients[nidx], nidx, param_); - auto depth = p_tree->GetDepth(nidx); + int depth = p_tree->GetDepth(nidx); - auto& feature_set = column_sampler_.GetFeatureSet(depth); - feature_set.Reshard(GPUSet(shard->device_idx, 1)); - - const int BLOCK_THREADS = 256; - evaluate_split_kernel + HostDeviceVector& feature_set = column_sampler_.GetFeatureSet(depth); + feature_set.Reshard(GPUSet::Range(shard->device_idx, 1)); + auto& h_feature_set = feature_set.HostVector(); + // One block for each feature + int constexpr BLOCK_THREADS = 256; + EvaluateSplitKernel <<>>( - shard->hist.GetHistPtr(nidx), nidx, info_->num_col_, - feature_set.DevicePointer(shard->device_idx), node, - shard->feature_segments.Data(), shard->min_fvalue.Data(), - shard->gidx_fvalue_map.Data(), GPUTrainingParam(param_), - d_split + i * columns, node_value_constraints_[nidx], + shard->hist.GetHistPtr(nidx), + info_->num_col_, + feature_set.DevicePointer(shard->device_idx), + node, + shard->cut_.feature_segments.Data(), + shard->cut_.min_fvalue.Data(), + shard->cut_.gidx_fvalue_map.Data(), + GPUTrainingParam(param_), + d_split + i * columns, // split candidate for i^th node. + node_value_constraints_[nidx], shard->monotone_constraints.Data()); } dh::safe_cuda(cudaDeviceSynchronize()); - dh::safe_cuda - (cudaMemcpy(candidate_splits.data(), shard->temp_memory.d_temp_storage, - sizeof(DeviceSplitCandidate) * columns * nidx_set.size(), - cudaMemcpyDeviceToHost)); + dh::safe_cuda( + cudaMemcpy(candidate_splits.data(), shard->temp_memory.d_temp_storage, + candidates_size_bytes, cudaMemcpyDeviceToHost)); for (auto i = 0; i < nidx_set.size(); i++) { auto depth = p_tree->GetDepth(nidx_set[i]); DeviceSplitCandidate nidx_best; for (auto fidx : column_sampler_.GetFeatureSet(depth).HostVector()) { - auto& candidate = candidate_splits[i * columns + fidx]; + DeviceSplitCandidate& candidate = + candidate_splits[i * columns + fidx]; nidx_best.Update(candidate, param_); } best_splits[i] = nidx_best; @@ -970,7 +1100,7 @@ class GPUHistMaker : public TreeUpdater { } void InitRoot(RegTree* p_tree) { - auto root_nidx = 0; + constexpr int root_nidx = 0; // Sum gradients std::vector tmp_sums(shards_.size()); @@ -980,7 +1110,7 @@ class GPUHistMaker : public TreeUpdater { dh::SumReduction(shard->temp_memory, shard->gpair.Data(), shard->gpair.Size()); }); - auto sum_gradient = + GradientPair sum_gradient = std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair()); // Generate root histogram @@ -1011,29 +1141,29 @@ class GPUHistMaker : public TreeUpdater { } void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) { - auto nidx = candidate.nid; - auto left_nidx = (*p_tree)[nidx].LeftChild(); - auto right_nidx = (*p_tree)[nidx].RightChild(); + int nidx = candidate.nid; + int left_nidx = (*p_tree)[nidx].LeftChild(); + int right_nidx = (*p_tree)[nidx].RightChild(); // convert floating-point split_pt into corresponding bin_id // split_cond = -1 indicates that split_pt is less than all known cut points - auto split_gidx = -1; 
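// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch) of the per-row decision made in
// DeviceShard::UpdatePosition once split_gidx is known.  It assumes rows whose
// bin index is <= split_gidx go left, so split_gidx == -1 sends every row that
// has the feature to the right and only missing values follow the default
// direction; `RoutePosition` is a hypothetical name.
#include <cstdint>

// gidx: the row's bin for the split feature, or -1 if the feature is missing.
inline int RoutePosition(int gidx, int64_t split_gidx, bool default_dir_left,
                         int left_nidx, int right_nidx) {
  if (gidx >= 0) {  // feature present in this row
    return gidx <= split_gidx ? left_nidx : right_nidx;
  }
  // Feature missing: follow the default direction learned for this split.
  return default_dir_left ? left_nidx : right_nidx;
}
// ---------------------------------------------------------------------------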
- auto fidx = candidate.split.findex; - auto default_dir_left = candidate.split.dir == kLeftDir; - auto fidx_begin = hmat_.row_ptr[fidx]; - auto fidx_end = hmat_.row_ptr[fidx + 1]; + int64_t split_gidx = -1; + int64_t fidx = candidate.split.findex; + bool default_dir_left = candidate.split.dir == kLeftDir; + uint32_t fidx_begin = hmat_.row_ptr[fidx]; + uint32_t fidx_end = hmat_.row_ptr[fidx + 1]; + // split_gidx = i where i is the i^th bin containing split value. for (auto i = fidx_begin; i < fidx_end; ++i) { if (candidate.split.fvalue == hmat_.cut[i]) { - split_gidx = static_cast(i); + split_gidx = static_cast(i); } } - auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_; dh::ExecuteShards(&shards_, [&](std::unique_ptr& shard) { - shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx, - split_gidx, default_dir_left, - is_dense, fidx_begin, fidx_end); + shard->UpdatePosition(nidx, left_nidx, right_nidx, fidx, + split_gidx, default_dir_left, + is_dense, fidx_begin, fidx_end); }); } @@ -1043,7 +1173,7 @@ class GPUHistMaker : public TreeUpdater { tree.AddChilds(candidate.nid); auto& parent = tree[candidate.nid]; parent.SetSplit(candidate.split.findex, candidate.split.fvalue, - candidate.split.dir == kLeftDir); + candidate.split.dir == kLeftDir); tree.Stat(candidate.nid).loss_chg = candidate.split.loss_chg; // Set up child constraints @@ -1070,6 +1200,7 @@ class GPUHistMaker : public TreeUpdater { tree[parent.RightChild()].SetLeaf(right_weight * param_.learning_rate, 0); tree.Stat(parent.RightChild()).base_weight = right_weight; tree.Stat(parent.RightChild()).sum_hess = candidate.split.right_sum.GetHess(); + // Store sum gradients for (auto& shard : shards_) { shard->node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum; @@ -1083,7 +1214,7 @@ class GPUHistMaker : public TreeUpdater { auto& tree = *p_tree; monitor_.Start("InitData", dist_.Devices()); - this->InitData(gpair, p_fmat, *p_tree); + this->InitData(gpair, p_fmat); monitor_.Stop("InitData", dist_.Devices()); monitor_.Start("InitRoot", dist_.Devices()); this->InitRoot(p_tree); @@ -1093,17 +1224,17 @@ class GPUHistMaker : public TreeUpdater { auto num_leaves = 1; while (!qexpand_->empty()) { - auto candidate = qexpand_->top(); + ExpandEntry candidate = qexpand_->top(); qexpand_->pop(); if (!candidate.IsValid(param_, num_leaves)) continue; - // std::cout << candidate; + monitor_.Start("ApplySplit", dist_.Devices()); this->ApplySplit(candidate, p_tree); monitor_.Stop("ApplySplit", dist_.Devices()); num_leaves++; - auto left_child_nidx = tree[candidate.nid].LeftChild(); - auto right_child_nidx = tree[candidate.nid].RightChild(); + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); // Only create child entries if needed if (ExpandEntry::ChildIsValid(param_, tree.GetDepth(left_child_nidx), @@ -1199,9 +1330,8 @@ class GPUHistMaker : public TreeUpdater { std::vector> shards_; common::ColumnSampler column_sampler_; - typedef std::priority_queue, - std::function> - ExpandQueue; + using ExpandQueue = std::priority_queue, + std::function>; std::unique_ptr qexpand_; common::Monitor monitor_; dh::AllReducer reducer_; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index b8c897eac..ba146af42 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -1,3 +1,6 @@ +/*! 
+ * Copyright 2016-2018 XGBoost contributors + */ #include "./helpers.h" #include "xgboost/c_api.h" #include @@ -14,7 +17,7 @@ std::string TempFileName() { bool FileExists(const std::string name) { struct stat st; - return stat(name.c_str(), &st) == 0; + return stat(name.c_str(), &st) == 0; } long GetFileSize(const std::string filename) { @@ -106,17 +109,42 @@ xgboost::bst_float GetMetricEval(xgboost::Metric * metric, return metric->Eval(preds, info, false); } +namespace xgboost { +bool IsNear(std::vector::const_iterator _beg1, + std::vector::const_iterator _end1, + std::vector::const_iterator _beg2) { + for (auto iter1 = _beg1, iter2 = _beg2; iter1 != _end1; ++iter1, ++iter2) { + if (std::abs(*iter1 - *iter2) > xgboost::kRtEps){ + return false; + } + } + return true; +} + +SimpleLCG::StateType SimpleLCG::operator()() { + state_ = (alpha_ * state_) % mod_; + return state_; +} +SimpleLCG::StateType SimpleLCG::Min() const { + return seed_ * alpha_; +} +SimpleLCG::StateType SimpleLCG::Max() const { + return max_value_; +} + std::shared_ptr* CreateDMatrix(int rows, int columns, float sparsity, int seed) { const float missing_value = -1; std::vector test_data(rows * columns); - std::mt19937 gen(seed); - std::uniform_real_distribution dis(0.0f, 1.0f); + + xgboost::SimpleLCG gen(seed); + SimpleRealUniformDistribution dis(0.0f, 1.0f); + for (auto &e : test_data) { - if (dis(gen) < sparsity) { + if (dis(&gen) < sparsity) { e = missing_value; } else { - e = dis(gen); + e = dis(&gen); } } @@ -126,16 +154,4 @@ std::shared_ptr* CreateDMatrix(int rows, int columns, return static_cast *>(handle); } -namespace xgboost { -bool IsNear(std::vector::const_iterator _beg1, - std::vector::const_iterator _end1, - std::vector::const_iterator _beg2) { - for (auto iter1 = _beg1, iter2 = _beg2; iter1 != _end1; ++iter1, ++iter2) { - if (std::abs(*iter1 - *iter2) > xgboost::kRtEps){ - return false; - } - } - return true; -} -} - +} // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index bdc0e81f9..507bd3aa3 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -1,3 +1,6 @@ +/*! + * Copyright 2016-2018 XGBoost contributors + */ #ifndef XGBOOST_TESTS_CPP_HELPERS_H_ #define XGBOOST_TESTS_CPP_HELPERS_H_ @@ -56,7 +59,86 @@ namespace xgboost { bool IsNear(std::vector::const_iterator _beg1, std::vector::const_iterator _end1, std::vector::const_iterator _beg2); -} + +/*! + * \brief Linear congruential generator. + * + * The distribution defined in std is not portable. Given the same seed, it + * migth produce different outputs on different platforms or with different + * compilers. The SimpleLCG implemented here is to make sure all tests are + * reproducible. + */ +class SimpleLCG { + private: + using StateType = int64_t; + static StateType constexpr default_init_ = 3; + static StateType constexpr default_alpha_ = 61; + static StateType constexpr max_value_ = ((StateType)1 << 32) - 1; + + StateType state_; + StateType const alpha_; + StateType const mod_; + + StateType const seed_; + + public: + SimpleLCG() : state_{default_init_}, + alpha_{default_alpha_}, mod_{max_value_}, seed_{state_}{} + /*! + * \brief Initialize SimpleLCG. + * + * \param state Initial state, can also be considered as seed. If set to + * zero, SimpleLCG will use internal default value. + * \param alpha multiplier + * \param mod modulo + */ + SimpleLCG(StateType state, + StateType alpha=default_alpha_, StateType mod=max_value_) + : state_{state == 0 ? 
default_init_ : state}, + alpha_{alpha}, mod_{mod} , seed_{state} {} + + StateType operator()(); + StateType Min() const; + StateType Max() const; +}; + +template +class SimpleRealUniformDistribution { + private: + ResultT const lower; + ResultT const upper; + + /*! \brief Over-simplified version of std::generate_canonical. */ + template + ResultT GenerateCanonical(GeneratorT* rng) const { + static_assert(std::is_floating_point::value, + "Result type must be floating point."); + long double const r = (static_cast(rng->Max()) + - static_cast(rng->Min())) + 1.0L; + size_t const log2r = std::log(r) / std::log(2.0L); + size_t m = std::max(1UL, (Bits + log2r - 1UL) / log2r); + ResultT sum_value = 0, r_k = 1; + + for (size_t k = m; k != 0; --k) { + sum_value += ResultT((*rng)() - rng->Min()) * r_k; + r_k *= r; + } + + ResultT res = sum_value / r_k; + return res; + } + + public: + SimpleRealUniformDistribution(ResultT l, ResultT u) : + lower{l}, upper{u} {} + + template + ResultT operator()(GeneratorT* rng) const { + ResultT tmp = GenerateCanonical::digits, + GeneratorT>(rng); + return (tmp * (upper - lower)) + lower; + } +}; /** * \fn std::shared_ptr CreateDMatrix(int rows, int columns, float sparsity, int seed); @@ -70,7 +152,8 @@ bool IsNear(std::vector::const_iterator _beg1, * * \return The new d matrix. */ - std::shared_ptr *CreateDMatrix(int rows, int columns, float sparsity, int seed = 0); + +} // namespace xgboost #endif diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index 35db3180e..5bcc0e771 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -7,7 +7,7 @@ typedef std::pair arg; TEST(Linear, shotgun) { typedef std::pair arg; - auto mat = CreateDMatrix(10, 10, 0); + auto mat = xgboost::CreateDMatrix(10, 10, 0); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("shotgun")); updater->Init({{"eta", "1."}}); @@ -26,7 +26,7 @@ TEST(Linear, shotgun) { TEST(Linear, coordinate) { typedef std::pair arg; - auto mat = CreateDMatrix(10, 10, 0); + auto mat = xgboost::CreateDMatrix(10, 10, 0); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("coord_descent")); updater->Init({{"eta", "1."}}); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 429f45e50..40228b8c1 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -1,86 +1,396 @@ - /*! 
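// ---------------------------------------------------------------------------
// Minimal usage sketch (not part of the patch) of the portable RNG defined
// above, mirroring how TestBuildHist below draws random gradients.  The
// <float> template argument and the seed are assumptions; this presumes
// tests/cpp/helpers.h is included.
#include <cstdio>

inline void SimpleLCGUsageExample() {
  xgboost::SimpleLCG gen(1994);  // any non-zero seed gives a fixed stream
  xgboost::SimpleRealUniformDistribution<float> dist(0.0f, 1.0f);
  for (int i = 0; i < 4; ++i) {
    // Prints the same platform-independent sequence on every run.
    std::printf("%f\n", dist(&gen));
  }
}
// ---------------------------------------------------------------------------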
- * Copyright 2017 XGBoost contributors + * Copyright 2017-2018 XGBoost contributors */ + #include #include +#include #include "../helpers.h" #include "gtest/gtest.h" #include "../../../src/data/sparse_page_source.h" #include "../../../src/gbm/gbtree_model.h" #include "../../../src/tree/updater_gpu_hist.cu" +#include "../../../src/tree/updater_gpu_common.cuh" #include "../../../src/common/common.h" namespace xgboost { namespace tree { -TEST(gpu_hist_experimental, TestSparseShard) { - int rows = 100; - int columns = 80; - int max_bins = 4; - auto dmat = CreateDMatrix(rows, columns, 0.9f); - common::GHistIndexMatrix gmat; - gmat.Init((*dmat).get(),max_bins); - TrainParam p; - p.max_depth = 6; +void BuildGidx(DeviceShard* shard, int n_rows, int n_cols, + bst_float sparsity=0) { + auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3); const SparsePage& batch = *(*dmat)->GetRowBatches().begin(); - DeviceShard shard(0, 0, 0, rows, p); - shard.InitRowPtrs(batch); - shard.InitCompressedData(gmat.cut, batch); - ASSERT_LT(shard.row_stride, columns); + common::HistCutMatrix cmat; + cmat.row_ptr = {0, 3, 6, 9, 12, 15, 18, 21, 24}; + cmat.min_val = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.2, 0.2}; + // 24 cut fields, 3 cut fields for each feature (column). + cmat.cut = {0.30, 0.67, 1.64, + 0.32, 0.77, 1.95, + 0.29, 0.70, 1.80, + 0.32, 0.75, 1.85, + 0.18, 0.59, 1.69, + 0.25, 0.74, 2.00, + 0.26, 0.74, 1.98, + 0.26, 0.71, 1.83}; - auto host_gidx_buffer = shard.gidx_buffer.AsVector(); - - common::CompressedIterator gidx(host_gidx_buffer.data(), - gmat.cut.row_ptr.back() + 1); - - for (int i = 0; i < rows; i++) { - int row_offset = 0; - for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) { - ASSERT_EQ(gidx[i * shard.row_stride + row_offset], gmat.index[j]); - row_offset++; - } - - for (; row_offset < shard.row_stride; row_offset++) { - ASSERT_EQ(gidx[i * shard.row_stride + row_offset], shard.null_gidx_value); - } - } + shard->InitRowPtrs(batch); + shard->InitCompressedData(cmat, batch); delete dmat; } -TEST(gpu_hist_experimental, TestDenseShard) { - int rows = 100; - int columns = 80; - int max_bins = 4; - auto dmat = CreateDMatrix(rows, columns, 0); - common::GHistIndexMatrix gmat; - gmat.Init((*dmat).get(),max_bins); - TrainParam p; - p.max_depth = 6; +TEST(GpuHist, BuildGidxDense) { + int const n_rows = 16, n_cols = 8; + TrainParam param; + param.max_depth = 1; + param.n_gpus = 1; + param.max_leaves = 0; - const SparsePage& batch = *(*dmat)->GetRowBatches().begin(); - DeviceShard shard(0, 0, 0, rows, p); - shard.InitRowPtrs(batch); - shard.InitCompressedData(gmat.cut, batch); + DeviceShard shard(0, 0, 0, n_rows, param); + BuildGidx(&shard, n_rows, n_cols); - ASSERT_EQ(shard.row_stride, columns); + std::vector h_gidx_buffer; + h_gidx_buffer = shard.gidx_buffer.AsVector(); + common::CompressedIterator gidx(h_gidx_buffer.data(), 25); - auto host_gidx_buffer = shard.gidx_buffer.AsVector(); + ASSERT_EQ(shard.row_stride, n_cols); - common::CompressedIterator gidx(host_gidx_buffer.data(), - gmat.cut.row_ptr.back() + 1); - - for (int i = 0; i < gmat.index.size(); i++) { - ASSERT_EQ(gidx[i], gmat.index[i]); + std::vector solution = { + 0, 3, 8, 9, 14, 17, 20, 21, + 0, 4, 7, 10, 14, 16, 19, 22, + 1, 3, 7, 11, 14, 15, 19, 21, + 2, 3, 7, 9, 13, 16, 20, 22, + 2, 3, 6, 9, 12, 16, 20, 21, + 1, 5, 6, 10, 13, 16, 20, 21, + 2, 5, 8, 9, 13, 17, 19, 22, + 2, 4, 6, 10, 14, 17, 19, 21, + 2, 5, 7, 9, 13, 16, 19, 22, + 0, 3, 8, 10, 12, 16, 19, 22, + 1, 3, 7, 10, 13, 16, 19, 21, + 1, 3, 8, 10, 13, 17, 20, 22, + 2, 4, 6, 9, 14, 
15, 19, 22, + 1, 4, 6, 9, 13, 16, 19, 21, + 2, 4, 8, 10, 14, 15, 19, 22, + 1, 4, 7, 10, 14, 16, 19, 21, + }; + for (size_t i = 0; i < n_rows * n_cols; ++i) { + ASSERT_EQ(solution[i], gidx[i]); } - - delete dmat; } -TEST(gpu_hist_experimental, MGPU_mock) { +TEST(GpuHist, BuildGidxSparse) { + int const n_rows = 16, n_cols = 8; + TrainParam param; + param.max_depth = 1; + param.n_gpus = 1; + param.max_leaves = 0; + + DeviceShard shard(0, 0, 0, n_rows, param); + BuildGidx(&shard, n_rows, n_cols, 0.9f); + + std::vector h_gidx_buffer; + h_gidx_buffer = shard.gidx_buffer.AsVector(); + common::CompressedIterator gidx(h_gidx_buffer.data(), 25); + + ASSERT_LE(shard.row_stride, 3); + + // row_stride = 3, 16 rows, 48 entries for ELLPack + std::vector solution = { + 15, 24, 24, 0, 24, 24, 24, 24, 24, 24, 24, 24, 20, 24, 24, 24, + 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, + 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 + }; + for (size_t i = 0; i < n_rows * shard.row_stride; ++i) { + ASSERT_EQ(solution[i], gidx[i]); + } +} + +std::vector GetHostHistGpair() { + // 24 bins, 3 bins for each feature (column). + std::vector hist_gpair = { + {0.8314, 0.7147}, {1.7989, 3.7312}, {3.3846, 3.4598}, + {2.9277, 3.5886}, {1.8429, 2.4152}, {1.2443, 1.9019}, + {1.6380, 2.9174}, {1.5657, 2.5107}, {2.8111, 2.4776}, + {2.1322, 3.0651}, {3.2927, 3.8540}, {0.5899, 0.9866}, + {1.5185, 1.6263}, {2.0686, 3.1844}, {2.4278, 3.0950}, + {1.5105, 2.1403}, {2.6922, 4.2217}, {1.8122, 1.5437}, + {0.0000, 0.0000}, {4.3245, 5.7955}, {1.6903, 2.1103}, + {2.4012, 4.4754}, {3.6136, 3.4303}, {0.0000, 0.0000} + }; + return hist_gpair; +} + +void TestBuildHist(GPUHistBuilderBase& builder) { + int const n_rows = 16, n_cols = 8; + + TrainParam param; + param.max_depth = 6; + param.n_gpus = 1; + param.max_leaves = 0; + + DeviceShard shard(0, 0, 0, n_rows, param); + + BuildGidx(&shard, n_rows, n_cols); + + xgboost::SimpleLCG gen; + xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); + std::vector h_gpair(n_rows); + for (size_t i = 0; i < h_gpair.size(); ++i) { + bst_float grad = dist(&gen); + bst_float hess = dist(&gen); + h_gpair[i] = GradientPair(grad, hess); + } + + thrust::device_vector gpair (n_rows); + gpair = h_gpair; + + int num_symbols = shard.n_bins + 1; + + thrust::host_vector h_gidx_buffer ( + shard.gidx_buffer.Size()); + + common::CompressedByteT* d_gidx_buffer_ptr = shard.gidx_buffer.Data(); + dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr, + sizeof(common::CompressedByteT) * shard.gidx_buffer.Size(), + cudaMemcpyDeviceToHost)); + auto gidx = common::CompressedIterator(h_gidx_buffer.data(), + num_symbols); + + shard.ridx_segments.resize(1); + shard.ridx_segments[0] = Segment(0, n_rows); + shard.hist.AllocateHistogram(0); + shard.gpair.copy(gpair.begin(), gpair.end()); + thrust::sequence(shard.ridx.CurrentDVec().tbegin(), + shard.ridx.CurrentDVec().tend()); + + builder.Build(&shard, 0); + DeviceHistogram d_hist = shard.hist; + + GradientPairSumT* d_histptr {d_hist.GetHistPtr(0)}; + // d_hist.data stored in float, not gradient pair + thrust::host_vector h_result (d_hist.data.size()/2); + size_t data_size = sizeof(GradientPairSumT) / ( + sizeof(GradientPairSumT) / sizeof(GradientPairSumT::ValueT)); + data_size *= d_hist.data.size(); + dh::safe_cuda(cudaMemcpy(h_result.data(), d_histptr, data_size, + cudaMemcpyDeviceToHost)); + + std::vector solution = GetHostHistGpair(); + std::cout << std::fixed; + for (size_t i = 0; i < h_result.size(); ++i) { + 
EXPECT_NEAR(h_result[i].GetGrad(), solution[i].GetGrad(), 0.01f); + EXPECT_NEAR(h_result[i].GetHess(), solution[i].GetHess(), 0.01f); + } +} + +TEST(GpuHist, BuildHistGlobalMem) { + GlobalMemHistBuilder builder; + TestBuildHist(builder); +} + +TEST(GpuHist, BuildHistSharedMem) { + SharedMemHistBuilder builder; + TestBuildHist(builder); +} + +common::HistCutMatrix GetHostCutMatrix () { + common::HistCutMatrix cmat; + cmat.row_ptr = {0, 3, 6, 9, 12, 15, 18, 21, 24}; + cmat.min_val = {0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.2, 0.2}; + // 24 cut fields, 3 cut fields for each feature (column). + // Each row of the cut represents the cuts for a data column. + cmat.cut = {0.30, 0.67, 1.64, + 0.32, 0.77, 1.95, + 0.29, 0.70, 1.80, + 0.32, 0.75, 1.85, + 0.18, 0.59, 1.69, + 0.25, 0.74, 2.00, + 0.26, 0.74, 1.98, + 0.26, 0.71, 1.83}; + return cmat; +} + +// TODO(trivialfis): This test is over simplified. +TEST(GpuHist, EvaluateSplits) { + constexpr int n_rows = 16; + constexpr int n_cols = 8; + + TrainParam param; + param.max_depth = 1; + param.n_gpus = 1; + param.colsample_bylevel = 1; + param.colsample_bytree = 1; + param.min_child_weight = 0.01; + + // Disable all parameters. + param.reg_alpha = 0.0; + param.reg_lambda = 0; + param.max_delta_step = 0.0; + + for (size_t i = 0; i < n_cols; ++i) { + param.monotone_constraints.emplace_back(0); + } + + int max_bins = 4; + + // Initialize DeviceShard + std::unique_ptr shard {new DeviceShard(0, 0, 0, n_rows, param)}; + // Initialize DeviceShard::node_sum_gradients + shard->node_sum_gradients = {{6.4, 12.8}}; + + // Initialize DeviceShard::cut + common::HistCutMatrix cmat = GetHostCutMatrix(); + + // Copy cut matrix to device. + DeviceShard::DeviceHistCutMatrix cut; + shard->ba.Allocate(0, true, + &(shard->cut_.feature_segments), cmat.row_ptr.size(), + &(shard->cut_.min_fvalue), cmat.min_val.size(), + &(shard->cut_.gidx_fvalue_map), 24, + &(shard->monotone_constraints), n_cols); + shard->cut_.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); + shard->cut_.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); + shard->monotone_constraints.copy(param.monotone_constraints.begin(), + param.monotone_constraints.end()); + + // Initialize DeviceShard::hist + shard->hist.Init(0, (max_bins - 1) * n_cols); + shard->hist.AllocateHistogram(0); + // Each row of hist_gpair represents gpairs for one feature. + // Each entry represents a bin. 
+ std::vector hist_gpair = GetHostHistGpair(); + std::vector hist; + for (auto pair : hist_gpair) { + hist.push_back(pair.GetGrad()); + hist.push_back(pair.GetHess()); + } + + ASSERT_EQ(shard->hist.data.size(), hist.size()); + thrust::copy(hist.begin(), hist.end(), + shard->hist.data.begin()); + + + // Initialize GPUHistMaker + GPUHistMaker hist_maker = GPUHistMaker(); + hist_maker.param_ = param; + hist_maker.shards_.push_back(std::move(shard)); + hist_maker.column_sampler_.Init(n_cols, + param.colsample_bylevel, + param.colsample_bytree, + false); + + RegTree tree; + tree.InitModel(); + + MetaInfo info; + info.num_row_ = n_rows; + info.num_col_ = n_cols; + + hist_maker.info_ = &info; + hist_maker.node_value_constraints_.resize(1); + hist_maker.node_value_constraints_[0].lower_bound = -1.0; + hist_maker.node_value_constraints_[0].upper_bound = 1.0; + + std::vector res = + hist_maker.EvaluateSplits({0}, &tree); + + ASSERT_EQ(res.size(), 1); + ASSERT_EQ(res[0].findex, 7); + ASSERT_NEAR(res[0].fvalue, 0.26, xgboost::kRtEps); +} + +TEST(GpuHist, ApplySplit) { + GPUHistMaker hist_maker = GPUHistMaker(); + int constexpr nid = 0; + int constexpr n_rows = 16; + int constexpr n_cols = 8; + + TrainParam param; + param.silent = true; + + // Initialize shard + for (size_t i = 0; i < n_cols; ++i) { + param.monotone_constraints.emplace_back(0); + } + + hist_maker.shards_.resize(1); + hist_maker.shards_[0].reset(new DeviceShard(0, 0, 0, n_rows, param)); + + auto& shard = hist_maker.shards_.at(0); + shard->ridx_segments.resize(3); // 3 nodes. + shard->node_sum_gradients.resize(3); + + shard->ridx_segments[0] = Segment(0, n_rows); + shard->ba.Allocate(0, true, &(shard->ridx), n_rows, + &(shard->position), n_rows); + shard->row_stride = n_cols; + thrust::sequence(shard->ridx.CurrentDVec().tbegin(), + shard->ridx.CurrentDVec().tend()); + dh::safe_cuda(cudaMallocHost(&(shard->tmp_pinned), sizeof(int64_t))); + + // Initialize GPUHistMaker + hist_maker.param_ = param; + RegTree tree; + tree.InitModel(); + + DeviceSplitCandidate candidate; + candidate.Update(2, kLeftDir, + 0.59, 4, // fvalue has to be equal to one of the cut field + GradientPair(8.2, 2.8), GradientPair(6.3, 3.6), + GPUTrainingParam(param)); + GPUHistMaker::ExpandEntry candidate_entry {0, 0, candidate, 0}; + candidate_entry.nid = nid; + + auto const& nodes = tree.GetNodes(); + size_t n_nodes = nodes.size(); + + // Used to get bin_id in update position. 
+ common::HistCutMatrix cmat = GetHostCutMatrix(); + hist_maker.hmat_ = cmat; + + MetaInfo info; + info.num_row_ = n_rows; + info.num_col_ = n_cols; + info.num_nonzero_ = n_rows * n_cols; // Dense + + // Initialize gidx + int n_bins = 24; + int row_stride = n_cols; + int num_symbols = n_bins + 1; + size_t compressed_size_bytes = + common::CompressedBufferWriter::CalculateBufferSize( + row_stride * n_rows, num_symbols); + shard->ba.Allocate(0, param.silent, + &(shard->gidx_buffer), compressed_size_bytes); + + common::CompressedBufferWriter wr(num_symbols); + std::vector h_gidx (n_rows * row_stride); + std::iota(h_gidx.begin(), h_gidx.end(), 0); + std::vector h_gidx_compressed (compressed_size_bytes); + + wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end()); + shard->gidx_buffer.copy(h_gidx_compressed.begin(), h_gidx_compressed.end()); + + shard->gidx = common::CompressedIterator( + shard->gidx_buffer.Data(), num_symbols); + + hist_maker.info_ = &info; + hist_maker.ApplySplit(candidate_entry, &tree); + + ASSERT_FALSE(tree[nid].IsLeaf()); + + int left_nidx = tree[nid].LeftChild(); + int right_nidx = tree[nid].RightChild(); + + ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0); + ASSERT_EQ(shard->ridx_segments[left_nidx].end, 6); + ASSERT_EQ(shard->ridx_segments[right_nidx].begin, 6); + ASSERT_EQ(shard->ridx_segments[right_nidx].end, 16); +} + +TEST(GpuHist, MGPU_mock) { // Attempt to choose multiple GPU devices int ngpu; dh::safe_cuda(cudaGetDeviceCount(&ngpu));