diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3c7c783b..355b060b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 2.6)
+cmake_minimum_required (VERSION 3.5)
 project (xgboost)
 find_package(OpenMP)
diff --git a/plugin/updater_gpu/README.md b/plugin/updater_gpu/README.md
index 526f64d1b..96d425531 100644
--- a/plugin/updater_gpu/README.md
+++ b/plugin/updater_gpu/README.md
@@ -1,5 +1,16 @@
 # CUDA Accelerated Tree Construction Algorithm
+## Benchmarks
+
+Time for 500 boosting iterations in seconds.
+
+Dataset | Instances | Features | i7-6700K | Titan X (pascal) | Speedup
+--- | --- | --- | --- | --- | ---
+Yahoo LTR | 473,134 | 700 | 3738 | 507 | 7.37
+Higgs | 10,500,000 | 28 | 31352 | 4173 | 7.51
+Bosch | 1,183,747 | 968 | 9460 | 1009 | 9.38
+
+
 
 ## Usage
 Specify the updater parameter as 'grow_gpu'.
 
@@ -13,9 +24,11 @@ param['updater'] = 'grow_gpu'
 ## Memory usage
 Device memory usage can be calculated as approximately:
 ```
-bytes = (10 x n_rows) + (44 x n_rows x n_columns x column_density)
+bytes = (10 x n_rows) + (40 x n_rows x n_columns x column_density) + (64 x max_nodes) + (76 x max_nodes_level x n_columns)
 ```
-Data is stored in a sparse format. For example, missing values produced by one hot encoding are not stored. If a one hot encoding separates a categorical variable into 5 columns the column_density of these columns is 1/5 = 0.2.
+The maximum number of nodes needed for a given tree depth d is 2^(d+1) - 1. The maximum number of nodes on any given level is 2^d.
+
+Data is stored in a sparse format. For example, missing values produced by one hot encoding are not stored. If a one hot encoding separates a categorical variable into 5 columns the density of these columns is 1/5 = 0.2.
 
 A 4GB graphics card will process approximately 3.5 million rows of the well known Kaggle higgs dataset.
 
@@ -26,7 +39,7 @@ A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depend
 
 Building the plug-in requires CUDA Toolkit 7.5 or later.
 
-The plugin also depends on CUB 1.5.4 - http://nvlabs.github.io/cub/index.html.
+The plugin also depends on CUB 1.5.2 - https://github.com/NVlabs/cub/tree/1.5.2
 
 CUB is a header only cuda library which provides sort/reduce/scan primitives.
 
@@ -60,5 +73,3 @@ Rory Mitchell
 
 Report any bugs to r.a.mitchell.nz at google mail.
 
-
-
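As a quick illustration of the memory formula introduced in the README hunk above, the sketch below plugs hypothetical dataset dimensions into it; `max_nodes` and `max_nodes_level` follow the 2^(d+1) - 1 and 2^d expressions for a tree of depth d. The numbers are made up for illustration and are not taken from the benchmark table.

```
// Illustration only: rough device-memory estimate using the README formula.
// All dataset dimensions below are hypothetical.
#include <cstdio>

int main() {
  const double n_rows = 1e6;          // hypothetical row count
  const double n_columns = 50;        // hypothetical column count
  const double column_density = 0.2;  // e.g. one-hot encoding over 5 categories
  const int max_depth = 6;

  const double max_nodes = (1 << (max_depth + 1)) - 1;  // 2^(d+1) - 1
  const double max_nodes_level = 1 << max_depth;        // 2^d

  const double bytes = 10 * n_rows +
                       40 * n_rows * n_columns * column_density +
                       64 * max_nodes +
                       76 * max_nodes_level * n_columns;
  std::printf("approx. device memory: %.1f MB\n", bytes / (1 << 20));
  return 0;
}
```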
diff --git a/plugin/updater_gpu/src/device_helpers.cuh b/plugin/updater_gpu/src/device_helpers.cuh
index c895c16e0..3c849efdf 100644
--- a/plugin/updater_gpu/src/device_helpers.cuh
+++ b/plugin/updater_gpu/src/device_helpers.cuh
@@ -170,55 +170,6 @@ struct Timer {
   }
 };
 
-/*
- * Utility functions
- */
-
-template <typename T>
-void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
-  thrust::host_vector<T> h = v;
-  for (int i = 0; i < std::min(max_items, h.size()); i++) {
-    std::cout << " " << h[i];
-  }
-  std::cout << "\n";
-}
-
-template <typename T>
-void print(char *label, const thrust::device_vector<T> &v,
-           const char *format = "%d ", int max = 10) {
-  thrust::host_vector<T> h_v = v;
-
-  std::cout << label << ":\n";
-  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
-    printf(format, h_v[i]);
-  }
-  std::cout << "\n";
-}
-
-template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
-  return static_cast<T1>(ceil(static_cast<double>(a) / b));
-}
-
-template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
-  return thrust::device_pointer_cast(d_ptr);
-}
-
-template <typename T> T *raw(thrust::device_vector<T> &v) {  // NOLINT
-  return raw_pointer_cast(v.data());
-}
-
-template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
-  return sizeof(T) * v.size();
-}
-
-// Threadblock iterates over range, filling with value
-template <typename IterT, typename ValueT>
-__device__ void block_fill(IterT begin, size_t n, ValueT value) {
-  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
-    begin[i] = value;
-  }
-}
-
 /*
  * Range iterator
  */
@@ -282,6 +233,55 @@ template <typename T> __device__ range block_stride_range(T begin, T end) {
   return r;
 }
 
+/*
+ * Utility functions
+ */
+
+template <typename T>
+void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
+  thrust::host_vector<T> h = v;
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+
+template <typename T>
+void print(char *label, const thrust::device_vector<T> &v,
+           const char *format = "%d ", int max = 10) {
+  thrust::host_vector<T> h_v = v;
+
+  std::cout << label << ":\n";
+  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
+    printf(format, h_v[i]);
+  }
+  std::cout << "\n";
+}
+
+template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
+  return static_cast<T1>(ceil(static_cast<double>(a) / b));
+}
+
+template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
+  return thrust::device_pointer_cast(d_ptr);
+}
+
+template <typename T> T *raw(thrust::device_vector<T> &v) {  // NOLINT
+  return raw_pointer_cast(v.data());
+}
+
+template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
+  return sizeof(T) * v.size();
+}
+
+// Threadblock iterates over range, filling with value
+template <typename IterT, typename ValueT>
+__device__ void block_fill(IterT begin, size_t n, ValueT value) {
+  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
+    begin[i] = value;
+  }
+}
+
 /*
  * Memory
  */
@@ -414,6 +414,7 @@ class bulk_allocator {
     }
 
     _size = get_size_bytes(args...);
+    std::cout << "trying to allocate: " << _size << "\n";
 
     safe_cuda(cudaMalloc(&d_ptr, _size));
diff --git a/plugin/updater_gpu/src/gpu_builder.cu b/plugin/updater_gpu/src/gpu_builder.cu
index 4279b35e8..e198c8e40 100644
--- a/plugin/updater_gpu/src/gpu_builder.cu
+++ b/plugin/updater_gpu/src/gpu_builder.cu
@@ -68,9 +68,12 @@ struct GPUData {
     // Calculate memory for sort
     size_t cub_mem_size = 0;
+    cub::DoubleBuffer db_key;
+    cub::DoubleBuffer db_value;
+
     cub::DeviceSegmentedRadixSort::SortPairs(
-        cub_mem.data(), cub_mem_size, cub::DoubleBuffer(),
-        cub::DoubleBuffer(), in_fvalues.size(), n_features,
+        cub_mem.data(), cub_mem_size, db_key,
+        db_value, in_fvalues.size(), n_features,
         foffsets.data(),
         foffsets.data() + 1);
 
     // Allocate memory
@@ -304,7 +307,11 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
 
 float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
   float subsample = 1.0;
-  size_t required = 10 * info.num_row + 44 * info.num_nonzero;
+  uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
+  uint32_t max_nodes_level = 1 << param.max_depth;
+  size_t required = 10 * info.num_row + 40 * info.num_nonzero +
+                    64 * max_nodes + 76 * max_nodes_level * info.num_col;
+  std::cout << "required: " << required << "\n";
   size_t available = dh::available_memory();
   while (available < required) {
     subsample -= 0.05;
diff --git a/plugin/updater_gpu/src/gpu_builder.cuh b/plugin/updater_gpu/src/gpu_builder.cuh
index fdb308285..61ccdbbcf 100644
--- a/plugin/updater_gpu/src/gpu_builder.cuh
+++ b/plugin/updater_gpu/src/gpu_builder.cuh
@@ -36,7 +36,7 @@ class GPUBuilder {
   GPUData *gpu_data;
 
   int multiscan_levels =
-      0;  // Number of levels before switching to sorting algorithm
+      5;  // Number of levels before switching to sorting algorithm
 };
 }  // namespace tree
 }  // namespace xgboost
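The device_helpers.cuh hunks above only relocate the utility functions, so a short usage sketch may help: `div_round_up()` sizes a launch grid and `raw()` passes a `thrust::device_vector` to a kernel. The two helpers are copied from the diff; `fill_kernel` and `main()` are hypothetical stand-ins, not plugin code.

```
// Illustrative usage of the helpers moved in device_helpers.cuh.
#include <thrust/device_vector.h>
#include <cmath>
#include <cstdio>

template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
  return static_cast<T1>(ceil(static_cast<double>(a) / b));
}

template <typename T> T *raw(thrust::device_vector<T> &v) {  // NOLINT
  return raw_pointer_cast(v.data());
}

// Grid-stride loop: each thread writes every (gridDim.x * blockDim.x)-th item.
__global__ void fill_kernel(float *out, size_t n, float value) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    out[i] = value;
  }
}

int main() {
  const size_t n = 1 << 20;
  thrust::device_vector<float> data(n);

  const int block_size = 256;
  // Enough blocks to cover all n elements, rounding up.
  const int grid_size = static_cast<int>(div_round_up(n, block_size));

  fill_kernel<<<grid_size, block_size>>>(raw(data), n, 1.0f);
  cudaDeviceSynchronize();

  std::printf("data[0] = %.1f\n", static_cast<float>(data[0]));
  return 0;
}
```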