diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3c7c783b..355b060b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 2.6)
+cmake_minimum_required (VERSION 3.5)
project (xgboost)
find_package(OpenMP)
diff --git a/plugin/updater_gpu/README.md b/plugin/updater_gpu/README.md
index 526f64d1b..96d425531 100644
--- a/plugin/updater_gpu/README.md
+++ b/plugin/updater_gpu/README.md
@@ -1,5 +1,16 @@
# CUDA Accelerated Tree Construction Algorithm
+## Benchmarks
+
+Time for 500 boosting iterations in seconds.
+
+Dataset | Instances | Features | i7-6700K | Titan X (Pascal) | Speedup
+--- | --- | --- | --- | --- | ---
+Yahoo LTR | 473,134 | 700 | 3738 | 507 | 7.37
+Higgs | 10,500,000 | 28 | 31352 | 4173 | 7.51
+Bosch | 1,183,747 | 968 | 9460 | 1009 | 9.38
+
+
## Usage
Specify the updater parameter as 'grow_gpu'.
@@ -13,9 +24,11 @@ param['updater'] = 'grow_gpu'
## Memory usage
Device memory usage can be calculated as approximately:
```
-bytes = (10 x n_rows) + (44 x n_rows x n_columns x column_density)
+bytes = (10 x n_rows) + (40 x n_rows x n_columns x column_density) + (64 x max_nodes) + (76 x max_nodes_level x n_columns)
```
-Data is stored in a sparse format. For example, missing values produced by one hot encoding are not stored. If a one hot encoding separates a categorical variable into 5 columns the column_density of these columns is 1/5 = 0.2.
+The maximum number of nodes needed for a given tree depth d is 2^(d+1) - 1. The maximum number of nodes on any given level is 2^d.
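+For example, a tree of depth 8 has at most 2^9 - 1 = 511 nodes in total and at most 2^8 = 256 nodes on its deepest level.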
+
+Data is stored in a sparse format. For example, missing values produced by one-hot encoding are not stored. If a one-hot encoding separates a categorical variable into 5 columns, the density of these columns is 1/5 = 0.2.
A 4GB graphics card will process approximately 3.5 million rows of the well known Kaggle higgs dataset.
@@ -26,7 +39,7 @@ A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depend
Building the plug-in requires CUDA Toolkit 7.5 or later.
-The plugin also depends on CUB 1.5.4 - http://nvlabs.github.io/cub/index.html.
+The plugin also depends on CUB 1.5.2 - https://github.com/NVlabs/cub/tree/1.5.2
CUB is a header only cuda library which provides sort/reduce/scan primitives.
@@ -60,5 +73,3 @@ Rory Mitchell
Report any bugs to r.a.mitchell.nz at google mail.
-
-
diff --git a/plugin/updater_gpu/src/device_helpers.cuh b/plugin/updater_gpu/src/device_helpers.cuh
index c895c16e0..3c849efdf 100644
--- a/plugin/updater_gpu/src/device_helpers.cuh
+++ b/plugin/updater_gpu/src/device_helpers.cuh
@@ -170,55 +170,6 @@ struct Timer {
}
};
-/*
- * Utility functions
- */
-
-template <typename T>
-void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
-  thrust::host_vector<T> h = v;
-  for (int i = 0; i < std::min(max_items, h.size()); i++) {
-    std::cout << " " << h[i];
-  }
-  std::cout << "\n";
-}
-
-template <typename T>
-void print(char *label, const thrust::device_vector<T> &v,
-           const char *format = "%d ", int max = 10) {
-  thrust::host_vector<T> h_v = v;
-
-  std::cout << label << ":\n";
-  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
-    printf(format, h_v[i]);
-  }
-  std::cout << "\n";
-}
-
-template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
-  return static_cast<T1>(ceil(static_cast<double>(a) / b));
-}
-
-template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
-  return thrust::device_pointer_cast(d_ptr);
-}
-
-template <typename T> T *raw(thrust::device_vector<T> &v) {  // NOLINT
-  return raw_pointer_cast(v.data());
-}
-
-template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
-  return sizeof(T) * v.size();
-}
-
-// Threadblock iterates over range, filling with value
-template <typename IterT, typename ValueT>
-__device__ void block_fill(IterT begin, size_t n, ValueT value) {
-  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
-    begin[i] = value;
-  }
-}
-
/*
* Range iterator
*/
@@ -282,6 +233,55 @@ template <typename T> __device__ range block_stride_range(T begin, T end) {
return r;
}
+/*
+ * Utility functions
+ */
+
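+// Print the first max_items elements of a device vector to stdout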
+template <typename T>
+void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
+  thrust::host_vector<T> h = v;
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+
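+// Print a labelled device vector using a printf format string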
+template <typename T>
+void print(char *label, const thrust::device_vector<T> &v,
+           const char *format = "%d ", int max = 10) {
+  thrust::host_vector<T> h_v = v;
+
+  std::cout << label << ":\n";
+  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
+    printf(format, h_v[i]);
+  }
+  std::cout << "\n";
+}
+
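+// Divide a by b, rounding up to the nearest integer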
+template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
+  return static_cast<T1>(ceil(static_cast<double>(a) / b));
+}
+
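+// Wrap a raw device pointer as a thrust::device_ptr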
+template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
+  return thrust::device_pointer_cast(d_ptr);
+}
+
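+// Raw device pointer to the storage backing a device_vector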
+template <typename T> T *raw(thrust::device_vector<T> &v) {  // NOLINT
+  return raw_pointer_cast(v.data());
+}
+
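+// Size in bytes of the elements held by a device_vector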
+template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
+  return sizeof(T) * v.size();
+}
+
+// Threadblock iterates over range, filling with value
+template <typename IterT, typename ValueT>
+__device__ void block_fill(IterT begin, size_t n, ValueT value) {
+  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
+    begin[i] = value;
+  }
+}
+
/*
* Memory
*/
@@ -414,6 +414,7 @@ class bulk_allocator {
}
_size = get_size_bytes(args...);
+ std::cout << "trying to allocate: " << _size << "\n";
safe_cuda(cudaMalloc(&d_ptr, _size));
diff --git a/plugin/updater_gpu/src/gpu_builder.cu b/plugin/updater_gpu/src/gpu_builder.cu
index 4279b35e8..e198c8e40 100644
--- a/plugin/updater_gpu/src/gpu_builder.cu
+++ b/plugin/updater_gpu/src/gpu_builder.cu
@@ -68,9 +68,12 @@ struct GPUData {
// Calculate memory for sort
size_t cub_mem_size = 0;
+ cub::DoubleBuffer db_key;
+ cub::DoubleBuffer db_value;
+
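+    // With cub_mem still unallocated, this call only computes cub_mem_size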
cub::DeviceSegmentedRadixSort::SortPairs(
- cub_mem.data(), cub_mem_size, cub::DoubleBuffer(),
- cub::DoubleBuffer(), in_fvalues.size(), n_features,
+ cub_mem.data(), cub_mem_size, db_key,
+ db_value, in_fvalues.size(), n_features,
foffsets.data(), foffsets.data() + 1);
// Allocate memory
@@ -304,7 +307,11 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
float subsample = 1.0;
- size_t required = 10 * info.num_row + 44 * info.num_nonzero;
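+  // Estimated bytes of device memory needed, matching the formula in the plugin README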
+ uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
+ uint32_t max_nodes_level = 1 << param.max_depth;
+ size_t required = 10 * info.num_row + 40 * info.num_nonzero
+ + 64 * max_nodes + 76 * max_nodes_level * info.num_col;
+ std::cout << "required: " << required << "\n";
size_t available = dh::available_memory();
while (available < required) {
subsample -= 0.05;
diff --git a/plugin/updater_gpu/src/gpu_builder.cuh b/plugin/updater_gpu/src/gpu_builder.cuh
index fdb308285..61ccdbbcf 100644
--- a/plugin/updater_gpu/src/gpu_builder.cuh
+++ b/plugin/updater_gpu/src/gpu_builder.cuh
@@ -36,7 +36,7 @@ class GPUBuilder {
GPUData *gpu_data;
int multiscan_levels =
- 0; // Number of levels before switching to sorting algorithm
+ 5; // Number of levels before switching to sorting algorithm
};
} // namespace tree
} // namespace xgboost