Add benchmarks, fix GCC build (#1848)
This commit is contained in:
parent
e7fbc8591f
commit
2b6aa7736f
@ -1,4 +1,4 @@
|
||||
cmake_minimum_required (VERSION 2.6)
|
||||
cmake_minimum_required (VERSION 3.5)
|
||||
project (xgboost)
|
||||
find_package(OpenMP)
|
||||
|
||||
|
||||
@ -1,5 +1,16 @@
|
||||
# CUDA Accelerated Tree Construction Algorithm
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Time for 500 boosting iterations in seconds.
|
||||
|
||||
Dataset | Instances | Features | i7-6700K | Titan X (pascal) | Speedup
|
||||
--- | --- | --- | --- | --- | ---
|
||||
Yahoo LTR | 473,134 | 700 | 3738 | 507 | 7.37
|
||||
Higgs | 10,500,000 | 28 | 31352 | 4173 | 7.51
|
||||
Bosch | 1,183,747 | 968 | 9460 | 1009 | 9.38
|
||||
|
||||
|
||||
## Usage
|
||||
Specify the updater parameter as 'grow_gpu'.
|
||||
|
||||
@ -13,9 +24,11 @@ param['updater'] = 'grow_gpu'
|
||||
## Memory usage
|
||||
Device memory usage can be calculated as approximately:
|
||||
```
|
||||
bytes = (10 x n_rows) + (44 x n_rows x n_columns x column_density)
|
||||
bytes = (10 x n_rows) + (40 x n_rows x n_columns x column_density) + (64 x max_nodes) + (76 x max_nodes_level x n_columns)
|
||||
```
|
||||
Data is stored in a sparse format. For example, missing values produced by one hot encoding are not stored. If a one hot encoding separates a categorical variable into 5 columns the column_density of these columns is 1/5 = 0.2.
|
||||
The maximum number of nodes needed for a given tree depth d is 2<sup>d+1</sup> - 1. The maximum number of nodes on any given level is 2<sup>d</sup>.
|
||||
|
||||
Data is stored in a sparse format. For example, missing values produced by one hot encoding are not stored. If a one hot encoding separates a categorical variable into 5 columns the density of these columns is 1/5 = 0.2.
|
||||
|
||||
A 4GB graphics card will process approximately 3.5 million rows of the well known Kaggle higgs dataset.
|
||||
|
||||
@ -26,7 +39,7 @@ A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depend
|
||||
|
||||
Building the plug-in requires CUDA Toolkit 7.5 or later.
|
||||
|
||||
The plugin also depends on CUB 1.5.4 - http://nvlabs.github.io/cub/index.html.
|
||||
The plugin also depends on CUB 1.5.2 - https://github.com/NVlabs/cub/tree/1.5.2
|
||||
|
||||
CUB is a header only cuda library which provides sort/reduce/scan primitives.
|
||||
|
||||
@ -60,5 +73,3 @@ Rory Mitchell
|
||||
Report any bugs to r.a.mitchell.nz at google mail.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -170,55 +170,6 @@ struct Timer {
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* Utility functions
|
||||
*/
|
||||
|
||||
template <typename T>
|
||||
void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
|
||||
thrust::host_vector<T> h = v;
|
||||
for (int i = 0; i < std::min(max_items, h.size()); i++) {
|
||||
std::cout << " " << h[i];
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void print(char *label, const thrust::device_vector<T> &v,
|
||||
const char *format = "%d ", int max = 10) {
|
||||
thrust::host_vector<T> h_v = v;
|
||||
|
||||
std::cout << label << ":\n";
|
||||
for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
|
||||
printf(format, h_v[i]);
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
|
||||
return static_cast<T1>(ceil(static_cast<double>(a) / b));
|
||||
}
|
||||
|
||||
template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
|
||||
return thrust::device_pointer_cast(d_ptr);
|
||||
}
|
||||
|
||||
template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
|
||||
return sizeof(T) * v.size();
|
||||
}
|
||||
|
||||
// Threadblock iterates over range, filling with value
|
||||
template <typename IterT, typename ValueT>
|
||||
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
|
||||
begin[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Range iterator
|
||||
*/
|
||||
@ -282,6 +233,55 @@ template <typename T> __device__ range block_stride_range(T begin, T end) {
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Utility functions
|
||||
*/
|
||||
|
||||
template <typename T>
|
||||
void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
|
||||
thrust::host_vector<T> h = v;
|
||||
for (int i = 0; i < std::min(max_items, h.size()); i++) {
|
||||
std::cout << " " << h[i];
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void print(char *label, const thrust::device_vector<T> &v,
|
||||
const char *format = "%d ", int max = 10) {
|
||||
thrust::host_vector<T> h_v = v;
|
||||
|
||||
std::cout << label << ":\n";
|
||||
for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
|
||||
printf(format, h_v[i]);
|
||||
}
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
|
||||
return static_cast<T1>(ceil(static_cast<double>(a) / b));
|
||||
}
|
||||
|
||||
template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
|
||||
return thrust::device_pointer_cast(d_ptr);
|
||||
}
|
||||
|
||||
template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
|
||||
return sizeof(T) * v.size();
|
||||
}
|
||||
|
||||
// Threadblock iterates over range, filling with value
|
||||
template <typename IterT, typename ValueT>
|
||||
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
|
||||
begin[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Memory
|
||||
*/
|
||||
@ -414,6 +414,7 @@ class bulk_allocator {
|
||||
}
|
||||
|
||||
_size = get_size_bytes(args...);
|
||||
std::cout << "trying to allocate: " << _size << "\n";
|
||||
|
||||
safe_cuda(cudaMalloc(&d_ptr, _size));
|
||||
|
||||
|
||||
@ -68,9 +68,12 @@ struct GPUData {
|
||||
|
||||
// Calculate memory for sort
|
||||
size_t cub_mem_size = 0;
|
||||
cub::DoubleBuffer<NodeIdT> db_key;
|
||||
cub::DoubleBuffer<int> db_value;
|
||||
|
||||
cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
cub_mem.data(), cub_mem_size, cub::DoubleBuffer<NodeIdT>(),
|
||||
cub::DoubleBuffer<int>(), in_fvalues.size(), n_features,
|
||||
cub_mem.data(), cub_mem_size, db_key,
|
||||
db_value, in_fvalues.size(), n_features,
|
||||
foffsets.data(), foffsets.data() + 1);
|
||||
|
||||
// Allocate memory
|
||||
@ -304,7 +307,11 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
|
||||
|
||||
float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
|
||||
float subsample = 1.0;
|
||||
size_t required = 10 * info.num_row + 44 * info.num_nonzero;
|
||||
uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
|
||||
uint32_t max_nodes_level = 1 << param.max_depth;
|
||||
size_t required = 10 * info.num_row + 40 * info.num_nonzero
|
||||
+ 64 * max_nodes + 76 * max_nodes_level * info.num_col;
|
||||
std::cout << "required: " << required << "\n";
|
||||
size_t available = dh::available_memory();
|
||||
while (available < required) {
|
||||
subsample -= 0.05;
|
||||
|
||||
@ -36,7 +36,7 @@ class GPUBuilder {
|
||||
GPUData *gpu_data;
|
||||
|
||||
int multiscan_levels =
|
||||
0; // Number of levels before switching to sorting algorithm
|
||||
5; // Number of levels before switching to sorting algorithm
|
||||
};
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user