GPU Plugin: Add subsample, colsample_bytree, colsample_bylevel (#1895)

Rory Mitchell 2016-12-23 04:30:36 +13:00 committed by Tianqi Chen
parent cee4aafb93
commit b49b339183
10 changed files with 331 additions and 324 deletions

View File

@@ -9,10 +9,10 @@ https://www.kaggle.com/c/bosch-production-line-performance/data
 Copy train_numeric.csv into xgboost/demo/data.
-The subsample parameter can be changed so you can run the script first on a small portion of the data. Processing the entire dataset can take a long time and requires about 8GB of device memory. It is initially set to 0.4, using about 2650/3380MB on a GTX 970.
+The subset parameter changes the proportion of rows loaded from the CSV file. Processing the entire dataset can take a long time and requires about 8GB of device memory. It is initially set to 0.4, using about 2650/3380MB on a GTX 970. Lower the parameter if your device runs out of memory.
 ```python
-subsample = 0.4
+subset = 0.4
 ```
 Parameters are set as usual except that we set silent to 0 to see how much memory is being allocated on the GPU and we change 'updater' to 'grow_gpu' to activate the GPU plugin.

View File

@@ -5,12 +5,12 @@ import time
 import random
 from sklearn.cross_validation import StratifiedKFold
-#For sub sampling rows from input file
+#For sampling rows from input file
 random_seed = 9
-subsample = 0.4
+subset = 0.4
 n_rows = 1183747;
-train_rows = int(n_rows * subsample)
+train_rows = int(n_rows * subset)
 random.seed(random_seed)
 skip = sorted(random.sample(xrange(1,n_rows + 1),n_rows-train_rows))
 data = pd.read_csv("../data/train_numeric.csv", index_col=0, dtype=np.float32, skiprows=skip)

View File

@@ -32,8 +32,6 @@ Data is stored in a sparse format. For example, missing values produced by one h
 A 4GB graphics card will process approximately 3.5 million rows of the well known Kaggle higgs dataset.
-The algorithm will automatically perform row subsampling if it detects there is not enough memory on the device.
 ## Dependencies
 A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depends on shuffle and vote instructions introduced in Kepler).

View File

@@ -7,6 +7,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+#include <thrust/random.h>
 #include <algorithm>
 #include <ctime>
 #include <sstream>
@@ -147,6 +148,8 @@ struct Timer {
     LARGE_INTEGER now;
     QueryPerformanceCounter(&now);
     return static_cast<double>(now.QuadPart) / s_frequency.QuadPart;
+#else
+    return 0;
 #endif
   }
@@ -160,12 +163,14 @@ struct Timer {
 #ifdef _WIN32
     _ReadWriteBarrier();
     return seconds_now() - start;
+#else
+    return 0;
 #endif
   }
-  void printElapsed(char *label) {
+  void printElapsed(std::string label) {
 #ifdef TIMERS
     safe_cuda(cudaDeviceSynchronize());
-    printf("%s:\t %1.4fs\n", label, elapsed());
+    printf("%s:\t %1.4fs\n", label.c_str(), elapsed());
 #endif
   }
 };
@@ -233,46 +238,6 @@ template <typename T> __device__ range block_stride_range(T begin, T end) {
   return r;
 }
-/*
- * Utility functions
- */
-template <typename T>
-void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
-  thrust::host_vector<T> h = v;
-  for (int i = 0; i < std::min(max_items, h.size()); i++) {
-    std::cout << " " << h[i];
-  }
-  std::cout << "\n";
-}
-template <typename T>
-void print(char *label, const thrust::device_vector<T> &v,
-           const char *format = "%d ", int max = 10) {
-  thrust::host_vector<T> h_v = v;
-  std::cout << label << ":\n";
-  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
-    printf(format, h_v[i]);
-  }
-  std::cout << "\n";
-}
-template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
-  return static_cast<T1>(ceil(static_cast<double>(a) / b));
-}
-template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
-  return thrust::device_pointer_cast(d_ptr);
-}
-template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
-  return raw_pointer_cast(v.data());
-}
-template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
-  return sizeof(T) * v.size();
-}
 // Threadblock iterates over range, filling with value
 template <typename IterT, typename ValueT>
@@ -306,11 +271,11 @@ template <typename T> class dvec {
  public:
   dvec() : _ptr(NULL), _size(0) {}
-  size_t size() { return _size; }
-  bool empty() { return _ptr == NULL || _size == 0; }
+  size_t size() const { return _size; }
+  bool empty() const { return _ptr == NULL || _size == 0; }
   T *data() { return _ptr; }
-  std::vector<T> as_vector() {
+  std::vector<T> as_vector() const {
     std::vector<T> h_vector(size());
     safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
@@ -454,6 +419,55 @@ inline std::string device_name() {
   return std::string(prop.name);
 }
+/*
+ * Utility functions
+ */
+template <typename T>
+void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
+  thrust::host_vector<T> h = v;
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+template <typename T>
+void print(const dvec<T> &v, size_t max_items = 10) {
+  std::vector<T> h = v.as_vector();
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+template <typename T>
+void print(char *label, const thrust::device_vector<T> &v,
+           const char *format = "%d ", int max = 10) {
+  thrust::host_vector<T> h_v = v;
+  std::cout << label << ":\n";
+  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
+    printf(format, h_v[i]);
+  }
+  std::cout << "\n";
+}
+template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
+  return static_cast<T1>(ceil(static_cast<double>(a) / b));
+}
+template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
+  return thrust::device_pointer_cast(d_ptr);
+}
+template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
+  return raw_pointer_cast(v.data());
+}
+template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
+  return sizeof(T) * v.size();
+}
 /*
  * Kernel launcher
  */
@@ -470,4 +484,25 @@ inline void launch_n(size_t n, L lambda) {
   launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
 }
+/*
+ * Random
+ */
+struct BernoulliRng {
+  float p;
+  int seed;
+  __host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}
+  __host__ __device__ bool operator()(const int i) const {
+    thrust::default_random_engine rng(seed);
+    thrust::uniform_real_distribution<float> dist;
+    rng.discard(i);
+    return dist(rng) <= p;
+  }
+};
 }  // namespace dh
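
The `BernoulliRng` functor added above is counter-based: every call seeds a fresh engine and uses `discard(i)` to jump to the i-th draw, so the verdict for an index is a pure function of `(seed, i)` and is reproducible on host and device alike. A minimal sketch of that property, assuming the file is compiled as CUDA with nvcc (the harness and the `BernoulliRngSketch` name are illustrative, not part of the commit):

```cpp
#include <thrust/random.h>
#include <cstdio>

// Illustrative copy of the functor: same index + same seed => same draw.
struct BernoulliRngSketch {
  float p;
  int seed;
  __host__ __device__ BernoulliRngSketch(float p, int seed)
      : p(p), seed(seed) {}
  __host__ __device__ bool operator()(int i) const {
    thrust::default_random_engine rng(seed);
    thrust::uniform_real_distribution<float> dist;  // defaults to [0, 1)
    rng.discard(i);  // skip ahead to the i-th value of the stream
    return dist(rng) <= p;
  }
};

int main() {
  BernoulliRngSketch coin(0.4f, 9);
  int kept = 0;
  for (int i = 0; i < 1000; ++i) kept += coin(i);
  std::printf("kept %d of 1000 indices\n", kept);  // ~400 in expectation
  std::printf("index 7 again: %d\n", coin(7));     // deterministic per index
  return 0;
}
```

Seeding per call is deliberate: device threads cannot share one sequential engine, so each thread instead addresses its own fixed position in a common stream.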

View File

@@ -4,9 +4,11 @@
 #pragma once
 #include <cub/cub.cuh>
 #include <xgboost/base.h>
+#include <vector>
 #include "device_helpers.cuh"
 #include "find_split_multiscan.cuh"
 #include "find_split_sorting.cuh"
+#include "gpu_data.cuh"
 #include "types_functions.cuh"
 namespace xgboost {
@@ -62,24 +64,47 @@ void reduce_split_candidates(Split *d_split_candidates, Node *d_nodes,
   dh::safe_cuda(cudaDeviceSynchronize());
 }
-void find_split(const ItemIter items_iter, Split *d_split_candidates,
-                Node *d_nodes, bst_uint num_items, int num_features,
-                const int *d_feature_offsets, gpu_gpair *d_node_sums,
-                int *d_node_offsets, const GPUTrainingParam param,
-                const int level, bool multiscan_algorithm) {
+void colsample_level(GPUData *data, const TrainParam xgboost_param,
+                     const std::vector<int> &feature_set_tree,
+                     std::vector<int> *feature_set_level) {
+  unsigned n_bytree =
+      static_cast<unsigned>(xgboost_param.colsample_bytree * data->n_features);
+  unsigned n =
+      static_cast<unsigned>(n_bytree * xgboost_param.colsample_bylevel);
+  CHECK_GT(n, 0);
+  *feature_set_level = feature_set_tree;
+  std::shuffle((*feature_set_level).begin(),
+               (*feature_set_level).begin() + n_bytree, common::GlobalRandom());
+  data->feature_set = *feature_set_level;
+  data->feature_flags.fill(0);
+  auto d_feature_set = data->feature_set.data();
+  auto d_feature_flags = data->feature_flags.data();
+  dh::launch_n(
+      n, [=] __device__(int i) { d_feature_flags[d_feature_set[i]] = 1; });
+}
+void find_split(GPUData *data, const TrainParam xgboost_param, const int level,
+                bool multiscan_algorithm,
+                const std::vector<int> &feature_set_tree,
+                std::vector<int> *feature_set_level) {
+  colsample_level(data, xgboost_param, feature_set_tree, feature_set_level);
+  // Reset split candidates
+  data->split_candidates.fill(Split());
   if (multiscan_algorithm) {
-    find_split_candidates_multiscan(items_iter, d_split_candidates, d_nodes,
-                                    num_items, num_features, d_feature_offsets,
-                                    param, level);
+    find_split_candidates_multiscan(data, level);
   } else {
-    find_split_candidates_sorted(items_iter, d_split_candidates, d_nodes,
-                                 num_items, num_features, d_feature_offsets,
-                                 d_node_sums, d_node_offsets, param, level);
+    find_split_candidates_sorted(data, level);
   }
   // Find the best split for each node
-  reduce_split_candidates(d_split_candidates, d_nodes, level, num_features,
-                          param);
+  reduce_split_candidates(data->split_candidates.data(), data->nodes.data(),
+                          level, data->n_features, data->param);
 }
 }  // namespace tree
 }  // namespace xgboost
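
The two column-sampling parameters compose multiplicatively: `colsample_level` keeps `n = n_features * colsample_bytree * colsample_bylevel` features, drawn from the per-tree subset that is shuffled once per tree. A host-side sketch of the same arithmetic (the function name is illustrative, and `std::mt19937` stands in for `common::GlobalRandom()`):

```cpp
#include <algorithm>
#include <random>
#include <vector>

// With 100 features, colsample_bytree = 0.5 and colsample_bylevel = 0.5,
// each level scans floor(100 * 0.5 * 0.5) = 25 features, drawn from the
// 50-feature set chosen for the current tree.
std::vector<int> sample_level_features(std::vector<int> feature_set_tree,
                                       float colsample_bytree,
                                       float colsample_bylevel,
                                       std::mt19937 *rng) {
  unsigned n_bytree =
      static_cast<unsigned>(colsample_bytree * feature_set_tree.size());
  unsigned n = static_cast<unsigned>(n_bytree * colsample_bylevel);
  // Re-shuffle only the per-tree subset, then keep its first n entries.
  std::shuffle(feature_set_tree.begin(), feature_set_tree.begin() + n_bytree,
               *rng);
  return std::vector<int>(feature_set_tree.begin(),
                          feature_set_tree.begin() + n);
}
```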

View File

@@ -5,6 +5,7 @@
 #include <cub/cub.cuh>
 #include <xgboost/base.h>
 #include "device_helpers.cuh"
+#include "gpu_data.cuh"
 #include "types_functions.cuh"
 namespace xgboost {
@@ -609,22 +610,11 @@ struct FindSplitEnactorMultiscan {
     }
   }
-  __device__ __forceinline__ void ResetSplitCandidates() {
-    const int max_nodes = 1 << level;
-    const int begin = blockIdx.x * max_nodes;
-    const int end = begin + max_nodes;
-    for (auto i : dh::block_stride_range(begin, end)) {
-      d_split_candidates_out[i] = Split();
-    }
-  }
   __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                 const bst_uint &segment_end) {
     // Current position
     bst_uint offset = segment_begin;
-    ResetSplitCandidates();
     ResetTileCarry();
     ResetSplits();
     CacheNodes();
@@ -654,8 +644,9 @@ __launch_bounds__(1024, 2)
     const ItemIter items_iter, Split *d_split_candidates_out,
     const Node *d_nodes, const int node_begin, bst_uint num_items,
     int num_features, const int *d_feature_offsets,
-    const GPUTrainingParam param, const int level) {
-  if (num_items <= 0) {
+    const GPUTrainingParam param, const int *d_feature_flags,
+    const int level) {
+  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
     return;
   }
@@ -685,69 +676,45 @@
 }
 template <int N_NODES>
-void find_split_candidates_multiscan_variation(
-    const ItemIter items_iter, Split *d_split_candidates, const Node *d_nodes,
-    int node_begin, int node_end, bst_uint num_items, int num_features,
-    const int *d_feature_offsets, const GPUTrainingParam param,
-    const int level) {
+void find_split_candidates_multiscan_variation(GPUData *data, const int level) {
+  const int node_begin = (1 << level) - 1;
   const int BLOCK_THREADS = 512;
-  CHECK((node_end - node_begin) <= N_NODES) << "Multiscan: N_NODES template "
-                                               "parameter too small for given "
-                                               "node range.";
   CHECK(BLOCK_THREADS / 32 < 32)
       << "Too many active warps. See FindSplitEnactor - ReduceSplits.";
   typedef FindSplitParamsMultiscan<BLOCK_THREADS, N_NODES, false>
       find_split_params;
   typedef ReduceParamsMultiscan<BLOCK_THREADS, N_NODES, false> reduce_params;
-  int grid_size = num_features;
+  int grid_size = data->n_features;
   find_split_candidates_multiscan_kernel<
       find_split_params,
      reduce_params><<<grid_size, find_split_params::BLOCK_THREADS>>>(
-      items_iter, d_split_candidates, d_nodes, node_begin, num_items,
-      num_features, d_feature_offsets, param, level);
+      data->items_iter, data->split_candidates.data(), data->nodes.data(),
+      node_begin, data->fvalues.size(), data->n_features, data->foffsets.data(),
+      data->param, data->feature_flags.data(), level);
   dh::safe_cuda(cudaDeviceSynchronize());
 }
-void find_split_candidates_multiscan(
-    const ItemIter items_iter, Split *d_split_candidates, const Node *d_nodes,
-    bst_uint num_items, int num_features, const int *d_feature_offsets,
-    const GPUTrainingParam param, const int level) {
+void find_split_candidates_multiscan(GPUData *data, const int level) {
   // Select templated variation of split finding algorithm
   switch (level) {
     case 0:
-      find_split_candidates_multiscan_variation<1>(
-          items_iter, d_split_candidates, d_nodes, 0, 1, num_items,
-          num_features, d_feature_offsets, param, level);
+      find_split_candidates_multiscan_variation<1>(data, level);
       break;
     case 1:
-      find_split_candidates_multiscan_variation<2>(
-          items_iter, d_split_candidates, d_nodes, 1, 3, num_items,
-          num_features, d_feature_offsets, param, level);
+      find_split_candidates_multiscan_variation<2>(data, level);
       break;
     case 2:
-      find_split_candidates_multiscan_variation<4>(
-          items_iter, d_split_candidates, d_nodes, 3, 7, num_items,
-          num_features, d_feature_offsets, param, level);
+      find_split_candidates_multiscan_variation<4>(data, level);
       break;
     case 3:
-      find_split_candidates_multiscan_variation<8>(
-          items_iter, d_split_candidates, d_nodes, 7, 15, num_items,
-          num_features, d_feature_offsets, param, level);
+      find_split_candidates_multiscan_variation<8>(data, level);
       break;
     case 4:
-      find_split_candidates_multiscan_variation<16>(
-          items_iter, d_split_candidates, d_nodes, 15, 31, num_items,
-          num_features, d_feature_offsets, param, level);
-      break;
-    case 5:
-      find_split_candidates_multiscan_variation<32>(
-          items_iter, d_split_candidates, d_nodes, 31, 63, num_items,
-          num_features, d_feature_offsets, param, level);
+      find_split_candidates_multiscan_variation<16>(data, level);
       break;
   }
 }
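
Both split kernels launch one thread block per feature (`grid_size` is the feature count), so column sampling is enforced in-kernel: blocks whose feature is absent from `feature_flags` exit before touching any data. A minimal sketch of the gating pattern (the kernel name and body are illustrative):

```cpp
// One block per feature; blocks for features excluded by
// colsample_bytree/colsample_bylevel return immediately, so skipped
// features cost little more than the block launch itself.
__global__ void per_feature_kernel(const int *d_feature_flags) {
  if (d_feature_flags[blockIdx.x] != 1) {
    return;  // this feature was not drawn for the current level
  }
  // ... scan this feature's sorted values for split candidates ...
}
```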

View File

@@ -337,17 +337,8 @@ struct FindSplitEnactorSorting {
       WriteBestSplit(node_id_adjusted);
     }
   }
-  __device__ __forceinline__ void ResetSplitCandidates() {
-    const int max_nodes = 1 << level;
-    const int begin = blockIdx.x * max_nodes;
-    dh::block_fill(d_split_candidates_out + begin, max_nodes, Split());
-  }
   __device__ __forceinline__ void ProcessFeature(const bst_uint &segment_begin,
                                                  const bst_uint &segment_end) {
-    ResetSplitCandidates();
     int node_begin = segment_begin;
     const int max_nodes = 1 << level;
@@ -377,9 +368,9 @@ __global__ __launch_bounds__(1024, 1) void find_split_candidates_sorted_kernel(
     const ItemIter items_iter, Split *d_split_candidates_out,
     const Node *d_nodes, bst_uint num_items, const int num_features,
     const int *d_feature_offsets, gpu_gpair *d_node_sums, int *d_node_offsets,
-    const GPUTrainingParam param, const int level) {
-  if (num_items <= 0) {
+    const GPUTrainingParam param, const int *d_feature_flags, const int level) {
+  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
     return;
   }
@@ -408,23 +399,19 @@ __global__ __launch_bounds__(1024, 1) void find_split_candidates_sorted_kernel(
         .ProcessFeature(segment_begin, segment_end);
 }
-void find_split_candidates_sorted(const ItemIter items_iter,
-                                  Split *d_split_candidates, Node *d_nodes,
-                                  bst_uint num_items, int num_features,
-                                  const int *d_feature_offsets,
-                                  gpu_gpair *d_node_sums, int *d_node_offsets,
-                                  const GPUTrainingParam param,
-                                  const int level) {
+void find_split_candidates_sorted(GPUData *data, const int level) {
   const int BLOCK_THREADS = 512;
   CHECK(BLOCK_THREADS / 32 < 32) << "Too many active warps.";
-  int grid_size = num_features;
+  int grid_size = data->n_features;
   find_split_candidates_sorted_kernel<
       BLOCK_THREADS><<<grid_size, BLOCK_THREADS>>>(
-      items_iter, d_split_candidates, d_nodes, num_items, num_features,
-      d_feature_offsets, d_node_sums, d_node_offsets, param, level);
+      data->items_iter, data->split_candidates.data(), data->nodes.data(),
+      data->fvalues.size(), data->n_features,
+      data->foffsets.data(), data->node_sums.data(), data->node_offsets.data(),
+      data->param, data->feature_flags.data(), level);
   dh::safe_cuda(cudaGetLastError());
   dh::safe_cuda(cudaDeviceSynchronize());

View File

@@ -12,143 +12,17 @@
 #include <thrust/sequence.h>
 #include <algorithm>
 #include <random>
+#include <numeric>
 #include <vector>
 #include "../../../src/common/random.h"
 #include "device_helpers.cuh"
 #include "find_split.cuh"
 #include "gpu_builder.cuh"
 #include "types_functions.cuh"
+#include "gpu_data.cuh"
 namespace xgboost {
 namespace tree {
-struct GPUData {
-  GPUData() : allocated(false), n_features(0), n_instances(0) {}
-  bool allocated;
-  int n_features;
-  int n_instances;
-  dh::bulk_allocator ba;
-  GPUTrainingParam param;
-  dh::dvec<float> fvalues;
-  dh::dvec<float> fvalues_temp;
-  dh::dvec<float> fvalues_cached;
-  dh::dvec<int> foffsets;
-  dh::dvec<bst_uint> instance_id;
-  dh::dvec<bst_uint> instance_id_temp;
-  dh::dvec<bst_uint> instance_id_cached;
-  dh::dvec<int> feature_id;
-  dh::dvec<NodeIdT> node_id;
-  dh::dvec<NodeIdT> node_id_temp;
-  dh::dvec<NodeIdT> node_id_instance;
-  dh::dvec<gpu_gpair> gpair;
-  dh::dvec<Node> nodes;
-  dh::dvec<Split> split_candidates;
-  dh::dvec<gpu_gpair> node_sums;
-  dh::dvec<int> node_offsets;
-  dh::dvec<int> sort_index_in;
-  dh::dvec<int> sort_index_out;
-  dh::dvec<char> cub_mem;
-  ItemIter items_iter;
-  void Init(const std::vector<float> &in_fvalues,
-            const std::vector<int> &in_foffsets,
-            const std::vector<bst_uint> &in_instance_id,
-            const std::vector<int> &in_feature_id,
-            const std::vector<bst_gpair> &in_gpair, bst_uint n_instances_in,
-            bst_uint n_features_in, int max_depth, const TrainParam &param_in) {
-    n_features = n_features_in;
-    n_instances = n_instances_in;
-    uint32_t max_nodes = (1 << (max_depth + 1)) - 1;
-    uint32_t max_nodes_level = 1 << max_depth;
-    // Calculate memory for sort
-    size_t cub_mem_size = 0;
-    cub::DoubleBuffer<NodeIdT> db_key;
-    cub::DoubleBuffer<int> db_value;
-    cub::DeviceSegmentedRadixSort::SortPairs(
-        cub_mem.data(), cub_mem_size, db_key,
-        db_value, in_fvalues.size(), n_features,
-        foffsets.data(), foffsets.data() + 1);
-    // Allocate memory
-    size_t free_memory = dh::available_memory();
-    ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
-                &fvalues_cached, in_fvalues.size(), &foffsets,
-                in_foffsets.size(), &instance_id, in_instance_id.size(),
-                &instance_id_temp, in_instance_id.size(), &instance_id_cached,
-                in_instance_id.size(), &feature_id, in_feature_id.size(),
-                &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
-                &node_id_instance, n_instances, &gpair, n_instances, &nodes,
-                max_nodes, &split_candidates, max_nodes_level * n_features,
-                &node_sums, max_nodes_level * n_features, &node_offsets,
-                max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
-                &sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size);
-    if (!param_in.silent) {
-      const int mb_size = 1048576;
-      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
-                   << free_memory / mb_size << " MB on " << dh::device_name();
-    }
-    node_id.fill(0);
-    node_id_instance.fill(0);
-    fvalues = in_fvalues;
-    fvalues_cached = fvalues;
-    foffsets = in_foffsets;
-    instance_id = in_instance_id;
-    instance_id_cached = instance_id;
-    feature_id = in_feature_id;
-    param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda,
-                             param_in.reg_alpha, param_in.max_delta_step);
-    gpair = in_gpair;
-    nodes.fill(Node());
-    items_iter = thrust::make_zip_iterator(thrust::make_tuple(
-        thrust::make_permutation_iterator(gpair.tbegin(), instance_id.tbegin()),
-        fvalues.tbegin(), node_id.tbegin()));
-    allocated = true;
-    dh::safe_cuda(cudaGetLastError());
-  }
-  ~GPUData() {}
-  // Reset memory for new boosting iteration
-  void Reset(const std::vector<bst_gpair> &in_gpair) {
-    CHECK(allocated);
-    gpair = in_gpair;
-    instance_id = instance_id_cached;
-    fvalues = fvalues_cached;
-    nodes.fill(Node());
-    node_id_instance.fill(0);
-    node_id.fill(0);
-  }
-  bool IsAllocated() { return allocated; }
-  // Gather from node_id_instance into node_id according to instance_id
-  void GatherNodeId() {
-    // Update node_id for each item
-    auto d_node_id = node_id.data();
-    auto d_node_id_instance = node_id_instance.data();
-    auto d_instance_id = instance_id.data();
-    dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
-      // Item item = d_items[i];
-      d_node_id[i] = d_node_id_instance[d_instance_id[i]];
-    });
-  }
-};
 GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); }
@@ -253,15 +127,26 @@ void GPUBuilder::Sort(int level) {
   }
 }
+void GPUBuilder::ColsampleTree() {
+  unsigned n = static_cast<unsigned>(
+      param.colsample_bytree * gpu_data->n_features);
+  CHECK_GT(n, 0);
+  feature_set_tree.resize(gpu_data->n_features);
+  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
+  std::shuffle(feature_set_tree.begin(), feature_set_tree.end(),
+               common::GlobalRandom());
+}
 void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
                         RegTree *p_tree) {
-  cudaProfilerStart();
   try {
     dh::Timer update;
     dh::Timer t;
     this->InitData(gpair, *p_fmat, *p_tree);
     t.printElapsed("init data");
     this->InitFirstNode();
+    this->ColsampleTree();
     for (int level = 0; level < param.max_depth; level++) {
       bool use_multiscan_algorithm = level < multiscan_levels;
@@ -280,11 +165,8 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
       }
       dh::Timer split;
-      find_split(gpu_data->items_iter, gpu_data->split_candidates.data(),
-                 gpu_data->nodes.data(), (bst_uint)gpu_data->fvalues.size(),
-                 gpu_data->n_features, gpu_data->foffsets.data(),
-                 gpu_data->node_sums.data(), gpu_data->node_offsets.data(),
-                 gpu_data->param, level, use_multiscan_algorithm);
+      find_split(gpu_data, param, level, use_multiscan_algorithm,
+                 feature_set_tree, &feature_set_level);
       split.printElapsed("split");
@@ -302,22 +184,6 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
     std::cerr << "Unknown exception." << std::endl;
     exit(-1);
   }
-  cudaProfilerStop();
-}
-float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
-  float subsample = 1.0;
-  uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
-  uint32_t max_nodes_level = 1 << param.max_depth;
-  size_t required = 10 * info.num_row + 40 * info.num_nonzero
-      + 64 * max_nodes + 76 * max_nodes_level * info.num_col;
-  size_t available = dh::available_memory();
-  while (available < required) {
-    subsample -= 0.05;
-    required = 10 * info.num_row + subsample * (44 * info.num_nonzero);
-  }
-  return subsample;
 }
 void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
@@ -325,7 +191,7 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
   CHECK(fmat.SingleColBlock()) << "GPUMaker: must have single column block";
   if (gpu_data->IsAllocated()) {
-    gpu_data->Reset(gpair);
+    gpu_data->Reset(gpair, param.subsample);
     return;
   }
@@ -333,35 +199,6 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
   MetaInfo info = fmat.info();
-  // Work out if dataset will fit on GPU
-  float subsample = this->GetSubsamplingRate(info);
-  CHECK(subsample > 0.0);
-  if (!param.silent && subsample < param.subsample) {
-    LOG(CONSOLE) << "Not enough device memory for entire dataset.";
-  }
-  // Override subsample parameter if user-specified parameter is lower
-  subsample = std::min(param.subsample, subsample);
-  std::vector<bool> row_flags;
-  if (subsample < 1.0) {
-    if (!param.silent && subsample < 1.0) {
-      LOG(CONSOLE) << "Subsampling " << subsample * 100 << "% of rows.";
-    }
-    const RowSet &rowset = fmat.buffered_rowset();
-    row_flags.resize(info.num_row);
-    std::bernoulli_distribution coin_flip(subsample);
-    auto &rnd = common::GlobalRandom();
-    for (size_t i = 0; i < rowset.size(); ++i) {
-      const bst_uint ridx = rowset[i];
-      if (gpair[ridx].hess < 0.0f)
-        continue;
-      row_flags[ridx] = coin_flip(rnd);
-    }
-  }
   std::vector<int> foffsets;
   foffsets.push_back(0);
   std::vector<int> feature_id;
@@ -382,18 +219,10 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
       for (const ColBatch::Entry *it = col.data; it != col.data + col.length;
            it++) {
         bst_uint inst_id = it->index;
-        if (subsample < 1.0) {
-          if (row_flags[inst_id]) {
-            fvalues.push_back(it->fvalue);
-            instance_id.push_back(inst_id);
-            feature_id.push_back(i);
-          }
-        } else {
-          fvalues.push_back(it->fvalue);
-          instance_id.push_back(inst_id);
-          feature_id.push_back(i);
-        }
+        fvalues.push_back(it->fvalue);
+        instance_id.push_back(inst_id);
+        feature_id.push_back(i);
       }
       foffsets.push_back(fvalues.size());
     }
   }
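
The deleted hunks above are the old host-side subsampling path, which dropped rows from the feature arrays at load time (and silently lowered `subsample` when device memory ran short). After this commit the full dataset stays resident and `GPUData::Reset` instead zeroes the gradient pairs of subsampled-out rows on each iteration (see `MarkSubsample` in gpu_data.cuh below). A host-side analogue of that idea, as a minimal sketch (`GPair` and the function name are illustrative stand-ins):

```cpp
#include <random>
#include <vector>

struct GPair { float grad = 0.f, hess = 0.f; };  // stand-in for gpu_gpair

// Zero the gradient pair of rows that lose the coin flip. A (0, 0) pair
// contributes nothing to node sums or split gain, so the row is effectively
// excluded without changing the feature-value layout in memory.
void mark_subsample(std::vector<GPair> *gpair, float subsample,
                    std::mt19937 *rng) {
  if (subsample >= 1.0f) return;
  std::bernoulli_distribution keep(subsample);
  for (GPair &g : *gpair) {
    if (!keep(*rng)) g = GPair();
  }
}
```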

View File

@@ -23,6 +23,7 @@ class GPUBuilder {
               RegTree *p_tree);
   void UpdateNodeId(int level);
 private:
   void InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,  // NOLINT
                 const RegTree &tree);
@@ -31,9 +32,12 @@ class GPUBuilder {
   void Sort(int level);
   void InitFirstNode();
   void CopyTree(RegTree &tree);  // NOLINT
+  void ColsampleTree();
   TrainParam param;
   GPUData *gpu_data;
+  std::vector<int> feature_set_tree;
+  std::vector<int> feature_set_level;
   int multiscan_levels =
       5;  // Number of levels before switching to sorting algorithm

View File

@@ -0,0 +1,162 @@
+/*!
+ * Copyright 2016 Rory Mitchell
+ */
+#pragma once
+#include <cub/cub.cuh>
+#include <xgboost/logging.h>
+#include <thrust/sequence.h>
+#include <vector>
+#include "device_helpers.cuh"
+#include "../../src/tree/param.h"
+#include "types_functions.cuh"
+namespace xgboost {
+namespace tree {
+struct GPUData {
+  GPUData() : allocated(false), n_features(0), n_instances(0) {}
+  bool allocated;
+  int n_features;
+  int n_instances;
+  dh::bulk_allocator ba;
+  GPUTrainingParam param;
+  dh::dvec<float> fvalues;
+  dh::dvec<float> fvalues_temp;
+  dh::dvec<float> fvalues_cached;
+  dh::dvec<int> foffsets;
+  dh::dvec<bst_uint> instance_id;
+  dh::dvec<bst_uint> instance_id_temp;
+  dh::dvec<bst_uint> instance_id_cached;
+  dh::dvec<int> feature_id;
+  dh::dvec<NodeIdT> node_id;
+  dh::dvec<NodeIdT> node_id_temp;
+  dh::dvec<NodeIdT> node_id_instance;
+  dh::dvec<gpu_gpair> gpair;
+  dh::dvec<Node> nodes;
+  dh::dvec<Split> split_candidates;
+  dh::dvec<gpu_gpair> node_sums;
+  dh::dvec<int> node_offsets;
+  dh::dvec<int> sort_index_in;
+  dh::dvec<int> sort_index_out;
+  dh::dvec<char> cub_mem;
+  dh::dvec<int> feature_flags;
+  dh::dvec<int> feature_set;
+  ItemIter items_iter;
+  void Init(const std::vector<float> &in_fvalues,
+            const std::vector<int> &in_foffsets,
+            const std::vector<bst_uint> &in_instance_id,
+            const std::vector<int> &in_feature_id,
+            const std::vector<bst_gpair> &in_gpair, bst_uint n_instances_in,
+            bst_uint n_features_in, int max_depth, const TrainParam &param_in) {
+    n_features = n_features_in;
+    n_instances = n_instances_in;
+    uint32_t max_nodes = (1 << (max_depth + 1)) - 1;
+    uint32_t max_nodes_level = 1 << max_depth;
+    // Calculate memory for sort
+    size_t cub_mem_size = 0;
+    cub::DoubleBuffer<NodeIdT> db_key;
+    cub::DoubleBuffer<int> db_value;
+    cub::DeviceSegmentedRadixSort::SortPairs(
+        cub_mem.data(), cub_mem_size, db_key,
+        db_value, in_fvalues.size(), n_features,
+        foffsets.data(), foffsets.data() + 1);
+    // Allocate memory
+    size_t free_memory = dh::available_memory();
+    ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
+                &fvalues_cached, in_fvalues.size(), &foffsets,
+                in_foffsets.size(), &instance_id, in_instance_id.size(),
+                &instance_id_temp, in_instance_id.size(), &instance_id_cached,
+                in_instance_id.size(), &feature_id, in_feature_id.size(),
+                &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
+                &node_id_instance, n_instances, &gpair, n_instances, &nodes,
+                max_nodes, &split_candidates, max_nodes_level * n_features,
+                &node_sums, max_nodes_level * n_features, &node_offsets,
+                max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
+                &sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size,
+                &feature_flags, n_features, &feature_set, n_features);
+    if (!param_in.silent) {
+      const int mb_size = 1048576;
+      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
+                   << free_memory / mb_size << " MB on " << dh::device_name();
+    }
+    fvalues_cached = in_fvalues;
+    foffsets = in_foffsets;
+    instance_id_cached = in_instance_id;
+    feature_id = in_feature_id;
+    param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda,
+                             param_in.reg_alpha, param_in.max_delta_step);
+    allocated = true;
+    this->Reset(in_gpair, param_in.subsample);
+    items_iter = thrust::make_zip_iterator(thrust::make_tuple(
+        thrust::make_permutation_iterator(gpair.tbegin(), instance_id.tbegin()),
+        fvalues.tbegin(), node_id.tbegin()));
+    dh::safe_cuda(cudaGetLastError());
+  }
+  ~GPUData() {}
+  // Set gradient pair to 0 with p = 1 - subsample
+  void MarkSubsample(float subsample) {
+    if (subsample == 1.0) {
+      return;
+    }
+    auto d_gpair = gpair.data();
+    dh::BernoulliRng rng(subsample, common::GlobalRandom()());
+    dh::launch_n(n_instances, [=] __device__(int i) {
+      if (!rng(i)) {
+        d_gpair[i] = gpu_gpair();
+      }
+    });
+  }
+  // Reset memory for new boosting iteration
+  void Reset(const std::vector<bst_gpair> &in_gpair, float subsample) {
+    CHECK(allocated);
+    gpair = in_gpair;
+    this->MarkSubsample(subsample);
+    instance_id = instance_id_cached;
+    fvalues = fvalues_cached;
+    nodes.fill(Node());
+    node_id_instance.fill(0);
+    node_id.fill(0);
+  }
+  bool IsAllocated() { return allocated; }
+  // Gather from node_id_instance into node_id according to instance_id
+  void GatherNodeId() {
+    // Update node_id for each item
+    auto d_node_id = node_id.data();
+    auto d_node_id_instance = node_id_instance.data();
+    auto d_instance_id = instance_id.data();
+    dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
+      // Item item = d_items[i];
+      d_node_id[i] = d_node_id_instance[d_instance_id[i]];
+    });
+  }
+};
+}  // namespace tree
+}  // namespace xgboost
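
A rough sketch of the lifecycle this struct implies, based only on the code above: `Init` allocates device memory once and caches the immutable arrays, while `Reset` runs at the start of every boosting iteration with that round's gradients, re-drawing the Bernoulli row mask each time. The driver function below is illustrative; in the plugin these calls are owned by `GPUBuilder`:

```cpp
#include <vector>
#include <xgboost/base.h>
#include "gpu_data.cuh"

// Per-iteration step (illustrative): feature arrays stay resident on the
// device across rounds; only the gradient pairs and the subsample mask
// change between boosting iterations.
void boost_one_round(xgboost::tree::GPUData *data,
                     const std::vector<xgboost::bst_gpair> &round_gpair,
                     float subsample) {
  data->Reset(round_gpair, subsample);  // fresh subsample draw per round
  // ... grow one tree: find_split per level, then update node ids ...
}
```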