GPU Plugin: Add subsample, colsample_bytree, colsample_bylevel (#1895)
parent cee4aafb93 · commit b49b339183
@@ -9,10 +9,10 @@ https://www.kaggle.com/c/bosch-production-line-performance/data
 
 Copy train_numeric.csv into xgboost/demo/data.
 
-The subsample parameter can be changed so you can run the script first on a small portion of the data. Processing the entire dataset can take a long time and requires about 8GB of device memory. It is initially set to 0.4, using about 2650/3380MB on a GTX 970.
+The subset parameter changes the proportion of rows loaded from the CSV file. Processing the entire dataset can take a long time and requires about 8GB of device memory. It is initially set to 0.4, using about 2650/3380MB on a GTX 970. Lower the parameter if your device runs out of memory.
 
 ```python
-subsample = 0.4
+subset = 0.4
 ```
 
 Parameters are set as usual except that we set silent to 0 to see how much memory is being allocated on the GPU and we change 'updater' to 'grow_gpu' to activate the GPU plugin.
@@ -5,12 +5,12 @@ import time
 import random
 from sklearn.cross_validation import StratifiedKFold
 
-#For sub sampling rows from input file
+#For sampling rows from input file
 random_seed = 9
-subsample = 0.4
+subset = 0.4
 
 n_rows = 1183747;
-train_rows = int(n_rows * subsample)
+train_rows = int(n_rows * subset)
 random.seed(random_seed)
 skip = sorted(random.sample(xrange(1,n_rows + 1),n_rows-train_rows))
 data = pd.read_csv("../data/train_numeric.csv", index_col=0, dtype=np.float32, skiprows=skip)
@@ -32,8 +32,6 @@ Data is stored in a sparse format. For example, missing values produced by one h
 
 A 4GB graphics card will process approximately 3.5 million rows of the well known Kaggle higgs dataset.
 
-The algorithm will automatically perform row subsampling if it detects there is not enough memory on the device.
-
 ## Dependencies
 A CUDA capable GPU with at least compute capability >= 3.5 (the algorithm depends on shuffle and vote instructions introduced in Kepler).
 
@@ -7,6 +7,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+#include <thrust/random.h>
 #include <algorithm>
 #include <ctime>
 #include <sstream>
@@ -147,6 +148,8 @@ struct Timer {
     LARGE_INTEGER now;
     QueryPerformanceCounter(&now);
     return static_cast<double>(now.QuadPart) / s_frequency.QuadPart;
+#else
+    return 0;
 #endif
   }
 
@@ -160,12 +163,14 @@ struct Timer {
 #ifdef _WIN32
     _ReadWriteBarrier();
     return seconds_now() - start;
+#else
+    return 0;
 #endif
   }
-  void printElapsed(char *label) {
+  void printElapsed(std::string label) {
 #ifdef TIMERS
     safe_cuda(cudaDeviceSynchronize());
-    printf("%s:\t %1.4fs\n", label, elapsed());
+    printf("%s:\t %1.4fs\n", label.c_str(), elapsed());
 #endif
   }
 };
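The printElapsed change above also requires the matching printf fix: %s consumes a char *, so once the label is a std::string it must be passed through .c_str(); handing the string object itself to a variadic function is undefined behaviour. A standalone sketch of the corrected call (the seconds parameter here is a stand-in for the Timer's elapsed()):

```cpp
#include <cstdio>
#include <string>

// %s expects a char *, so the std::string label goes through .c_str().
void printElapsed(std::string label, double seconds) {
  printf("%s:\t %1.4fs\n", label.c_str(), seconds);
}

int main() {
  printElapsed("split", 0.1234);  // prints: split:   0.1234s
  return 0;
}
```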
@@ -233,46 +238,6 @@ template <typename T> __device__ range block_stride_range(T begin, T end) {
   return r;
 }
 
-/*
- * Utility functions
- */
-
-template <typename T>
-void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
-  thrust::host_vector<T> h = v;
-  for (int i = 0; i < std::min(max_items, h.size()); i++) {
-    std::cout << " " << h[i];
-  }
-  std::cout << "\n";
-}
-
-template <typename T>
-void print(char *label, const thrust::device_vector<T> &v,
-           const char *format = "%d ", int max = 10) {
-  thrust::host_vector<T> h_v = v;
-
-  std::cout << label << ":\n";
-  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
-    printf(format, h_v[i]);
-  }
-  std::cout << "\n";
-}
-
-template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
-  return static_cast<T1>(ceil(static_cast<double>(a) / b));
-}
-
-template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
-  return thrust::device_pointer_cast(d_ptr);
-}
-
-template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
-  return raw_pointer_cast(v.data());
-}
-
-template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
-  return sizeof(T) * v.size();
-}
-
 // Threadblock iterates over range, filling with value
 template <typename IterT, typename ValueT>
@@ -306,11 +271,11 @@ template <typename T> class dvec {
 
  public:
   dvec() : _ptr(NULL), _size(0) {}
-  size_t size() { return _size; }
-  bool empty() { return _ptr == NULL || _size == 0; }
+  size_t size() const { return _size; }
+  bool empty() const { return _ptr == NULL || _size == 0; }
   T *data() { return _ptr; }
 
-  std::vector<T> as_vector() {
+  std::vector<T> as_vector() const {
     std::vector<T> h_vector(size());
     safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
                          cudaMemcpyDeviceToHost));
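A side note on the const qualifiers added above: as_vector() const is what allows the new print(const dvec<T> &v) overload, introduced in the next hunk, to copy a const device vector back to the host.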
@@ -454,6 +419,55 @@ inline std::string device_name() {
   return std::string(prop.name);
 }
 
+/*
+ * Utility functions
+ */
+
+template <typename T>
+void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
+  thrust::host_vector<T> h = v;
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+
+template <typename T>
+void print(const dvec<T> &v, size_t max_items = 10) {
+  std::vector<T> h = v.as_vector();
+  for (int i = 0; i < std::min(max_items, h.size()); i++) {
+    std::cout << " " << h[i];
+  }
+  std::cout << "\n";
+}
+
+template <typename T>
+void print(char *label, const thrust::device_vector<T> &v,
+           const char *format = "%d ", int max = 10) {
+  thrust::host_vector<T> h_v = v;
+
+  std::cout << label << ":\n";
+  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
+    printf(format, h_v[i]);
+  }
+  std::cout << "\n";
+}
+
+template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
+  return static_cast<T1>(ceil(static_cast<double>(a) / b));
+}
+
+template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
+  return thrust::device_pointer_cast(d_ptr);
+}
+
+template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
+  return raw_pointer_cast(v.data());
+}
+
+template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
+  return sizeof(T) * v.size();
+}
 /*
  * Kernel launcher
  */
@@ -470,4 +484,25 @@ inline void launch_n(size_t n, L lambda) {
 
   launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
 }
 
+/*
+ * Random
+ */
+
+struct BernoulliRng {
+  float p;
+  int seed;
+
+  __host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}
+
+  __host__ __device__ bool operator()(const int i) const {
+    thrust::default_random_engine rng(seed);
+    thrust::uniform_real_distribution<float> dist;
+    rng.discard(i);
+
+    return dist(rng) <= p;
+  }
+};
+
+
 } // namespace dh
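BernoulliRng is a counter-based generator: the engine is constructed and seeded identically in every thread, and discard(i) jumps it to a position unique to element i, so each index gets a reproducible coin flip without any per-thread RNG state held in memory. A minimal standalone sketch of the same pattern (assumes nvcc with Thrust; the functor mirrors dh::BernoulliRng above):

```cpp
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>
#include <iostream>

struct Bernoulli {
  float p;
  int seed;
  __host__ __device__ bool operator()(int i) const {
    thrust::default_random_engine rng(seed);       // same seed everywhere
    thrust::uniform_real_distribution<float> dist;
    rng.discard(i);                                // jump to element i's draw
    return dist(rng) <= p;
  }
};

int main() {
  const int n = 8;
  thrust::device_vector<int> idx(n);
  thrust::sequence(idx.begin(), idx.end());        // 0, 1, ..., n-1
  thrust::device_vector<bool> keep(n);
  thrust::transform(idx.begin(), idx.end(), keep.begin(), Bernoulli{0.5f, 9});
  for (int i = 0; i < n; ++i) std::cout << keep[i] << " ";
  std::cout << "\n";
  return 0;
}
```

The new gpu_data.cuh at the end of this diff uses exactly this functor shape in MarkSubsample to decide which gradient pairs to zero.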
@@ -4,9 +4,11 @@
 #pragma once
 #include <cub/cub.cuh>
 #include <xgboost/base.h>
+#include <vector>
 #include "device_helpers.cuh"
 #include "find_split_multiscan.cuh"
 #include "find_split_sorting.cuh"
+#include "gpu_data.cuh"
 #include "types_functions.cuh"
 
 namespace xgboost {
@@ -62,24 +64,47 @@ void reduce_split_candidates(Split *d_split_candidates, Node *d_nodes,
   dh::safe_cuda(cudaDeviceSynchronize());
 }
 
-void find_split(const ItemIter items_iter, Split *d_split_candidates,
-                Node *d_nodes, bst_uint num_items, int num_features,
-                const int *d_feature_offsets, gpu_gpair *d_node_sums,
-                int *d_node_offsets, const GPUTrainingParam param,
-                const int level, bool multiscan_algorithm) {
+void colsample_level(GPUData *data, const TrainParam xgboost_param,
+                     const std::vector<int> &feature_set_tree,
+                     std::vector<int> *feature_set_level) {
+  unsigned n_bytree =
+      static_cast<unsigned>(xgboost_param.colsample_bytree * data->n_features);
+  unsigned n =
+      static_cast<unsigned>(n_bytree * xgboost_param.colsample_bylevel);
+  CHECK_GT(n, 0);
+
+  *feature_set_level = feature_set_tree;
+
+  std::shuffle((*feature_set_level).begin(),
+               (*feature_set_level).begin() + n_bytree, common::GlobalRandom());
+
+  data->feature_set = *feature_set_level;
+
+  data->feature_flags.fill(0);
+  auto d_feature_set = data->feature_set.data();
+  auto d_feature_flags = data->feature_flags.data();
+
+  dh::launch_n(
+      n, [=] __device__(int i) { d_feature_flags[d_feature_set[i]] = 1; });
+}
+
+void find_split(GPUData *data, const TrainParam xgboost_param, const int level,
+                bool multiscan_algorithm,
+                const std::vector<int> &feature_set_tree,
+                std::vector<int> *feature_set_level) {
+  colsample_level(data, xgboost_param, feature_set_tree, feature_set_level);
+  // Reset split candidates
+  data->split_candidates.fill(Split());
+
   if (multiscan_algorithm) {
-    find_split_candidates_multiscan(items_iter, d_split_candidates, d_nodes,
-                                    num_items, num_features, d_feature_offsets,
-                                    param, level);
+    find_split_candidates_multiscan(data, level);
   } else {
-    find_split_candidates_sorted(items_iter, d_split_candidates, d_nodes,
-                                 num_items, num_features, d_feature_offsets,
-                                 d_node_sums, d_node_offsets, param, level);
+    find_split_candidates_sorted(data, level);
   }
 
   // Find the best split for each node
-  reduce_split_candidates(d_split_candidates, d_nodes, level, num_features,
-                          param);
+  reduce_split_candidates(data->split_candidates.data(), data->nodes.data(),
+                          level, data->n_features, data->param);
 }
 } // namespace tree
 } // namespace xgboost
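colsample_level composes the two sampling parameters: colsample_bytree fixes a per-tree subset of n_bytree features, and colsample_bylevel then draws n of those for the current level by reshuffling the prefix of the tree-level set. With 100 features and both parameters at 0.5, for instance, each tree sees 50 features and each level 25 of them. A host-only sketch of the same selection logic (plain C++; values are illustrative and std::mt19937 stands in for common::GlobalRandom()):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
  const int n_features = 100;            // illustrative values
  const float colsample_bytree = 0.5f;
  const float colsample_bylevel = 0.5f;

  // Per tree: shuffle all feature indices once and keep the first n_bytree
  // (mirrors GPUBuilder::ColsampleTree later in this diff).
  unsigned n_bytree = static_cast<unsigned>(colsample_bytree * n_features);
  std::vector<int> features(n_features);
  std::iota(features.begin(), features.end(), 0);
  std::mt19937 rng(0);
  std::shuffle(features.begin(), features.end(), rng);

  // Per level: reshuffle within the tree-level prefix and keep the first
  // n = n_bytree * colsample_bylevel entries (mirrors colsample_level above).
  unsigned n = static_cast<unsigned>(n_bytree * colsample_bylevel);
  std::shuffle(features.begin(), features.begin() + n_bytree, rng);

  std::printf("features per tree: %u, per level: %u\n", n_bytree, n);  // 50, 25
  return 0;
}
```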
@@ -5,6 +5,7 @@
 #include <cub/cub.cuh>
 #include <xgboost/base.h>
 #include "device_helpers.cuh"
+#include "gpu_data.cuh"
 #include "types_functions.cuh"
 
 namespace xgboost {
@@ -609,22 +610,11 @@
     }
   }
 
-  __device__ __forceinline__ void ResetSplitCandidates() {
-    const int max_nodes = 1 << level;
-    const int begin = blockIdx.x * max_nodes;
-    const int end = begin + max_nodes;
-
-    for (auto i : dh::block_stride_range(begin, end)) {
-      d_split_candidates_out[i] = Split();
-    }
-  }
-
   __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                 const bst_uint &segment_end) {
     // Current position
     bst_uint offset = segment_begin;
 
-    ResetSplitCandidates();
     ResetTileCarry();
     ResetSplits();
     CacheNodes();
@@ -654,8 +644,9 @@
     const ItemIter items_iter, Split *d_split_candidates_out,
     const Node *d_nodes, const int node_begin, bst_uint num_items,
     int num_features, const int *d_feature_offsets,
-    const GPUTrainingParam param, const int level) {
-  if (num_items <= 0) {
+    const GPUTrainingParam param, const int *d_feature_flags,
+    const int level) {
+  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
     return;
   }
 
@@ -685,69 +676,45 @@
 }
 
 template <int N_NODES>
-void find_split_candidates_multiscan_variation(
-    const ItemIter items_iter, Split *d_split_candidates, const Node *d_nodes,
-    int node_begin, int node_end, bst_uint num_items, int num_features,
-    const int *d_feature_offsets, const GPUTrainingParam param,
-    const int level) {
+void find_split_candidates_multiscan_variation(GPUData *data, const int level) {
+  const int node_begin = (1 << level) - 1;
   const int BLOCK_THREADS = 512;
 
-  CHECK((node_end - node_begin) <= N_NODES) << "Multiscan: N_NODES template "
-                                               "parameter too small for given "
-                                               "node range.";
   CHECK(BLOCK_THREADS / 32 < 32)
       << "Too many active warps. See FindSplitEnactor - ReduceSplits.";
 
   typedef FindSplitParamsMultiscan<BLOCK_THREADS, N_NODES, false>
      find_split_params;
   typedef ReduceParamsMultiscan<BLOCK_THREADS, N_NODES, false> reduce_params;
-  int grid_size = num_features;
+  int grid_size = data->n_features;
 
   find_split_candidates_multiscan_kernel<
       find_split_params,
       reduce_params><<<grid_size, find_split_params::BLOCK_THREADS>>>(
-      items_iter, d_split_candidates, d_nodes, node_begin, num_items,
-      num_features, d_feature_offsets, param, level);
+      data->items_iter, data->split_candidates.data(), data->nodes.data(),
+      node_begin, data->fvalues.size(), data->n_features, data->foffsets.data(),
+      data->param, data->feature_flags.data(), level);
 
   dh::safe_cuda(cudaDeviceSynchronize());
 }
 
-void find_split_candidates_multiscan(
-    const ItemIter items_iter, Split *d_split_candidates, const Node *d_nodes,
-    bst_uint num_items, int num_features, const int *d_feature_offsets,
-    const GPUTrainingParam param, const int level) {
+void find_split_candidates_multiscan(GPUData *data, const int level) {
   // Select templated variation of split finding algorithm
   switch (level) {
   case 0:
-    find_split_candidates_multiscan_variation<1>(
-        items_iter, d_split_candidates, d_nodes, 0, 1, num_items, num_features,
-        d_feature_offsets, param, level);
+    find_split_candidates_multiscan_variation<1>(data, level);
     break;
   case 1:
-    find_split_candidates_multiscan_variation<2>(
-        items_iter, d_split_candidates, d_nodes, 1, 3, num_items, num_features,
-        d_feature_offsets, param, level);
+    find_split_candidates_multiscan_variation<2>(data, level);
     break;
   case 2:
-    find_split_candidates_multiscan_variation<4>(
-        items_iter, d_split_candidates, d_nodes, 3, 7, num_items, num_features,
-        d_feature_offsets, param, level);
+    find_split_candidates_multiscan_variation<4>(data, level);
     break;
   case 3:
-    find_split_candidates_multiscan_variation<8>(
-        items_iter, d_split_candidates, d_nodes, 7, 15, num_items, num_features,
-        d_feature_offsets, param, level);
+    find_split_candidates_multiscan_variation<8>(data, level);
     break;
   case 4:
-    find_split_candidates_multiscan_variation<16>(
-        items_iter, d_split_candidates, d_nodes, 15, 31, num_items,
-        num_features, d_feature_offsets, param, level);
-    break;
-  case 5:
-    find_split_candidates_multiscan_variation<32>(
-        items_iter, d_split_candidates, d_nodes, 31, 63, num_items,
-        num_features, d_feature_offsets, param, level);
+    find_split_candidates_multiscan_variation<16>(data, level);
     break;
   }
 }
@@ -337,17 +337,8 @@
     WriteBestSplit(node_id_adjusted);
   }
 
-  __device__ __forceinline__ void ResetSplitCandidates() {
-    const int max_nodes = 1 << level;
-    const int begin = blockIdx.x * max_nodes;
-
-    dh::block_fill(d_split_candidates_out + begin, max_nodes, Split());
-  }
-
   __device__ __forceinline__ void ProcessFeature(const bst_uint &segment_begin,
                                                  const bst_uint &segment_end) {
-    ResetSplitCandidates();
-
     int node_begin = segment_begin;
 
     const int max_nodes = 1 << level;
@@ -377,9 +368,9 @@ __global__ __launch_bounds__(1024, 1) void find_split_candidates_sorted_kernel(
     const ItemIter items_iter, Split *d_split_candidates_out,
     const Node *d_nodes, bst_uint num_items, const int num_features,
     const int *d_feature_offsets, gpu_gpair *d_node_sums, int *d_node_offsets,
-    const GPUTrainingParam param, const int level) {
+    const GPUTrainingParam param, const int *d_feature_flags, const int level) {
 
-  if (num_items <= 0) {
+  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
     return;
   }
 
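In both kernels the grid is one block per feature (grid_size is the feature count), which is why column sampling can be applied with a single flag lookup: a block whose feature is not in the level's sample returns before doing any work. A standalone sketch of that early-exit pattern (assumes nvcc; the kernel and flag names are illustrative, not from the commit):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// One block per feature: a block whose feature is sampled out exits at once.
__global__ void per_feature_kernel(const int *d_feature_flags) {
  if (d_feature_flags[blockIdx.x] != 1) {
    return;  // the whole block skips this feature
  }
  if (threadIdx.x == 0) printf("processing feature %d\n", blockIdx.x);
}

int main() {
  const int n_features = 4;
  int h_flags[n_features] = {1, 0, 1, 0};  // illustrative level sample
  int *d_flags;
  cudaMalloc(&d_flags, sizeof(h_flags));
  cudaMemcpy(d_flags, h_flags, sizeof(h_flags), cudaMemcpyHostToDevice);
  per_feature_kernel<<<n_features, 32>>>(d_flags);
  cudaDeviceSynchronize();
  cudaFree(d_flags);
  return 0;
}
```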
@@ -408,23 +399,19 @@
       .ProcessFeature(segment_begin, segment_end);
 }
 
-void find_split_candidates_sorted(const ItemIter items_iter,
-                                  Split *d_split_candidates, Node *d_nodes,
-                                  bst_uint num_items, int num_features,
-                                  const int *d_feature_offsets,
-                                  gpu_gpair *d_node_sums, int *d_node_offsets,
-                                  const GPUTrainingParam param,
-                                  const int level) {
+void find_split_candidates_sorted(GPUData *data, const int level) {
   const int BLOCK_THREADS = 512;
 
   CHECK(BLOCK_THREADS / 32 < 32) << "Too many active warps.";
 
-  int grid_size = num_features;
+  int grid_size = data->n_features;
 
   find_split_candidates_sorted_kernel<
       BLOCK_THREADS><<<grid_size, BLOCK_THREADS>>>(
-      items_iter, d_split_candidates, d_nodes, num_items, num_features,
-      d_feature_offsets, d_node_sums, d_node_offsets, param, level);
+      data->items_iter, data->split_candidates.data(), data->nodes.data(),
+      data->fvalues.size(), data->n_features,
+      data->foffsets.data(), data->node_sums.data(), data->node_offsets.data(),
+      data->param, data->feature_flags.data(), level);
 
   dh::safe_cuda(cudaGetLastError());
   dh::safe_cuda(cudaDeviceSynchronize());
@@ -12,143 +12,17 @@
 #include <thrust/sequence.h>
 #include <algorithm>
 #include <random>
+#include <numeric>
 #include <vector>
 #include "../../../src/common/random.h"
 #include "device_helpers.cuh"
 #include "find_split.cuh"
 #include "gpu_builder.cuh"
 #include "types_functions.cuh"
+#include "gpu_data.cuh"
 
 namespace xgboost {
 namespace tree {
-struct GPUData {
-  GPUData() : allocated(false), n_features(0), n_instances(0) {}
-
-  bool allocated;
-  int n_features;
-  int n_instances;
-
-  dh::bulk_allocator ba;
-  GPUTrainingParam param;
-
-  dh::dvec<float> fvalues;
-  dh::dvec<float> fvalues_temp;
-  dh::dvec<float> fvalues_cached;
-  dh::dvec<int> foffsets;
-  dh::dvec<bst_uint> instance_id;
-  dh::dvec<bst_uint> instance_id_temp;
-  dh::dvec<bst_uint> instance_id_cached;
-  dh::dvec<int> feature_id;
-  dh::dvec<NodeIdT> node_id;
-  dh::dvec<NodeIdT> node_id_temp;
-  dh::dvec<NodeIdT> node_id_instance;
-  dh::dvec<gpu_gpair> gpair;
-  dh::dvec<Node> nodes;
-  dh::dvec<Split> split_candidates;
-  dh::dvec<gpu_gpair> node_sums;
-  dh::dvec<int> node_offsets;
-  dh::dvec<int> sort_index_in;
-  dh::dvec<int> sort_index_out;
-
-  dh::dvec<char> cub_mem;
-
-  ItemIter items_iter;
-
-  void Init(const std::vector<float> &in_fvalues,
-            const std::vector<int> &in_foffsets,
-            const std::vector<bst_uint> &in_instance_id,
-            const std::vector<int> &in_feature_id,
-            const std::vector<bst_gpair> &in_gpair, bst_uint n_instances_in,
-            bst_uint n_features_in, int max_depth, const TrainParam &param_in) {
-    n_features = n_features_in;
-    n_instances = n_instances_in;
-
-    uint32_t max_nodes = (1 << (max_depth + 1)) - 1;
-    uint32_t max_nodes_level = 1 << max_depth;
-
-    // Calculate memory for sort
-    size_t cub_mem_size = 0;
-    cub::DoubleBuffer<NodeIdT> db_key;
-    cub::DoubleBuffer<int> db_value;
-
-    cub::DeviceSegmentedRadixSort::SortPairs(
-        cub_mem.data(), cub_mem_size, db_key,
-        db_value, in_fvalues.size(), n_features,
-        foffsets.data(), foffsets.data() + 1);
-
-    // Allocate memory
-    size_t free_memory = dh::available_memory();
-    ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
-                &fvalues_cached, in_fvalues.size(), &foffsets,
-                in_foffsets.size(), &instance_id, in_instance_id.size(),
-                &instance_id_temp, in_instance_id.size(), &instance_id_cached,
-                in_instance_id.size(), &feature_id, in_feature_id.size(),
-                &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
-                &node_id_instance, n_instances, &gpair, n_instances, &nodes,
-                max_nodes, &split_candidates, max_nodes_level * n_features,
-                &node_sums, max_nodes_level * n_features, &node_offsets,
-                max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
-                &sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size);
-
-    if (!param_in.silent) {
-      const int mb_size = 1048576;
-      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
-                   << free_memory / mb_size << " MB on " << dh::device_name();
-    }
-    node_id.fill(0);
-    node_id_instance.fill(0);
-
-    fvalues = in_fvalues;
-    fvalues_cached = fvalues;
-    foffsets = in_foffsets;
-    instance_id = in_instance_id;
-    instance_id_cached = instance_id;
-    feature_id = in_feature_id;
-
-    param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda,
-                             param_in.reg_alpha, param_in.max_delta_step);
-
-    gpair = in_gpair;
-
-    nodes.fill(Node());
-
-    items_iter = thrust::make_zip_iterator(thrust::make_tuple(
-        thrust::make_permutation_iterator(gpair.tbegin(), instance_id.tbegin()),
-        fvalues.tbegin(), node_id.tbegin()));
-
-    allocated = true;
-
-    dh::safe_cuda(cudaGetLastError());
-  }
-
-  ~GPUData() {}
-
-  // Reset memory for new boosting iteration
-  void Reset(const std::vector<bst_gpair> &in_gpair) {
-    CHECK(allocated);
-    gpair = in_gpair;
-    instance_id = instance_id_cached;
-    fvalues = fvalues_cached;
-    nodes.fill(Node());
-    node_id_instance.fill(0);
-    node_id.fill(0);
-  }
-
-  bool IsAllocated() { return allocated; }
-
-  // Gather from node_id_instance into node_id according to instance_id
-  void GatherNodeId() {
-    // Update node_id for each item
-    auto d_node_id = node_id.data();
-    auto d_node_id_instance = node_id_instance.data();
-    auto d_instance_id = instance_id.data();
-
-    dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
-      // Item item = d_items[i];
-      d_node_id[i] = d_node_id_instance[d_instance_id[i]];
-    });
-  }
-};
-
 GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); }
 
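The GPUData struct removed here is not deleted outright: it moves, extended with the feature_flags and feature_set vectors and device-side subsampling, into the new header gpu_data.cuh shown at the end of this diff.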
@@ -253,15 +127,26 @@ void GPUBuilder::Sort(int level) {
   }
 }
 
+void GPUBuilder::ColsampleTree() {
+  unsigned n = static_cast<unsigned>(
+      param.colsample_bytree * gpu_data->n_features);
+  CHECK_GT(n, 0);
+
+  feature_set_tree.resize(gpu_data->n_features);
+  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
+  std::shuffle(feature_set_tree.begin(), feature_set_tree.end(),
+               common::GlobalRandom());
+}
+
 void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
                         RegTree *p_tree) {
-  cudaProfilerStart();
   try {
     dh::Timer update;
     dh::Timer t;
     this->InitData(gpair, *p_fmat, *p_tree);
     t.printElapsed("init data");
     this->InitFirstNode();
+    this->ColsampleTree();
 
     for (int level = 0; level < param.max_depth; level++) {
       bool use_multiscan_algorithm = level < multiscan_levels;
@@ -280,11 +165,8 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
       }
 
       dh::Timer split;
-      find_split(gpu_data->items_iter, gpu_data->split_candidates.data(),
-                 gpu_data->nodes.data(), (bst_uint)gpu_data->fvalues.size(),
-                 gpu_data->n_features, gpu_data->foffsets.data(),
-                 gpu_data->node_sums.data(), gpu_data->node_offsets.data(),
-                 gpu_data->param, level, use_multiscan_algorithm);
+      find_split(gpu_data, param, level, use_multiscan_algorithm,
+                 feature_set_tree, &feature_set_level);
 
       split.printElapsed("split");
 
@@ -302,22 +184,6 @@ void GPUBuilder::Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
     std::cerr << "Unknown exception." << std::endl;
     exit(-1);
   }
-  cudaProfilerStop();
-}
-
-float GPUBuilder::GetSubsamplingRate(MetaInfo info) {
-  float subsample = 1.0;
-  uint32_t max_nodes = (1 << (param.max_depth + 1)) - 1;
-  uint32_t max_nodes_level = 1 << param.max_depth;
-  size_t required = 10 * info.num_row + 40 * info.num_nonzero
-                    + 64 * max_nodes + 76 * max_nodes_level * info.num_col;
-  size_t available = dh::available_memory();
-  while (available < required) {
-    subsample -= 0.05;
-    required = 10 * info.num_row + subsample * (44 * info.num_nonzero);
-  }
-
-  return subsample;
 }
 
 void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
@@ -325,7 +191,7 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
   CHECK(fmat.SingleColBlock()) << "GPUMaker: must have single column block";
 
   if (gpu_data->IsAllocated()) {
-    gpu_data->Reset(gpair);
+    gpu_data->Reset(gpair, param.subsample);
     return;
   }
 
@@ -333,35 +199,6 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
 
   MetaInfo info = fmat.info();
 
-  // Work out if dataset will fit on GPU
-  float subsample = this->GetSubsamplingRate(info);
-  CHECK(subsample > 0.0);
-  if (!param.silent && subsample < param.subsample) {
-    LOG(CONSOLE) << "Not enough device memory for entire dataset.";
-  }
-
-  // Override subsample parameter if user-specified parameter is lower
-  subsample = std::min(param.subsample, subsample);
-
-  std::vector<bool> row_flags;
-
-  if (subsample < 1.0) {
-    if (!param.silent && subsample < 1.0) {
-      LOG(CONSOLE) << "Subsampling " << subsample * 100 << "% of rows.";
-    }
-
-    const RowSet &rowset = fmat.buffered_rowset();
-    row_flags.resize(info.num_row);
-    std::bernoulli_distribution coin_flip(subsample);
-    auto &rnd = common::GlobalRandom();
-    for (size_t i = 0; i < rowset.size(); ++i) {
-      const bst_uint ridx = rowset[i];
-      if (gpair[ridx].hess < 0.0f)
-        continue;
-      row_flags[ridx] = coin_flip(rnd);
-    }
-  }
-
   std::vector<int> foffsets;
   foffsets.push_back(0);
   std::vector<int> feature_id;
@@ -382,18 +219,10 @@ void GPUBuilder::InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,
       for (const ColBatch::Entry *it = col.data; it != col.data + col.length;
            it++) {
         bst_uint inst_id = it->index;
-        if (subsample < 1.0) {
-          if (row_flags[inst_id]) {
-            fvalues.push_back(it->fvalue);
-            instance_id.push_back(inst_id);
-            feature_id.push_back(i);
-          }
-        } else {
-          fvalues.push_back(it->fvalue);
-          instance_id.push_back(inst_id);
-          feature_id.push_back(i);
-        }
+        fvalues.push_back(it->fvalue);
+        instance_id.push_back(inst_id);
+        feature_id.push_back(i);
       }
       foffsets.push_back(fvalues.size());
     }
   }
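Together with the Reset(gpair, param.subsample) change above, these hunks remove load-time row subsampling entirely: InitData now always copies the full dataset to the device, and the subsample parameter is applied afresh at every boosting iteration by GPUData::MarkSubsample (see gpu_data.cuh below), which zeroes sampled-out gradient pairs instead of dropping rows.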
@@ -23,6 +23,7 @@ class GPUBuilder {
                RegTree *p_tree);
 
   void UpdateNodeId(int level);
+
  private:
   void InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat, // NOLINT
                 const RegTree &tree);
@@ -31,9 +32,12 @@ class GPUBuilder {
   void Sort(int level);
   void InitFirstNode();
   void CopyTree(RegTree &tree); // NOLINT
+  void ColsampleTree();
 
   TrainParam param;
   GPUData *gpu_data;
+  std::vector<int> feature_set_tree;
+  std::vector<int> feature_set_level;
 
   int multiscan_levels =
       5; // Number of levels before switching to sorting algorithm
plugin/updater_gpu/src/gpu_data.cuh (new file, 162 lines)
@@ -0,0 +1,162 @@
+/*!
+ * Copyright 2016 Rory mitchell
+ */
+#pragma once
+#include <cub/cub.cuh>
+#include <xgboost/logging.h>
+#include <thrust/sequence.h>
+#include <vector>
+#include "device_helpers.cuh"
+#include "../../src/tree/param.h"
+#include "types_functions.cuh"
+
+namespace xgboost {
+namespace tree {
+
+struct GPUData {
+  GPUData() : allocated(false), n_features(0), n_instances(0) {}
+
+  bool allocated;
+  int n_features;
+  int n_instances;
+
+  dh::bulk_allocator ba;
+  GPUTrainingParam param;
+
+  dh::dvec<float> fvalues;
+  dh::dvec<float> fvalues_temp;
+  dh::dvec<float> fvalues_cached;
+  dh::dvec<int> foffsets;
+  dh::dvec<bst_uint> instance_id;
+  dh::dvec<bst_uint> instance_id_temp;
+  dh::dvec<bst_uint> instance_id_cached;
+  dh::dvec<int> feature_id;
+  dh::dvec<NodeIdT> node_id;
+  dh::dvec<NodeIdT> node_id_temp;
+  dh::dvec<NodeIdT> node_id_instance;
+  dh::dvec<gpu_gpair> gpair;
+  dh::dvec<Node> nodes;
+  dh::dvec<Split> split_candidates;
+  dh::dvec<gpu_gpair> node_sums;
+  dh::dvec<int> node_offsets;
+  dh::dvec<int> sort_index_in;
+  dh::dvec<int> sort_index_out;
+
+  dh::dvec<char> cub_mem;
+
+  dh::dvec<int> feature_flags;
+  dh::dvec<int> feature_set;
+
+  ItemIter items_iter;
+
+  void Init(const std::vector<float> &in_fvalues,
+            const std::vector<int> &in_foffsets,
+            const std::vector<bst_uint> &in_instance_id,
+            const std::vector<int> &in_feature_id,
+            const std::vector<bst_gpair> &in_gpair, bst_uint n_instances_in,
+            bst_uint n_features_in, int max_depth, const TrainParam &param_in) {
+    n_features = n_features_in;
+    n_instances = n_instances_in;
+
+    uint32_t max_nodes = (1 << (max_depth + 1)) - 1;
+    uint32_t max_nodes_level = 1 << max_depth;
+
+    // Calculate memory for sort
+    size_t cub_mem_size = 0;
+    cub::DoubleBuffer<NodeIdT> db_key;
+    cub::DoubleBuffer<int> db_value;
+    cub::DeviceSegmentedRadixSort::SortPairs(
+        cub_mem.data(), cub_mem_size, db_key,
+        db_value, in_fvalues.size(), n_features,
+        foffsets.data(), foffsets.data() + 1);
+
+    // Allocate memory
+    size_t free_memory = dh::available_memory();
+    ba.allocate(&fvalues, in_fvalues.size(), &fvalues_temp, in_fvalues.size(),
+                &fvalues_cached, in_fvalues.size(), &foffsets,
+                in_foffsets.size(), &instance_id, in_instance_id.size(),
+                &instance_id_temp, in_instance_id.size(), &instance_id_cached,
+                in_instance_id.size(), &feature_id, in_feature_id.size(),
+                &node_id, in_fvalues.size(), &node_id_temp, in_fvalues.size(),
+                &node_id_instance, n_instances, &gpair, n_instances, &nodes,
+                max_nodes, &split_candidates, max_nodes_level * n_features,
+                &node_sums, max_nodes_level * n_features, &node_offsets,
+                max_nodes_level * n_features, &sort_index_in, in_fvalues.size(),
+                &sort_index_out, in_fvalues.size(), &cub_mem, cub_mem_size,
+                &feature_flags, n_features, &feature_set, n_features);
+
+    if (!param_in.silent) {
+      const int mb_size = 1048576;
+      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
+                   << free_memory / mb_size << " MB on " << dh::device_name();
+    }
+
+    fvalues_cached = in_fvalues;
+    foffsets = in_foffsets;
+    instance_id_cached = in_instance_id;
+    feature_id = in_feature_id;
+
+    param = GPUTrainingParam(param_in.min_child_weight, param_in.reg_lambda,
+                             param_in.reg_alpha, param_in.max_delta_step);
+
+    allocated = true;
+
+    this->Reset(in_gpair, param_in.subsample);
+
+    items_iter = thrust::make_zip_iterator(thrust::make_tuple(
+        thrust::make_permutation_iterator(gpair.tbegin(), instance_id.tbegin()),
+        fvalues.tbegin(), node_id.tbegin()));
+
+    dh::safe_cuda(cudaGetLastError());
+  }
+
+  ~GPUData() {}
+
+  // Set gradient pair to 0 with p = 1 - subsample
+  void MarkSubsample(float subsample) {
+    if (subsample == 1.0) {
+      return;
+    }
+
+    auto d_gpair = gpair.data();
+    dh::BernoulliRng rng(subsample, common::GlobalRandom()());
+
+    dh::launch_n(n_instances, [=] __device__(int i) {
+      if (!rng(i)) {
+        d_gpair[i] = gpu_gpair();
+      }
+    });
+  }
+
+  // Reset memory for new boosting iteration
+  void Reset(const std::vector<bst_gpair> &in_gpair, float subsample) {
+    CHECK(allocated);
+    gpair = in_gpair;
+    this->MarkSubsample(subsample);
+    instance_id = instance_id_cached;
+    fvalues = fvalues_cached;
+    nodes.fill(Node());
+    node_id_instance.fill(0);
+    node_id.fill(0);
+  }
+
+  bool IsAllocated() { return allocated; }
+
+  // Gather from node_id_instance into node_id according to instance_id
+  void GatherNodeId() {
+    // Update node_id for each item
+    auto d_node_id = node_id.data();
+    auto d_node_id_instance = node_id_instance.data();
+    auto d_instance_id = instance_id.data();
+
+    dh::launch_n(fvalues.size(), [=] __device__(bst_uint i) {
+      // Item item = d_items[i];
+      d_node_id[i] = d_node_id_instance[d_instance_id[i]];
+    });
+  }
+};
+} // namespace tree
+} // namespace xgboost
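MarkSubsample implements subsampling by zeroing gradient pairs in place rather than removing rows, so device buffer sizes stay fixed while zeroed rows contribute nothing to the gradient and hessian sums that drive split selection. A CPU-only analogue of the idea (illustrative; std::bernoulli_distribution stands in for dh::BernoulliRng):

```cpp
#include <iostream>
#include <random>
#include <vector>

struct gpair { float grad, hess; };

int main() {
  // Ten rows, each starting with a nonzero gradient pair.
  std::vector<gpair> g(10, gpair{1.f, 1.f});
  const float subsample = 0.5f;  // illustrative value
  std::mt19937 rng(9);
  std::bernoulli_distribution keep(subsample);

  // Zero a pair with probability 1 - subsample; the row itself stays put.
  for (auto &p : g)
    if (!keep(rng)) p = gpair{0.f, 0.f};

  // Zeroed rows drop out of any downstream gradient/hessian sums.
  float hess_sum = 0.f;
  for (const auto &p : g) hess_sum += p.hess;
  std::cout << "active rows: " << hess_sum << " of " << g.size() << "\n";
  return 0;
}
```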