[GPU-Plugin] Major refactor (#2644)
* Removal of redundant code/files. * Removal of exact namespace in GPU plugin * Revert double precision histograms to single precision for performance on Maxwell/Kepler
This commit is contained in:
@@ -2,13 +2,11 @@
|
||||
* Copyright 2017 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <xgboost/logging.h>
|
||||
#include <thrust/binary_search.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/random.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system/cuda/execution_policy.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <xgboost/logging.h>
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
@@ -20,7 +18,6 @@
|
||||
#include "nccl.h"
|
||||
|
||||
// Uncomment to enable
|
||||
// #define DEVICE_TIMER
|
||||
#define TIMERS
|
||||
|
||||
namespace dh {
|
||||
@@ -61,25 +58,6 @@ inline ncclResult_t throw_on_nccl_error(ncclResult_t code, const char *file,
|
||||
return code;
|
||||
}
|
||||
|
||||
#define gpuErrchk(ans) \
|
||||
{ gpuAssert((ans), __FILE__, __LINE__); }
|
||||
|
||||
inline void gpuAssert(cudaError_t code, const char *file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort){
|
||||
std::stringstream ss;
|
||||
ss << file << "(" << line << ")";
|
||||
std::string file_and_line;
|
||||
ss >> file_and_line;
|
||||
throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline int n_visible_devices() {
|
||||
int n_visgpus = 0;
|
||||
|
||||
@@ -237,7 +215,8 @@ __device__ range block_stride_range(T begin, T end) {
|
||||
return r;
|
||||
}
|
||||
|
||||
// Threadblock iterates over range, filling with value. Requires all threads in block to be active.
|
||||
// Threadblock iterates over range, filling with value. Requires all threads in
|
||||
// block to be active.
|
||||
template <typename IterT, typename ValueT>
|
||||
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
|
||||
@@ -485,7 +464,7 @@ class bulk_allocator {
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
void allocate(int device_idx, bool silent ,Args... args) {
|
||||
void allocate(int device_idx, bool silent, Args... args) {
|
||||
size_t size = get_size_bytes(args...);
|
||||
|
||||
char *ptr = allocate_device(device_idx, size, MemoryT);
|
||||
@@ -496,8 +475,7 @@ class bulk_allocator {
|
||||
_size.push_back(size);
|
||||
_device_idx.push_back(device_idx);
|
||||
|
||||
if(!silent)
|
||||
{
|
||||
if (!silent) {
|
||||
const int mb_size = 1048576;
|
||||
LOG(CONSOLE) << "Allocated " << size / mb_size << "MB on [" << device_idx
|
||||
<< "] " << device_name(device_idx) << ", "
|
||||
@@ -545,7 +523,6 @@ struct CubMemory {
|
||||
bool IsAllocated() { return d_temp_storage != NULL; }
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Utility functions
|
||||
*/
|
||||
@@ -653,24 +630,6 @@ inline void multi_launch_n(size_t n, int n_devices, L lambda) {
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Random
|
||||
*/
|
||||
|
||||
struct BernoulliRng {
|
||||
float p;
|
||||
int seed;
|
||||
|
||||
__host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}
|
||||
|
||||
__host__ __device__ bool operator()(const int i) const {
|
||||
thrust::default_random_engine rng(seed);
|
||||
thrust::uniform_real_distribution<float> dist;
|
||||
rng.discard(i);
|
||||
return dist(rng) <= p;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Helper macro to measure timing on GPU
|
||||
* @param call the GPU call
|
||||
@@ -687,9 +646,9 @@ struct BernoulliRng {
|
||||
// Load balancing search
|
||||
|
||||
template <typename coordinate_t, typename segments_t, typename offset_t>
|
||||
void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates, int num_tiles,
|
||||
int tile_size, segments_t segments, offset_t num_rows,
|
||||
offset_t num_elements) {
|
||||
void FindMergePartitions(int device_idx, coordinate_t *d_tile_coordinates,
|
||||
int num_tiles, int tile_size, segments_t segments,
|
||||
offset_t num_rows, offset_t num_elements) {
|
||||
dh::launch_n(device_idx, num_tiles + 1, [=] __device__(int idx) {
|
||||
offset_t diagonal = idx * tile_size;
|
||||
coordinate_t tile_coordinate;
|
||||
@@ -761,8 +720,9 @@ __global__ void LbsKernel(coordinate_t *d_coordinates,
|
||||
}
|
||||
|
||||
template <typename func_t, typename segments_iter, typename offset_t>
|
||||
void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
segments_iter segments, offset_t num_segments, func_t f) {
|
||||
void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
|
||||
offset_t count, segments_iter segments,
|
||||
offset_t num_segments, func_t f) {
|
||||
typedef typename cub::CubVector<offset_t, 2>::Type coordinate_t;
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
const int BLOCK_THREADS = 256;
|
||||
@@ -774,8 +734,8 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t cou
|
||||
coordinate_t *tmp_tile_coordinates =
|
||||
reinterpret_cast<coordinate_t *>(temp_memory->d_temp_storage);
|
||||
|
||||
FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles, BLOCK_THREADS, segments,
|
||||
num_segments, count);
|
||||
FindMergePartitions(device_idx, tmp_tile_coordinates, num_tiles,
|
||||
BLOCK_THREADS, segments, num_segments, count);
|
||||
|
||||
LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, offset_t>
|
||||
<<<num_tiles, BLOCK_THREADS>>>(tmp_tile_coordinates, segments + 1, f,
|
||||
@@ -783,22 +743,24 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t cou
|
||||
}
|
||||
|
||||
template <typename func_t, typename offset_t>
|
||||
void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments, func_t f) {
|
||||
void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments,
|
||||
func_t f) {
|
||||
CHECK(count % num_segments == 0) << "Data is not dense.";
|
||||
|
||||
launch_n(device_idx, count, [=]__device__(offset_t idx)
|
||||
{
|
||||
launch_n(device_idx, count, [=] __device__(offset_t idx) {
|
||||
offset_t segment = idx / (count / num_segments);
|
||||
f(idx, segment);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn template <typename func_t, typename segments_iter, typename offset_t> void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count, segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
|
||||
* \fn template <typename func_t, typename segments_iter, typename offset_t>
|
||||
* void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
* segments_iter segments, offset_t num_segments, bool is_dense, func_t f)
|
||||
*
|
||||
* \brief Load balancing search function. Reads a CSR type matrix description and allows a function
|
||||
* to be executed on each element. Search 'modern GPU load balancing search' for more
|
||||
* information.
|
||||
* \brief Load balancing search function. Reads a CSR type matrix description
|
||||
* and allows a function to be executed on each element. Search 'modern GPU load
|
||||
* balancing search' for more information.
|
||||
*
|
||||
* \author Rory
|
||||
* \date 7/9/2017
|
||||
@@ -817,12 +779,106 @@ void DenseTransformLbs(int device_idx, offset_t count, offset_t num_segments, fu
|
||||
|
||||
template <typename func_t, typename segments_iter, typename offset_t>
|
||||
void TransformLbs(int device_idx, dh::CubMemory *temp_memory, offset_t count,
|
||||
segments_iter segments, offset_t num_segments, bool is_dense, func_t f) {
|
||||
segments_iter segments, offset_t num_segments, bool is_dense,
|
||||
func_t f) {
|
||||
if (is_dense) {
|
||||
DenseTransformLbs(device_idx, count, num_segments, f);
|
||||
}
|
||||
else {
|
||||
SparseTransformLbs(device_idx, temp_memory, count, segments, num_segments, f);
|
||||
} else {
|
||||
SparseTransformLbs(device_idx, temp_memory, count, segments, num_segments,
|
||||
f);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function to sort the pairs using cub's segmented RadixSortPairs
|
||||
* @param tmp_mem cub temporary memory info
|
||||
* @param keys keys double-buffer array
|
||||
* @param vals the values double-buffer array
|
||||
* @param nVals number of elements in the array
|
||||
* @param nSegs number of segments
|
||||
* @param offsets the segments
|
||||
*/
|
||||
template <typename T1, typename T2>
|
||||
void segmentedSort(dh::CubMemory *tmp_mem, dh::dvec2<T1> *keys,
|
||||
dh::dvec2<T2> *vals, int nVals, int nSegs,
|
||||
const dh::dvec<int> &offsets, int start = 0,
|
||||
int end = sizeof(T1) * 8) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.data(),
|
||||
offsets.data() + 1, start, end));
|
||||
tmp_mem->LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
|
||||
nSegs, offsets.data(), offsets.data() + 1, start, end));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Helper function to perform device-wide sum-reduction
|
||||
* @param tmp_mem cub temporary memory info
|
||||
* @param in the input array to be reduced
|
||||
* @param out the output reduced value
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
int nVals) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(
|
||||
cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
|
||||
in.data(), out.data(), nVals));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Fill a given constant value across all elements in the buffer
|
||||
* @param out the buffer to be filled
|
||||
* @param len number of elements i the buffer
|
||||
* @param def default value to be filled
|
||||
*/
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void fillConst(int device_idx, T *out, int len, T def) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
|
||||
[=] __device__(int i) { out[i] = def; });
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief gather elements
|
||||
* @param out1 output gathered array for the first buffer
|
||||
* @param in1 first input buffer
|
||||
* @param out2 output gathered array for the second buffer
|
||||
* @param in2 second input buffer
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T1 *out1, const T1 *in1, T2 *out2, const T2 *in2,
|
||||
const int *instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
T1 v1 = in1[iid];
|
||||
T2 v2 = in2[iid];
|
||||
out1[i] = v1;
|
||||
out2[i] = v2;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief gather elements
|
||||
* @param out output gathered array
|
||||
* @param in input buffer
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
out[i] = in[iid];
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace dh
|
||||
|
||||
Reference in New Issue
Block a user