[GPU-Plugin] (#2227)
* Add fast histogram algorithm * Fix Linux build * Add 'gpu_id' parameter
This commit is contained in:
committed by
Tianqi Chen
parent
d281c6aafa
commit
8ab5d4611c
@@ -1,18 +1,19 @@
|
||||
/*!
|
||||
* Copyright 2016 Rory mitchell
|
||||
*/
|
||||
*/
|
||||
#pragma once
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/random.h>
|
||||
#include <thrust/system/cuda/error.h>
|
||||
#include <thrust/system_error.h>
|
||||
#include <thrust/random.h>
|
||||
#include <algorithm>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "cusparse_v2.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
@@ -30,7 +31,8 @@ namespace dh {
|
||||
|
||||
#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)
|
||||
|
||||
cudaError_t throw_on_cuda_error(cudaError_t code, const char *file, int line) {
|
||||
inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
|
||||
int line) {
|
||||
if (code != cudaSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << file << "(" << line << ")";
|
||||
@@ -41,16 +43,29 @@ cudaError_t throw_on_cuda_error(cudaError_t code, const char *file, int line) {
|
||||
|
||||
return code;
|
||||
}
|
||||
#define safe_cusparse(ans) throw_on_cusparse_error((ans), __FILE__, __LINE__)
|
||||
|
||||
#define gpuErrchk(ans) \
|
||||
inline cusparseStatus_t throw_on_cusparse_error(cusparseStatus_t status,
|
||||
const char *file, int line) {
|
||||
if (status != CUSPARSE_STATUS_SUCCESS) {
|
||||
std::stringstream ss;
|
||||
ss << "cusparse error: " << file << "(" << line << ")";
|
||||
std::string error_text;
|
||||
ss >> error_text;
|
||||
throw error_text;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
#define gpuErrchk(ans) \
|
||||
{ gpuAssert((ans), __FILE__, __LINE__); }
|
||||
inline void gpuAssert(cudaError_t code, const char *file, int line,
|
||||
bool abort = true) {
|
||||
if (code != cudaSuccess) {
|
||||
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
|
||||
line);
|
||||
if (abort)
|
||||
exit(code);
|
||||
if (abort) exit(code);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,10 +134,10 @@ struct DeviceTimer {
|
||||
#endif
|
||||
|
||||
#ifdef DEVICE_TIMER
|
||||
__device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) // NOLINT
|
||||
__device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) // NOLINT
|
||||
: GTimer(GTimer), start(clock()), slot(slot) {}
|
||||
#else
|
||||
__device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) {} // NOLINT
|
||||
__device__ DeviceTimer(DeviceTimerGlobal >imer, int slot) {} // NOLINT
|
||||
#endif
|
||||
|
||||
__device__ void End() {
|
||||
@@ -224,21 +239,22 @@ class range {
|
||||
iterator end_;
|
||||
};
|
||||
|
||||
template <typename T> __device__ range grid_stride_range(T begin, T end) {
|
||||
template <typename T>
|
||||
__device__ range grid_stride_range(T begin, T end) {
|
||||
begin += blockDim.x * blockIdx.x + threadIdx.x;
|
||||
range r(begin, end);
|
||||
r.step(gridDim.x * blockDim.x);
|
||||
return r;
|
||||
}
|
||||
|
||||
template <typename T> __device__ range block_stride_range(T begin, T end) {
|
||||
template <typename T>
|
||||
__device__ range block_stride_range(T begin, T end) {
|
||||
begin += threadIdx.x;
|
||||
range r(begin, end);
|
||||
r.step(blockDim.x);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
// Threadblock iterates over range, filling with value
|
||||
template <typename IterT, typename ValueT>
|
||||
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
@@ -253,7 +269,8 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
|
||||
|
||||
class bulk_allocator;
|
||||
|
||||
template <typename T> class dvec {
|
||||
template <typename T>
|
||||
class dvec {
|
||||
friend bulk_allocator;
|
||||
|
||||
private:
|
||||
@@ -302,7 +319,8 @@ template <typename T> class dvec {
|
||||
return thrust::device_pointer_cast(_ptr + size());
|
||||
}
|
||||
|
||||
template <typename T2> dvec &operator=(const std::vector<T2> &other) {
|
||||
template <typename T2>
|
||||
dvec &operator=(const std::vector<T2> &other) {
|
||||
if (other.size() != size()) {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
@@ -331,7 +349,8 @@ class bulk_allocator {
|
||||
|
||||
const size_t align = 256;
|
||||
|
||||
template <typename SizeT> size_t align_round_up(SizeT n) {
|
||||
template <typename SizeT>
|
||||
size_t align_round_up(SizeT n) {
|
||||
if (n % align == 0) {
|
||||
return n;
|
||||
} else {
|
||||
@@ -357,7 +376,7 @@ class bulk_allocator {
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
|
||||
Args... args) {
|
||||
first_vec->external_allocate(static_cast<void*>(ptr), first_size);
|
||||
first_vec->external_allocate(static_cast<void *>(ptr), first_size);
|
||||
ptr += align_round_up(first_size * sizeof(T));
|
||||
allocate_dvec(ptr, args...);
|
||||
}
|
||||
@@ -366,14 +385,15 @@ class bulk_allocator {
|
||||
bulk_allocator() : _size(0), d_ptr(NULL) {}
|
||||
|
||||
~bulk_allocator() {
|
||||
if (!d_ptr == NULL) {
|
||||
if (!(d_ptr == nullptr)) {
|
||||
safe_cuda(cudaFree(d_ptr));
|
||||
}
|
||||
}
|
||||
|
||||
size_t size() { return _size; }
|
||||
|
||||
template <typename... Args> void allocate(Args... args) {
|
||||
template <typename... Args>
|
||||
void allocate(Args... args) {
|
||||
if (d_ptr != NULL) {
|
||||
throw std::runtime_error("Bulk allocator already allocated");
|
||||
}
|
||||
@@ -393,14 +413,19 @@ struct CubMemory {
|
||||
|
||||
CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
|
||||
|
||||
~CubMemory() {
|
||||
~CubMemory() { Free(); }
|
||||
void Free() {
|
||||
if (d_temp_storage != NULL) {
|
||||
safe_cuda(cudaFree(d_temp_storage));
|
||||
}
|
||||
}
|
||||
|
||||
void Allocate() {
|
||||
safe_cuda(cudaMalloc(&d_temp_storage, temp_storage_bytes));
|
||||
void LazyAllocate(size_t n_bytes) {
|
||||
if (n_bytes > temp_storage_bytes) {
|
||||
Free();
|
||||
safe_cuda(cudaMalloc(&d_temp_storage, n_bytes));
|
||||
temp_storage_bytes = n_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
bool IsAllocated() { return d_temp_storage != NULL; }
|
||||
@@ -453,47 +478,58 @@ void print(char *label, const thrust::device_vector<T> &v,
|
||||
std::cout << "\n";
|
||||
}
|
||||
|
||||
template <typename T1, typename T2> T1 div_round_up(const T1 a, const T2 b) {
|
||||
template <typename T1, typename T2>
|
||||
T1 div_round_up(const T1 a, const T2 b) {
|
||||
return static_cast<T1>(ceil(static_cast<double>(a) / b));
|
||||
}
|
||||
|
||||
template <typename T> thrust::device_ptr<T> dptr(T *d_ptr) {
|
||||
template <typename T>
|
||||
thrust::device_ptr<T> dptr(T *d_ptr) {
|
||||
return thrust::device_pointer_cast(d_ptr);
|
||||
}
|
||||
|
||||
template <typename T> T *raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
template <typename T>
|
||||
T *raw(thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
template <typename T> size_t size_bytes(const thrust::device_vector<T> &v) {
|
||||
template <typename T>
|
||||
const T *raw(const thrust::device_vector<T> &v) { // NOLINT
|
||||
return raw_pointer_cast(v.data());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t size_bytes(const thrust::device_vector<T> &v) {
|
||||
return sizeof(T) * v.size();
|
||||
}
|
||||
/*
|
||||
* Kernel launcher
|
||||
*/
|
||||
|
||||
template <typename L> __global__ void launch_n_kernel(size_t n, L lambda) {
|
||||
template <typename L>
|
||||
__global__ void launch_n_kernel(size_t n, L lambda) {
|
||||
for (auto i : grid_stride_range(static_cast<size_t>(0), n)) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename L, int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256>
|
||||
template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
|
||||
inline void launch_n(size_t n, L lambda) {
|
||||
const int GRID_SIZE = div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS);
|
||||
|
||||
#if defined(__CUDACC__)
|
||||
launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* Random
|
||||
* Random
|
||||
*/
|
||||
|
||||
struct BernoulliRng {
|
||||
float p;
|
||||
int seed;
|
||||
|
||||
__host__ __device__ BernoulliRng(float p, int seed):p(p), seed(seed) {}
|
||||
__host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}
|
||||
|
||||
__host__ __device__ bool operator()(const int i) const {
|
||||
thrust::default_random_engine rng(seed);
|
||||
@@ -504,5 +540,4 @@ struct BernoulliRng {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
} // namespace dh
|
||||
|
||||
Reference in New Issue
Block a user