[GPU-Plugin] Integration of a faster version of grow_gpu plugin into mainstream (#2360)

* Integrating a faster version of grow_gpu plugin
1. Removed the older files to reduce duplication
2. Moved all of the grow_gpu files under 'exact' folder
3. All of them are inside 'exact' namespace to avoid any conflicts
4. Fixed a bug in benchmark.py while running only 'grow_gpu' plugin
5. Added cub and googletest submodules to ease integration and unit-testing
6. Updates to CMakeLists.txt to directly build cuda objects into libxgboost

* Added support for building gpu plugins through make flow
1. updated makefile and config.mk to add right targets
2. added unit-tests for gpu exact plugin code

* 1. Added support for building gpu plugin using 'make' flow as well
2. Updated instructions for building and testing gpu plugin

* Fix travis-ci errors for PR#2360
1. lint errors on unit-tests
2. removed googletest, instead depending on the gtest cache provided by dmlc-core

* Some more fixes to travis-ci lint failures PR#2360

* Added Rory's copyrights to the files containing code from both.

* updated copyright statement as per Rory's request

* moved the static datasets into a script to generate them at runtime

* 1. memory usage print when silent=0
2. tests/ and test/ folder organization
3. removal of the dependency of googletest for just building xgboost
4. coding style updates for .cuh as well

* Fixes for compilation warnings

* add cuda object files as well when JVM_BINDINGS=ON
This commit is contained in:
Thejaswi
2017-06-06 03:09:53 +05:30
committed by Rory Mitchell
parent 2d9052bc7d
commit 85b2fb3eee
37 changed files with 4118 additions and 1601 deletions

View File

@@ -2,8 +2,6 @@
* Copyright 2016 Rory mitchell
*/
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/system/cuda/error.h>
@@ -14,6 +12,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <cub/cub.cuh>
// Uncomment to enable
// #define DEVICE_TIMER
@@ -21,6 +20,9 @@
namespace dh {
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
#define DEV_INLINE __device__ __forceinline__
/*
* Error handling functions
*/
@@ -145,7 +147,7 @@ struct Timer {
int64_t elapsed() const { return (ClockT::now() - start).count(); }
void printElapsed(std::string label) {
safe_cuda(cudaDeviceSynchronize());
printf("%s:\t %lld\n", label.c_str(), elapsed());
printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
reset();
}
};
@@ -228,10 +230,12 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
*/
class bulk_allocator;
template <typename T> class dvec2;
template <typename T>
class dvec {
friend bulk_allocator;
friend class dvec2<T>;
private:
T *_ptr;
@@ -241,15 +245,17 @@ class dvec {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec but already allocated");
}
_ptr = static_cast<T *>(ptr);
_size = size;
}
public:
dvec() : _ptr(NULL), _size(0) {}
size_t size() const { return _size; }
bool empty() const { return _ptr == NULL || _size == 0; }
T *data() { return _ptr; }
std::vector<T> as_vector() const {
@@ -265,11 +271,9 @@ class dvec {
void print() {
auto h_vector = this->as_vector();
for (auto e : h_vector) {
std::cout << e << " ";
}
std::cout << "\n";
}
@@ -285,9 +289,7 @@ class dvec {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
thrust::copy(other.begin(), other.end(), this->tbegin());
return *this;
}
@@ -296,36 +298,74 @@ class dvec {
throw std::runtime_error(
"Cannot copy assign dvec to dvec, sizes are different");
}
thrust::copy(other.tbegin(), other.tend(), this->tbegin());
return *this;
}
};
/**
* @class dvec2 device_helpers.cuh
* @brief wrapper for storing 2 dvec's which are needed for cub::DoubleBuffer
*
* Both dvec's are given the same element count, and the embedded
* cub::DoubleBuffer is pointed at the same two memory regions. Memory is
* supplied from outside through the private external_allocate(), which is
* reachable only via the bulk_allocator friendship.
*/
template <typename T>
class dvec2 {
friend bulk_allocator;
private:
dvec<T> _d1, _d2;
cub::DoubleBuffer<T> _buff;
// Binds the two halves to caller-provided memory regions of `size` elements
// each. Throws std::runtime_error if this dvec2 already reports non-empty.
void external_allocate(void *ptr1, void *ptr2, size_t size) {
if (!empty()) {
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
}
_d1.external_allocate(ptr1, size);
_d2.external_allocate(ptr2, size);
// Mirror the same two pointers in the double buffer and start with
// buffer 0 (ptr1 / _d1) selected.
_buff.d_buffers[0] = static_cast<T *>(ptr1);
_buff.d_buffers[1] = static_cast<T *>(ptr2);
_buff.selector = 0;
}
public:
// Default state: both dvec's and the double buffer are default-constructed.
dvec2() : _d1(), _d2(), _buff() {}
// Element count of the first dvec (both receive the same size on allocation).
size_t size() const { return _d1.size(); }
// True when either underlying dvec reports empty.
bool empty() const { return _d1.empty() || _d2.empty(); }
// Direct access to the cub::DoubleBuffer, e.g. for passing to cub routines.
cub::DoubleBuffer<T> &buff() { return _buff; }
dvec<T> &d1() { return _d1; }
dvec<T> &d2() { return _d2; }
// Raw pointer the double buffer reports as current.
T *current() { return _buff.Current(); }
// dvec matching the current selector: d1 when selector is 0, d2 otherwise.
dvec<T> &current_dvec() { return _buff.selector == 0? d1() : d2(); }
// Raw pointer the double buffer reports as the alternate buffer.
T *other() { return _buff.Alternate(); }
};
class bulk_allocator {
char *d_ptr;
size_t _size;
const size_t align = 256;
const int align = 256;
template <typename SizeT>
size_t align_round_up(SizeT n) {
if (n % align == 0) {
return n;
} else {
return n + align - (n % align);
}
n = (n + align - 1) / align;
return n * align;
}
template <typename T, typename SizeT>
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
return align_round_up(first_size * sizeof(T));
return align_round_up<SizeT>(first_size * sizeof(T));
}
template <typename T, typename SizeT, typename... Args>
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
return align_round_up(first_size * sizeof(T)) + get_size_bytes(args...);
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
}
template <typename T, typename SizeT>
@@ -336,11 +376,37 @@ class bulk_allocator {
template <typename T, typename SizeT, typename... Args>
void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
Args... args) {
first_vec->external_allocate(static_cast<void *>(ptr), first_size);
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
ptr += align_round_up(first_size * sizeof(T));
allocate_dvec(ptr, args...);
}
// Bytes needed for one dvec2 of `first_size` elements: two buffers, each
// rounded up by align_round_up(). `first_vec` is not read; it only selects
// this overload.
template <typename T, typename SizeT>
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
return 2 * align_round_up(first_size * sizeof(T));
}
// Variadic form: bytes for the leading dvec2 plus the bytes computed
// recursively for the remaining (vector, size) arguments in `args`.
template <typename T, typename SizeT, typename... Args>
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
}
// Hands a dvec2 its two buffers out of the region starting at `ptr`:
// the first buffer begins at `ptr`, the second one aligned buffer size
// (align_round_up(first_size * sizeof(T)) bytes) after it.
template <typename T, typename SizeT>
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
first_vec->external_allocate
(static_cast<void *>(ptr),
static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
first_size);
}
// Variadic form: assigns the leading dvec2 at `ptr`, advances `ptr` past
// both of its aligned buffers, then recurses on the remaining arguments.
template <typename T, typename SizeT, typename... Args>
void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
Args... args) {
allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
// Skip two aligned buffers, matching get_size_bytes() for dvec2.
ptr += (align_round_up(first_size * sizeof(T)) * 2);
allocate_dvec(ptr, args...);
}
public:
bulk_allocator() : _size(0), d_ptr(NULL) {}
@@ -357,11 +423,8 @@ class bulk_allocator {
if (d_ptr != NULL) {
throw std::runtime_error("Bulk allocator already allocated");
}
_size = get_size_bytes(args...);
safe_cuda(cudaMalloc(&d_ptr, _size));
allocate_dvec(d_ptr, args...);
}
};
@@ -374,6 +437,7 @@ struct CubMemory {
CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}
~CubMemory() { Free(); }
void Free() {
if (d_temp_storage != NULL) {
safe_cuda(cudaFree(d_temp_storage));
@@ -394,13 +458,13 @@ struct CubMemory {
inline size_t available_memory() {
size_t device_free = 0;
size_t device_total = 0;
dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
safe_cuda(cudaMemGetInfo(&device_free, &device_total));
return device_free;
}
inline std::string device_name() {
cudaDeviceProp prop;
dh::safe_cuda(cudaGetDeviceProperties(&prop, 0));
safe_cuda(cudaGetDeviceProperties(&prop, 0));
return std::string(prop.name);
}
@@ -430,7 +494,6 @@ template <typename T>
void print(char *label, const thrust::device_vector<T> &v,
const char *format = "%d ", int max = 10) {
thrust::host_vector<T> h_v = v;
std::cout << label << ":\n";
for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
printf(format, h_v[i]);
@@ -495,9 +558,21 @@ struct BernoulliRng {
thrust::default_random_engine rng(seed);
thrust::uniform_real_distribution<float> dist;
rng.discard(i);
return dist(rng) <= p;
}
};
/**
* @brief Helper macro to measure timing on GPU
*
* Creates a local dh::Timer, evaluates `call`, then invokes
* printElapsed(name) on the timer. The macro takes no stream argument.
*
* @param call the GPU call
* @param name name used to track later
*/
#define TIMEIT(call, name) \
do { \
dh::Timer t1234; \
call; \
t1234.printElapsed(name); \
} while(0)
} // namespace dh