/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/host_vector.h>
#include <thrust/random.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

// Uncomment to enable
// #define DEVICE_TIMER
// #define TIMERS

namespace dh {

/*
 * Error handling functions
 */

#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)

inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
                                       int line) {
  if (code != cudaSuccess) {
    std::stringstream ss;
    ss << file << "(" << line << ")";
    std::string file_and_line;
    ss >> file_and_line;
    throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
  }

  return code;
}

#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
            line);
    if (abort) exit(code);
  }
}

/*
 * Timers
 */

#define MAX_WARPS 32  // Maximum number of warps to time
#define MAX_SLOTS 10
#define TIMER_BLOCKID 0  // Block to time

struct DeviceTimerGlobal {
#ifdef DEVICE_TIMER
  clock_t total_clocks[MAX_SLOTS][MAX_WARPS];
  int64_t count[MAX_SLOTS][MAX_WARPS];
#endif

  // Clear device memory. Call at start of kernel.
  __device__ void Init() {
#ifdef DEVICE_TIMER
    if (blockIdx.x == TIMER_BLOCKID && threadIdx.x < MAX_WARPS) {
      for (int SLOT = 0; SLOT < MAX_SLOTS; SLOT++) {
        total_clocks[SLOT][threadIdx.x] = 0;
        count[SLOT][threadIdx.x] = 0;
      }
    }
#endif
  }

  void HostPrint() {
#ifdef DEVICE_TIMER
    DeviceTimerGlobal h_timer;
    safe_cuda(
        cudaMemcpyFromSymbol(&h_timer, (*this), sizeof(DeviceTimerGlobal)));

    for (int SLOT = 0; SLOT < MAX_SLOTS; SLOT++) {
      if (h_timer.count[SLOT][0] == 0) {
        continue;
      }

      clock_t sum_clocks = 0;
      int64_t sum_count = 0;

      for (int WARP = 0; WARP < MAX_WARPS; WARP++) {
        if (h_timer.count[SLOT][WARP] == 0) {
          continue;
        }

        sum_clocks += h_timer.total_clocks[SLOT][WARP];
        sum_count += h_timer.count[SLOT][WARP];
      }

      printf("Slot %d: %lld clocks per call, called %lld times.\n", SLOT,
             static_cast<long long>(sum_clocks / sum_count),
             static_cast<long long>(h_timer.count[SLOT][0]));
    }
#endif
  }
};

struct DeviceTimer {
#ifdef DEVICE_TIMER
  clock_t start;
  int slot;
  DeviceTimerGlobal &GTimer;
#endif

#ifdef DEVICE_TIMER
  __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot)  // NOLINT
      : GTimer(GTimer), start(clock()), slot(slot) {}
#else
  __device__ DeviceTimer(DeviceTimerGlobal &GTimer, int slot) {}  // NOLINT
#endif

  __device__ void End() {
#ifdef DEVICE_TIMER
    int warp_id = threadIdx.x / 32;
    int lane_id = threadIdx.x % 32;

    if (blockIdx.x == TIMER_BLOCKID && lane_id == 0) {
      GTimer.count[slot][warp_id] += 1;
      GTimer.total_clocks[slot][warp_id] += clock() - start;
    }
#endif
  }
};

struct Timer {
  typedef std::chrono::high_resolution_clock ClockT;
  typedef std::chrono::high_resolution_clock::time_point TimePointT;
  TimePointT start;

  Timer() { reset(); }
  void reset() { start = ClockT::now(); }
  int64_t elapsed() const { return (ClockT::now() - start).count(); }

  void printElapsed(std::string label) {
    safe_cuda(cudaDeviceSynchronize());
    printf("%s:\t %lld\n", label.c_str(), static_cast<long long>(elapsed()));
    reset();
  }
};
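// Example (illustrative sketch): timing a host-side section that includes GPU
// work. some_kernel, grid and block are placeholders for any kernel launch.
//
//   dh::Timer t;
//   // some_kernel<<<grid, block>>>(...);
//   t.printElapsed("some_kernel");  // synchronizes, prints elapsed ticks, resets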
/*
 * Range iterator
 */

class range {
 public:
  class iterator {
    friend class range;

   public:
    __host__ __device__ int64_t operator*() const { return i_; }

    __host__ __device__ const iterator &operator++() {
      i_ += step_;
      return *this;
    }

    __host__ __device__ iterator operator++(int) {
      iterator copy(*this);
      i_ += step_;
      return copy;
    }

    // Ordered comparisons so a strided iterator that steps past end_ still
    // compares equal to it and the loop terminates.
    __host__ __device__ bool operator==(const iterator &other) const {
      return i_ >= other.i_;
    }

    __host__ __device__ bool operator!=(const iterator &other) const {
      return i_ < other.i_;
    }

    __host__ __device__ void step(int s) { step_ = s; }

   protected:
    __host__ __device__ explicit iterator(int64_t start) : i_(start) {}

   public:
    uint64_t i_;
    int step_ = 1;
  };

  __host__ __device__ iterator begin() const { return begin_; }
  __host__ __device__ iterator end() const { return end_; }
  __host__ __device__ range(int64_t begin, int64_t end)
      : begin_(begin), end_(end) {}
  __host__ __device__ void step(int s) { begin_.step(s); }

 private:
  iterator begin_;
  iterator end_;
};

template <typename T>
__device__ range grid_stride_range(T begin, T end) {
  begin += blockDim.x * blockIdx.x + threadIdx.x;
  range r(begin, end);
  r.step(gridDim.x * blockDim.x);
  return r;
}

template <typename T>
__device__ range block_stride_range(T begin, T end) {
  begin += threadIdx.x;
  range r(begin, end);
  r.step(blockDim.x);
  return r;
}

// Threadblock iterates over range, filling with value
template <typename IterT, typename ValueT>
__device__ void block_fill(IterT begin, size_t n, ValueT value) {
  for (auto i : block_stride_range(static_cast<size_t>(0), n)) {
    begin[i] = value;
  }
}

/*
 * Memory
 */

class bulk_allocator;

template <typename T>
class dvec {
  friend class bulk_allocator;

 private:
  T *_ptr;
  size_t _size;

  void external_allocate(void *ptr, size_t size) {
    if (!empty()) {
      throw std::runtime_error("Tried to allocate dvec but already allocated");
    }
    _ptr = static_cast<T *>(ptr);
    _size = size;
  }

 public:
  dvec() : _ptr(NULL), _size(0) {}
  size_t size() const { return _size; }
  bool empty() const { return _ptr == NULL || _size == 0; }
  T *data() { return _ptr; }

  std::vector<T> as_vector() const {
    std::vector<T> h_vector(size());
    safe_cuda(cudaMemcpy(h_vector.data(), _ptr, size() * sizeof(T),
                         cudaMemcpyDeviceToHost));
    return h_vector;
  }

  void fill(T value) {
    thrust::fill_n(thrust::device_pointer_cast(_ptr), size(), value);
  }

  void print() {
    auto h_vector = this->as_vector();
    for (auto e : h_vector) {
      std::cout << e << " ";
    }
    std::cout << "\n";
  }

  thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(_ptr); }

  thrust::device_ptr<T> tend() {
    return thrust::device_pointer_cast(_ptr + size());
  }

  template <typename T2>
  dvec &operator=(const std::vector<T2> &other) {
    if (other.size() != size()) {
      throw std::runtime_error(
          "Cannot copy assign vector to dvec, sizes are different");
    }
    thrust::copy(other.begin(), other.end(), this->tbegin());
    return *this;
  }

  dvec &operator=(dvec &other) {
    if (other.size() != size()) {
      throw std::runtime_error(
          "Cannot copy assign dvec to dvec, sizes are different");
    }
    thrust::copy(other.tbegin(), other.tend(), this->tbegin());
    return *this;
  }
};

// Allocates a single block of device memory and hands out aligned slices of it
// to a set of dvecs.
class bulk_allocator {
  char *d_ptr;
  size_t _size;
  const size_t align = 256;

  template <typename SizeT>
  size_t align_round_up(SizeT n) {
    if (n % align == 0) {
      return n;
    } else {
      return n + align - (n % align);
    }
  }

  template <typename T, typename SizeT>
  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
    return align_round_up(first_size * sizeof(T));
  }

  template <typename T, typename SizeT, typename... Args>
  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
    return align_round_up(first_size * sizeof(T)) + get_size_bytes(args...);
  }

  template <typename T, typename SizeT>
  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size) {
    first_vec->external_allocate(static_cast<void *>(ptr), first_size);
  }

  template <typename T, typename SizeT, typename... Args>
  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
                     Args... args) {
    first_vec->external_allocate(static_cast<void *>(ptr), first_size);
    ptr += align_round_up(first_size * sizeof(T));
    allocate_dvec(ptr, args...);
  }

 public:
  bulk_allocator() : d_ptr(NULL), _size(0) {}

  ~bulk_allocator() {
    if (d_ptr != nullptr) {
      safe_cuda(cudaFree(d_ptr));
    }
  }

  size_t size() { return _size; }

  template <typename... Args>
  void allocate(Args... args) {
    if (d_ptr != NULL) {
      throw std::runtime_error("Bulk allocator already allocated");
    }

    _size = get_size_bytes(args...);

    safe_cuda(cudaMalloc(&d_ptr, _size));

    allocate_dvec(d_ptr, args...);
  }
};
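// Example (illustrative sketch): one bulk_allocator call backs several dvecs
// with a single cudaMalloc. The names and sizes below are placeholders.
//
//   dh::bulk_allocator ba;
//   dh::dvec<float> gradients;
//   dh::dvec<int> positions;
//   ba.allocate(&gradients, 1000, &positions, 1000);
//   gradients.fill(0.0f);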
// Keep track of cub library device allocation
struct CubMemory {
  void *d_temp_storage;
  size_t temp_storage_bytes;

  CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}

  ~CubMemory() { Free(); }

  void Free() {
    if (d_temp_storage != NULL) {
      safe_cuda(cudaFree(d_temp_storage));
      d_temp_storage = NULL;
      temp_storage_bytes = 0;
    }
  }

  void LazyAllocate(size_t n_bytes) {
    if (n_bytes > temp_storage_bytes) {
      Free();
      safe_cuda(cudaMalloc(&d_temp_storage, n_bytes));
      temp_storage_bytes = n_bytes;
    }
  }

  bool IsAllocated() { return d_temp_storage != NULL; }
};

inline size_t available_memory() {
  size_t device_free = 0;
  size_t device_total = 0;
  dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
  return device_free;
}

inline std::string device_name() {
  cudaDeviceProp prop;
  dh::safe_cuda(cudaGetDeviceProperties(&prop, 0));
  return std::string(prop.name);
}

/*
 * Utility functions
 */

template <typename T>
void print(const thrust::device_vector<T> &v, size_t max_items = 10) {
  thrust::host_vector<T> h = v;
  for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
    std::cout << " " << h[i];
  }
  std::cout << "\n";
}

template <typename T>
void print(const dvec<T> &v, size_t max_items = 10) {
  std::vector<T> h = v.as_vector();
  for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
    std::cout << " " << h[i];
  }
  std::cout << "\n";
}

template <typename T>
void print(const char *label, const thrust::device_vector<T> &v,
           const char *format = "%d ", int max = 10) {
  thrust::host_vector<T> h_v = v;

  std::cout << label << ":\n";
  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
    printf(format, h_v[i]);
  }
  std::cout << "\n";
}

template <typename T1, typename T2>
T1 div_round_up(const T1 a, const T2 b) {
  return static_cast<T1>(ceil(static_cast<double>(a) / b));
}

template <typename T>
thrust::device_ptr<T> dptr(T *d_ptr) {
  return thrust::device_pointer_cast(d_ptr);
}

template <typename T>
T *raw(thrust::device_vector<T> &v) {  // NOLINT
  return raw_pointer_cast(v.data());
}

template <typename T>
const T *raw(const thrust::device_vector<T> &v) {  // NOLINT
  return raw_pointer_cast(v.data());
}

template <typename T>
size_t size_bytes(const thrust::device_vector<T> &v) {
  return sizeof(T) * v.size();
}

/*
 * Kernel launcher
 */

template <typename L>
__global__ void launch_n_kernel(size_t n, L lambda) {
  for (auto i : grid_stride_range(static_cast<size_t>(0), n)) {
    lambda(i);
  }
}

template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
inline void launch_n(size_t n, L lambda) {
  const int GRID_SIZE =
      static_cast<int>(div_round_up(n, ITEMS_PER_THREAD * BLOCK_THREADS));
#if defined(__CUDACC__)
  launch_n_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(n, lambda);
#endif
}

/*
 * Random
 */

struct BernoulliRng {
  float p;
  int seed;

  __host__ __device__ BernoulliRng(float p, int seed) : p(p), seed(seed) {}

  __host__ __device__ bool operator()(const int i) const {
    thrust::default_random_engine rng(seed);
    thrust::uniform_real_distribution<float> dist;
    rng.discard(i);
    return dist(rng) <= p;
  }
};

}  // namespace dh
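// Example (illustrative sketch): dh::launch_n runs a device lambda over [0, n)
// with a grid-stride loop; device lambdas require nvcc's --expt-extended-lambda
// flag. d_x and n are placeholders.
//
//   float *d_x;  // device pointer with n valid elements
//   size_t n;    // element count
//   dh::launch_n(n, [=] __device__(size_t i) { d_x[i] *= 2.0f; });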