GPU binning and compression. (#3319)
* GPU binning and compression.
  - Binning and index compression are done inside the DeviceShard constructor.
  - If a DMatrix has multiple row batches, it is first converted into a single row batch.
This commit is contained in:
committed by
Rory Mitchell
parent
3f7696ff53
commit
286dccb8e8
@@ -8,6 +8,10 @@
|
||||
#include <cstddef>
|
||||
#include <algorithm>
|
||||
|
||||
#ifdef __CUDACC__
|
||||
#include "device_helpers.cuh"
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
@@ -96,6 +100,23 @@ class CompressedBufferWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
/*!
 * \brief Device-side write of one symbol into the compressed buffer using
 *        atomic OR operations.
 *
 * The symbol at index `offset` covers bits
 * [offset * symbol_bits_, (offset + 1) * symbol_bits_ - 1] of the payload,
 * which begins detail::kPadding bytes after `buffer`. Bytes are emitted from
 * the last affected byte back to the first, each one OR-ed in through
 * dh::AtomicOrByte.
 *
 * Bits are only ever set, never cleared, so whatever the destination already
 * holds is preserved -- presumably the caller zero-fills the buffer first
 * (TODO confirm against the call site).
 *
 * \param buffer Start of the compressed buffer (before the padding).
 * \param symbol Value to store.
 * \param offset Index of the symbol within the buffer.
 */
__device__ void AtomicWriteSymbol(CompressedByteT* buffer, uint64_t symbol,
                                  size_t offset) {
  // Inclusive bit range covered by this symbol.
  const size_t first_bit = offset * symbol_bits_;
  const size_t last_bit = (offset + 1) * symbol_bits_ - 1;

  // Inclusive byte range covered by this symbol; signed so the descending
  // loop below terminates correctly when the first byte index is 0.
  const ptrdiff_t first_byte = static_cast<ptrdiff_t>(first_bit / 8);
  ptrdiff_t byte_idx = static_cast<ptrdiff_t>(last_bit / 8);

  // The payload is addressed as 32-bit words for the atomic OR.
  unsigned int* const words =
      reinterpret_cast<unsigned int*>(buffer + detail::kPadding);

  // Align the symbol so that its lowest bit lands on `last_bit` inside the
  // final byte (bit positions within a byte are counted from the top).
  symbol <<= 7 - last_bit % 8;

  // Emit the low byte, then move on to the preceding byte of the buffer.
  while (byte_idx >= first_byte) {
    dh::AtomicOrByte(words, byte_idx, symbol & 0xff);
    symbol >>= 8;
    --byte_idx;
  }
}
|
||||
#endif
|
||||
|
||||
template <typename IterT>
|
||||
void Write(CompressedByteT *buffer, IterT input_begin, IterT input_end) {
|
||||
uint64_t tmp = 0;
|
||||
|
||||
@@ -122,6 +122,14 @@ inline size_t AvailableMemory(int device_idx) {
|
||||
return device_free;
|
||||
}
|
||||
|
||||
/**
 * \fn  inline size_t TotalMemory(int device_idx)
 *
 * \brief Makes `device_idx` the current CUDA device and returns the total
 *        memory value reported for it by cudaMemGetInfo.
 *
 * \param device_idx  Ordinal of the device to query.
 *
 * \return  The "total" value filled in by cudaMemGetInfo.
 */
inline size_t TotalMemory(int device_idx) {
  // Select the device first; cudaMemGetInfo reports on the current device.
  safe_cuda(cudaSetDevice(device_idx));

  size_t total_bytes = 0;
  size_t free_bytes = 0;  // required by the API, not used by this function
  dh::safe_cuda(cudaMemGetInfo(&free_bytes, &total_bytes));

  return total_bytes;
}
|
||||
|
||||
/**
|
||||
* \fn inline int max_shared_memory(int device_idx)
|
||||
*
|
||||
@@ -155,6 +163,12 @@ inline void CheckComputeCapability() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * \brief Atomically ORs a single byte into a buffer that is addressed as
 *        32-bit words.
 *
 * Byte `ibyte` is mapped to word `ibyte / sizeof(unsigned int)`, and inside
 * that word to the bit range starting at
 * `(ibyte % sizeof(unsigned int)) * 8`.
 *
 * \param buffer  Word-typed view of the destination bytes.
 * \param ibyte   Byte index relative to `buffer`.
 * \param b       Bits to OR into that byte.
 */
DEV_INLINE void AtomicOrByte(unsigned int* __restrict__ buffer, size_t ibyte, unsigned char b) {
  // Which word holds the byte, and how far the byte sits from the word's
  // least-significant end.
  const size_t word_idx = ibyte / sizeof(unsigned int);
  const size_t bit_shift = (ibyte % sizeof(unsigned int)) * 8;

  // Widen before shifting so that no bits are dropped.
  const unsigned int bits = static_cast<unsigned int>(b) << bit_shift;
  atomicOr(buffer + word_idx, bits);
}
|
||||
|
||||
|
||||
/*
|
||||
* Range iterator
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user