Retire DVec class in favour of c++20 style span for device memory. (#4293)

Rory Mitchell 2019-03-28 13:59:58 +13:00 committed by GitHub
parent c85181dd8a
commit 3f312e30db
7 changed files with 288 additions and 369 deletions
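
For orientation, a minimal before/after sketch of the call-site change this commit makes (not taken from the diff; the size n, device id 0 and the float element type are placeholders, but BulkAllocator, LaunchN, Fill, AsVector and CopyDeviceSpanToVector all appear in the changes below):

// Before (removed in this commit): DVec owned a typed device pointer and
// carried its own device index.
//   dh::BulkAllocator<dh::MemoryType::kDevice> ba;
//   dh::DVec<float> values;
//   ba.Allocate(0, &values, n);
//   values.Fill(0.0f);
//   std::vector<float> h_values = values.AsVector();

// After: the allocator hands out plain xgboost::common::Span<T> views and
// host/device transfers go through free helper functions.
const size_t n = 1000;                 // placeholder size, device 0 assumed
dh::BulkAllocator ba;
xgboost::common::Span<float> values;
ba.Allocate(0, &values, n);
dh::LaunchN(0, values.size(),
            [=] __device__(size_t idx) { values[idx] = 0.0f; });
std::vector<float> h_values(values.size());
dh::CopyDeviceSpanToVector(&h_values, values);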


@@ -227,179 +227,79 @@ inline void LaunchN(int device_idx, size_t n, L lambda) {
  LaunchN<ITEMS_PER_THREAD, BLOCK_THREADS>(device_idx, n, nullptr, lambda);
}
/*
* Memory
*/
enum MemoryType { kDevice, kDeviceManaged };
template <MemoryType MemoryT>
class BulkAllocator;
template <typename T>
class DVec2;
template <typename T>
class DVec {
friend class DVec2<T>;
private:
T *ptr_;
size_t size_;
int device_idx_;
public:
void ExternalAllocate(int device_idx, void *ptr, size_t size) {
if (!Empty()) {
throw std::runtime_error("Tried to allocate DVec but already allocated");
}
ptr_ = static_cast<T *>(ptr);
size_ = size;
device_idx_ = device_idx;
safe_cuda(cudaSetDevice(device_idx_));
}
DVec() : ptr_(NULL), size_(0), device_idx_(-1) {}
size_t Size() const { return size_; }
int DeviceIdx() const { return device_idx_; }
bool Empty() const { return ptr_ == NULL || size_ == 0; }
T *Data() { return ptr_; }
const T *Data() const { return ptr_; }
xgboost::common::Span<const T> GetSpan() const {
return xgboost::common::Span<const T>(ptr_, this->Size());
}
xgboost::common::Span<T> GetSpan() {
return xgboost::common::Span<T>(ptr_, this->Size());
}
std::vector<T> AsVector() const {
std::vector<T> h_vector(Size());
safe_cuda(cudaSetDevice(device_idx_));
safe_cuda(cudaMemcpy(h_vector.data(), ptr_, Size() * sizeof(T),
cudaMemcpyDeviceToHost));
return h_vector;
}
void Fill(T value) {
auto d_ptr = ptr_;
LaunchN(device_idx_, Size(),
[=] __device__(size_t idx) { d_ptr[idx] = value; });
}
void Print() {
auto h_vector = this->AsVector();
for (auto e : h_vector) {
std::cout << e << " ";
}
std::cout << "\n";
}
thrust::device_ptr<T> tbegin() { return thrust::device_pointer_cast(ptr_); }
thrust::device_ptr<T> tend() {
return thrust::device_pointer_cast(ptr_ + Size());
}
template <typename T2>
DVec &operator=(const std::vector<T2> &other) {
this->copy(other.begin(), other.end());
return *this;
}
DVec &operator=(DVec<T> &other) {
if (other.Size() != Size()) {
throw std::runtime_error(
"Cannot copy assign DVec to DVec, sizes are different");
}
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (other.DeviceIdx() == this->DeviceIdx()) {
dh::safe_cuda(cudaMemcpyAsync(this->Data(), other.Data(),
other.Size() * sizeof(T),
cudaMemcpyDeviceToDevice));
} else {
std::cout << "deviceother: " << other.DeviceIdx()
<< " devicethis: " << this->DeviceIdx() << std::endl;
std::cout << "size deviceother: " << other.Size()
<< " devicethis: " << this->DeviceIdx() << std::endl;
throw std::runtime_error("Cannot copy to/from different devices");
}
return *this;
}
template <typename IterT>
void copy(IterT begin, IterT end) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
LOG(FATAL) << "Cannot copy assign vector to DVec, sizes are different" <<
" vector::Size(): " << end - begin << " DVec::Size(): " << Size();
}
thrust::copy(begin, end, this->tbegin());
}
void copy(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
throw std::runtime_error(
"Cannot copy assign vector to dvec, sizes are different");
}
safe_cuda(cudaMemcpyAsync(this->Data(), begin.get(), Size() * sizeof(T),
cudaMemcpyDefault));
}
};
/**
- * @class DVec2 device_helpers.cuh
- * @brief wrapper for storing 2 DVec's which are needed for cub::DoubleBuffer
+ * \brief A double buffer, useful for algorithms like sort.
 */
template <typename T>
-class DVec2 {
- private:
-  DVec<T> d1_, d2_;
-  cub::DoubleBuffer<T> buff_;
-  int device_idx_;
+class DoubleBuffer {
 public:
-  void ExternalAllocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
-    if (!Empty()) {
-      throw std::runtime_error("Tried to allocate DVec2 but already allocated");
+  cub::DoubleBuffer<T> buff;
+  xgboost::common::Span<T> a, b;
+  DoubleBuffer() = default;
+  size_t Size() const {
+    CHECK_EQ(a.size(), b.size());
+    return a.size();
  }
-    device_idx_ = device_idx;
-    d1_.ExternalAllocate(device_idx_, ptr1, size);
-    d2_.ExternalAllocate(device_idx_, ptr2, size);
-    buff_.d_buffers[0] = static_cast<T *>(ptr1);
-    buff_.d_buffers[1] = static_cast<T *>(ptr2);
-    buff_.selector = 0;
-  }
-  DVec2() : d1_(), d2_(), buff_(), device_idx_(-1) {}
-  size_t Size() const { return d1_.Size(); }
-  int DeviceIdx() const { return device_idx_; }
-  bool Empty() const { return d1_.Empty() || d2_.Empty(); }
-  cub::DoubleBuffer<T> &buff() { return buff_; }
-  DVec<T> &D1() { return d1_; }
-  DVec<T> &D2() { return d2_; }
-  T *Current() { return buff_.Current(); }
+  cub::DoubleBuffer<T> &CubBuffer() { return buff; }
+  T *Current() { return buff.Current(); }
  xgboost::common::Span<T> CurrentSpan() {
    return xgboost::common::Span<T>{
-        buff_.Current(),
+        buff.Current(),
        static_cast<typename xgboost::common::Span<T>::index_type>(Size())};
  }
-  DVec<T> &CurrentDVec() { return buff_.selector == 0 ? D1() : D2(); }
-  T *other() { return buff_.Alternate(); }
+  T *other() { return buff.Alternate(); }
};
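
Usage sketch for the new DoubleBuffer (illustrative only, not code from this commit; n_vals, n_segments and the float/int types are placeholders): one BulkAllocator block backs both buffers, and CubBuffer() is what gets passed to CUB, which flips buff.selector to mark the buffer holding the sorted output, as SegmentedSort below relies on.

const int n_vals = 1000, n_segments = 10;   // placeholder sizes, device 0
dh::CubMemory tmp_mem;
dh::BulkAllocator ba;
dh::DoubleBuffer<float> keys;
dh::DoubleBuffer<int> vals;
xgboost::common::Span<int> offsets;         // n_segments + 1 boundaries
ba.Allocate(0, &keys, n_vals, &vals, n_vals, &offsets, n_segments + 1);
// ... fill keys.CurrentSpan(), vals.CurrentSpan() and offsets ...
dh::SegmentedSort<float, int>(&tmp_mem, &keys, &vals, n_vals, n_segments,
                              offsets);
float *sorted_keys = keys.Current();        // whichever buffer CUB selected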
/**
* \brief Copies device span to std::vector.
*
* \tparam T Generic type parameter.
* \param [in,out] dst Copy destination.
* \param src Copy source. Must be device memory.
*/
template <typename T>
void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
CHECK_EQ(dst->size(), src.size());
dh::safe_cuda(cudaMemcpyAsync(dst->data(), src.data(), dst->size() * sizeof(T),
cudaMemcpyDeviceToHost));
}
/**
* \brief Copies std::vector to device span.
*
* \tparam T Generic type parameter.
* \param dst Copy destination. Must be device memory.
* \param src Copy source.
*/
template <typename T>
void CopyVectorToDeviceSpan(xgboost::common::Span<T> dst, const std::vector<T> &src)
{
CHECK_EQ(dst.size(), src.size());
dh::safe_cuda(cudaMemcpyAsync(dst.data(), src.data(), dst.size() * sizeof(T),
cudaMemcpyHostToDevice));
}
/**
* \brief Device to device memory copy from src to dst. Spans must be the same size. Use subspan to
* copy from a smaller array to a larger array.
*
* \tparam T Generic type parameter.
* \param dst Copy destination. Must be device memory.
* \param src Copy source. Must be device memory.
*/
template <typename T>
void CopyDeviceSpan(xgboost::common::Span<T> dst,
xgboost::common::Span<T> src) {
CHECK_EQ(dst.size(), src.size());
dh::safe_cuda(cudaMemcpyAsync(dst.data(), src.data(), dst.size() * sizeof(T),
cudaMemcpyDeviceToDevice));
}
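
A hypothetical round trip through these three helpers (device 0 and the sizes are illustrative); they all issue cudaMemcpyAsync on the default stream, so synchronize before reading the host-side result:

std::vector<int> h_src(100, 7);
std::vector<int> h_out(100);
xgboost::common::Span<int> d_a, d_b;
dh::BulkAllocator ba;
ba.Allocate(0, &d_a, h_src.size(), &d_b, h_src.size());
dh::CopyVectorToDeviceSpan(d_a, h_src);    // host -> device
dh::CopyDeviceSpan(d_b, d_a);              // device -> device, equal sizes
dh::CopyDeviceSpanToVector(&h_out, d_b);   // device -> host
dh::safe_cuda(cudaDeviceSynchronize());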
/*! \brief Helper for allocating large block of memory. */
-template <MemoryType MemoryT>
class BulkAllocator {
  std::vector<char *> d_ptr_;
  std::vector<size_t> size_;
@@ -413,70 +313,73 @@ class BulkAllocator {
  }
  template <typename T>
-  size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size) {
+  size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size) {
    return AlignRoundUp(first_size * sizeof(T));
  }
  template <typename T, typename... Args>
-  size_t GetSizeBytes(DVec<T> *first_vec, size_t first_size, Args... args) {
+  size_t GetSizeBytes(xgboost::common::Span<T> *first_vec, size_t first_size, Args... args) {
    return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
  }
  template <typename T>
-  void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
+  void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
                    size_t first_size) {
-    first_vec->ExternalAllocate(device_idx, static_cast<void *>(ptr),
-                                first_size);
+    *first_vec = xgboost::common::Span<T>(reinterpret_cast<T *>(ptr), first_size);
  }
  template <typename T, typename... Args>
-  void AllocateDVec(int device_idx, char *ptr, DVec<T> *first_vec,
+  void AllocateSpan(int device_idx, char *ptr, xgboost::common::Span<T> *first_vec,
                    size_t first_size, Args... args) {
-    AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
+    AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
    ptr += AlignRoundUp(first_size * sizeof(T));
-    AllocateDVec(device_idx, ptr, args...);
+    AllocateSpan(device_idx, ptr, args...);
  }
-  char *AllocateDevice(int device_idx, size_t bytes, MemoryType t) {
+  char *AllocateDevice(int device_idx, size_t bytes) {
    char *ptr;
    safe_cuda(cudaSetDevice(device_idx));
    safe_cuda(cudaMalloc(&ptr, bytes));
    return ptr;
  }
  template <typename T>
-  size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size) {
+  size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size) {
    return 2 * AlignRoundUp(first_size * sizeof(T));
  }
  template <typename T, typename... Args>
-  size_t GetSizeBytes(DVec2<T> *first_vec, size_t first_size, Args... args) {
+  size_t GetSizeBytes(DoubleBuffer<T> *first_vec, size_t first_size, Args... args) {
    return GetSizeBytes<T>(first_vec, first_size) + GetSizeBytes(args...);
  }
  template <typename T>
-  void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
+  void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec,
                    size_t first_size) {
-    first_vec->ExternalAllocate(
-        device_idx, static_cast<void *>(ptr),
-        static_cast<void *>(ptr + AlignRoundUp(first_size * sizeof(T))),
-        first_size);
+    auto ptr1 = reinterpret_cast<T *>(ptr);
+    auto ptr2 = ptr1 + first_size;
+    first_vec->a = xgboost::common::Span<T>(ptr1, first_size);
+    first_vec->b = xgboost::common::Span<T>(ptr2, first_size);
+    first_vec->buff.d_buffers[0] = ptr1;
+    first_vec->buff.d_buffers[1] = ptr2;
+    first_vec->buff.selector = 0;
  }
  template <typename T, typename... Args>
-  void AllocateDVec(int device_idx, char *ptr, DVec2<T> *first_vec,
+  void AllocateSpan(int device_idx, char *ptr, DoubleBuffer<T> *first_vec,
                    size_t first_size, Args... args) {
-    AllocateDVec<T>(device_idx, ptr, first_vec, first_size);
+    AllocateSpan<T>(device_idx, ptr, first_vec, first_size);
    ptr += (AlignRoundUp(first_size * sizeof(T)) * 2);
-    AllocateDVec(device_idx, ptr, args...);
+    AllocateSpan(device_idx, ptr, args...);
  }
 public:
  BulkAllocator() = default;
  // prevent accidental copying, moving or assignment of this object
-  BulkAllocator(const BulkAllocator<MemoryT>&) = delete;
-  BulkAllocator(BulkAllocator<MemoryT>&&) = delete;
-  void operator=(const BulkAllocator<MemoryT>&) = delete;
-  void operator=(BulkAllocator<MemoryT>&&) = delete;
+  BulkAllocator(const BulkAllocator&) = delete;
+  BulkAllocator(BulkAllocator&&) = delete;
+  void operator=(const BulkAllocator&) = delete;
+  void operator=(BulkAllocator&&) = delete;
  ~BulkAllocator() {
    for (size_t i = 0; i < d_ptr_.size(); i++) {
@@ -497,9 +400,9 @@ class BulkAllocator {
  void Allocate(int device_idx, Args... args) {
    size_t size = GetSizeBytes(args...);
-    char *ptr = AllocateDevice(device_idx, size, MemoryT);
-    AllocateDVec(device_idx, ptr, args...);
+    char *ptr = AllocateDevice(device_idx, size);
+    AllocateSpan(device_idx, ptr, args...);
    d_ptr_.push_back(ptr);
    size_.push_back(size);
@ -582,28 +485,6 @@ struct CubMemory {
* Utility functions * Utility functions
*/ */
template <typename T>
void Print(const DVec<T> &v, size_t max_items = 10) {
std::vector<T> h = v.as_vector();
for (size_t i = 0; i < std::min(max_items, h.size()); i++) {
std::cout << " " << h[i];
}
std::cout << "\n";
}
/**
* @brief Helper macro to measure timing on GPU
* @param call the GPU call
* @param name name used to track later
* @param stream cuda stream where to measure time
*/
#define TIMEIT(call, name) \
do { \
dh::Timer t1234; \
call; \
t1234.printElapsed(name); \
} while (0)
// Load balancing search // Load balancing search
template <typename CoordinateT, typename SegmentT, typename OffsetT> template <typename CoordinateT, typename SegmentT, typename OffsetT>
@@ -762,18 +643,18 @@ void TransformLbs(int device_idx, dh::CubMemory *temp_memory, OffsetT count,
 * @param offsets the segments
 */
template <typename T1, typename T2>
-void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
-                   dh::DVec2<T2> *vals, int nVals, int nSegs,
-                   const dh::DVec<int> &offsets, int start = 0,
+void SegmentedSort(dh::CubMemory *tmp_mem, dh::DoubleBuffer<T1> *keys,
+                   dh::DoubleBuffer<T2> *vals, int nVals, int nSegs,
+                   xgboost::common::Span<int> offsets, int start = 0,
                   int end = sizeof(T1) * 8) {
  size_t tmpSize;
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      NULL, tmpSize, keys->buff(), vals->buff(), nVals, nSegs, offsets.Data(),
-      offsets.Data() + 1, start, end));
+      NULL, tmpSize, keys->CubBuffer(), vals->CubBuffer(), nVals, nSegs,
+      offsets.data(), offsets.data() + 1, start, end));
  tmp_mem->LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
-      tmp_mem->d_temp_storage, tmpSize, keys->buff(), vals->buff(), nVals,
-      nSegs, offsets.Data(), offsets.Data() + 1, start, end));
+      tmp_mem->d_temp_storage, tmpSize, keys->CubBuffer(), vals->CubBuffer(),
+      nVals, nSegs, offsets.data(), offsets.data() + 1, start, end));
}
/**
@@ -784,14 +665,14 @@ void SegmentedSort(dh::CubMemory *tmp_mem, dh::DVec2<T1> *keys,
 * @param nVals number of elements in the input array
 */
template <typename T>
-void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
+void SumReduction(dh::CubMemory &tmp_mem, xgboost::common::Span<T> in, xgboost::common::Span<T> out,
                  int nVals) {
  size_t tmpSize;
  dh::safe_cuda(
-      cub::DeviceReduce::Sum(NULL, tmpSize, in.Data(), out.Data(), nVals));
+      cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
  tmp_mem.LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
-                                       in.Data(), out.Data(), nVals));
+                                       in.data(), out.data(), nVals));
}
/**


@ -19,18 +19,18 @@ namespace linear {
DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate);
void RescaleIndices(size_t ridx_begin, dh::DVec<xgboost::Entry> *data) { void RescaleIndices(int device_idx, size_t ridx_begin,
auto d_data = data->GetSpan(); common::Span<xgboost::Entry> data) {
dh::LaunchN(data->DeviceIdx(), data->Size(), dh::LaunchN(device_idx, data.size(),
[=] __device__(size_t idx) { d_data[idx].index -= ridx_begin; }); [=] __device__(size_t idx) { data[idx].index -= ridx_begin; });
} }
class DeviceShard { class DeviceShard {
int device_id_; int device_id_;
dh::BulkAllocator<dh::MemoryType::kDevice> ba_; dh::BulkAllocator ba_;
std::vector<size_t> row_ptr_; std::vector<size_t> row_ptr_;
dh::DVec<xgboost::Entry> data_; common::Span<xgboost::Entry> data_;
dh::DVec<GradientPair> gpair_; common::Span<GradientPair> gpair_;
dh::CubMemory temp_; dh::CubMemory temp_;
size_t ridx_begin_; size_t ridx_begin_;
size_t ridx_end_; size_t ridx_end_;
@ -73,12 +73,12 @@ class DeviceShard {
auto col = batch[fidx]; auto col = batch[fidx];
auto seg = column_segments[fidx]; auto seg = column_segments[fidx];
dh::safe_cuda(cudaMemcpy( dh::safe_cuda(cudaMemcpy(
data_.GetSpan().subspan(row_ptr_[fidx]).data(), data_.subspan(row_ptr_[fidx]).data(),
col.data() + seg.first, col.data() + seg.first,
sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice)); sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice));
} }
// Rescale indices with respect to current shard // Rescale indices with respect to current shard
RescaleIndices(ridx_begin_, &data_); RescaleIndices(device_id_, ridx_begin_, data_);
} }
bool IsEmpty() { bool IsEmpty() {
@ -87,8 +87,10 @@ class DeviceShard {
void UpdateGpair(const std::vector<GradientPair> &host_gpair, void UpdateGpair(const std::vector<GradientPair> &host_gpair,
const gbm::GBLinearModelParam &model_param) { const gbm::GBLinearModelParam &model_param) {
gpair_.copy(host_gpair.begin() + ridx_begin_ * model_param.num_output_group, dh::safe_cuda(cudaMemcpyAsync(
host_gpair.begin() + ridx_end_ * model_param.num_output_group); gpair_.data(),
host_gpair.data() + ridx_begin_ * model_param.num_output_group,
gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice));
} }
GradientPair GetBiasGradient(int group_idx, int num_group) { GradientPair GetBiasGradient(int group_idx, int num_group) {
@ -99,14 +101,14 @@ class DeviceShard {
}; // NOLINT }; // NOLINT
thrust::transform_iterator<decltype(f), decltype(counting), size_t> skip( thrust::transform_iterator<decltype(f), decltype(counting), size_t> skip(
counting, f); counting, f);
auto perm = thrust::make_permutation_iterator(gpair_.tbegin(), skip); auto perm = thrust::make_permutation_iterator(gpair_.data(), skip);
return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_); return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_);
} }
void UpdateBiasResidual(float dbias, int group_idx, int num_groups) { void UpdateBiasResidual(float dbias, int group_idx, int num_groups) {
if (dbias == 0.0f) return; if (dbias == 0.0f) return;
auto d_gpair = gpair_.GetSpan(); auto d_gpair = gpair_;
dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] __device__(size_t idx) { dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] __device__(size_t idx) {
auto &g = d_gpair[idx * num_groups + group_idx]; auto &g = d_gpair[idx * num_groups + group_idx];
g += GradientPair(g.GetHess() * dbias, 0); g += GradientPair(g.GetHess() * dbias, 0);
@ -115,9 +117,9 @@ class DeviceShard {
GradientPair GetGradient(int group_idx, int num_group, int fidx) { GradientPair GetGradient(int group_idx, int num_group, int fidx) {
dh::safe_cuda(cudaSetDevice(device_id_)); dh::safe_cuda(cudaSetDevice(device_id_));
common::Span<xgboost::Entry> d_col = data_.GetSpan().subspan(row_ptr_[fidx]); common::Span<xgboost::Entry> d_col = data_.subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = gpair_.GetSpan(); common::Span<GradientPair> d_gpair = gpair_;
auto counting = thrust::make_counting_iterator(0ull); auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) { auto f = [=] __device__(size_t idx) {
auto entry = d_col[idx]; auto entry = d_col[idx];
@ -131,8 +133,8 @@ class DeviceShard {
} }
void UpdateResidual(float dw, int group_idx, int num_groups, int fidx) { void UpdateResidual(float dw, int group_idx, int num_groups, int fidx) {
common::Span<GradientPair> d_gpair = gpair_.GetSpan(); common::Span<GradientPair> d_gpair = gpair_;
common::Span<Entry> d_col = data_.GetSpan().subspan(row_ptr_[fidx]); common::Span<Entry> d_col = data_.subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
dh::LaunchN(device_id_, col_size, [=] __device__(size_t idx) { dh::LaunchN(device_id_, col_size, [=] __device__(size_t idx) {
auto entry = d_col[idx]; auto entry = d_col[idx];


@@ -545,21 +545,21 @@ class GPUMaker : public TreeUpdater {
  /** whether we have initialized memory already (so as not to repeat!) */
  bool allocated_;
  /** feature values stored in column-major compressed format */
-  dh::DVec2<float> vals_;
-  dh::DVec<float> vals_cached_;
+  dh::DoubleBuffer<float> vals_;
+  common::Span<float> vals_cached_;
  /** corresponding instance id's of these feature values */
-  dh::DVec2<int> instIds_;
-  dh::DVec<int> inst_ids_cached_;
+  dh::DoubleBuffer<int> instIds_;
+  common::Span<int> inst_ids_cached_;
  /** column offsets for these feature values */
-  dh::DVec<int> colOffsets_;
-  dh::DVec<GradientPair> gradsInst_;
-  dh::DVec2<NodeIdT> nodeAssigns_;
-  dh::DVec2<int> nodeLocations_;
-  dh::DVec<DeviceNodeStats> nodes_;
-  dh::DVec<NodeIdT> node_assigns_per_inst_;
-  dh::DVec<GradientPair> gradsums_;
-  dh::DVec<GradientPair> gradscans_;
-  dh::DVec<ExactSplitCandidate> nodeSplits_;
+  common::Span<int> colOffsets_;
+  common::Span<GradientPair> gradsInst_;
+  dh::DoubleBuffer<NodeIdT> nodeAssigns_;
+  dh::DoubleBuffer<int> nodeLocations_;
+  common::Span<DeviceNodeStats> nodes_;
+  common::Span<NodeIdT> node_assigns_per_inst_;
+  common::Span<GradientPair> gradsums_;
+  common::Span<GradientPair> gradscans_;
+  common::Span<ExactSplitCandidate> nodeSplits_;
  int n_vals_;
  int n_rows_;
  int n_cols_;
@@ -571,10 +571,10 @@ class GPUMaker : public TreeUpdater {
  GPUSet devices_;
  dh::CubMemory tmp_mem_;
-  dh::DVec<GradientPair> tmpScanGradBuff_;
-  dh::DVec<int> tmp_scan_key_buff_;
-  dh::DVec<int> colIds_;
-  dh::BulkAllocator<dh::MemoryType::kDevice> ba_;
+  common::Span<GradientPair> tmpScanGradBuff_;
+  common::Span<int> tmp_scan_key_buff_;
+  common::Span<int> colIds_;
+  dh::BulkAllocator ba_;
public: public:
GPUMaker() : allocated_{false} {} GPUMaker() : allocated_{false} {}
@ -615,8 +615,8 @@ class GPUMaker : public TreeUpdater {
for (int i = 0; i < param_.max_depth; ++i) { for (int i = 0; i < param_.max_depth; ++i) {
if (i == 0) { if (i == 0) {
// make sure to start on a fresh tree with sorted values! // make sure to start on a fresh tree with sorted values!
vals_.CurrentDVec() = vals_cached_; dh::CopyDeviceSpan(vals_.CurrentSpan(), vals_cached_);
instIds_.CurrentDVec() = inst_ids_cached_; dh::CopyDeviceSpan(instIds_.CurrentSpan(), inst_ids_cached_);
TransferGrads(gpair); TransferGrads(gpair);
} }
int nNodes = 1 << i; int nNodes = 1 << i;
@ -630,13 +630,13 @@ class GPUMaker : public TreeUpdater {
} }
void Split2Node(int nNodes, NodeIdT nodeStart) { void Split2Node(int nNodes, NodeIdT nodeStart) {
auto d_nodes = nodes_.GetSpan(); auto d_nodes = nodes_;
auto d_gradScans = gradscans_.GetSpan(); auto d_gradScans = gradscans_;
auto d_gradsums = gradsums_.GetSpan(); auto d_gradsums = gradsums_;
auto d_nodeAssigns = nodeAssigns_.CurrentSpan(); auto d_nodeAssigns = nodeAssigns_.CurrentSpan();
auto d_colIds = colIds_.GetSpan(); auto d_colIds = colIds_;
auto d_vals = vals_.Current(); auto d_vals = vals_.Current();
auto d_nodeSplits = nodeSplits_.Data(); auto d_nodeSplits = nodeSplits_.data();
int nUniqKeys = nNodes; int nUniqKeys = nNodes;
float min_split_loss = param_.min_split_loss; float min_split_loss = param_.min_split_loss;
auto gpu_param = GPUTrainingParam(param_); auto gpu_param = GPUTrainingParam(param_);
@ -679,13 +679,13 @@ class GPUMaker : public TreeUpdater {
} }
void FindSplit(int level, NodeIdT nodeStart, int nNodes) { void FindSplit(int level, NodeIdT nodeStart, int nNodes) {
ReduceScanByKey(gradsums_.GetSpan(), gradscans_.GetSpan(), gradsInst_.GetSpan(), ReduceScanByKey(gradsums_, gradscans_, gradsInst_,
instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes, instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
n_cols_, tmpScanGradBuff_.GetSpan(), tmp_scan_key_buff_.GetSpan(), n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
colIds_.GetSpan(), nodeStart); colIds_, nodeStart);
ArgMaxByKey(nodeSplits_.GetSpan(), gradscans_.GetSpan(), gradsums_.GetSpan(), ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
vals_.CurrentSpan(), colIds_.GetSpan(), nodeAssigns_.CurrentSpan(), vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
nodes_.GetSpan(), nNodes, nodeStart, n_vals_, param_, nodes_, nNodes, nodeStart, n_vals_, param_,
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem); level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
Split2Node(nNodes, nodeStart); Split2Node(nNodes, nodeStart);
} }
@ -707,7 +707,7 @@ class GPUMaker : public TreeUpdater {
} }
std::vector<float> fval; std::vector<float> fval;
std::vector<int> fId; std::vector<int> fId;
std::vector<size_t> offset; std::vector<int> offset;
ConvertToCsc(dmat, &fval, &fId, &offset); ConvertToCsc(dmat, &fval, &fId, &offset);
AllocateAllData(static_cast<int>(offset.size())); AllocateAllData(static_cast<int>(offset.size()));
TransferAndSortData(fval, fId, offset); TransferAndSortData(fval, fId, offset);
@ -715,7 +715,7 @@ class GPUMaker : public TreeUpdater {
} }
void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval, void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval,
std::vector<int>* fId, std::vector<size_t>* offset) { std::vector<int>* fId, std::vector<int>* offset) {
const MetaInfo& info = dmat->Info(); const MetaInfo& info = dmat->Info();
CHECK(info.num_col_ < std::numeric_limits<int>::max()); CHECK(info.num_col_ < std::numeric_limits<int>::max());
CHECK(info.num_row_ < std::numeric_limits<int>::max()); CHECK(info.num_row_ < std::numeric_limits<int>::max());
@ -735,7 +735,7 @@ class GPUMaker : public TreeUpdater {
fval->push_back(e.fvalue); fval->push_back(e.fvalue);
fId->push_back(inst_id); fId->push_back(inst_id);
} }
offset->push_back(fval->size()); offset->push_back(static_cast<int>(fval->size()));
} }
} }
CHECK(fval->size() < std::numeric_limits<int>::max()); CHECK(fval->size() < std::numeric_limits<int>::max());
@ -744,19 +744,21 @@ class GPUMaker : public TreeUpdater {
void TransferAndSortData(const std::vector<float>& fval, void TransferAndSortData(const std::vector<float>& fval,
const std::vector<int>& fId, const std::vector<int>& fId,
const std::vector<size_t>& offset) { const std::vector<int>& offset) {
vals_.CurrentDVec() = fval; dh::CopyVectorToDeviceSpan(vals_.CurrentSpan(), fval);
instIds_.CurrentDVec() = fId; dh::CopyVectorToDeviceSpan(instIds_.CurrentSpan(), fId);
colOffsets_ = offset; dh::CopyVectorToDeviceSpan(colOffsets_, offset);
dh::SegmentedSort<float, int>(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_, dh::SegmentedSort<float, int>(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_,
colOffsets_); colOffsets_);
vals_cached_ = vals_.CurrentDVec(); dh::CopyDeviceSpan(vals_cached_, vals_.CurrentSpan());
inst_ids_cached_ = instIds_.CurrentDVec(); dh::CopyDeviceSpan(inst_ids_cached_, instIds_.CurrentSpan());
AssignColIds<<<n_cols_, 512>>>(colIds_.Data(), colOffsets_.Data()); AssignColIds<<<n_cols_, 512>>>(colIds_.data(), colOffsets_.data());
} }
void TransferGrads(HostDeviceVector<GradientPair>* gpair) { void TransferGrads(HostDeviceVector<GradientPair>* gpair) {
gpair->GatherTo(gradsInst_.tbegin(), gradsInst_.tend()); gpair->GatherTo(
thrust::device_pointer_cast(gradsInst_.data()),
thrust::device_pointer_cast(gradsInst_.data() + gradsInst_.size()));
// evaluate the full-grad reduction for the root node // evaluate the full-grad reduction for the root node
dh::SumReduction<GradientPair>(tmp_mem_, gradsInst_, gradsums_, n_rows_); dh::SumReduction<GradientPair>(tmp_mem_, gradsInst_, gradsums_, n_rows_);
} }
@ -764,14 +766,22 @@ class GPUMaker : public TreeUpdater {
void InitNodeData(int level, NodeIdT nodeStart, int nNodes) { void InitNodeData(int level, NodeIdT nodeStart, int nNodes) {
// all instances belong to root node at the beginning! // all instances belong to root node at the beginning!
if (level == 0) { if (level == 0) {
nodes_.Fill(DeviceNodeStats()); thrust::fill(thrust::device_pointer_cast(nodes_.data()),
nodeAssigns_.CurrentDVec().Fill(0); thrust::device_pointer_cast(nodes_.data() + nodes_.size()),
node_assigns_per_inst_.Fill(0); DeviceNodeStats());
thrust::fill(thrust::device_pointer_cast(nodeAssigns_.Current()),
thrust::device_pointer_cast(nodeAssigns_.Current() +
nodeAssigns_.Size()),
0);
thrust::fill(thrust::device_pointer_cast(node_assigns_per_inst_.data()),
thrust::device_pointer_cast(node_assigns_per_inst_.data() +
node_assigns_per_inst_.size()),
0);
// for root node, just update the gradient/score/weight/id info // for root node, just update the gradient/score/weight/id info
// before splitting it! Currently all data is on GPU, hence this // before splitting it! Currently all data is on GPU, hence this
// stupid little kernel // stupid little kernel
auto d_nodes = nodes_.Data(); auto d_nodes = nodes_;
auto d_sums = gradsums_.Data(); auto d_sums = gradsums_;
auto gpu_params = GPUTrainingParam(param_); auto gpu_params = GPUTrainingParam(param_);
dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) { dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params); d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
@ -781,17 +791,17 @@ class GPUMaker : public TreeUpdater {
const int ItemsPerThread = 4; const int ItemsPerThread = 4;
// assign default node ids first // assign default node ids first
int nBlks = dh::DivRoundUp(n_rows_, BlkDim); int nBlks = dh::DivRoundUp(n_rows_, BlkDim);
FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.Data(), FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.data(),
nodes_.Data(), n_rows_); nodes_.data(), n_rows_);
// evaluate the correct child indices of non-missing values next // evaluate the correct child indices of non-missing values next
nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread); nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
AssignNodeIds<<<nBlks, BlkDim>>>( AssignNodeIds<<<nBlks, BlkDim>>>(
node_assigns_per_inst_.Data(), nodeLocations_.Current(), node_assigns_per_inst_.data(), nodeLocations_.Current(),
nodeAssigns_.Current(), instIds_.Current(), nodes_.Data(), nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
colOffsets_.Data(), vals_.Current(), n_vals_, n_cols_); colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
// gather the node assignments across all other columns too // gather the node assignments across all other columns too
dh::Gather(param_.gpu_id, nodeAssigns_.Current(), dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
node_assigns_per_inst_.Data(), instIds_.Current(), n_vals_); node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
SortKeys(level); SortKeys(level);
} }
} }
@ -804,14 +814,14 @@ class GPUMaker : public TreeUpdater {
dh::Gather<float, int>(param_.gpu_id, vals_.other(), dh::Gather<float, int>(param_.gpu_id, vals_.other(),
vals_.Current(), instIds_.other(), instIds_.Current(), vals_.Current(), instIds_.other(), instIds_.Current(),
nodeLocations_.Current(), n_vals_); nodeLocations_.Current(), n_vals_);
vals_.buff().selector ^= 1; vals_.buff.selector ^= 1;
instIds_.buff().selector ^= 1; instIds_.buff.selector ^= 1;
} }
void MarkLeaves() { void MarkLeaves() {
const int BlkDim = 128; const int BlkDim = 128;
int nBlks = dh::DivRoundUp(maxNodes_, BlkDim); int nBlks = dh::DivRoundUp(maxNodes_, BlkDim);
MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.Data(), maxNodes_); MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.data(), maxNodes_);
} }
}; };


@ -254,10 +254,13 @@ XGBOOST_DEVICE inline bool IsLeftChild(int nidx) {
// Copy gpu dense representation of tree to xgboost sparse representation // Copy gpu dense representation of tree to xgboost sparse representation
inline void Dense2SparseTree(RegTree* p_tree, inline void Dense2SparseTree(RegTree* p_tree,
const dh::DVec<DeviceNodeStats>& nodes, common::Span<DeviceNodeStats> nodes,
const TrainParam& param) { const TrainParam& param) {
RegTree& tree = *p_tree; RegTree& tree = *p_tree;
std::vector<DeviceNodeStats> h_nodes = nodes.AsVector(); std::vector<DeviceNodeStats> h_nodes(nodes.size());
dh::safe_cuda(cudaMemcpy(h_nodes.data(), nodes.data(),
nodes.size() * sizeof(DeviceNodeStats),
cudaMemcpyDeviceToHost));
int nid = 0; int nid = 0;
for (int gpu_nid = 0; gpu_nid < h_nodes.size(); gpu_nid++) { for (int gpu_nid = 0; gpu_nid < h_nodes.size(); gpu_nid++) {
@ -298,18 +301,16 @@ struct BernoulliRng {
}; };
// Set gradient pair to 0 with p = 1 - subsample // Set gradient pair to 0 with p = 1 - subsample
inline void SubsampleGradientPair(dh::DVec<GradientPair>* p_gpair, float subsample, inline void SubsampleGradientPair(int device_idx,
int offset = 0) { common::Span<GradientPair> d_gpair,
float subsample, int offset = 0) {
if (subsample == 1.0) { if (subsample == 1.0) {
return; return;
} }
dh::DVec<GradientPair>& gpair = *p_gpair;
auto d_gpair = gpair.Data();
BernoulliRng rng(subsample, common::GlobalRandom()()); BernoulliRng rng(subsample, common::GlobalRandom()());
dh::LaunchN(gpair.DeviceIdx(), gpair.Size(), [=] XGBOOST_DEVICE(int i) { dh::LaunchN(device_idx, d_gpair.size(), [=] XGBOOST_DEVICE(int i) {
if (!rng(i + offset)) { if (!rng(i + offset)) {
d_gpair[i] = GradientPair(); d_gpair[i] = GradientPair();
} }


@ -601,7 +601,7 @@ struct DeviceShard {
int n_bins; int n_bins;
int device_id; int device_id;
dh::BulkAllocator<dh::MemoryType::kDevice> ba; dh::BulkAllocator ba;
ELLPackMatrix ellpack_matrix; ELLPackMatrix ellpack_matrix;
@ -610,27 +610,26 @@ struct DeviceShard {
DeviceHistogram<GradientSumT> hist; DeviceHistogram<GradientSumT> hist;
/*! \brief row_ptr form HistCutMatrix. */ /*! \brief row_ptr form HistCutMatrix. */
dh::DVec<uint32_t> feature_segments; common::Span<uint32_t> feature_segments;
/*! \brief minimum value for each feature. */ /*! \brief minimum value for each feature. */
dh::DVec<bst_float> min_fvalue; common::Span<bst_float> min_fvalue;
/*! \brief Cut. */ /*! \brief Cut. */
dh::DVec<bst_float> gidx_fvalue_map; common::Span<bst_float> gidx_fvalue_map;
/*! \brief global index of histogram, which is stored in ELLPack format. */ /*! \brief global index of histogram, which is stored in ELLPack format. */
dh::DVec<common::CompressedByteT> gidx_buffer; common::Span<common::CompressedByteT> gidx_buffer;
/*! \brief Row indices relative to this shard, necessary for sorting rows. */ /*! \brief Row indices relative to this shard, necessary for sorting rows. */
dh::DVec2<bst_uint> ridx; dh::DoubleBuffer<bst_uint> ridx;
dh::DoubleBuffer<int> position;
/*! \brief Gradient pair for each row. */ /*! \brief Gradient pair for each row. */
dh::DVec<GradientPair> gpair; common::Span<GradientPair> gpair;
dh::DVec2<int> position; common::Span<int> monotone_constraints;
common::Span<bst_float> prediction_cache;
dh::DVec<int> monotone_constraints;
dh::DVec<bst_float> prediction_cache;
/*! \brief Sum gradient for each node. */ /*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients; std::vector<GradientPair> node_sum_gradients;
dh::DVec<GradientPair> node_sum_gradients_d; common::Span<GradientPair> node_sum_gradients_d;
/*! \brief row offset in SparsePage (the input data). */ /*! \brief row offset in SparsePage (the input data). */
thrust::device_vector<size_t> row_ptrs; thrust::device_vector<size_t> row_ptrs;
/*! \brief On-device feature set, only actually used on one of the devices */ /*! \brief On-device feature set, only actually used on one of the devices */
@ -718,7 +717,9 @@ struct DeviceShard {
// Reset values for each update iteration // Reset values for each update iteration
void Reset(HostDeviceVector<GradientPair>* dh_gpair) { void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device_id));
position.CurrentDVec().Fill(0); thrust::fill(
thrust::device_pointer_cast(position.Current()),
thrust::device_pointer_cast(position.Current() + position.Size()), 0);
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
GradientPair()); GradientPair());
if (left_counts.size() < 256) { if (left_counts.size() < 256) {
@ -727,13 +728,16 @@ struct DeviceShard {
dh::safe_cuda(cudaMemsetAsync(left_counts.data().get(), 0, dh::safe_cuda(cudaMemsetAsync(left_counts.data().get(), 0,
sizeof(int64_t) * left_counts.size())); sizeof(int64_t) * left_counts.size()));
} }
thrust::sequence(ridx.CurrentDVec().tbegin(), ridx.CurrentDVec().tend()); thrust::sequence(
thrust::device_pointer_cast(ridx.CurrentSpan().data()),
thrust::device_pointer_cast(ridx.CurrentSpan().data() + ridx.Size()));
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0)); std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
ridx_segments.front() = Segment(0, ridx.Size()); ridx_segments.front() = Segment(0, ridx.Size());
this->gpair.copy(dh_gpair->tcbegin(device_id), dh::safe_cuda(cudaMemcpyAsync(
dh_gpair->tcend(device_id)); gpair.data(), dh_gpair->ConstDevicePointer(device_id),
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx); gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost));
SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx);
hist.Reset(); hist.Reset();
} }
@ -788,7 +792,7 @@ struct DeviceShard {
<<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>( <<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix, hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
gpu_param, d_split_candidates, value_constraints[nidx], gpu_param, d_split_candidates, value_constraints[nidx],
monotone_constraints.GetSpan()); monotone_constraints);
// Reduce over features to find best feature // Reduce over features to find best feature
auto d_result = d_result_all.subspan(i, 1); auto d_result = d_result_all.subspan(i, 1);
@ -943,8 +947,8 @@ struct DeviceShard {
void UpdatePredictionCache(bst_float* out_preds_d) { void UpdatePredictionCache(bst_float* out_preds_d) {
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device_id));
if (!prediction_cache_initialised) { if (!prediction_cache_initialised) {
dh::safe_cuda(cudaMemcpyAsync(prediction_cache.Data(), out_preds_d, dh::safe_cuda(cudaMemcpyAsync(prediction_cache.data(), out_preds_d,
prediction_cache.Size() * sizeof(bst_float), prediction_cache.size() * sizeof(bst_float),
cudaMemcpyDefault)); cudaMemcpyDefault));
} }
prediction_cache_initialised = true; prediction_cache_initialised = true;
@ -952,16 +956,16 @@ struct DeviceShard {
CalcWeightTrainParam param_d(param); CalcWeightTrainParam param_d(param);
dh::safe_cuda( dh::safe_cuda(
cudaMemcpyAsync(node_sum_gradients_d.Data(), node_sum_gradients.data(), cudaMemcpyAsync(node_sum_gradients_d.data(), node_sum_gradients.data(),
sizeof(GradientPair) * node_sum_gradients.size(), sizeof(GradientPair) * node_sum_gradients.size(),
cudaMemcpyHostToDevice)); cudaMemcpyHostToDevice));
auto d_position = position.Current(); auto d_position = position.Current();
auto d_ridx = ridx.Current(); auto d_ridx = ridx.Current();
auto d_node_sum_gradients = node_sum_gradients_d.Data(); auto d_node_sum_gradients = node_sum_gradients_d.data();
auto d_prediction_cache = prediction_cache.Data(); auto d_prediction_cache = prediction_cache.data();
dh::LaunchN( dh::LaunchN(
device_id, prediction_cache.Size(), [=] __device__(int local_idx) { device_id, prediction_cache.size(), [=] __device__(int local_idx) {
int pos = d_position[local_idx]; int pos = d_position[local_idx];
bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]); bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
d_prediction_cache[d_ridx[local_idx]] += d_prediction_cache[d_ridx[local_idx]] +=
@ -969,8 +973,8 @@ struct DeviceShard {
}); });
dh::safe_cuda(cudaMemcpy( dh::safe_cuda(cudaMemcpy(
out_preds_d, prediction_cache.Data(), out_preds_d, prediction_cache.data(),
prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault)); prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
} }
}; };
@ -981,7 +985,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
auto segment_begin = segment.begin; auto segment_begin = segment.begin;
auto d_node_hist = shard->hist.GetNodeHistogram(nidx); auto d_node_hist = shard->hist.GetNodeHistogram(nidx);
auto d_ridx = shard->ridx.Current(); auto d_ridx = shard->ridx.Current();
auto d_gpair = shard->gpair.Data(); auto d_gpair = shard->gpair.data();
auto n_elements = segment.Size() * shard->ellpack_matrix.row_stride; auto n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
@ -1006,7 +1010,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
Segment segment = shard->ridx_segments[nidx]; Segment segment = shard->ridx_segments[nidx];
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data(); auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
bst_uint* d_ridx = shard->ridx.Current(); bst_uint* d_ridx = shard->ridx.Current();
GradientPair* d_gpair = shard->gpair.Data(); GradientPair* d_gpair = shard->gpair.data();
size_t const n_elements = segment.Size() * shard->ellpack_matrix.row_stride; size_t const n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
auto d_matrix = shard->ellpack_matrix; auto d_matrix = shard->ellpack_matrix;
@ -1043,10 +1047,11 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
&gidx_fvalue_map, hmat.cut.size(), &gidx_fvalue_map, hmat.cut.size(),
&min_fvalue, hmat.min_val.size(), &min_fvalue, hmat.min_val.size(),
&monotone_constraints, param.monotone_constraints.size()); &monotone_constraints, param.monotone_constraints.size());
gidx_fvalue_map = hmat.cut;
min_fvalue = hmat.min_val; dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.cut);
feature_segments = hmat.row_ptr; dh::CopyVectorToDeviceSpan(min_fvalue, hmat.min_val);
monotone_constraints = param.monotone_constraints; dh::CopyVectorToDeviceSpan(feature_segments, hmat.row_ptr);
dh::CopyVectorToDeviceSpan(monotone_constraints, param.monotone_constraints);
node_sum_gradients.resize(max_nodes); node_sum_gradients.resize(max_nodes);
ridx_segments.resize(max_nodes); ridx_segments.resize(max_nodes);
@ -1063,14 +1068,16 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
<< "Max leaves and max depth cannot both be unconstrained for " << "Max leaves and max depth cannot both be unconstrained for "
"gpu_hist."; "gpu_hist.";
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes); ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
gidx_buffer.Fill(0); thrust::fill(
thrust::device_pointer_cast(gidx_buffer.data()),
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
this->CreateHistIndices(row_batch, row_stride, null_gidx_value); this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
ellpack_matrix.Init( ellpack_matrix.Init(
feature_segments.GetSpan(), min_fvalue.GetSpan(), feature_segments, min_fvalue,
gidx_fvalue_map.GetSpan(), row_stride, gidx_fvalue_map, row_stride,
common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols), common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols),
is_dense, null_gidx_value); is_dense, null_gidx_value);
// check if we can use shared memory for building histograms // check if we can use shared memory for building histograms
@ -1121,10 +1128,10 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
dh::DivRoundUp(row_stride, block3.y), 1); dh::DivRoundUp(row_stride, block3.y), 1);
CompressBinEllpackKernel<<<grid3, block3>>> CompressBinEllpackKernel<<<grid3, block3>>>
(common::CompressedBufferWriter(num_symbols), (common::CompressedBufferWriter(num_symbols),
gidx_buffer.Data(), gidx_buffer.data(),
row_ptrs.data().get() + batch_row_begin, row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(), entries_d.data().get(),
gidx_fvalue_map.Data(), feature_segments.Data(), gidx_fvalue_map.data(), feature_segments.data(),
batch_row_begin, batch_nrows, batch_row_begin, batch_nrows,
row_ptrs[batch_row_begin], row_ptrs[batch_row_begin],
row_stride, null_gidx_value); row_stride, null_gidx_value);
@ -1355,7 +1362,7 @@ class GPUHistMakerSpecialised{
[&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) { [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
dh::safe_cuda(cudaSetDevice(shard->device_id)); dh::safe_cuda(cudaSetDevice(shard->device_id));
tmp_sums[i] = dh::SumReduction( tmp_sums[i] = dh::SumReduction(
shard->temp_memory, shard->gpair.Data(), shard->gpair.Size()); shard->temp_memory, shard->gpair.data(), shard->gpair.size());
}); });
GradientPair sum_gradient = GradientPair sum_gradient =


@@ -7,6 +7,8 @@
#include "../../../src/common/device_helpers.cuh"
#include "gtest/gtest.h"
using xgboost::common::Span;
struct Shard { int id; };
TEST(DeviceHelpers, Basic) {
@@ -71,3 +73,20 @@ TEST(sumReduce, Test) {
  auto sum = dh::SumReduction(temp, dh::Raw(data), data.size());
  ASSERT_NEAR(sum, 100.0f, 1e-5);
}
void TestAllocator() {
int n = 10;
Span<float> a;
Span<int> b;
Span<size_t> c;
dh::BulkAllocator ba;
ba.Allocate(0, &a, n, &b, n, &c, n);
// Should be no illegal memory accesses
dh::LaunchN(0, n, [=] __device__(size_t idx) { c[idx] = a[idx] + b[idx]; });
dh::safe_cuda(cudaDeviceSynchronize());
}
// Define the test in a function so we can use device lambda
TEST(bulkAllocator, Test) { TestAllocator(); }


@ -56,8 +56,8 @@ TEST(GpuHist, BuildGidxDense) {
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param); DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
BuildGidx(&shard, kNRows, kNCols); BuildGidx(&shard, kNRows, kNCols);
std::vector<common::CompressedByteT> h_gidx_buffer; std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
h_gidx_buffer = shard.gidx_buffer.AsVector(); dh::CopyDeviceSpanToVector(&h_gidx_buffer, shard.gidx_buffer);
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25); common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_EQ(shard.ellpack_matrix.row_stride, kNCols); ASSERT_EQ(shard.ellpack_matrix.row_stride, kNCols);
@ -95,8 +95,8 @@ TEST(GpuHist, BuildGidxSparse) {
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param); DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
BuildGidx(&shard, kNRows, kNCols, 0.9f); BuildGidx(&shard, kNRows, kNCols, 0.9f);
std::vector<common::CompressedByteT> h_gidx_buffer; std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
h_gidx_buffer = shard.gidx_buffer.AsVector(); dh::CopyDeviceSpanToVector(&h_gidx_buffer, shard.gidx_buffer);
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25); common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
ASSERT_LE(shard.ellpack_matrix.row_stride, 3); ASSERT_LE(shard.ellpack_matrix.row_stride, 3);
@ -149,17 +149,14 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
gpair = GradientPair(grad, hess); gpair = GradientPair(grad, hess);
} }
thrust::device_vector<GradientPair> gpair (kNRows);
gpair = h_gpair;
int num_symbols = shard.n_bins + 1; int num_symbols = shard.n_bins + 1;
thrust::host_vector<common::CompressedByteT> h_gidx_buffer ( thrust::host_vector<common::CompressedByteT> h_gidx_buffer (
shard.gidx_buffer.Size()); shard.gidx_buffer.size());
common::CompressedByteT* d_gidx_buffer_ptr = shard.gidx_buffer.Data(); common::CompressedByteT* d_gidx_buffer_ptr = shard.gidx_buffer.data();
dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr, dh::safe_cuda(cudaMemcpy(h_gidx_buffer.data(), d_gidx_buffer_ptr,
sizeof(common::CompressedByteT) * shard.gidx_buffer.Size(), sizeof(common::CompressedByteT) * shard.gidx_buffer.size(),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
auto gidx = common::CompressedIterator<uint32_t>(h_gidx_buffer.data(), auto gidx = common::CompressedIterator<uint32_t>(h_gidx_buffer.data(),
num_symbols); num_symbols);
@ -167,9 +164,10 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
shard.ridx_segments.resize(1); shard.ridx_segments.resize(1);
shard.ridx_segments[0] = Segment(0, kNRows); shard.ridx_segments[0] = Segment(0, kNRows);
shard.hist.AllocateHistogram(0); shard.hist.AllocateHistogram(0);
shard.gpair.copy(gpair.begin(), gpair.end()); dh::CopyVectorToDeviceSpan(shard.gpair, h_gpair);
thrust::sequence(shard.ridx.CurrentDVec().tbegin(), thrust::sequence(
shard.ridx.CurrentDVec().tend()); thrust::device_pointer_cast(shard.ridx.Current()),
thrust::device_pointer_cast(shard.ridx.Current() + shard.ridx.Size()));
builder.Build(&shard, 0); builder.Build(&shard, 0);
DeviceHistogram<GradientSumT> d_hist = shard.hist; DeviceHistogram<GradientSumT> d_hist = shard.hist;
@ -262,14 +260,14 @@ TEST(GpuHist, EvaluateSplits) {
&(shard->min_fvalue), cmat.min_val.size(), &(shard->min_fvalue), cmat.min_val.size(),
&(shard->gidx_fvalue_map), 24, &(shard->gidx_fvalue_map), 24,
&(shard->monotone_constraints), kNCols); &(shard->monotone_constraints), kNCols);
shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); dh::CopyVectorToDeviceSpan(shard->feature_segments, cmat.row_ptr);
shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); dh::CopyVectorToDeviceSpan(shard->gidx_fvalue_map, cmat.cut);
shard->monotone_constraints.copy(param.monotone_constraints.begin(), dh::CopyVectorToDeviceSpan(shard->monotone_constraints,
param.monotone_constraints.end()); param.monotone_constraints);
shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan(); shard->ellpack_matrix.feature_segments = shard->feature_segments;
shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan(); shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map;
shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end()); dh::CopyVectorToDeviceSpan(shard->min_fvalue, cmat.min_val);
shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan(); shard->ellpack_matrix.min_fvalue = shard->min_fvalue;
// Initialize DeviceShard::hist // Initialize DeviceShard::hist
shard->hist.Init(0, (max_bins - 1) * kNCols); shard->hist.Init(0, (max_bins - 1) * kNCols);
@ -344,8 +342,9 @@ TEST(GpuHist, ApplySplit) {
shard->ba.Allocate(0, &(shard->ridx), kNRows, shard->ba.Allocate(0, &(shard->ridx), kNRows,
&(shard->position), kNRows); &(shard->position), kNRows);
shard->ellpack_matrix.row_stride = kNCols; shard->ellpack_matrix.row_stride = kNCols;
thrust::sequence(shard->ridx.CurrentDVec().tbegin(), thrust::sequence(
shard->ridx.CurrentDVec().tend()); thrust::device_pointer_cast(shard->ridx.Current()),
thrust::device_pointer_cast(shard->ridx.Current() + shard->ridx.Size()));
// Initialize GPUHistMaker // Initialize GPUHistMaker
hist_maker.param_ = param; hist_maker.param_ = param;
RegTree tree; RegTree tree;
@ -378,12 +377,12 @@ TEST(GpuHist, ApplySplit) {
&(shard->feature_segments), cmat.row_ptr.size(), &(shard->feature_segments), cmat.row_ptr.size(),
&(shard->min_fvalue), cmat.min_val.size(), &(shard->min_fvalue), cmat.min_val.size(),
&(shard->gidx_fvalue_map), 24); &(shard->gidx_fvalue_map), 24);
shard->feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end()); dh::CopyVectorToDeviceSpan(shard->feature_segments, cmat.row_ptr);
shard->gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end()); dh::CopyVectorToDeviceSpan(shard->gidx_fvalue_map, cmat.cut);
shard->ellpack_matrix.feature_segments = shard->feature_segments.GetSpan(); shard->ellpack_matrix.feature_segments = shard->feature_segments;
shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map.GetSpan(); shard->ellpack_matrix.gidx_fvalue_map = shard->gidx_fvalue_map;
shard->min_fvalue.copy(cmat.min_val.begin(), cmat.min_val.end()); dh::CopyVectorToDeviceSpan(shard->min_fvalue, cmat.min_val);
shard->ellpack_matrix.min_fvalue = shard->min_fvalue.GetSpan(); shard->ellpack_matrix.min_fvalue = shard->min_fvalue;
shard->ellpack_matrix.is_dense = true; shard->ellpack_matrix.is_dense = true;
common::CompressedBufferWriter wr(num_symbols); common::CompressedBufferWriter wr(num_symbols);
@ -394,10 +393,10 @@ TEST(GpuHist, ApplySplit) {
std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes); std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes);
wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end()); wr.Write(h_gidx_compressed.data(), h_gidx.begin(), h_gidx.end());
shard->gidx_buffer.copy(h_gidx_compressed.begin(), h_gidx_compressed.end()); dh::CopyVectorToDeviceSpan(shard->gidx_buffer, h_gidx_compressed);
shard->ellpack_matrix.gidx_iter = common::CompressedIterator<uint32_t>( shard->ellpack_matrix.gidx_iter = common::CompressedIterator<uint32_t>(
shard->gidx_buffer.Data(), num_symbols); shard->gidx_buffer.data(), num_symbols);
hist_maker.info_ = &info; hist_maker.info_ = &info;
hist_maker.ApplySplit(candidate_entry, &tree); hist_maker.ApplySplit(candidate_entry, &tree);