Modify caching allocator/vector and fix issues relating to the inability to train large datasets (#4615)

This commit is contained in:
sriramch 2019-07-08 23:33:27 -07:00 committed by Rory Mitchell
parent cd1526d3b1
commit 7a388cbf8b
5 changed files with 22 additions and 18 deletions

View File

@ -22,7 +22,6 @@
#include "./common/common.h"
#include "./common/config.h"
namespace xgboost {
enum CLITask {
@ -240,6 +239,7 @@ void CLITrain(const CLIParam& param) {
version += 1;
CHECK_EQ(version, rabit::VersionNumber());
}
LOG(INFO) << "Complete Training loop time: " << dmlc::GetTime() - start << " sec";
// always save final round
if ((param.save_period == 0 || param.num_round % param.save_period != 0) &&
param.model_out != "NONE" &&

View File

@ -305,11 +305,11 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
};
pointer allocate(size_t n) {
pointer ptr = super_t::allocate(n);
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n);
GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
return ptr;
}
void deallocate(pointer ptr, size_t n) {
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n);
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
return super_t::deallocate(ptr, n);
}
};
@ -329,19 +329,19 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
{
// Configure allocator with maximum cached bin size of ~512MB (bin_growth=2,
// min_bin=2^9, max_bin=2^29) and no limit on maximum cached bytes
static cub::CachingDeviceAllocator allocator(8,3,10);
return allocator;
static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
return *allocator;
}
pointer allocate(size_t n) {
T *ptr;
GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
n * sizeof(T));
pointer thrust_ptr = thrust::device_ptr<T>(ptr);
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n);
pointer thrust_ptr(ptr);
GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
return thrust_ptr;
}
void deallocate(pointer ptr, size_t n) {
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n);
GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
GetGlobalCachingAllocator().DeviceFree(ptr.get());
}
__host__ __device__
@ -363,6 +363,7 @@ template <typename T>
using device_vector = thrust::device_vector<T, XGBDeviceAllocator<T>>;
template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;
/**
* \brief A double buffer, useful for algorithms like sort.
*/
@ -376,9 +377,7 @@ class DoubleBuffer {
DoubleBuffer(VectorT *v1, VectorT *v2) {
a = xgboost::common::Span<T>(v1->data().get(), v1->size());
b = xgboost::common::Span<T>(v2->data().get(), v2->size());
buff.d_buffers[0] = v1->data().get();
buff.d_buffers[1] = v2->data().get();
buff.selector = 0;
buff = cub::DoubleBuffer<T>(a.data(), b.data());
}
size_t Size() const {

View File

@ -250,6 +250,10 @@ class GPUPredictor : public xgboost::Predictor {
struct DeviceShard {
DeviceShard() : device_{-1} {}
~DeviceShard() {
dh::safe_cuda(cudaSetDevice(device_));
}
void Init(int device) {
this->device_ = device;
max_shared_memory_bytes_ = dh::MaxSharedMemory(this->device_);

View File

@ -611,8 +611,6 @@ struct DeviceShard {
/*! \brief Sum gradient for each node. */
std::vector<GradientPair> node_sum_gradients;
common::Span<GradientPair> node_sum_gradients_d;
/*! \brief On-device feature set, only actually used on one of the devices */
dh::device_vector<int> feature_set_d;
/*! The row offset for this shard. */
bst_uint row_begin_idx;
bst_uint row_end_idx;
@ -700,6 +698,7 @@ struct DeviceShard {
this->interaction_constraints.Reset();
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
GradientPair());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner.reset(new RowPartitioner(device_id, n_rows));
dh::safe_cuda(cudaMemcpyAsync(
@ -921,6 +920,7 @@ struct DeviceShard {
dh::safe_cuda(cudaMemcpy(
out_preds_d, prediction_cache.data(),
prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
row_partitioner.reset();
}
void AllReduceHist(int nidx, dh::AllReducer* reducer) {

View File

@ -11,15 +11,16 @@ namespace tree {
void TestSortPosition(const std::vector<int>& position_in, int left_idx,
int right_idx) {
dh::safe_cuda(cudaSetDevice(0));
std::vector<int64_t> left_count = {
std::count(position_in.begin(), position_in.end(), left_idx)};
thrust::device_vector<int64_t> d_left_count = left_count;
thrust::device_vector<int> position = position_in;
thrust::device_vector<int> position_out(position.size());
dh::caching_device_vector<int64_t> d_left_count = left_count;
dh::caching_device_vector<int> position = position_in;
dh::caching_device_vector<int> position_out(position.size());
thrust::device_vector<RowPartitioner::RowIndexT> ridx(position.size());
dh::caching_device_vector<RowPartitioner::RowIndexT> ridx(position.size());
thrust::sequence(ridx.begin(), ridx.end());
thrust::device_vector<RowPartitioner::RowIndexT> ridx_out(ridx.size());
dh::caching_device_vector<RowPartitioner::RowIndexT> ridx_out(ridx.size());
RowPartitioner rp(0,10);
rp.SortPosition(
common::Span<int>(position.data().get(), position.size()),