diff --git a/src/cli_main.cc b/src/cli_main.cc index 2901eeec7..bed85d8b6 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -22,7 +22,6 @@ #include "./common/common.h" #include "./common/config.h" - namespace xgboost { enum CLITask { @@ -240,6 +239,7 @@ void CLITrain(const CLIParam& param) { version += 1; CHECK_EQ(version, rabit::VersionNumber()); } + LOG(INFO) << "Complete Training loop time: " << dmlc::GetTime() - start << " sec"; // always save final round if ((param.save_period == 0 || param.num_round % param.save_period != 0) && param.model_out != "NONE" && diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 9a678b029..b7a7dcca3 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -305,11 +305,11 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator { }; pointer allocate(size_t n) { pointer ptr = super_t::allocate(n); - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n); + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); return ptr; } void deallocate(pointer ptr, size_t n) { - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); return super_t::deallocate(ptr, n); } }; @@ -329,19 +329,19 @@ struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { { // Configure allocator with maximum cached bin size of ~1GB and no limit on // maximum cached bytes - static cub::CachingDeviceAllocator allocator(8,3,10); - return allocator; + static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + return *allocator; } pointer allocate(size_t n) { T *ptr; GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&ptr), n * sizeof(T)); - pointer thrust_ptr = thrust::device_ptr(ptr); - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n); + pointer thrust_ptr(ptr); + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); return thrust_ptr; } void deallocate(pointer ptr, size_t n) { - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n); + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); GetGlobalCachingAllocator().DeviceFree(ptr.get()); } __host__ __device__ @@ -363,6 +363,7 @@ template using device_vector = thrust::device_vector>; template using caching_device_vector = thrust::device_vector>; + /** * \brief A double buffer, useful for algorithms like sort. */ @@ -376,9 +377,7 @@ class DoubleBuffer { DoubleBuffer(VectorT *v1, VectorT *v2) { a = xgboost::common::Span(v1->data().get(), v1->size()); b = xgboost::common::Span(v2->data().get(), v2->size()); - buff.d_buffers[0] = v1->data().get(); - buff.d_buffers[1] = v2->data().get(); - buff.selector = 0; + buff = cub::DoubleBuffer(a.data(), b.data()); } size_t Size() const { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index d67c2963c..f1f327aba 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -250,6 +250,10 @@ class GPUPredictor : public xgboost::Predictor { struct DeviceShard { DeviceShard() : device_{-1} {} + ~DeviceShard() { + dh::safe_cuda(cudaSetDevice(device_)); + } + void Init(int device) { this->device_ = device; max_shared_memory_bytes_ = dh::MaxSharedMemory(this->device_); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index bd43e40e5..6bf86f178 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -611,8 +611,6 @@ struct DeviceShard { /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; common::Span node_sum_gradients_d; - /*! \brief On-device feature set, only actually used on one of the devices */ - dh::device_vector feature_set_d; /*! The row offset for this shard. */ bst_uint row_begin_idx; bst_uint row_end_idx; @@ -700,6 +698,7 @@ struct DeviceShard { this->interaction_constraints.Reset(); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPair()); + row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner.reset(new RowPartitioner(device_id, n_rows)); dh::safe_cuda(cudaMemcpyAsync( @@ -921,6 +920,7 @@ struct DeviceShard { dh::safe_cuda(cudaMemcpy( out_preds_d, prediction_cache.data(), prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault)); + row_partitioner.reset(); } void AllReduceHist(int nidx, dh::AllReducer* reducer) { diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 172fd899b..154d2030c 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -11,15 +11,16 @@ namespace tree { void TestSortPosition(const std::vector& position_in, int left_idx, int right_idx) { + dh::safe_cuda(cudaSetDevice(0)); std::vector left_count = { std::count(position_in.begin(), position_in.end(), left_idx)}; - thrust::device_vector d_left_count = left_count; - thrust::device_vector position = position_in; - thrust::device_vector position_out(position.size()); + dh::caching_device_vector d_left_count = left_count; + dh::caching_device_vector position = position_in; + dh::caching_device_vector position_out(position.size()); - thrust::device_vector ridx(position.size()); + dh::caching_device_vector ridx(position.size()); thrust::sequence(ridx.begin(), ridx.end()); - thrust::device_vector ridx_out(ridx.size()); + dh::caching_device_vector ridx_out(ridx.size()); RowPartitioner rp(0,10); rp.SortPosition( common::Span(position.data().get(), position.size()),