diff --git a/demo/c-api/c-api-demo.c b/demo/c-api/c-api-demo.c
index e107876cb..9024badcd 100644
--- a/demo/c-api/c-api-demo.c
+++ b/demo/c-api/c-api-demo.c
@@ -36,13 +36,12 @@ int main(int argc, char** argv) {
   // https://xgboost.readthedocs.io/en/latest/parameter.html
   safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
   if (use_gpu) {
-    // set the number of GPUs and the first GPU to use;
+    // set the GPU to use;
     // this is not necessary, but provided here as an illustration
-    safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "1"));
     safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
   } else {
     // avoid evaluating objective and metric on a GPU
-    safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "0"));
+    safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
   }
 
   safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index e4cef9cf0..47084ec4b 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -19,10 +19,8 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
   // number of threads to use if OpenMP is enabled
   // if equals 0, use system default
   int nthread;
-  // primary device.
+  // primary device, -1 means no gpu.
   int gpu_id;
-  // number of devices to use, -1 implies using all available devices.
-  int n_gpus;
   // declare parameters
   DMLC_DECLARE_PARAMETER(GenericParameter) {
     DMLC_DECLARE_FIELD(seed).set_default(0).describe(
@@ -36,15 +34,20 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
     DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
         "Number of threads to use.");
     DMLC_DECLARE_FIELD(gpu_id)
-        .set_default(0)
+        .set_default(-1)
+        .set_lower_bound(-1)
        .describe("The primary GPU device ordinal.");
     DMLC_DECLARE_FIELD(n_gpus)
         .set_default(0)
-        .set_range(0, 1)
+        .set_range(0, 0)
         .describe("Deprecated. Single process multi-GPU training is no longer supported. "
                   "Please switch to distributed training with one process per GPU. "
                   "This can be done using Dask or Spark.");
   }
+
+ private:
+  // number of devices to use (deprecated).
+  int n_gpus;
 };
 }  // namespace xgboost
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
index 6ce653875..b6cb174de 100644
--- a/plugin/example/custom_obj.cc
+++ b/plugin/example/custom_obj.cc
@@ -60,8 +60,8 @@ class MyLogistic : public ObjFunction {
   void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
     // transform margin value to probability.
     std::vector<bst_float> &preds = io_preds->HostVector();
-    for (size_t i = 0; i < preds.size(); ++i) {
-      preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
+    for (auto& pred : preds) {
+      pred = 1.0f / (1.0f + std::exp(-pred));
     }
   }
   bst_float ProbToMargin(bst_float base_score) const override {
diff --git a/src/common/common.cc b/src/common/common.cc
index 60ba3e16e..8f4f4b5c8 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
 GlobalRandomEngine& GlobalRandom() {
   return RandomThreadLocalStore::Get()->engine;
 }
-}  // namespace common
 
 #if !defined(XGBOOST_USE_CUDA)
-int AllVisibleImpl::AllVisible() {
+int AllVisibleGPUs() {
   return 0;
 }
 #endif  // !defined(XGBOOST_USE_CUDA)
 
-constexpr GPUSet::GpuIdType GPUSet::kAll;
-
-GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
-  CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
-  CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
-
-  GpuIdType const n_devices_visible = AllVisible().Size();
-  CHECK_LE(n_gpus, n_devices_visible);
-  if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
-    LOG(DEBUG) << "Runing on CPU.";
-    return Empty();
-  }
-
-  GpuIdType const n_available_devices = n_devices_visible - gpu_id;
-
-  if (n_gpus == kAll) {  // Use all devices starting from `gpu_id'.
-    CHECK(gpu_id < n_devices_visible)
-        << "\ngpu_id should be less than number of visible devices.\ngpu_id: "
-        << gpu_id
-        << ", number of visible devices: "
-        << n_devices_visible;
-    GpuIdType n_devices =
-        n_available_devices < n_rows ? n_available_devices : n_rows;
-    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
-    return Range(gpu_id, n_devices);
-  } else {  // Use devices in ( gpu_id, gpu_id + n_gpus ).
-    CHECK_LE(n_gpus, n_available_devices)
-        << "Starting from gpu id: " << gpu_id << ", there are only "
-        << n_available_devices << " available devices, while n_gpus is set to: "
-        << n_gpus;
-    GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
-    LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
-    return Range(gpu_id, n_devices);
-  }
-}
-
+}  // namespace common
 }  // namespace xgboost
diff --git a/src/common/common.cu b/src/common/common.cu
index 43fa4d465..d30fbc0ae 100644
--- a/src/common/common.cu
+++ b/src/common/common.cu
@@ -4,8 +4,9 @@
 #include "common.h"
 
 namespace xgboost {
+namespace common {
 
-int AllVisibleImpl::AllVisible() {
+int AllVisibleGPUs() {
   int n_visgpus = 0;
   try {
     // When compiled with CUDA but running on CPU only device,
@@ -17,4 +18,5 @@
   return n_visgpus;
 }
 
+}  // namespace common
 }  // namespace xgboost
diff --git a/src/common/common.h b/src/common/common.h
index 9eab46add..ccff63eb7 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -140,88 +140,8 @@ class Range {
   Iterator begin_;
   Iterator end_;
 };
+
+int AllVisibleGPUs();
 }  // namespace common
-
-struct AllVisibleImpl {
-  static int AllVisible();
-};
-/* \brief set of devices across which HostDeviceVector can be distributed.
- *
- * Currently implemented as a range, but can be changed later to something else,
- * e.g. a bitset
- */
-class GPUSet {
- public:
-  using GpuIdType = int;
-  static constexpr GpuIdType kAll = -1;
-
-  explicit GPUSet(int start = 0, int ndevices = 0)
-      : devices_(start, start + ndevices) {}
-
-  static GPUSet Empty() { return GPUSet(); }
-
-  static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
-    return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
-  }
-  /*! \brief n_gpus and num_rows both are upper bounds. */
-  static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
-                    GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());
-
-  static GPUSet AllVisible() {
-    GpuIdType n = AllVisibleImpl::AllVisible();
-    return Range(0, n);
-  }
-
-  size_t Size() const {
-    GpuIdType size = *devices_.end() - *devices_.begin();
-    GpuIdType res = size < 0 ? 0 : size;
-    return static_cast<size_t>(res);
-  }
-
-  /*
-   * By default, we have two configurations of identifying device, one
-   * is the device id obtained from `cudaGetDevice'.  But we sometimes
-   * store objects that allocated one for each device in a list, which
-   * requires a zero-based index.
-   *
-   * Hence, `DeviceId' converts a zero-based index to actual device id,
-   * `Index' converts a device id to a zero-based index.
-   */
-  GpuIdType DeviceId(size_t index) const {
-    GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
-    CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
-                            << "\nIndex: " << index
-                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
-                            << std::endl;
-    return result;
-  }
-  size_t Index(GpuIdType device) const {
-    CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
-                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
-                            << std::endl;
-    size_t result = static_cast<size_t>(device - *devices_.begin());
-    return result;
-  }
-
-  bool IsEmpty() const { return Size() == 0; }
-
-  bool Contains(GpuIdType device) const {
-    return *devices_.begin() <= device && device < *devices_.end();
-  }
-
-  common::Range::Iterator begin() const { return devices_.begin(); }  // NOLINT
-  common::Range::Iterator end() const { return devices_.end(); }  // NOLINT
-
-  friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
-    return lhs.devices_ == rhs.devices_;
-  }
-  friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
-    return !(lhs == rhs);
-  }
-
- private:
-  common::Range devices_;
-};
-
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 0eee94009..979167459 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) {  // NOLINT
   return raw_pointer_cast(v.data());
 }
 
-// if n_devices=-1, then use all visible devices
-inline void SynchronizeNDevices(xgboost::GPUSet devices) {
-  devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
-  for (auto const d : devices) {
-    safe_cuda(cudaSetDevice(d));
-    safe_cuda(cudaDeviceSynchronize());
-  }
-}
-
-inline void SynchronizeAll() {
-  for (int device_idx : xgboost::GPUSet::AllVisible()) {
-    safe_cuda(cudaSetDevice(device_idx));
-    safe_cuda(cudaDeviceSynchronize());
-  }
-}
-
 inline size_t AvailableMemory(int device_idx) {
   size_t device_free = 0;
   size_t device_total = 0;
@@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
 }
 
 inline void CheckComputeCapability() {
-  for (int d_idx : xgboost::GPUSet::AllVisible()) {
+  for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
     cudaDeviceProp prop;
     safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
     std::ostringstream oss;
diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 4131c59ed..d1ef37df1 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -35,7 +35,6 @@ __global__ void FindCutsK
   if (icut >= ncuts) {
     return;
   }
-  WXQSketch::Entry v;
   int isample = 0;
   if (icut == 0) {
     isample = 0;
@@ -59,11 +58,14 @@ struct IsNotNaN {
   __device__ bool operator()(float a) const { return !isnan(a); }
 };
 
-__global__ void UnpackFeaturesK
-(float* __restrict__ fvalues, float* __restrict__ feature_weights,
- const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
- Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,
- size_t nrows) {
+__global__ void UnpackFeaturesK(float* __restrict__ fvalues,
+                                float* __restrict__ feature_weights,
+                                const size_t* __restrict__ row_ptrs,
+                                const float* __restrict__ weights,
+                                Entry* entries,
+                                size_t nrows_array,
+                                size_t row_begin_ptr,
+                                size_t nrows) {
   size_t irow = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
   if (irow >= nrows) {
     return;
@@ -102,8 +104,9 @@ struct SketchContainer {
     const MetaInfo &info = dmat->Info();
     // Initialize Sketches for this dmatrix
     sketches_.resize(info.num_col_);
-#pragma omp parallel for schedule(static) if (info.num_col_ > kOmpNumColsParallelizeLimit)
-    for (int icol = 0; icol < info.num_col_; ++icol) {
+#pragma omp parallel for default(none) shared(info, param) schedule(static) \
+if (info.num_col_ > kOmpNumColsParallelizeLimit)  // NOLINT
+    for (int icol = 0; icol < info.num_col_; ++icol) {  // NOLINT
       sketches_[icol].Init(info.num_row_, 1.0 / (8 * param.max_bin));
     }
   }
@@ -120,8 +123,6 @@ struct GPUSketcher {
   // manage memory for a single GPU
   class DeviceShard {
     int device_;
-    bst_uint row_begin_;  // The row offset for this shard
-    bst_uint row_end_;
     bst_uint n_rows_;
     int num_cols_{0};
     size_t n_cuts_{0};
@@ -131,27 +132,31 @@ struct GPUSketcher {
     tree::TrainParam param_;
     SketchContainer *sketch_container_;
 
-    dh::device_vector<size_t> row_ptrs_;
-    dh::device_vector<Entry> entries_;
-    dh::device_vector<bst_float> fvalues_;
-    dh::device_vector<bst_float> feature_weights_;
-    dh::device_vector<bst_float> fvalues_cur_;
-    dh::device_vector<WXQSketch::Entry> cuts_d_;
-    thrust::host_vector<WXQSketch::Entry> cuts_h_;
-    dh::device_vector<bst_float> weights_;
-    dh::device_vector<bst_float> weights2_;
-    std::vector<size_t> n_cuts_cur_;
-    dh::device_vector<size_t> num_elements_;
-    dh::device_vector<char> tmp_storage_;
+    dh::device_vector<size_t> row_ptrs_{};
+    dh::device_vector<Entry> entries_{};
+    dh::device_vector<bst_float> fvalues_{};
+    dh::device_vector<bst_float> feature_weights_{};
+    dh::device_vector<bst_float> fvalues_cur_{};
+    dh::device_vector<WXQSketch::Entry> cuts_d_{};
+    thrust::host_vector<WXQSketch::Entry> cuts_h_{};
+    dh::device_vector<bst_float> weights_{};
+    dh::device_vector<bst_float> weights2_{};
+    std::vector<size_t> n_cuts_cur_{};
+    dh::device_vector<size_t> num_elements_{};
+    dh::device_vector<char> tmp_storage_{};
 
    public:
-    DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
-                tree::TrainParam param, SketchContainer *sketch_container) :
-        device_(device), row_begin_(row_begin), row_end_(row_end),
-        n_rows_(row_end - row_begin), param_(std::move(param)), sketch_container_(sketch_container) {
+    DeviceShard(int device,
+                bst_uint n_rows,
+                tree::TrainParam param,
+                SketchContainer* sketch_container) :
+        device_(device),
+        n_rows_(n_rows),
+        param_(std::move(param)),
+        sketch_container_(sketch_container) {
     }
 
-    ~DeviceShard() {
+    ~DeviceShard() {  // NOLINT
       dh::safe_cuda(cudaSetDevice(device_));
     }
 
@@ -319,19 +324,18 @@
       const auto& offset_vec = row_batch.offset.HostVector();
       const auto& data_vec = row_batch.data.HostVector();
 
-      size_t n_entries = offset_vec[row_begin_ + batch_row_end] -
-        offset_vec[row_begin_ + batch_row_begin];
+      size_t n_entries = offset_vec[batch_row_end] - offset_vec[batch_row_begin];
       // copy the batch to the GPU
       dh::safe_cuda
         (cudaMemcpyAsync(entries_.data().get(),
-                         data_vec.data() + offset_vec[row_begin_ + batch_row_begin],
+                         data_vec.data() + offset_vec[batch_row_begin],
                          n_entries * sizeof(Entry), cudaMemcpyDefault));
       // copy the weights if necessary
       if (has_weights_) {
         const auto& weights_vec = info.weights_.HostVector();
         dh::safe_cuda
           (cudaMemcpyAsync(weights_.data().get(),
-                           weights_vec.data() + row_begin_ + batch_row_begin,
+                           weights_vec.data() + batch_row_begin,
                            batch_nrows * sizeof(bst_float), cudaMemcpyDefault));
       }
 
@@ -349,8 +353,7 @@
        (fvalues_.data().get(),
         has_weights_ ? feature_weights_.data().get() : nullptr,
         row_ptrs_.data().get() + batch_row_begin,
         has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),
-         gpu_batch_nrows_, num_cols_,
-         offset_vec[row_begin_ + batch_row_begin], batch_nrows);
+         gpu_batch_nrows_, offset_vec[batch_row_begin], batch_nrows);
 
       for (int icol = 0; icol < num_cols_; ++icol) {
         FindColumnCuts(batch_nrows, icol);
@@ -358,7 +361,7 @@
 
       // add cuts into sketches
       thrust::copy(cuts_d_.begin(), cuts_d_.end(), cuts_h_.begin());
-#pragma omp parallel for schedule(static) \
+#pragma omp parallel for default(none) schedule(static) \
 if (num_cols_ > SketchContainer::kOmpNumColsParallelizeLimit)  // NOLINT
       for (int icol = 0; icol < num_cols_; ++icol) {
         WXQSketch::SummaryContainer summary;
@@ -391,8 +394,7 @@
       dh::safe_cuda(cudaSetDevice(device_));
       const auto& offset_vec = row_batch.offset.HostVector();
       row_ptrs_.resize(n_rows_ + 1);
-      thrust::copy(offset_vec.data() + row_begin_,
-                   offset_vec.data() + row_end_ + 1, row_ptrs_.begin());
+      thrust::copy(offset_vec.data(), offset_vec.data() + n_rows_ + 1, row_ptrs_.begin());
       size_t gpu_nbatches = common::DivRoundUp(n_rows_, gpu_batch_nrows_);
       for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
         SketchBatch(row_batch, info, gpu_batch);
@@ -401,32 +403,18 @@
   };
 
   void SketchBatch(const SparsePage &batch, const MetaInfo &info) {
-    GPUDistribution dist =
-        GPUDistribution::Block(GPUSet::All(generic_param_.gpu_id, generic_param_.n_gpus,
-                                           batch.Size()));
+    auto device = generic_param_.gpu_id;
 
-    // create device shards
-    shards_.resize(dist.Devices().Size());
-    dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
-        size_t start = dist.ShardStart(batch.Size(), i);
-        size_t size = dist.ShardSize(batch.Size(), i);
-        shard = std::unique_ptr<DeviceShard>(
-            new DeviceShard(dist.Devices().DeviceId(i), start,
-                            start + size, param_, sketch_container_.get()));
-      });
+    // create device shard
+    shard_.reset(new DeviceShard(device, batch.Size(), param_, sketch_container_.get()));
 
-    // compute sketches for each shard
-    dh::ExecuteIndexShards(&shards_,
-                           [&](int idx, std::unique_ptr<DeviceShard>& shard) {
-                             shard->Init(batch, info, gpu_batch_nrows_);
-                             shard->Sketch(batch, info);
-                             shard->ComputeRowStride();
-                           });
+    // compute sketches for the shard
+    shard_->Init(batch, info, gpu_batch_nrows_);
+    shard_->Sketch(batch, info);
+    shard_->ComputeRowStride();
 
-    // compute row stride across all shards
-    for (const auto &shard : shards_) {
-      row_stride_ = std::max(row_stride_, shard->GetRowStride());
-    }
+    // compute row stride
+    row_stride_ = shard_->GetRowStride();
   }
 
   GPUSketcher(const tree::TrainParam &param, const GenericParameter &generic_param, int gpu_nrows)
@@ -444,13 +432,13 @@
     this->SketchBatch(batch, info);
   }
 
-  hmat->Init(&sketch_container_.get()->sketches_, param_.max_bin);
+  hmat->Init(&sketch_container_->sketches_, param_.max_bin);
 
   return row_stride_;
 }
 
  private:
-  std::vector<std::unique_ptr<DeviceShard>> shards_;
+  std::unique_ptr<DeviceShard> shard_;
   const tree::TrainParam &param_;
   const GenericParameter &generic_param_;
   int gpu_batch_nrows_;
diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
index 6ccb1a661..c28218b4e 100644
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -30,19 +30,19 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, const GPUDistribution &)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
   : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -75,29 +75,23 @@
 template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
 
 template <typename T>
-GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
+int HostDeviceVector<T>::DeviceIdx() const { return -1; }
 
 template <typename T>
-const GPUDistribution& HostDeviceVector<T>::Distribution() const {
-  static GPUDistribution dummyInstance;
-  return dummyInstance;
-}
+T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer(int device) { return nullptr; }
-
-template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
+const T* HostDeviceVector<T>::ConstDevicePointer() const {
   return nullptr;
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
+common::Span<T> HostDeviceVector<T>::DeviceSpan() {
   return common::Span<T>();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
   return common::Span<const T>();
 }
@@ -115,10 +109,7 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) const { return 0; }
-
-template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) const { return 0; }
+size_t HostDeviceVector<T>::DeviceSize() const { return 0; }
 
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
@@ -149,18 +140,12 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
 }
 
 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
+bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
   return false;
 }
 
 template <typename T>
-void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
-
-template <typename T>
-void HostDeviceVector<T>::Shard(GPUSet devices) const { }
-
-template <typename T>
-void Reshard(const GPUDistribution &distribution) { }
+void HostDeviceVector<T>::SetDevice(int device) const {}
 
 // explicit instantiations are required, as HostDeviceVector<T> isn't header-only
 template class HostDeviceVector<bst_float>;
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index bbf8af690..3205e8fd1 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -10,7 +10,6 @@
 #include <mutex>
 #include "./device_helpers.cuh"
-
 namespace xgboost {
 
 // the handler to call instead of cudaSetDevice; only used for testing
@@ -43,144 +42,12 @@ class Permissions {
 };
 
 template <typename T>
-struct HostDeviceVectorImpl {
-  struct DeviceShard {
-    DeviceShard()
-      : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
-        cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
-
-    ~DeviceShard() {
+class HostDeviceVectorImpl {
+ public:
+  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device), perm_h_(device < 0) {
+    if (device >= 0) {
       SetDevice();
-    }
-
-    void Init(HostDeviceVectorImpl* vec, int device) {
-      if (vec_ == nullptr) { vec_ = vec; }
-      CHECK_EQ(vec, vec_);
-      device_ = device;
-      LazyResize(vec_->Size());
-      perm_d_ = vec_->perm_h_.Complementary();
-    }
-
-    void Init(HostDeviceVectorImpl* vec, const DeviceShard& other) {
-      if (vec_ == nullptr) { vec_ = vec; }
-      CHECK_EQ(vec, vec_);
-      device_ = other.device_;
-      cached_size_ = other.cached_size_;
-      start_ = other.start_;
-      proper_size_ = other.proper_size_;
-      SetDevice();
-      data_.resize(other.data_.size());
-      perm_d_ = other.perm_d_;
-    }
-
-    void ScatterFrom(const T* begin) {
-      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice(GPUAccess::kWrite);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), begin + start_,
-                                    data_.size() * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void GatherTo(thrust::device_ptr<T> begin) {
-      LazySyncDevice(GPUAccess::kRead);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(begin.get() + start_, data_.data().get(),
-                                    proper_size_ * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void Fill(T v) {
-      // TODO(canonizer): avoid full copy of host data
-      LazySyncDevice(GPUAccess::kWrite);
-      SetDevice();
-      thrust::fill(data_.begin(), data_.end(), v);
-    }
-
-    void Copy(DeviceShard* other) {
-      // TODO(canonizer): avoid full copy of host data for this (but not for other)
-      LazySyncDevice(GPUAccess::kWrite);
-      other->LazySyncDevice(GPUAccess::kRead);
-      SetDevice();
-      dh::safe_cuda(cudaMemcpyAsync(data_.data().get(), other->data_.data().get(),
-                                    data_.size() * sizeof(T), cudaMemcpyDefault));
-    }
-
-    void LazySyncHost(GPUAccess access) {
-      SetDevice();
-      dh::safe_cuda(cudaMemcpy(vec_->data_h_.data() + start_,
-                               data_.data().get(), proper_size_ * sizeof(T),
-                               cudaMemcpyDeviceToHost));
-      perm_d_.DenyComplementary(access);
-    }
-
-    void LazyResize(size_t new_size) {
-      if (new_size == cached_size_) { return; }
-      // resize is required
-      int ndevices = vec_->distribution_.devices_.Size();
-      int device_index = vec_->distribution_.devices_.Index(device_);
-      start_ = vec_->distribution_.ShardStart(new_size, device_index);
-      proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
-      // The size on this device.
-      size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
-      SetDevice();
-      data_.resize(size_d);
-      cached_size_ = new_size;
-    }
-
-    void LazySyncDevice(GPUAccess access) {
-      if (perm_d_.CanAccess(access)) { return; }
-      if (perm_d_.CanRead()) {
-        // deny read to the host
-        perm_d_.Grant(access);
-        std::lock_guard<std::mutex> lock(vec_->mutex_);
-        vec_->perm_h_.DenyComplementary(access);
-        return;
-      }
-      // data is on the host
-      size_t size_h = vec_->data_h_.size();
-      LazyResize(size_h);
-      SetDevice();
-      dh::safe_cuda(
-          cudaMemcpy(data_.data().get(), vec_->data_h_.data() + start_,
-                     data_.size() * sizeof(T), cudaMemcpyHostToDevice));
-      perm_d_.Grant(access);
-
-      std::lock_guard<std::mutex> lock(vec_->mutex_);
-      vec_->perm_h_.DenyComplementary(access);
-      vec_->size_d_ = size_h;
-    }
-
-    void SetDevice() {
-      if (cudaSetDeviceHandler == nullptr) {
-        dh::safe_cuda(cudaSetDevice(device_));
-      } else {
-        (*cudaSetDeviceHandler)(device_);
-      }
-    }
-
-    T* Raw() { return data_.data().get(); }
-    size_t Start() const { return start_; }
-    size_t DataSize() const { return data_.size(); }
-    Permissions& Perm() { return perm_d_; }
-    Permissions const& Perm() const { return perm_d_; }
-
-   private:
-    int device_;
-    dh::device_vector<T> data_;
-    // cached vector size
-    size_t cached_size_;
-    size_t start_;
-    // size of the portion to copy back to the host
-    size_t proper_size_;
-    Permissions perm_d_;
-    HostDeviceVectorImpl<T>* vec_;
-  };
-
-  HostDeviceVectorImpl(size_t size, T v, const GPUDistribution &distribution)
-    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
-    if (!distribution_.IsEmpty()) {
-      size_d_ = size;
-      InitShards();
-      Fill(v);
+      data_d_.resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -188,127 +55,81 @@
 
   // required, as a new std::mutex has to be created
   HostDeviceVectorImpl(const HostDeviceVectorImpl<T>& other)
-    : data_h_(other.data_h_), perm_h_(other.perm_h_), size_d_(other.size_d_),
-      distribution_(other.distribution_), mutex_() {
-    shards_.resize(other.shards_.size());
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Init(this, other.shards_.at(i));
-      });
+    : device_(other.device_), data_h_(other.data_h_), perm_h_(other.perm_h_), mutex_() {
+    if (device_ >= 0) {
+      SetDevice();
+      data_d_ = other.data_d_;
+    }
   }
 
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, const GPUDistribution &distribution)
-    : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
-    if (!distribution_.IsEmpty()) {
-      size_d_ = init.size();
-      InitShards();
+  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device), perm_h_(device < 0) {
+    if (device >= 0) {
+      LazyResizeDevice(init.size());
       Copy(init);
     } else {
       data_h_ = init;
     }
   }
 
-  void InitShards() {
-    int ndevices = distribution_.devices_.Size();
-    shards_.resize(ndevices);
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Init(this, distribution_.devices_.DeviceId(i));
-      });
+  ~HostDeviceVectorImpl() {
+    if (device_ >= 0) {
+      SetDevice();
+    }
   }
 
-  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : size_d_; }
+  size_t Size() const { return perm_h_.CanRead() ? data_h_.size() : data_d_.size(); }
 
-  GPUSet Devices() const { return distribution_.devices_; }
+  int DeviceIdx() const { return device_; }
 
-  const GPUDistribution& Distribution() const { return distribution_; }
-
-  T* DevicePointer(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kWrite);
-    return shards_.at(distribution_.devices_.Index(device)).Raw();
+  T* DevicePointer() {
+    LazySyncDevice(GPUAccess::kWrite);
+    return data_d_.data().get();
   }
 
-  const T* ConstDevicePointer(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).Raw();
+  const T* ConstDevicePointer() {
+    LazySyncDevice(GPUAccess::kRead);
+    return data_d_.data().get();
   }
 
-  common::Span<T> DeviceSpan(int device) {
-    GPUSet devices = distribution_.devices_;
-    CHECK(devices.Contains(device));
-    LazySyncDevice(device, GPUAccess::kWrite);
-    return {shards_.at(devices.Index(device)).Raw(),
-            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+  common::Span<T> DeviceSpan() {
+    LazySyncDevice(GPUAccess::kWrite);
+    return {data_d_.data().get(), static_cast<typename common::Span<T>::index_type>(DeviceSize())};
   }
 
-  common::Span<const T> ConstDeviceSpan(int device) {
-    GPUSet devices = distribution_.devices_;
-    CHECK(devices.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
+  common::Span<const T> ConstDeviceSpan() {
+    LazySyncDevice(GPUAccess::kRead);
     using SpanInd = typename common::Span<const T>::index_type;
-    return {shards_.at(devices.Index(device)).Raw(),
-            static_cast<SpanInd>(DeviceSize(device))};
+    return {data_d_.data().get(), static_cast<SpanInd>(DeviceSize())};
   }
 
-  size_t DeviceSize(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).DataSize();
+  size_t DeviceSize() {
+    LazySyncDevice(GPUAccess::kRead);
+    return data_d_.size();
  }
 
-  size_t DeviceStart(int device) {
-    CHECK(distribution_.devices_.Contains(device));
-    LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).Start();
+  thrust::device_ptr<T> tbegin() {  // NOLINT
+    return thrust::device_ptr<T>(DevicePointer());
   }
 
-  thrust::device_ptr<T> tbegin(int device) {  // NOLINT
-    return thrust::device_ptr<T>(DevicePointer(device));
+  thrust::device_ptr<const T> tcbegin() {  // NOLINT
+    return thrust::device_ptr<const T>(ConstDevicePointer());
   }
 
-  thrust::device_ptr<const T> tcbegin(int device) {  // NOLINT
-    return thrust::device_ptr<const T>(ConstDevicePointer(device));
+  thrust::device_ptr<T> tend() {  // NOLINT
+    return tbegin() + DeviceSize();
   }
 
-  thrust::device_ptr<T> tend(int device) {  // NOLINT
-    return tbegin(device) + DeviceSize(device);
-  }
-
-  thrust::device_ptr<const T> tcend(int device) {  // NOLINT
-    return tcbegin(device) + DeviceSize(device);
-  }
-
-  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
-    CHECK_EQ(end - begin, Size());
-    if (perm_h_.CanWrite()) {
-      dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
-                               (end - begin) * sizeof(T),
-                               cudaMemcpyDeviceToHost));
-    } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(begin.get());
-        });
-    }
-  }
-
-  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
-    CHECK_EQ(end - begin, Size());
-    if (perm_h_.CanWrite()) {
-      dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
-                               data_h_.size() * sizeof(T),
-                               cudaMemcpyHostToDevice));
-    } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.GatherTo(begin); });
-    }
+  thrust::device_ptr<const T> tcend() {  // NOLINT
+    return tcbegin() + DeviceSize();
   }
 
   void Fill(T v) {  // NOLINT
     if (perm_h_.CanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { shard.Fill(v); });
+      DeviceFill(v);
     }
   }
 
@@ -320,14 +141,10 @@
       return;
     }
     // Data is on device;
-    if (distribution_ != other->distribution_) {
-      distribution_ = GPUDistribution();
-      Shard(other->Distribution());
-      size_d_ = other->size_d_;
+    if (device_ != other->device_) {
+      SetDevice(other->device_);
     }
-    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-        shard.Copy(&other->shards_.at(i));
-      });
+    DeviceCopy(other);
   }
 
   void Copy(const std::vector<T>& other) {
@@ -335,9 +152,7 @@
     if (perm_h_.CanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(other.data());
-        });
+      DeviceCopy(other.data());
     }
   }
 
@@ -346,9 +161,7 @@
     if (perm_h_.CanWrite()) {
       std::copy(other.begin(), other.end(), data_h_.begin());
     } else {
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.ScatterFrom(other.begin());
-        });
+      DeviceCopy(other.begin());
     }
   }
 
@@ -362,40 +175,23 @@
     return data_h_;
   }
 
-  void Shard(const GPUDistribution& distribution) {
-    if (distribution_ == distribution) { return; }
-    CHECK(distribution_.IsEmpty())
-        << "Data resides on different GPUs: " << "ID: "
-        << *(distribution_.Devices().begin()) << " and ID: "
-        << *(distribution.Devices().begin());
-    distribution_ = distribution;
-    InitShards();
-  }
-
-  void Shard(GPUSet new_devices) {
-    if (distribution_.Devices() == new_devices) { return; }
-    Shard(GPUDistribution::Block(new_devices));
-  }
-
-  void Reshard(const GPUDistribution &distribution) {
-    if (distribution_ == distribution) { return; }
-    LazySyncHost(GPUAccess::kWrite);
-    distribution_ = distribution;
-    shards_.clear();
-    InitShards();
+  void SetDevice(int device) {
+    if (device_ == device) { return; }
+    if (device_ >= 0) {
+      LazySyncHost(GPUAccess::kWrite);
+    }
+    device_ = device;
+    if (device_ >= 0) {
+      LazyResizeDevice(data_h_.size());
+    }
   }
 
   void Resize(size_t new_size, T v) {
     if (new_size == Size()) { return; }
-    if (distribution_.IsFixedSize()) {
-      CHECK_EQ(new_size, distribution_.offsets_.back());
-    }
-    if (Size() == 0 && !distribution_.IsEmpty()) {
+    if (Size() == 0 && device_ >= 0) {
       // fast on-device resize
       perm_h_ = Permissions(false);
-      size_d_ = new_size;
-      InitShards();
-      Fill(v);
+      data_d_.resize(new_size, v);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kWrite);
@@ -407,72 +203,110 @@
     if (perm_h_.CanAccess(access)) { return; }
     if (perm_h_.CanRead()) {
       // data is present, just need to deny access to the device
-      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-          shard.Perm().DenyComplementary(access);
-        });
       perm_h_.Grant(access);
       return;
     }
     std::lock_guard<std::mutex> lock(mutex_);
-    if (data_h_.size() != size_d_) { data_h_.resize(size_d_); }
-    dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-        shard.LazySyncHost(access);
-      });
+    if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
+    SetDevice();
+    dh::safe_cuda(cudaMemcpy(data_h_.data(),
+                             data_d_.data().get(),
+                             data_d_.size() * sizeof(T),
+                             cudaMemcpyDeviceToHost));
     perm_h_.Grant(access);
   }
 
-  void LazySyncDevice(int device, GPUAccess access) {
-    GPUSet devices = distribution_.Devices();
-    CHECK(devices.Contains(device));
-    shards_.at(devices.Index(device)).LazySyncDevice(access);
+  void LazySyncDevice(GPUAccess access) {
+    if (DevicePerm().CanAccess(access)) { return; }
+    if (DevicePerm().CanRead()) {
+      // deny read to the host
+      std::lock_guard<std::mutex> lock(mutex_);
+      perm_h_.DenyComplementary(access);
+      return;
+    }
+    // data is on the host
+    LazyResizeDevice(data_h_.size());
+    SetDevice();
+    dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
+                             data_h_.data(),
+                             data_d_.size() * sizeof(T),
+                             cudaMemcpyHostToDevice));
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    perm_h_.DenyComplementary(access);
   }
 
   bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
-
-  bool DeviceCanAccess(int device, GPUAccess access) {
-    GPUSet devices = distribution_.Devices();
-    if (!devices.Contains(device)) { return false; }
-    return shards_.at(devices.Index(device)).Perm().CanAccess(access);
-  }
+  bool DeviceCanAccess(GPUAccess access) { return DevicePerm().CanAccess(access); }
 
  private:
-  std::vector<T> data_h_;
-  Permissions perm_h_;
-  // the total size of the data stored on the devices
-  size_t size_d_;
-  GPUDistribution distribution_;
+  int device_{-1};
+  std::vector<T> data_h_{};
+  dh::device_vector<T> data_d_{};
+  Permissions perm_h_{false};
   // protects size_d_ and perm_h_ when updated from multiple threads
-  std::mutex mutex_;
-  std::vector<DeviceShard> shards_;
+  std::mutex mutex_{};
+
+  void DeviceFill(T v) {
+    // TODO(canonizer): avoid full copy of host data
+    LazySyncDevice(GPUAccess::kWrite);
+    SetDevice();
+    thrust::fill(data_d_.begin(), data_d_.end(), v);
+  }
+
+  void DeviceCopy(HostDeviceVectorImpl* other) {
+    // TODO(canonizer): avoid full copy of host data for this (but not for other)
+    LazySyncDevice(GPUAccess::kWrite);
+    other->LazySyncDevice(GPUAccess::kRead);
+    SetDevice();
+    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
+                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
+  }
+
+  void DeviceCopy(const T* begin) {
+    // TODO(canonizer): avoid full copy of host data
+    LazySyncDevice(GPUAccess::kWrite);
+    SetDevice();
+    dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
+                                  data_d_.size() * sizeof(T), cudaMemcpyDefault));
+  }
+
+  void LazyResizeDevice(size_t new_size) {
+    if (new_size == data_d_.size()) { return; }
+    SetDevice();
+    data_d_.resize(new_size);
+  }
+
+  void SetDevice() {
+    CHECK_GE(device_, 0);
+    if (cudaSetDeviceHandler == nullptr) {
+      dh::safe_cuda(cudaSetDevice(device_));
+    } else {
+      (*cudaSetDeviceHandler)(device_);
+    }
+  }
+
+  Permissions DevicePerm() const { return perm_h_.Complementary(); }
 };
 
-template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(size_t size, T v, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(size, v, distribution);
-}
+template <typename T>
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
+  : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(std::initializer_list<T> init, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
-}
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+  : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector
-(const std::vector<T>& init, const GPUDistribution &distribution) : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(init, distribution);
-}
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+  : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
 HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
-  : impl_(nullptr) {
-  impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
-}
+  : impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
 
 template <typename T>
-HostDeviceVector<T>& HostDeviceVector<T>::operator=
-(const HostDeviceVector<T>& other) {
+HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
   if (this == &other) { return *this; }
 
   std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
@@ -491,73 +325,51 @@
 template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
 
 template <typename T>
-GPUSet HostDeviceVector<T>::Devices() const { return impl_->Devices(); }
+int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
 
 template <typename T>
-const GPUDistribution& HostDeviceVector<T>::Distribution() const {
-  return impl_->Distribution();
+T* HostDeviceVector<T>::DevicePointer() {
+  return impl_->DevicePointer();
 }
 
 template <typename T>
-T* HostDeviceVector<T>::DevicePointer(int device) {
-  return impl_->DevicePointer(device);
+const T* HostDeviceVector<T>::ConstDevicePointer() const {
+  return impl_->ConstDevicePointer();
 }
 
 template <typename T>
-const T* HostDeviceVector<T>::ConstDevicePointer(int device) const {
-  return impl_->ConstDevicePointer(device);
+common::Span<T> HostDeviceVector<T>::DeviceSpan() {
+  return impl_->DeviceSpan();
 }
 
 template <typename T>
-common::Span<T> HostDeviceVector<T>::DeviceSpan(int device) {
-  return impl_->DeviceSpan(device);
+common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
+  return impl_->ConstDeviceSpan();
 }
 
 template <typename T>
-common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
-  return impl_->ConstDeviceSpan(device);
+size_t HostDeviceVector<T>::DeviceSize() const {
+  return impl_->DeviceSize();
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceStart(int device) const {
-  return impl_->DeviceStart(device);
+thrust::device_ptr<T> HostDeviceVector<T>::tbegin() {  // NOLINT
+  return impl_->tbegin();
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::DeviceSize(int device) const {
-  return impl_->DeviceSize(device);
+thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const {  // NOLINT
+  return impl_->tcbegin();
 }
 
 template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tbegin(int device) {  // NOLINT
-  return impl_->tbegin(device);
+thrust::device_ptr<T> HostDeviceVector<T>::tend() {  // NOLINT
+  return impl_->tend();
 }
 
 template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin(int device) const {  // NOLINT
-  return impl_->tcbegin(device);
-}
-
-template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tend(int device) {  // NOLINT
-  return impl_->tend(device);
-}
-
-template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcend(int device) const {  // NOLINT
-  return impl_->tcend(device);
-}
-
-template <typename T>
-void HostDeviceVector<T>::ScatterFrom
-(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end) {
-  impl_->ScatterFrom(begin, end);
-}
-
-template <typename T>
-void HostDeviceVector<T>::GatherTo
-(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const {
-  impl_->GatherTo(begin, end);
+thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const {  // NOLINT
+  return impl_->tcend();
 }
 
 template <typename T>
@@ -594,23 +406,13 @@ bool HostDeviceVector<T>::HostCanAccess(GPUAccess access) const {
 }
 
 template <typename T>
-bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
-  return impl_->DeviceCanAccess(device, access);
+bool HostDeviceVector<T>::DeviceCanAccess(GPUAccess access) const {
+  return impl_->DeviceCanAccess(access);
 }
 
 template <typename T>
-void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
-  impl_->Shard(new_devices);
-}
-
-template <typename T>
-void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
-  impl_->Shard(distribution);
-}
-
-template <typename T>
-void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
-  impl_->Reshard(distribution);
+void HostDeviceVector<T>::SetDevice(int device) const {
+  impl_->SetDevice(device);
 }
 
 template <typename T>
diff --git a/src/common/host_device_vector.h b/src/common/host_device_vector.h
index 0ffe75f5e..abaa5dd59 100644
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -79,113 +79,6 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
 
 template <typename T> struct HostDeviceVectorImpl;
 
-// Distribution for the HostDeviceVector; it specifies such aspects as the
-// devices it is distributed on, whether there are copies of elements from
-// other GPUs as well as the granularity of splitting. It may also specify
-// explicit boundaries for devices, in which case the size of the array cannot
-// be changed.
-class GPUDistribution {
-  template <typename T> friend struct HostDeviceVectorImpl;
-
- public:
-  explicit GPUDistribution(GPUSet devices = GPUSet::Empty())
-    : devices_(devices), granularity_(1), overlap_(0) {}
-
- private:
-  GPUDistribution(GPUSet devices, int granularity, int overlap,
-                  std::vector<size_t> &&offsets)
-    : devices_(devices), granularity_(granularity), overlap_(overlap),
-      offsets_(std::move(offsets)) {}
-
- public:
-  static GPUDistribution Empty() { return GPUDistribution(); }
-
-  static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
-
-  static GPUDistribution Overlap(GPUSet devices, int overlap) {
-    return GPUDistribution(devices, 1, overlap, std::vector<size_t>());
-  }
-
-  static GPUDistribution Granular(GPUSet devices, int granularity) {
-    return GPUDistribution(devices, granularity, 0, std::vector<size_t>());
-  }
-
-  // NOTE(rongou): Explicit offsets don't necessarily cover the whole vector. Sections before the
-  // first shard or after the last shard may be on host only. This windowing is done in the GPU
-  // predictor for external memory support.
-  static GPUDistribution Explicit(GPUSet devices, std::vector<size_t> offsets) {
-    return GPUDistribution(devices, 1, 0, std::move(offsets));
-  }
-
-  friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    bool const res = a.devices_ == b.devices_ &&
-                     a.granularity_ == b.granularity_ &&
-                     a.overlap_ == b.overlap_ &&
-                     a.offsets_ == b.offsets_;
-    return res;
-  }
-
-  friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {
-    return !(a == b);
-  }
-
-  GPUSet Devices() const { return devices_; }
-
-  bool IsEmpty() const { return devices_.IsEmpty(); }
-
-  size_t ShardStart(size_t size, int index) const {
-    if (size == 0) { return 0; }
-    if (offsets_.size() > 0) {
-      // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
-      return offsets_.at(index);
-    }
-    // no explicit offsets
-    size_t begin = std::min(index * Portion(size), size);
-    begin = begin > size ? size : begin;
-    return begin;
-  }
-
-  size_t ShardSize(size_t size, size_t index) const {
-    if (size == 0) { return 0; }
-    if (offsets_.size() > 0) {
-      // explicit offsets are provided
-      CHECK_EQ(offsets_.back(), size);
-      return offsets_.at(index + 1) - offsets_.at(index) +
-        (index == devices_.Size() - 1 ? overlap_ : 0);
-    }
-    size_t portion = Portion(size);
-    size_t begin = std::min(index * portion, size);
-    size_t end = std::min((index + 1) * portion + overlap_ * granularity_, size);
-    return end - begin;
-  }
-
-  size_t ShardProperSize(size_t size, size_t index) const {
-    if (size == 0) { return 0; }
-    return ShardSize(size, index) - (devices_.Size() - 1 > index ? overlap_ : 0);
-  }
-
-  bool IsFixedSize() const { return !offsets_.empty(); }
-
- private:
-  static size_t DivRoundUp(size_t a, size_t b) { return (a + b - 1) / b; }
-  static size_t RoundUp(size_t a, size_t b) { return DivRoundUp(a, b) * b; }
-
-  size_t Portion(size_t size) const {
-    return RoundUp
-      (DivRoundUp
-       (std::max(static_cast<int64_t>(size - overlap_ * granularity_),
-                 static_cast<int64_t>(1)),
-        devices_.Size()), granularity_);
-  }
-
-  GPUSet devices_;
-  int granularity_;
-  int overlap_;
-  // explicit offsets for the GPU parts, if any
-  std::vector<size_t> offsets_;
-};
-
 enum GPUAccess {
   kNone, kRead,
   // write implies read
   kWrite
 };
 
 inline GPUAccess operator-(GPUAccess a, GPUAccess b) {
@@ -199,46 +92,38 @@
 template <typename T>
 class HostDeviceVector {
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(),
-                            const GPUDistribution &distribution = GPUDistribution());
-  HostDeviceVector(std::initializer_list<T> init,
-                   const GPUDistribution &distribution = GPUDistribution());
-  explicit HostDeviceVector(const std::vector<T>& init,
-                            const GPUDistribution &distribution = GPUDistribution());
+  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
+  HostDeviceVector(std::initializer_list<T> init, int device = -1);
+  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
   ~HostDeviceVector();
   HostDeviceVector(const HostDeviceVector<T>&);
   HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
   size_t Size() const;
-  GPUSet Devices() const;
-  const GPUDistribution& Distribution() const;
-  common::Span<T> DeviceSpan(int device);
-  common::Span<const T> ConstDeviceSpan(int device) const;
-  common::Span<const T> DeviceSpan(int device) const { return ConstDeviceSpan(device); }
-  T* DevicePointer(int device);
-  const T* ConstDevicePointer(int device) const;
-  const T* DevicePointer(int device) const { return ConstDevicePointer(device); }
+  int DeviceIdx() const;
+  common::Span<T> DeviceSpan();
+  common::Span<const T> ConstDeviceSpan() const;
+  common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
+  T* DevicePointer();
+  const T* ConstDevicePointer() const;
+  const T* DevicePointer() const { return ConstDevicePointer(); }
   T* HostPointer() { return HostVector().data(); }
   const T* ConstHostPointer() const { return ConstHostVector().data(); }
   const T* HostPointer() const { return ConstHostPointer(); }
 
-  size_t DeviceStart(int device) const;
-  size_t DeviceSize(int device) const;
+  size_t DeviceSize() const;
 
   // only define functions returning device_ptr
   // if HostDeviceVector.h is included from a .cu file
 #ifdef __CUDACC__
-  thrust::device_ptr<T> tbegin(int device);  // NOLINT
-  thrust::device_ptr<T> tend(int device);  // NOLINT
-  thrust::device_ptr<const T> tcbegin(int device) const;  // NOLINT
-  thrust::device_ptr<const T> tcend(int device) const;  // NOLINT
-  thrust::device_ptr<const T> tbegin(int device) const {  // NOLINT
-    return tcbegin(device);
+  thrust::device_ptr<T> tbegin();  // NOLINT
+  thrust::device_ptr<T> tend();  // NOLINT
+  thrust::device_ptr<const T> tcbegin() const;  // NOLINT
+  thrust::device_ptr<const T> tcend() const;  // NOLINT
+  thrust::device_ptr<const T> tbegin() const {  // NOLINT
+    return tcbegin();
   }
-  thrust::device_ptr<const T> tend(int device) const { return tcend(device); }  // NOLINT
-
-  void ScatterFrom(thrust::device_ptr<const T> begin, thrust::device_ptr<const T> end);
-  void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) const;
+  thrust::device_ptr<const T> tend() const { return tcend(); }  // NOLINT
 #endif  // __CUDACC__
 
   void Fill(T v);
@@ -251,18 +136,9 @@
   const std::vector<T>& HostVector() const { return ConstHostVector(); }
 
   bool HostCanAccess(GPUAccess access) const;
-  bool DeviceCanAccess(int device, GPUAccess access) const;
+  bool DeviceCanAccess(GPUAccess access) const;
 
-  /*!
-   * \brief Specify memory distribution.
-   */
-  void Shard(const GPUDistribution &distribution) const;
-  void Shard(GPUSet devices) const;
-
-  /*!
-   * \brief Change memory distribution.
-   */
-  void Reshard(const GPUDistribution &distribution);
+  void SetDevice(int device) const;
 
   void Resize(size_t new_size, T v = T());
diff --git a/src/common/transform.h b/src/common/transform.h
index b1bc55322..205dbf575 100644
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -57,14 +57,10 @@ class Transform {
   template <typename Functor>
   struct Evaluator {
    public:
-    Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
+    Evaluator(Functor func, Range range, int device, bool shard) :
         func_(func), range_{std::move(range)}, shard_{shard},
-        distribution_{GPUDistribution::Block(devices)} {}
-    Evaluator(Functor func, Range range, GPUDistribution dist,
-              bool shard) :
-        func_(func), range_{std::move(range)}, shard_{shard},
-        distribution_{std::move(dist)} {}
+        device_{device} {}
 
     /*!
      * \brief Evaluate the functor with input pointers to HostDeviceVector.
      *
@@ -74,7 +70,7 @@
      */
     template <typename... HDV>
     void Eval(HDV... vectors) const {
-      bool on_device = !distribution_.IsEmpty();
+      bool on_device = device_ >= 0;
 
       if (on_device) {
         LaunchCUDA(func_, vectors...);
@@ -86,13 +82,13 @@
    private:
     // CUDA UnpackHDV
     template <typename T>
-    Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->DeviceSpan(_device);
+    Span<T> UnpackHDVOnDevice(HostDeviceVector<T>* _vec) const {
+      auto span = _vec->DeviceSpan();
       return span;
     }
     template <typename T>
-    Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
-      auto span = _vec->ConstDeviceSpan(_device);
+    Span<T const> UnpackHDVOnDevice(const HostDeviceVector<T>* _vec) const {
+      auto span = _vec->ConstDeviceSpan();
       return span;
     }
     // CPU UnpackHDV
@@ -108,15 +104,15 @@
     }
     // Recursive unpack for Shard.
     template <typename T>
-    void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
-      vector->Shard(dist);
+    void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
+      vector->SetDevice(device);
     }
     template <typename Head, typename... Rest>
-    void UnpackShard(GPUDistribution dist,
+    void UnpackShard(int device,
                      const HostDeviceVector<Head> *_vector,
                      const HostDeviceVector<Rest> *... _vectors) const {
-      _vector->Shard(dist);
-      UnpackShard(dist, _vectors...);
+      _vector->SetDevice(device);
+      UnpackShard(device, _vectors...);
     }
 
 #if defined(__CUDACC__)
@@ -124,28 +120,20 @@
              typename... HDV>
     void LaunchCUDA(Functor _func, HDV*... _vectors) const {
       if (shard_)
-        UnpackShard(distribution_, _vectors...);
+        UnpackShard(device_, _vectors...);
 
-      GPUSet devices = distribution_.Devices();
       size_t range_size = *range_.end() - *range_.begin();
 
       // Extract index to deal with possible old OpenMP.
-      size_t device_beg = *(devices.begin());
-      size_t device_end = *(devices.end());
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (omp_ulong device = device_beg; device < device_end; ++device) {  // NOLINT
-        // Ignore other attributes of GPUDistribution for spliting index.
-        // This deals with situation like multi-class setting where
-        // granularity is used in data vector.
-        size_t shard_size = GPUDistribution::Block(devices).ShardSize(
-            range_size, devices.Index(device));
-        Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-        dh::safe_cuda(cudaSetDevice(device));
-        const int GRID_SIZE =
-            static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
-        detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
-            _func, shard_range, UnpackHDV(_vectors, device)...);
-      }
+      // This deals with situation like multi-class setting where
+      // granularity is used in data vector.
+      size_t shard_size = range_size;
+      Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
+      dh::safe_cuda(cudaSetDevice(device_));
+      const int GRID_SIZE =
+          static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
+      detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
+          _func, shard_range, UnpackHDVOnDevice(_vectors)...);
     }
 #else
     /*! \brief Dummy funtion defined when compiling for CPU.  */
@@ -172,7 +160,7 @@
     Range range_;
     /*! \brief Whether sharding for vectors is required. */
     bool shard_;
-    GPUDistribution distribution_;
+    int device_;
   };
 
  public:
@@ -191,15 +179,9 @@
    */
   template <typename Functor>
   static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUSet const devices,
+                                 int device,
                                  bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
-  }
-  template <typename Functor>
-  static Evaluator<Functor> Init(Functor func, Range const range,
-                                 GPUDistribution const dist,
-                                 bool const shard = true) {
-    return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
+    return Evaluator<Functor> {func, std::move(range), device, shard};
   }
 };
diff --git a/src/data/data.cu b/src/data/data.cu
index d991739f6..27c7d7166 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -78,9 +78,9 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
   }
-  dst->Reshard(GPUDistribution(GPUSet::Range(ptr_device, 1)));
+  dst->SetDevice(ptr_device);
   dst->Resize(length);
-  auto p_dst = thrust::device_pointer_cast(dst->DevicePointer(0));
+  auto p_dst = thrust::device_pointer_cast(dst->DevicePointer());
   thrust::copy(p_src, p_src + length, p_dst);
 }
 }  // namespace xgboost
diff --git a/src/data/simple_csr_source.cu b/src/data/simple_csr_source.cu
index af0cf505c..cd8f30309 100644
--- a/src/data/simple_csr_source.cu
+++ b/src/data/simple_csr_source.cu
@@ -77,16 +77,14 @@ void SimpleCSRSource::FromDeviceColumnar(std::vector<Columnar> cols) {
 
   dh::safe_cuda(cudaSetDevice(device));
 
-  GPUSet devices = GPUSet::Range(device, 1);
-
-  page_.offset.Reshard(GPUDistribution(devices));
+  page_.offset.SetDevice(device);
   page_.offset.Resize(info.num_row_ + 1);
 
-  page_.data.Reshard(GPUDistribution(devices));
+  page_.data.SetDevice(device);
   page_.data.Resize(info.num_nonzero_);
 
-  auto s_data = page_.data.DeviceSpan(device);
-  auto s_offsets = page_.offset.DeviceSpan(device);
+  auto s_data = page_.data.DeviceSpan();
+  auto s_offsets = page_.offset.DeviceSpan();
   CHECK_EQ(s_offsets.size(), n_rows + 1);
 
   int32_t constexpr kThreads = 256;
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 43048061e..8d790e9e4 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -182,9 +182,9 @@ void GBTree::DoBoost(DMatrix* p_fmat,
     CHECK_EQ(in_gpair->Size() % ngroup, 0U)
         << "must have exactly ngroup*nrow gpairs";
     // TODO(canonizer): perform this on GPU if HostDeviceVector has device set.
-    HostDeviceVector<GradientPair> tmp
-        (in_gpair->Size() / ngroup, GradientPair(),
-         GPUDistribution::Block(in_gpair->Distribution().Devices()));
+    HostDeviceVector<GradientPair> tmp(in_gpair->Size() / ngroup,
+                                       GradientPair(),
+                                       in_gpair->DeviceIdx());
     const auto& gpair_h = in_gpair->ConstHostVector();
     auto nsize = static_cast<bst_omp_uint>(tmp.Size());
     for (int gid = 0; gid < ngroup; ++gid) {
diff --git a/src/learner.cc b/src/learner.cc
index 0eb175f3d..b6cf889c7 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -237,14 +237,13 @@ class LearnerImpl : public Learner {
     std::vector<std::pair<std::string, std::string> > attr;
     fi->Read(&attr);
     for (auto& kv : attr) {
-      // Load `predictor`, `n_gpus`, `gpu_id` parameters from extra attributes
+      // Load `predictor`, `gpu_id` parameters from extra attributes
       const std::string prefix = "SAVED_PARAM_";
       if (kv.first.find(prefix) == 0) {
         const std::string saved_param = kv.first.substr(prefix.length());
         bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor";
 #ifdef XGBOOST_USE_CUDA
-        if (saved_param == "predictor" || saved_param == "n_gpus"
-            || saved_param == "gpu_id") {
+        if (saved_param == "predictor" || saved_param == "gpu_id") {
           cfg_[saved_param] = kv.second;
           LOG(INFO)
               << "Parameter '" << saved_param << "' has been recovered from "
@@ -266,7 +265,7 @@ class LearnerImpl : public Learner {
       }
 #endif  // XGBOOST_USE_CUDA
       // NO visible GPU in current environment
-      if (is_gpu_predictor && GPUSet::AllVisible().Size() == 0) {
+      if (is_gpu_predictor && common::AllVisibleGPUs() == 0) {
         cfg_["predictor"] = "cpu_predictor";
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
@@ -294,7 +293,9 @@ class LearnerImpl : public Learner {
     auto n = tparam_.__DICT__();
     cfg_.insert(n.cbegin(), n.cend());
 
-    gbm_->Configure({cfg_.cbegin(), cfg_.cend()});
+    Args args = {cfg_.cbegin(), cfg_.cend()};
+    generic_param_.InitAllowUnknown(args);
+    gbm_->Configure(args);
 
     obj_->Configure({cfg_.begin(), cfg_.end()});
     for (auto& p_metric : metrics_) {
@@ -331,9 +332,8 @@ class LearnerImpl : public Learner {
       }
     }
     {
-      // Write `predictor`, `n_gpus`, `gpu_id` parameters as extra attributes
-      for (const auto& key : std::vector<std::string>{
-          "predictor", "n_gpus", "gpu_id"}) {
+      // Write `predictor`, `gpu_id` parameters as extra attributes
+      for (const auto& key : std::vector<std::string>{"predictor", "gpu_id"}) {
         auto it = cfg_.find(key);
         if (it != cfg_.end()) {
           mparam.contain_extra_attrs = 1;
@@ -581,13 +581,8 @@ class LearnerImpl : public Learner {
     gbm_->Configure(args);
 
     if (this->gbm_->UseGPU()) {
-      if (cfg_.find("n_gpus") == cfg_.cend()) {
-        generic_param_.n_gpus = 1;
-      }
-      if (generic_param_.n_gpus != 1) {
-        LOG(FATAL) << "Single process multi-GPU training is no longer supported. "
-                      "Please switch to distributed GPU training with one process per GPU. "
" - "This can be done using Dask or Spark."; + if (cfg_.find("gpu_id") == cfg_.cend()) { + generic_param_.gpu_id = 0; } } } diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index fad98f31f..74bbd8e25 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -19,12 +19,6 @@ namespace linear { DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); -void RescaleIndices(int device_idx, size_t ridx_begin, - common::Span data) { - dh::LaunchN(device_idx, data.size(), - [=] __device__(size_t idx) { data[idx].index -= ridx_begin; }); -} - class DeviceShard { int device_id_; dh::BulkAllocator ba_; @@ -32,18 +26,16 @@ class DeviceShard { common::Span data_; common::Span gpair_; dh::CubMemory temp_; - size_t ridx_begin_; - size_t ridx_end_; + size_t shard_size_; public: DeviceShard(int device_id, const SparsePage &batch, // column batch - bst_uint row_begin, bst_uint row_end, + bst_uint shard_size, const LinearTrainParam ¶m, const gbm::GBLinearModelParam &model_param) : device_id_(device_id), - ridx_begin_(row_begin), - ridx_end_(row_end) { + shard_size_(shard_size) { if ( IsEmpty() ) { return; } dh::safe_cuda(cudaSetDevice(device_id_)); // The begin and end indices for the section of each column associated with @@ -51,25 +43,25 @@ class DeviceShard { std::vector> column_segments; row_ptr_ = {0}; // iterate through columns - for (auto fidx = 0; fidx < batch.Size(); fidx++) { + for (size_t fidx = 0; fidx < batch.Size(); fidx++) { common::Span col = batch[fidx]; auto cmp = [](Entry e1, Entry e2) { return e1.index < e2.index; }; auto column_begin = std::lower_bound(col.cbegin(), col.cend(), - xgboost::Entry(row_begin, 0.0f), cmp); + xgboost::Entry(0, 0.0f), cmp); auto column_end = std::lower_bound(col.cbegin(), col.cend(), - xgboost::Entry(row_end, 0.0f), cmp); + xgboost::Entry(shard_size_, 0.0f), cmp); column_segments.emplace_back( std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin())); row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin)); } ba_.Allocate(device_id_, &data_, row_ptr_.back(), &gpair_, - (row_end - row_begin) * model_param.num_output_group); + shard_size_ * model_param.num_output_group); - for (int fidx = 0; fidx < batch.Size(); fidx++) { + for (size_t fidx = 0; fidx < batch.Size(); fidx++) { auto col = batch[fidx]; auto seg = column_segments[fidx]; dh::safe_cuda(cudaMemcpy( @@ -77,23 +69,21 @@ class DeviceShard { col.data() + seg.first, sizeof(Entry) * (seg.second - seg.first), cudaMemcpyHostToDevice)); } - // Rescale indices with respect to current shard - RescaleIndices(device_id_, ridx_begin_, data_); } - ~DeviceShard() { + ~DeviceShard() { // NOLINT dh::safe_cuda(cudaSetDevice(device_id_)); } bool IsEmpty() { - return (ridx_end_ - ridx_begin_) == 0; + return shard_size_ == 0; } void UpdateGpair(const std::vector &host_gpair, const gbm::GBLinearModelParam &model_param) { dh::safe_cuda(cudaMemcpyAsync( gpair_.data(), - host_gpair.data() + ridx_begin_ * model_param.num_output_group, + host_gpair.data(), gpair_.size() * sizeof(GradientPair), cudaMemcpyHostToDevice)); } @@ -107,13 +97,13 @@ class DeviceShard { counting, f); auto perm = thrust::make_permutation_iterator(gpair_.data(), skip); - return dh::SumReduction(temp_, perm, ridx_end_ - ridx_begin_); + return dh::SumReduction(temp_, perm, shard_size_); } void UpdateBiasResidual(float dbias, int group_idx, int num_groups) { if (dbias == 0.0f) return; auto d_gpair = gpair_; - dh::LaunchN(device_id_, ridx_end_ - ridx_begin_, [=] 
__device__(size_t idx) { + dh::LaunchN(device_id_, shard_size_, [=] __device__(size_t idx) { auto &g = d_gpair[idx * num_groups + group_idx]; g += GradientPair(g.GetHess() * dbias, 0); }); @@ -154,7 +144,7 @@ class DeviceShard { * \brief Coordinate descent algorithm that updates one feature per iteration */ -class GPUCoordinateUpdater : public LinearUpdater { +class GPUCoordinateUpdater : public LinearUpdater { // NOLINT public: // set training parameter void Configure(Args const& args) override { @@ -165,37 +155,23 @@ class GPUCoordinateUpdater : public LinearUpdater { void LazyInitShards(DMatrix *p_fmat, const gbm::GBLinearModelParam &model_param) { - if (!shards_.empty()) return; + if (shard_) return; - dist_ = GPUDistribution::Block(GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, - p_fmat->Info().num_row_)); - auto devices = dist_.Devices(); + device_ = learner_param_->gpu_id; - size_t n_devices = static_cast(devices.Size()); - size_t row_begin = 0; - size_t num_row = static_cast(p_fmat->Info().num_row_); + auto num_row = static_cast(p_fmat->Info().num_row_); // Partition input matrix into row segments std::vector row_segments; row_segments.push_back(0); - for (int d_idx = 0; d_idx < n_devices; ++d_idx) { - size_t shard_size = dist_.ShardSize(num_row, d_idx); - size_t row_end = row_begin + shard_size; - row_segments.push_back(row_end); - row_begin = row_end; - } + size_t shard_size = num_row; + row_segments.push_back(shard_size); CHECK(p_fmat->SingleColBlock()); SparsePage const& batch = *(p_fmat->GetBatches().begin()); - shards_.resize(n_devices); - // Create device shards - dh::ExecuteIndexShards(&shards_, - [&](int i, std::unique_ptr& shard) { - shard = std::unique_ptr( - new DeviceShard(devices.DeviceId(i), batch, row_segments[i], - row_segments[i + 1], tparam_, model_param)); - }); + // Create device shard + shard_.reset(new DeviceShard(device_, batch, shard_size, tparam_, model_param)); } void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, @@ -208,11 +184,9 @@ class GPUCoordinateUpdater : public LinearUpdater { monitor_.Start("UpdateGpair"); auto &in_gpair_host = in_gpair->ConstHostVector(); // Update gpair - dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { - if (!shard->IsEmpty()) { - shard->UpdateGpair(in_gpair_host, model->param); - } - }); + if (shard_) { + shard_->UpdateGpair(in_gpair_host, model->param); + } monitor_.Stop("UpdateGpair"); monitor_.Start("UpdateBias"); @@ -237,32 +211,21 @@ class GPUCoordinateUpdater : public LinearUpdater { } void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) { - for (int group_idx = 0; group_idx < model->param.num_output_group; - ++group_idx) { + for (int group_idx = 0; group_idx < model->param.num_output_group; ++group_idx) { // Get gradient - auto grad = dh::ReduceShards( - &shards_, [&](std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - GradientPair result = - shard->GetBiasGradient(group_idx, - model->param.num_output_group); - return result; - } - return GradientPair(0, 0); - }); - + auto grad = GradientPair(0, 0); + if (shard_) { + grad = shard_->GetBiasGradient(group_idx, model->param.num_output_group); + } auto dbias = static_cast( tparam_.learning_rate * - CoordinateDeltaBias(grad.GetGrad(), grad.GetHess())); + CoordinateDeltaBias(grad.GetGrad(), grad.GetHess())); model->bias()[group_idx] += dbias; // Update residual - dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr& shard) { - if (!shard->IsEmpty()) { - shard->UpdateBiasResidual(dbias, group_idx, - 
model->param.num_output_group); - } - }); + if (shard_) { + shard_->UpdateBiasResidual(dbias, group_idx, model->param.num_output_group); + } } } @@ -271,38 +234,30 @@ class GPUCoordinateUpdater : public LinearUpdater { gbm::GBLinearModel *model) { bst_float &w = (*model)[fidx][group_idx]; // Get gradient - auto grad = dh::ReduceShards( - &shards_, [&](std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - return shard->GetGradient(group_idx, model->param.num_output_group, - fidx); - } - return GradientPair(0, 0); - }); - + auto grad = GradientPair(0, 0); + if (shard_) { + grad = shard_->GetGradient(group_idx, model->param.num_output_group, fidx); + } auto dw = static_cast(tparam_.learning_rate * CoordinateDelta(grad.GetGrad(), grad.GetHess(), w, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm)); w += dw; - dh::ExecuteIndexShards(&shards_, [&](int idx, - std::unique_ptr &shard) { - if (!shard->IsEmpty()) { - shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx); - } - }); + if (shard_) { + shard_->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx); + } } private: // training parameter LinearTrainParam tparam_; CoordinateParam coord_param_; - GPUDistribution dist_; + int device_{}; std::unique_ptr selector_; common::Monitor monitor_; - std::vector> shards_; + std::unique_ptr shard_{nullptr}; }; XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent") diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index 0415f98e1..661d02dac 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -30,8 +30,7 @@ DMLC_REGISTRY_FILE_TAG(elementwise_metric); template class ElementWiseMetricsReduction { public: - explicit ElementWiseMetricsReduction(EvalRow policy) : - policy_(std::move(policy)) {} + explicit ElementWiseMetricsReduction(EvalRow policy) : policy_(std::move(policy)) {} PackedReduceResult CpuReduceMetrics( const HostDeviceVector& weights, @@ -59,34 +58,31 @@ class ElementWiseMetricsReduction { #if defined(XGBOOST_USE_CUDA) ~ElementWiseMetricsReduction() { - for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices_.Index(id); - allocators_.at(index).Free(); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + allocator_.Free(); } } PackedReduceResult DeviceReduceMetrics( - GPUSet::GpuIdType device_id, - size_t device_index, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { - size_t n_data = preds.DeviceSize(device_id); + size_t n_data = preds.DeviceSize(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; - auto s_label = labels.DeviceSpan(device_id); - auto s_preds = preds.DeviceSpan(device_id); - auto s_weights = weights.DeviceSpan(device_id); + auto s_label = labels.DeviceSpan(); + auto s_preds = preds.DeviceSpan(); + auto s_weights = weights.DeviceSpan(); bool const is_null_weight = weights.Size() == 0; auto d_policy = policy_; PackedReduceResult result = thrust::transform_reduce( - thrust::cuda::par(allocators_.at(device_index)), + thrust::cuda::par(allocator_), begin, end, [=] XGBOOST_DEVICE(size_t idx) { bst_float weight = is_null_weight ? 
1.0f : s_weights[idx]; @@ -105,37 +101,24 @@ class ElementWiseMetricsReduction { PackedReduceResult Reduce( const GenericParameter &tparam, - GPUSet devices, + int device, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (devices.IsEmpty()) { + if (device < 0) { result = CpuReduceMetrics(weights, labels, preds); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - if (allocators_.empty()) { - devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); - allocators_.resize(devices_.Size()); - } - preds.Shard(devices); - labels.Shard(devices); - weights.Shard(devices); - std::vector res_per_device(devices.Size()); + device_ = device; + preds.SetDevice(device_); + labels.SetDevice(device_); + weights.SetDevice(device_); -#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) - for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices.Index(id); - res_per_device.at(index) = - DeviceReduceMetrics(id, index, weights, labels, preds); - } - - for (auto const& res : res_per_device) { - result += res; - } + dh::safe_cuda(cudaSetDevice(device_)); + result = DeviceReduceMetrics(weights, labels, preds); } #endif // defined(XGBOOST_USE_CUDA) return result; @@ -144,8 +127,8 @@ class ElementWiseMetricsReduction { private: EvalRow policy_; #if defined(XGBOOST_USE_CUDA) - GPUSet devices_; - std::vector allocators_; + int device_{-1}; + dh::CubMemory allocator_; #endif // defined(XGBOOST_USE_CUDA) }; @@ -345,11 +328,10 @@ struct EvalEWiseBase : public Metric { << "label and prediction size not match, " << "hint: use merror or mlogloss for multi-class classification"; const auto ndata = static_cast(info.labels_.Size()); - // Dealing with ndata < n_gpus. 
- GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); + int device = tparam_->gpu_id; auto result = - reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds); + reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds); double dat[2] { result.Residue(), result.Weights() }; if (distributed) { diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index 4dde1ed4d..5b18b9f5f 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -74,35 +74,32 @@ class MultiClassMetricsReduction { #if defined(XGBOOST_USE_CUDA) ~MultiClassMetricsReduction() { - for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices_.Index(id); - allocators_.at(index).Free(); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + allocator_.Free(); } } PackedReduceResult DeviceReduceMetrics( - GPUSet::GpuIdType device_id, - size_t device_index, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds, const size_t n_class) { - size_t n_data = labels.DeviceSize(device_id); + size_t n_data = labels.DeviceSize(); thrust::counting_iterator begin(0); thrust::counting_iterator end = begin + n_data; - auto s_labels = labels.DeviceSpan(device_id); - auto s_preds = preds.DeviceSpan(device_id); - auto s_weights = weights.DeviceSpan(device_id); + auto s_labels = labels.DeviceSpan(); + auto s_preds = preds.DeviceSpan(); + auto s_weights = weights.DeviceSpan(); bool const is_null_weight = weights.Size() == 0; auto s_label_error = label_error_.GetSpan(1); s_label_error[0] = 0; PackedReduceResult result = thrust::transform_reduce( - thrust::cuda::par(allocators_.at(device_index)), + thrust::cuda::par(allocator_), begin, end, [=] XGBOOST_DEVICE(size_t idx) { bst_float weight = is_null_weight ? 
1.0f : s_weights[idx]; @@ -127,38 +124,25 @@ class MultiClassMetricsReduction { PackedReduceResult Reduce( const GenericParameter &tparam, - GPUSet devices, + int device, size_t n_class, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (devices.IsEmpty()) { + if (device < 0) { result = CpuReduceMetrics(weights, labels, preds, n_class); } #if defined(XGBOOST_USE_CUDA) else { // NOLINT - if (allocators_.empty()) { - devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus); - allocators_.resize(devices_.Size()); - } - preds.Shard(GPUDistribution::Granular(devices, n_class)); - labels.Shard(devices); - weights.Shard(devices); - std::vector res_per_device(devices.Size()); + device_ = tparam.gpu_id; + preds.SetDevice(device_); + labels.SetDevice(device_); + weights.SetDevice(device_); -#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1) - for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) { - dh::safe_cuda(cudaSetDevice(id)); - size_t index = devices.Index(id); - res_per_device.at(index) = - DeviceReduceMetrics(id, index, weights, labels, preds, n_class); - } - - for (auto const& res : res_per_device) { - result += res; - } + dh::safe_cuda(cudaSetDevice(device_)); + result = DeviceReduceMetrics(weights, labels, preds, n_class); } #endif // defined(XGBOOST_USE_CUDA) return result; @@ -167,8 +151,8 @@ class MultiClassMetricsReduction { private: #if defined(XGBOOST_USE_CUDA) dh::PinnedMemory label_error_; - GPUSet devices_; - std::vector allocators_; + int device_{-1}; + dh::CubMemory allocator_; #endif // defined(XGBOOST_USE_CUDA) }; @@ -190,8 +174,8 @@ struct EvalMClassBase : public Metric { << " use logloss for binary classification"; const auto ndata = static_cast(info.labels_.Size()); - GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); - auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds); + int device = tparam_->gpu_id; + auto result = reducer_.Reduce(*tparam_, device, nclass, info.weights_, info.labels_, preds); double dat[2] { result.Residue(), result.Weights() }; if (distributed) { diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0dd3010bc..88a9b06f3 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -58,7 +58,7 @@ class HingeObj : public ObjFunction { _out_gpair[_idx] = GradientPair(g, h); }, common::Range{0, static_cast(ndata)}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata)).Eval( + tparam_->gpu_id).Eval( out_gpair, &preds, &info.labels_, &info.weights_); } @@ -68,7 +68,7 @@ class HingeObj : public ObjFunction { _preds[_idx] = _preds[_idx] > 0.0 ? 
1.0 : 0.0; }, common::Range{0, static_cast(io_preds->Size()), 1}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index ca803f522..80cf69410 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -59,14 +59,14 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(preds.Size() / nclass); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - out_gpair->Shard(GPUDistribution::Granular(devices, nclass)); - info.labels_.Shard(GPUDistribution::Block(devices)); - info.weights_.Shard(GPUDistribution::Block(devices)); - preds.Shard(GPUDistribution::Granular(devices, nclass)); + auto device = tparam_->gpu_id; + out_gpair->SetDevice(device); + info.labels_.SetDevice(device); + info.weights_.SetDevice(device); + preds.SetDevice(device); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); - label_correct_.Shard(GPUDistribution::Block(devices)); + label_correct_.Resize(1); + label_correct_.SetDevice(device); out_gpair->Resize(preds.Size()); label_correct_.Fill(1); @@ -100,7 +100,7 @@ class SoftmaxMultiClassObj : public ObjFunction { p = label == k ? p - 1.0f : p; gpair[idx * nclass + k] = GradientPair(p * wt, h); } - }, common::Range{0, ndata}, devices, false) + }, common::Range{0, ndata}, device, false) .Eval(out_gpair, &info.labels_, &preds, &info.weights_, &label_correct_); std::vector& label_correct_h = label_correct_.HostVector(); @@ -125,7 +125,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const auto ndata = static_cast(io_preds->Size() / nclass); max_preds_.Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size()); + auto device = tparam_->gpu_id; if (prob) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { @@ -133,11 +133,11 @@ class SoftmaxMultiClassObj : public ObjFunction { _preds.subspan(_idx * nclass, nclass); common::Softmax(point.begin(), point.end()); }, - common::Range{0, ndata}, GPUDistribution::Granular(devices, nclass)) + common::Range{0, ndata}, device) .Eval(io_preds); } else { - io_preds->Shard(GPUDistribution::Granular(devices, nclass)); - max_preds_.Shard(GPUDistribution::Block(devices)); + io_preds->SetDevice(device); + max_preds_.SetDevice(device); common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds, @@ -148,7 +148,7 @@ class SoftmaxMultiClassObj : public ObjFunction { common::FindMaxIndex(point.cbegin(), point.cend()) - point.cbegin(); }, - common::Range{0, ndata}, devices, false) + common::Range{0, ndata}, device, false) .Eval(io_preds, &max_preds_); } if (!prob) { diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 0354d5777..ea2c92566 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -57,8 +57,8 @@ class RegLossObj : public ObjFunction { << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size(); size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 
1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); bool is_null_weight = info.weights_.Size() == 0; @@ -83,7 +83,7 @@ class RegLossObj : public ObjFunction { _out_gpair[_idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, Loss::SecondOrderGradient(p, label) * w); }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -105,7 +105,7 @@ class RegLossObj : public ObjFunction { [] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { _preds[_idx] = Loss::PredTransform(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } @@ -175,8 +175,8 @@ class PoissonRegression : public ObjFunction { CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); bool is_null_weight = info.weights_.Size() == 0; @@ -197,7 +197,7 @@ class PoissonRegression : public ObjFunction { _out_gpair[_idx] = GradientPair{(expf(p) - y) * w, expf(p + max_delta_step) * w}; }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host std::vector& label_correct_h = label_correct_.HostVector(); @@ -213,7 +213,7 @@ class PoissonRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -340,9 +340,9 @@ class GammaRegression : public ObjFunction { CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels_.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata); + auto device = tparam_->gpu_id; out_gpair->Resize(ndata); - label_correct_.Resize(devices.IsEmpty() ? 
1 : devices.Size()); + label_correct_.Resize(1); label_correct_.Fill(1); const bool is_null_weight = info.weights_.Size() == 0; @@ -361,7 +361,7 @@ class GammaRegression : public ObjFunction { } _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w); }, - common::Range{0, static_cast(ndata)}, devices).Eval( + common::Range{0, static_cast(ndata)}, device).Eval( &label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -378,7 +378,7 @@ class GammaRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -430,8 +430,8 @@ class TweedieRegression : public ObjFunction { const size_t ndata = preds.Size(); out_gpair->Resize(ndata); - auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, preds.Size()); - label_correct_.Resize(devices.IsEmpty() ? 1 : devices.Size()); + auto device = tparam_->gpu_id; + label_correct_.Resize(1); label_correct_.Fill(1); const bool is_null_weight = info.weights_.Size() == 0; @@ -455,7 +455,7 @@ class TweedieRegression : public ObjFunction { std::exp((1 - rho) * p) + (2 - rho) * expf((2 - rho) * p); _out_gpair[_idx] = GradientPair(grad * w, hess * w); }, - common::Range{0, static_cast(ndata), 1}, devices) + common::Range{0, static_cast(ndata), 1}, device) .Eval(&label_correct_, out_gpair, &preds, &info.labels_, &info.weights_); // copy "label correct" flags back to host @@ -472,7 +472,7 @@ class TweedieRegression : public ObjFunction { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, - GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, io_preds->Size())) + tparam_->gpu_id) .Eval(io_preds); } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 5fe4d426d..bd3ac3583 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -20,12 +20,6 @@ namespace predictor { DMLC_REGISTRY_FILE_TAG(gpu_predictor); -template -void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) { - thrust::transform(begin_itr, end_itr, begin_itr, - [=] __device__(size_t elem) { return elem + amount; }); -} - /** * \struct DevicePredictionNode * @@ -44,7 +38,7 @@ struct DevicePredictionNode { int fidx; int left_child_idx; int right_child_idx; - NodeValue val; + NodeValue val{}; DevicePredictionNode(const RegTree::Node& n) { // NOLINT static_assert(sizeof(DevicePredictionNode) == 16, "Size is not 16 bytes"); @@ -200,58 +194,14 @@ __global__ void PredictKernel(common::Span d_nodes, } class GPUPredictor : public xgboost::Predictor { - protected: - struct DevicePredictionCacheEntry { - std::shared_ptr data; - HostDeviceVector predictions; - }; - private: - void DeviceOffsets(const HostDeviceVector& data, - size_t total_size, - std::vector* out_offsets) { - auto& offsets = *out_offsets; - offsets.resize(devices_.Size() + 1); - offsets[0] = 0; -#pragma omp parallel for schedule(static, 1) if (devices_.Size() > 1) - for (int shard = 0; shard < devices_.Size(); ++shard) { - int device = devices_.DeviceId(shard); - auto data_span = data.DeviceSpan(device); - dh::safe_cuda(cudaSetDevice(device)); - if (data_span.size() == 0) { - offsets[shard + 1] = total_size; - } else { - // copy the last element from every shard - dh::safe_cuda(cudaMemcpy(&offsets.at(shard + 1), - &data_span[data_span.size()-1], - 
sizeof(size_t), cudaMemcpyDeviceToHost)); - } - } - } - - // This function populates the explicit offsets that can be used to create a window into the - // underlying host vector. The window starts from the `batch_offset` and has a size of - // `batch_size`, and is sharded across all the devices. Each shard is granular depending on - // the number of output classes `n_classes`. - void PredictionDeviceOffsets(size_t total_size, size_t batch_offset, size_t batch_size, - int n_classes, std::vector* out_offsets) { - auto& offsets = *out_offsets; - size_t n_shards = devices_.Size(); - offsets.resize(n_shards + 2); - size_t rows_per_shard = common::DivRoundUp(batch_size, n_shards); - for (size_t shard = 0; shard < devices_.Size(); ++shard) { - size_t n_rows = std::min(batch_size, shard * rows_per_shard); - offsets[shard] = batch_offset + n_rows * n_classes; - } - offsets[n_shards] = batch_offset + batch_size * n_classes; - offsets[n_shards + 1] = total_size; - } - struct DeviceShard { DeviceShard() : device_{-1} {} ~DeviceShard() { - dh::safe_cuda(cudaSetDevice(device_)); + if (device_ >= 0) { + dh::safe_cuda(cudaSetDevice(device_)); + } } void Init(int device) { @@ -284,10 +234,9 @@ class GPUPredictor : public xgboost::Predictor { void PredictInternal (const SparsePage& batch, size_t num_features, HostDeviceVector* predictions) { - if (predictions->DeviceSize(device_) == 0) { return; } dh::safe_cuda(cudaSetDevice(device_)); const int BLOCK_THREADS = 128; - size_t num_rows = batch.offset.DeviceSize(device_) - 1; + size_t num_rows = batch.offset.DeviceSize() - 1; const int GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); int shared_memory_bytes = static_cast @@ -297,14 +246,12 @@ class GPUPredictor : public xgboost::Predictor { shared_memory_bytes = 0; use_shared = false; } - const auto& data_distr = batch.data.Distribution(); - size_t entry_start = data_distr.ShardStart(batch.data.Size(), - data_distr.Devices().Index(device_)); + size_t entry_start = 0; PredictKernel<<>> - (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_), - dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_), - batch.data.DeviceSpan(device_), this->tree_begin_, this->tree_end_, num_features, + (dh::ToSpan(nodes_), predictions->DeviceSpan(), dh::ToSpan(tree_segments_), + dh::ToSpan(tree_group_), batch.offset.DeviceSpan(), + batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows, entry_start, use_shared, this->num_group_); } @@ -322,7 +269,7 @@ class GPUPredictor : public xgboost::Predictor { void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) { CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device - thrust::host_vector h_tree_segments; + thrust::host_vector h_tree_segments{}; h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; h_tree_segments.push_back(sum); @@ -337,9 +284,7 @@ class GPUPredictor : public xgboost::Predictor { std::copy(src_nodes.begin(), src_nodes.end(), h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]); } - dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard &shard) { - shard.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); - }); + shard_.InitModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); } void DevicePredictInternal(DMatrix* dmat, @@ -352,40 +297,43 @@ class GPUPredictor : public xgboost::Predictor { InitModel(model, tree_begin, tree_end); size_t batch_offset = 0; + auto* preds = out_preds; + std::unique_ptr> 
batch_preds{nullptr}; for (auto &batch : dmat->GetBatches()) { bool is_external_memory = batch.Size() < dmat->Info().num_row_; if (is_external_memory) { - std::vector out_preds_offsets; - PredictionDeviceOffsets(out_preds->Size(), batch_offset, batch.Size(), - model.param.num_output_group, &out_preds_offsets); - out_preds->Reshard(GPUDistribution::Explicit(devices_, out_preds_offsets)); + batch_preds.reset(new HostDeviceVector); + batch_preds->Resize(batch.Size() * model.param.num_output_group); + std::copy(out_preds->ConstHostVector().begin() + batch_offset, + out_preds->ConstHostVector().begin() + batch_offset + batch_preds->Size(), + batch_preds->HostVector().begin()); + preds = batch_preds.get(); } - batch.offset.Shard(GPUDistribution::Overlap(devices_, 1)); - std::vector device_offsets; - DeviceOffsets(batch.offset, batch.data.Size(), &device_offsets); - batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets)); + batch.offset.SetDevice(device_); + batch.data.SetDevice(device_); + preds->SetDevice(device_); + shard_.PredictInternal(batch, model.param.num_feature, preds); - dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) { - shard.PredictInternal(batch, model.param.num_feature, out_preds); - }); + if (is_external_memory) { + auto h_preds = preds->ConstHostVector(); + std::copy(h_preds.begin(), h_preds.end(), out_preds->HostVector().begin() + batch_offset); + } batch_offset += batch.Size() * model.param.num_output_group; } - out_preds->Reshard(GPUDistribution::Granular(devices_, model.param.num_output_group)); monitor_.StopCuda("DevicePredictInternal"); } public: - GPUPredictor() = default; + GPUPredictor() : device_{-1} {}; void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, const gbm::GBTreeModel& model, int tree_begin, unsigned ntree_limit = 0) override { - GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus, - dmat->Info().num_row_); - CHECK_NE(devices.Size(), 0); - ConfigureShards(devices); + int device = learner_param_->gpu_id; + CHECK_GE(device, 0); + ConfigureShard(device); if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) { return; @@ -408,10 +356,9 @@ class GPUPredictor : public xgboost::Predictor { size_t n_classes = model.param.num_output_group; size_t n = n_classes * info.num_row_; const HostDeviceVector& base_margin = info.base_margin_; - out_preds->Shard(GPUDistribution::Granular(devices_, n_classes)); out_preds->Resize(n); if (base_margin.Size() != 0) { - CHECK_EQ(out_preds->Size(), n); + CHECK_EQ(base_margin.Size(), n); out_preds->Copy(base_margin); } else { out_preds->Fill(model.base_margin); @@ -427,7 +374,7 @@ class GPUPredictor : public xgboost::Predictor { const HostDeviceVector& y = it->second.predictions; if (y.Size() != 0) { monitor_.StartCuda("PredictFromCache"); - out_preds->Shard(y.Distribution()); + out_preds->SetDevice(y.DeviceIdx()); out_preds->Resize(y.Size()); out_preds->Copy(y); monitor_.StopCuda("PredictFromCache"); @@ -500,25 +447,23 @@ class GPUPredictor : public xgboost::Predictor { const std::vector>& cache) override { Predictor::Configure(cfg, cache); - GPUSet devices = GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus); - ConfigureShards(devices); + int device = learner_param_->gpu_id; + if (device >= 0) { + ConfigureShard(device); + } } private: /*! \brief Re configure shards when GPUSet is changed. 
*/ - void ConfigureShards(GPUSet devices) { - if (devices_ == devices) return; + void ConfigureShard(int device) { + if (device_ == device) return; - devices_ = devices; - shards_.clear(); - shards_.resize(devices_.Size()); - dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){ - shard.Init(devices_.DeviceId(i)); - }); + device_ = device; + shard_.Init(device_); } - std::vector shards_; - GPUSet devices_; + DeviceShard shard_; + int device_; common::Monitor monitor_; }; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index f78825ab6..b72acac8e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -702,7 +702,7 @@ struct DeviceShard { row_partitioner.reset(new RowPartitioner(device_id, n_rows)); dh::safe_cuda(cudaMemcpyAsync( - gpair.data(), dh_gpair->ConstDevicePointer(device_id), + gpair.data(), dh_gpair->ConstDevicePointer(), gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost)); SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx); hist.Reset(); @@ -745,8 +745,8 @@ struct DeviceShard { for (auto i = 0ull; i < nidxs.size(); i++) { auto nidx = nidxs[i]; auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx)); - p_feature_set->Shard(GPUSet(device_id, 1)); - auto d_sampled_features = p_feature_set->DeviceSpan(device_id); + p_feature_set->SetDevice(device_id); + auto d_sampled_features = p_feature_set->DeviceSpan(); common::Span d_feature_set = interaction_constraints.Query(d_sampled_features, nidx); auto d_split_candidates = @@ -1016,7 +1016,7 @@ struct DeviceShard { dh::AllReducer* reducer, int64_t num_columns) { constexpr int kRootNIdx = 0; - const auto &gpair = gpair_all->DeviceSpan(device_id); + const auto &gpair = gpair_all->DeviceSpan(); dh::SumReduction(temp_memory, gpair, node_sum_gradients_d, gpair.size()); @@ -1294,11 +1294,8 @@ class GPUHistMakerSpecialised { param_.InitAllowUnknown(args); generic_param_ = generic_param; hist_maker_param_.InitAllowUnknown(args); - auto devices = GPUSet::All(generic_param_->gpu_id, - generic_param_->n_gpus); - n_devices_ = devices.Size(); - CHECK(n_devices_ != 0) << "Must have at least one device"; - dist_ = GPUDistribution::Block(devices); + device_ = generic_param_->gpu_id; + CHECK_GE(device_, 0) << "Must have at least one device"; dh::CheckComputeCapability(); @@ -1330,30 +1327,22 @@ class GPUHistMakerSpecialised { void InitDataOnce(DMatrix* dmat) { info_ = &dmat->Info(); - int n_devices = dist_.Devices().Size(); - - device_list_.resize(n_devices); - for (int index = 0; index < n_devices; ++index) { - int device_id = dist_.Devices().DeviceId(index); - device_list_[index] = device_id; - } - - reducer_.Init(device_list_); + reducer_.Init({device_}); // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); // Create device shards - shards_.resize(n_devices); + shards_.resize(1); dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { - dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx))); - size_t start = dist_.ShardStart(info_->num_row_, idx); - size_t size = dist_.ShardSize(info_->num_row_, idx); + dh::safe_cuda(cudaSetDevice(device_)); + size_t start = 0; + size_t size = info_->num_row_; shard = std::unique_ptr>( - new DeviceShard(dist_.Devices().DeviceId(idx), idx, + new DeviceShard(device_, idx, start, start + size, param_, column_sampling_seed, info_->num_col_)); @@ -1436,7 +1425,7 @@ class 
GPUHistMakerSpecialised { for (auto& tree : trees) { tree = *p_tree; } - gpair->Reshard(dist_); + gpair->SetDevice(device_); // Launch one thread for each device "shard" containing a subset of rows. // Threads will cooperatively build the tree, synchronising over histograms. @@ -1462,13 +1451,13 @@ class GPUHistMakerSpecialised { return false; } monitor_.StartCuda("UpdatePredictionCache"); - p_out_preds->Shard(dist_.Devices()); + p_out_preds->SetDevice(device_); dh::ExecuteIndexShards( &shards_, [&](int idx, std::unique_ptr>& shard) { dh::safe_cuda(cudaSetDevice(shard->device_id)); shard->UpdatePredictionCache( - p_out_preds->DevicePointer(shard->device_id)); + p_out_preds->DevicePointer()); }); monitor_.StopCuda("UpdatePredictionCache"); return true; @@ -1483,7 +1472,6 @@ class GPUHistMakerSpecialised { private: bool initialised_; - int n_devices_; int n_bins_; GPUHistMakerTrainParam hist_maker_param_; @@ -1492,11 +1480,9 @@ class GPUHistMakerSpecialised { dh::AllReducer reducer_; DMatrix* p_last_fmat_; - GPUDistribution dist_; + int device_; common::Monitor monitor_; - /*! List storing device id. */ - std::vector device_list_; }; class GPUHistMaker : public TreeUpdater { diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc deleted file mode 100644 index ba6946e80..000000000 --- a/tests/cpp/common/test_common.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "../../../src/common/common.h" -#include - -namespace xgboost { -TEST(GPUSet, Basic) { - GPUSet devices = GPUSet::Empty(); - ASSERT_TRUE(devices.IsEmpty()); - - devices = GPUSet{0, 1}; - ASSERT_TRUE(devices != GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 1); - - devices = GPUSet::Range(1, 0); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - EXPECT_FALSE(devices.Contains(1)); - - devices = GPUSet::Range(2, -1); - EXPECT_EQ(devices, GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - devices = GPUSet::Range(2, 8); // 2 ~ 10 - EXPECT_EQ(devices.Size(), 8); - EXPECT_ANY_THROW(devices.DeviceId(8)); - - auto device_id = devices.DeviceId(0); - EXPECT_EQ(device_id, 2); - auto device_index = devices.Index(2); - EXPECT_EQ(device_index, 0); - -#ifndef XGBOOST_USE_CUDA - EXPECT_EQ(GPUSet::AllVisible(), GPUSet::Empty()); -#endif -} -} // namespace xgboost diff --git a/tests/cpp/common/test_common.cu b/tests/cpp/common/test_common.cu deleted file mode 100644 index f2d62cebc..000000000 --- a/tests/cpp/common/test_common.cu +++ /dev/null @@ -1,83 +0,0 @@ -#include -#include -#include "../../../src/common/common.h" -#include "../helpers.h" - -#include - -namespace xgboost { - -TEST(GPUSet, GPUBasic) { - GPUSet devices = GPUSet::Empty(); - ASSERT_TRUE(devices.IsEmpty()); - - devices = GPUSet{1, 1}; - ASSERT_TRUE(devices != GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 1); - EXPECT_EQ(*(devices.begin()), 1); - - devices = GPUSet::Range(1, 0); - EXPECT_EQ(devices, GPUSet::Empty()); - EXPECT_EQ(devices.Size(), 0); - EXPECT_TRUE(devices.IsEmpty()); - - EXPECT_FALSE(devices.Contains(1)); - - devices = GPUSet::Range(2, -1); - EXPECT_EQ(devices, GPUSet::Empty()); - - devices = GPUSet::Range(2, 8); - EXPECT_EQ(devices.Size(), 8); - - EXPECT_EQ(*devices.begin(), 2); - EXPECT_EQ(*devices.end(), 2 + devices.Size()); - EXPECT_EQ(8, devices.Size()); - - ASSERT_NO_THROW(GPUSet::AllVisible()); - devices = GPUSet::AllVisible(); - if (devices.IsEmpty()) { - LOG(WARNING) << "Empty devices."; - } -} - -TEST(GPUSet, Verbose) { - { - std::map args {}; - args["verbosity"] = "3"; // LOG INFO - - 
testing::internal::CaptureStderr(); - ConsoleLogger::Configure({args.cbegin(), args.cend()}); - GPUSet::All(0, 1); - std::string output = testing::internal::GetCapturedStderr(); - ASSERT_NE(output.find("GPU ID: 0"), std::string::npos); - ASSERT_NE(output.find("GPUs: 1"), std::string::npos); - - args["verbosity"] = "1"; // restore - ConsoleLogger::Configure({args.cbegin(), args.cend()}); - } -} - -#if defined(XGBOOST_USE_NCCL) -TEST(GPUSet, MGPU_GPUBasic) { - { - GPUSet devices = GPUSet::All(1, 1); - ASSERT_EQ(*(devices.begin()), 1); - ASSERT_EQ(*(devices.end()), 2); - ASSERT_EQ(devices.Size(), 1); - ASSERT_TRUE(devices.Contains(1)); - } - - { - GPUSet devices = GPUSet::All(0, -1); - ASSERT_GE(devices.Size(), 2); - } - - // Specify number of rows. - { - GPUSet devices = GPUSet::All(0, -1, 1); - ASSERT_EQ(devices.Size(), 1); - } -} -#endif - -} // namespace xgboost \ No newline at end of file diff --git a/tests/cpp/common/test_config.cc b/tests/cpp/common/test_config.cc index 542bf2c30..7bf61dcfd 100644 --- a/tests/cpp/common/test_config.cc +++ b/tests/cpp/common/test_config.cc @@ -87,8 +87,8 @@ TEST(ConfigParser, ParseKeyValuePair) { ASSERT_TRUE(parser.ParseKeyValuePair("booster = gbtree", &key, &value)); ASSERT_EQ(key, "booster"); ASSERT_EQ(value, "gbtree"); - ASSERT_TRUE(parser.ParseKeyValuePair("n_gpus = 2", &key, &value)); - ASSERT_EQ(key, "n_gpus"); + ASSERT_TRUE(parser.ParseKeyValuePair("gpu_id = 2", &key, &value)); + ASSERT_EQ(key, "gpu_id"); ASSERT_EQ(value, "2"); ASSERT_TRUE(parser.ParseKeyValuePair("monotone_constraints = (1,0,-1)", &key, &value)); diff --git a/tests/cpp/common/test_gpu_hist_util.cu b/tests/cpp/common/test_gpu_hist_util.cu index 5be7d8dd2..bbb1cc4bf 100644 --- a/tests/cpp/common/test_gpu_hist_util.cu +++ b/tests/cpp/common/test_gpu_hist_util.cu @@ -18,7 +18,7 @@ namespace xgboost { namespace common { -void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { +void TestDeviceSketch(bool use_external_memory) { // create the data int nrows = 10001; std::shared_ptr *dmat = nullptr; @@ -53,7 +53,7 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { // find the cuts on the GPU HistogramCuts hmat_gpu; - size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0, devices.Size()), gpu_batch_nrows, + size_t row_stride = DeviceSketch(p, CreateEmptyGenericParam(0), gpu_batch_nrows, dmat->get(), &hmat_gpu); // compare the row stride with the one obtained from the dmatrix @@ -81,11 +81,11 @@ void TestDeviceSketch(const GPUSet& devices, bool use_external_memory) { } TEST(gpu_hist_util, DeviceSketch) { - TestDeviceSketch(GPUSet::Range(0, 1), false); + TestDeviceSketch(false); } TEST(gpu_hist_util, DeviceSketch_ExternalMemory) { - TestDeviceSketch(GPUSet::Range(0, 1), true); + TestDeviceSketch(true); } } // namespace common diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 57c49754e..1ab4348c8 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -30,45 +30,36 @@ struct HostDeviceVectorSetDeviceHandler { } }; -void InitHostDeviceVector(size_t n, const GPUDistribution& distribution, - HostDeviceVector *v) { +void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { // create the vector - GPUSet devices = distribution.Devices(); - v->Shard(distribution); + v->SetDevice(device); v->Resize(n); ASSERT_EQ(v->Size(), n); - ASSERT_TRUE(v->Distribution() == distribution); - ASSERT_TRUE(v->Devices() == devices); - // ensure 
that the devices have read-write access - for (int i = 0; i < devices.Size(); ++i) { - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite)); - } + ASSERT_EQ(v->DeviceIdx(), device); + // ensure that the device has read-write access + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); // ensure that the host has no access ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); // fill in the data on the host std::vector& data_h = v->HostVector(); - // ensure that the host has full access, while the devices have none + // ensure that the host has full access, while the device has none ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite)); - for (int i = 0; i < devices.Size(); ++i) { - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite)); - } + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); ASSERT_EQ(data_h.size(), n); std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin()); } void PlusOne(HostDeviceVector *v) { - int n_devices = v->Devices().Size(); - for (int i = 0; i < n_devices; ++i) { - SetDevice(i); - thrust::transform(v->tbegin(i), v->tend(i), v->tbegin(i), - [=]__device__(unsigned int a){ return a + 1; }); - } + int device = v->DeviceIdx(); + SetDevice(device); + thrust::transform(v->tbegin(), v->tend(), v->tbegin(), + [=]__device__(unsigned int a){ return a + 1; }); } void CheckDevice(HostDeviceVector *v, @@ -76,24 +67,24 @@ void CheckDevice(HostDeviceVector *v, const std::vector& sizes, unsigned int first, GPUAccess access) { int n_devices = sizes.size(); - ASSERT_EQ(v->Devices().Size(), n_devices); + ASSERT_EQ(n_devices, 1); for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceSize(i), sizes.at(i)); + ASSERT_EQ(v->DeviceSize(), sizes.at(i)); SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tcbegin(i), v->tcend(i), + ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(), thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); // ensure that the device has at most the access specified by access - ASSERT_EQ(v->DeviceCanAccess(i, GPUAccess::kWrite), access == GPUAccess::kWrite); + ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); for (int i = 0; i < n_devices; ++i) { SetDevice(i); - ASSERT_TRUE(thrust::equal(v->tbegin(i), v->tend(i), + ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(), thrust::make_counting_iterator(first + starts[i]))); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kRead)); - ASSERT_TRUE(v->DeviceCanAccess(i, GPUAccess::kWrite)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead)); + ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite)); } ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite)); @@ -107,20 +98,20 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) { } ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead)); ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite); - size_t n_devices = v->Devices().Size(); + size_t n_devices = 1; for (int i = 0; i < n_devices; ++i) { - ASSERT_EQ(v->DeviceCanAccess(i,
GPUAccess::kRead), access == GPUAccess::kRead); + ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead); // the devices should have no write access - ASSERT_FALSE(v->DeviceCanAccess(i, GPUAccess::kWrite)); + ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite)); } } void TestHostDeviceVector -(size_t n, const GPUDistribution& distribution, +(size_t n, int device, const std::vector& starts, const std::vector& sizes) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; - InitHostDeviceVector(n, distribution, &v); + InitHostDeviceVector(n, device, &v); CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); PlusOne(&v); CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); @@ -130,54 +121,24 @@ void TestHostDeviceVector TEST(HostDeviceVector, TestBlock) { size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestGranular) { - size_t n = 3003; - int n_devices = 2; - auto distribution = GPUDistribution::Granular(GPUSet::Range(0, n_devices), 3); - std::vector starts{0, 1503}; - std::vector sizes{1503, 1500}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestOverlap) { - size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1); - std::vector starts{0, 500}; - std::vector sizes{501, 501}; - TestHostDeviceVector(n, distribution, starts, sizes); -} - -TEST(HostDeviceVector, TestExplicit) { - size_t n = 1001; - int n_devices = 2; - std::vector offsets{0, 550, 1001}; - auto distribution = GPUDistribution::Explicit(GPUSet::Range(0, n_devices), offsets); - std::vector starts{0, 550}; - std::vector sizes{550, 451}; - TestHostDeviceVector(n, distribution, starts, sizes); + int device = 0; + std::vector starts{0}; + std::vector sizes{1001}; + TestHostDeviceVector(n, device, starts, sizes); } TEST(HostDeviceVector, TestCopy) { size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; + int device = 0; + std::vector starts{0}; + std::vector sizes{1001}; HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; { // a separate scope to ensure that v1 is gone before further checks HostDeviceVector v1; - InitHostDeviceVector(n, distribution, &v1); + InitHostDeviceVector(n, device, &v1); v = v1; } CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); @@ -193,16 +154,16 @@ TEST(HostDeviceVector, Shard) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto devices = GPUSet::Range(0, 1); + auto device = 0; - vec.Shard(devices); - ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); + vec.SetDevice(device); + ASSERT_EQ(vec.DeviceSize(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size()); - auto span = vec.DeviceSpan(0); // sync to device + auto span = vec.DeviceSpan(); // sync to device - vec.Reshard(GPUDistribution::Empty()); // pull back to cpu, empty devices. + vec.SetDevice(-1); // pull back to cpu. 
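// The Shard test above is the migration in miniature: a HostDeviceVector now
// lives on at most one device, selected with SetDevice(), and -1 means host
// only. A minimal sketch of the resulting access pattern (the element type and
// values are illustrative; the accessors are the ones used throughout this diff):
//
//   std::vector<float> h_vec(100, 1.0f);
//   xgboost::HostDeviceVector<float> vec(h_vec);  // data starts on the host
//   vec.SetDevice(0);             // bind to device ordinal 0
//   auto span = vec.DeviceSpan(); // first device access syncs host -> device
//   vec.SetDevice(-1);            // detach; -1 selects the CPU
//   auto& h = vec.HostVector();   // next host access pulls the data back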
ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_TRUE(vec.Devices().IsEmpty()); + ASSERT_EQ(vec.DeviceIdx(), -1); auto h_vec_1 = vec.HostVector(); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); @@ -214,16 +175,16 @@ TEST(HostDeviceVector, Reshard) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto devices = GPUSet::Range(0, 1); + auto device = 0; - vec.Shard(devices); - ASSERT_EQ(vec.DeviceSize(0), h_vec.size()); + vec.SetDevice(device); + ASSERT_EQ(vec.DeviceSize(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size()); PlusOne(&vec); - vec.Reshard(GPUDistribution::Empty()); + vec.SetDevice(-1); ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_TRUE(vec.Devices().IsEmpty()); + ASSERT_EQ(vec.DeviceIdx(), -1); auto h_vec_1 = vec.HostVector(); for (size_t i = 0; i < h_vec_1.size(); ++i) { @@ -233,97 +194,14 @@ TEST(HostDeviceVector, Reshard) { TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.Shard(GPUSet{0, 1}); - auto span = vec.DeviceSpan(0); - ASSERT_EQ(vec.DeviceSize(0), span.size()); - ASSERT_EQ(vec.DevicePointer(0), span.data()); - auto const_span = vec.ConstDeviceSpan(0); - ASSERT_EQ(vec.DeviceSize(0), span.size()); - ASSERT_EQ(vec.ConstDevicePointer(0), span.data()); + vec.SetDevice(0); + auto span = vec.DeviceSpan(); + ASSERT_EQ(vec.DeviceSize(), span.size()); + ASSERT_EQ(vec.DevicePointer(), span.data()); + auto const_span = vec.ConstDeviceSpan(); + ASSERT_EQ(vec.DeviceSize(), span.size()); + ASSERT_EQ(vec.ConstDevicePointer(), span.data()); } -// Multi-GPUs' test -#if defined(XGBOOST_USE_NCCL) -TEST(HostDeviceVector, MGPU_Shard) { - auto devices = GPUSet::AllVisible(); - if (devices.Size() < 2) { - LOG(WARNING) << "Not testing in multi-gpu environment."; - return; - } - - std::vector h_vec (2345); - for (size_t i = 0; i < h_vec.size(); ++i) { - h_vec[i] = i; - } - HostDeviceVector vec (h_vec); - - // Data size for each device. - std::vector devices_size (devices.Size()); - - // From CPU to GPUs. - vec.Shard(devices); - size_t total_size = 0; - for (size_t i = 0; i < devices.Size(); ++i) { - total_size += vec.DeviceSize(i); - devices_size[i] = vec.DeviceSize(i); - } - ASSERT_EQ(total_size, h_vec.size()); - ASSERT_EQ(total_size, vec.Size()); - - // Shard from devices to devices with different distribution. 
- EXPECT_ANY_THROW( - vec.Shard(GPUDistribution::Granular(devices, 12))); - - // All data is drawn back to CPU - vec.Reshard(GPUDistribution::Empty()); - ASSERT_TRUE(vec.Devices().IsEmpty()); - ASSERT_EQ(vec.Size(), h_vec.size()); - - vec.Shard(GPUDistribution::Granular(devices, 12)); - total_size = 0; - for (size_t i = 0; i < devices.Size(); ++i) { - total_size += vec.DeviceSize(i); - devices_size[i] = vec.DeviceSize(i); - } - ASSERT_EQ(total_size, h_vec.size()); - ASSERT_EQ(total_size, vec.Size()); -} - -TEST(HostDeviceVector, MGPU_Reshard) { - auto devices = GPUSet::AllVisible(); - if (devices.Size() < 2) { - LOG(WARNING) << "Not testing in multi-gpu environment."; - return; - } - - size_t n = 1001; - int n_devices = 2; - auto distribution = GPUDistribution::Block(GPUSet::Range(0, n_devices)); - std::vector starts{0, 501}; - std::vector sizes{501, 500}; - - HostDeviceVector v; - InitHostDeviceVector(n, distribution, &v); - CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead); - PlusOne(&v); - CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); - - auto distribution1 = GPUDistribution::Overlap(GPUSet::Range(0, n_devices), 1); - v.Reshard(distribution1); - - for (size_t i = 0; i < n_devices; ++i) { - auto span = v.DeviceSpan(i); // sync to device - } - - std::vector starts1{0, 500}; - std::vector sizes1{501, 501}; - CheckDevice(&v, starts1, sizes1, 1, GPUAccess::kWrite); - CheckHost(&v, GPUAccess::kRead); - CheckHost(&v, GPUAccess::kWrite); -} -#endif - } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 1b37ba885..1ae31236b 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -33,7 +33,7 @@ std::string GetModelStr() { }, "configuration": { "booster": "gbtree", - "n_gpus": "1", + "gpu_id": "0", "num_class": "0", "num_feature": "10", "objective": "reg:linear", diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 902c282bc..4dee523ef 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -9,13 +9,11 @@ #if defined(__CUDACC__) -#define TRANSFORM_GPU_RANGE GPUSet::Range(0, 1) -#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Range(0, 1)) +#define TRANSFORM_GPU 0 #else -#define TRANSFORM_GPU_RANGE GPUSet::Empty() -#define TRANSFORM_GPU_DIST GPUDistribution::Block(GPUSet::Empty()) +#define TRANSFORM_GPU -1 #endif @@ -46,13 +44,13 @@ TEST(Transform, DeclareUnifiedTest(Basic)) { std::vector h_sol(size); InitializeRange(h_sol.begin(), h_sol.end()); - const HostDeviceVector in_vec{h_in, TRANSFORM_GPU_DIST}; - HostDeviceVector out_vec{h_out, TRANSFORM_GPU_DIST}; + const HostDeviceVector in_vec{h_in, TRANSFORM_GPU}; + HostDeviceVector out_vec{h_out, TRANSFORM_GPU}; out_vec.Fill(0); Transform<>::Init(TestTransformRange{}, Range{0, static_cast(size)}, - TRANSFORM_GPU_RANGE) + TRANSFORM_GPU) .Eval(&out_vec, &in_vec); std::vector res = out_vec.HostVector(); diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu index 29172937f..4e4c259e4 100644 --- a/tests/cpp/common/test_transform_range.cu +++ b/tests/cpp/common/test_transform_range.cu @@ -5,87 +5,13 @@ namespace xgboost { namespace common { -// Test here is multi gpu specific -TEST(Transform, MGPU_Basic) { - auto devices = GPUSet::AllVisible(); - CHECK_GT(devices.Size(), 1); - const size_t size {256}; - std::vector h_in(size); - 
std::vector h_out(size); - InitializeRange(h_in.begin(), h_in.end()); - std::vector h_sol(size); - InitializeRange(h_sol.begin(), h_sol.end()); - - const HostDeviceVector in_vec {h_in, - GPUDistribution::Block(GPUSet::Empty())}; - HostDeviceVector out_vec {h_out, - GPUDistribution::Block(GPUSet::Empty())}; - out_vec.Fill(0); - - in_vec.Shard(GPUDistribution::Granular(devices, 8)); - out_vec.Shard(GPUDistribution::Block(devices)); - - // Granularity is different, sharding will throw. - EXPECT_ANY_THROW( - Transform<>::Init(TestTransformRange{}, Range{0, size}, devices) - .Eval(&out_vec, &in_vec)); - - - Transform<>::Init(TestTransformRange{}, Range{0, size}, - devices, false).Eval(&out_vec, &in_vec); - std::vector res = out_vec.HostVector(); - - ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); -} - -// Test for multi-classes setting. -template -struct TestTransformRangeGranular { - const size_t granularity = 8; - - explicit TestTransformRangeGranular(const size_t granular) : granularity{granular} {} - void XGBOOST_DEVICE operator()(size_t _idx, - Span _out, Span _in) { - auto in_sub = _in.subspan(_idx * granularity, granularity); - auto out_sub = _out.subspan(_idx * granularity, granularity); - for (size_t i = 0; i < granularity; ++i) { - out_sub[i] = in_sub[i]; - } - } -}; - -TEST(Transform, MGPU_Granularity) { - GPUSet devices = GPUSet::All(0, -1); - - const size_t size {8990}; - const size_t granularity = 10; - - GPUDistribution distribution = - GPUDistribution::Granular(devices, granularity); - - std::vector h_in(size); - std::vector h_out(size); - InitializeRange(h_in.begin(), h_in.end()); - std::vector h_sol(size); - InitializeRange(h_sol.begin(), h_sol.end()); - - const HostDeviceVector in_vec {h_in, distribution}; - HostDeviceVector out_vec {h_out, distribution}; - - ASSERT_NO_THROW( - Transform<>::Init( - TestTransformRangeGranular{granularity}, - Range{0, size / granularity}, - distribution) - .Eval(&out_vec, &in_vec)); - std::vector res = out_vec.HostVector(); - - ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); -} - TEST(Transform, MGPU_SpecifiedGpuId) { + if (AllVisibleGPUs() < 2) { + LOG(WARNING) << "Not testing in multi-gpu environment."; + return; + } // Use 1 GPU, Numbering of GPU starts from 1 - auto devices = GPUSet::All(1, 1); + auto device = 1; const size_t size {256}; std::vector h_in(size); std::vector h_out(size); @@ -93,13 +19,11 @@ TEST(Transform, MGPU_SpecifiedGpuId) { std::vector h_sol(size); InitializeRange(h_sol.begin(), h_sol.end()); - const HostDeviceVector in_vec {h_in, - GPUDistribution::Block(devices)}; - HostDeviceVector out_vec {h_out, - GPUDistribution::Block(devices)}; + const HostDeviceVector in_vec {h_in, device}; + HostDeviceVector out_vec {h_out, device}; ASSERT_NO_THROW( - Transform<>::Init(TestTransformRange{}, Range{0, size}, devices) + Transform<>::Init(TestTransformRange{}, Range{0, size}, device) .Eval(&out_vec, &in_vec)); std::vector res = out_vec.HostVector(); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 10ba161dd..0ae2ba856 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -12,7 +12,7 @@ TEST(GBTree, SelectTreeMethod) { auto p_dmat {(*p_shared_ptr_dmat).get()}; GenericParameter generic_param; - generic_param.InitAllowUnknown(std::vector{Arg("n_gpus", "0")}); + generic_param.InitAllowUnknown(std::vector{}); std::unique_ptr p_gbm{ GradientBooster::Create("gbtree", &generic_param, 
{}, 0)}; auto& gbtree = dynamic_cast (*p_gbm); @@ -35,7 +35,7 @@ TEST(GBTree, SelectTreeMethod) { Arg{"num_feature", n_feat}}, p_dmat); ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker"); #ifdef XGBOOST_USE_CUDA - generic_param.InitAllowUnknown(std::vector{Arg{"n_gpus", "1"}}); + generic_param.InitAllowUnknown(std::vector{Arg{"gpu_id", "0"}}); gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)}, p_dmat); ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist"); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 761307fd7..1571393fd 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -29,9 +29,9 @@ #endif #if defined(__CUDACC__) -#define NGPUS 1 +#define GPUIDX 0 #else -#define NGPUS 0 +#define GPUIDX -1 #endif bool FileExists(const std::string& filename); @@ -189,11 +189,10 @@ std::unique_ptr CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_c gbm::GBTreeModel CreateTestModel(); -inline GenericParameter CreateEmptyGenericParam(int gpu_id, int n_gpus) { +inline GenericParameter CreateEmptyGenericParam(int gpu_id) { xgboost::GenericParameter tparam; std::vector> args { - {"gpu_id", std::to_string(gpu_id)}, - {"n_gpus", std::to_string(n_gpus)}}; + {"gpu_id", std::to_string(gpu_id)}}; tparam.Init(args); return tparam; } diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index 504728a09..0a9c0178b 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -7,7 +7,7 @@ TEST(Linear, shotgun) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = xgboost::CreateEmptyGenericParam(0, 0); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); { auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("shotgun", &lparam)); @@ -33,7 +33,7 @@ TEST(Linear, shotgun) { TEST(Linear, coordinate) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = xgboost::CreateEmptyGenericParam(0, 0); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("coord_descent", &lparam)); updater->Configure({{"eta", "1."}}); diff --git a/tests/cpp/linear/test_linear.cu b/tests/cpp/linear/test_linear.cu index 9fba4735e..b8f4c2722 100644 --- a/tests/cpp/linear/test_linear.cu +++ b/tests/cpp/linear/test_linear.cu @@ -7,8 +7,7 @@ namespace xgboost { TEST(Linear, GPUCoordinate) { auto mat = xgboost::CreateDMatrix(10, 10, 0); - auto lparam = CreateEmptyGenericParam(0, 1); - lparam.n_gpus = 1; + auto lparam = CreateEmptyGenericParam(GPUIDX); auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("gpu_coord_descent", &lparam)); updater->Configure({{"eta", "1."}}); diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index c38b81a7e..fe411f2c6 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -6,7 +6,7 @@ #include "../helpers.h" TEST(Metric, DeclareUnifiedTest(RMSE)) { - auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmse"); @@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) { } TEST(Metric, DeclareUnifiedTest(RMSLE)) { - auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS); + auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam); 
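// Note on the helpers.h change above: the NGPUS device-count macro becomes a
// GPUIDX device ordinal, expanding to 0 when compiled with CUDA and to -1
// otherwise, so the same test body runs on a single GPU or on the CPU. Every
// CreateEmptyGenericParam(GPUIDX) call in the tests below follows that pattern.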
diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc
index c38b81a7e..fe411f2c6 100644
--- a/tests/cpp/metric/test_elementwise_metric.cc
+++ b/tests/cpp/metric/test_elementwise_metric.cc
@@ -6,7 +6,7 @@
 #include "../helpers.h"
 
 TEST(Metric, DeclareUnifiedTest(RMSE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "rmse");
@@ -20,7 +20,7 @@ TEST(Metric, DeclareUnifiedTest(RMSE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(RMSLE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "rmsle");
@@ -32,7 +32,7 @@ TEST(Metric, DeclareUnifiedTest(RMSLE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MAE)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("mae", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "mae");
@@ -45,7 +45,7 @@ TEST(Metric, DeclareUnifiedTest(MAE)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(LogLoss)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("logloss", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "logloss");
@@ -58,7 +58,7 @@ TEST(Metric, DeclareUnifiedTest(LogLoss)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(Error)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("error", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "error");
@@ -90,7 +90,7 @@ TEST(Metric, DeclareUnifiedTest(Error)) {
 }
 
 TEST(Metric, DeclareUnifiedTest(PoissionNegLogLik)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "poisson-nloglik");
diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc
index f4cf682a3..fdb620928 100644
--- a/tests/cpp/metric/test_metric.cc
+++ b/tests/cpp/metric/test_metric.cc
@@ -4,7 +4,7 @@
 #include "../helpers.h"
 
 TEST(Metric, UnknownMetric) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = nullptr;
   EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &tparam));
   EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &tparam));
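Reviewer note: with n_gpus gone, a metric's device placement is carried entirely by GenericParameter::gpu_id. A sketch of the idea, under the assumption (from the multiclass tests below) that gpu_id may also be assigned after construction and that a GPU build with at least one device is present:

    auto lparam = xgboost::CreateEmptyGenericParam(-1);  // start on the CPU
    xgboost::Metric * metric = xgboost::Metric::Create("rmse", &lparam);
    metric->Configure({});
    lparam.gpu_id = 0;  // steer subsequent evaluations to device 0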
diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc
index f8313082f..95f2c417e 100644
--- a/tests/cpp/metric/test_multiclass_metric.cc
+++ b/tests/cpp/metric/test_multiclass_metric.cc
@@ -4,10 +4,9 @@
 
 #include "../helpers.h"
 
-inline void TestMultiClassError(xgboost::GPUSet const& devices) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
-  lparam.gpu_id = *devices.begin();
-  lparam.n_gpus = devices.Size();
+inline void TestMultiClassError(int device) {
+  auto lparam = xgboost::CreateEmptyGenericParam(device);
+  lparam.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("merror", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "merror");
@@ -23,14 +22,12 @@ inline void TestMultiClassError(xgboost::GPUSet const& devices) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MultiClassError)) {
-  auto devices = xgboost::GPUSet::Range(0, NGPUS);
-  TestMultiClassError(devices);
+  TestMultiClassError(GPUIDX);
 }
 
-inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
-  lparam.gpu_id = *devices.begin();
-  lparam.n_gpus = devices.Size();
+inline void TestMultiClassLogLoss(int device) {
+  auto lparam = xgboost::CreateEmptyGenericParam(device);
+  lparam.gpu_id = device;
   xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &lparam);
   metric->Configure({});
   ASSERT_STREQ(metric->Name(), "mlogloss");
@@ -46,27 +43,31 @@ inline void TestMultiClassLogLoss(xgboost::GPUSet const& devices) {
 }
 
 TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) {
-  auto devices = xgboost::GPUSet::Range(0, NGPUS);
-  TestMultiClassLogLoss(devices);
+  TestMultiClassLogLoss(GPUIDX);
 }
 
 #if defined(XGBOOST_USE_NCCL) && defined(__CUDACC__)
+namespace xgboost {
+namespace common {
 TEST(Metric, MGPU_MultiClassError) {
+  if (AllVisibleGPUs() < 2) {
+    LOG(WARNING) << "Not testing in multi-gpu environment.";
+    return;
+  }
+
   {
-    auto devices = xgboost::GPUSet::All(0, -1);
-    TestMultiClassError(devices);
+    TestMultiClassError(0);
   }
   {
-    auto devices = xgboost::GPUSet::All(1, -1);
-    TestMultiClassError(devices);
+    TestMultiClassError(1);
   }
   {
-    auto devices = xgboost::GPUSet::All(0, -1);
-    TestMultiClassLogLoss(devices);
+    TestMultiClassLogLoss(0);
  }
   {
-    auto devices = xgboost::GPUSet::All(1, -1);
-    TestMultiClassLogLoss(devices);
+    TestMultiClassLogLoss(1);
   }
 }
+}  // namespace common
+}  // namespace xgboost
 #endif  // defined(XGBOOST_USE_NCCL)
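Reviewer note: multi-GPU tests no longer fail on single-GPU machines; they skip themselves. The guard above is the shared idiom, built on the renamed AllVisibleGPUs() from src/common/common.cu:

    if (xgboost::common::AllVisibleGPUs() < 2) {
      LOG(WARNING) << "Not testing in multi-gpu environment.";
      return;  // skip instead of asserting on missing hardware
    }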
diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc
index ef1d1377f..792deb5a6 100644
--- a/tests/cpp/metric/test_rank_metric.cc
+++ b/tests/cpp/metric/test_rank_metric.cc
@@ -4,7 +4,7 @@
 #include "../helpers.h"
 
 TEST(Metric, AMS) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   EXPECT_ANY_THROW(xgboost::Metric::Create("ams", &tparam));
   xgboost::Metric * metric = xgboost::Metric::Create("ams@0.5f", &tparam);
   ASSERT_STREQ(metric->Name(), "ams@0.5");
@@ -23,7 +23,7 @@ TEST(Metric, AMS) {
 }
 
 TEST(Metric, AUC) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("auc", &tparam);
   ASSERT_STREQ(metric->Name(), "auc");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
@@ -38,7 +38,7 @@ TEST(Metric, AUC) {
 }
 
 TEST(Metric, AUCPR) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric *metric = xgboost::Metric::Create("aucpr", &tparam);
   ASSERT_STREQ(metric->Name(), "aucpr");
   EXPECT_NEAR(GetMetricEval(metric, {0, 0, 1, 1}, {0, 0, 1, 1}), 1, 1e-10);
@@ -65,7 +65,7 @@ TEST(Metric, Precision) {
   // When the limit for precision is not given, it takes the limit at
   // std::numeric_limits<unsigned>::max(); hence all values are very small
   // NOTE(AbdealiJK): Maybe this should be fixed to be num_row by default.
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("pre", &tparam);
   ASSERT_STREQ(metric->Name(), "pre");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 0, 1e-7);
@@ -89,7 +89,7 @@ TEST(Metric, Precision) {
 }
 
 TEST(Metric, NDCG) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("ndcg", &tparam);
   ASSERT_STREQ(metric->Name(), "ndcg");
   EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}));
@@ -147,7 +147,7 @@ TEST(Metric, NDCG) {
 }
 
 TEST(Metric, MAP) {
-  auto tparam = xgboost::CreateEmptyGenericParam(0, 0);
+  auto tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::Metric * metric = xgboost::Metric::Create("map", &tparam);
   ASSERT_STREQ(metric->Name(), "map");
   EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}), 1, 1e-10);
diff --git a/tests/cpp/objective/test_hinge.cc b/tests/cpp/objective/test_hinge.cc
index f6485cbe6..fb9b24daa 100644
--- a/tests/cpp/objective/test_hinge.cc
+++ b/tests/cpp/objective/test_hinge.cc
@@ -6,7 +6,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(HingeObj)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("binary:hinge", &tparam);
 
   xgboost::bst_float eps = std::numeric_limits<xgboost::bst_float>::min();
diff --git a/tests/cpp/objective/test_multiclass_obj.cc b/tests/cpp/objective/test_multiclass_obj.cc
index d6157a672..77d38f09a 100644
--- a/tests/cpp/objective/test_multiclass_obj.cc
+++ b/tests/cpp/objective/test_multiclass_obj.cc
@@ -7,7 +7,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args {{"num_class", "3"}};
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("multi:softmax", &lparam);
@@ -25,7 +25,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
-  auto lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  auto lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args{
       std::pair<std::string, std::string>("num_class", "3")};
@@ -47,7 +47,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args {
       std::pair<std::string, std::string>("num_class", "3")};
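Reviewer note: every objective test below reduces to the same two-line setup, with GPUIDX deciding the device at compile time. A sketch, using "reg:squarederror" (which the CPU_vs_CUDA test below also exercises):

    xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
    xgboost::ObjFunction * obj =
        xgboost::ObjFunction::Create("reg:squarederror", &lparam);
    // ... exercise obj ...
    delete obj;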
diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc
index 7a092dff0..bcf2bd497 100644
--- a/tests/cpp/objective/test_regression_obj.cc
+++ b/tests/cpp/objective/test_regression_obj.cc
@@ -7,7 +7,7 @@
 #include "../helpers.h"
 
 TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
 
   xgboost::ObjFunction * obj =
@@ -32,7 +32,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
 
   xgboost::ObjFunction * obj =
@@ -56,7 +56,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
-  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter tparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:logistic", &tparam);
@@ -72,7 +72,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:logistic", &lparam);
@@ -102,7 +102,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("binary:logitraw", &lparam);
@@ -118,7 +118,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("count:poisson", &lparam);
@@ -140,7 +140,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("count:poisson", &lparam);
@@ -168,7 +168,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:gamma", &lparam);
@@ -189,7 +189,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:gamma", &lparam);
@@ -217,7 +217,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
 }
 
 TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@@ -241,7 +241,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
 
 #if defined(__CUDACC__)
 TEST(Objective, CPU_vs_CUDA) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 1);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:squarederror", &lparam);
@@ -267,12 +267,12 @@ TEST(Objective, CPU_vs_CUDA) {
 
   {
     // CPU
-    lparam.n_gpus = 0;
+    lparam.gpu_id = -1;
     obj->GetGradient(preds, info, 0, &cpu_out_preds);
   }
   {
     // CUDA
-    lparam.n_gpus = 1;
+    lparam.gpu_id = 0;
     obj->GetGradient(preds, info, 0, &cuda_out_preds);
   }
@@ -294,7 +294,7 @@
 #endif
 
 TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, NGPUS);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("reg:tweedie", &lparam);
@@ -325,7 +325,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) {
 
 // CoxRegression not implemented in GPU code, no need for testing.
 #if !defined(__CUDACC__)
 TEST(Objective, CoxRegressionGPair) {
-  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(0, 0);
+  xgboost::GenericParameter lparam = xgboost::CreateEmptyGenericParam(GPUIDX);
   std::vector<std::pair<std::string, std::string>> args;
   xgboost::ObjFunction * obj =
       xgboost::ObjFunction::Create("survival:cox", &lparam);
diff --git a/tests/cpp/plugin/test_example_objective.cc b/tests/cpp/plugin/test_example_objective.cc
index 816f861f4..4bcb38979 100644
--- a/tests/cpp/plugin/test_example_objective.cc
+++ b/tests/cpp/plugin/test_example_objective.cc
@@ -6,7 +6,7 @@ namespace xgboost {
 
 TEST(Plugin, ExampleObjective) {
-  xgboost::GenericParameter tparam = CreateEmptyGenericParam(0, 0);
+  xgboost::GenericParameter tparam = CreateEmptyGenericParam(GPUIDX);
   auto * obj = xgboost::ObjFunction::Create("mylogistic", &tparam);
   ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"error"});
   delete obj;
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 43bb1181b..32d323af3 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -6,7 +6,7 @@ namespace xgboost {
 
 TEST(cpu_predictor, Test) {
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   std::unique_ptr<Predictor> cpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
@@ -59,7 +59,7 @@ TEST(cpu_predictor, ExternalMemoryTest) {
   dmlc::TemporaryDirectory tmpdir;
   std::string filename = tmpdir.path + "/big.libsvm";
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   std::unique_ptr<Predictor> cpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
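Reviewer note: the CPU_vs_CUDA hunk above shows the whole migration in miniature: where the old code toggled n_gpus between 0 and 1, the new code toggles gpu_id between -1 and 0 on the same objective. Sketch, assuming obj, preds, info and the two output vectors are set up as in that test:

    lparam.gpu_id = -1;  // gradients computed on the host
    obj->GetGradient(preds, info, 0, &cpu_out_preds);

    lparam.gpu_id = 0;   // identical call, now on CUDA device 0
    obj->GetGradient(preds, info, 0, &cuda_out_preds);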
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 5cf5817de..9a50e76f9 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -33,8 +33,8 @@ namespace xgboost {
 namespace predictor {
 
 TEST(gpu_predictor, Test) {
-  auto cpu_lparam = CreateEmptyGenericParam(0, 0);
-  auto gpu_lparam = CreateEmptyGenericParam(0, 1);
+  auto cpu_lparam = CreateEmptyGenericParam(-1);
+  auto gpu_lparam = CreateEmptyGenericParam(0);
 
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
@@ -69,7 +69,7 @@ TEST(gpu_predictor, Test) {
 }
 
 TEST(gpu_predictor, ExternalMemoryTest) {
-  auto lparam = CreateEmptyGenericParam(0, 1);
+  auto lparam = CreateEmptyGenericParam(0);
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
   gpu_predictor->Configure({}, {});
@@ -83,26 +83,26 @@ TEST(gpu_predictor, ExternalMemoryTest) {
   std::string file1 = tmpdir.path + "/big_1.libsvm";
   std::string file2 = tmpdir.path + "/big_2.libsvm";
   dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
-  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
-  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
+//  dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
+//  dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
 
   for (const auto& dmat: dmats) {
-    // Test predict batch
+    dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
     HostDeviceVector<bst_float> out_predictions;
     gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0);
     EXPECT_EQ(out_predictions.Size(), dmat->Info().num_row_ * n_classes);
     const std::vector<bst_float> &host_vector = out_predictions.ConstHostVector();
     for (int i = 0; i < host_vector.size() / n_classes; i++) {
-      ASSERT_EQ(host_vector[i * n_classes], 1.5);
-      ASSERT_EQ(host_vector[i * n_classes + 1], 0.);
-      ASSERT_EQ(host_vector[i * n_classes + 2], 0.);
+      ASSERT_EQ(host_vector[i * n_classes], 2.0);
+      ASSERT_EQ(host_vector[i * n_classes + 1], 0.5);
+      ASSERT_EQ(host_vector[i * n_classes + 2], 0.5);
     }
   }
 }
 
 // Test whether pickling preserves predictor parameters
 TEST(gpu_predictor, PicklingTest) {
-  int const ngpu = 1;
+  int const gpuid = 0;
 
   dmlc::TemporaryDirectory tempdir;
   const std::string tmp_file = tempdir.path + "/simple.libsvm";
@@ -134,7 +134,7 @@ TEST(gpu_predictor, PicklingTest) {
   ASSERT_EQ(XGBoosterSetParam(
       bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError();
   ASSERT_EQ(XGBoosterSetParam(
-      bst, "n_gpus", std::to_string(ngpu).c_str()), 0) << XGBGetLastError();
+      bst, "gpu_id", std::to_string(gpuid).c_str()), 0) << XGBGetLastError();
   ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0)
       << XGBGetLastError();
 
   // Run boosting iterations
@@ -160,7 +160,7 @@ TEST(gpu_predictor, PicklingTest) {
   {  // Query predictor
     const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
     ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor");
-    ASSERT_EQ(kwargs.at("n_gpus"), std::to_string(ngpu).c_str());
+    ASSERT_EQ(kwargs.at("gpu_id"), std::to_string(gpuid).c_str());
   }
 
   {  // Change predictor and query again
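Reviewer note: the pickling test above doubles as a check that gpu_id survives a C-API round trip. Reduced to its core (bst, bst2 and QueryBoosterConfigurationArguments come from the test fixture):

    ASSERT_EQ(XGBoosterSetParam(bst, "gpu_id", "0"), 0) << XGBGetLastError();
    // ... train, serialize, and deserialize into bst2 ...
    const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
    ASSERT_EQ(kwargs.at("gpu_id"), "0");  // the ordinal round-trips intact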
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 9261bb96f..318d09628 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -168,10 +168,9 @@ TEST(Learner, IO) {
   std::unique_ptr<Learner> learner {Learner::Create(mat)};
   learner->SetParams({Arg{"tree_method", "auto"},
                       Arg{"predictor", "gpu_predictor"},
-                      Arg{"n_gpus", "1"}});
+                      Arg{"gpu_id", "0"}});
   learner->UpdateOneIter(0, p_dmat.get());
   ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-  ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
 
   dmlc::TemporaryDirectory tempdir;
   const std::string fname = tempdir.path + "/model.bst";
@@ -185,7 +184,6 @@ TEST(Learner, IO) {
   std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
   learner->Load(fi.get());
   ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-  ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
 
   delete pp_dmat;
 }
@@ -208,31 +206,27 @@ TEST(Learner, GPUConfiguration) {
                         Arg{"updater", "gpu_coord_descent"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "gpu_hist"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     // with CPU algorithm
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "hist"}});
     learner->UpdateOneIter(0, p_dmat.get());
-    ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 0);
+    ASSERT_EQ(learner->GetGenericParameter().gpu_id, -1);
   }
   {
-    // with CPU algorithm, but `n_gpus` takes priority
+    // with CPU algorithm, but `gpu_id` takes priority
     std::unique_ptr<Learner> learner {Learner::Create(mat)};
     learner->SetParams({Arg{"tree_method", "hist"},
-                        Arg{"n_gpus", "1"}});
+                        Arg{"gpu_id", "0"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
   {
     // With CPU algorithm but GPU Predictor, this is to simulate when
@@ -243,7 +237,6 @@ TEST(Learner, GPUConfiguration) {
                         Arg{"predictor", "gpu_predictor"}});
     learner->UpdateOneIter(0, p_dmat.get());
     ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
-    ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
   }
 
   delete pp_dmat;
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 869859fee..df811c413 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -366,7 +366,7 @@ TEST(GpuHist, EvaluateSplits) {
   ASSERT_NEAR(res[1].fvalue, 0.26, xgboost::kRtEps);
 }
 
-void TestHistogramIndexImpl(int n_gpus) {
+void TestHistogramIndexImpl() {
   // Test if the compressed histogram index matches when using a sparse
   // dmatrix with and without using external memory
 
@@ -384,7 +384,7 @@ void TestHistogramIndexImpl(int n_gpus) {
     {"max_leaves", "0"}
   };
 
-  GenericParameter generic_param(CreateEmptyGenericParam(0, n_gpus));
+  GenericParameter generic_param(CreateEmptyGenericParam(0));
   hist_maker.Configure(training_params, &generic_param);
 
   hist_maker.InitDataOnce(hist_maker_dmat.get());
@@ -412,7 +412,7 @@
 }
 
 TEST(GpuHist, TestHistogramIndex) {
-  TestHistogramIndexImpl(1);
+  TestHistogramIndexImpl();
 }
diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc
index 8ca8af479..ba1d15d29 100644
--- a/tests/cpp/tree/test_prune.cc
+++ b/tests/cpp/tree/test_prune.cc
@@ -29,7 +29,7 @@ TEST(Updater, Prune) {
     {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}
   };
   auto dmat = CreateDMatrix(32, 16, 0.4, 3);
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
 
   // prepare tree
   RegTree tree = RegTree();
diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc
index 0bc1190bf..7d1f0d9e5 100644
--- a/tests/cpp/tree/test_refresh.cc
+++ b/tests/cpp/tree/test_refresh.cc
@@ -25,7 +25,7 @@ TEST(Updater, Refresh) {
     {"reg_lambda", "1"}};
 
   RegTree tree = RegTree();
-  auto lparam = CreateEmptyGenericParam(0, 0);
+  auto lparam = CreateEmptyGenericParam(GPUIDX);
   tree.param.InitAllowUnknown(cfg);
   std::vector<RegTree*> trees {&tree};
   std::unique_ptr<TreeUpdater> refresher(TreeUpdater::Create("refresh", &lparam));
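Reviewer note: taken together, the Learner assertions above pin down the new configuration contract: GPU algorithms default gpu_id to 0 when it is left unspecified, pure CPU configurations leave it at -1, and an explicit gpu_id overrides the CPU default. In sketch form (Arg, mat and p_dmat as in the test):

    std::unique_ptr<Learner> learner {Learner::Create(mat)};
    learner->SetParams({Arg{"tree_method", "hist"},   // CPU algorithm...
                        Arg{"gpu_id", "0"}});         // ...but device 0 requested
    learner->UpdateOneIter(0, p_dmat.get());
    ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);  // the explicit id wins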
diff --git a/tests/distributed/distributed_gpu.py b/tests/distributed/distributed_gpu.py
index 6665cd142..4c47f54a3 100644
--- a/tests/distributed/distributed_gpu.py
+++ b/tests/distributed/distributed_gpu.py
@@ -61,7 +61,6 @@ base_params = {
 
 def params_basic_1x4(rank):
     return dict(base_params, **{
-        'n_gpus': 1,
         'gpu_id': rank,
     }), 20
 
diff --git a/tests/python-gpu/test_gpu_linear.py b/tests/python-gpu/test_gpu_linear.py
index 79e5919b6..a0726f079 100644
--- a/tests/python-gpu/test_gpu_linear.py
+++ b/tests/python-gpu/test_gpu_linear.py
@@ -23,7 +23,7 @@ class TestGPULinear(unittest.TestCase):
     @pytest.mark.skipif(**tm.no_sklearn())
     def test_gpu_coordinate(self):
         parameters = self.common_param.copy()
-        parameters['n_gpus'] = [1]
+        parameters['gpu_id'] = [0]
         for param in test_linear.parameter_combinations(parameters):
             results = test_linear.run_suite(
                 param, 150, self.datasets, scale_features=True)
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index dbc7b1020..fc21293ea 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -21,7 +21,7 @@ datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
 
 class TestGPU(unittest.TestCase):
     def test_gpu_hist(self):
-        test_param = parameter_combinations({'n_gpus': [1], 'max_depth': [2, 8],
+        test_param = parameter_combinations({'gpu_id': [0], 'max_depth': [2, 8],
                                              'max_leaves': [255, 4],
                                              'max_bin': [2, 256],
                                              'grow_policy': ['lossguide']})
@@ -38,8 +38,7 @@ class TestGPU(unittest.TestCase):
 
     @pytest.mark.mgpu
     def test_specified_gpu_id_gpu_update(self):
-        variable_param = {'n_gpus': [1],
-                          'gpu_id': [1],
+        variable_param = {'gpu_id': [1],
                           'max_depth': [8],
                           'max_leaves': [255, 4],
                           'max_bin': [2, 64],
diff --git a/tests/python-gpu/test_large_sizes.py b/tests/python-gpu/test_large_sizes.py
index 421964285..739b21d6a 100644
--- a/tests/python-gpu/test_large_sizes.py
+++ b/tests/python-gpu/test_large_sizes.py
@@ -63,7 +63,7 @@ class TestGPU(unittest.TestCase):
                  'nthread': 0,
                  'eta': 1,
                  'verbosity': 3,
-                 'n_gpus': 1,
+                 'gpu_id': 0,
                  'objective': 'binary:logistic',
                  'max_bin': max_bin,
                  'eval_metric': 'auc'}
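Reviewer note: for Python callers the migration is mechanical: drop 'n_gpus' from the parameter dict and keep (or add) 'gpu_id'. A before/after sketch; anything that previously spanned several GPUs in one process now runs as one such process per device, e.g. via Dask or Spark:

    # before: {'tree_method': 'gpu_hist', 'n_gpus': 1, 'gpu_id': 0, ...}
    params = {
        'tree_method': 'gpu_hist',
        'gpu_id': 0,  # the only device knob left
        'objective': 'binary:logistic',
    }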