Fix specifying gpu_id, add tests. (#3851)
* Rewrite gpu_id related code.
* Remove normalised/unnormalised operations.
* Address difference between `Index' and `Device ID'.
* Modify doc for `gpu_id'.
* Better LOG for GPUSet.
* Check specified n_gpus.
* Remove inappropriate `device_idx' term.
* Clarify GpuIdType and size_t.
@@ -147,61 +147,86 @@ struct AllVisibleImpl {
  */
 class GPUSet {
  public:
+  using GpuIdType = int;
+  static constexpr GpuIdType kAll = -1;
+
   explicit GPUSet(int start = 0, int ndevices = 0)
       : devices_(start, start + ndevices) {}
 
   static GPUSet Empty() { return GPUSet(); }
 
-  static GPUSet Range(int start, int ndevices) {
-    return ndevices <= 0 ? Empty() : GPUSet{start, ndevices};
+  static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
+    return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
   }
-  /*! \brief ndevices and num_rows both are upper bounds. */
-  static GPUSet All(int ndevices, int num_rows = std::numeric_limits<int>::max()) {
-    int n_devices_visible = AllVisible().Size();
-    if (ndevices < 0 || ndevices > n_devices_visible) {
-      ndevices = n_devices_visible;
+  /*! \brief n_gpus and num_rows both are upper bounds. */
+  static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
+                    GpuIdType num_rows = std::numeric_limits<GpuIdType>::max()) {
+    CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
+    CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";
+
+    GpuIdType const n_devices_visible = AllVisible().Size();
+    if (n_devices_visible == 0) { return Empty(); }
+
+    GpuIdType const n_available_devices = n_devices_visible - gpu_id;
+
+    if (n_gpus == kAll) {  // Use all devices starting from `gpu_id'.
+      CHECK(gpu_id < n_devices_visible)
+          << "\ngpu_id should be less than number of visible devices.\ngpu_id: "
+          << gpu_id
+          << ", number of visible devices: "
+          << n_devices_visible;
+      GpuIdType n_devices =
+          n_available_devices < num_rows ? n_available_devices : num_rows;
+      return Range(gpu_id, n_devices);
+    } else {  // Use devices in ( gpu_id, gpu_id + n_gpus ).
+      CHECK_LE(n_gpus, n_available_devices)
+          << "Starting from gpu id: " << gpu_id << ", there are only "
+          << n_available_devices << " available devices, while n_gpus is set to: "
+          << n_gpus;
+      GpuIdType n_devices = n_gpus < num_rows ? n_gpus : num_rows;
+      return Range(gpu_id, n_devices);
     }
-    // fix-up device number to be limited by number of rows
-    ndevices = ndevices > num_rows ? num_rows : ndevices;
-    return Range(0, ndevices);
   }
 
   static GPUSet AllVisible() {
-    int n = AllVisibleImpl::AllVisible();
+    GpuIdType n = AllVisibleImpl::AllVisible();
     return Range(0, n);
   }
-  /*! \brief Ensure gpu_id is correct, so not dependent upon user knowing details */
-  static int GetDeviceIdx(int gpu_id) {
-    auto devices = AllVisible();
-    CHECK(!devices.IsEmpty()) << "Empty device.";
-    return (std::abs(gpu_id) + 0) % devices.Size();
-  }
-  /*! \brief Counting from gpu_id */
-  GPUSet Normalised(int gpu_id) const {
-    return Range(gpu_id, Size());
-  }
-  /*! \brief Counting from 0 */
-  GPUSet Unnormalised() const {
-    return Range(0, Size());
-  }
 
-  int Size() const {
-    int res = *devices_.end() - *devices_.begin();
-    return res < 0 ? 0 : res;
-  }
-  /*! \brief Get normalised device id. */
-  int operator[](int index) const {
-    CHECK(index >= 0 && index < Size());
-    return *devices_.begin() + index;
+  size_t Size() const {
+    GpuIdType size = *devices_.end() - *devices_.begin();
+    GpuIdType res = size < 0 ? 0 : size;
+    return static_cast<size_t>(res);
   }
+
+  /*
+   * By default, we have two configurations of identifying device, one
+   * is the device id obtained from `cudaGetDevice'.  But we sometimes
+   * store objects that allocated one for each device in a list, which
+   * requires a zero-based index.
+   *
+   * Hence, `DeviceId' converts a zero-based index to actual device id,
+   * `Index' converts a device id to a zero-based index.
+   */
+  GpuIdType DeviceId(size_t index) const {
+    GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
+    CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
+                            << "\nIndex: " << index
+                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
+                            << std::endl;
+    return result;
+  }
+  size_t Index(GpuIdType device) const {
+    CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
+                            << "\nGPUSet: (" << *begin() << ", " << *end() << ")"
+                            << std::endl;
+    size_t result = static_cast<size_t>(device - *devices_.begin());
+    return result;
+  }
 
   bool IsEmpty() const { return Size() == 0; }
-  /*! \brief Get un-normalised index. */
-  int Index(int device) const {
-    CHECK(Contains(device));
-    return device - *devices_.begin();
-  }
 
-  bool Contains(int device) const {
+  bool Contains(GpuIdType device) const {
     return *devices_.begin() <= device && device < *devices_.end();
   }
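To make the `Index'/`Device ID' distinction above concrete, a small usage sketch (not part of the diff; it assumes four visible devices and the GPUSet definition from this hunk):

    // Illustrative only: assumes 4 visible devices.
    // All devices from gpu_id = 1 onwards -> the set {1, 2, 3}.
    auto devices = xgboost::GPUSet::All(/*gpu_id=*/1, /*n_gpus=*/xgboost::GPUSet::kAll);

    // DeviceId(): zero-based position in the set -> real device ordinal.
    int first = devices.DeviceId(0);   // == 1, a valid argument for cudaSetDevice()

    // Index(): real device ordinal -> zero-based position, e.g. into a shard vector.
    size_t pos = devices.Index(3);     // == 2, a valid argument for shards_.at()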
@@ -53,6 +53,16 @@ T *Raw(thrust::device_vector<T> &v) {  // NOLINT
   return raw_pointer_cast(v.data());
 }
 
+inline void CudaCheckPointerDevice(void* ptr) {
+  cudaPointerAttributes attr;
+  dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
+  int ptr_device = attr.device;
+  int cur_device = -1;
+  cudaGetDevice(&cur_device);
+  CHECK_EQ(ptr_device, cur_device) << "pointer device: " << ptr_device
+                                   << "current device: " << cur_device;
+}
+
 template <typename T>
 const T *Raw(const thrust::device_vector<T> &v) {  // NOLINT
   return raw_pointer_cast(v.data());
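A hedged usage sketch for the new helper (illustrative only; assumes a CUDA context and the dh helpers from this header):

    // thrust allocates on the currently selected device, so this check passes;
    // it would CHECK-fail if the pointer came from a different device.
    dh::safe_cuda(cudaSetDevice(0));
    thrust::device_vector<float> grad(256, 0.0f);
    CudaCheckPointerDevice(Raw(grad));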
@@ -61,7 +71,7 @@ const T *Raw(const thrust::device_vector<T> &v) {  // NOLINT
 // if n_devices=-1, then use all visible devices
 inline void SynchronizeNDevices(xgboost::GPUSet devices) {
   devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
-  for (auto const d : devices.Unnormalised()) {
+  for (auto const d : devices) {
     safe_cuda(cudaSetDevice(d));
     safe_cuda(cudaDeviceSynchronize());
   }
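With `Unnormalised()' gone, iterating a GPUSet yields real device ordinals directly; a minimal sketch of the post-change behaviour (devices 1 and 2 assumed visible):

    // Range(1, 2) holds devices {1, 2}; no re-basing to zero is needed.
    for (auto const d : xgboost::GPUSet::Range(1, 2)) {
      safe_cuda(cudaSetDevice(d));  // d is already the real ordinal
    }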
@@ -743,7 +753,8 @@ void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
  * @param nVals number of elements in the input array
  */
 template <typename T>
-typename std::iterator_traits<T>::value_type SumReduction(dh::CubMemory &tmp_mem, T in, int nVals) {
+typename std::iterator_traits<T>::value_type SumReduction(
+    dh::CubMemory &tmp_mem, T in, int nVals) {
   using ValueT = typename std::iterator_traits<T>::value_type;
   size_t tmpSize;
   dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
@@ -900,11 +911,10 @@ class AllReducer {
                        double *recvbuff, int count) {
 #ifdef XGBOOST_USE_NCCL
     CHECK(initialised);
-
-    dh::safe_cuda(cudaSetDevice(device_ordinals[communication_group_idx]));
+    dh::safe_cuda(cudaSetDevice(device_ordinals.at(communication_group_idx)));
     dh::safe_nccl(ncclAllReduce(sendbuff, recvbuff, count, ncclDouble, ncclSum,
-                                comms[communication_group_idx],
-                                streams[communication_group_idx]));
+                                comms.at(communication_group_idx),
+                                streams.at(communication_group_idx)));
 #endif
   }
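This hunk, like several below, swaps unchecked `operator[]' for bounds-checked `.at()'. A minimal standalone sketch of what that buys (names hypothetical):

    #include <stdexcept>
    #include <vector>

    std::vector<int> device_ordinals {0, 1};
    // device_ordinals[7] is undefined behaviour and may silently read garbage;
    // .at(7) throws instead, surfacing a bad index before the NCCL call runs.
    try {
      int ordinal = device_ordinals.at(7);
      (void)ordinal;
    } catch (std::out_of_range const&) {
      // the out-of-range group index is reported here
    }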
@@ -352,8 +352,9 @@ struct GPUSketcher {
     dh::ExecuteIndexShards(&shards_, [&](int i, std::unique_ptr<DeviceShard>& shard) {
         size_t start = dist_.ShardStart(info.num_row_, i);
         size_t size = dist_.ShardSize(info.num_row_, i);
-        shard = std::unique_ptr<DeviceShard>
-          (new DeviceShard(dist_.Devices()[i], start, start + size, param_));
+        shard = std::unique_ptr<DeviceShard>(
+            new DeviceShard(dist_.Devices().DeviceId(i),
+                            start, start + size, param_));
       });
 
     // compute sketches for each shard
@@ -379,8 +380,7 @@ struct GPUSketcher {
   }
 
   GPUSketcher(tree::TrainParam param, size_t n_rows) : param_(std::move(param)) {
-    dist_ = GPUDistribution::Block(GPUSet::All(param_.n_gpus, n_rows).
-                                   Normalised(param_.gpu_id));
+    dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus, n_rows));
   }
 
   std::vector<std::unique_ptr<DeviceShard>> shards_;
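This constructor change is the heart of the fix; a hedged before/after sketch (a 4-GPU machine assumed):

    // Before: All() never saw gpu_id; Normalised() shifted the whole range
    // afterwards, so gpu_id = 1 with all 4 visible devices produced the range
    // {1, 2, 3, 4}, running one past the last visible device, unchecked.
    //
    // After: gpu_id takes part in the bounds checks inside All() itself.
    auto devices = GPUSet::All(/*gpu_id=*/1, /*n_gpus=*/GPUSet::kAll, n_rows);
    auto dist = GPUDistribution::Block(devices);  // covers devices {1, 2, 3}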
@@ -46,14 +46,13 @@ template <typename T>
 struct HostDeviceVectorImpl {
   struct DeviceShard {
     DeviceShard()
-      : index_(-1), proper_size_(0), device_(-1), start_(0), perm_d_(false),
+      : proper_size_(0), device_(-1), start_(0), perm_d_(false),
         cached_size_(~0), vec_(nullptr) {}
 
     void Init(HostDeviceVectorImpl<T>* vec, int device) {
       if (vec_ == nullptr) { vec_ = vec; }
       CHECK_EQ(vec, vec_);
       device_ = device;
-      index_ = vec_->distribution_.devices_.Index(device);
       LazyResize(vec_->Size());
       perm_d_ = vec_->perm_h_.Complementary();
     }
@@ -62,7 +61,6 @@ struct HostDeviceVectorImpl {
       if (vec_ == nullptr) { vec_ = vec; }
       CHECK_EQ(vec, vec_);
       device_ = other.device_;
-      index_ = other.index_;
       cached_size_ = other.cached_size_;
       start_ = other.start_;
       proper_size_ = other.proper_size_;
@@ -114,10 +112,11 @@ struct HostDeviceVectorImpl {
       if (new_size == cached_size_) { return; }
       // resize is required
       int ndevices = vec_->distribution_.devices_.Size();
-      start_ = vec_->distribution_.ShardStart(new_size, index_);
-      proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+      int device_index = vec_->distribution_.devices_.Index(device_);
+      start_ = vec_->distribution_.ShardStart(new_size, device_index);
+      proper_size_ = vec_->distribution_.ShardProperSize(new_size, device_index);
       // The size on this device.
-      size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
+      size_t size_d = vec_->distribution_.ShardSize(new_size, device_index);
       SetDevice();
       data_.resize(size_d);
       cached_size_ = new_size;
@@ -154,7 +153,6 @@ struct HostDeviceVectorImpl {
       }
     }
 
-    int index_;
     int device_;
     thrust::device_vector<T> data_;
     // cached vector size
@@ -183,13 +181,13 @@ struct HostDeviceVectorImpl {
       distribution_(other.distribution_), mutex_() {
     shards_.resize(other.shards_.size());
     dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-      shard.Init(this, other.shards_[i]);
+      shard.Init(this, other.shards_.at(i));
     });
   }
 
-  // Init can be std::vector<T> or std::initializer_list<T>
-  template <class Init>
-  HostDeviceVectorImpl(const Init& init, GPUDistribution distribution)
+  // Initializer can be std::vector<T> or std::initializer_list<T>
+  template <class Initializer>
+  HostDeviceVectorImpl(const Initializer& init, GPUDistribution distribution)
     : distribution_(distribution), perm_h_(distribution.IsEmpty()), size_d_(0) {
     if (!distribution_.IsEmpty()) {
       size_d_ = init.size();
@@ -204,7 +202,7 @@ struct HostDeviceVectorImpl {
     int ndevices = distribution_.devices_.Size();
     shards_.resize(ndevices);
     dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-      shard.Init(this, distribution_.devices_[i]);
+      shard.Init(this, distribution_.devices_.DeviceId(i));
     });
   }
@@ -217,20 +215,20 @@ struct HostDeviceVectorImpl {
   T* DevicePointer(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
-    return shards_[distribution_.devices_.Index(device)].data_.data().get();
+    return shards_.at(distribution_.devices_.Index(device)).data_.data().get();
   }
 
   const T* ConstDevicePointer(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_[distribution_.devices_.Index(device)].data_.data().get();
+    return shards_.at(distribution_.devices_.Index(device)).data_.data().get();
   }
 
   common::Span<T> DeviceSpan(int device) {
     GPUSet devices = distribution_.devices_;
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
-    return {shards_[devices.Index(device)].data_.data().get(),
+    return {shards_.at(devices.Index(device)).data_.data().get(),
             static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
   }
@@ -238,20 +236,20 @@ struct HostDeviceVectorImpl {
     GPUSet devices = distribution_.devices_;
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return {shards_[devices.Index(device)].data_.data().get(),
+    return {shards_.at(devices.Index(device)).data_.data().get(),
             static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
   }
 
   size_t DeviceSize(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_[distribution_.devices_.Index(device)].data_.size();
+    return shards_.at(distribution_.devices_.Index(device)).data_.size();
   }
 
   size_t DeviceStart(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_[distribution_.devices_.Index(device)].start_;
+    return shards_.at(distribution_.devices_.Index(device)).start_;
   }
 
   thrust::device_ptr<T> tbegin(int device) {  // NOLINT
@@ -316,7 +314,7 @@ struct HostDeviceVectorImpl {
       size_d_ = other->size_d_;
     }
     dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-      shard.Copy(&other->shards_[i]);
+      shard.Copy(&other->shards_.at(i));
     });
   }
@@ -405,7 +403,7 @@ struct HostDeviceVectorImpl {
   void LazySyncDevice(int device, GPUAccess access) {
     GPUSet devices = distribution_.Devices();
     CHECK(devices.Contains(device));
-    shards_[devices.Index(device)].LazySyncDevice(access);
+    shards_.at(devices.Index(device)).LazySyncDevice(access);
   }
 
   bool HostCanAccess(GPUAccess access) { return perm_h_.CanAccess(access); }
@@ -413,7 +411,7 @@ struct HostDeviceVectorImpl {
   bool DeviceCanAccess(int device, GPUAccess access) {
     GPUSet devices = distribution_.Devices();
     if (!devices.Contains(device)) { return false; }
-    return shards_[devices.Index(device)].perm_d_.CanAccess(access);
+    return shards_.at(devices.Index(device)).perm_d_.CanAccess(access);
   }
 
   std::vector<T> data_h_;
@@ -78,10 +78,11 @@ void SetCudaSetDeviceHandler(void (*handler)(int));
 
 template <typename T> struct HostDeviceVectorImpl;
 
-// Distribution for the HostDeviceVector; it specifies such aspects as the devices it is
-// distributed on, whether there are copies of elements from other GPUs as well as the granularity
-// of splitting. It may also specify explicit boundaries for devices, in which case the size of the
-// array cannot be changed.
+// Distribution for the HostDeviceVector; it specifies such aspects as the
+// devices it is distributed on, whether there are copies of elements from
+// other GPUs as well as the granularity of splitting. It may also specify
+// explicit boundaries for devices, in which case the size of the array cannot
+// be changed.
 class GPUDistribution {
   template<typename T> friend struct HostDeviceVectorImpl;
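A hedged sketch of the distribution in use (assuming the public HostDeviceVector constructor forwards to the HostDeviceVectorImpl one shown earlier):

    // Block-distribute 100 floats over devices {0, 1}; per-device access is
    // by real device id, translated to a shard index via Index() internally.
    GPUSet devices = GPUSet::Range(0, 2);
    HostDeviceVector<float> vec(std::vector<float>(100, 0.0f),
                                GPUDistribution::Block(devices));
    auto span = vec.DeviceSpan(devices.DeviceId(0));  // shard on device 0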
@@ -86,11 +86,13 @@ class Transform {
   // CUDA UnpackHDV
   template <typename T>
   Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
-    return _vec->DeviceSpan(_device);
+    auto span = _vec->DeviceSpan(_device);
+    return span;
   }
   template <typename T>
   Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
-    return _vec->ConstDeviceSpan(_device);
+    auto span = _vec->ConstDeviceSpan(_device);
+    return span;
   }
   // CPU UnpackHDV
   template <typename T>
@@ -125,19 +127,23 @@ class Transform {
 
     GPUSet devices = distribution_.Devices();
     size_t range_size = *range_.end() - *range_.begin();
+
+    // Extract index to deal with possible old OpenMP.
+    size_t device_beg = *(devices.begin());
+    size_t device_end = *(devices.end());
 #pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-    for (omp_ulong i = 0; i < devices.Size(); ++i) {
-      int d = devices.Index(i);
+    for (omp_ulong device = device_beg; device < device_end; ++device) {  // NOLINT
       // Ignore other attributes of GPUDistribution for spliting index.
-      size_t shard_size =
-          GPUDistribution::Block(devices).ShardSize(range_size, d);
+      // This deals with situation like multi-class setting where
+      // granularity is used in data vector.
+      size_t shard_size = GPUDistribution::Block(devices).ShardSize(
+          range_size, devices.Index(device));
       Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
-      dh::safe_cuda(cudaSetDevice(d));
+      dh::safe_cuda(cudaSetDevice(device));
       const int GRID_SIZE =
          static_cast<int>(dh::DivRoundUp(*(range_.end()), kBlockThreads));
 
       detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
-          _func, shard_range, UnpackHDV(_vectors, d)...);
+          _func, shard_range, UnpackHDV(_vectors, device)...);
       dh::safe_cuda(cudaGetLastError());
       dh::safe_cuda(cudaDeviceSynchronize());
     }