More explicit sharding methods for device memory (#4396)

* Rename the Reshard method to Shard

* Add a new Reshard method for sharding a vector that's already sharded
This commit is contained in:
Rong Ou
2019-04-30 16:47:23 -07:00
committed by Rory Mitchell
parent 797ba8e72d
commit eaab364a63
12 changed files with 154 additions and 77 deletions

View File

@@ -154,10 +154,13 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const { }
void HostDeviceVector<T>::Shard(const GPUDistribution& distribution) const { }
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet devices) const { }
void HostDeviceVector<T>::Shard(GPUSet devices) const { }
template <typename T>
void Reshard(const GPUDistribution &distribution) { }
// explicit instantiations are required, as HostDeviceVector isn't header-only
template class HostDeviceVector<bst_float>;

View File

@@ -318,7 +318,7 @@ struct HostDeviceVectorImpl {
// Data is on device;
if (distribution_ != other->distribution_) {
distribution_ = GPUDistribution();
Reshard(other->Distribution());
Shard(other->Distribution());
size_d_ = other->size_d_;
}
dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
@@ -358,19 +358,24 @@ struct HostDeviceVectorImpl {
return data_h_;
}
void Reshard(const GPUDistribution& distribution) {
void Shard(const GPUDistribution& distribution) {
if (distribution_ == distribution) { return; }
CHECK(distribution_.IsEmpty() || distribution.IsEmpty());
if (distribution.IsEmpty()) {
LazySyncHost(GPUAccess::kWrite);
}
CHECK(distribution_.IsEmpty());
distribution_ = distribution;
InitShards();
}
void Reshard(GPUSet new_devices) {
void Shard(GPUSet new_devices) {
if (distribution_.Devices() == new_devices) { return; }
Reshard(GPUDistribution::Block(new_devices));
Shard(GPUDistribution::Block(new_devices));
}
void Reshard(const GPUDistribution &distribution) {
if (distribution_ == distribution) { return; }
LazySyncHost(GPUAccess::kWrite);
distribution_ = distribution;
shards_.clear();
InitShards();
}
void Resize(size_t new_size, T v) {
@@ -586,12 +591,17 @@ bool HostDeviceVector<T>::DeviceCanAccess(int device, GPUAccess access) const {
}
template <typename T>
void HostDeviceVector<T>::Reshard(GPUSet new_devices) const {
impl_->Reshard(new_devices);
void HostDeviceVector<T>::Shard(GPUSet new_devices) const {
impl_->Shard(new_devices);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution& distribution) const {
void HostDeviceVector<T>::Shard(const GPUDistribution &distribution) const {
impl_->Shard(distribution);
}
template <typename T>
void HostDeviceVector<T>::Reshard(const GPUDistribution &distribution) {
impl_->Reshard(distribution);
}

View File

@@ -14,7 +14,7 @@
* Initialization/Allocation:<br/>
* One can choose to initialize the vector on CPU or GPU during constructor.
* (use the 'devices' argument) Or, can choose to use the 'Resize' method to
* allocate/resize memory explicitly, and use the 'Reshard' method
* allocate/resize memory explicitly, and use the 'Shard' method
* to specify the devices.
*
* Accessing underlying data:<br/>
@@ -98,6 +98,8 @@ class GPUDistribution {
offsets_(std::move(offsets)) {}
public:
static GPUDistribution Empty() { return GPUDistribution(); }
static GPUDistribution Block(GPUSet devices) { return GPUDistribution(devices); }
static GPUDistribution Overlap(GPUSet devices, int overlap) {
@@ -250,11 +252,15 @@ class HostDeviceVector {
/*!
* \brief Specify memory distribution.
*
* If GPUSet::Empty() is used, all data will be drawn back to CPU.
*/
void Reshard(const GPUDistribution& distribution) const;
void Reshard(GPUSet devices) const;
void Shard(const GPUDistribution &distribution) const;
void Shard(GPUSet devices) const;
/*!
* \brief Change memory distribution.
*/
void Reshard(const GPUDistribution &distribution);
void Resize(size_t new_size, T v = T());
private:

View File

@@ -57,13 +57,13 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool reshard) :
Evaluator(Functor func, Range range, GPUSet devices, bool shard) :
func_(func), range_{std::move(range)},
reshard_{reshard},
shard_{shard},
distribution_{std::move(GPUDistribution::Block(devices))} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool reshard) :
func_(func), range_{std::move(range)}, reshard_{reshard},
bool shard) :
func_(func), range_{std::move(range)}, shard_{shard},
distribution_{std::move(dist)} {}
/*!
@@ -106,25 +106,25 @@ class Transform {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive unpack for Reshard.
// Recursive unpack for Shard.
template <typename T>
void UnpackReshard(GPUDistribution dist, const HostDeviceVector<T>* vector) const {
vector->Reshard(dist);
void UnpackShard(GPUDistribution dist, const HostDeviceVector<T> *vector) const {
vector->Shard(dist);
}
template <typename Head, typename... Rest>
void UnpackReshard(GPUDistribution dist,
const HostDeviceVector<Head>* _vector,
const HostDeviceVector<Rest>*... _vectors) const {
_vector->Reshard(dist);
UnpackReshard(dist, _vectors...);
void UnpackShard(GPUDistribution dist,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->Shard(dist);
UnpackShard(dist, _vectors...);
}
#if defined(__CUDACC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (reshard_)
UnpackReshard(distribution_, _vectors...);
if (shard_)
UnpackShard(distribution_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
@@ -170,8 +170,8 @@ class Transform {
Functor func_;
/*! \brief Range object specifying parallel threads index range. */
Range range_;
/*! \brief Whether resharding for vectors is required. */
bool reshard_;
/*! \brief Whether sharding for vectors is required. */
bool shard_;
GPUDistribution distribution_;
};
@@ -187,19 +187,19 @@ class Transform {
* \param range Range object specifying parallel threads index range.
* \param devices GPUSet specifying GPUs to use, when compiling for CPU,
* this should be GPUSet::Empty().
* \param reshard Whether Reshard for HostDeviceVector is needed.
* \param shard Whether Shard for HostDeviceVector is needed.
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), shard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), reshard};
bool const shard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), shard};
}
};