Lazy initialization of device vector. (#5173)
* Lazy initialization of device vector. * Fix #5162. * Disable copy constructor of HostDeviceVector. Prevents implicit copying. * Fix CPU build. * Bring back move assignment operator.
This commit is contained in:
parent
77cfbff5a7
commit
ee287808fb
@ -59,8 +59,25 @@ class MetaInfo {
|
||||
* can be used to specify initial prediction to boost from.
|
||||
*/
|
||||
HostDeviceVector<bst_float> base_margin_;
|
||||
|
||||
/*! \brief default constructor */
|
||||
MetaInfo() = default;
|
||||
MetaInfo& operator=(MetaInfo const& that) {
|
||||
this->num_row_ = that.num_row_;
|
||||
this->num_col_ = that.num_col_;
|
||||
this->num_nonzero_ = that.num_nonzero_;
|
||||
|
||||
this->labels_.Resize(that.labels_.Size());
|
||||
this->labels_.Copy(that.labels_);
|
||||
|
||||
this->group_ptr_ = that.group_ptr_;
|
||||
|
||||
this->weights_.Resize(that.weights_.Size());
|
||||
this->weights_.Copy(that.weights_);
|
||||
this->base_margin_.Resize(that.base_margin_.Size());
|
||||
this->base_margin_.Copy(that.base_margin_);
|
||||
return *this;
|
||||
}
|
||||
/*!
|
||||
* \brief Get weight of each instances.
|
||||
* \param i Instance index.
|
||||
@ -246,10 +263,10 @@ class SparsePage {
|
||||
/**
|
||||
* \brief Pushes external data batch onto this page
|
||||
*
|
||||
* \tparam AdapterBatchT
|
||||
* \param batch
|
||||
* \param missing
|
||||
* \param nthread
|
||||
* \tparam AdapterBatchT
|
||||
* \param batch
|
||||
* \param missing
|
||||
* \param nthread
|
||||
*
|
||||
* \return The maximum number of columns encountered in this input batch. Useful when pushing many adapter batches to work out the total number of columns.
|
||||
*/
|
||||
|
||||
@ -88,8 +88,13 @@ class HostDeviceVector {
|
||||
HostDeviceVector(std::initializer_list<T> init, int device = -1);
|
||||
explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
|
||||
~HostDeviceVector();
|
||||
HostDeviceVector(const HostDeviceVector<T>&);
|
||||
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
|
||||
|
||||
HostDeviceVector(const HostDeviceVector<T>&) = delete;
|
||||
HostDeviceVector(HostDeviceVector<T>&&);
|
||||
|
||||
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
|
||||
HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
|
||||
|
||||
size_t Size() const;
|
||||
int DeviceIdx() const;
|
||||
common::Span<T> DeviceSpan();
|
||||
@ -116,6 +121,7 @@ class HostDeviceVector {
|
||||
bool HostCanWrite() const;
|
||||
bool DeviceCanRead() const;
|
||||
bool DeviceCanWrite() const;
|
||||
GPUAccess DeviceAccess() const;
|
||||
|
||||
void SetDevice(int device) const;
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include <xgboost/base.h>
|
||||
#include <xgboost/data.h>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include "xgboost/host_device_vector.h"
|
||||
|
||||
@ -18,6 +19,7 @@ struct HostDeviceVectorImpl {
|
||||
explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v) {}
|
||||
HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init) {}
|
||||
explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)) {}
|
||||
HostDeviceVectorImpl(HostDeviceVectorImpl&& that) : data_h_(std::move(that.data_h_)) {}
|
||||
|
||||
void Swap(HostDeviceVectorImpl &other) {
|
||||
data_h_.swap(other.data_h_);
|
||||
@ -47,6 +49,22 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& that) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(std::move(*that.impl_));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& that) {
|
||||
if (this == &that) { return *this; }
|
||||
|
||||
std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
|
||||
new HostDeviceVectorImpl<T>(std::move(*that.impl_)));
|
||||
delete impl_;
|
||||
impl_ = new_impl.release();
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::~HostDeviceVector() {
|
||||
delete impl_;
|
||||
@ -54,21 +72,8 @@ HostDeviceVector<T>::~HostDeviceVector() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(*other.impl_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
|
||||
if (this == &other) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
HostDeviceVectorImpl<T> newInstance(*other.impl_);
|
||||
newInstance.Swap(*impl_);
|
||||
|
||||
return *this;
|
||||
GPUAccess HostDeviceVector<T>::DeviceAccess() const {
|
||||
return kNone;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -29,7 +29,7 @@ class HostDeviceVectorImpl {
|
||||
if (device >= 0) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_.resize(size, v);
|
||||
data_d_->resize(size, v);
|
||||
} else {
|
||||
data_h_.resize(size, v);
|
||||
}
|
||||
@ -47,34 +47,40 @@ class HostDeviceVectorImpl {
|
||||
}
|
||||
}
|
||||
|
||||
HostDeviceVectorImpl(HostDeviceVectorImpl<T>&& that) :
|
||||
device_{that.device_},
|
||||
data_h_{std::move(that.data_h_)},
|
||||
data_d_{std::move(that.data_d_)},
|
||||
gpu_access_{that.gpu_access_} {}
|
||||
|
||||
~HostDeviceVectorImpl() {
|
||||
if (device_ >= 0) {
|
||||
SetDevice();
|
||||
}
|
||||
}
|
||||
|
||||
size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_.size(); }
|
||||
size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_->size(); }
|
||||
|
||||
int DeviceIdx() const { return device_; }
|
||||
|
||||
T* DevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
return data_d_.data().get();
|
||||
return data_d_->data().get();
|
||||
}
|
||||
|
||||
const T* ConstDevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
return data_d_.data().get();
|
||||
return data_d_->data().get();
|
||||
}
|
||||
|
||||
common::Span<T> DeviceSpan() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
return {data_d_.data().get(), Size()};
|
||||
return {data_d_->data().get(), Size()};
|
||||
}
|
||||
|
||||
common::Span<const T> ConstDeviceSpan() {
|
||||
LazySyncDevice(GPUAccess::kRead);
|
||||
return {data_d_.data().get(), Size()};
|
||||
return {data_d_->data().get(), Size()};
|
||||
}
|
||||
|
||||
void Fill(T v) { // NOLINT
|
||||
@ -83,17 +89,19 @@ class HostDeviceVectorImpl {
|
||||
} else {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
thrust::fill(data_d_.begin(), data_d_.end(), v);
|
||||
thrust::fill(data_d_->begin(), data_d_->end(), v);
|
||||
}
|
||||
}
|
||||
|
||||
void Copy(HostDeviceVectorImpl<T>* other) {
|
||||
CHECK_EQ(Size(), other->Size());
|
||||
SetDevice(other->device_);
|
||||
// Data is on host.
|
||||
if (HostCanWrite() && other->HostCanWrite()) {
|
||||
std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
|
||||
return;
|
||||
}
|
||||
SetDevice();
|
||||
CopyToDevice(other);
|
||||
}
|
||||
|
||||
@ -138,11 +146,11 @@ class HostDeviceVectorImpl {
|
||||
|
||||
void Resize(size_t new_size, T v) {
|
||||
if (new_size == Size()) { return; }
|
||||
if (Size() == 0 && device_ >= 0) {
|
||||
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
|
||||
// fast on-device resize
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_.resize(new_size, v);
|
||||
data_d_->resize(new_size, v);
|
||||
} else {
|
||||
// resize on host
|
||||
LazySyncHost(GPUAccess::kNone);
|
||||
@ -158,11 +166,11 @@ class HostDeviceVectorImpl {
|
||||
return;
|
||||
}
|
||||
gpu_access_ = access;
|
||||
if (data_h_.size() != data_d_.size()) { data_h_.resize(data_d_.size()); }
|
||||
if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(),
|
||||
data_d_.data().get(),
|
||||
data_d_.size() * sizeof(T),
|
||||
data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
@ -176,9 +184,9 @@ class HostDeviceVectorImpl {
|
||||
// data is on the host
|
||||
LazyResizeDevice(data_h_.size());
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpy(data_d_.data().get(),
|
||||
dh::safe_cuda(cudaMemcpy(data_d_->data().get(),
|
||||
data_h_.data(),
|
||||
data_d_.size() * sizeof(T),
|
||||
data_d_->size() * sizeof(T),
|
||||
cudaMemcpyHostToDevice));
|
||||
gpu_access_ = access;
|
||||
}
|
||||
@ -189,11 +197,12 @@ class HostDeviceVectorImpl {
|
||||
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
|
||||
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
|
||||
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
|
||||
GPUAccess Access() const { return gpu_access_; }
|
||||
|
||||
private:
|
||||
int device_{-1};
|
||||
std::vector<T> data_h_{};
|
||||
dh::device_vector<T> data_d_{};
|
||||
std::unique_ptr<dh::device_vector<T>> data_d_{};
|
||||
GPUAccess gpu_access_{GPUAccess::kNone};
|
||||
|
||||
void CopyToDevice(HostDeviceVectorImpl* other) {
|
||||
@ -203,8 +212,8 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), other->data_d_.data().get(),
|
||||
data_d_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
}
|
||||
|
||||
@ -212,14 +221,14 @@ class HostDeviceVectorImpl {
|
||||
LazyResizeDevice(Size());
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_.data().get(), begin,
|
||||
data_d_.size() * sizeof(T), cudaMemcpyDefault));
|
||||
dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
|
||||
data_d_->size() * sizeof(T), cudaMemcpyDefault));
|
||||
}
|
||||
|
||||
void LazyResizeDevice(size_t new_size) {
|
||||
if (new_size == data_d_.size()) { return; }
|
||||
if (data_d_ && new_size == data_d_->size()) { return; }
|
||||
SetDevice();
|
||||
data_d_.resize(new_size);
|
||||
data_d_->resize(new_size);
|
||||
}
|
||||
|
||||
void SetDevice() {
|
||||
@ -229,6 +238,10 @@ class HostDeviceVectorImpl {
|
||||
} else {
|
||||
(*cudaSetDeviceHandler)(device_);
|
||||
}
|
||||
|
||||
if (!data_d_) {
|
||||
data_d_.reset(new dh::device_vector<T>);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@ -245,16 +258,17 @@ HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const HostDeviceVector<T>& other)
|
||||
: impl_(new HostDeviceVectorImpl<T>(*other.impl_)) {}
|
||||
HostDeviceVector<T>::HostDeviceVector(HostDeviceVector<T>&& other)
|
||||
: impl_(new HostDeviceVectorImpl<T>(std::move(*other.impl_))) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& other) {
|
||||
HostDeviceVector<T>& HostDeviceVector<T>::operator=(HostDeviceVector<T>&& other) {
|
||||
if (this == &other) { return *this; }
|
||||
|
||||
std::unique_ptr<HostDeviceVectorImpl<T>> newImpl(new HostDeviceVectorImpl<T>(*other.impl_));
|
||||
std::unique_ptr<HostDeviceVectorImpl<T>> new_impl(
|
||||
new HostDeviceVectorImpl<T>(std::move(*other.impl_)));
|
||||
delete impl_;
|
||||
impl_ = newImpl.release();
|
||||
impl_ = new_impl.release();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -338,6 +352,11 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
|
||||
return impl_->DeviceCanWrite();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
GPUAccess HostDeviceVector<T>::DeviceAccess() const {
|
||||
return impl_->Access();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(int device) const {
|
||||
impl_->SetDevice(device);
|
||||
|
||||
@ -339,6 +339,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
// the first step only modifies prediction store in learner without following code.
|
||||
InitOutPredictions(cache_emtry->second.data->Info(),
|
||||
&(cache_emtry->second.predictions), model);
|
||||
CHECK_EQ(cache_emtry->second.predictions.Size(), out_preds->Size());
|
||||
cache_emtry->second.predictions.Copy(*out_preds);
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,6 +60,7 @@ void PlusOne(HostDeviceVector<int> *v) {
|
||||
SetDevice(device);
|
||||
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
|
||||
[=]__device__(unsigned int a){ return a + 1; });
|
||||
ASSERT_TRUE(v->DeviceCanWrite());
|
||||
}
|
||||
|
||||
void CheckDevice(HostDeviceVector<int>* v,
|
||||
@ -125,7 +126,8 @@ TEST(HostDeviceVector, Copy) {
|
||||
// a separate scope to ensure that v1 is gone before further checks
|
||||
HostDeviceVector<int> v1;
|
||||
InitHostDeviceVector(n, device, &v1);
|
||||
v = v1;
|
||||
v.Resize(v1.Size());
|
||||
v.Copy(v1);
|
||||
}
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
PlusOne(&v);
|
||||
|
||||
@ -11,26 +11,26 @@ TEST(ColumnSampler, Test) {
|
||||
|
||||
// No node sampling
|
||||
cs.Init(n, 1.0f, 0.5f, 0.5f);
|
||||
auto set0 = *cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set0.Size(), 32);
|
||||
auto set0 = cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set0->Size(), 32);
|
||||
|
||||
auto set1 = *cs.GetFeatureSet(0);
|
||||
auto set1 = cs.GetFeatureSet(0);
|
||||
|
||||
ASSERT_EQ(set0.HostVector(), set1.HostVector());
|
||||
ASSERT_EQ(set0->HostVector(), set1->HostVector());
|
||||
|
||||
auto set2 = *cs.GetFeatureSet(1);
|
||||
ASSERT_NE(set1.HostVector(), set2.HostVector());
|
||||
ASSERT_EQ(set2.Size(), 32);
|
||||
auto set2 = cs.GetFeatureSet(1);
|
||||
ASSERT_NE(set1->HostVector(), set2->HostVector());
|
||||
ASSERT_EQ(set2->Size(), 32);
|
||||
|
||||
// Node sampling
|
||||
cs.Init(n, 0.5f, 1.0f, 0.5f);
|
||||
auto set3 = *cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set3.Size(), 32);
|
||||
auto set3 = cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set3->Size(), 32);
|
||||
|
||||
auto set4 = *cs.GetFeatureSet(0);
|
||||
auto set4 = cs.GetFeatureSet(0);
|
||||
|
||||
ASSERT_NE(set3.HostVector(), set4.HostVector());
|
||||
ASSERT_EQ(set4.Size(), 32);
|
||||
ASSERT_NE(set3->HostVector(), set4->HostVector());
|
||||
ASSERT_EQ(set4->Size(), 32);
|
||||
|
||||
// No level or node sampling, should be the same at different depth
|
||||
cs.Init(n, 1.0f, 1.0f, 0.5f);
|
||||
@ -38,11 +38,11 @@ TEST(ColumnSampler, Test) {
|
||||
cs.GetFeatureSet(1)->HostVector());
|
||||
|
||||
cs.Init(n, 1.0f, 1.0f, 1.0f);
|
||||
auto set5 = *cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set5.Size(), n);
|
||||
auto set5 = cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set5->Size(), n);
|
||||
cs.Init(n, 1.0f, 1.0f, 1.0f);
|
||||
auto set6 = *cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set5.HostVector(), set6.HostVector());
|
||||
auto set6 = cs.GetFeatureSet(0);
|
||||
ASSERT_EQ(set5->HostVector(), set6->HostVector());
|
||||
|
||||
// Should always be a minimum of one feature
|
||||
cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user