Multi-GPU HostDeviceVector. (#3287)

* Multi-GPU HostDeviceVector.

- HostDeviceVector instances can now span multiple devices, defined by the GPUSet struct
- the interface of HostDeviceVector has been modified accordingly
- GPU objective functions are now multi-GPU
- GPU predicting from cache is now multi-GPU
- omp_set_num_threads() calls are now avoided
- other minor changes
This commit is contained in: (branch list not captured in this extract)
Andrew V. Adinetz
2018-05-04 22:00:05 +02:00
committed by Rory Mitchell
parent 90a5c4db9d
commit b8a0d66fe6
9 changed files with 569 additions and 250 deletions

View File

@@ -74,46 +74,35 @@ __global__ void pred_transform_k(float * __restrict__ preds, int n) {
template<typename Loss>
class GPURegLossObj : public ObjFunction {
protected:
// manages device data
struct DeviceData {
DVec<float> labels, weights;
DVec<unsigned int> label_correct;
// allocate everything on device
DeviceData(dh::BulkAllocator<dh::MemoryType::kDevice>* ba, int device_idx, size_t n) {
ba->Allocate(device_idx, false,
&labels, n,
&weights, n,
&label_correct, 1);
}
size_t Size() const { return labels.Size(); }
};
bool copied_;
std::unique_ptr<dh::BulkAllocator<dh::MemoryType::kDevice>> ba_;
std::unique_ptr<DeviceData> data_;
HostDeviceVector<bst_float> preds_d_;
HostDeviceVector<GradientPair> out_gpair_d_;
HostDeviceVector<bst_float> labels_, weights_;
HostDeviceVector<unsigned int> label_correct_;
// allocate device data for n elements, do nothing if enough memory is allocated already
void LazyResize(int n) {
if (data_.get() != nullptr && data_->Size() >= n)
// allocate device data for n elements, do nothing if memory is allocated already
void LazyResize(size_t n, size_t n_weights) {
if (labels_.Size() == n && weights_.Size() == n_weights)
return;
copied_ = false;
// free the old data and allocate the new data
ba_.reset(new dh::BulkAllocator<dh::MemoryType::kDevice>());
data_.reset(new DeviceData(ba_.get(), 0, n));
preds_d_.Resize(n, 0.0f, param_.gpu_id);
out_gpair_d_.Resize(n, GradientPair(), param_.gpu_id);
labels_.Reshard(devices_);
weights_.Reshard(devices_);
label_correct_.Reshard(devices_);
if (labels_.Size() != n) {
labels_.Resize(n);
label_correct_.Resize(devices_.Size());
}
if (weights_.Size() != n_weights)
weights_.Resize(n_weights);
}
public:
GPURegLossObj() : copied_(false), preds_d_(0, -1), out_gpair_d_({}, -1) {}
GPURegLossObj() : copied_(false) {}
void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
CHECK(param_.n_gpus != 0) << "Must have at least one device";
devices_ = GPUSet::Range(param_.gpu_id, dh::NDevicesAll(param_.n_gpus));
}
void GetGradient(HostDeviceVector<float>* preds,
@@ -125,46 +114,50 @@ class GPURegLossObj : public ObjFunction {
<< "labels are not correctly provided"
<< "preds.size=" << preds->Size() << ", label.size=" << info.labels_.size();
size_t ndata = preds->Size();
out_gpair->Resize(ndata, GradientPair(), param_.gpu_id);
LazyResize(ndata);
GetGradientDevice(preds->DevicePointer(param_.gpu_id), info, iter,
out_gpair->DevicePointer(param_.gpu_id), ndata);
preds->Reshard(devices_);
out_gpair->Reshard(devices_);
out_gpair->Resize(ndata);
LazyResize(ndata, info.weights_.size());
GetGradientDevice(preds, info, iter, out_gpair);
}
private:
void GetGradientDevice(float* preds,
void GetGradientDevice(HostDeviceVector<float>* preds,
const MetaInfo &info,
int iter,
GradientPair* out_gpair, size_t n) {
dh::safe_cuda(cudaSetDevice(param_.gpu_id));
DeviceData& d = *data_;
d.label_correct.Fill(1);
HostDeviceVector<GradientPair>* out_gpair) {
label_correct_.Fill(1);
// only copy the labels and weights once, similar to how the data is copied
if (!copied_) {
thrust::copy(info.labels_.begin(), info.labels_.begin() + n,
d.labels.tbegin());
if (info.weights_.size() > 0) {
thrust::copy(info.weights_.begin(), info.weights_.begin() + n,
d.weights.tbegin());
}
labels_.Copy(info.labels_);
if (info.weights_.size() > 0)
weights_.Copy(info.weights_);
copied_ = true;
}
// run the kernel
const int block = 256;
get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>
(out_gpair, d.label_correct.Data(), preds,
d.labels.Data(), info.weights_.size() > 0 ? d.weights.Data() : nullptr,
n, param_.scale_pos_weight);
dh::safe_cuda(cudaGetLastError());
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds->DeviceSize(d);
if (n > 0) {
get_gradient_k<Loss><<<dh::DivRoundUp(n, block), block>>>
(out_gpair->DevicePointer(d), label_correct_.DevicePointer(d),
preds->DevicePointer(d), labels_.DevicePointer(d),
info.weights_.size() > 0 ? weights_.DevicePointer(d) : nullptr,
n, param_.scale_pos_weight);
dh::safe_cuda(cudaGetLastError());
}
dh::safe_cuda(cudaDeviceSynchronize());
}
// copy output data from the GPU
unsigned int label_correct_h;
thrust::copy_n(d.label_correct.tbegin(), 1, &label_correct_h);
bool label_correct = label_correct_h != 0;
if (!label_correct) {
LOG(FATAL) << Loss::LabelErrorMsg();
// copy "label correct" flags back to host
std::vector<unsigned int>& label_correct_h = label_correct_.HostVector();
for (int i = 0; i < devices_.Size(); ++i) {
if (label_correct_h[i] == 0)
LOG(FATAL) << Loss::LabelErrorMsg();
}
}
@@ -174,24 +167,33 @@ class GPURegLossObj : public ObjFunction {
}
void PredTransform(HostDeviceVector<float> *io_preds) override {
PredTransformDevice(io_preds->DevicePointer(param_.gpu_id), io_preds->Size());
io_preds->Reshard(devices_);
size_t ndata = io_preds->Size();
PredTransformDevice(io_preds);
}
void PredTransformDevice(float* preds, size_t n) {
dh::safe_cuda(cudaSetDevice(param_.gpu_id));
const int block = 256;
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds, n);
dh::safe_cuda(cudaGetLastError());
dh::safe_cuda(cudaDeviceSynchronize());
void PredTransformDevice(HostDeviceVector<float>* preds) {
#pragma omp parallel for schedule(static, 1)
for (int i = 0; i < devices_.Size(); ++i) {
int d = devices_[i];
dh::safe_cuda(cudaSetDevice(d));
const int block = 256;
size_t n = preds->DeviceSize(d);
if (n > 0) {
pred_transform_k<Loss><<<dh::DivRoundUp(n, block), block>>>(preds->DevicePointer(d), n);
dh::safe_cuda(cudaGetLastError());
}
dh::safe_cuda(cudaDeviceSynchronize());
}
}
float ProbToMargin(float base_score) const override {
return Loss::ProbToMargin(base_score);
}
protected:
GPURegLossParam param_;
GPUSet devices_;
};
// register the objective functions