Make HostDeviceVector single gpu only (#4773)

* Make HostDeviceVector single gpu only
Rong Ou, 2019-08-25 14:51:13 -07:00
committed by Rory Mitchell
parent 41227d1933, commit 38ab79f889
54 changed files with 641 additions and 1621 deletions
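
Review note: the commit collapses HostDeviceVector's multi-GPU sharding API down to a single device ordinal. A minimal before/after sketch of the call sites this touches (names as they appear in the hunks below; "negative means host only" follows from the new device < 0 checks):

    // Before: a HostDeviceVector could be sharded across a GPUSet, and every
    // device-side accessor had to name the GPU it was asked about.
    preds.Shard(devices);                     // devices is a GPUSet
    auto span = preds.DeviceSpan(device_id);  // this device's slice
    size_t n  = preds.DeviceSize(device_id);

    // After: a vector is resident on at most one GPU, chosen up front,
    // and the accessors lose their device argument.
    preds.SetDevice(device);                  // int ordinal; negative = host only
    auto span = preds.DeviceSpan();           // the whole vector
    size_t n  = preds.DeviceSize();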

src/metric/elementwise_metric.cu

@@ -30,8 +30,7 @@ DMLC_REGISTRY_FILE_TAG(elementwise_metric);
 template <typename EvalRow>
 class ElementWiseMetricsReduction {
  public:
-  explicit ElementWiseMetricsReduction(EvalRow policy) :
-      policy_(std::move(policy)) {}
+  explicit ElementWiseMetricsReduction(EvalRow policy) : policy_(std::move(policy)) {}

   PackedReduceResult CpuReduceMetrics(
       const HostDeviceVector<bst_float>& weights,
@@ -59,34 +58,31 @@ class ElementWiseMetricsReduction {
 #if defined(XGBOOST_USE_CUDA)
   ~ElementWiseMetricsReduction() {
-    for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
-      dh::safe_cuda(cudaSetDevice(id));
-      size_t index = devices_.Index(id);
-      allocators_.at(index).Free();
+    if (device_ >= 0) {
+      dh::safe_cuda(cudaSetDevice(device_));
+      allocator_.Free();
     }
   }

   PackedReduceResult DeviceReduceMetrics(
-      GPUSet::GpuIdType device_id,
-      size_t device_index,
       const HostDeviceVector<bst_float>& weights,
       const HostDeviceVector<bst_float>& labels,
       const HostDeviceVector<bst_float>& preds) {
-    size_t n_data = preds.DeviceSize(device_id);
+    size_t n_data = preds.DeviceSize();

     thrust::counting_iterator<size_t> begin(0);
     thrust::counting_iterator<size_t> end = begin + n_data;

-    auto s_label = labels.DeviceSpan(device_id);
-    auto s_preds = preds.DeviceSpan(device_id);
-    auto s_weights = weights.DeviceSpan(device_id);
+    auto s_label = labels.DeviceSpan();
+    auto s_preds = preds.DeviceSpan();
+    auto s_weights = weights.DeviceSpan();

     bool const is_null_weight = weights.Size() == 0;
     auto d_policy = policy_;

     PackedReduceResult result = thrust::transform_reduce(
-        thrust::cuda::par(allocators_.at(device_index)),
+        thrust::cuda::par(allocator_),
         begin, end,
         [=] XGBOOST_DEVICE(size_t idx) {
           bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
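
Review note: the per-device allocators_ vector becomes a single dh::CubMemory scratch buffer. The Thrust idiom in play here, passing an allocator into the execution policy so transform_reduce draws its temporary storage from a reusable buffer instead of a fresh cudaMalloc per call, looks roughly like the standalone sketch below. ScratchAllocator is an illustrative stand-in, not xgboost's CubMemory; compile with nvcc --extended-lambda.

    #include <thrust/execution_policy.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/functional.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <cuda_runtime.h>
    #include <cstddef>

    // Grows monotonically and keeps its buffer alive between reductions;
    // Free() releases it explicitly (cf. allocator_.Free() in the destructor).
    struct ScratchAllocator {
      char* buf{nullptr};
      std::size_t bytes{0};
      char* allocate(std::ptrdiff_t n) {
        if (static_cast<std::size_t>(n) > bytes) {
          cudaFree(buf);
          cudaMalloc(&buf, n);
          bytes = n;
        }
        return buf;
      }
      void deallocate(char*, std::size_t) {}  // intentionally lazy
      void Free() { cudaFree(buf); buf = nullptr; bytes = 0; }
    };

    // Sum of squared errors over n predictions, temporaries drawn from alloc.
    double SumSquaredError(const float* d_preds, const float* d_labels,
                           std::size_t n, ScratchAllocator* alloc) {
      thrust::counting_iterator<std::size_t> begin(0);
      return thrust::transform_reduce(
          thrust::cuda::par(*alloc), begin, begin + n,
          [=] __device__(std::size_t i) {
            double d = static_cast<double>(d_preds[i]) - d_labels[i];
            return d * d;
          },
          0.0, thrust::plus<double>());
    }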
@@ -105,37 +101,24 @@ class ElementWiseMetricsReduction {
   PackedReduceResult Reduce(
       const GenericParameter &tparam,
-      GPUSet devices,
+      int device,
       const HostDeviceVector<bst_float>& weights,
       const HostDeviceVector<bst_float>& labels,
       const HostDeviceVector<bst_float>& preds) {
     PackedReduceResult result;

-    if (devices.IsEmpty()) {
+    if (device < 0) {
       result = CpuReduceMetrics(weights, labels, preds);
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      if (allocators_.empty()) {
-        devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
-        allocators_.resize(devices_.Size());
-      }
-      preds.Shard(devices);
-      labels.Shard(devices);
-      weights.Shard(devices);
-      std::vector<PackedReduceResult> res_per_device(devices.Size());
+      device_ = device;
+      preds.SetDevice(device_);
+      labels.SetDevice(device_);
+      weights.SetDevice(device_);
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
-        dh::safe_cuda(cudaSetDevice(id));
-        size_t index = devices.Index(id);
-        res_per_device.at(index) =
-            DeviceReduceMetrics(id, index, weights, labels, preds);
-      }
-      for (auto const& res : res_per_device) {
-        result += res;
-      }
+      dh::safe_cuda(cudaSetDevice(device_));
+      result = DeviceReduceMetrics(weights, labels, preds);
     }
 #endif  // defined(XGBOOST_USE_CUDA)
     return result;
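
Review note: Reduce() is now a plain two-way dispatch on the ordinal: negative runs the CPU path, anything else pins the calling thread to that one GPU, so the OpenMP fan-out over a GPUSet disappears entirely. A runnable toy with the same shape (SumOn is hypothetical, not an xgboost function):

    #include <thrust/device_vector.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <cuda_runtime.h>

    // Negative ordinal = CPU path (cf. CpuReduceMetrics); otherwise bind the
    // calling thread to the one GPU and run the whole reduction there.
    double SumOn(int device, const float* host_data, int n) {
      if (device < 0) {
        double s = 0.0;
        for (int i = 0; i < n; ++i) s += host_data[i];
        return s;
      }
      cudaSetDevice(device);  // cf. dh::safe_cuda(cudaSetDevice(device_))
      thrust::device_vector<float> d(host_data, host_data + n);  // cf. SetDevice() copy-in
      return thrust::reduce(d.begin(), d.end(), 0.0, thrust::plus<double>());
    }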
@@ -144,8 +127,8 @@ class ElementWiseMetricsReduction {
  private:
   EvalRow policy_;
 #if defined(XGBOOST_USE_CUDA)
-  GPUSet devices_;
-  std::vector<dh::CubMemory> allocators_;
+  int device_{-1};
+  dh::CubMemory allocator_;
 #endif  // defined(XGBOOST_USE_CUDA)
 };
@@ -345,11 +328,10 @@ struct EvalEWiseBase : public Metric {
         << "label and prediction size not match, "
         << "hint: use merror or mlogloss for multi-class classification";
     const auto ndata = static_cast<omp_ulong>(info.labels_.Size());
-    // Dealing with ndata < n_gpus.
-    GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
+    int device = tparam_->gpu_id;
     auto result =
-        reducer_.Reduce(*tparam_, devices, info.weights_, info.labels_, preds);
+        reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds);
     double dat[2] { result.Residue(), result.Weights() };
     if (distributed) {
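
Review note: the double dat[2] { result.Residue(), result.Weights() } handoff just above is why the single-GPU rewrite stays compatible with distributed evaluation: a metric travels as a (residue, weight) pair, pairs from any number of workers combine by addition (an allreduce-sum over dat), and the final score is a policy-specific function of the ratio. A stand-in for the accumulation semantics (ResiduePair is illustrative, not xgboost's PackedReduceResult):

    #include <cstdio>

    // Illustrative mirror of PackedReduceResult's accumulation behavior.
    struct ResiduePair {
      double residue{0.0};  // weighted sum of per-row losses
      double weights{0.0};  // sum of row weights
      ResiduePair& operator+=(const ResiduePair& o) {
        residue += o.residue;
        weights += o.weights;
        return *this;
      }
    };

    int main() {
      ResiduePair a{6.0, 4.0}, b{2.0, 4.0};  // partial results from two workers
      a += b;                                // what the allreduce-sum computes
      std::printf("metric = %f\n", a.residue / a.weights);  // prints 1.000000
    }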

src/metric/multiclass_metric.cu

@@ -74,35 +74,32 @@ class MultiClassMetricsReduction {
 #if defined(XGBOOST_USE_CUDA)
   ~MultiClassMetricsReduction() {
-    for (GPUSet::GpuIdType id = *devices_.begin(); id < *devices_.end(); ++id) {
-      dh::safe_cuda(cudaSetDevice(id));
-      size_t index = devices_.Index(id);
-      allocators_.at(index).Free();
+    if (device_ >= 0) {
+      dh::safe_cuda(cudaSetDevice(device_));
+      allocator_.Free();
     }
   }

   PackedReduceResult DeviceReduceMetrics(
-      GPUSet::GpuIdType device_id,
-      size_t device_index,
       const HostDeviceVector<bst_float>& weights,
       const HostDeviceVector<bst_float>& labels,
       const HostDeviceVector<bst_float>& preds,
       const size_t n_class) {
-    size_t n_data = labels.DeviceSize(device_id);
+    size_t n_data = labels.DeviceSize();

     thrust::counting_iterator<size_t> begin(0);
     thrust::counting_iterator<size_t> end = begin + n_data;

-    auto s_labels = labels.DeviceSpan(device_id);
-    auto s_preds = preds.DeviceSpan(device_id);
-    auto s_weights = weights.DeviceSpan(device_id);
+    auto s_labels = labels.DeviceSpan();
+    auto s_preds = preds.DeviceSpan();
+    auto s_weights = weights.DeviceSpan();

     bool const is_null_weight = weights.Size() == 0;
     auto s_label_error = label_error_.GetSpan<int32_t>(1);
     s_label_error[0] = 0;

     PackedReduceResult result = thrust::transform_reduce(
-        thrust::cuda::par(allocators_.at(device_index)),
+        thrust::cuda::par(allocator_),
         begin, end,
         [=] XGBOOST_DEVICE(size_t idx) {
           bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
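
Review note: label_error_.GetSpan<int32_t>(1) above is a one-int flag in pinned (page-locked) host memory; the device code records an out-of-range label into it and the host reads it back after the reduction without an explicit cudaMemcpy. A standalone sketch of that idiom (CheckLabels is hypothetical; xgboost's dh::PinnedMemory wraps the same cudaMallocHost mechanics, and direct device access to the pinned pointer assumes unified virtual addressing, which 64-bit CUDA provides):

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void CheckLabels(const float* labels, int n, int n_class,
                                int* error_flag /* pinned host memory */) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        int label = static_cast<int>(labels[i]);
        if (label < 0 || label >= n_class) *error_flag = label;  // record offender
      }
    }

    int main() {
      int* flag = nullptr;
      cudaMallocHost(&flag, sizeof(int));  // page-locked, device-visible under UVA
      *flag = 0;
      float h[4] = {0.f, 1.f, 7.f, 2.f};   // 7 is invalid for n_class = 3
      float* labels = nullptr;
      cudaMalloc(&labels, sizeof(h));
      cudaMemcpy(labels, h, sizeof(h), cudaMemcpyHostToDevice);
      CheckLabels<<<1, 32>>>(labels, 4, 3, flag);
      cudaDeviceSynchronize();             // flag is valid on the host after this
      if (*flag != 0) std::printf("label %d outside [0, 3)\n", *flag);
      cudaFree(labels);
      cudaFreeHost(flag);
    }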
@@ -127,38 +124,25 @@ class MultiClassMetricsReduction {
   PackedReduceResult Reduce(
       const GenericParameter &tparam,
-      GPUSet devices,
+      int device,
       size_t n_class,
       const HostDeviceVector<bst_float>& weights,
       const HostDeviceVector<bst_float>& labels,
       const HostDeviceVector<bst_float>& preds) {
     PackedReduceResult result;

-    if (devices.IsEmpty()) {
+    if (device < 0) {
       result = CpuReduceMetrics(weights, labels, preds, n_class);
     }
 #if defined(XGBOOST_USE_CUDA)
     else {  // NOLINT
-      if (allocators_.empty()) {
-        devices_ = GPUSet::All(tparam.gpu_id, tparam.n_gpus);
-        allocators_.resize(devices_.Size());
-      }
-      preds.Shard(GPUDistribution::Granular(devices, n_class));
-      labels.Shard(devices);
-      weights.Shard(devices);
-      std::vector<PackedReduceResult> res_per_device(devices.Size());
+      device_ = tparam.gpu_id;
+      preds.SetDevice(device_);
+      labels.SetDevice(device_);
+      weights.SetDevice(device_);
-#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
-      for (GPUSet::GpuIdType id = *devices.begin(); id < *devices.end(); ++id) {
-        dh::safe_cuda(cudaSetDevice(id));
-        size_t index = devices.Index(id);
-        res_per_device.at(index) =
-            DeviceReduceMetrics(id, index, weights, labels, preds, n_class);
-      }
-      for (auto const& res : res_per_device) {
-        result += res;
-      }
+      dh::safe_cuda(cudaSetDevice(device_));
+      result = DeviceReduceMetrics(weights, labels, preds, n_class);
     }
 #endif  // defined(XGBOOST_USE_CUDA)
     return result;
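
Review note: one multiclass-specific casualty is GPUDistribution::Granular(devices, n_class), which existed because predictions arrive as n_class scores per row, so shard boundaries had to land on multiples of n_class or a single row's scores would straddle two GPUs. With one resident device the whole vector is copied and alignment is moot. A sketch of the arithmetic that constraint required (GranularShardSizes is illustrative, not xgboost's implementation):

    #include <cstddef>
    #include <vector>

    // Partition size elements over n_devices so every shard is a multiple
    // of granularity (here n_class) -- the invariant Granular() enforced.
    std::vector<std::size_t> GranularShardSizes(std::size_t size, int n_devices,
                                                std::size_t granularity) {
      std::size_t rows = size / granularity;      // whole rows only
      std::vector<std::size_t> sizes(n_devices);
      for (int d = 0; d < n_devices; ++d) {
        std::size_t begin = rows * d / n_devices;
        std::size_t end = rows * (d + 1) / n_devices;
        sizes[d] = (end - begin) * granularity;   // row-aligned shard size
      }
      return sizes;                               // obsolete after this commit
    }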
@@ -167,8 +151,8 @@ class MultiClassMetricsReduction {
  private:
 #if defined(XGBOOST_USE_CUDA)
   dh::PinnedMemory label_error_;
-  GPUSet devices_;
-  std::vector<dh::CubMemory> allocators_;
+  int device_{-1};
+  dh::CubMemory allocator_;
 #endif  // defined(XGBOOST_USE_CUDA)
 };
@@ -190,8 +174,8 @@ struct EvalMClassBase : public Metric {
         << " use logloss for binary classification";
     const auto ndata = static_cast<bst_omp_uint>(info.labels_.Size());
-    GPUSet devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus, ndata);
-    auto result = reducer_.Reduce(*tparam_, devices, nclass, info.weights_, info.labels_, preds);
+    int device = tparam_->gpu_id;
+    auto result = reducer_.Reduce(*tparam_, device, nclass, info.weights_, info.labels_, preds);
     double dat[2] { result.Residue(), result.Weights() };
     if (distributed) {