Make HostDeviceVector single gpu only (#4773)
* Make HostDeviceVector single gpu only
This commit is contained in:
@@ -702,7 +702,7 @@ struct DeviceShard {
|
||||
row_partitioner.reset(new RowPartitioner(device_id, n_rows));
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(
|
||||
- gpair.data(), dh_gpair->ConstDevicePointer(device_id),
|
||||
+ gpair.data(), dh_gpair->ConstDevicePointer(),
|
||||
gpair.size() * sizeof(GradientPair), cudaMemcpyHostToHost));
|
||||
SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx);
|
||||
hist.Reset();
|
||||
@@ -745,8 +745,8 @@ struct DeviceShard {
|
||||
for (auto i = 0ull; i < nidxs.size(); i++) {
|
||||
auto nidx = nidxs[i];
|
||||
auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
|
||||
- p_feature_set->Shard(GPUSet(device_id, 1));
|
||||
- auto d_sampled_features = p_feature_set->DeviceSpan(device_id);
|
||||
+ p_feature_set->SetDevice(device_id);
|
||||
+ auto d_sampled_features = p_feature_set->DeviceSpan();
|
||||
common::Span<int32_t> d_feature_set =
|
||||
interaction_constraints.Query(d_sampled_features, nidx);
|
||||
auto d_split_candidates =
|
||||
@@ -1016,7 +1016,7 @@ struct DeviceShard {
|
||||
dh::AllReducer* reducer, int64_t num_columns) {
|
||||
constexpr int kRootNIdx = 0;
|
||||
|
||||
- const auto &gpair = gpair_all->DeviceSpan(device_id);
|
||||
+ const auto &gpair = gpair_all->DeviceSpan();
|
||||
|
||||
dh::SumReduction(temp_memory, gpair, node_sum_gradients_d,
|
||||
gpair.size());
|
||||
@@ -1294,11 +1294,8 @@ class GPUHistMakerSpecialised {
|
||||
param_.InitAllowUnknown(args);
|
||||
generic_param_ = generic_param;
|
||||
hist_maker_param_.InitAllowUnknown(args);
|
||||
- auto devices = GPUSet::All(generic_param_->gpu_id,
|
||||
- generic_param_->n_gpus);
|
||||
- n_devices_ = devices.Size();
|
||||
- CHECK(n_devices_ != 0) << "Must have at least one device";
|
||||
- dist_ = GPUDistribution::Block(devices);
|
||||
+ device_ = generic_param_->gpu_id;
|
||||
+ CHECK_GE(device_, 0) << "Must have at least one device";
|
||||
|
||||
dh::CheckComputeCapability();
|
||||
|
||||
@@ -1330,30 +1327,22 @@ class GPUHistMakerSpecialised {
|
||||
void InitDataOnce(DMatrix* dmat) {
|
||||
info_ = &dmat->Info();
|
||||
|
||||
- int n_devices = dist_.Devices().Size();
|
||||
|
||||
- device_list_.resize(n_devices);
|
||||
- for (int index = 0; index < n_devices; ++index) {
|
||||
- int device_id = dist_.Devices().DeviceId(index);
|
||||
- device_list_[index] = device_id;
|
||||
- }
|
||||
|
||||
- reducer_.Init(device_list_);
|
||||
+ reducer_.Init({device_});
|
||||
|
||||
// Synchronise the column sampling seed
|
||||
uint32_t column_sampling_seed = common::GlobalRandom()();
|
||||
rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
|
||||
|
||||
// Create device shards
|
||||
- shards_.resize(n_devices);
|
||||
+ shards_.resize(1);
|
||||
dh::ExecuteIndexShards(
|
||||
&shards_,
|
||||
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
|
||||
- dh::safe_cuda(cudaSetDevice(dist_.Devices().DeviceId(idx)));
|
||||
- size_t start = dist_.ShardStart(info_->num_row_, idx);
|
||||
- size_t size = dist_.ShardSize(info_->num_row_, idx);
|
||||
+ dh::safe_cuda(cudaSetDevice(device_));
|
||||
+ size_t start = 0;
|
||||
+ size_t size = info_->num_row_;
|
||||
shard = std::unique_ptr<DeviceShard<GradientSumT>>(
|
||||
- new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(idx), idx,
|
||||
+ new DeviceShard<GradientSumT>(device_, idx,
|
||||
start, start + size, param_,
|
||||
column_sampling_seed,
|
||||
info_->num_col_));
|
||||
@@ -1436,7 +1425,7 @@ class GPUHistMakerSpecialised {
|
||||
for (auto& tree : trees) {
|
||||
tree = *p_tree;
|
||||
}
|
||||
- gpair->Reshard(dist_);
|
||||
+ gpair->SetDevice(device_);
|
||||
|
||||
// Launch one thread for each device "shard" containing a subset of rows.
|
||||
// Threads will cooperatively build the tree, synchronising over histograms.
|
||||
@@ -1462,13 +1451,13 @@ class GPUHistMakerSpecialised {
|
||||
return false;
|
||||
}
|
||||
monitor_.StartCuda("UpdatePredictionCache");
|
||||
- p_out_preds->Shard(dist_.Devices());
|
||||
+ p_out_preds->SetDevice(device_);
|
||||
dh::ExecuteIndexShards(
|
||||
&shards_,
|
||||
[&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
|
||||
dh::safe_cuda(cudaSetDevice(shard->device_id));
|
||||
shard->UpdatePredictionCache(
|
||||
- p_out_preds->DevicePointer(shard->device_id));
|
||||
+ p_out_preds->DevicePointer());
|
||||
});
|
||||
monitor_.StopCuda("UpdatePredictionCache");
|
||||
return true;
|
||||
@@ -1483,7 +1472,6 @@ class GPUHistMakerSpecialised {
|
||||
private:
|
||||
bool initialised_;
|
||||
|
||||
- int n_devices_;
|
||||
int n_bins_;
|
||||
|
||||
GPUHistMakerTrainParam hist_maker_param_;
|
||||
@@ -1492,11 +1480,9 @@ class GPUHistMakerSpecialised {
|
||||
dh::AllReducer reducer_;
|
||||
|
||||
DMatrix* p_last_fmat_;
|
||||
- GPUDistribution dist_;
|
||||
+ int device_;
|
||||
|
||||
common::Monitor monitor_;
|
||||
- /*! List storing device id. */
|
||||
- std::vector<int> device_list_;
|
||||
};
|
||||
|
||||
class GPUHistMaker : public TreeUpdater {
|
||||
|
||||
Reference in New Issue
Block a user