further cleanup of single process multi-GPU code (#4810)
* use subspan in gpu predictor instead of copying
* Revise `HostDeviceVector`
This commit is contained in:
@@ -10,17 +10,6 @@
|
||||
|
||||
using xgboost::common::Span;
|
||||
|
||||
// Minimal shard stand-in holding only an integer id; the DeviceHelpers test
// below builds a vector of these and reduces over their ids.
struct Shard { int id; };
|
||||
|
||||
// Checks dh::ReduceShards on four shards whose ids are 0..3: reducing with a
// lambda that returns each shard's id is expected to give 6 (0 + 1 + 2 + 3).
TEST(DeviceHelpers, Basic) {
|
||||
std::vector<Shard> shards (4);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
shards[i].id = i;
|
||||
}
|
||||
int sum = dh::ReduceShards<int>(&shards, [](Shard& s) { return s.id ; });
|
||||
ASSERT_EQ(sum, 6);
|
||||
}
|
||||
|
||||
void CreateTestData(xgboost::bst_uint num_rows, int max_row_size,
|
||||
thrust::host_vector<int> *row_ptr,
|
||||
thrust::host_vector<xgboost::bst_uint> *rows) {
|
||||
|
||||
@@ -38,19 +38,19 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
|
||||
ASSERT_EQ(v->Size(), n);
|
||||
ASSERT_EQ(v->DeviceIdx(), device);
|
||||
// ensure that the device has read-write access
|
||||
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
|
||||
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
|
||||
ASSERT_TRUE(v->DeviceCanRead());
|
||||
ASSERT_TRUE(v->DeviceCanWrite());
|
||||
// ensure that the host has no access
|
||||
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
|
||||
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
|
||||
ASSERT_FALSE(v->HostCanRead());
|
||||
ASSERT_FALSE(v->HostCanWrite());
|
||||
|
||||
// fill in the data on the host
|
||||
std::vector<int>& data_h = v->HostVector();
|
||||
// ensure that the host has full access, while the device has none
|
||||
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
|
||||
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kWrite));
|
||||
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kRead));
|
||||
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
|
||||
ASSERT_TRUE(v->HostCanRead());
|
||||
ASSERT_TRUE(v->HostCanWrite());
|
||||
ASSERT_FALSE(v->DeviceCanRead());
|
||||
ASSERT_FALSE(v->DeviceCanWrite());
|
||||
ASSERT_EQ(data_h.size(), n);
|
||||
std::copy_n(thrust::make_counting_iterator(0), n, data_h.begin());
|
||||
}
|
||||
@@ -62,76 +62,62 @@ void PlusOne(HostDeviceVector<int> *v) {
|
||||
[=]__device__(unsigned int a){ return a + 1; });
|
||||
}
|
||||
|
||||
void CheckDevice(HostDeviceVector<int> *v,
|
||||
const std::vector<size_t>& starts,
|
||||
const std::vector<size_t>& sizes,
|
||||
unsigned int first, GPUAccess access) {
|
||||
int n_devices = sizes.size();
|
||||
ASSERT_EQ(n_devices, 1);
|
||||
for (int i = 0; i < n_devices; ++i) {
|
||||
ASSERT_EQ(v->DeviceSize(), sizes.at(i));
|
||||
SetDevice(i);
|
||||
ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(),
|
||||
thrust::make_counting_iterator(first + starts[i])));
|
||||
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
|
||||
// ensure that the device has at most the access specified by access
|
||||
ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
|
||||
}
|
||||
ASSERT_EQ(v->HostCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
|
||||
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
|
||||
for (int i = 0; i < n_devices; ++i) {
|
||||
SetDevice(i);
|
||||
ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(),
|
||||
thrust::make_counting_iterator(first + starts[i])));
|
||||
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kRead));
|
||||
ASSERT_TRUE(v->DeviceCanAccess(GPUAccess::kWrite));
|
||||
}
|
||||
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kRead));
|
||||
ASSERT_FALSE(v->HostCanAccess(GPUAccess::kWrite));
|
||||
// Verifies the device-side contents and the host/device access flags of *v.
//   v      - vector under test; SetDevice() is called with v->DeviceIdx().
//   size   - expected value of v->Size().
//   first  - expected value of element 0; the remaining elements are compared
//            against a counting iterator that starts at this value.
//   access - access state expected right after the tcbegin()/tcend() read:
//            the device may write iff access == kWrite, the host may read iff
//            access == kRead, and the host may never write.
// After the subsequent tbegin()/tend() read, the device is expected to hold
// both read and write access and the host neither.
void CheckDevice(HostDeviceVector<int>* v,
|
||||
size_t size,
|
||||
unsigned int first,
|
||||
GPUAccess access) {
|
||||
ASSERT_EQ(v->Size(), size);
|
||||
SetDevice(v->DeviceIdx());
|
||||
|
||||
// first pass: compare through tcbegin()/tcend()
// (presumably the read-only device iterators - confirm in HostDeviceVector)
ASSERT_TRUE(thrust::equal(v->tcbegin(), v->tcend(),
|
||||
thrust::make_counting_iterator(first)));
|
||||
ASSERT_TRUE(v->DeviceCanRead());
|
||||
// ensure that the device has at most the access specified by access
|
||||
ASSERT_EQ(v->DeviceCanWrite(), access == GPUAccess::kWrite);
|
||||
ASSERT_EQ(v->HostCanRead(), access == GPUAccess::kRead);
|
||||
ASSERT_FALSE(v->HostCanWrite());
|
||||
|
||||
// second pass: compare through tbegin()/tend(); the asserts that follow expect
// the device to end up with full access and the host with none
ASSERT_TRUE(thrust::equal(v->tbegin(), v->tend(),
|
||||
thrust::make_counting_iterator(first)));
|
||||
ASSERT_TRUE(v->DeviceCanRead());
|
||||
ASSERT_TRUE(v->DeviceCanWrite());
|
||||
ASSERT_FALSE(v->HostCanRead());
|
||||
ASSERT_FALSE(v->HostCanWrite());
|
||||
}
|
||||
|
||||
void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
||||
const std::vector<int>& data_h = access == GPUAccess::kWrite ?
|
||||
const std::vector<int>& data_h = access == GPUAccess::kNone ?
|
||||
v->HostVector() : v->ConstHostVector();
|
||||
for (size_t i = 0; i < v->Size(); ++i) {
|
||||
ASSERT_EQ(data_h.at(i), i + 1);
|
||||
}
|
||||
ASSERT_TRUE(v->HostCanAccess(GPUAccess::kRead));
|
||||
ASSERT_EQ(v->HostCanAccess(GPUAccess::kWrite), access == GPUAccess::kWrite);
|
||||
size_t n_devices = 1;
|
||||
for (int i = 0; i < n_devices; ++i) {
|
||||
ASSERT_EQ(v->DeviceCanAccess(GPUAccess::kRead), access == GPUAccess::kRead);
|
||||
// the devices should have no write access
|
||||
ASSERT_FALSE(v->DeviceCanAccess(GPUAccess::kWrite));
|
||||
}
|
||||
ASSERT_TRUE(v->HostCanRead());
|
||||
ASSERT_EQ(v->HostCanWrite(), access == GPUAccess::kNone);
|
||||
ASSERT_EQ(v->DeviceCanRead(), access == GPUAccess::kRead);
|
||||
// the devices should have no write access
|
||||
ASSERT_FALSE(v->DeviceCanWrite());
|
||||
}
|
||||
|
||||
void TestHostDeviceVector
|
||||
(size_t n, int device,
|
||||
const std::vector<size_t>& starts, const std::vector<size_t>& sizes) {
|
||||
void TestHostDeviceVector(size_t n, int device) {
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
HostDeviceVector<int> v;
|
||||
InitHostDeviceVector(n, device, &v);
|
||||
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
PlusOne(&v);
|
||||
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
|
||||
CheckDevice(&v, n, 1, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kRead);
|
||||
CheckHost(&v, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kNone);
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, TestBlock) {
|
||||
TEST(HostDeviceVector, Basic) {
|
||||
size_t n = 1001;
|
||||
int device = 0;
|
||||
std::vector<size_t> starts{0};
|
||||
std::vector<size_t> sizes{1001};
|
||||
TestHostDeviceVector(n, device, starts, sizes);
|
||||
TestHostDeviceVector(n, device);
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, TestCopy) {
|
||||
TEST(HostDeviceVector, Copy) {
|
||||
size_t n = 1001;
|
||||
int device = 0;
|
||||
std::vector<size_t> starts{0};
|
||||
std::vector<size_t> sizes{1001};
|
||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||
|
||||
HostDeviceVector<int> v;
|
||||
@@ -141,14 +127,14 @@ TEST(HostDeviceVector, TestCopy) {
|
||||
InitHostDeviceVector(n, device, &v1);
|
||||
v = v1;
|
||||
}
|
||||
CheckDevice(&v, starts, sizes, 0, GPUAccess::kRead);
|
||||
CheckDevice(&v, n, 0, GPUAccess::kRead);
|
||||
PlusOne(&v);
|
||||
CheckDevice(&v, starts, sizes, 1, GPUAccess::kWrite);
|
||||
CheckDevice(&v, n, 1, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kRead);
|
||||
CheckHost(&v, GPUAccess::kWrite);
|
||||
CheckHost(&v, GPUAccess::kNone);
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Shard) {
|
||||
TEST(HostDeviceVector, SetDevice) {
|
||||
std::vector<int> h_vec (2345);
|
||||
for (size_t i = 0; i < h_vec.size(); ++i) {
|
||||
h_vec[i] = i;
|
||||
@@ -157,7 +143,6 @@ TEST(HostDeviceVector, Shard) {
|
||||
auto device = 0;
|
||||
|
||||
vec.SetDevice(device);
|
||||
ASSERT_EQ(vec.DeviceSize(), h_vec.size());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
auto span = vec.DeviceSpan(); // sync to device
|
||||
|
||||
@@ -169,39 +154,26 @@ TEST(HostDeviceVector, Shard) {
|
||||
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Reshard) {
|
||||
std::vector<int> h_vec (2345);
|
||||
for (size_t i = 0; i < h_vec.size(); ++i) {
|
||||
h_vec[i] = i;
|
||||
}
|
||||
HostDeviceVector<int> vec (h_vec);
|
||||
auto device = 0;
|
||||
|
||||
vec.SetDevice(device);
|
||||
ASSERT_EQ(vec.DeviceSize(), h_vec.size());
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
PlusOne(&vec);
|
||||
|
||||
vec.SetDevice(-1);
|
||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||
ASSERT_EQ(vec.DeviceIdx(), -1);
|
||||
|
||||
auto h_vec_1 = vec.HostVector();
|
||||
for (size_t i = 0; i < h_vec_1.size(); ++i) {
|
||||
ASSERT_EQ(h_vec_1.at(i), i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(HostDeviceVector, Span) {
|
||||
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
|
||||
vec.SetDevice(0);
|
||||
auto span = vec.DeviceSpan();
|
||||
ASSERT_EQ(vec.DeviceSize(), span.size());
|
||||
ASSERT_EQ(vec.Size(), span.size());
|
||||
ASSERT_EQ(vec.DevicePointer(), span.data());
|
||||
auto const_span = vec.ConstDeviceSpan();
|
||||
ASSERT_EQ(vec.DeviceSize(), span.size());
|
||||
ASSERT_EQ(vec.ConstDevicePointer(), span.data());
|
||||
ASSERT_EQ(vec.Size(), const_span.size());
|
||||
ASSERT_EQ(vec.ConstDevicePointer(), const_span.data());
|
||||
}
|
||||
|
||||
// Runs TestHostDeviceVector with n = 1001 on device index 1. When fewer than
// two GPUs are visible the test logs a warning and returns without running
// any checks.
TEST(HostDeviceVector, MGPU_Basic) {
|
||||
if (AllVisibleGPUs() < 2) {
|
||||
LOG(WARNING) << "Not testing in multi-gpu environment.";
|
||||
return;
|
||||
}
|
||||
|
||||
size_t n = 1001;
|
||||
// device 1 is the second GPU, hence the AllVisibleGPUs() guard above
int device = 1;
|
||||
TestHostDeviceVector(n, device);
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -83,8 +83,8 @@ TEST(gpu_predictor, ExternalMemoryTest) {
|
||||
std::string file1 = tmpdir.path + "/big_1.libsvm";
|
||||
std::string file2 = tmpdir.path + "/big_2.libsvm";
|
||||
dmats.push_back(CreateSparsePageDMatrix(9, 64UL, file0));
|
||||
// dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
|
||||
// dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
|
||||
dmats.push_back(CreateSparsePageDMatrix(128, 128UL, file1));
|
||||
dmats.push_back(CreateSparsePageDMatrix(1024, 1024UL, file2));
|
||||
|
||||
for (const auto& dmat: dmats) {
|
||||
dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5);
|
||||
|
||||
@@ -113,7 +113,7 @@ TEST(GpuHist, BuildGidxDense) {
|
||||
{"max_leaves", "0"},
|
||||
};
|
||||
param.Init(args);
|
||||
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols, kNCols);
|
||||
DeviceShard<GradientPairPrecise> shard(0, kNRows, param, kNCols, kNCols);
|
||||
BuildGidx(&shard, kNRows, kNCols);
|
||||
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
|
||||
@@ -154,8 +154,7 @@ TEST(GpuHist, BuildGidxSparse) {
|
||||
};
|
||||
param.Init(args);
|
||||
|
||||
DeviceShard<GradientPairPrecise> shard(0, 0, 0, kNRows, param, kNCols,
|
||||
kNCols);
|
||||
DeviceShard<GradientPairPrecise> shard(0, kNRows, param, kNCols, kNCols);
|
||||
BuildGidx(&shard, kNRows, kNCols, 0.9f);
|
||||
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(shard.gidx_buffer.size());
|
||||
@@ -200,8 +199,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
{"max_leaves", "0"},
|
||||
};
|
||||
param.Init(args);
|
||||
DeviceShard<GradientSumT> shard(0, 0, 0, kNRows, param, kNCols,
|
||||
kNCols);
|
||||
DeviceShard<GradientSumT> shard(0, kNRows, param, kNCols, kNCols);
|
||||
BuildGidx(&shard, kNRows, kNCols);
|
||||
|
||||
xgboost::SimpleLCG gen;
|
||||
@@ -303,8 +301,7 @@ TEST(GpuHist, EvaluateSplits) {
|
||||
|
||||
// Initialize DeviceShard
|
||||
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard{
|
||||
new DeviceShard<GradientPairPrecise>(0, 0, 0, kNRows, param, kNCols,
|
||||
kNCols)};
|
||||
new DeviceShard<GradientPairPrecise>(0, kNRows, param, kNCols, kNCols)};
|
||||
// Initialize DeviceShard::node_sum_gradients
|
||||
shard->node_sum_gradients = {{6.4f, 12.8f}};
|
||||
|
||||
@@ -391,24 +388,20 @@ void TestHistogramIndexImpl() {
|
||||
hist_maker_ext.Configure(training_params, &generic_param);
|
||||
hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get());
|
||||
|
||||
ASSERT_EQ(hist_maker.shards_.size(), hist_maker_ext.shards_.size());
|
||||
|
||||
// Extract the device shards from the histogram makers and from that its compressed
|
||||
// Extract the device shard from the histogram makers and from that its compressed
|
||||
// histogram index
|
||||
for (size_t i = 0; i < hist_maker.shards_.size(); ++i) {
|
||||
const auto &dev_shard = hist_maker.shards_[i];
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
|
||||
const auto &dev_shard = hist_maker.shard_;
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer(dev_shard->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer, dev_shard->gidx_buffer);
|
||||
|
||||
const auto &dev_shard_ext = hist_maker_ext.shards_[i];
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
|
||||
const auto &dev_shard_ext = hist_maker_ext.shard_;
|
||||
std::vector<common::CompressedByteT> h_gidx_buffer_ext(dev_shard_ext->gidx_buffer.size());
|
||||
dh::CopyDeviceSpanToVector(&h_gidx_buffer_ext, dev_shard_ext->gidx_buffer);
|
||||
|
||||
ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
|
||||
ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
|
||||
ASSERT_EQ(dev_shard->n_bins, dev_shard_ext->n_bins);
|
||||
ASSERT_EQ(dev_shard->gidx_buffer.size(), dev_shard_ext->gidx_buffer.size());
|
||||
|
||||
ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
|
||||
}
|
||||
ASSERT_EQ(h_gidx_buffer, h_gidx_buffer_ext);
|
||||
}
|
||||
|
||||
TEST(GpuHist, TestHistogramIndex) {
|
||||
|
||||
Reference in New Issue
Block a user