Fuse gpu_hist all-reduce calls where possible (#7867)
This commit is contained in:
@@ -6,41 +6,58 @@ namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
TEST(GpuHist, DriverDepthWise) {
|
||||
Driver<GPUExpandEntry> driver(TrainParam::kDepthWise);
|
||||
TrainParam p;
|
||||
p.InitAllowUnknown(Args{});
|
||||
p.grow_policy = TrainParam::kDepthWise;
|
||||
Driver<GPUExpandEntry> driver(p, 2);
|
||||
EXPECT_TRUE(driver.Pop().empty());
|
||||
DeviceSplitCandidate split;
|
||||
split.loss_chg = 1.0f;
|
||||
GPUExpandEntry root(0, 0, split, .0f, .0f, .0f);
|
||||
split.left_sum = {0.0f, 1.0f};
|
||||
split.right_sum = {0.0f, 1.0f};
|
||||
GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f);
|
||||
driver.Push({root});
|
||||
EXPECT_EQ(driver.Pop().front().nid, 0);
|
||||
driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}});
|
||||
// Should return entries from level 1
|
||||
driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}});
|
||||
driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}});
|
||||
driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}});
|
||||
driver.Push({GPUExpandEntry{4, 2, split, 2.0f, 1.0f, 1.0f}});
|
||||
// Should return 2 entries from level 1
|
||||
// as we limited the driver to pop maximum 2 nodes
|
||||
auto res = driver.Pop();
|
||||
EXPECT_EQ(res.size(), 2);
|
||||
for (auto &e : res) {
|
||||
EXPECT_EQ(e.depth, 1);
|
||||
}
|
||||
|
||||
// Should now return 1 entry from level 1
|
||||
res = driver.Pop();
|
||||
EXPECT_EQ(res[0].depth, 2);
|
||||
EXPECT_EQ(res.size(), 1);
|
||||
EXPECT_EQ(res.at(0).depth, 1);
|
||||
|
||||
res = driver.Pop();
|
||||
EXPECT_EQ(res.at(0).depth, 2);
|
||||
EXPECT_TRUE(driver.Pop().empty());
|
||||
}
|
||||
|
||||
TEST(GpuHist, DriverLossGuided) {
|
||||
DeviceSplitCandidate high_gain;
|
||||
high_gain.left_sum = {0.0f, 1.0f};
|
||||
high_gain.right_sum = {0.0f, 1.0f};
|
||||
high_gain.loss_chg = 5.0f;
|
||||
DeviceSplitCandidate low_gain;
|
||||
DeviceSplitCandidate low_gain = high_gain;
|
||||
low_gain.loss_chg = 1.0f;
|
||||
|
||||
Driver<GPUExpandEntry> driver(TrainParam::kLossGuide);
|
||||
TrainParam p;
|
||||
p.grow_policy=TrainParam::kLossGuide;
|
||||
Driver<GPUExpandEntry> driver(p);
|
||||
EXPECT_TRUE(driver.Pop().empty());
|
||||
GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f);
|
||||
GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f );
|
||||
driver.Push({root});
|
||||
EXPECT_EQ(driver.Pop().front().nid, 0);
|
||||
// Select high gain first
|
||||
driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
|
||||
driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 1.0f, 1.0f}});
|
||||
auto res = driver.Pop();
|
||||
EXPECT_EQ(res.size(), 1);
|
||||
EXPECT_EQ(res[0].nid, 2);
|
||||
@@ -49,8 +66,8 @@ TEST(GpuHist, DriverLossGuided) {
|
||||
EXPECT_EQ(res[0].nid, 1);
|
||||
|
||||
// If equal gain, use nid
|
||||
driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}});
|
||||
driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}});
|
||||
driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
|
||||
res = driver.Pop();
|
||||
EXPECT_EQ(res[0].nid, 1);
|
||||
res = driver.Pop();
|
||||
|
||||
@@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) {
|
||||
std::vector<int> shm_sizes{48 * 1024, 64 * 1024, 160 * 1024};
|
||||
for (bool is_dense : is_dense_array) {
|
||||
for (int shm_size : shm_sizes) {
|
||||
TestDeterministicHistogram<GradientPair>(is_dense, shm_size);
|
||||
TestDeterministicHistogram<GradientPairPrecise>(is_dense, shm_size);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,31 +27,40 @@ TEST(GpuHist, DeviceHistogram) {
|
||||
// Ensures that node allocates correctly after reaching `kStopGrowingSize`.
|
||||
dh::safe_cuda(cudaSetDevice(0));
|
||||
constexpr size_t kNBins = 128;
|
||||
constexpr size_t kNNodes = 4;
|
||||
constexpr int kNNodes = 4;
|
||||
constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
|
||||
DeviceHistogram<GradientPairPrecise, kStopGrowing> histogram;
|
||||
DeviceHistogramStorage<GradientPairPrecise, kStopGrowing> histogram;
|
||||
histogram.Init(0, kNBins);
|
||||
for (size_t i = 0; i < kNNodes; ++i) {
|
||||
histogram.AllocateHistogram(i);
|
||||
for (int i = 0; i < kNNodes; ++i) {
|
||||
histogram.AllocateHistograms({i});
|
||||
}
|
||||
histogram.Reset();
|
||||
ASSERT_EQ(histogram.Data().size(), kStopGrowing);
|
||||
|
||||
// Use allocated memory but do not erase nidx_map.
|
||||
for (size_t i = 0; i < kNNodes; ++i) {
|
||||
histogram.AllocateHistogram(i);
|
||||
for (int i = 0; i < kNNodes; ++i) {
|
||||
histogram.AllocateHistograms({i});
|
||||
}
|
||||
for (size_t i = 0; i < kNNodes; ++i) {
|
||||
for (int i = 0; i < kNNodes; ++i) {
|
||||
ASSERT_TRUE(histogram.HistogramExists(i));
|
||||
}
|
||||
|
||||
// Erase existing nidx_map.
|
||||
for (size_t i = kNNodes; i < kNNodes * 2; ++i) {
|
||||
histogram.AllocateHistogram(i);
|
||||
}
|
||||
for (size_t i = 0; i < kNNodes; ++i) {
|
||||
ASSERT_FALSE(histogram.HistogramExists(i));
|
||||
// Add two new nodes
|
||||
histogram.AllocateHistograms({kNNodes});
|
||||
histogram.AllocateHistograms({kNNodes + 1});
|
||||
|
||||
// Old cached nodes should still exist
|
||||
for (int i = 0; i < kNNodes; ++i) {
|
||||
ASSERT_TRUE(histogram.HistogramExists(i));
|
||||
}
|
||||
|
||||
// Should be deleted
|
||||
ASSERT_FALSE(histogram.HistogramExists(kNNodes));
|
||||
// Most recent node should exist
|
||||
ASSERT_TRUE(histogram.HistogramExists(kNNodes + 1));
|
||||
|
||||
// Add same node again - should fail
|
||||
EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes + 1}););
|
||||
}
|
||||
|
||||
std::vector<GradientPairPrecise> GetHostHistGpair() {
|
||||
@@ -96,9 +105,9 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
|
||||
thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
|
||||
maker.row_partitioner.reset(new RowPartitioner(0, kNRows));
|
||||
maker.hist.AllocateHistogram(0);
|
||||
maker.hist.AllocateHistograms({0});
|
||||
maker.gpair = gpair.DeviceSpan();
|
||||
maker.histogram_rounding = CreateRoundingFactor<GradientSumT>(maker.gpair);;
|
||||
maker.histogram_rounding = CreateRoundingFactor<GradientSumT>(maker.gpair);
|
||||
|
||||
BuildGradientHistogram(
|
||||
page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0),
|
||||
@@ -106,7 +115,7 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
maker.hist.GetNodeHistogram(0), maker.histogram_rounding,
|
||||
!use_shared_memory_histograms);
|
||||
|
||||
DeviceHistogram<GradientSumT>& d_hist = maker.hist;
|
||||
DeviceHistogramStorage<GradientSumT>& d_hist = maker.hist;
|
||||
|
||||
auto node_histogram = d_hist.GetNodeHistogram(0);
|
||||
// d_hist.data stored in float, not gradient pair
|
||||
@@ -129,12 +138,10 @@ void TestBuildHist(bool use_shared_memory_histograms) {
|
||||
|
||||
TEST(GpuHist, BuildHistGlobalMem) {
|
||||
TestBuildHist<GradientPairPrecise>(false);
|
||||
TestBuildHist<GradientPair>(false);
|
||||
}
|
||||
|
||||
TEST(GpuHist, BuildHistSharedMem) {
|
||||
TestBuildHist<GradientPairPrecise>(true);
|
||||
TestBuildHist<GradientPair>(true);
|
||||
}
|
||||
|
||||
HistogramCutsWrapper GetHostCutMatrix () {
|
||||
@@ -198,7 +205,7 @@ TEST(GpuHist, EvaluateRootSplit) {
|
||||
|
||||
// Initialize GPUHistMakerDevice::hist
|
||||
maker.hist.Init(0, (max_bins - 1) * kNCols);
|
||||
maker.hist.AllocateHistogram(0);
|
||||
maker.hist.AllocateHistograms({0});
|
||||
// Each row of hist_gpair represents gpairs for one feature.
|
||||
// Each entry represents a bin.
|
||||
std::vector<GradientPairPrecise> hist_gpair = GetHostHistGpair();
|
||||
|
||||
Reference in New Issue
Block a user