enable ROCm on latest XGBoost

This commit is contained in:
Hui Liu
2023-10-23 11:07:08 -07:00
328 changed files with 8028 additions and 3642 deletions

View File

@@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
p_mat->Info().Validate(-1);
p_mat->Info().Validate(DeviceOrd::CPU());
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
dmlc::Error);
@@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
void TestSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10;
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
Context ctx;
common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);

View File

@@ -222,7 +222,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
ASSERT_EQ(info.feature_types.Size(), n_features);
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
cuts_ptr.SetDevice(0);
cuts_ptr.SetDevice(DeviceOrd::CUDA(0));
dh::device_vector<float> weight(n_samples * n_features, 0);
dh::Iota(dh::ToSpan(weight));
@@ -235,7 +235,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
detail::EntryCompareOp());
detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
&weight, &columns_ptr);
auto const& h_cptr = cuts_ptr.ConstHostVector();
@@ -377,7 +377,8 @@ template <typename Adapter>
auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
DeviceOrd::CUDA(0));
MetaInfo info;
AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
@@ -444,7 +445,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container);
HistogramCuts cuts;
@@ -472,7 +473,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_container);
@@ -507,7 +508,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
}
ASSERT_EQ(info.feature_types.Size(), 1);
SketchContainer container(info.feature_types, num_bins, 1, n, 0);
SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), &container);
HistogramCuts cuts;
@@ -580,11 +581,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(ctx->gpu_id));
#endif
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
auto n = n_samples * n_features;
std::vector<float> x;
x.resize(n);
@@ -624,21 +621,21 @@ void TestGetColumnSize(std::size_t n_samples) {
std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1);
}
@@ -715,9 +712,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10;
HostDeviceVector<float> storage;
std::string m =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
&storage);
std::string m = RandomDataGenerator{kRows, kCols, 0}
.Device(DeviceOrd::CUDA(0))
.GenerateArrayInterface(&storage);
MetaInfo info;
Context ctx;
auto& h_weights = info.weights_.HostVector();
@@ -736,14 +733,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
}
info.weights_.SetDevice(0);
info.weights_.SetDevice(DeviceOrd::CUDA(0));
info.num_row_ = kRows;
info.num_col_ = kCols;
data::CupyAdapter adapter(m);
auto const& batch = adapter.Value();
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container);
@@ -787,7 +784,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
// https://github.com/dmlc/xgboost/issues/7946
h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
}
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container);
sketch_container.MakeCuts(&weighted, info.IsColumnSplit());

View File

@@ -1,7 +1,6 @@
/*!
* Copyright 2018 XGBoost contributors
/**
* Copyright 2018-2023 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <thrust/equal.h>
#include <thrust/iterator/counting_iterator.h>
@@ -13,21 +12,14 @@
#endif
#include <xgboost/host_device_vector.h>
namespace xgboost {
namespace common {
namespace xgboost::common {
namespace {
void SetDeviceForTest(int device) {
void SetDeviceForTest(DeviceOrd device) {
int n_devices;
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
device %= n_devices;
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipGetDeviceCount(&n_devices));
device %= n_devices;
dh::safe_cuda(hipSetDevice(device));
#endif
device.ordinal %= n_devices;
dh::safe_cuda(cudaSetDevice(device.ordinal));
}
} // namespace
@@ -42,13 +34,13 @@ struct HostDeviceVectorSetDeviceHandler {
}
};
void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
// create the vector
v->SetDevice(device);
v->Resize(n);
ASSERT_EQ(v->Size(), n);
ASSERT_EQ(v->DeviceIdx(), device);
ASSERT_EQ(v->Device(), device);
// ensure that the device have read-write access
ASSERT_TRUE(v->DeviceCanRead());
ASSERT_TRUE(v->DeviceCanWrite());
@@ -68,7 +60,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
}
void PlusOne(HostDeviceVector<int> *v) {
int device = v->DeviceIdx();
auto device = v->Device();
SetDeviceForTest(device);
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
[=]__device__(unsigned int a){ return a + 1; });
@@ -80,7 +72,7 @@ void CheckDevice(HostDeviceVector<int>* v,
unsigned int first,
GPUAccess access) {
ASSERT_EQ(v->Size(), size);
SetDeviceForTest(v->DeviceIdx());
SetDeviceForTest(v->Device());
ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
thrust::make_counting_iterator(first)));
@@ -111,7 +103,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
ASSERT_FALSE(v->DeviceCanWrite());
}
void TestHostDeviceVector(size_t n, int device) {
void TestHostDeviceVector(size_t n, DeviceOrd device) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
InitHostDeviceVector(n, device, &v);
@@ -124,13 +116,13 @@ void TestHostDeviceVector(size_t n, int device) {
TEST(HostDeviceVector, Basic) {
size_t n = 1001;
int device = 0;
DeviceOrd device = DeviceOrd::CUDA(0);
TestHostDeviceVector(n, device);
}
TEST(HostDeviceVector, Copy) {
size_t n = 1001;
int device = 0;
auto device = DeviceOrd::CUDA(0);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v;
@@ -154,15 +146,15 @@ TEST(HostDeviceVector, SetDevice) {
h_vec[i] = i;
}
HostDeviceVector<int> vec (h_vec);
auto device = 0;
auto device = DeviceOrd::CUDA(0);
vec.SetDevice(device);
ASSERT_EQ(vec.Size(), h_vec.size());
auto span = vec.DeviceSpan(); // sync to device
vec.SetDevice(-1); // pull back to cpu.
vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_EQ(vec.DeviceIdx(), -1);
ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
auto h_vec_1 = vec.HostVector();
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@@ -170,7 +162,7 @@ TEST(HostDeviceVector, SetDevice) {
TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.SetDevice(0);
vec.SetDevice(DeviceOrd::CUDA(0));
auto span = vec.DeviceSpan();
ASSERT_EQ(vec.Size(), span.size());
ASSERT_EQ(vec.DevicePointer(), span.data());
@@ -194,5 +186,4 @@ TEST(HostDeviceVector, Empty) {
ASSERT_FALSE(another.Empty());
ASSERT_TRUE(vec.Empty());
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -144,7 +144,8 @@ TEST(IO, Resource) {
fout << 1.0 << std::endl;
fout.close();
auto resource = std::make_shared<MmapResource>(path, 0, sizeof(double));
auto resource = std::shared_ptr<MmapResource>{
new MmapResource{path, 0, sizeof(double)}};
ASSERT_EQ(resource->Size(), sizeof(double));
ASSERT_EQ(resource->Type(), ResourceHandler::kMmap);
ASSERT_EQ(resource->DataAs<double>()[0], val);

View File

@@ -1,13 +1,15 @@
/**
* Copyright (c) 2019-2023, XGBoost Contributors
* Copyright 2019-2023, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <fstream>
#include <iterator> // for back_inserter
#include <map>
#include "../../../src/common/charconv.h"
#include "../../../src/common/io.h"
#include "../../../src/common/json_utils.h"
#include "../../../src/common/threading_utils.h" // for ParallelFor
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
@@ -691,4 +693,16 @@ TEST(Json, TypeCheck) {
ASSERT_NE(err.find("foo"), std::string::npos);
}
}
TEST(Json, Dump) {
auto str = GetModelStr();
auto jobj = Json::Load(str);
std::string result_s = Json::Dump(jobj);
std::vector<char> result_v = Json::Dump<std::vector<char>>(jobj);
ASSERT_EQ(result_s.size(), result_v.size());
for (std::size_t i = 0; i < result_s.size(); ++i) {
ASSERT_EQ(result_s[i], result_v[i]);
}
}
} // namespace xgboost

View File

@@ -16,7 +16,7 @@ namespace xgboost::linalg {
namespace {
void TestElementWiseKernel() {
auto device = DeviceOrd::CUDA(0);
Tensor<float, 3> l{{2, 3, 4}, 0};
Tensor<float, 3> l{{2, 3, 4}, device};
{
/**
* Non-contiguous

View File

@@ -9,9 +9,7 @@
#include "../../../src/data/adapter.h"
#include "xgboost/context.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
TEST(Quantile, LoadBalance) {
size_t constexpr kRows = 1000, kCols = 100;
auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
@@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
}
auto m = RandomDataGenerator{kRows, kCols, 0}
.Device(Context::kCpuId)
.Device(DeviceOrd::CPU())
.Type(ft)
.MaxCategory(17)
.Seed(rank + seed)
@@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -32,7 +32,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, kBins, kCols, kRows, 0);
SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
dh::caching_device_vector<Entry> entries;
dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@@ -45,12 +45,12 @@ void TestSketchUnique(float sparsity) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(seed)
.Device(0)
.Device(FstCU())
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info,
@@ -65,7 +65,7 @@ void TestSketchUnique(float sparsity) {
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); });
auto end = kCols * kRows;
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
&cut_sizes_scan, &column_sizes_scan);
auto const& cut_sizes = cut_sizes_scan.HostVector();
ASSERT_LE(sketch.Data().size(), cut_sizes.back());
@@ -93,13 +93,9 @@ TEST(GPUQuantile, Unique) {
}
// if with_error is true, the test tolerates floating point error
void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device));
#endif
dh::safe_cuda(cudaSetDevice(device.ordinal));
std::vector<SketchEntry> h_in(in.size());
dh::CopyDeviceSpanToVector(&h_in, in);
std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
@@ -134,13 +130,12 @@ TEST(GPUQuantile, Prune) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage);
std::string interface_str =
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
&storage);
data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch);
@@ -156,7 +151,7 @@ TEST(GPUQuantile, Prune) {
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
sketch.Data().data() + sketch.Data().size(),
detail::SketchUnique{}));
TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
});
}
@@ -164,10 +159,10 @@ TEST(GPUQuantile, MergeEmpty) {
constexpr size_t kRows = 1000, kCols = 100;
size_t n_bins = 10;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0;
std::string interface_str_0 =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
MetaInfo info;
@@ -204,34 +199,33 @@ TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Device(FstCU())
.Seed(seed)
.GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_0);
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0);
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_1);
std::string interface_str_1 =
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
&storage_1);
data::CupyAdapter adapter_1(interface_str_1);
AdapterDeviceSketch(adapter_1.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_1);
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_1);
size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
if (info.weights_.Size() != 0) {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true);
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
sketch_0.FixError();
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false);
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
} else {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
}
auto columns_ptr = sketch_0.ColumnsPtr();
@@ -251,24 +245,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
MetaInfo info;
int32_t seed = 0;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, cols, rows, 0);
SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_0);
std::string interface_str_0 =
RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
&storage_0);
data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&sketch_0);
size_t f_rows = rows * frac;
SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0);
SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
.Device(0)
.Seed(seed)
.GenerateArrayInterface(&storage_1);
std::string interface_str_1 =
RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
&storage_1);
auto data_1 = storage_1.DeviceSpan();
auto tuple_it = thrust::make_tuple(
thrust::make_counting_iterator<size_t>(0ul), data_1.data());
@@ -290,7 +282,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@@ -321,11 +313,10 @@ TEST(GPUQuantile, MergeDuplicated) {
TEST(GPUQuantile, MultiMerge) {
constexpr size_t kRows = 20, kCols = 1;
int32_t world = 2;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
// Set up single node version
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());
size_t intermediate_num_cuts = std::min(
kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@@ -333,12 +324,12 @@ TEST(GPUQuantile, MultiMerge) {
for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Device(FstCU())
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
HostDeviceVector<FeatureType> ft;
containers.emplace_back(ft, n_bins, kCols, kRows, 0);
containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&containers.back());
@@ -348,21 +339,44 @@ TEST(GPUQuantile, MultiMerge) {
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError();
}
TestQuantileElemRank(0, sketch_on_single_node.Data(),
sketch_on_single_node.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(),
sketch_on_single_node.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
});
}
TEST(GPUQuantile, MissingColumns) {
auto dmat = std::unique_ptr<DMatrix>{[=]() {
std::size_t constexpr kRows = 1000, kCols = 100;
auto sparsity = 0.5f;
std::vector<FeatureType> ft(kCols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
}
auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(0)
.Lower(.0f)
.Upper(1.0f)
.Type(ft)
.MaxCategory(13)
.GenerateDMatrix();
return dmat->SliceCol(2, 1);
}()};
dmat->Info().data_split_mode = DataSplitMode::kRow;
auto ctx = MakeCUDACtx(0);
std::size_t constexpr kBins = 64;
HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins);
ASSERT_TRUE(cuts.HasCategorical());
}
namespace {
void TestAllReduceBasic() {
auto const world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
auto const device = GPUIDX;
auto const device = DeviceOrd::CUDA(GPUIDX);
// Set up single node version;
HostDeviceVector<FeatureType> ft({}, device);
@@ -440,18 +454,14 @@ TEST_F(MGPUQuantileTest, AllReduceBasic) {
}
namespace {
void TestColumnSplitBasic() {
void TestColumnSplit(DMatrix* dmat) {
auto const world = collective::GetWorldSize();
auto const rank = collective::GetRank();
std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64;
auto m = std::unique_ptr<DMatrix>{[=]() {
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
return dmat->SliceCol(world, rank);
}()};
auto m = std::unique_ptr<DMatrix>{dmat->SliceCol(world, rank)};
// Generate cuts for distributed environment.
auto ctx = MakeCUDACtx(GPUIDX);
std::size_t constexpr kBins = 64;
HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins);
// Generate cuts for single node environment
@@ -484,7 +494,26 @@ void TestColumnSplitBasic() {
} // anonymous namespace
TEST_F(MGPUQuantileTest, ColumnSplitBasic) {
DoTest(TestColumnSplitBasic);
std::size_t constexpr kRows = 1000, kCols = 100;
auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
DoTest(TestColumnSplit, dmat.get());
}
TEST_F(MGPUQuantileTest, ColumnSplitCategorical) {
std::size_t constexpr kRows = 1000, kCols = 100;
auto sparsity = 0.5f;
std::vector<FeatureType> ft(kCols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
}
auto dmat = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(0)
.Lower(.0f)
.Upper(1.0f)
.Type(ft)
.MaxCategory(13)
.GenerateDMatrix();
DoTest(TestColumnSplit, dmat.get());
}
namespace {
@@ -494,7 +523,7 @@ void TestSameOnAllWorkers() {
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
auto const rank = collective::GetRank();
auto const device = GPUIDX;
auto const device = DeviceOrd::CUDA(GPUIDX);
HostDeviceVector<FeatureType> ft({}, device);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
HostDeviceVector<float> storage({}, device);
@@ -525,9 +554,9 @@ void TestSameOnAllWorkers() {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(),
collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
all_workers.size());
collective::Synchronize(device);
collective::Synchronize(device.ordinal);
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());
@@ -573,7 +602,7 @@ TEST(GPUQuantile, Push) {
columns_ptr[1] = kRows;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});
auto sketch_data = sketch.Data();
@@ -613,7 +642,7 @@ TEST(GPUQuantile, MultiColPush) {
int32_t n_bins = 16;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
dh::device_vector<Entry> d_entries {entries};
dh::device_vector<size_t> columns_ptr(kCols + 1, 0);

View File

@@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
// empty label
ASSERT_THROW(fail(), dmlc::Error);
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
// invalid label
ASSERT_THROW(fail(), dmlc::Error);
auto h_labels = info.labels.HostView();

View File

@@ -42,7 +42,7 @@ void TestCalcQueriesInvIDCG() {
auto d_scores = dh::ToSpan(scores);
common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());
ltr::LambdaRankParam p;
p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@@ -77,7 +77,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());
auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());

View File

@@ -9,12 +9,11 @@
#include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../helpers.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
TEST(Stats, Quantile) {
Context ctx;
{
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
std::vector<size_t> index{0, 2, 3, 4, 6};
auto h_arr = arr.HostView();
auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@@ -40,8 +39,8 @@ TEST(Stats, Quantile) {
TEST(Stats, WeightedQuantile) {
Context ctx;
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());
auto h_arr = arr.HostView();
auto h_weight = weight.HostView();
@@ -64,7 +63,7 @@ TEST(Stats, Median) {
Context ctx;
{
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId};
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out);
@@ -83,7 +82,7 @@ TEST(Stats, Median) {
{
ctx = ctx.MakeCPU();
// 4x2 matrix
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out);
@@ -102,14 +101,14 @@ TEST(Stats, Median) {
namespace {
void TestMean(Context const* ctx) {
std::size_t n{128};
linalg::Vector<float> data({n}, ctx->gpu_id);
linalg::Vector<float> data({n}, ctx->Device());
auto h_v = data.HostView().Values();
std::iota(h_v.begin(), h_v.end(), .0f);
auto nf = static_cast<float>(n);
float mean = nf * (nf - 1) / 2 / n;
linalg::Vector<float> res{{1}, ctx->gpu_id};
linalg::Vector<float> res{{1}, ctx->Device()};
Mean(ctx, data, &res);
auto h_res = res.HostView();
ASSERT_EQ(h_res.Size(), 1);
@@ -127,6 +126,5 @@ TEST(Stats, GPUMean) {
auto ctx = MakeCUDACtx(0);
TestMean(&ctx);
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
} // namespace common
} // namespace xgboost
#endif // defined(XGBOOST_USE_CUDA)
} // namespace xgboost::common

View File

@@ -25,8 +25,8 @@ namespace common {
namespace {
class StatsGPU : public ::testing::Test {
private:
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
HostDeviceVector<float> results_;
using TestSet = std::vector<std::pair<float, float>>;
Context ctx_;
@@ -51,7 +51,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -63,7 +63,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment
HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
alphas.SetDevice(0);
alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan();
auto w_it = thrust::make_constant_iterator(0.1f);
SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@@ -85,7 +85,7 @@ class StatsGPU : public ::testing::Test {
auto val_it =
dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
linalg::Tensor<float, 1> weights{{10}, 0};
linalg::Tensor<float, 1> weights{{10}, FstCU()};
linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
[=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
auto w_it = weights.Data()->ConstDevicePointer();
@@ -106,7 +106,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>(
@@ -118,7 +118,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment
HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
alphas.SetDevice(0);
alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan();
SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
val_it + d_arr.Size(), &results_);

View File

@@ -11,63 +11,59 @@
#include "../../../src/common/transform.h"
#include "../helpers.h"
namespace xgboost::common {
namespace {
constexpr DeviceOrd TransformDevice() {
#if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__)
#define TRANSFORM_GPU 0
return DeviceOrd::CUDA(0);
#else
#define TRANSFORM_GPU -1
return DeviceOrd::CPU();
#endif
namespace xgboost {
namespace common {
}
} // namespace
template <typename T>
struct TestTransformRange {
void XGBOOST_DEVICE operator()(size_t _idx,
Span<bst_float> _out, Span<const bst_float> _in) {
void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
_out[_idx] = _in[_idx];
}
};
TEST(Transform, DeclareUnifiedTest(Basic)) {
const size_t size {256};
std::vector<bst_float> h_in(size);
std::vector<bst_float> h_out(size);
const size_t size{256};
std::vector<float> h_in(size);
std::vector<float> h_out(size);
std::iota(h_in.begin(), h_in.end(), 0);
std::vector<bst_float> h_sol(size);
std::vector<float> h_sol(size);
std::iota(h_sol.begin(), h_sol.end(), 0);
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
auto device = TransformDevice();
HostDeviceVector<float> const in_vec{h_in, device};
HostDeviceVector<float> out_vec{h_out, device};
out_vec.Fill(0);
Transform<>::Init(TestTransformRange<bst_float>{},
Transform<>::Init(TestTransformRange<float>{},
Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
TRANSFORM_GPU)
TransformDevice())
.Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector();
std::vector<float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}
#if !defined(__CUDACC__) && !defined(__HIP_PLATFORM_AMD__)
TEST(TransformDeathTest, Exception) {
size_t const kSize {16};
std::vector<bst_float> h_in(kSize);
const HostDeviceVector<bst_float> in_vec{h_in, -1};
size_t const kSize{16};
std::vector<float> h_in(kSize);
const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
EXPECT_DEATH(
{
Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-1)
DeviceOrd::CPU())
.Eval(&in_vec);
},
"");
}
#endif
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -0,0 +1,5 @@
/**
* Copyright 2023 XGBoost contributors
*/
// Dummy file to keep the CUDA tests.
#include "test_transform_range.cc"