Optimize adapter element counting on GPU. (#9209)

- Implement a simple `IterSpan` for passing iterators with size.
- Use shared memory for column size counts.
- Use one thread for each sample in row count to reduce atomic operations.
This commit is contained in:
Jiaming Yuan
2023-05-30 23:28:43 +08:00
committed by GitHub
parent 097f11b6e0
commit 17fd3f55e9
9 changed files with 323 additions and 61 deletions

View File

@@ -51,3 +51,22 @@ void TestCudfAdapter()
// GoogleTest case DeviceAdapter.CudfAdapter: a thin wrapper whose body only invokes
// TestCudfAdapter(); any assertions are made inside that helper.
TEST(DeviceAdapter, CudfAdapter) {
TestCudfAdapter();
}
namespace xgboost::data {
// For each tested feature count, generates device data, wraps it in a CupyAdapter and
// checks that the value returned by GetRowCounts equals the number of features.
TEST(DeviceAdapter, GetRowCounts) {
  auto ctx = MakeCUDACtx(0);
  // Passed as the last argument of GetRowCounts (presumably the missing-value marker;
  // confirm against the GetRowCounts declaration). It does not depend on the loop variable.
  auto const nan_value = std::numeric_limits<float>::quiet_NaN();

  for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
    // Backing buffer filled by the generator; it is declared before the adapter so it
    // outlives every use of the generated array interface within this iteration.
    HostDeviceVector<float> buffer;
    auto generator = RandomDataGenerator{8192, n_features, 0.0};
    auto array_interface = generator.Device(ctx.gpu_id).GenerateArrayInterface(&buffer);
    auto adapter = CupyAdapter{array_interface};

    // One more entry than the number of rows, zero-initialised and placed on the test device.
    auto const n_offsets = adapter.NumRows() + 1;
    HostDeviceVector<bst_row_t> row_offsets(n_offsets, 0);
    row_offsets.SetDevice(ctx.gpu_id);

    auto rstride =
        GetRowCounts(adapter.Value(), row_offsets.DeviceSpan(), ctx.gpu_id, nan_value);
    ASSERT_EQ(rstride, n_features);
  }
}
}  // namespace xgboost::data