Optimize adapter element counting on GPU. (#9209)
- Implement a simple `IterSpan` for passing iterators with size.
- Use shared memory for column size counts.
- Use one thread for each sample in row count to reduce atomic operations.
This commit is contained in:
@@ -51,3 +51,22 @@ void TestCudfAdapter()
|
||||
// Registers the cuDF adapter check (TestCudfAdapter) under the DeviceAdapter test suite.
TEST(DeviceAdapter, CudfAdapter) { TestCudfAdapter(); }
|
||||
|
||||
namespace xgboost::data {
|
||||
TEST(DeviceAdapter, GetRowCounts) {
|
||||
auto ctx = MakeCUDACtx(0);
|
||||
|
||||
for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
|
||||
HostDeviceVector<float> storage;
|
||||
auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
|
||||
.Device(ctx.gpu_id)
|
||||
.GenerateArrayInterface(&storage);
|
||||
auto adapter = CupyAdapter{str_arr};
|
||||
HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
|
||||
offset.SetDevice(ctx.gpu_id);
|
||||
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id,
|
||||
std::numeric_limits<float>::quiet_NaN());
|
||||
ASSERT_EQ(rstride, n_features);
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::data
|
||||
|
||||
Reference in New Issue
Block a user