Support distributed CPU env for categorical data. (#7575)

* Add support for categorical data in sketch allreduce.
* Share tests between CPU and GPU.
This commit is contained in:
Jiaming Yuan
2022-01-18 21:56:07 +08:00
committed by GitHub
parent deab0e32ba
commit cc06fab9a7
5 changed files with 299 additions and 150 deletions

View File

@@ -58,10 +58,17 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for distributed environment.
auto sparsity = 0.5f;
auto rank = rabit::GetRank();
std::vector<FeatureType> ft(cols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
}
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)
.Lower(.0f)
.Upper(1.0f)
.Type(ft)
.MaxCategory(13)
.GenerateDMatrix();
std::vector<float> hessian(rows, 1.0);
@@ -95,6 +102,8 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
for (auto rank = 0; rank < world; ++rank) {
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)
.Type(ft)
.MaxCategory(13)
.Lower(.0f)
.Upper(1.0f)
.GenerateDMatrix();
@@ -181,8 +190,15 @@ TEST(Quantile, SameOnAllWorkers) {
kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
auto rank = rabit::GetRank();
HostDeviceVector<float> storage;
std::vector<FeatureType> ft(kCols);
for (size_t i = 0; i < ft.size(); ++i) {
ft[i] = (i % 2 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
}
auto m = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Type(ft)
.MaxCategory(17)
.Seed(rank + seed)
.GenerateDMatrix();
auto cuts = SketchOnDMatrix(m.get(), n_bins);