Revamp the rabit implementation. (#10112)

This PR replaces the original RABIT implementation with a new one, which has already been partially merged into XGBoost. The new one features:
- Federated learning for both CPU and GPU.
- NCCL.
- More data types.
- A unified interface for all the underlying implementations.
- Improved timeout handling for both tracker and workers.
- Exhausted tests with metrics (fixed a couple of bugs along the way).
- A reusable tracker for Python and JVM packages.
This commit is contained in:
Jiaming Yuan
2024-05-20 11:56:23 +08:00
committed by GitHub
parent ba9b4cb1ee
commit a5a58102e5
195 changed files with 2768 additions and 9234 deletions

View File

@@ -3,15 +3,16 @@
*/
#include "test_metainfo.h"
#include <gmock/gmock.h>
#include <dmlc/io.h>
#include <gmock/gmock.h>
#include <xgboost/data.h>
#include <memory>
#include <string>
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" // for GMockTHrow
#include "../collective/test_worker.h" // for TestDistributedGlobal
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" // for GMockTHrow
#include "xgboost/base.h"
namespace xgboost {
@@ -118,8 +119,8 @@ void VerifyGetSetFeatureColumnSplit() {
} // anonymous namespace
TEST(MetaInfo, GetSetFeatureColumnSplit) {
auto constexpr kWorldSize{3};
RunWithInMemoryCommunicator(kWorldSize, VerifyGetSetFeatureColumnSplit);
auto constexpr kWorkers{3};
collective::TestDistributedGlobal(kWorkers, VerifyGetSetFeatureColumnSplit);
}
TEST(MetaInfo, SaveLoadBinary) {

View File

@@ -9,6 +9,7 @@
#include "../../../src/data/adapter.h" // ArrayAdapter
#include "../../../src/data/simple_dmatrix.h" // SimpleDMatrix
#include "../collective/test_worker.h" // for TestDistributedGlobal
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" // RandomDataGenerator,CreateSimpleTestData
#include "xgboost/base.h"
@@ -444,5 +445,5 @@ void VerifyColumnSplit() {
TEST(SimpleDMatrix, ColumnSplit) {
auto constexpr kWorldSize{3};
RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit);
collective::TestDistributedGlobal(kWorldSize, VerifyColumnSplit);
}