[CI] Test federated learning plugin in the CI (#8325)

This commit is contained in:
Philip Hyunsu Cho
2022-10-12 13:57:39 -07:00
committed by GitHub
parent 97a5b088a5
commit 2faa744aba
16 changed files with 190 additions and 117 deletions

View File

@@ -6,7 +6,7 @@ set -x
if [ "$#" -lt 1 ]
then
echo "Usage: $0 [container to build]"
return 1
exit 1
fi
container=$1
@@ -17,18 +17,21 @@ echo "--- Build container ${container}"
BUILD_ARGS=""
case "${container}" in
cpu)
;;
gpu|rmm)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
;;
jvm_gpu_build)
gpu_build_centos7|jvm_gpu_build)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
;;
*)
echo "Unrecognized container ID: ${container}"
return 2
exit 2
;;
esac

View File

@@ -14,7 +14,8 @@ $command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
# the configured header build/dmlc/build_config.h instead of
# include/dmlc/build_config_default.h.
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
-DPLUGIN_DENSE_PARSER=ON -DPLUGIN_FEDERATED=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Stash XGBoost CLI executable"

View File

@@ -20,10 +20,10 @@ command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \
-DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \
${arch_flag}
$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \
-DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \
-DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \
-DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag}
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"

View File

@@ -17,6 +17,7 @@ steps:
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh jvm_gpu_build"
key: build-containers
agents:

View File

@@ -13,7 +13,9 @@ steps:
#### -------- CONTAINER BUILD --------
- label: ":docker: Build containers"
commands:
- "tests/buildkite/build-containers.sh cpu"
- "tests/buildkite/build-containers.sh gpu"
- "tests/buildkite/build-containers.sh gpu_build_centos7"
- "tests/buildkite/build-containers.sh rmm"
key: build-containers
agents:

View File

@@ -26,6 +26,15 @@ ENV CPP=cpp-8
ENV GOSU_VERSION 1.10
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
# Install gRPC: shallow-clone the v1.49.1 tag (submodules included, also shallow),
# build it with CMake + Ninja, install it under /opt/grpc, then delete the checkout.
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 --shallow-submodules && \
pushd grpc && \
cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \
cmake --build build --target install && \
popd && \
rm -rf grpc
# Create new Conda environment
COPY conda_env/cpu_test.yml /scripts/
RUN mamba env create -n cpu_test --file=/scripts/cpu_test.yml

View File

@@ -1,49 +0,0 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu16.04
ARG CUDA_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
SHELL ["/bin/bash", "-c"] # Use Bash as shell
# Install all basic requirements
RUN \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/3bf863cc.pub && \
apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository ppa:ubuntu-toolchain-r/test && \
apt-get update && \
apt-get install -y tar unzip wget bzip2 libgomp1 git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Python
wget -nv -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
ENV PATH=/opt/python/bin:$PATH
ENV CC=gcc-8
ENV CXX=g++-8
ENV CPP=cpp-8
ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]

View File

@@ -35,6 +35,15 @@ ENV CPP=/opt/rh/devtoolset-8/root/usr/bin/cpp
ENV GOSU_VERSION 1.10
# Install gRPC: clone the v1.49.1 tag with its submodules, build it with CMake + Ninja,
# install it under /opt/grpc, then delete the checkout.
# NOTE(review): the submodules are cloned without --shallow-submodules here; presumably
# the git shipped in this image does not support that flag -- confirm before adding it.
RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \
--recurse-submodules --depth 1 && \
pushd grpc && \
cmake -S . -B build -GNinja -DCMAKE_INSTALL_PREFIX=/opt/grpc && \
cmake --build build --target install && \
popd && \
rm -rf grpc
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \

View File

@@ -0,0 +1,19 @@
#include <chrono>
#include <thread>
#include <random>
#include <cstdint>
#include "helpers.h"
using namespace std::chrono_literals;
/*!
 * \brief Pick a pseudo-random port number in the closed interval [low, high].
 *
 * The generator is created once per thread and seeded from std::random_device
 * mixed with the wall clock, so test processes that start within the same
 * millisecond do not all pick the same port, and successive calls advance the
 * generator state instead of relying on an artificial delay between calls.
 *
 * \param low  One end of the allowed port range (inclusive).
 * \param high The other end of the allowed port range (inclusive). If the
 *             bounds are passed in reverse order they are swapped.
 * \return A port number within the range. The port is not checked for
 *         availability.
 */
int GenerateRandomPort(int low, int high) {
  // std::uniform_int_distribution requires low <= high; normalise the bounds
  // instead of invoking undefined behaviour.
  if (low > high) {
    int const tmp = low;
    low = high;
    high = tmp;
  }
  // Seed once per thread. The clock is mixed in as a fallback for platforms
  // where std::random_device is deterministic.
  thread_local std::mt19937_64 rng([] {
    auto const ticks = static_cast<std::uint64_t>(
        std::chrono::system_clock::now().time_since_epoch().count());
    auto const entropy = static_cast<std::uint64_t>(std::random_device{}());
    return (entropy << 32) ^ ticks;
  }());
  std::uniform_int_distribution<int> dist(low, high);
  return dist(rng);
}

View File

@@ -0,0 +1,10 @@
/*!
* Copyright 2022 XGBoost contributors
*/
#ifndef XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
#define XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_
/*!
 * \brief Pick a pseudo-random port number in the closed interval [low, high].
 * \param low  Smallest port number that may be returned.
 * \param high Largest port number that may be returned.
 * \return The chosen port number. The port is not checked for availability.
 */
int GenerateRandomPort(int low, int high);
#endif // XGBOOST_TESTS_CPP_PLUGIN_HELPERS_H_

View File

@@ -5,24 +5,36 @@
#include <gtest/gtest.h>
#include <thrust/host_vector.h>
#include <iostream>
#include <thread>
#include <ctime>
#include "./helpers.h"
#include "../../../plugin/federated/federated_communicator.h"
#include "../../../plugin/federated/federated_server.h"
#include "../../../src/collective/device_communicator_adapter.cuh"
namespace {
/*! \brief Build a "localhost:<port>" address using a random port in [50000, 60000]. */
std::string GetServerAddress() {
  int const chosen_port = GenerateRandomPort(50000, 60000);
  return "localhost:" + std::to_string(chosen_port);
}
} // anonymous namespace
namespace xgboost {
namespace collective {
std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)
class FederatedAdapterTest : public ::testing::Test {
protected:
void SetUp() override {
server_address_ = GetServerAddress();
server_thread_.reset(new std::thread([this] {
grpc::ServerBuilder builder;
federated::FederatedService service{kWorldSize};
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
server_ = builder.BuildAndStart();
server_->Wait();
@@ -35,6 +47,7 @@ class FederatedAdapterTest : public ::testing::Test {
}
static int const kWorldSize{2};
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
};
@@ -52,9 +65,10 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) {
TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
threads.emplace_back(std::thread([rank, server_address=server_address_] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};
int const count = 3;
thrust::device_vector<double> buffer(count, 0);
thrust::sequence(buffer.begin(), buffer.end());
@@ -74,9 +88,10 @@ TEST_F(FederatedAdapterTest, DeviceAllReduceSum) {
TEST_F(FederatedAdapterTest, DeviceAllGatherV) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread([rank] {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
DeviceCommunicatorAdapter adapter{rank, &comm};
threads.emplace_back(std::thread([rank, server_address=server_address_] {
FederatedCommunicator comm{kWorldSize, rank, server_address};
// Assign device 0 to all workers, since we run gtest in a single-GPU machine
DeviceCommunicatorAdapter adapter{0, &comm};
int const count = rank + 2;
thrust::device_vector<char> buffer(count, 0);

View File

@@ -5,34 +5,46 @@
#include <grpcpp/server_builder.h>
#include <gtest/gtest.h>
#include <iostream>
#include <thread>
#include <ctime>
#include "helpers.h"
#include "../../../plugin/federated/federated_communicator.h"
#include "../../../plugin/federated/federated_server.h"
namespace {
/*! \brief Build a "localhost:<port>" address using a random port in [50000, 60000]. */
std::string GetServerAddress() {
  int const chosen_port = GenerateRandomPort(50000, 60000);
  return "localhost:" + std::to_string(chosen_port);
}
} // anonymous namespace
namespace xgboost {
namespace collective {
std::string const kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)
class FederatedCommunicatorTest : public ::testing::Test {
public:
static void VerifyAllreduce(int rank) {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
static void VerifyAllreduce(int rank, const std::string& server_address) {
FederatedCommunicator comm{kWorldSize, rank, server_address};
CheckAllreduce(comm);
}
static void VerifyBroadcast(int rank) {
FederatedCommunicator comm{kWorldSize, rank, kServerAddress};
static void VerifyBroadcast(int rank, const std::string& server_address) {
FederatedCommunicator comm{kWorldSize, rank, server_address};
CheckBroadcast(comm, rank);
}
protected:
void SetUp() override {
server_address_ = GetServerAddress();
server_thread_.reset(new std::thread([this] {
grpc::ServerBuilder builder;
federated::FederatedService service{kWorldSize};
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
server_ = builder.BuildAndStart();
server_->Wait();
@@ -66,29 +78,40 @@ class FederatedCommunicatorTest : public ::testing::Test {
}
static int const kWorldSize{3};
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
};
TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) {
auto construct = []() { FederatedCommunicator comm{0, 0, kServerAddress, "", "", ""}; };
std::string server_address{GetServerAddress()};
auto construct = [server_address]() {
FederatedCommunicator comm{0, 0, server_address, "", "", ""};
};
EXPECT_THROW(construct(), dmlc::Error);
}
TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooSmall) {
auto construct = []() { FederatedCommunicator comm{1, -1, kServerAddress, "", "", ""}; };
std::string server_address{GetServerAddress()};
auto construct = [server_address]() {
FederatedCommunicator comm{1, -1, server_address, "", "", ""};
};
EXPECT_THROW(construct(), dmlc::Error);
}
TEST(FederatedCommunicatorSimpleTest, ThrowOnRankTooBig) {
auto construct = []() { FederatedCommunicator comm{1, 1, kServerAddress, "", "", ""}; };
std::string server_address{GetServerAddress()};
auto construct = [server_address]() {
FederatedCommunicator comm{1, 1, server_address, "", "", ""};
};
EXPECT_THROW(construct(), dmlc::Error);
}
TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
auto construct = []() {
std::string server_address{GetServerAddress()};
auto construct = [server_address]() {
Json config{JsonObject()};
config["federated_server_address"] = kServerAddress;
config["federated_server_address"] = server_address;
config["federated_world_size"] = std::string("1");
config["federated_rank"] = Integer(0);
auto *comm = FederatedCommunicator::Create(config);
@@ -97,9 +120,10 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnWorldSizeNotInteger) {
}
TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
auto construct = []() {
std::string server_address{GetServerAddress()};
auto construct = [server_address]() {
Json config{JsonObject()};
config["federated_server_address"] = kServerAddress;
config["federated_server_address"] = server_address;
config["federated_world_size"] = 1;
config["federated_rank"] = std::string("0");
auto *comm = FederatedCommunicator::Create(config);
@@ -108,20 +132,23 @@ TEST(FederatedCommunicatorSimpleTest, ThrowOnRankNotInteger) {
}
TEST(FederatedCommunicatorSimpleTest, GetWorldSizeAndRank) {
FederatedCommunicator comm{6, 3, kServerAddress};
std::string server_address{GetServerAddress()};
FederatedCommunicator comm{6, 3, server_address};
EXPECT_EQ(comm.GetWorldSize(), 6);
EXPECT_EQ(comm.GetRank(), 3);
}
TEST(FederatedCommunicatorSimpleTest, IsDistributed) {
FederatedCommunicator comm{2, 1, kServerAddress};
std::string server_address{GetServerAddress()};
FederatedCommunicator comm{2, 1, server_address};
EXPECT_TRUE(comm.IsDistributed());
}
TEST_F(FederatedCommunicatorTest, Allreduce) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank));
threads.emplace_back(
std::thread(&FederatedCommunicatorTest::VerifyAllreduce, rank, server_address_));
}
for (auto &thread : threads) {
thread.join();
@@ -131,7 +158,8 @@ TEST_F(FederatedCommunicatorTest, Allreduce) {
TEST_F(FederatedCommunicatorTest, Broadcast) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank));
threads.emplace_back(
std::thread(&FederatedCommunicatorTest::VerifyBroadcast, rank, server_address_));
}
for (auto &thread : threads) {
thread.join();

View File

@@ -4,32 +4,45 @@
#include <grpcpp/server_builder.h>
#include <gtest/gtest.h>
#include <iostream>
#include <thread>
#include <ctime>
#include "helpers.h"
#include "federated_client.h"
#include "federated_server.h"
namespace {
/*! \brief Build a "localhost:<port>" address using a random port in [50000, 60000]. */
std::string GetServerAddress() {
  int const chosen_port = GenerateRandomPort(50000, 60000);
  return "localhost:" + std::to_string(chosen_port);
}
} // anonymous namespace
namespace xgboost {
class FederatedServerTest : public ::testing::Test {
public:
static void VerifyAllgather(int rank) {
federated::FederatedClient client{kServerAddress, rank};
static void VerifyAllgather(int rank, const std::string& server_address) {
federated::FederatedClient client{server_address, rank};
CheckAllgather(client, rank);
}
static void VerifyAllreduce(int rank) {
federated::FederatedClient client{kServerAddress, rank};
static void VerifyAllreduce(int rank, const std::string& server_address) {
federated::FederatedClient client{server_address, rank};
CheckAllreduce(client);
}
static void VerifyBroadcast(int rank) {
federated::FederatedClient client{kServerAddress, rank};
static void VerifyBroadcast(int rank, const std::string& server_address) {
federated::FederatedClient client{server_address, rank};
CheckBroadcast(client, rank);
}
static void VerifyMixture(int rank) {
federated::FederatedClient client{kServerAddress, rank};
static void VerifyMixture(int rank, const std::string& server_address) {
federated::FederatedClient client{server_address, rank};
for (auto i = 0; i < 10; i++) {
CheckAllgather(client, rank);
CheckAllreduce(client);
@@ -39,10 +52,11 @@ class FederatedServerTest : public ::testing::Test {
protected:
void SetUp() override {
server_address_ = GetServerAddress();
server_thread_.reset(new std::thread([this] {
grpc::ServerBuilder builder;
federated::FederatedService service{kWorldSize};
builder.AddListeningPort(kServerAddress, grpc::InsecureServerCredentials());
builder.AddListeningPort(server_address_, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
server_ = builder.BuildAndStart();
server_->Wait();
@@ -80,17 +94,15 @@ class FederatedServerTest : public ::testing::Test {
}
static int const kWorldSize{3};
static std::string const kServerAddress;
std::string server_address_;
std::unique_ptr<std::thread> server_thread_;
std::unique_ptr<grpc::Server> server_;
};
std::string const FederatedServerTest::kServerAddress{"localhost:56789"}; // NOLINT(cert-err58-cpp)
TEST_F(FederatedServerTest, Allgather) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank));
threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllgather, rank, server_address_));
}
for (auto& thread : threads) {
thread.join();
@@ -100,7 +112,7 @@ TEST_F(FederatedServerTest, Allgather) {
TEST_F(FederatedServerTest, Allreduce) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank));
threads.emplace_back(std::thread(&FederatedServerTest::VerifyAllreduce, rank, server_address_));
}
for (auto& thread : threads) {
thread.join();
@@ -110,7 +122,7 @@ TEST_F(FederatedServerTest, Allreduce) {
TEST_F(FederatedServerTest, Broadcast) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank));
threads.emplace_back(std::thread(&FederatedServerTest::VerifyBroadcast, rank, server_address_));
}
for (auto& thread : threads) {
thread.join();
@@ -120,7 +132,7 @@ TEST_F(FederatedServerTest, Broadcast) {
TEST_F(FederatedServerTest, Mixture) {
std::vector<std::thread> threads;
for (auto rank = 0; rank < kWorldSize; rank++) {
threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank));
threads.emplace_back(std::thread(&FederatedServerTest::VerifyMixture, rank, server_address_));
}
for (auto& thread : threads) {
thread.join();