[Breaking] Switch from rabit to the collective communicator (#8257)

* Switch from rabit to the collective communicator
* fix size_t specialization
* really fix size_t
* try again
* add include
* more include
* fix lint errors
* remove rabit includes
* fix pylint error
* return dict from communicator context
* fix communicator shutdown
* fix dask test
* reset communicator mocklist
* fix distributed tests
* do not save device communicator
* fix jvm gpu tests
* add python test for federated communicator
* Update gputreeshap submodule

Co-authored-by: Hyunsu Philip Cho <chohyu01@cs.washington.edu>
2022-10-05 15:39:01 -07:00
parent e47b3a3da3
commit 668b8a0ea4
79 changed files with 805 additions and 2212 deletions
--- a/python-package/xgboost/rabit.py
+++ b/python-package/xgboost/rabit.py
@@ -1,249 +0,0 @@
-"""Distributed XGBoost Rabit related API."""
-import ctypes
-from enum import IntEnum, unique
-import logging
-import pickle
-from typing import Any, TypeVar, Callable, Optional, cast, List, Union
-
-import numpy as np
-
-from .core import _LIB, c_str, _check_call
-
-# Module-level logger; RabitContext writes its debug messages through it.
-LOGGER = logging.getLogger("[xgboost.rabit]")
-
-
-def _init_rabit() -> None:
-    """Declare ``c_int`` as the return type of the integer-valued Rabit queries.
-
-    Does nothing when the native library handle ``_LIB`` is ``None``.
-    """
-    if _LIB is None:
-        return
-    int_returning = (
-        "RabitGetRank",
-        "RabitGetWorldSize",
-        "RabitIsDistributed",
-        "RabitVersionNumber",
-    )
-    for symbol in int_returning:
-        getattr(_LIB, symbol).restype = ctypes.c_int
-
-
-def init(args: Optional[List[bytes]] = None) -> None:
-    """Start rabit, forwarding ``args`` to the native ``RabitInit`` call.
-
-    Parameters
-    ----------
-    args :
-        Byte-string arguments handed to the native library; ``None`` is
-        treated as an empty list.
-    """
-    arguments = [] if args is None else args
-    argv_type = ctypes.c_char_p * len(arguments)
-    argv = argv_type()
-    # Slice assignment converts each bytes object into a C ``char*`` entry.
-    argv[:] = cast(List[Union[ctypes.c_char_p, bytes, None, int]], arguments)
-    _LIB.RabitInit(len(argv), argv)
-
-
-def finalize() -> None:
-    """Finalize the process via the native ``RabitFinalize`` call.
-
-    Per the original description, this notifies the tracker that everything
-    is done. The native return value is not checked.
-    """
-    _LIB.RabitFinalize()
-
-
-def get_rank() -> int:
-    """Return the value of the native ``RabitGetRank`` call.
-
-    Returns
-    -------
-    rank : int
-        Rank of current process.
-    """
-    return _LIB.RabitGetRank()
-
-
-def get_world_size() -> int:
-    """Return the value of the native ``RabitGetWorldSize`` call.
-
-    Returns
-    -------
-    n : int
-        Total number of workers.
-    """
-    return _LIB.RabitGetWorldSize()
-
-
-def is_distributed() -> int:
-    """Return the integer flag from the native ``RabitIsDistributed`` call.
-
-    The value is returned as-is (an ``int``, not a ``bool``).
-    """
-    return _LIB.RabitIsDistributed()
-
-
-def tracker_print(msg: Any) -> None:
-    """Send a message to the tracker, or print it locally when not distributed.
-
-    Parameters
-    ----------
-    msg : str
-        The message to report. Non-string values are converted with ``str``.
-    """
-    text = msg if isinstance(msg, str) else str(msg)
-    if _LIB.RabitIsDistributed() == 0:
-        # No tracker to talk to: fall back to stdout, stripped and flushed.
-        print(text.strip(), flush=True)
-        return
-    _check_call(_LIB.RabitTrackerPrint(c_str(text)))
-
-
-def get_processor_name() -> bytes:
-    """Fetch the processor (host) name from the native library.
-
-    Returns
-    -------
-    name : bytes
-        Contents of a 256-byte buffer filled by ``RabitGetProcessorName``,
-        up to the first NUL byte.
-    """
-    capacity = 256
-    name_buf = ctypes.create_string_buffer(capacity)
-    # The native call also reports a length; only the buffer contents are used.
-    reported_len = ctypes.c_ulong()
-    _LIB.RabitGetProcessorName(name_buf, ctypes.byref(reported_len), capacity)
-    return name_buf.value
-
-
-# Generic type variable: broadcast() is annotated to return the type it is given.
-T = TypeVar("T")                # pylint:disable=invalid-name
-
-
-def broadcast(data: T, root: int) -> T:
-    """Broadcast a picklable object from one node to all other nodes.
-
-    The payload travels in two native broadcasts: first its length, then the
-    pickled bytes.
-
-    Parameters
-    ----------
-    data : any type that can be pickled
-        Object to send. Only read on the root; other ranks may pass None.
-    root : int
-        Rank of the node to broadcast data from.
-
-    Returns
-    -------
-    object : same type as ``data`` on the root
-        ``data`` itself on the root, the unpickled copy on every other rank.
-
-    Notes
-    -----
-    Non-root ranks call ``pickle.loads`` on the received bytes, which can
-    execute arbitrary code. Only use this between trusted workers.
-    """
-    is_root = root == get_rank()
-    length = ctypes.c_ulong()
-    payload = b""
-    if is_root:
-        assert data is not None, 'need to pass in data when broadcasting'
-        payload = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
-        length.value = len(payload)
-    # Round one: every rank learns the payload size from the root.
-    _check_call(_LIB.RabitBroadcast(ctypes.byref(length),
-                                    ctypes.sizeof(ctypes.c_ulong), root))
-    if is_root:
-        # Round two, sending side: `payload` stays referenced for the whole call.
-        _check_call(_LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(payload), ctypes.c_void_p),
-                                        length.value, root))
-        return data
-    # Round two, receiving side: allocate exactly the announced number of bytes.
-    received = (ctypes.c_char * length.value)()
-    _check_call(_LIB.RabitBroadcast(ctypes.cast(received, ctypes.c_void_p),
-                                    length.value, root))
-    # SECURITY: unpickling peer-supplied bytes; safe only with trusted workers.
-    return pickle.loads(received.raw)
-
-
-# Integer codes for the dtypes accepted by allreduce, which passes them to the
-# native call. A dtype's code is its position in the tuple below.
-DTYPE_ENUM__ = {
-    np.dtype(type_name): code
-    for code, type_name in enumerate(
-        (
-            "int8", "uint8",
-            "int32", "uint32",
-            "int64", "uint64",
-            "float32", "float64",
-        )
-    )
-}
-
-
-@unique
-class Op(IntEnum):
-    """Supported reduction operations for rabit.
-
-    ``allreduce`` passes the integer value of the chosen member to the native
-    ``RabitAllreduce`` call, so the numbers below are part of that interface.
-    """
-    MAX = 0
-    MIN = 1
-    SUM = 2
-    OR = 3
-
-
-def allreduce(                  # pylint:disable=invalid-name
-    data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None
-) -> np.ndarray:
-    """Perform allreduce, return the result.
-
-    Parameters
-    ----------
-    data :
-        Input data. Its dtype must be one of the keys of ``DTYPE_ENUM__``.
-    op :
-        Reduction operator, one of ``Op.MAX``, ``Op.MIN``, ``Op.SUM``, ``Op.OR``.
-    prepare_fun :
-        Lazy preprocessing function. If it is not None, it is handed to the
-        native call as a callback that invokes ``prepare_fun(data)``. Per the
-        original notes, the callback is NOT invoked when the result of
-        Allreduce can be recovered directly.
-
-    Returns
-    -------
-    result :
-        The reduced values as a flattened (1-D) array, i.e. ``data.ravel()``
-        or a copy of it, not an array with the original shape of ``data``.
-
-    Raises
-    ------
-    TypeError
-        If ``data`` is not a ``numpy.ndarray`` or its dtype is not supported.
-
-    Notes
-    -----
-    This function is not thread-safe.
-    """
-    if not isinstance(data, np.ndarray):
-        raise TypeError('allreduce only takes in numpy.ndarray')
-    buf = data.ravel()
-    # NOTE(review): the copy is made only when the flattened array reports the
-    # same ``base`` object as ``data``. Presumably this is meant to keep the
-    # caller's array untouched, but when ``ravel()`` returns a view whose base
-    # is ``data`` itself no copy happens -- confirm the intended condition.
-    if buf.base is data.base:
-        buf = buf.copy()
-    if buf.dtype not in DTYPE_ENUM__:
-        raise TypeError(f"data type {buf.dtype} not supported")
-
-    if prepare_fun is None:
-        prepare_cb = None
-    else:
-        func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-
-        def pfunc(_: Any) -> None:
-            """Adapter with the C callback signature; ignores its argument."""
-            fn = cast(Callable[[np.ndarray], None], prepare_fun)
-            fn(data)
-
-        # Held in a local so the callback object outlives the native call.
-        prepare_cb = func_ptr(pfunc)
-
-    # One call site for both cases; ``op`` is always passed as a plain int.
-    _check_call(_LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p),
-                                    buf.size, DTYPE_ENUM__[buf.dtype],
-                                    int(op), prepare_cb, None))
-    return buf
-
-
-def version_number() -> int:
-    """Return the value of the native ``RabitVersionNumber`` call.
-
-    Per the original notes, this is the version number of the currently
-    stored model, i.e. how many calls to CheckPoint were made so far.
-
-    Returns
-    -------
-    version : int
-        Version number of currently stored model
-    """
-    return _LIB.RabitVersionNumber()
-
-
-class RabitContext:
-    """A context controlling rabit initialization and finalization.
-
-    Entering calls ``init`` with the stored arguments and asserts that rabit
-    reports itself as distributed; leaving calls ``finalize``. ``__enter__``
-    returns ``None``, so ``with RabitContext(...) as ctx`` binds ``None``.
-
-    Parameters
-    ----------
-    args :
-        Byte-string arguments forwarded to ``init``. ``None`` becomes a fresh
-        empty list; a supplied list is stored as-is (not copied).
-    """
-
-    def __init__(self, args: Optional[List[bytes]] = None) -> None:
-        self.args = [] if args is None else args
-
-    def __enter__(self) -> None:
-        init(self.args)
-        # If this assertion fails, __exit__ is not run and finalize() is not called.
-        assert is_distributed()
-        LOGGER.debug("-------------- rabit say hello ------------------")
-
-    def __exit__(self, *args: Any) -> None:
-        # Exception details are ignored; returning None lets exceptions propagate.
-        finalize()
-        LOGGER.debug("--------------- rabit say bye ------------------")
-
-
-# Import-time side effect: declare the native functions' return types as soon
-# as this module is loaded.
-_init_rabit()