From 0f37a01dd9063d84d187c8f82dd4c493707ce695 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 7 Feb 2023 01:53:33 +0800 Subject: [PATCH] Require black formatter for the python package. (#8748) --- python-package/setup.py | 321 +++++++++--------- python-package/xgboost/collective.py | 59 ++-- python-package/xgboost/core.py | 484 +++++++++++++++------------ python-package/xgboost/data.py | 107 +++--- python-package/xgboost/libpath.py | 58 ++-- python-package/xgboost/plotting.py | 57 ++-- python-package/xgboost/rabit.py | 5 +- python-package/xgboost/tracker.py | 69 ++-- python-package/xgboost/training.py | 110 +++--- tests/ci_build/lint_python.py | 11 +- 10 files changed, 707 insertions(+), 574 deletions(-) diff --git a/python-package/setup.py b/python-package/setup.py index 9cb8c9154..fe1cbf2e9 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -19,17 +19,17 @@ sys.path.insert(0, CURRENT_DIR) # requires using CMake directly. USER_OPTIONS = { # libxgboost options. - 'use-openmp': (None, 'Build with OpenMP support.', 1), - 'use-cuda': (None, 'Build with GPU acceleration.', 0), - 'use-nccl': (None, 'Build with NCCL to enable distributed GPU support.', 0), - 'build-with-shared-nccl': (None, 'Build with shared NCCL library.', 0), - 'hide-cxx-symbols': (None, 'Hide all C++ symbols during build.', 1), - 'use-hdfs': (None, 'Build with HDFS support', 0), - 'use-azure': (None, 'Build with AZURE support.', 0), - 'use-s3': (None, 'Build with S3 support', 0), - 'plugin-dense-parser': (None, 'Build dense parser plugin.', 0), + "use-openmp": (None, "Build with OpenMP support.", 1), + "use-cuda": (None, "Build with GPU acceleration.", 0), + "use-nccl": (None, "Build with NCCL to enable distributed GPU support.", 0), + "build-with-shared-nccl": (None, "Build with shared NCCL library.", 0), + "hide-cxx-symbols": (None, "Hide all C++ symbols during build.", 1), + "use-hdfs": (None, "Build with HDFS support", 0), + "use-azure": (None, "Build with AZURE support.", 0), + "use-s3": (None, "Build with S3 support", 0), + "plugin-dense-parser": (None, "Build dense parser plugin.", 0), # Python specific - 'use-system-libxgboost': (None, 'Use libxgboost.so in system path.', 0) + "use-system-libxgboost": (None, "Use libxgboost.so in system path.", 0), } NEED_CLEAN_TREE = set() @@ -38,20 +38,21 @@ BUILD_TEMP_DIR = None def lib_name() -> str: - '''Return platform dependent shared object name.''' - if system() == 'Linux' or system().upper().endswith('BSD'): - name = 'libxgboost.so' - elif system() == 'Darwin': - name = 'libxgboost.dylib' - elif system() == 'Windows': - name = 'xgboost.dll' - elif system() == 'OS400': - name = 'libxgboost.so' + """Return platform dependent shared object name.""" + if system() == "Linux" or system().upper().endswith("BSD"): + name = "libxgboost.so" + elif system() == "Darwin": + name = "libxgboost.dylib" + elif system() == "Windows": + name = "xgboost.dll" + elif system() == "OS400": + name = "libxgboost.so" return name def copy_tree(src_dir: str, target_dir: str) -> None: - '''Copy source tree into build directory.''' + """Copy source tree into build directory.""" + def clean_copy_tree(src: str, dst: str) -> None: shutil.copytree(src, dst) NEED_CLEAN_TREE.add(os.path.abspath(dst)) @@ -60,30 +61,30 @@ def copy_tree(src_dir: str, target_dir: str) -> None: shutil.copy(src, dst) NEED_CLEAN_FILE.add(os.path.abspath(dst)) - src = os.path.join(src_dir, 'src') - inc = os.path.join(src_dir, 'include') - dmlc_core = os.path.join(src_dir, 'dmlc-core') + src = 
os.path.join(src_dir, "src") + inc = os.path.join(src_dir, "include") + dmlc_core = os.path.join(src_dir, "dmlc-core") gputreeshap = os.path.join(src_dir, "gputreeshap") - rabit = os.path.join(src_dir, 'rabit') - cmake = os.path.join(src_dir, 'cmake') - plugin = os.path.join(src_dir, 'plugin') + rabit = os.path.join(src_dir, "rabit") + cmake = os.path.join(src_dir, "cmake") + plugin = os.path.join(src_dir, "plugin") - clean_copy_tree(src, os.path.join(target_dir, 'src')) - clean_copy_tree(inc, os.path.join(target_dir, 'include')) - clean_copy_tree(dmlc_core, os.path.join(target_dir, 'dmlc-core')) + clean_copy_tree(src, os.path.join(target_dir, "src")) + clean_copy_tree(inc, os.path.join(target_dir, "include")) + clean_copy_tree(dmlc_core, os.path.join(target_dir, "dmlc-core")) clean_copy_tree(gputreeshap, os.path.join(target_dir, "gputreeshap")) - clean_copy_tree(rabit, os.path.join(target_dir, 'rabit')) - clean_copy_tree(cmake, os.path.join(target_dir, 'cmake')) - clean_copy_tree(plugin, os.path.join(target_dir, 'plugin')) + clean_copy_tree(rabit, os.path.join(target_dir, "rabit")) + clean_copy_tree(cmake, os.path.join(target_dir, "cmake")) + clean_copy_tree(plugin, os.path.join(target_dir, "plugin")) - cmake_list = os.path.join(src_dir, 'CMakeLists.txt') - clean_copy_file(cmake_list, os.path.join(target_dir, 'CMakeLists.txt')) - lic = os.path.join(src_dir, 'LICENSE') - clean_copy_file(lic, os.path.join(target_dir, 'LICENSE')) + cmake_list = os.path.join(src_dir, "CMakeLists.txt") + clean_copy_file(cmake_list, os.path.join(target_dir, "CMakeLists.txt")) + lic = os.path.join(src_dir, "LICENSE") + clean_copy_file(lic, os.path.join(target_dir, "LICENSE")) def clean_up() -> None: - '''Removed copied files.''' + """Removed copied files.""" for path in NEED_CLEAN_TREE: shutil.rmtree(path) for path in NEED_CLEAN_FILE: @@ -91,15 +92,16 @@ def clean_up() -> None: class CMakeExtension(Extension): # pylint: disable=too-few-public-methods - '''Wrapper for extension''' + """Wrapper for extension""" + def __init__(self, name: str) -> None: super().__init__(name=name, sources=[]) class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors - '''Custom build_ext command using CMake.''' + """Custom build_ext command using CMake.""" - logger = logging.getLogger('XGBoost build_ext') + logger = logging.getLogger("XGBoost build_ext") # pylint: disable=too-many-arguments def build( @@ -110,157 +112,171 @@ class BuildExt(build_ext.build_ext): # pylint: disable=too-many-ancestors build_tool: Optional[str] = None, use_omp: int = 1, ) -> None: - '''Build the core library with CMake.''' - cmake_cmd = ['cmake', src_dir, generator] + """Build the core library with CMake.""" + cmake_cmd = ["cmake", src_dir, generator] for k, v in USER_OPTIONS.items(): - arg = k.replace('-', '_').upper() + arg = k.replace("-", "_").upper() value = str(v[2]) - if arg == 'USE_SYSTEM_LIBXGBOOST': + if arg == "USE_SYSTEM_LIBXGBOOST": continue - if arg == 'USE_OPENMP' and use_omp == 0: + if arg == "USE_OPENMP" and use_omp == 0: cmake_cmd.append("-D" + arg + "=0") continue - cmake_cmd.append('-D' + arg + '=' + value) + cmake_cmd.append("-D" + arg + "=" + value) # Flag for cross-compiling for Apple Silicon # We use environment variable because it's the only way to pass down custom flags # through the cibuildwheel package, which otherwise calls `python setup.py bdist_wheel` # command. 
- if 'CIBW_TARGET_OSX_ARM64' in os.environ: + if "CIBW_TARGET_OSX_ARM64" in os.environ: cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64") - self.logger.info('Run CMake command: %s', str(cmake_cmd)) + self.logger.info("Run CMake command: %s", str(cmake_cmd)) subprocess.check_call(cmake_cmd, cwd=build_dir) - if system() != 'Windows': + if system() != "Windows": nproc = os.cpu_count() assert build_tool is not None - subprocess.check_call([build_tool, '-j' + str(nproc)], - cwd=build_dir) + subprocess.check_call([build_tool, "-j" + str(nproc)], cwd=build_dir) else: - subprocess.check_call(['cmake', '--build', '.', - '--config', 'Release'], cwd=build_dir) + subprocess.check_call( + ["cmake", "--build", ".", "--config", "Release"], cwd=build_dir + ) def build_cmake_extension(self) -> None: - '''Configure and build using CMake''' - if USER_OPTIONS['use-system-libxgboost'][2]: - self.logger.info('Using system libxgboost.') + """Configure and build using CMake""" + if USER_OPTIONS["use-system-libxgboost"][2]: + self.logger.info("Using system libxgboost.") return build_dir = self.build_temp global BUILD_TEMP_DIR # pylint: disable=global-statement BUILD_TEMP_DIR = build_dir libxgboost = os.path.abspath( - os.path.join(CURRENT_DIR, os.path.pardir, 'lib', lib_name())) + os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name()) + ) if os.path.exists(libxgboost): - self.logger.info('Found shared library, skipping build.') + self.logger.info("Found shared library, skipping build.") return - src_dir = 'xgboost' + src_dir = "xgboost" try: - copy_tree(os.path.join(CURRENT_DIR, os.path.pardir), - os.path.join(self.build_temp, src_dir)) + copy_tree( + os.path.join(CURRENT_DIR, os.path.pardir), + os.path.join(self.build_temp, src_dir), + ) except Exception: # pylint: disable=broad-except copy_tree(src_dir, os.path.join(self.build_temp, src_dir)) - self.logger.info('Building from source. %s', libxgboost) + self.logger.info("Building from source. %s", libxgboost) if not os.path.exists(build_dir): os.mkdir(build_dir) - if shutil.which('ninja'): - build_tool = 'ninja' + if shutil.which("ninja"): + build_tool = "ninja" else: - build_tool = 'make' - if sys.platform.startswith('os400'): - build_tool = 'make' + build_tool = "make" + if sys.platform.startswith("os400"): + build_tool = "make" - if system() == 'Windows': + if system() == "Windows": # Pick up from LGB, just test every possible tool chain. 
for vs in ( "-GVisual Studio 17 2022", - '-GVisual Studio 16 2019', - '-GVisual Studio 15 2017', - '-GVisual Studio 14 2015', - '-GMinGW Makefiles', + "-GVisual Studio 16 2019", + "-GVisual Studio 15 2017", + "-GVisual Studio 14 2015", + "-GMinGW Makefiles", ): try: self.build(src_dir, build_dir, vs) self.logger.info( - '%s is used for building Windows distribution.', vs) + "%s is used for building Windows distribution.", vs + ) break except subprocess.CalledProcessError: shutil.rmtree(build_dir) os.mkdir(build_dir) continue else: - gen = '-GNinja' if build_tool == 'ninja' else '-GUnix Makefiles' + gen = "-GNinja" if build_tool == "ninja" else "-GUnix Makefiles" try: self.build(src_dir, build_dir, gen, build_tool, use_omp=1) except subprocess.CalledProcessError: - self.logger.warning('Disabling OpenMP support.') + self.logger.warning("Disabling OpenMP support.") self.build(src_dir, build_dir, gen, build_tool, use_omp=0) def build_extension(self, ext: Extension) -> None: - '''Override the method for dispatching.''' + """Override the method for dispatching.""" if isinstance(ext, CMakeExtension): self.build_cmake_extension() else: super().build_extension(ext) def copy_extensions_to_source(self) -> None: - '''Dummy override. Invoked during editable installation. Our binary + """Dummy override. Invoked during editable installation. Our binary should available in `lib`. - ''' + """ if not os.path.exists( - os.path.join(CURRENT_DIR, os.path.pardir, 'lib', lib_name())): - raise ValueError('For using editable installation, please ' + - 'build the shared object first with CMake.') + os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name()) + ): + raise ValueError( + "For using editable installation, please " + + "build the shared object first with CMake." + ) -class Sdist(sdist.sdist): # pylint: disable=too-many-ancestors - '''Copy c++ source into Python directory.''' - logger = logging.getLogger('xgboost sdist') +class Sdist(sdist.sdist): # pylint: disable=too-many-ancestors + """Copy c++ source into Python directory.""" + + logger = logging.getLogger("xgboost sdist") def run(self) -> None: - copy_tree(os.path.join(CURRENT_DIR, os.path.pardir), - os.path.join(CURRENT_DIR, 'xgboost')) - libxgboost = os.path.join( - CURRENT_DIR, os.path.pardir, 'lib', lib_name()) + copy_tree( + os.path.join(CURRENT_DIR, os.path.pardir), + os.path.join(CURRENT_DIR, "xgboost"), + ) + libxgboost = os.path.join(CURRENT_DIR, os.path.pardir, "lib", lib_name()) if os.path.exists(libxgboost): self.logger.warning( - 'Found shared library, removing to avoid being included in source distribution.' + "Found shared library, removing to avoid being included in source distribution." 
) os.remove(libxgboost) super().run() class InstallLib(install_lib.install_lib): - '''Copy shared object into installation directory.''' - logger = logging.getLogger('xgboost install_lib') + """Copy shared object into installation directory.""" + + logger = logging.getLogger("xgboost install_lib") def install(self) -> List[str]: outfiles = super().install() - if USER_OPTIONS['use-system-libxgboost'][2] != 0: - self.logger.info('Using system libxgboost.') - lib_path = os.path.join(sys.prefix, 'lib') - msg = 'use-system-libxgboost is specified, but ' + lib_name() + \ - ' is not found in: ' + lib_path + if USER_OPTIONS["use-system-libxgboost"][2] != 0: + self.logger.info("Using system libxgboost.") + lib_path = os.path.join(sys.prefix, "lib") + msg = ( + "use-system-libxgboost is specified, but " + + lib_name() + + " is not found in: " + + lib_path + ) assert os.path.exists(os.path.join(lib_path, lib_name())), msg return [] - lib_dir = os.path.join(self.install_dir, 'xgboost', 'lib') + lib_dir = os.path.join(self.install_dir, "xgboost", "lib") if not os.path.exists(lib_dir): os.mkdir(lib_dir) - dst = os.path.join(self.install_dir, 'xgboost', 'lib', lib_name()) + dst = os.path.join(self.install_dir, "xgboost", "lib", lib_name()) libxgboost_path = lib_name() assert BUILD_TEMP_DIR is not None - dft_lib_dir = os.path.join(CURRENT_DIR, os.path.pardir, 'lib') - build_dir = os.path.join(BUILD_TEMP_DIR, 'xgboost', 'lib') + dft_lib_dir = os.path.join(CURRENT_DIR, os.path.pardir, "lib") + build_dir = os.path.join(BUILD_TEMP_DIR, "xgboost", "lib") if os.path.exists(os.path.join(dft_lib_dir, libxgboost_path)): # The library is built by CMake directly @@ -268,18 +284,21 @@ class InstallLib(install_lib.install_lib): else: # The library is built by setup.py src = os.path.join(build_dir, libxgboost_path) - self.logger.info('Installing shared library: %s', src) + self.logger.info("Installing shared library: %s", src) dst, _ = self.copy_file(src, dst) outfiles.append(dst) return outfiles class Install(install.install): # pylint: disable=too-many-instance-attributes - '''An interface to install command, accepting XGBoost specific + """An interface to install command, accepting XGBoost specific arguments. - ''' - user_options = install.install.user_options + [(k, v[0], v[1]) for k, v in USER_OPTIONS.items()] + """ + + user_options = install.install.user_options + [ + (k, v[0], v[1]) for k, v in USER_OPTIONS.items() + ] def initialize_options(self) -> None: super().initialize_options() @@ -302,13 +321,13 @@ class Install(install.install): # pylint: disable=too-many-instance-attributes # arguments, then here we propagate them into `USER_OPTIONS` for visibility to # other sub-commands like `build_ext`. 
for k, v in USER_OPTIONS.items(): - arg = k.replace('-', '_') + arg = k.replace("-", "_") if hasattr(self, arg): USER_OPTIONS[k] = (v[0], v[1], getattr(self, arg)) super().run() -if __name__ == '__main__': +if __name__ == "__main__": # Supported commands: # From internet: # - pip install xgboost @@ -326,51 +345,55 @@ if __name__ == '__main__': # - python setup.py develop # same as above logging.basicConfig(level=logging.INFO) - with open(os.path.join(CURRENT_DIR, 'README.rst'), encoding='utf-8') as fd: + with open(os.path.join(CURRENT_DIR, "README.rst"), encoding="utf-8") as fd: description = fd.read() - with open(os.path.join(CURRENT_DIR, 'xgboost/VERSION'), encoding="ascii") as fd: + with open(os.path.join(CURRENT_DIR, "xgboost/VERSION"), encoding="ascii") as fd: version = fd.read().strip() - setup(name='xgboost', - version=version, - description="XGBoost Python Package", - long_description=description, - long_description_content_type="text/x-rst", - install_requires=[ - 'numpy', - 'scipy', - ], - ext_modules=[CMakeExtension('libxgboost')], - # error: expected "str": "Type[Command]" - cmdclass={ - 'build_ext': BuildExt, # type: ignore - 'sdist': Sdist, # type: ignore - 'install_lib': InstallLib, # type: ignore - 'install': Install # type: ignore - }, - extras_require={ - 'pandas': ['pandas'], - 'scikit-learn': ['scikit-learn'], - 'dask': ['dask', 'pandas', 'distributed'], - 'datatable': ['datatable'], - 'plotting': ['graphviz', 'matplotlib'], - "pyspark": ["pyspark", "scikit-learn", "cloudpickle"], - }, - maintainer='Hyunsu Cho', - maintainer_email='chohyu01@cs.washington.edu', - zip_safe=False, - packages=find_packages(), - include_package_data=True, - license='Apache-2.0', - classifiers=['License :: OSI Approved :: Apache Software License', - 'Development Status :: 5 - Production/Stable', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10'], - python_requires=">=3.8", - url='https://github.com/dmlc/xgboost') + setup( + name="xgboost", + version=version, + description="XGBoost Python Package", + long_description=description, + long_description_content_type="text/x-rst", + install_requires=[ + "numpy", + "scipy", + ], + ext_modules=[CMakeExtension("libxgboost")], + # error: expected "str": "Type[Command]" + cmdclass={ + "build_ext": BuildExt, # type: ignore + "sdist": Sdist, # type: ignore + "install_lib": InstallLib, # type: ignore + "install": Install, # type: ignore + }, + extras_require={ + "pandas": ["pandas"], + "scikit-learn": ["scikit-learn"], + "dask": ["dask", "pandas", "distributed"], + "datatable": ["datatable"], + "plotting": ["graphviz", "matplotlib"], + "pyspark": ["pyspark", "scikit-learn", "cloudpickle"], + }, + maintainer="Hyunsu Cho", + maintainer_email="chohyu01@cs.washington.edu", + zip_safe=False, + packages=find_packages(), + include_package_data=True, + license="Apache-2.0", + classifiers=[ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 5 - Production/Stable", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + python_requires=">=3.8", + url="https://github.com/dmlc/xgboost", + ) clean_up() diff --git a/python-package/xgboost/collective.py 
b/python-package/xgboost/collective.py index 45d018cc7..7c586cba7 100644 --- a/python-package/xgboost/collective.py +++ b/python-package/xgboost/collective.py @@ -152,42 +152,52 @@ def broadcast(data: _T, root: int) -> _T: rank = get_rank() length = ctypes.c_ulong() if root == rank: - assert data is not None, 'need to pass in data when broadcasting' + assert data is not None, "need to pass in data when broadcasting" s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) length.value = len(s) # run first broadcast - _check_call(_LIB.XGCommunicatorBroadcast(ctypes.byref(length), - ctypes.sizeof(ctypes.c_ulong), root)) + _check_call( + _LIB.XGCommunicatorBroadcast( + ctypes.byref(length), ctypes.sizeof(ctypes.c_ulong), root + ) + ) if root != rank: dptr = (ctypes.c_char * length.value)() # run second - _check_call(_LIB.XGCommunicatorBroadcast(ctypes.cast(dptr, ctypes.c_void_p), - length.value, root)) + _check_call( + _LIB.XGCommunicatorBroadcast( + ctypes.cast(dptr, ctypes.c_void_p), length.value, root + ) + ) data = pickle.loads(dptr.raw) del dptr else: - _check_call(_LIB.XGCommunicatorBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p), - length.value, root)) + _check_call( + _LIB.XGCommunicatorBroadcast( + ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p), length.value, root + ) + ) del s return data # enumeration of dtypes DTYPE_ENUM__ = { - np.dtype('int8'): 0, - np.dtype('uint8'): 1, - np.dtype('int32'): 2, - np.dtype('uint32'): 3, - np.dtype('int64'): 4, - np.dtype('uint64'): 5, - np.dtype('float32'): 6, - np.dtype('float64'): 7 + np.dtype("int8"): 0, + np.dtype("uint8"): 1, + np.dtype("int32"): 2, + np.dtype("uint32"): 3, + np.dtype("int64"): 4, + np.dtype("uint64"): 5, + np.dtype("float32"): 6, + np.dtype("float64"): 7, } @unique class Op(IntEnum): """Supported operations for allreduce.""" + MAX = 0 MIN = 1 SUM = 2 @@ -196,9 +206,7 @@ class Op(IntEnum): BITWISE_XOR = 5 -def allreduce( # pylint:disable=invalid-name - data: np.ndarray, op: Op -) -> np.ndarray: +def allreduce(data: np.ndarray, op: Op) -> np.ndarray: # pylint:disable=invalid-name """Perform allreduce, return the result. Parameters @@ -218,15 +226,22 @@ def allreduce( # pylint:disable=invalid-name This function is not thread-safe. 
""" if not isinstance(data, np.ndarray): - raise TypeError('allreduce only takes in numpy.ndarray') + raise TypeError("allreduce only takes in numpy.ndarray") buf = data.ravel() if buf.base is data.base: buf = buf.copy() if buf.dtype not in DTYPE_ENUM__: raise Exception(f"data type {buf.dtype} not supported") - _check_call(_LIB.XGCommunicatorAllreduce(buf.ctypes.data_as(ctypes.c_void_p), - buf.size, DTYPE_ENUM__[buf.dtype], - int(op), None, None)) + _check_call( + _LIB.XGCommunicatorAllreduce( + buf.ctypes.data_as(ctypes.c_void_p), + buf.size, + DTYPE_ENUM__[buf.dtype], + int(op), + None, + None, + ) + ) return buf diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index c64737957..ff33a6842 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -81,7 +81,7 @@ def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array return bytes(data, "utf-8") if isinstance(data, list): pointers: ctypes.Array[ctypes.c_char_p] = (ctypes.c_char_p * len(data))() - data_as_bytes = [bytes(d, 'utf-8') for d in data] + data_as_bytes = [bytes(d, "utf-8") for d in data] pointers[:] = data_as_bytes # type: ignore return pointers raise TypeError() @@ -100,9 +100,9 @@ def from_cstr_to_pystr(data: CStrPptr, length: c_bst_ulong) -> List[str]: res = [] for i in range(length.value): try: - res.append(str(cast(bytes, data[i]).decode('ascii'))) + res.append(str(cast(bytes, data[i]).decode("ascii"))) except UnicodeDecodeError: - res.append(str(cast(bytes, data[i]).decode('utf-8'))) + res.append(str(cast(bytes, data[i]).decode("utf-8"))) return res @@ -115,15 +115,13 @@ IterRange = TypeVar("IterRange", Optional[Tuple[int, int]], Tuple[int, int]) def _convert_ntree_limit( - booster: "Booster", - ntree_limit: Optional[int], - iteration_range: IterRange + booster: "Booster", ntree_limit: Optional[int], iteration_range: IterRange ) -> IterRange: if ntree_limit is not None and ntree_limit != 0: warnings.warn( "ntree_limit is deprecated, use `iteration_range` or model " "slicing instead.", - UserWarning + UserWarning, ) if iteration_range is not None and iteration_range[1] != 0: raise ValueError( @@ -149,12 +147,12 @@ def _expect(expectations: Sequence[Type], got: Type) -> str: ------- msg: str """ - msg = 'Expecting ' + msg = "Expecting " for t in range(len(expectations) - 1): msg += str(expectations[t]) - msg += ' or ' + msg += " or " msg += str(expectations[-1]) - msg += '. Got ' + str(got) + msg += ". Got " + str(got) return msg @@ -367,17 +365,17 @@ def ctypes2cupy(cptr: CNumericPtr, length: int, dtype: Type[np.number]) -> CupyT def ctypes2buffer(cptr: CStrPtr, length: int) -> bytearray: """Convert ctypes pointer to buffer type.""" if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): - raise RuntimeError('expected char pointer') + raise RuntimeError("expected char pointer") res = bytearray(length) rptr = (ctypes.c_char * length).from_buffer(res) if not ctypes.memmove(rptr, cptr, length): - raise RuntimeError('memmove failed') + raise RuntimeError("memmove failed") return res def c_str(string: str) -> ctypes.c_char_p: """Convert a python string to cstring.""" - return ctypes.c_char_p(string.encode('utf-8')) + return ctypes.c_char_p(string.encode("utf-8")) def c_array( @@ -496,6 +494,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes pointer. 
""" + @require_keyword_args(True) def input_data( data: Any, @@ -525,6 +524,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes **kwargs, ) self._input_id = id(data) + # pylint: disable=not-callable return self._handle_exception(lambda: self.next(input_data), 0) @@ -625,6 +625,7 @@ _deprecate_positional_args = require_keyword_args(False) @unique class DataSplitMode(IntEnum): """Supported data split mode for DMatrix.""" + ROW = 0 COL = 1 @@ -637,6 +638,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m multiple different sources of data. """ + @_deprecate_positional_args def __init__( self, @@ -774,9 +776,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m } args_cstr = from_pystr_to_cstr(json.dumps(args)) handle = ctypes.c_void_p() - reset_callback, next_callback = it.get_callbacks( - True, enable_categorical - ) + reset_callback, next_callback = it.get_callbacks(True, enable_categorical) ret = _LIB.XGDMatrixCreateFromCallback( None, it.proxy.handle, @@ -808,7 +808,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m label_upper_bound: Optional[ArrayLike] = None, feature_names: Optional[FeatureNames] = None, feature_types: Optional[FeatureTypes] = None, - feature_weights: Optional[ArrayLike] = None + feature_weights: Optional[ArrayLike] = None, ) -> None: """Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`.""" from .data import dispatch_meta_backend @@ -822,18 +822,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m if group is not None: self.set_group(group) if qid is not None: - self.set_uint_info('qid', qid) + self.set_uint_info("qid", qid) if label_lower_bound is not None: - self.set_float_info('label_lower_bound', label_lower_bound) + self.set_float_info("label_lower_bound", label_lower_bound) if label_upper_bound is not None: - self.set_float_info('label_upper_bound', label_upper_bound) + self.set_float_info("label_upper_bound", label_upper_bound) if feature_names is not None: self.feature_names = feature_names if feature_types is not None: self.feature_types = feature_types if feature_weights is not None: - dispatch_meta_backend(matrix=self, data=feature_weights, - name='feature_weights') + dispatch_meta_backend( + matrix=self, data=feature_weights, name="feature_weights" + ) def get_float_info(self, field: str) -> np.ndarray: """Get float property from the DMatrix. 
@@ -850,10 +851,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ length = c_bst_ulong() ret = ctypes.POINTER(ctypes.c_float)() - _check_call(_LIB.XGDMatrixGetFloatInfo(self.handle, - c_str(field), - ctypes.byref(length), - ctypes.byref(ret))) + _check_call( + _LIB.XGDMatrixGetFloatInfo( + self.handle, c_str(field), ctypes.byref(length), ctypes.byref(ret) + ) + ) return ctypes2numpy(ret, length.value, np.float32) def get_uint_info(self, field: str) -> np.ndarray: @@ -871,10 +873,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ length = c_bst_ulong() ret = ctypes.POINTER(ctypes.c_uint)() - _check_call(_LIB.XGDMatrixGetUIntInfo(self.handle, - c_str(field), - ctypes.byref(length), - ctypes.byref(ret))) + _check_call( + _LIB.XGDMatrixGetUIntInfo( + self.handle, c_str(field), ctypes.byref(length), ctypes.byref(ret) + ) + ) return ctypes2numpy(ret, length.value, np.uint32) def set_float_info(self, field: str, data: ArrayLike) -> None: @@ -889,7 +892,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m The array of data to be set """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, data, field, 'float') + + dispatch_meta_backend(self, data, field, "float") def set_float_info_npy2d(self, field: str, data: ArrayLike) -> None: """Set float type property into the DMatrix @@ -904,7 +908,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m The array of data to be set """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, data, field, 'float') + + dispatch_meta_backend(self, data, field, "float") def set_uint_info(self, field: str, data: ArrayLike) -> None: """Set uint type property into the DMatrix. @@ -918,7 +923,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m The array of data to be set """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, data, field, 'uint32') + + dispatch_meta_backend(self, data, field, "uint32") def save_binary(self, fname: Union[str, os.PathLike], silent: bool = True) -> None: """Save DMatrix to an XGBoost buffer. Saved binary can be later loaded @@ -932,9 +938,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m If set, the output is suppressed. """ fname = os.fspath(os.path.expanduser(fname)) - _check_call(_LIB.XGDMatrixSaveBinary(self.handle, - c_str(fname), - ctypes.c_int(silent))) + _check_call( + _LIB.XGDMatrixSaveBinary(self.handle, c_str(fname), ctypes.c_int(silent)) + ) def set_label(self, label: ArrayLike) -> None: """Set label of dmatrix @@ -945,7 +951,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m The label information to be set into DMatrix """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, label, 'label', 'float') + + dispatch_meta_backend(self, label, "label", "float") def set_weight(self, weight: ArrayLike) -> None: """Set weight of each instance. @@ -964,7 +971,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, weight, 'weight', 'float') + + dispatch_meta_backend(self, weight, "weight", "float") def set_base_margin(self, margin: ArrayLike) -> None: """Set base margin of booster to start from. 
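
For orientation while reading the reformatted hunk below: `set_base_margin` seeds boosting from raw scores (for `binary:logistic` these are log-odds, i.e. values before the sigmoid). A minimal usage sketch with made-up data, assuming the package is installed:

import numpy as np
import xgboost as xgb

X = np.random.rand(100, 4).astype(np.float32)
y = np.random.randint(0, 2, size=100)
dtrain = xgb.DMatrix(X, label=y)
dtrain.set_base_margin(np.zeros(100, dtype=np.float32))  # one raw score per row
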
@@ -981,7 +989,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, margin, 'base_margin', 'float') + + dispatch_meta_backend(self, margin, "base_margin", "float") def set_group(self, group: ArrayLike) -> None: """Set group size of DMatrix (used for ranking). @@ -992,7 +1001,8 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m Group size of each group """ from .data import dispatch_meta_backend - dispatch_meta_backend(self, group, 'group', 'uint32') + + dispatch_meta_backend(self, group, "group", "uint32") def get_label(self) -> np.ndarray: """Get the label of the DMatrix. @@ -1001,7 +1011,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m ------- label : array """ - return self.get_float_info('label') + return self.get_float_info("label") def get_weight(self) -> np.ndarray: """Get the weight of the DMatrix. @@ -1010,7 +1020,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m ------- weight : array """ - return self.get_float_info('weight') + return self.get_float_info("weight") def get_base_margin(self) -> np.ndarray: """Get the base margin of the DMatrix. @@ -1019,7 +1029,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m ------- base_margin """ - return self.get_float_info('base_margin') + return self.get_float_info("base_margin") def get_group(self) -> np.ndarray: """Get the group of the DMatrix. @@ -1071,7 +1081,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m def num_nonmissing(self) -> int: """Get the number of non-missing values in the DMatrix. - .. versionadded:: 1.7.0 + .. versionadded:: 1.7.0 """ ret = c_bst_ulong() @@ -1154,30 +1164,40 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m feature_names = [cast(str, feature_names)] if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') + raise ValueError("feature_names must be unique") if len(feature_names) != self.num_col() and self.num_col() != 0: - msg = ("feature_names must have the same length as data, ", - f"expected {self.num_col()}, got {len(feature_names)}") + msg = ( + "feature_names must have the same length as data, ", + f"expected {self.num_col()}, got {len(feature_names)}", + ) raise ValueError(msg) # prohibit to use symbols may affect to parse. e.g. 
[]< - if not all(isinstance(f, str) and - not any(x in f for x in ['[', ']', '<']) - for f in feature_names): - raise ValueError('feature_names must be string, and may not contain [, ] or <') - feature_names_bytes = [bytes(f, encoding='utf-8') for f in feature_names] - c_feature_names = (ctypes.c_char_p * - len(feature_names_bytes))(*feature_names_bytes) - _check_call(_LIB.XGDMatrixSetStrFeatureInfo( - self.handle, c_str('feature_name'), - c_feature_names, - c_bst_ulong(len(feature_names)))) + if not all( + isinstance(f, str) and not any(x in f for x in ["[", "]", "<"]) + for f in feature_names + ): + raise ValueError( + "feature_names must be string, and may not contain [, ] or <" + ) + feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names] + c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))( + *feature_names_bytes + ) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str("feature_name"), + c_feature_names, + c_bst_ulong(len(feature_names)), + ) + ) else: # reset feature_types also - _check_call(_LIB.XGDMatrixSetStrFeatureInfo( - self.handle, - c_str('feature_name'), - None, - c_bst_ulong(0))) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, c_str("feature_name"), None, c_bst_ulong(0) + ) + ) self.feature_types = None @property @@ -1190,10 +1210,14 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() - _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, - c_str('feature_type'), - ctypes.byref(length), - ctypes.byref(sarr))) + _check_call( + _LIB.XGDMatrixGetStrFeatureInfo( + self.handle, + c_str("feature_type"), + ctypes.byref(length), + ctypes.byref(sarr), + ) + ) res = from_cstr_to_pystr(sarr, length) if not res: return None @@ -1217,8 +1241,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m # also used for specifying categorical data type. if feature_types is not None: if not isinstance(feature_types, (list, str)): - raise TypeError( - 'feature_types must be string or list of strings') + raise TypeError("feature_types must be string or list of strings") if isinstance(feature_types, str): # single string will be applied to all columns feature_types = [feature_types] * self.num_col() @@ -1229,25 +1252,29 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m feature_types = [feature_types] except TypeError: feature_types = [cast(str, feature_types)] - feature_types_bytes = [bytes(f, encoding='utf-8') - for f in feature_types] - c_feature_types = (ctypes.c_char_p * - len(feature_types_bytes))(*feature_types_bytes) - _check_call(_LIB.XGDMatrixSetStrFeatureInfo( - self.handle, c_str('feature_type'), - c_feature_types, - c_bst_ulong(len(feature_types)))) + feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types] + c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))( + *feature_types_bytes + ) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str("feature_type"), + c_feature_types, + c_bst_ulong(len(feature_types)), + ) + ) if len(feature_types) != self.num_col() and self.num_col() != 0: - msg = 'feature_types must have the same length as data' + msg = "feature_types must have the same length as data" raise ValueError(msg) else: # Reset. 
- _check_call(_LIB.XGDMatrixSetStrFeatureInfo( - self.handle, - c_str('feature_type'), - None, - c_bst_ulong(0))) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, c_str("feature_type"), None, c_bst_ulong(0) + ) + ) class _ProxyDMatrix(DMatrix): @@ -1452,7 +1479,7 @@ class QuantileDMatrix(DMatrix): class DeviceQuantileDMatrix(QuantileDMatrix): - """ Use `QuantileDMatrix` instead. + """Use `QuantileDMatrix` instead. .. deprecated:: 1.7.0 @@ -1531,7 +1558,7 @@ class Booster: self, params: Optional[BoosterParam] = None, cache: Optional[Sequence[DMatrix]] = None, - model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None + model_file: Optional[Union["Booster", bytearray, os.PathLike, str]] = None, ) -> None: # pylint: disable=invalid-name """ @@ -1547,12 +1574,15 @@ class Booster: cache = cache if cache is not None else [] for d in cache: if not isinstance(d, DMatrix): - raise TypeError(f'invalid cache item: {type(d).__name__}', cache) + raise TypeError(f"invalid cache item: {type(d).__name__}", cache) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle: Optional[ctypes.c_void_p] = ctypes.c_void_p() - _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)), - ctypes.byref(self.handle))) + _check_call( + _LIB.XGBoosterCreate( + dmats, c_bst_ulong(len(cache)), ctypes.byref(self.handle) + ) + ) for d in cache: # Validate feature only after the feature names are saved into booster. self._validate_dmatrix_features(d) @@ -1562,19 +1592,18 @@ class Booster: # We use the pickle interface for getting memory snapshot from # another model, and load the snapshot with this booster. state = model_file.__getstate__() - handle = state['handle'] - del state['handle'] + handle = state["handle"] + del state["handle"] ptr = (ctypes.c_char * len(handle)).from_buffer(handle) length = c_bst_ulong(len(handle)) - _check_call( - _LIB.XGBoosterUnserializeFromBuffer(self.handle, ptr, length)) + _check_call(_LIB.XGBoosterUnserializeFromBuffer(self.handle, ptr, length)) self.__dict__.update(state) elif isinstance(model_file, (str, os.PathLike, bytearray)): self.load_model(model_file) elif model_file is None: pass else: - raise TypeError('Unknown type:', model_file) + raise TypeError("Unknown type:", model_file) params = params or {} params_processed = _configure_metrics(params.copy()) @@ -1651,45 +1680,47 @@ class Booster: return params def __del__(self) -> None: - if hasattr(self, 'handle') and self.handle is not None: + if hasattr(self, "handle") and self.handle is not None: _check_call(_LIB.XGBoosterFree(self.handle)) self.handle = None def __getstate__(self) -> Dict: # can't pickle ctypes pointers, put model content in bytearray this = self.__dict__.copy() - handle = this['handle'] + handle = this["handle"] if handle is not None: length = c_bst_ulong() cptr = ctypes.POINTER(ctypes.c_char)() - _check_call(_LIB.XGBoosterSerializeToBuffer(self.handle, - ctypes.byref(length), - ctypes.byref(cptr))) + _check_call( + _LIB.XGBoosterSerializeToBuffer( + self.handle, ctypes.byref(length), ctypes.byref(cptr) + ) + ) buf = ctypes2buffer(cptr, length.value) this["handle"] = buf return this def __setstate__(self, state: Dict) -> None: # reconstruct handle from raw data - handle = state['handle'] + handle = state["handle"] if handle is not None: buf = handle dmats = c_array(ctypes.c_void_p, []) handle = ctypes.c_void_p() - _check_call(_LIB.XGBoosterCreate( - dmats, c_bst_ulong(0), ctypes.byref(handle))) + _check_call( + _LIB.XGBoosterCreate(dmats, c_bst_ulong(0), 
ctypes.byref(handle)) + ) length = c_bst_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - _check_call( - _LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length)) - state['handle'] = handle + _check_call(_LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length)) + state["handle"] = handle self.__dict__.update(state) def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster": if isinstance(val, int): - val = slice(val, val+1) + val = slice(val, val + 1) if isinstance(val, tuple): - raise ValueError('Only supports slicing through 1 dimension.') + raise ValueError("Only supports slicing through 1 dimension.") if not isinstance(val, slice): msg = _expect((int, slice), type(val)) raise TypeError(msg) @@ -1702,7 +1733,7 @@ class Booster: else: stop = val.stop if stop < start: - raise ValueError('Invalid slice', val) + raise ValueError("Invalid slice", val) step = val.step if val.step is not None else 1 @@ -1715,7 +1746,7 @@ class Booster: self.handle, c_start, c_stop, c_step, ctypes.byref(sliced_handle) ) if status == -2: - raise IndexError('Layer index out of range') + raise IndexError("Layer index out of range") _check_call(status) sliced = Booster() @@ -1724,36 +1755,35 @@ class Booster: return sliced def save_config(self) -> str: - '''Output internal parameter configuration of Booster as a JSON + """Output internal parameter configuration of Booster as a JSON string. .. versionadded:: 1.0.0 - ''' + """ json_string = ctypes.c_char_p() length = c_bst_ulong() - _check_call(_LIB.XGBoosterSaveJsonConfig( - self.handle, - ctypes.byref(length), - ctypes.byref(json_string))) + _check_call( + _LIB.XGBoosterSaveJsonConfig( + self.handle, ctypes.byref(length), ctypes.byref(json_string) + ) + ) assert json_string.value is not None result = json_string.value.decode() # pylint: disable=no-member return result def load_config(self, config: str) -> None: - '''Load configuration returned by `save_config`. + """Load configuration returned by `save_config`. .. 
versionadded:: 1.0.0 - ''' + """ assert isinstance(config, str) - _check_call(_LIB.XGBoosterLoadJsonConfig( - self.handle, - c_str(config))) + _check_call(_LIB.XGBoosterLoadJsonConfig(self.handle, c_str(config))) def __copy__(self) -> "Booster": return self.__deepcopy__(None) def __deepcopy__(self, _: Any) -> "Booster": - '''Return a copy of booster.''' + """Return a copy of booster.""" return Booster(model_file=self) def copy(self) -> "Booster": @@ -1781,8 +1811,11 @@ class Booster: """ ret = ctypes.c_char_p() success = ctypes.c_int() - _check_call(_LIB.XGBoosterGetAttr( - self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success))) + _check_call( + _LIB.XGBoosterGetAttr( + self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success) + ) + ) if success.value != 0: value = ret.value assert value @@ -1799,9 +1832,11 @@ class Booster: """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() - _check_call(_LIB.XGBoosterGetAttrNames(self.handle, - ctypes.byref(length), - ctypes.byref(sarr))) + _check_call( + _LIB.XGBoosterGetAttrNames( + self.handle, ctypes.byref(length), ctypes.byref(sarr) + ) + ) attr_names = from_cstr_to_pystr(sarr, length) return {n: self.attr(n) for n in attr_names} @@ -1826,7 +1861,10 @@ class Booster: return None _check_call( _LIB.XGBoosterGetStrFeatureInfo( - self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr), + self.handle, + c_str(field), + ctypes.byref(length), + ctypes.byref(sarr), ) ) feature_info = from_cstr_to_pystr(sarr, length) @@ -1836,10 +1874,15 @@ class Booster: if features is not None: assert isinstance(features, list) feature_info_bytes = [bytes(f, encoding="utf-8") for f in features] - c_feature_info = (ctypes.c_char_p * len(feature_info_bytes))(*feature_info_bytes) + c_feature_info = (ctypes.c_char_p * len(feature_info_bytes))( + *feature_info_bytes + ) _check_call( _LIB.XGBoosterSetStrFeatureInfo( - self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features)) + self.handle, + c_str(field), + c_feature_info, + c_bst_ulong(len(features)), ) ) else: @@ -1876,7 +1919,7 @@ class Booster: def set_param( self, params: Union[Dict, Iterable[Tuple[str, Any]], str], - value: Optional[str] = None + value: Optional[str] = None, ) -> None: """Set parameters into the Booster. 
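
The `set_param` signature above is only re-wrapped, not changed; it still accepts a mapping, an iterable of pairs, or a single key with `value`. A small sketch of the three call forms (the parameter values are illustrative):

import xgboost as xgb

bst = xgb.Booster()  # an empty booster suffices to show the call forms
bst.set_param({"max_depth": "4", "eta": "0.3"})  # mapping
bst.set_param([("max_depth", "4")])              # iterable of pairs
bst.set_param("max_depth", "4")                  # single key plus value
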
@@ -1893,8 +1936,9 @@ class Booster: params = [(params, value)] for key, val in cast(Iterable[Tuple[str, str]], params): if val is not None: - _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key), - c_str(str(val)))) + _check_call( + _LIB.XGBoosterSetParam(self.handle, c_str(key), c_str(str(val))) + ) def update( self, dtrain: DMatrix, iteration: int, fobj: Optional[Objective] = None @@ -1917,9 +1961,11 @@ class Booster: self._validate_dmatrix_features(dtrain) if fobj is None: - _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, - ctypes.c_int(iteration), - dtrain.handle)) + _check_call( + _LIB.XGBoosterUpdateOneIter( + self.handle, ctypes.c_int(iteration), dtrain.handle + ) + ) else: pred = self.predict(dtrain, output_margin=True, training=True) grad, hess = fobj(pred, dtrain) @@ -1941,24 +1987,27 @@ class Booster: """ if len(grad) != len(hess): - raise ValueError( - f"grad / hess length mismatch: {len(grad)} / {len(hess)}" - ) + raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}") if not isinstance(dtrain, DMatrix): raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") self._validate_dmatrix_features(dtrain) - _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, - c_array(ctypes.c_float, grad), - c_array(ctypes.c_float, hess), - c_bst_ulong(len(grad)))) + _check_call( + _LIB.XGBoosterBoostOneIter( + self.handle, + dtrain.handle, + c_array(ctypes.c_float, grad), + c_array(ctypes.c_float, hess), + c_bst_ulong(len(grad)), + ) + ) def eval_set( self, evals: Sequence[Tuple[DMatrix, str]], iteration: int = 0, feval: Optional[Metric] = None, - output_margin: bool = True + output_margin: bool = True, ) -> str: # pylint: disable=invalid-name """Evaluate a set of data. @@ -2002,7 +2051,8 @@ class Booster: if feval is not None: for dmat, evname in evals: feval_ret = feval( - self.predict(dmat, training=False, output_margin=output_margin), dmat + self.predict(dmat, training=False, output_margin=output_margin), + dmat, ) if isinstance(feval_ret, list): for name, val in feval_ret: @@ -2014,7 +2064,7 @@ class Booster: res += "\t%s-%s:%f" % (evname, name, val) return res - def eval(self, data: DMatrix, name: str = 'eval', iteration: int = 0) -> str: + def eval(self, data: DMatrix, name: str = "eval", iteration: int = 0) -> str: """Evaluate the model on mat. Parameters @@ -2134,7 +2184,7 @@ class Booster: """ if not isinstance(data, DMatrix): - raise TypeError('Expecting data to be a DMatrix object, got: ', type(data)) + raise TypeError("Expecting data to be a DMatrix object, got: ", type(data)) if validate_features: self._validate_dmatrix_features(data) iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range) @@ -2169,7 +2219,7 @@ class Booster: from_pystr_to_cstr(json.dumps(args)), ctypes.byref(shape), ctypes.byref(dims), - ctypes.byref(preds) + ctypes.byref(preds), ) ) return _prediction_output(shape, dims, preds, False) @@ -2183,7 +2233,7 @@ class Booster: missing: float = np.nan, validate_features: bool = True, base_margin: Any = None, - strict_shape: bool = False + strict_shape: bool = False, ) -> NumpyOrCupy: """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction does not cache the prediction result. 
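
Context for the in-place prediction hunks below: `inplace_predict` skips the `DMatrix` round-trip and reads the input array directly, which is why the method body special-cases pandas and cupy inputs. A minimal sketch with made-up regression data, assuming the package is installed:

import numpy as np
import xgboost as xgb

X = np.random.rand(200, 4).astype(np.float32)
y = np.random.rand(200).astype(np.float32)
booster = xgb.train({"objective": "reg:squarederror"}, xgb.DMatrix(X, label=y), 5)
preds = booster.inplace_predict(X)  # no intermediate DMatrix is built
assert preds.shape == (200,)
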
@@ -2275,6 +2325,7 @@ class Booster: enable_categorical = True if _is_pandas_series(data): import pandas as pd + data = pd.DataFrame(data) if _is_pandas_df(data): data, fns, _ = _transform_pandas_df(data, enable_categorical) @@ -2395,8 +2446,7 @@ class Booster: """ if isinstance(fname, (str, os.PathLike)): # assume file name fname = os.fspath(os.path.expanduser(fname)) - _check_call(_LIB.XGBoosterSaveModel( - self.handle, c_str(fname))) + _check_call(_LIB.XGBoosterSaveModel(self.handle, c_str(fname))) else: raise TypeError("fname must be a string or os PathLike") @@ -2450,16 +2500,14 @@ class Booster: # assume file name, cannot use os.path.exist to check, file can be # from URL. fname = os.fspath(os.path.expanduser(fname)) - _check_call(_LIB.XGBoosterLoadModel( - self.handle, c_str(fname))) + _check_call(_LIB.XGBoosterLoadModel(self.handle, c_str(fname))) elif isinstance(fname, bytearray): buf = fname length = c_bst_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) - _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, - length)) + _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)) else: - raise TypeError('Unknown file type: ', fname) + raise TypeError("Unknown file type: ", fname) if self.attr("best_iteration") is not None: self.best_iteration = int(self.attr("best_iteration")) # type: ignore @@ -2469,24 +2517,29 @@ class Booster: self.best_ntree_limit = int(self.attr("best_ntree_limit")) # type: ignore def num_boosted_rounds(self) -> int: - '''Get number of boosted rounds. For gblinear this is reset to 0 after + """Get number of boosted rounds. For gblinear this is reset to 0 after serializing the model. - ''' + """ rounds = ctypes.c_int() assert self.handle is not None _check_call(_LIB.XGBoosterBoostedRounds(self.handle, ctypes.byref(rounds))) return rounds.value def num_features(self) -> int: - '''Number of features in booster.''' + """Number of features in booster.""" features = c_bst_ulong() assert self.handle is not None _check_call(_LIB.XGBoosterGetNumFeature(self.handle, ctypes.byref(features))) return features.value - def dump_model(self, fout: Union[str, os.PathLike], fmap: Union[str, os.PathLike] = '', - with_stats: bool = False, dump_format: str = "text") -> None: + def dump_model( + self, + fout: Union[str, os.PathLike], + fmap: Union[str, os.PathLike] = "", + with_stats: bool = False, + dump_format: str = "text", + ) -> None: """Dump model into a text or JSON file. Unlike :py:meth:`save_model`, the output format is primarily used for visualization or interpretation, hence it's more human readable but cannot be loaded back to XGBoost. @@ -2505,19 +2558,19 @@ class Booster: if isinstance(fout, (str, os.PathLike)): fout = os.fspath(os.path.expanduser(fout)) # pylint: disable=consider-using-with - fout_obj = open(fout, 'w', encoding="utf-8") + fout_obj = open(fout, "w", encoding="utf-8") need_close = True else: fout_obj = fout need_close = False ret = self.get_dump(fmap, with_stats, dump_format) - if dump_format == 'json': - fout_obj.write('[\n') + if dump_format == "json": + fout_obj.write("[\n") for i, val in enumerate(ret): fout_obj.write(val) if i < len(ret) - 1: fout_obj.write(",\n") - fout_obj.write('\n]') + fout_obj.write("\n]") else: for i, val in enumerate(ret): fout_obj.write(f"booster[{i}]:\n") @@ -2529,7 +2582,7 @@ class Booster: self, fmap: Union[str, os.PathLike] = "", with_stats: bool = False, - dump_format: str = "text" + dump_format: str = "text", ) -> List[str]: """Returns the model dump as a list of strings. 
Unlike :py:meth:`save_model`, the output format is primarily used for visualization or interpretation, hence it's more @@ -2548,12 +2601,16 @@ class Booster: fmap = os.fspath(os.path.expanduser(fmap)) length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() - _check_call(_LIB.XGBoosterDumpModelEx(self.handle, - c_str(fmap), - ctypes.c_int(with_stats), - c_str(dump_format), - ctypes.byref(length), - ctypes.byref(sarr))) + _check_call( + _LIB.XGBoosterDumpModelEx( + self.handle, + c_str(fmap), + ctypes.c_int(with_stats), + c_str(dump_format), + ctypes.byref(length), + ctypes.byref(sarr), + ) + ) res = from_cstr_to_pystr(sarr, length) return res @@ -2573,10 +2630,10 @@ class Booster: The name of feature map file """ - return self.get_score(fmap, importance_type='weight') + return self.get_score(fmap, importance_type="weight") def get_score( - self, fmap: Union[str, os.PathLike] = '', importance_type: str = 'weight' + self, fmap: Union[str, os.PathLike] = "", importance_type: str = "weight" ) -> Dict[str, Union[float, List[float]]]: """Get feature importance of each feature. For tree model Importance type can be defined as: @@ -2641,7 +2698,7 @@ class Booster: return results # pylint: disable=too-many-statements - def trees_to_dataframe(self, fmap: Union[str, os.PathLike] = '') -> DataFrame: + def trees_to_dataframe(self, fmap: Union[str, os.PathLike] = "") -> DataFrame: """Parse a boosted tree model text dump into a pandas DataFrame structure. This feature is only defined when the decision tree model is chosen as base @@ -2656,8 +2713,12 @@ class Booster: # pylint: disable=too-many-locals fmap = os.fspath(os.path.expanduser(fmap)) if not PANDAS_INSTALLED: - raise ImportError(('pandas must be available to use this method.' - 'Install pandas before calling again.')) + raise ImportError( + ( + "pandas must be available to use this method." + "Install pandas before calling again." 
+ ) + ) booster = json.loads(self.save_config())["learner"]["gradient_booster"]["name"] if booster not in {"gbtree", "dart"}: raise ValueError(f"This method is not defined for Booster type {booster}") @@ -2675,35 +2736,35 @@ class Booster: trees = self.get_dump(fmap, with_stats=True) for i, tree in enumerate(trees): - for line in tree.split('\n'): - arr = line.split('[') + for line in tree.split("\n"): + arr = line.split("[") # Leaf node if len(arr) == 1: # Last element of line.split is an empty string - if arr == ['']: + if arr == [""]: continue # parse string - parse = arr[0].split(':') - stats = re.split('=|,', parse[1]) + parse = arr[0].split(":") + stats = re.split("=|,", parse[1]) # append to lists tree_ids.append(i) - node_ids.append(int(re.findall(r'\b\d+\b', parse[0])[0])) - fids.append('Leaf') - splits.append(float('NAN')) - categories.append(float('NAN')) - y_directs.append(float('NAN')) - n_directs.append(float('NAN')) - missings.append(float('NAN')) + node_ids.append(int(re.findall(r"\b\d+\b", parse[0])[0])) + fids.append("Leaf") + splits.append(float("NAN")) + categories.append(float("NAN")) + y_directs.append(float("NAN")) + n_directs.append(float("NAN")) + missings.append(float("NAN")) gains.append(float(stats[1])) covers.append(float(stats[3])) # Not a Leaf Node else: # parse string - fid = arr[1].split(']') + fid = arr[1].split("]") if fid[0].find("<") != -1: # numerical - parse = fid[0].split('<') + parse = fid[0].split("<") splits.append(float(parse[1])) categories.append(None) elif fid[0].find(":{") != -1: @@ -2715,30 +2776,41 @@ class Booster: categories.append(cats_split if cats_split else None) else: raise ValueError("Failed to parse model text dump.") - stats = re.split('=|,', fid[1]) + stats = re.split("=|,", fid[1]) # append to lists tree_ids.append(i) - node_ids.append(int(re.findall(r'\b\d+\b', arr[0])[0])) + node_ids.append(int(re.findall(r"\b\d+\b", arr[0])[0])) fids.append(parse[0]) str_i = str(i) - y_directs.append(str_i + '-' + stats[1]) - n_directs.append(str_i + '-' + stats[3]) - missings.append(str_i + '-' + stats[5]) + y_directs.append(str_i + "-" + stats[1]) + n_directs.append(str_i + "-" + stats[3]) + missings.append(str_i + "-" + stats[5]) gains.append(float(stats[7])) covers.append(float(stats[9])) - ids = [str(t_id) + '-' + str(n_id) for t_id, n_id in zip(tree_ids, node_ids)] - df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids, - 'Feature': fids, 'Split': splits, 'Yes': y_directs, - 'No': n_directs, 'Missing': missings, 'Gain': gains, - 'Cover': covers, "Category": categories}) + ids = [str(t_id) + "-" + str(n_id) for t_id, n_id in zip(tree_ids, node_ids)] + df = DataFrame( + { + "Tree": tree_ids, + "Node": node_ids, + "ID": ids, + "Feature": fids, + "Split": splits, + "Yes": y_directs, + "No": n_directs, + "Missing": missings, + "Gain": gains, + "Cover": covers, + "Category": categories, + } + ) - if callable(getattr(df, 'sort_values', None)): + if callable(getattr(df, "sort_values", None)): # pylint: disable=no-member - return df.sort_values(['Tree', 'Node']).reset_index(drop=True) + return df.sort_values(["Tree", "Node"]).reset_index(drop=True) # pylint: disable=no-member - return df.sort(['Tree', 'Node']).reset_index(drop=True) + return df.sort(["Tree", "Node"]).reset_index(drop=True) def _validate_dmatrix_features(self, data: DMatrix) -> None: if data.num_row() == 0: @@ -2793,9 +2865,9 @@ class Booster: def get_split_value_histogram( self, feature: str, - fmap: Union[os.PathLike, str] = '', + fmap: Union[os.PathLike, str] = "", 
bins: Optional[int] = None, - as_pandas: bool = True + as_pandas: bool = True, ) -> Union[np.ndarray, DataFrame]: """Get split value histogram of a feature @@ -2851,11 +2923,11 @@ class Booster: ) if as_pandas and PANDAS_INSTALLED: - return DataFrame(nph_stacked, columns=['SplitValue', 'Count']) + return DataFrame(nph_stacked, columns=["SplitValue", "Count"]) if as_pandas and not PANDAS_INSTALLED: warnings.warn( "Returning histogram as ndarray" " (as_pandas == True, but pandas is not installed).", - UserWarning + UserWarning, ) return nph_stacked diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 3cfa84395..2ebde84f0 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -1,6 +1,6 @@ # pylint: disable=too-many-arguments, too-many-branches, too-many-lines # pylint: disable=too-many-return-statements, import-error -'''Data dispatching for DMatrix.''' +"""Data dispatching for DMatrix.""" import ctypes import json import os @@ -108,6 +108,7 @@ def _from_scipy_csr( feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: """Initialize data from a CSR matrix.""" + handle = ctypes.c_void_p() data = transform_scipy_sparse(data, True) _check_call( @@ -178,8 +179,7 @@ def _ensure_np_dtype( def _maybe_np_slice(data: DataType, dtype: Optional[NumpyDType]) -> np.ndarray: - '''Handle numpy slice. This can be removed if we use __array_interface__. - ''' + """Handle numpy slice. This can be removed if we use __array_interface__.""" try: if not data.flags.c_contiguous: data = np.array(data, copy=True, dtype=dtype) @@ -653,6 +653,7 @@ def _is_arrow(data: DataType) -> bool: try: import pyarrow as pa from pyarrow import dataset as arrow_dataset + return isinstance(data, (pa.Table, arrow_dataset.Dataset)) except ImportError: return False @@ -878,8 +879,8 @@ def _is_cupy_array(data: DataType) -> bool: def _transform_cupy_array(data: DataType) -> CupyT: import cupy # pylint: disable=import-error - if not hasattr(data, '__cuda_array_interface__') and hasattr( - data, '__array__'): + + if not hasattr(data, "__cuda_array_interface__") and hasattr(data, "__array__"): data = cupy.array(data, copy=False) if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]: data = data.astype(cupy.float32, copy=False) @@ -900,9 +901,9 @@ def _from_cupy_array( config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8") _check_call( _LIB.XGDMatrixCreateFromCudaArrayInterface( - interface_str, - config, - ctypes.byref(handle))) + interface_str, config, ctypes.byref(handle) + ) + ) return handle, feature_names, feature_types @@ -923,12 +924,13 @@ def _is_cupy_csc(data: DataType) -> bool: def _is_dlpack(data: DataType) -> bool: - return 'PyCapsule' in str(type(data)) and "dltensor" in str(data) + return "PyCapsule" in str(type(data)) and "dltensor" in str(data) def _transform_dlpack(data: DataType) -> bool: from cupy import fromDlpack # pylint: disable=E0401 - assert 'used_dltensor' not in str(data) + + assert "used_dltensor" not in str(data) data = fromDlpack(data) return data @@ -941,8 +943,7 @@ def _from_dlpack( feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: data = _transform_dlpack(data) - return _from_cupy_array(data, missing, nthread, feature_names, - feature_types) + return _from_cupy_array(data, missing, nthread, feature_names, feature_types) def _is_uri(data: DataType) -> bool: @@ -1003,13 +1004,13 @@ def _is_iter(data: DataType) -> bool: def _has_array_protocol(data: DataType) -> bool: - 
return hasattr(data, '__array__') + return hasattr(data, "__array__") def _convert_unknown_data(data: DataType) -> DataType: warnings.warn( - f'Unknown data type: {type(data)}, trying to convert it to csr_matrix', - UserWarning + f"Unknown data type: {type(data)}, trying to convert it to csr_matrix", + UserWarning, ) try: import scipy.sparse @@ -1018,7 +1019,7 @@ def _convert_unknown_data(data: DataType) -> DataType: try: data = scipy.sparse.csr_matrix(data) - except Exception: # pylint: disable=broad-except + except Exception: # pylint: disable=broad-except return None return data @@ -1033,7 +1034,7 @@ def dispatch_data_backend( enable_categorical: bool = False, data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: - '''Dispatch data for DMatrix.''' + """Dispatch data for DMatrix.""" if not _is_cudf_ser(data) and not _is_pandas_series(data): _check_data_shape(data) if _is_scipy_csr(data): @@ -1054,6 +1055,7 @@ def dispatch_data_backend( return _from_tuple(data, missing, threads, feature_names, feature_types) if _is_pandas_series(data): import pandas as pd + data = pd.DataFrame(data) if _is_pandas_df(data): return _from_pandas_df( @@ -1064,39 +1066,41 @@ def dispatch_data_backend( data, missing, threads, feature_names, feature_types, enable_categorical ) if _is_cupy_array(data): - return _from_cupy_array(data, missing, threads, feature_names, - feature_types) + return _from_cupy_array(data, missing, threads, feature_names, feature_types) if _is_cupy_csr(data): - raise TypeError('cupyx CSR is not supported yet.') + raise TypeError("cupyx CSR is not supported yet.") if _is_cupy_csc(data): - raise TypeError('cupyx CSC is not supported yet.') + raise TypeError("cupyx CSC is not supported yet.") if _is_dlpack(data): - return _from_dlpack(data, missing, threads, feature_names, - feature_types) + return _from_dlpack(data, missing, threads, feature_names, feature_types) if _is_dt_df(data): _warn_unused_missing(data, missing) return _from_dt_df( data, missing, threads, feature_names, feature_types, enable_categorical ) if _is_modin_df(data): - return _from_pandas_df(data, enable_categorical, missing, threads, - feature_names, feature_types) + return _from_pandas_df( + data, enable_categorical, missing, threads, feature_names, feature_types + ) if _is_modin_series(data): return _from_pandas_series( data, missing, threads, enable_categorical, feature_names, feature_types ) if _is_arrow(data): return _from_arrow( - data, missing, threads, feature_names, feature_types, enable_categorical) + data, missing, threads, feature_names, feature_types, enable_categorical + ) if _has_array_protocol(data): array = np.asarray(data) return _from_numpy_array(array, missing, threads, feature_names, feature_types) converted = _convert_unknown_data(data) if converted is not None: - return _from_scipy_csr(converted, missing, threads, feature_names, feature_types) + return _from_scipy_csr( + converted, missing, threads, feature_names, feature_types + ) - raise TypeError('Not supported type for data.' + str(type(data))) + raise TypeError("Not supported type for data." 
+ str(type(data))) def _validate_meta_shape(data: DataType, name: str) -> None: @@ -1128,20 +1132,14 @@ def _meta_from_numpy( def _meta_from_list( - data: Sequence, - field: str, - dtype: Optional[NumpyDType], - handle: ctypes.c_void_p + data: Sequence, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p ) -> None: data_np = np.array(data) _meta_from_numpy(data_np, field, dtype, handle) def _meta_from_tuple( - data: Sequence, - field: str, - dtype: Optional[NumpyDType], - handle: ctypes.c_void_p + data: Sequence, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p ) -> None: return _meta_from_list(data, field, dtype, handle) @@ -1156,39 +1154,27 @@ def _meta_from_cudf_df(data: DataType, field: str, handle: ctypes.c_void_p) -> N def _meta_from_cudf_series(data: DataType, field: str, handle: ctypes.c_void_p) -> None: - interface = bytes(json.dumps([data.__cuda_array_interface__], - indent=2), 'utf-8') - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, - c_str(field), - interface)) + interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8") + _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) def _meta_from_cupy_array(data: DataType, field: str, handle: ctypes.c_void_p) -> None: data = _transform_cupy_array(data) - interface = bytes(json.dumps([data.__cuda_array_interface__], - indent=2), 'utf-8') - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, - c_str(field), - interface)) + interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8") + _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) def _meta_from_dt( - data: DataType, - field: str, - dtype: Optional[NumpyDType], - handle: ctypes.c_void_p + data: DataType, field: str, dtype: Optional[NumpyDType], handle: ctypes.c_void_p ) -> None: data, _, _ = _transform_dt_df(data, None, None, field, dtype) _meta_from_numpy(data, field, dtype, handle) def dispatch_meta_backend( - matrix: DMatrix, - data: DataType, - name: str, - dtype: Optional[NumpyDType] = None + matrix: DMatrix, data: DataType, name: str, dtype: Optional[NumpyDType] = None ) -> None: - '''Dispatch for meta info.''' + """Dispatch for meta info.""" handle = matrix.handle assert handle is not None _validate_meta_shape(data, name) @@ -1231,7 +1217,7 @@ def dispatch_meta_backend( _meta_from_numpy(data, name, dtype, handle) return if _is_modin_series(data): - data = data.values.astype('float') + data = data.values.astype("float") assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 _meta_from_numpy(data, name, dtype, handle) return @@ -1240,19 +1226,20 @@ def dispatch_meta_backend( array = np.asarray(data) _meta_from_numpy(array, name, dtype, handle) return - raise TypeError('Unsupported type for ' + name, str(type(data))) + raise TypeError("Unsupported type for " + name, str(type(data))) class SingleBatchInternalIter(DataIter): # pylint: disable=R0902 - '''An iterator for single batch data to help creating device DMatrix. + """An iterator for single batch data to help creating device DMatrix. Transforming input directly to histogram with normal single batch data API can not access weight for sketching. So this iterator acts as a staging area for meta info. - ''' + """ + def __init__(self, **kwargs: Any) -> None: self.kwargs = kwargs - self.it = 0 # pylint: disable=invalid-name + self.it = 0 # pylint: disable=invalid-name # This does not necessarily increase memory usage as the data transformation # might use memory. 
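The staging trick behind SingleBatchInternalIter is worth spelling out: building a quantile-based DMatrix directly from one in-memory batch would bypass the per-row weights that the sketching step needs, so the batch is replayed through the DataIter protocol, which carries meta info alongside the data. The sketch below shows the same pattern against the public xgboost.DataIter interface; the class name OneShotIter and the toy arrays are illustrative only, and it assumes the 1.7-era contract in which next() invokes the supplied callback once per batch and returns 1 while batches remain.

    import numpy as np
    import xgboost as xgb

    class OneShotIter(xgb.DataIter):  # hypothetical name, mirroring the pattern above
        """Serve a single in-memory batch through the DataIter protocol."""

        def __init__(self, X: np.ndarray, y: np.ndarray, w: np.ndarray) -> None:
            self._batch = {"data": X, "label": y, "weight": w}
            self._it = 0  # 0 before the batch is served, 1 afterwards
            super().__init__()

        def next(self, input_data) -> int:
            if self._it == 1:
                return 0  # the single batch has been consumed
            self._it += 1
            input_data(**self._batch)  # weights travel with the data
            return 1

        def reset(self) -> None:
            self._it = 0  # rewind so XGBoost can take another pass

    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 4))
    y = rng.normal(size=256)
    w = rng.uniform(0.5, 1.5, size=256)
    # Weighted quantile sketching now sees `w`, unlike a direct histogram build.
    dmat = xgb.QuantileDMatrix(OneShotIter(X, y, w))

Handing such an iterator to QuantileDMatrix (assuming a release that ships it) is what lets the histogram cuts be computed with per-row weights attached.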
diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index 2ac36ef0f..9223acaa5 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -22,45 +22,51 @@ def find_lib_path() -> List[str]: curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) dll_path = [ # normal, after installation `lib` is copied into Python package tree. - os.path.join(curr_path, 'lib'), + os.path.join(curr_path, "lib"), # editable installation, no copying is performed. - os.path.join(curr_path, os.path.pardir, os.path.pardir, 'lib'), + os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"), # use libxgboost from a system prefix, if available. This should be the last # option. - os.path.join(sys.prefix, 'lib'), + os.path.join(sys.prefix, "lib"), ] - if sys.platform == 'win32': - if platform.architecture()[0] == '64bit': - dll_path.append( - os.path.join(curr_path, '../../windows/x64/Release/')) + if sys.platform == "win32": + if platform.architecture()[0] == "64bit": + dll_path.append(os.path.join(curr_path, "../../windows/x64/Release/")) # hack for pip installation when copy all parent source # directory here - dll_path.append(os.path.join(curr_path, './windows/x64/Release/')) + dll_path.append(os.path.join(curr_path, "./windows/x64/Release/")) else: - dll_path.append(os.path.join(curr_path, '../../windows/Release/')) + dll_path.append(os.path.join(curr_path, "../../windows/Release/")) # hack for pip installation when copy all parent source # directory here - dll_path.append(os.path.join(curr_path, './windows/Release/')) - dll_path = [os.path.join(p, 'xgboost.dll') for p in dll_path] - elif sys.platform.startswith(('linux', 'freebsd', 'emscripten')): - dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path] - elif sys.platform == 'darwin': - dll_path = [os.path.join(p, 'libxgboost.dylib') for p in dll_path] - elif sys.platform == 'cygwin': - dll_path = [os.path.join(p, 'cygxgboost.dll') for p in dll_path] - if platform.system() == 'OS400': - dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path] + dll_path.append(os.path.join(curr_path, "./windows/Release/")) + dll_path = [os.path.join(p, "xgboost.dll") for p in dll_path] + elif sys.platform.startswith(("linux", "freebsd", "emscripten")): + dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path] + elif sys.platform == "darwin": + dll_path = [os.path.join(p, "libxgboost.dylib") for p in dll_path] + elif sys.platform == "cygwin": + dll_path = [os.path.join(p, "cygxgboost.dll") for p in dll_path] + if platform.system() == "OS400": + dll_path = [os.path.join(p, "libxgboost.so") for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] # XGBOOST_BUILD_DOC is defined by sphinx conf. - if not lib_path and not os.environ.get('XGBOOST_BUILD_DOC', False): - link = 'https://xgboost.readthedocs.io/en/latest/build.html' - msg = 'Cannot find XGBoost Library in the candidate path. ' + \ - 'List of candidates:\n- ' + ('\n- '.join(dll_path)) + \ - '\nXGBoost Python package path: ' + curr_path + \ - '\nsys.prefix: ' + sys.prefix + \ - '\nSee: ' + link + ' for installing XGBoost.' + if not lib_path and not os.environ.get("XGBOOST_BUILD_DOC", False): + link = "https://xgboost.readthedocs.io/en/latest/build.html" + msg = ( + "Cannot find XGBoost Library in the candidate path. 
" + + "List of candidates:\n- " + + ("\n- ".join(dll_path)) + + "\nXGBoost Python package path: " + + curr_path + + "\nsys.prefix: " + + sys.prefix + + "\nSee: " + + link + + " for installing XGBoost." + ) raise XGBoostLibraryNotFound(msg) return lib_path diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 8439a72e5..a364e1eb6 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -81,22 +81,24 @@ def plot_importance( try: import matplotlib.pyplot as plt except ImportError as e: - raise ImportError('You must install matplotlib to plot importance') from e + raise ImportError("You must install matplotlib to plot importance") from e if isinstance(booster, XGBModel): importance = booster.get_booster().get_score( - importance_type=importance_type, fmap=fmap) + importance_type=importance_type, fmap=fmap + ) elif isinstance(booster, Booster): importance = booster.get_score(importance_type=importance_type, fmap=fmap) elif isinstance(booster, dict): importance = booster else: - raise ValueError('tree must be Booster, XGBModel or dict instance') + raise ValueError("tree must be Booster, XGBModel or dict instance") if not importance: raise ValueError( - 'Booster.get_score() results in empty. ' + - 'This maybe caused by having all trees as decision dumps.') + "Booster.get_score() results in empty. " + + "This maybe caused by having all trees as decision dumps." + ) tuples = [(k, importance[k]) for k in importance] if max_num_features is not None: @@ -110,25 +112,25 @@ def plot_importance( _, ax = plt.subplots(1, 1) ylocs = np.arange(len(values)) - ax.barh(ylocs, values, align='center', height=height, **kwargs) + ax.barh(ylocs, values, align="center", height=height, **kwargs) if show_values is True: for x, y in zip(values, ylocs): - ax.text(x + 1, y, values_format.format(v=x), va='center') + ax.text(x + 1, y, values_format.format(v=x), va="center") ax.set_yticks(ylocs) ax.set_yticklabels(labels) if xlim is not None: if not isinstance(xlim, tuple) or len(xlim) != 2: - raise ValueError('xlim must be a tuple of 2 elements') + raise ValueError("xlim must be a tuple of 2 elements") else: xlim = (0, max(values) * 1.1) ax.set_xlim(xlim) if ylim is not None: if not isinstance(ylim, tuple) or len(ylim) != 2: - raise ValueError('ylim must be a tuple of 2 elements') + raise ValueError("ylim must be a tuple of 2 elements") else: ylim = (-1, len(values)) ax.set_ylim(ylim) @@ -201,44 +203,42 @@ def to_graphviz( try: from graphviz import Source except ImportError as e: - raise ImportError('You must install graphviz to plot tree') from e + raise ImportError("You must install graphviz to plot tree") from e if isinstance(booster, XGBModel): booster = booster.get_booster() # squash everything back into kwargs again for compatibility - parameters = 'dot' + parameters = "dot" extra = {} for key, value in kwargs.items(): extra[key] = value if rankdir is not None: - kwargs['graph_attrs'] = {} - kwargs['graph_attrs']['rankdir'] = rankdir + kwargs["graph_attrs"] = {} + kwargs["graph_attrs"]["rankdir"] = rankdir for key, value in extra.items(): if kwargs.get("graph_attrs", None) is not None: - kwargs['graph_attrs'][key] = value + kwargs["graph_attrs"][key] = value else: - kwargs['graph_attrs'] = {} + kwargs["graph_attrs"] = {} del kwargs[key] if yes_color is not None or no_color is not None: - kwargs['edge'] = {} + kwargs["edge"] = {} if yes_color is not None: - kwargs['edge']['yes_color'] = yes_color + kwargs["edge"]["yes_color"] = yes_color if 
no_color is not None: - kwargs['edge']['no_color'] = no_color + kwargs["edge"]["no_color"] = no_color if condition_node_params is not None: - kwargs['condition_node_params'] = condition_node_params + kwargs["condition_node_params"] = condition_node_params if leaf_node_params is not None: - kwargs['leaf_node_params'] = leaf_node_params + kwargs["leaf_node_params"] = leaf_node_params if kwargs: - parameters += ':' + parameters += ":" parameters += json.dumps(kwargs) - tree = booster.get_dump( - fmap=fmap, - dump_format=parameters)[num_trees] + tree = booster.get_dump(fmap=fmap, dump_format=parameters)[num_trees] g = Source(tree) return g @@ -277,19 +277,18 @@ def plot_tree( from matplotlib import image from matplotlib import pyplot as plt except ImportError as e: - raise ImportError('You must install matplotlib to plot tree') from e + raise ImportError("You must install matplotlib to plot tree") from e if ax is None: _, ax = plt.subplots(1, 1) - g = to_graphviz(booster, fmap=fmap, num_trees=num_trees, rankdir=rankdir, - **kwargs) + g = to_graphviz(booster, fmap=fmap, num_trees=num_trees, rankdir=rankdir, **kwargs) s = BytesIO() - s.write(g.pipe(format='png')) + s.write(g.pipe(format="png")) s.seek(0) img = image.imread(s) ax.imshow(img) - ax.axis('off') + ax.axis("off") return ax diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py index 68474de57..0b8f143ec 100644 --- a/python-package/xgboost/rabit.py +++ b/python-package/xgboost/rabit.py @@ -24,7 +24,7 @@ def init(args: Optional[List[bytes]] = None) -> None: parsed = {} if args: for arg in args: - kv = arg.decode().split('=') + kv = arg.decode().split("=") if len(kv) == 2: parsed[kv[0]] = kv[1] collective.init(**parsed) @@ -104,6 +104,7 @@ def broadcast(data: T, root: int) -> T: @unique class Op(IntEnum): """Supported operations for rabit.""" + MAX = 0 MIN = 1 SUM = 2 @@ -111,7 +112,7 @@ class Op(IntEnum): def allreduce( # pylint:disable=invalid-name - data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None + data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None ) -> np.ndarray: """Perform allreduce, return the result. 
    Parameters
diff --git a/python-package/xgboost/tracker.py b/python-package/xgboost/tracker.py
index eb0a11dd5..142a70fc5 100644
--- a/python-package/xgboost/tracker.py
+++ b/python-package/xgboost/tracker.py
@@ -53,7 +53,7 @@ class ExSocket:

 # magic number used to verify existence of data
-MAGIC_NUM = 0xff99
+MAGIC_NUM = 0xFF99

 def get_some_ip(host: str) -> str:
@@ -334,19 +334,19 @@ class RabitTracker:
         while len(shutdown) != n_workers:
             fd, s_addr = self.sock.accept()
             s = WorkerEntry(fd, s_addr)
-            if s.cmd == 'print':
+            if s.cmd == "print":
                 s.print(self._use_logger)
                 continue
-            if s.cmd == 'shutdown':
+            if s.cmd == "shutdown":
                 assert s.rank >= 0 and s.rank not in shutdown
                 assert s.rank not in wait_conn
                 shutdown[s.rank] = s
-                logging.debug('Received %s signal from %d', s.cmd, s.rank)
+                logging.debug("Received %s signal from %d", s.cmd, s.rank)
                 continue
             assert s.cmd in ("start", "recover")
             # lazily initialize the workers
             if tree_map is None:
-                assert s.cmd == 'start'
+                assert s.cmd == "start"
                 if s.world_size > 0:
                     n_workers = s.world_size
                 tree_map, parent_map, ring_map = self.get_link_map(n_workers)
@@ -354,7 +354,7 @@ class RabitTracker:
                 todo_nodes = list(range(n_workers))
             else:
                 assert s.world_size in (-1, n_workers)
-            if s.cmd == 'recover':
+            if s.cmd == "recover":
                 assert s.rank >= 0
             rank = s.decide_rank(job_map)
@@ -410,24 +410,25 @@ def get_host_ip(host_ip: Optional[str] = None) -> str:
     returned as it's
     """
-    if host_ip is None or host_ip == 'auto':
-        host_ip = 'ip'
+    if host_ip is None or host_ip == "auto":
+        host_ip = "ip"

-    if host_ip == 'dns':
+    if host_ip == "dns":
         host_ip = socket.getfqdn()
-    elif host_ip == 'ip':
+    elif host_ip == "ip":
         from socket import gaierror
+
         try:
             host_ip = socket.gethostbyname(socket.getfqdn())
         except gaierror:
             logging.debug(
-                'gethostbyname(socket.getfqdn()) failed... trying on hostname()'
+                "gethostbyname(socket.getfqdn()) failed... trying on hostname()"
             )
             host_ip = socket.gethostbyname(socket.gethostname())
             if host_ip.startswith("127."):
                 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                 # doesn't have to be reachable
-                s.connect(('10.255.255.255', 1))
+                s.connect(("10.255.255.255", 1))
                 host_ip = s.getsockname()[0]

     assert host_ip is not None
@@ -458,25 +459,41 @@ def start_rabit_tracker(args: argparse.Namespace) -> None:

 def main() -> None:
     """Main function if tracker is executed in standalone mode."""
-    parser = argparse.ArgumentParser(description='Rabit Tracker start.')
-    parser.add_argument('--num-workers', required=True, type=int,
-                        help='Number of worker process to be launched.')
+    parser = argparse.ArgumentParser(description="Rabit Tracker start.")
     parser.add_argument(
-        '--num-servers', default=0, type=int,
-        help='Number of server process to be launched. Only used in PS jobs.'
+        "--num-workers",
+        required=True,
+        type=int,
+        help="Number of worker processes to be launched.",
+    )
+    parser.add_argument(
+        "--num-servers",
+        default=0,
+        type=int,
+        help="Number of server processes to be launched. Only used in PS jobs.",
+    )
+    parser.add_argument(
+        "--host-ip",
+        default=None,
+        type=str,
+        help=(
+            "Host IP address; this is only needed "
+            + "if the host IP cannot be automatically guessed."
+        ),
+    )
+    parser.add_argument(
+        "--log-level",
+        default="INFO",
+        type=str,
+        choices=["INFO", "DEBUG"],
+        help="Logging level of the logger.",
     )
-    parser.add_argument('--host-ip', default=None, type=str,
-                        help=('Host IP addressed, this is only needed ' +
-                              'if the host IP cannot be automatically guessed.'))
-    parser.add_argument('--log-level', default='INFO', type=str,
-                        choices=['INFO', 'DEBUG'],
-                        help='Logging level of the logger.')
     args = parser.parse_args()
-    fmt = '%(asctime)s %(levelname)s %(message)s'
-    if args.log_level == 'INFO':
+    fmt = "%(asctime)s %(levelname)s %(message)s"
+    if args.log_level == "INFO":
         level = logging.INFO
-    elif args.log_level == 'DEBUG':
+    elif args.log_level == "DEBUG":
         level = logging.DEBUG
     else:
         raise RuntimeError(f"Unknown logging level {args.log_level}")
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
index eb01cdeec..5ef6eeaa2 100644
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -205,25 +205,29 @@ def train(

 class CVPack:
-    """"Auxiliary datastruct to hold one fold of CV."""
-    def __init__(self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]) -> None:
-        """"Initialize the CVPack"""
+    """Auxiliary data structure to hold one fold of CV."""
+
+    def __init__(
+        self, dtrain: DMatrix, dtest: DMatrix, param: Optional[Union[Dict, List]]
+    ) -> None:
+        """Initialize the CVPack."""
         self.dtrain = dtrain
         self.dtest = dtest
-        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
+        self.watchlist = [(dtrain, "train"), (dtest, "test")]
         self.bst = Booster(param, [dtrain, dtest])

     def __getattr__(self, name: str) -> Callable:
         def _inner(*args: Any, **kwargs: Any) -> Any:
             return getattr(self.bst, name)(*args, **kwargs)
+
         return _inner

     def update(self, iteration: int, fobj: Optional[Objective]) -> None:
-        """"Update the boosters for one iteration"""
+        """Update the boosters for one iteration."""
         self.bst.update(self.dtrain, iteration, fobj)

     def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> str:
-        """"Evaluate the CVPack for one iteration."""
+        """Evaluate the CVPack for one iteration."""
         return self.bst.eval_set(self.watchlist, iteration, feval, output_margin)

@@ -232,38 +236,42 @@ class _PackedBooster:
         self.cvfolds = cvfolds

     def update(self, iteration: int, obj: Optional[Objective]) -> None:
-        '''Iterate through folds for update'''
+        """Iterate through folds for update"""
         for fold in self.cvfolds:
             fold.update(iteration, obj)

-    def eval(self, iteration: int, feval: Optional[Metric], output_margin: bool) -> List[str]:
-        '''Iterate through folds for eval'''
+    def eval(
+        self, iteration: int, feval: Optional[Metric], output_margin: bool
+    ) -> List[str]:
+        """Iterate through folds for eval"""
         result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds]
         return result

     def set_attr(self, **kwargs: Optional[str]) -> Any:
-        '''Iterate through folds for setting attributes'''
+        """Iterate through folds for setting attributes"""
         for f in self.cvfolds:
             f.bst.set_attr(**kwargs)

     def attr(self, key: str) -> Optional[str]:
-        '''Redirect to booster attr.'''
+        """Redirect to booster attr."""
         return self.cvfolds[0].bst.attr(key)

-    def set_param(self,
-                  params: Union[Dict, Iterable[Tuple[str, Any]], str],
-                  value: Optional[str] = None) -> None:
+    def set_param(
+        self,
+        params: Union[Dict, Iterable[Tuple[str, Any]], str],
+        value: Optional[str] = None,
+    ) -> None:
         """Iterate through folds for set_param"""
         for f in self.cvfolds:
f.bst.set_param(params, value) def num_boosted_rounds(self) -> int: - '''Number of boosted rounds.''' + """Number of boosted rounds.""" return self.cvfolds[0].num_boosted_rounds() @property def best_iteration(self) -> int: - '''Get best_iteration''' + """Get best_iteration""" return int(cast(int, self.cvfolds[0].bst.attr("best_iteration"))) @property @@ -279,7 +287,7 @@ def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarr :param boundaries: rows index limits of each group :return: row in group """ - return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups]) + return np.concatenate([np.arange(boundaries[g], boundaries[g + 1]) for g in groups]) def mkgroupfold( @@ -305,11 +313,17 @@ def mkgroupfold( # list by fold of test group indexes out_group_idset = np.array_split(idx, nfold) # list by fold of train group indexes - in_group_idset = [np.concatenate([out_group_idset[i] for i in range(nfold) if k != i]) - for k in range(nfold)] + in_group_idset = [ + np.concatenate([out_group_idset[i] for i in range(nfold) if k != i]) + for k in range(nfold) + ] # from the group indexes, convert them to row indexes - in_idset = [groups_to_rows(in_groups, group_boundaries) for in_groups in in_group_idset] - out_idset = [groups_to_rows(out_groups, group_boundaries) for out_groups in out_group_idset] + in_idset = [ + groups_to_rows(in_groups, group_boundaries) for in_groups in in_group_idset + ] + out_idset = [ + groups_to_rows(out_groups, group_boundaries) for out_groups in out_group_idset + ] # build the folds by taking the appropriate slices ret = [] @@ -324,7 +338,7 @@ def mkgroupfold( dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) else: tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] + plst = list(tparam.items()) + [("eval_metric", itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -348,16 +362,20 @@ def mknfold( if stratified is False and folds is None: # Do standard k-fold cross validation. Automatically determine the folds. 
- if len(dall.get_uint_info('group_ptr')) > 1: - return mkgroupfold(dall, nfold, param, evals=evals, fpreproc=fpreproc, shuffle=shuffle) + if len(dall.get_uint_info("group_ptr")) > 1: + return mkgroupfold( + dall, nfold, param, evals=evals, fpreproc=fpreproc, shuffle=shuffle + ) if shuffle is True: idx = np.random.permutation(dall.num_row()) else: idx = np.arange(dall.num_row()) out_idset = np.array_split(idx, nfold) - in_idset = [np.concatenate([out_idset[i] for i in range(nfold) if k != i]) - for k in range(nfold)] + in_idset = [ + np.concatenate([out_idset[i] for i in range(nfold) if k != i]) + for k in range(nfold) + ] elif folds is not None: # Use user specified custom split using indices try: @@ -387,7 +405,7 @@ def mknfold( dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy()) else: tparam = param - plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals] + plst = list(tparam.items()) + [("eval_metric", itm) for itm in evals] ret.append(CVPack(dtrain, dtest, plst)) return ret @@ -502,29 +520,32 @@ def cv( evaluation history : list(string) """ if stratified is True and not SKLEARN_INSTALLED: - raise XGBoostError('sklearn needs to be installed in order to use stratified cv') + raise XGBoostError( + "sklearn needs to be installed in order to use stratified cv" + ) if isinstance(metrics, str): metrics = [metrics] params = params.copy() if isinstance(params, list): - _metrics = [x[1] for x in params if x[0] == 'eval_metric'] + _metrics = [x[1] for x in params if x[0] == "eval_metric"] params = dict(params) - if 'eval_metric' in params: - params['eval_metric'] = _metrics + if "eval_metric" in params: + params["eval_metric"] = _metrics - if (not metrics) and 'eval_metric' in params: - if isinstance(params['eval_metric'], list): - metrics = params['eval_metric'] + if (not metrics) and "eval_metric" in params: + if isinstance(params["eval_metric"], list): + metrics = params["eval_metric"] else: - metrics = [params['eval_metric']] + metrics = [params["eval_metric"]] params.pop("eval_metric", None) results: Dict[str, List[float]] = {} - cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc, - stratified, folds, shuffle) + cvfolds = mknfold( + dtrain, nfold, params, seed, metrics, fpreproc, stratified, folds, shuffle + ) metric_fn = _configure_custom_metric(feval, custom_metric) @@ -555,20 +576,21 @@ def cv( should_break = callbacks_container.after_iteration(booster, i, dtrain, None) res = callbacks_container.aggregated_cv for key, mean, std in cast(List[Tuple[str, float, float]], res): - if key + '-mean' not in results: - results[key + '-mean'] = [] - if key + '-std' not in results: - results[key + '-std'] = [] - results[key + '-mean'].append(mean) - results[key + '-std'].append(std) + if key + "-mean" not in results: + results[key + "-mean"] = [] + if key + "-std" not in results: + results[key + "-std"] = [] + results[key + "-mean"].append(mean) + results[key + "-std"].append(std) if should_break: for k in results.keys(): # pylint: disable=consider-iterating-dictionary - results[k] = results[k][:(booster.best_iteration + 1)] + results[k] = results[k][: (booster.best_iteration + 1)] break if as_pandas: try: import pandas as pd + results = pd.DataFrame.from_dict(results) except ImportError: pass diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index ac48515fc..82d259581 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -132,16 +132,7 @@ def main(args: argparse.Namespace) -> None: run_black(path) for path 
in [ # core - "python-package/xgboost/__init__.py", - "python-package/xgboost/_typing.py", - "python-package/xgboost/callback.py", - "python-package/xgboost/compat.py", - "python-package/xgboost/config.py", - "python-package/xgboost/dask.py", - "python-package/xgboost/sklearn.py", - "python-package/xgboost/spark", - "python-package/xgboost/federated.py", - "python-package/xgboost/testing", + "python-package/", # tests "tests/python/test_config.py", "tests/python/test_data_iterator.py",