merge 23Mar01

This commit is contained in:
amdsc21
2023-05-02 00:05:58 +02:00
258 changed files with 7471 additions and 5379 deletions

View File

@@ -1,56 +0,0 @@
include README.rst
include xgboost/LICENSE
include xgboost/VERSION
include xgboost/CMakeLists.txt
include xgboost/py.typed
recursive-include xgboost *.py
recursive-include xgboost/cmake *
exclude xgboost/cmake/RPackageInstall.cmake.in
exclude xgboost/cmake/RPackageInstallTargetSetup.cmake
exclude xgboost/cmake/Sanitizer.cmake
exclude xgboost/cmake/modules/FindASan.cmake
exclude xgboost/cmake/modules/FindLSan.cmake
exclude xgboost/cmake/modules/FindLibR.cmake
exclude xgboost/cmake/modules/FindTSan.cmake
exclude xgboost/cmake/modules/FindUBSan.cmake
recursive-include xgboost/include *
recursive-include xgboost/plugin *
recursive-include xgboost/src *
recursive-include xgboost/gputreeshap/GPUTreeShap *
include xgboost/rabit/CMakeLists.txt
recursive-include xgboost/rabit/include *
recursive-include xgboost/rabit/src *
prune xgboost/rabit/doc
prune xgboost/rabit/guide
include xgboost/dmlc-core/CMakeLists.txt
recursive-include xgboost/dmlc-core/cmake *
exclude xgboost/dmlc-core/cmake/gtest_cmake.in
exclude xgboost/dmlc-core/cmake/lint.cmake
exclude xgboost/dmlc-core/cmake/Sanitizer.cmake
exclude xgboost/dmlc-core/cmake/Modules/FindASan.cmake
exclude xgboost/dmlc-core/cmake/Modules/FindLSan.cmake
exclude xgboost/dmlc-core/cmake/Modules/FindTSan.cmake
exclude xgboost/dmlc-core/cmake/Modules/FindUBSan.cmake
recursive-include xgboost/dmlc-core/include *
recursive-include xgboost/dmlc-core/include *
recursive-include xgboost/dmlc-core/make *
recursive-include xgboost/dmlc-core/src *
include xgboost/dmlc-core/tracker/dmlc-submit
recursive-include xgboost/dmlc-core/tracker/dmlc_tracker *.py
include xgboost/dmlc-core/tracker/yarn/build.bat
include xgboost/dmlc-core/tracker/yarn/build.sh
include xgboost/dmlc-core/tracker/yarn/pom.xml
recursive-include xgboost/dmlc-core/tracker/yarn/src *
include xgboost/dmlc-core/windows/dmlc.sln
include xgboost/dmlc-core/windows/dmlc/dmlc.vcxproj
prune xgboost/dmlc-core/doc
prune xgboost/dmlc-core/scripts/
global-exclude *.py[oc]

View File

@@ -0,0 +1,22 @@
"""
Custom hook to customize the behavior of Hatchling.
Here, we customize the tag of the generated wheels.
"""
import sysconfig
from typing import Any, Dict
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
def get_tag() -> str:
    """Return the wheel tag matching the platform of the running interpreter.

    The platform string reported by ``sysconfig`` has every ``-`` and ``.``
    replaced with ``_`` and is appended to the fixed ``py3-none-`` prefix.
    """
    sanitized_platform = sysconfig.get_platform().translate(
        str.maketrans("-.", "__")
    )
    return "py3-none-" + sanitized_platform
class CustomBuildHook(BuildHookInterface):
    """Hatchling build hook that overrides the tag of the generated wheel."""

    def initialize(self, version: str, build_data: Dict[str, Any]) -> None:
        """Record the platform wheel tag in ``build_data``.

        This step occurs immediately before each build.
        """
        wheel_tag = get_tag()
        build_data["tag"] = wheel_tag

View File

View File

@@ -0,0 +1,56 @@
"""Build configuration"""
import dataclasses
from typing import Any, Dict, List, Optional
@dataclasses.dataclass
class BuildConfiguration:  # pylint: disable=R0902
    """Configurations used when building libxgboost.

    Every field is a boolean switch. All fields except
    ``use_system_libxgboost`` are forwarded to CMake by ``get_cmake_args()``.
    """

    # Whether to hide C++ symbols in libxgboost.so
    hide_cxx_symbols: bool = True
    # Whether to enable OpenMP
    use_openmp: bool = True
    # Whether to enable CUDA
    use_cuda: bool = False
    # Whether to enable NCCL
    use_nccl: bool = False
    # Whether to enable HDFS
    use_hdfs: bool = False
    # Whether to enable Azure Storage
    use_azure: bool = False
    # Whether to enable AWS S3
    use_s3: bool = False
    # Whether to enable the dense parser plugin
    plugin_dense_parser: bool = False
    # Special option: See explanation below
    use_system_libxgboost: bool = False

    def _set_config_setting(
        self, config_settings: Dict[str, Any], field_name: str
    ) -> None:
        """Set the field ``field_name`` from ``config_settings``.

        The value is treated as true when its lower-cased string form is one
        of ``"true"``, ``"1"`` or ``"on"``; anything else is false.

        Raises
        ------
        ValueError
            If ``field_name`` is not a field of this dataclass, or is absent
            from ``config_settings``.
        """
        known_fields = {x.name for x in dataclasses.fields(self)}
        if field_name not in known_fields or field_name not in config_settings:
            raise ValueError(f"Field {field_name} is not a valid config_settings")
        # str() lets non-string values (e.g. a real bool) be parsed as well
        raw_value = str(config_settings[field_name])
        setattr(self, field_name, raw_value.lower() in ["true", "1", "on"])

    def update(self, config_settings: Optional[Dict[str, Any]]) -> None:
        """Parse config_settings from Pip (or other PEP 517 frontend)

        Only the keys present in ``config_settings`` are applied; fields that
        are not mentioned keep their current values. ``None`` is a no-op.

        Raises
        ------
        ValueError
            If ``config_settings`` contains a key that is not a known field.
        """
        if config_settings is None:
            return
        # Iterate over the supplied keys (not over all fields) so that a
        # partial set of options is accepted.
        for field_name in config_settings:
            self._set_config_setting(config_settings, field_name)

    def get_cmake_args(self) -> List[str]:
        """Convert build configuration to CMake args

        Each field becomes ``-D<FIELD_NAME_UPPERCASE>=ON|OFF``.
        ``use_system_libxgboost`` is skipped.
        """
        cmake_args = []
        for field in dataclasses.fields(self):
            if field.name == "use_system_libxgboost":
                continue
            cmake_value = "ON" if getattr(self, field.name) is True else "OFF"
            cmake_args.append(f"-D{field.name.upper()}={cmake_value}")
        return cmake_args

View File

@@ -0,0 +1,157 @@
"""
Functions for building libxgboost
"""
import logging
import os
import pathlib
import shutil
import subprocess
import sys
from platform import system
from typing import Optional
from .build_config import BuildConfiguration
def _lib_name() -> str:
    """Return the file name of the XGBoost shared library for this platform.

    Raises NotImplementedError when the platform is not recognized.
    """
    platform_name = system()
    if platform_name in ["Linux", "OS400"] or platform_name.upper().endswith("BSD"):
        return "libxgboost.so"
    if platform_name == "Darwin":
        return "libxgboost.dylib"
    if platform_name == "Windows":
        return "xgboost.dll"
    raise NotImplementedError(f"System {platform_name} not supported")
def build_libxgboost(
    cpp_src_dir: pathlib.Path,
    build_dir: pathlib.Path,
    build_config: BuildConfiguration,
) -> pathlib.Path:
    """Build libxgboost inside ``build_dir`` and obtain the path to built libxgboost

    Parameters
    ----------
    cpp_src_dir :
        Directory with the C++ sources; handed to CMake as the source tree.
    build_dir :
        Directory in which CMake and the build tool are executed. On Windows it
        is deleted and re-created after every failed generator attempt.
    build_config :
        Build options, converted to CMake arguments. On non-Windows platforms
        ``use_openmp`` is set to False on this object if the first build fails.

    Returns
    -------
    ``build_dir / "lib" / <platform specific library name>``

    Raises
    ------
    RuntimeError
        If ``cpp_src_dir`` is not a directory, or (Windows) if none of the
        supported generators produced a successful build.
    subprocess.CalledProcessError
        On non-Windows platforms, if the retry without OpenMP fails as well.
    """
    logger = logging.getLogger("xgboost.packager.build_libxgboost")

    if not cpp_src_dir.is_dir():
        raise RuntimeError(f"Expected {cpp_src_dir} to be a directory")
    logger.info(
        "Building %s from the C++ source files in %s...", _lib_name(), str(cpp_src_dir)
    )

    # Only assigned on non-Windows platforms; _build() reads it via closure.
    build_tool: Optional[str] = None

    def _build(*, generator: str) -> None:
        """Configure with CMake using ``generator``, then compile."""
        cmake_cmd = [
            "cmake",
            str(cpp_src_dir),
            generator,
            "-DKEEP_BUILD_ARTIFACTS_IN_BINARY_DIR=ON",
        ]
        cmake_cmd.extend(build_config.get_cmake_args())

        # Flag for cross-compiling for Apple Silicon
        # We use environment variable because it's the only way to pass down custom flags
        # through the cibuildwheel package, which calls `pip wheel` command.
        if "CIBW_TARGET_OSX_ARM64" in os.environ:
            cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64")

        logger.info("CMake args: %s", str(cmake_cmd))
        subprocess.check_call(cmake_cmd, cwd=build_dir)

        if system() == "Windows":
            subprocess.check_call(
                ["cmake", "--build", ".", "--config", "Release"], cwd=build_dir
            )
        else:
            # os.cpu_count() may return None; fall back to a single job
            # instead of passing "-jNone" to the build tool.
            nproc = os.cpu_count() or 1
            assert build_tool is not None
            subprocess.check_call([build_tool, f"-j{nproc}"], cwd=build_dir)

    if system() == "Windows":
        supported_generators = (
            "-GVisual Studio 17 2022",
            "-GVisual Studio 16 2019",
            "-GVisual Studio 15 2017",
            "-GMinGW Makefiles",
        )
        # Try the generators in order and stop at the first one that works
        for generator in supported_generators:
            try:
                _build(generator=generator)
                logger.info(
                    "Successfully built %s using generator %s", _lib_name(), generator
                )
                break
            except subprocess.CalledProcessError as e:
                logger.info(
                    "Tried building with generator %s but failed with exception %s",
                    generator,
                    str(e),
                )
                # Empty build directory
                shutil.rmtree(build_dir)
                build_dir.mkdir()
        else:
            # Reached only when the loop finished without a successful build
            raise RuntimeError(
                "None of the supported generators produced a successful build! "
                f"Supported generators: {supported_generators}"
            )
    else:
        build_tool = "ninja" if shutil.which("ninja") else "make"
        generator = "-GNinja" if build_tool == "ninja" else "-GUnix Makefiles"
        try:
            _build(generator=generator)
        except subprocess.CalledProcessError as e:
            # Retry once with OpenMP switched off
            logger.info("Failed to build with OpenMP. Exception: %s", str(e))
            build_config.use_openmp = False
            _build(generator=generator)

    return build_dir / "lib" / _lib_name()
def locate_local_libxgboost(
    toplevel_dir: pathlib.Path,
    logger: logging.Logger,
) -> Optional[pathlib.Path]:
    """
    Look for a pre-built libxgboost in the lib/ directory that sits next to
    ``toplevel_dir``.

    Returns the path to the library (after logging where it was found) if the
    file exists, otherwise None.
    """
    candidate = toplevel_dir.parent.joinpath("lib", _lib_name())
    if not candidate.exists():
        return None
    logger.info("Found %s at %s", candidate.name, str(candidate.parent))
    return candidate
def locate_or_build_libxgboost(
    toplevel_dir: pathlib.Path,
    build_dir: pathlib.Path,
    build_config: BuildConfiguration,
) -> pathlib.Path:
    """Return a path to libxgboost, building it from source as a last resort.

    Lookup order:
    1. a library found by ``locate_local_libxgboost``;
    2. if ``build_config.use_system_libxgboost`` is set, ``<sys.prefix>/lib``
       (RuntimeError when the library is missing there);
    3. a fresh build from ``cpp_src/`` under ``toplevel_dir`` if it exists,
       else from the parent of ``toplevel_dir`` (RuntimeError when that
       directory has no CMakeLists.txt).
    """
    logger = logging.getLogger("xgboost.packager.locate_or_build_libxgboost")

    local_lib = locate_local_libxgboost(toplevel_dir, logger=logger)
    if local_lib is not None:
        return local_lib

    if build_config.use_system_libxgboost:
        # Find libxgboost from system prefix
        prefix_dir = pathlib.Path(sys.prefix).absolute().resolve()
        system_lib = prefix_dir / "lib" / _lib_name()
        if system_lib.exists():
            logger.info("Using system XGBoost: %s", str(system_lib))
            return system_lib
        raise RuntimeError(
            f"use_system_libxgboost was specified but {_lib_name()} is "
            f"not found in {system_lib.parent}"
        )

    # Source distribution: all C++ source files are found in cpp_src/.
    # Otherwise we are probably running "pip install ." from python-package/.
    bundled_src_dir = toplevel_dir.joinpath("cpp_src")
    cpp_src_dir = bundled_src_dir if bundled_src_dir.exists() else toplevel_dir.parent

    if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
        raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")
    return build_libxgboost(cpp_src_dir, build_dir=build_dir, build_config=build_config)

View File

@@ -0,0 +1,157 @@
"""
Custom build backend for XGBoost Python package.
Builds source distribution and binary wheels, following PEP 517 / PEP 660.
Reuses components of Hatchling (https://github.com/pypa/hatch/tree/master/backend) for the sake
of brevity.
"""
import dataclasses
import logging
import os
import pathlib
import tempfile
from contextlib import contextmanager
from typing import Any, Dict, Iterator, Optional, Union
import hatchling.build
from .build_config import BuildConfiguration
from .nativelib import locate_local_libxgboost, locate_or_build_libxgboost
from .sdist import copy_cpp_src_tree
from .util import copy_with_logging, copytree_with_logging
@contextmanager
def cd(path: Union[str, pathlib.Path]) -> Iterator[str]:  # pylint: disable=C0103
    """
    Context manager that switches the working directory for the ``with`` body.

    Yields the real (symlink-resolved) path that was entered. The previous
    working directory is restored on exit, even if the body raises.

    TODO(hcho3): Remove this once we adopt Python 3.11, which implements contextlib.chdir.
    """
    destination = os.path.realpath(str(path))
    previous_dir = os.getcwd()
    os.chdir(destination)
    try:
        yield destination
    finally:
        os.chdir(previous_dir)
# Absolute, symlink-resolved path of the parent of the directory that
# contains this file.
TOPLEVEL_DIR = pathlib.Path(__file__).parent.parent.absolute().resolve()
# Module-level side effect: configures logging at INFO level on import.
logging.basicConfig(level=logging.INFO)

# Aliases: these hooks are taken from Hatchling unchanged.
get_requires_for_build_sdist = hatchling.build.get_requires_for_build_sdist
get_requires_for_build_wheel = hatchling.build.get_requires_for_build_wheel
get_requires_for_build_editable = hatchling.build.get_requires_for_build_editable
def build_wheel(
    wheel_directory: str,
    config_settings: Optional[Dict[str, Any]] = None,
    metadata_directory: Optional[str] = None,
) -> str:
    """Build a wheel

    The Python package and a located-or-built libxgboost are staged in a
    temporary workspace, and Hatchling is then run from that workspace.

    Parameters
    ----------
    wheel_directory :
        Directory where the wheel is written.
    config_settings :
        Build options from the frontend, parsed into a ``BuildConfiguration``.
    metadata_directory :
        Passed through to Hatchling.

    Returns
    -------
    The value returned by ``hatchling.build.build_wheel``.
    """
    logger = logging.getLogger("xgboost.packager.build_wheel")

    # Hatchling runs after the working directory is changed to the temporary
    # workspace. Make the output locations absolute first so a relative path
    # is not resolved inside the soon-to-be-deleted temp dir.
    wheel_directory = os.path.abspath(wheel_directory)
    if metadata_directory is not None:
        metadata_directory = os.path.abspath(metadata_directory)

    build_config = BuildConfiguration()
    build_config.update(config_settings)
    logger.info("Parsed build configuration: %s", dataclasses.asdict(build_config))

    # Create tempdir with Python package + libxgboost
    with tempfile.TemporaryDirectory() as td:
        td_path = pathlib.Path(td)
        build_dir = td_path / "libbuild"
        build_dir.mkdir()

        workspace = td_path / "whl_workspace"
        workspace.mkdir()
        logger.info("Copying project files to temporary directory %s", str(workspace))

        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)

        pkg_path = workspace / "xgboost"
        copytree_with_logging(TOPLEVEL_DIR / "xgboost", pkg_path, logger=logger)
        lib_path = pkg_path / "lib"
        # The copied package may already contain lib/; do not fail on that.
        lib_path.mkdir(exist_ok=True)
        libxgboost = locate_or_build_libxgboost(
            TOPLEVEL_DIR, build_dir=build_dir, build_config=build_config
        )
        copy_with_logging(libxgboost, lib_path, logger=logger)

        with cd(workspace):
            wheel_name = hatchling.build.build_wheel(
                wheel_directory, config_settings, metadata_directory
            )
    return wheel_name
def build_sdist(
    sdist_directory: str,
    config_settings: Optional[Dict[str, Any]] = None,
) -> str:
    """Build a source distribution

    The Python package, the ``packager`` backend and the C++ source tree are
    staged in a temporary workspace, and Hatchling is then run from there.

    Parameters
    ----------
    sdist_directory :
        Directory where the sdist is written.
    config_settings :
        Must be empty or None.

    Returns
    -------
    The value returned by ``hatchling.build.build_sdist``.

    Raises
    ------
    NotImplementedError
        If ``config_settings`` is non-empty.
    RuntimeError
        If the parent of the project directory has no CMakeLists.txt.
    """
    logger = logging.getLogger("xgboost.packager.build_sdist")

    if config_settings:
        raise NotImplementedError(
            "XGBoost's custom build backend doesn't support config_settings option "
            f"when building sdist. {config_settings=}"
        )

    # Hatchling runs after the working directory is changed to the temporary
    # workspace. Make the output location absolute first so a relative path
    # is not resolved inside the soon-to-be-deleted temp dir.
    sdist_directory = os.path.abspath(sdist_directory)

    cpp_src_dir = TOPLEVEL_DIR.parent
    if not cpp_src_dir.joinpath("CMakeLists.txt").exists():
        raise RuntimeError(f"Did not find CMakeLists.txt from {cpp_src_dir}")

    # Create tempdir with Python package + C++ sources
    with tempfile.TemporaryDirectory() as td:
        td_path = pathlib.Path(td)

        workspace = td_path / "sdist_workspace"
        workspace.mkdir()
        logger.info("Copying project files to temporary directory %s", str(workspace))

        copy_with_logging(TOPLEVEL_DIR / "pyproject.toml", workspace, logger=logger)
        copy_with_logging(TOPLEVEL_DIR / "hatch_build.py", workspace, logger=logger)
        copy_with_logging(TOPLEVEL_DIR / "README.rst", workspace, logger=logger)

        copytree_with_logging(
            TOPLEVEL_DIR / "xgboost", workspace / "xgboost", logger=logger
        )
        copytree_with_logging(
            TOPLEVEL_DIR / "packager", workspace / "packager", logger=logger
        )

        temp_cpp_src_dir = workspace / "cpp_src"
        copy_cpp_src_tree(cpp_src_dir, target_dir=temp_cpp_src_dir, logger=logger)

        with cd(workspace):
            sdist_name = hatchling.build.build_sdist(sdist_directory, config_settings)
    return sdist_name
def build_editable(
    wheel_directory: str,
    config_settings: Optional[Dict[str, Any]] = None,
    metadata_directory: Optional[str] = None,
) -> str:
    """Build an editable installation by delegating to Hatchling.

    Raises NotImplementedError when ``config_settings`` is non-empty, and
    RuntimeError when no locally built libxgboost can be located.
    """
    logger = logging.getLogger("xgboost.packager.build_editable")

    if config_settings:
        raise NotImplementedError(
            "XGBoost's custom build backend doesn't support config_settings option "
            f"when building editable installation. {config_settings=}"
        )

    local_lib = locate_local_libxgboost(TOPLEVEL_DIR, logger=logger)
    if local_lib is None:
        raise RuntimeError(
            "To use the editable installation, first build libxgboost with CMake. "
            "See https://xgboost.readthedocs.io/en/latest/build.html for detailed instructions."
        )

    editable_wheel = hatchling.build.build_editable(
        wheel_directory, config_settings, metadata_directory
    )
    return editable_wheel

View File

@@ -0,0 +1,27 @@
"""
Functions for building sdist
"""
import logging
import pathlib
from .util import copy_with_logging, copytree_with_logging
def copy_cpp_src_tree(
    cpp_src_dir: pathlib.Path, target_dir: pathlib.Path, logger: logging.Logger
) -> None:
    """Copy the C++ source sub-directories and top-level files into ``target_dir``.

    The directories are copied first, followed by CMakeLists.txt and LICENSE.
    """
    source_subdirs = (
        "src",
        "include",
        "dmlc-core",
        "gputreeshap",
        "rabit",
        "cmake",
        "plugin",
    )
    toplevel_files = ("CMakeLists.txt", "LICENSE")

    for dirname in source_subdirs:
        copytree_with_logging(
            cpp_src_dir.joinpath(dirname), target_dir.joinpath(dirname), logger=logger
        )

    for basename in toplevel_files:
        copy_with_logging(cpp_src_dir / basename, target_dir, logger=logger)

View File

@@ -0,0 +1,25 @@
"""
Utility functions for implementing PEP 517 backend
"""
import logging
import pathlib
import shutil
def copytree_with_logging(
    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
) -> None:
    """Log the source and destination, then copy the tree with shutil.copytree()"""
    src_text, dest_text = str(src), str(dest)
    logger.info("Copying %s -> %s", src_text, dest_text)
    shutil.copytree(src, dest)
def copy_with_logging(
    src: pathlib.Path, dest: pathlib.Path, logger: logging.Logger
) -> None:
    """Log the effective target path, then copy the file with shutil.copy()

    When ``dest`` is a directory, the logged target is ``dest / src.name``.
    """
    logged_target = dest / src.name if dest.is_dir() else dest
    logger.info("Copying %s -> %s", str(src), str(logged_target))
    shutil.copy(src, dest)

View File

@@ -0,0 +1,42 @@
# Build backend: the in-tree `packager.pep517` module, found via backend-path.
[build-system]
requires = [
"hatchling>=1.12.1"
]
backend-path = ["."]
build-backend = "packager.pep517"
# Core package metadata
[project]
name = "xgboost"
version = "2.0.0-dev"
authors = [
{name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu"},
{name = "Jiaming Yuan", email = "jm.yuan@outlook.com"}
]
description = "XGBoost Python Package"
readme = {file = "README.rst", content-type = "text/x-rst"}
requires-python = ">=3.8"
license = {text = "Apache-2.0"}
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Development Status :: 5 - Production/Stable",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10"
]
# Mandatory runtime dependencies
dependencies = [
"numpy",
"scipy"
]
# Optional extras, installable as xgboost[<name>]
[project.optional-dependencies]
pandas = ["pandas"]
scikit-learn = ["scikit-learn"]
dask = ["dask", "pandas", "distributed"]
datatable = ["datatable"]
plotting = ["graphviz", "matplotlib"]
pyspark = ["pyspark", "scikit-learn", "cloudpickle"]
# Empty table that declares a custom wheel build hook
# (presumably the one in hatch_build.py; confirm against Hatchling's defaults).
[tool.hatch.build.targets.wheel.hooks.custom]

View File

@@ -16,7 +16,7 @@ def config_doc(
extra_note: Optional[str] = None,
parameters: Optional[str] = None,
returns: Optional[str] = None,
see_also: Optional[str] = None
see_also: Optional[str] = None,
) -> Callable[[_F], _F]:
"""Decorator to format docstring for config functions.

View File

@@ -73,6 +73,7 @@ from .core import (
_deprecate_positional_args,
_expect,
)
from .data import _is_cudf_ser, _is_cupy_array
from .sklearn import (
XGBClassifier,
XGBClassifierBase,
@@ -1894,10 +1895,15 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
)
# pylint: disable=attribute-defined-outside-init
if isinstance(y, (da.Array)):
if isinstance(y, da.Array):
self.classes_ = await self.client.compute(da.unique(y))
else:
self.classes_ = await self.client.compute(y.drop_duplicates())
if _is_cudf_ser(self.classes_):
self.classes_ = self.classes_.to_cupy()
if _is_cupy_array(self.classes_):
self.classes_ = self.classes_.get()
self.classes_ = numpy.array(self.classes_)
self.n_classes_ = len(self.classes_)
if self.n_classes_ > 2:

View File

@@ -30,7 +30,7 @@ def plot_importance(
grid: bool = True,
show_values: bool = True,
values_format: str = "{v}",
**kwargs: Any
**kwargs: Any,
) -> Axes:
"""Plot importance based on fitted trees.
@@ -155,7 +155,7 @@ def to_graphviz(
no_color: Optional[str] = None,
condition_node_params: Optional[dict] = None,
leaf_node_params: Optional[dict] = None,
**kwargs: Any
**kwargs: Any,
) -> GraphvizSource:
"""Convert specified tree to graphviz instance. IPython can automatically plot
the returned graphviz instance. Otherwise, you should call .render() method
@@ -250,7 +250,7 @@ def plot_tree(
num_trees: int = 0,
rankdir: Optional[str] = None,
ax: Optional[Axes] = None,
**kwargs: Any
**kwargs: Any,
) -> Axes:
"""Plot specified tree.

View File

@@ -219,7 +219,9 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments
array: Optional[np.ndarray] = part[feature_cols]
elif part[name].shape[0] > 0:
array = part[name]
array = stack_series(array)
if name == alias.data:
# For the array/vector typed case.
array = stack_series(array)
else:
array = None

View File

@@ -1,4 +1,6 @@
"""Xgboost pyspark integration submodule for params."""
from typing import Dict
# pylint: disable=too-few-public-methods
from pyspark.ml.param import TypeConverters
from pyspark.ml.param.shared import Param, Params
@@ -11,7 +13,7 @@ class HasArbitraryParamsDict(Params):
input.
"""
arbitrary_params_dict: Param[dict] = Param(
arbitrary_params_dict: "Param[Dict]" = Param(
Params._dummy(),
"arbitrary_params_dict",
"arbitrary_params_dict This parameter holds all of the additional parameters which are "

View File

@@ -317,13 +317,15 @@ class TestDataset:
enable_categorical=True,
)
def get_device_dmat(self) -> xgb.QuantileDMatrix:
def get_device_dmat(self, max_bin: Optional[int]) -> xgb.QuantileDMatrix:
import cupy as cp
w = None if self.w is None else cp.array(self.w)
X = cp.array(self.X, dtype=np.float32)
y = cp.array(self.y, dtype=np.float32)
return xgb.QuantileDMatrix(X, y, weight=w, base_margin=self.margin)
return xgb.QuantileDMatrix(
X, y, weight=w, base_margin=self.margin, max_bin=max_bin
)
def get_external_dmat(self) -> xgb.DMatrix:
n_samples = self.X.shape[0]
@@ -431,8 +433,11 @@ def make_ltr(
"""Make a dataset for testing LTR."""
rng = np.random.default_rng(1994)
X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
y = rng.integers(0, max_rel, size=n_samples)
qid = rng.integers(0, n_query_groups, size=n_samples)
y = np.sum(X, axis=1)
y -= y.min()
y = np.round(y / y.max() * max_rel).astype(np.int32)
qid = rng.integers(0, n_query_groups, size=n_samples, dtype=np.int32)
w = rng.normal(0, 1.0, size=n_query_groups)
w -= np.min(w)
w /= np.max(w)
@@ -879,5 +884,12 @@ def data_dir(path: str) -> str:
return os.path.join(demo_dir(path), "data")
def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]:
    """Load the agaricus train and test files (libsvm format) from the data
    directory resolved from ``path`` and return them as ``(dtrain, dtest)``."""
    data_root = data_dir(path)
    return (
        xgb.DMatrix(os.path.join(data_root, "agaricus.txt.train?format=libsvm")),
        xgb.DMatrix(os.path.join(data_root, "agaricus.txt.test?format=libsvm")),
    )
def project_root(path: str) -> str:
    """Return the normalized path of the parent of ``demo_dir(path)``."""
    parent_of_demo = os.path.join(demo_dir(path), os.path.pardir)
    return normpath(parent_of_demo)