Compare commits

...

9 Commits

Author SHA1 Message Date
Jiaming Yuan
36eb41c960 Bump version to 1.7.6 (#9305) 2023-06-16 03:33:16 +08:00
Jiaming Yuan
39ddf40a8d [backport] Optimize prediction with QuantileDMatrix. (#9096) (#9303) 2023-06-15 23:32:03 +08:00
Jiaming Yuan
573f1c7db4 [backport] Fix monotone constraints on CPU. (#9122) (#9287)
* [backport] Fix monotone constraints on CPU. (#9122)
2023-06-11 17:51:25 +08:00
Jiaming Yuan
abc80d2a6d [backport] Improve doxygen (#8959) (#9284)
* Remove Sphinx build from GH Action

* Build Doxygen as part of RTD build

* Add jQuery

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2023-06-11 13:22:23 +08:00
Jiaming Yuan
e882fb3262 [backport] [spark] Make spark model have the same UID with its estimator (#9022) (#9285)
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
Co-authored-by: WeichenXu <weichen.xu@databricks.com>
2023-06-11 13:18:23 +08:00
Jiaming Yuan
3218f6cd3c [backport] Disable dense opt for distributed training. (#9272) (#9288) 2023-06-11 11:08:45 +08:00
Jiaming Yuan
a962611de7 Disable SHAP test on 1.7 (#9290) 2023-06-11 02:13:36 +08:00
Jiaming Yuan
14476e8868 [backport] Fix tests with pandas 2.0. (#9014) (#9289)
* Fix tests with pandas 2.0.

- `is_categorical` is replaced by `is_categorical_dtype`.
- one hot encoding returns boolean type instead of integer type.
2023-06-11 00:52:44 +08:00
Jiaming Yuan
03f3879b71 [backport] [doc] fix the cudf installation [skip ci] (#9106) (#9286)
Co-authored-by: Bobby Wang <wbo4958@gmail.com>
2023-06-10 04:09:27 +08:00
33 changed files with 284 additions and 233 deletions

View File

@@ -148,66 +148,13 @@ jobs:
run: |
LINT_LANG=cpp make lint
doxygen:
runs-on: ubuntu-latest
name: Generate C/C++ API doc using Doxygen
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: "3.8"
architecture: 'x64'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build
python -m pip install wheel setuptools
python -m pip install awscli
- name: Run Doxygen
run: |
mkdir build
cd build
cmake .. -DBUILD_C_DOC=ON -GNinja
ninja -v doc_doxygen
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- name: Publish
run: |
cd build/
tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/doxygen/ --acl public-read
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
sphinx:
runs-on: ubuntu-latest
name: Build docs using Sphinx
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'
- uses: actions/setup-python@v2
with:
python-version: "3.8"
architecture: 'x64'
- name: Install system packages
run: |
sudo apt-get install -y --no-install-recommends graphviz
python -m pip install wheel setuptools
python -m pip install -r doc/requirements.txt
- name: Extract branch name
shell: bash
run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
id: extract_branch
if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
- name: Run Sphinx
run: |
make -C doc html
env:
SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
python3 dmlc-core/scripts/lint.py --exclude_path \
python-package/xgboost/dmlc-core \
python-package/xgboost/include \
python-package/xgboost/lib \
python-package/xgboost/rabit \
python-package/xgboost/src \
--pylint-rc python-package/.pylintrc \
xgboost \
cpp \
include src python-package

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 1.7.5)
project(xgboost LANGUAGES CXX C VERSION 1.7.6)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 1.7.5.1
Date: 2023-03-29
Version: 1.7.6.1
Date: 2023-06-16
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),

18
R-package/configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for xgboost 1.7.5.
# Generated by GNU Autoconf 2.71 for xgboost 1.7.6.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xgboost'
PACKAGE_TARNAME='xgboost'
PACKAGE_VERSION='1.7.5'
PACKAGE_STRING='xgboost 1.7.5'
PACKAGE_VERSION='1.7.6'
PACKAGE_STRING='xgboost 1.7.6'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures xgboost 1.7.5 to adapt to many kinds of systems.
\`configure' configures xgboost 1.7.6 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1287,7 +1287,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of xgboost 1.7.5:";;
short | recursive ) echo "Configuration of xgboost 1.7.6:";;
esac
cat <<\_ACEOF
@@ -1367,7 +1367,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
xgboost configure 1.7.5
xgboost configure 1.7.6
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by xgboost $as_me 1.7.5, which was
It was created by xgboost $as_me 1.7.6, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by xgboost $as_me 1.7.5, which was
This file was extended by xgboost $as_me 1.7.6, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
xgboost config.status 1.7.5
xgboost config.status 1.7.6
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -2,7 +2,7 @@
AC_PREREQ(2.69)
AC_INIT([xgboost],[1.7.5],[],[xgboost],[])
AC_INIT([xgboost],[1.7.6],[],[xgboost],[])
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then

View File

@@ -8,5 +8,5 @@ As a result it's changing quite often and we don't maintain its stability. Alon
plugin system (see ``plugin/example`` in XGBoost's source tree), users can utilize some
existing c++ headers for gaining more access to the internal of XGBoost.
* `C++ interface documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/files.html>`_
* `C++ interface documentation (latest master branch) <./dev/files.html>`_
* `C++ interface documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/files.html>`_

View File

@@ -10,7 +10,7 @@ simply look at function comments in ``include/xgboost/c_api.h``. The reference i
to sphinx with the help of breathe, which doesn't contain links to examples but might be
easier to read. For the original doxygen pages please visit:
* `C API documentation (latest master branch) <https://xgboost.readthedocs.io/en/latest/dev/c__api_8h.html>`_
* `C API documentation (latest master branch) <./dev/c__api_8h.html>`_
* `C API documentation (last stable release) <https://xgboost.readthedocs.io/en/stable/dev/c__api_8h.html>`_
***************

View File

@@ -11,54 +11,107 @@
#
# All configuration values have a default; values that are commented out
# serve to show the default.
from subprocess import call
from sh.contrib import git
import urllib.request
from urllib.error import HTTPError
import sys
import re
import os
import re
import shutil
import subprocess
import sys
import tarfile
import urllib.request
import warnings
from urllib.error import HTTPError
git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None)
from sh.contrib import git
CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
TMP_DIR = os.path.join(CURR_PATH, "tmp")
DOX_DIR = "doxygen"
def run_doxygen():
    """Build the C/C++ API reference with Doxygen and stage its HTML output.

    The staging directory ``TMP_DIR`` is reset to an empty directory, the CMake
    project is configured inside ``PROJECT_ROOT/DOX_DIR`` with
    ``-DBUILD_C_DOC=ON``, the ``doc_doxygen`` target is built with Ninja, and the
    generated ``html`` tree is copied to ``TMP_DIR/dev``.

    An ``OSError`` raised while building or copying (for example when ``cmake``
    or ``ninja`` is not installed) is written to stderr and otherwise ignored.
    A non-zero exit status from either command is not an ``OSError`` and still
    propagates as ``subprocess.CalledProcessError``. The working directory that
    was current on entry is always restored.
    """
    curdir = os.path.normpath(os.path.abspath(os.path.curdir))
    # Always leave an empty TMP_DIR behind. Previously the directory was either
    # deleted or created, so whether it existed afterwards depended on the
    # leftovers of an earlier build.
    if os.path.exists(TMP_DIR):
        print(f"Delete directory {TMP_DIR}")
        shutil.rmtree(TMP_DIR)
    print(f"Create directory {TMP_DIR}")
    os.mkdir(TMP_DIR)
    try:
        os.chdir(PROJECT_ROOT)
        if not os.path.exists(DOX_DIR):
            os.mkdir(DOX_DIR)
        os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
        print(
            "Build doxygen at {}".format(
                os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen")
            )
        )
        subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
        subprocess.check_call(["ninja", "doc_doxygen"])
        src = os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen", "html")
        dest = os.path.join(TMP_DIR, "dev")
        print(f"Copy directory {src} -> {dest}")
        # copytree requires that dest does not exist yet; TMP_DIR was just emptied.
        shutil.copytree(src, dest)
    except OSError as e:
        sys.stderr.write("doxygen execution failed: %s" % e)
    finally:
        os.chdir(curdir)
def is_readthedocs_build():
    """Return True only when the environment variable READTHEDOCS equals "True".

    In every other case a warning explaining that the Doxygen build is skipped
    is issued and False is returned.
    """
    on_rtd = os.environ.get("READTHEDOCS") == "True"
    if not on_rtd:
        warnings.warn(
            "Skipping Doxygen build... You won't have documentation for C/C++ functions. "
            "Set environment variable READTHEDOCS=True if you want to build Doxygen. "
            "(If you do opt in, make sure to install Doxygen, Graphviz, CMake, and C++ compiler "
            "on your system.)"
        )
    return on_rtd
if is_readthedocs_build():
run_doxygen()
git_branch = os.getenv("SPHINX_GIT_BRANCH", default=None)
if not git_branch:
# If SPHINX_GIT_BRANCH environment variable is not given, run git
# to determine branch name
git_branch = [
re.sub(r'origin/', '', x.lstrip(' ')) for x in str(
git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')
re.sub(r"origin/", "", x.lstrip(" "))
for x in str(git.branch("-r", "--contains", "HEAD")).rstrip("\n").split("\n")
]
git_branch = [x for x in git_branch if 'HEAD' not in x]
git_branch = [x for x in git_branch if "HEAD" not in x]
else:
git_branch = [git_branch]
print('git_branch = {}'.format(git_branch[0]))
print("git_branch = {}".format(git_branch[0]))
try:
filename, _ = urllib.request.urlretrieve(
'https://s3-us-west-2.amazonaws.com/xgboost-docs/{}.tar.bz2'.format(
git_branch[0]))
call(
'if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'
.format(filename),
shell=True)
f"https://s3-us-west-2.amazonaws.com/xgboost-docs/{git_branch[0]}.tar.bz2"
)
if not os.path.exists(TMP_DIR):
print(f"Create directory {TMP_DIR}")
os.mkdir(TMP_DIR)
jvm_doc_dir = os.path.join(TMP_DIR, "jvm")
if os.path.exists(jvm_doc_dir):
print(f"Delete directory {jvm_doc_dir}")
shutil.rmtree(jvm_doc_dir)
print(f"Create directory {jvm_doc_dir}")
os.mkdir(jvm_doc_dir)
with tarfile.open(filename, "r:bz2") as t:
t.extractall(jvm_doc_dir)
except HTTPError:
print('JVM doc not found. Skipping...')
try:
filename, _ = urllib.request.urlretrieve(
'https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'.
format(git_branch[0]))
call(
'mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen'
.format(filename),
shell=True)
except HTTPError:
print('C API doc not found. Skipping...')
print("JVM doc not found. Skipping...")
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
PROJECT_ROOT = os.path.normpath(os.path.join(CURR_PATH, os.path.pardir))
libpath = os.path.join(PROJECT_ROOT, "python-package/")
sys.path.insert(0, libpath)
sys.path.insert(0, CURR_PATH)
@@ -81,50 +134,56 @@ release = xgboost.__version__
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
extensions = [
'matplotlib.sphinxext.plot_directive',
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.mathjax',
'sphinx.ext.intersphinx',
"matplotlib.sphinxext.plot_directive",
"sphinxcontrib.jquery",
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"sphinx.ext.mathjax",
"sphinx.ext.intersphinx",
"sphinx_gallery.gen_gallery",
'breathe',
'recommonmark'
"breathe",
"recommonmark",
]
sphinx_gallery_conf = {
# path to your example scripts
"examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"],
# path to where to save gallery generated output
"gallery_dirs": ["python/examples", "python/dask-examples", "python/survival-examples"],
"gallery_dirs": [
"python/examples",
"python/dask-examples",
"python/survival-examples",
],
"matplotlib_animations": True,
}
autodoc_typehints = "description"
graphviz_output_format = 'png'
plot_formats = [('svg', 300), ('png', 100), ('hires.png', 300)]
graphviz_output_format = "png"
plot_formats = [("svg", 300), ("png", 100), ("hires.png", 300)]
plot_html_show_source_link = False
plot_html_show_formats = False
# Breathe extension variables
DOX_DIR = "doxygen"
breathe_projects = {
"xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
}
breathe_projects = {}
if is_readthedocs_build():
breathe_projects = {
"xgboost": os.path.join(PROJECT_ROOT, DOX_DIR, "doc_doxygen/xml")
}
breathe_default_project = "xgboost"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
templates_path = ["_templates"]
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = ['.rst', '.md']
source_suffix = [".rst", ".md"]
# The encoding of source files.
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
master_doc = "index"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
@@ -133,7 +192,7 @@ master_doc = 'index'
# Usually you set "language" from the command line for these cases.
language = "en"
autoclass_content = 'both'
autoclass_content = "both"
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
@@ -143,8 +202,10 @@ autoclass_content = 'both'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
html_extra_path = ['./tmp']
exclude_patterns = ["_build"]
html_extra_path = []
if is_readthedocs_build():
html_extra_path = [TMP_DIR]
# The reST default role (used for this markup: `text`) to use for all
# documents.
@@ -162,7 +223,7 @@ html_extra_path = ['./tmp']
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
@@ -185,27 +246,24 @@ html_logo = "https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/lo
html_css_files = ["css/custom.css"]
html_sidebars = {
'**': ['logo-text.html', 'globaltoc.html', 'searchbox.html']
}
html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "searchbox.html"]}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = ["_static"]
# Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc'
htmlhelp_basename = project + "doc"
# -- Options for LaTeX output ---------------------------------------------
latex_elements = {
}
latex_elements = {}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, '%s.tex' % project, project, author, 'manual'),
(master_doc, "%s.tex" % project, project, author, "manual"),
]
intersphinx_mapping = {
@@ -220,30 +278,5 @@ intersphinx_mapping = {
}
# hook for doxygen
def run_doxygen():
"""Run the doxygen make command in the designated folder."""
curdir = os.path.normpath(os.path.abspath(os.path.curdir))
try:
os.chdir(PROJECT_ROOT)
if not os.path.exists(DOX_DIR):
os.mkdir(DOX_DIR)
os.chdir(os.path.join(PROJECT_ROOT, DOX_DIR))
subprocess.check_call(["cmake", "..", "-DBUILD_C_DOC=ON", "-GNinja"])
subprocess.check_call(["ninja", "doc_doxygen"])
except OSError as e:
sys.stderr.write("doxygen execution failed: %s" % e)
finally:
os.chdir(curdir)
def generate_doxygen_xml(app):
"""Run the doxygen make commands if we're on the ReadTheDocs server"""
read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
if read_the_docs_build:
run_doxygen()
def setup(app):
app.add_css_file('custom.css')
app.connect("builder-inited", generate_doxygen_xml)
app.add_css_file("custom.css")

View File

@@ -107,8 +107,8 @@ virtualenv and pip:
python -m venv xgboost_env
source xgboost_env/bin/activate
pip install pyarrow pandas venv-pack xgboost
# https://rapids.ai/pip.html#install
pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# https://docs.rapids.ai/install#pip-install
pip install cudf-cu11 --extra-index-url=https://pypi.nvidia.com
venv-pack -o xgboost_env.tar.gz
With Conda:
@@ -240,7 +240,7 @@ additional spark configurations and dependencies:
--master spark://<master-ip>:7077 \
--conf spark.executor.resource.gpu.amount=1 \
--conf spark.task.resource.gpu.amount=1 \
--packages com.nvidia:rapids-4-spark_2.12:22.08.0 \
--packages com.nvidia:rapids-4-spark_2.12:23.04.0 \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.sql.execution.arrow.maxRecordsPerBatch=1000000 \
--archives xgboost_env.tar.gz#environment \

View File

@@ -508,7 +508,7 @@ class RegTree : public Model {
* \brief Drop the trace after fill; must be called after fill. Every entry of the
*        feature vector is reset (flag = -1) and the vector is marked as having missing values.
*/
void Drop(const SparsePage::Inst& inst);
void Drop();
/*!
* \brief returns the size of the feature vector
* \return the size of the feature vector
@@ -709,13 +709,10 @@ inline void RegTree::FVec::Fill(const SparsePage::Inst& inst) {
has_missing_ = data_.size() != feature_count;
}
inline void RegTree::FVec::Drop(const SparsePage::Inst& inst) {
for (auto const& entry : inst) {
if (entry.index >= data_.size()) {
continue;
}
data_[entry.index].flag = -1;
}
inline void RegTree::FVec::Drop() {
Entry e{};
e.flag = -1;
std::fill_n(data_.data(), data_.size(), e);
has_missing_ = true;
}

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 7
#define XGBOOST_VER_PATCH 5
#define XGBOOST_VER_PATCH 6
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j-example_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
<build>
<plugins>
<plugin>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
</parent>
<artifactId>xgboost4j_2.12</artifactId>
<version>1.7.5</version>
<version>1.7.6</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -1 +1 @@
1.7.5
1.7.6

View File

@@ -866,7 +866,11 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
result_xgb_model = self._convert_to_sklearn_model(
bytearray(booster, "utf-8"), config
)
return self._copyValues(self._create_pyspark_model(result_xgb_model))
spark_model = self._create_pyspark_model(result_xgb_model)
# According to the pyspark ML convention, the model uid should be the same
# as the estimator uid.
spark_model._resetUid(self.uid)
return self._copyValues(spark_model)
def write(self):
"""

View File

@@ -149,10 +149,28 @@ common::ColumnMatrix const &GHistIndexMatrix::Transpose() const {
return *columns_;
}
// Look up the gradient (bin) index stored for feature `fidx` of row `ridx`.
//
// Dense matrix: the row's entries are addressed directly, so the value sits at
// offset `fidx` from the start of the row.
// Otherwise: the row's slice of `index` is searched with BinarySearchBin for a
// bin inside the feature's cut range [Ptrs()[fidx], Ptrs()[fidx + 1]).
bst_bin_t GHistIndexMatrix::GetGindex(size_t ridx, size_t fidx) const {
  auto const row_begin = RowIdx(ridx);
  if (!IsDense()) {
    auto const row_end = RowIdx(ridx + 1);
    auto const& feature_ptrs = cut.Ptrs();
    auto const bin_lo = feature_ptrs[fidx];
    auto const bin_hi = feature_ptrs[fidx + 1];
    return BinarySearchBin(row_begin, row_end, index, bin_lo, bin_hi);
  }
  return static_cast<bst_bin_t>(index[row_begin + fidx]);
}
// Convenience overload: fetches the cut pointers, cut values and per-feature
// minimum values from `cut` and forwards to the overload that takes them explicitly.
float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
  auto const &cut_ptrs = cut.Ptrs();
  auto const &cut_values = cut.Values();
  auto const &min_values = cut.MinValues();
  return this->GetFvalue(cut_ptrs, cut_values, min_values, ridx, fidx, is_cat);
}
float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
std::vector<float> const &values, std::vector<float> const &mins,
bst_row_t ridx, bst_feature_t fidx, bool is_cat) const {
if (is_cat) {
auto f_begin = ptrs[fidx];
auto f_end = ptrs[fidx + 1];
@@ -172,24 +190,27 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const {
}
return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
};
if (columns_->GetColumnType(fidx) == common::kDenseColumn) {
if (columns_->AnyMissing()) {
switch (columns_->GetColumnType(fidx)) {
case common::kDenseColumn: {
if (columns_->AnyMissing()) {
return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
return get_bin_val(column);
});
} else {
return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
auto bin_idx = column[ridx];
return common::HistogramCuts::NumericBinValue(ptrs, values, mins, fidx, bin_idx);
});
}
}
case common::kSparseColumn: {
return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
auto column = columns_->DenseColumn<decltype(dtype), true>(fidx);
return get_bin_val(column);
});
} else {
return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
auto column = columns_->DenseColumn<decltype(dtype), false>(fidx);
auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
return get_bin_val(column);
});
}
} else {
return common::DispatchBinType(columns_->GetTypeSize(), [&](auto dtype) {
auto column = columns_->SparseColumn<decltype(dtype)>(fidx, 0);
return get_bin_val(column);
});
}
SPAN_CHECK(false);

View File

@@ -227,7 +227,12 @@ class GHistIndexMatrix {
common::ColumnMatrix const& Transpose() const;
bst_bin_t GetGindex(size_t ridx, size_t fidx) const;
float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const;
float GetFvalue(std::vector<std::uint32_t> const& ptrs, std::vector<float> const& values,
std::vector<float> const& mins, bst_row_t ridx, bst_feature_t fidx,
bool is_cat) const;
private:
std::unique_ptr<common::ColumnMatrix> columns_;

View File

@@ -63,7 +63,7 @@ bst_float PredValue(const SparsePage::Inst &inst,
psum += (*trees[i])[nidx].LeafValue();
}
}
p_feats->Drop(inst);
p_feats->Drop();
return psum;
}
@@ -116,13 +116,11 @@ void FVecFill(const size_t block_size, const size_t batch_offset, const int num_
}
}
template <typename DataView>
void FVecDrop(const size_t block_size, const size_t batch_offset, DataView* batch,
const size_t fvec_offset, std::vector<RegTree::FVec>* p_feats) {
void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset,
std::vector<RegTree::FVec> *p_feats) {
for (size_t i = 0; i < block_size; ++i) {
RegTree::FVec &feats = (*p_feats)[fvec_offset + i];
const SparsePage::Inst inst = (*batch)[batch_offset + i];
feats.Drop(inst);
feats.Drop();
}
}
@@ -142,11 +140,15 @@ struct SparsePageView {
struct GHistIndexMatrixView {
private:
GHistIndexMatrix const &page_;
uint64_t n_features_;
std::uint64_t const n_features_;
common::Span<FeatureType const> ft_;
common::Span<Entry> workspace_;
std::vector<size_t> current_unroll_;
std::vector<std::uint32_t> const& ptrs_;
std::vector<float> const& mins_;
std::vector<float> const& values_;
public:
size_t base_rowid;
@@ -159,6 +161,9 @@ struct GHistIndexMatrixView {
ft_{ft},
workspace_{workplace},
current_unroll_(n_threads > 0 ? n_threads : 1, 0),
ptrs_{_page.cut.Ptrs()},
mins_{_page.cut.MinValues()},
values_{_page.cut.Values()},
base_rowid{_page.base_rowid} {}
SparsePage::Inst operator[](size_t r) {
@@ -167,7 +172,7 @@ struct GHistIndexMatrixView {
size_t non_missing{beg};
for (bst_feature_t c = 0; c < n_features_; ++c) {
float f = page_.GetFvalue(r, c, common::IsCat(ft_, c));
float f = page_.GetFvalue(ptrs_, values_, mins_, r, c, common::IsCat(ft_, c));
if (!common::CheckNAN(f)) {
workspace_[non_missing] = Entry{c, f};
++non_missing;
@@ -250,10 +255,9 @@ void PredictBatchByBlockOfRowsKernel(
FVecFill(block_size, batch_offset, num_feature, &batch, fvec_offset,
p_thread_temp);
// process block of rows through all trees to keep cache locality
PredictByAllTrees(model, tree_begin, tree_end, out_preds,
batch_offset + batch.base_rowid, num_group, thread_temp,
fvec_offset, block_size);
FVecDrop(block_size, batch_offset, &batch, fvec_offset, p_thread_temp);
PredictByAllTrees(model, tree_begin, tree_end, out_preds, batch_offset + batch.base_rowid,
num_group, thread_temp, fvec_offset, block_size);
FVecDrop(block_size, fvec_offset, p_thread_temp);
});
}
@@ -470,7 +474,7 @@ class CPUPredictor : public Predictor {
bst_node_t tid = GetLeafIndex<true, true>(tree, feats, cats);
preds[ridx * ntree_limit + j] = static_cast<bst_float>(tid);
}
feats.Drop(page[i]);
feats.Drop();
});
}
}
@@ -544,7 +548,7 @@ class CPUPredictor : public Predictor {
(tree_weights == nullptr ? 1 : (*tree_weights)[j]);
}
}
feats.Drop(page[i]);
feats.Drop();
// add base margin to BIAS
if (base_margin.Size() != 0) {
CHECK_EQ(base_margin.Shape(1), ngroup);

View File

@@ -389,6 +389,7 @@ class HistEvaluator {
tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
tree[candidate.nid].SplitIndex(), left_weight,
right_weight);
evaluator = tree_evaluator_.GetEvaluator();
auto max_node = std::max(left_child, tree[candidate.nid].RightChild());
max_node = std::max(candidate.nid, max_node);

View File

@@ -48,6 +48,8 @@ class TreeEvaluator {
monotone_.HostVector().resize(n_features, 0);
has_constraint_ = false;
} else {
CHECK_LE(p.monotone_constraints.size(), n_features)
<< "The size of monotone constraint should be less or equal to the number of features.";
monotone_.HostVector() = p.monotone_constraints;
monotone_.HostVector().resize(n_features, 0);
// Initialised to some small size, can grow if needed

View File

@@ -286,7 +286,7 @@ struct GPUHistMakerDevice {
matrix.feature_segments,
matrix.gidx_fvalue_map,
matrix.min_fvalue,
matrix.is_dense
matrix.is_dense && !collective::IsDistributed()
};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs);
return split;
@@ -300,11 +300,11 @@ struct GPUHistMakerDevice {
std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
EvaluateSplitSharedInputs shared_inputs{
GPUTrainingParam{param}, *quantiser, feature_types, matrix.feature_segments,
matrix.gidx_fvalue_map, matrix.min_fvalue,
matrix.is_dense
};
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue,
// is_dense represents the local data
matrix.is_dense && !collective::IsDistributed()};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they don't go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;

View File

@@ -78,7 +78,7 @@ CPUExpandEntry QuantileHistMaker::Builder::InitRoot(
{
GradientPairPrecise grad_stat;
if (p_fmat->IsDense()) {
if (p_fmat->IsDense() && !collective::IsDistributed()) {
/**
* Specialized code for dense data: For dense data (with no missing value), the sum
* of gradient histogram is equal to snode[nid]

View File

@@ -89,7 +89,7 @@ class TreeRefresher : public TreeUpdater {
dmlc::BeginPtr(stemp[tid]) + offset);
offset += tree->param.num_nodes;
}
feats.Drop(inst);
feats.Drop();
});
}
// aggregate the statistics

View File

@@ -31,6 +31,5 @@ dependencies:
- pyspark
- cloudpickle
- pip:
- shap
- awscli
- auditwheel

View File

@@ -34,7 +34,6 @@ dependencies:
- pyarrow
- protobuf
- cloudpickle
- shap
- modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz

View File

@@ -6,6 +6,9 @@
#include <string>
#include "../../../src/tree/constraints.h"
#include "../../../src/tree/hist/evaluate_splits.h"
#include "../../../src/tree/hist/expand_entry.h"
#include "../helpers.h"
namespace xgboost {
namespace tree {
@@ -56,5 +59,38 @@ TEST(CPUFeatureInteractionConstraint, Basic) {
ASSERT_FALSE(constraints.Query(1, 5));
}
// Applies a single split through HistEvaluator while every feature carries the
// monotone constraint value 1, then checks that the evaluator still reports
// `has_constraint`.
TEST(CPUMonoConstraint, Basic) {
  std::size_t n_rows{64};
  std::size_t n_cols{16};
  Context ctx;

  TrainParam param;
  std::vector<std::int32_t> constraints(n_cols, 1);
  I32Array constraint_array;
  for (auto const value : constraints) {
    constraint_array.GetArray().push_back(value);
  }
  Json constraint_json{std::move(constraint_array)};
  std::string constraint_str;
  Json::Dump(constraint_json, &constraint_str);
  // Swap the first and last characters of the dump for parentheses before
  // handing the string over as the parameter value.
  constraint_str.front() = '(';
  constraint_str.back() = ')';

  param.UpdateAllowUnknown(Args{{"monotone_constraints", constraint_str}});

  auto p_fmat = RandomDataGenerator{n_rows, n_cols, 0.0}.GenerateDMatrix(true);
  auto column_sampler = std::make_shared<common::ColumnSampler>();

  HistEvaluator<CPUExpandEntry> evaluator{param, p_fmat->Info(), ctx.Threads(), column_sampler};
  evaluator.InitRoot(GradStats{2.0, 2.0});

  // Hand-built candidate for the root node (nid 0).
  SplitEntry best_split;
  best_split.Update(1.0f, 0, 3.0, false, false, GradStats{1.0, 1.0}, GradStats{1.0, 1.0});
  CPUExpandEntry root_entry{0, 0, best_split};

  RegTree tree;
  tree.param.UpdateAllowUnknown(Args{{"num_feature", std::to_string(n_cols)}});
  evaluator.ApplyTreeSplit(root_entry, &tree);

  ASSERT_TRUE(evaluator.Evaluator().has_constraint);
}
} // namespace tree
} // namespace xgboost

View File

@@ -578,7 +578,7 @@ class TestModels:
y = rng.randn(rows)
feature_names = ["test_feature_" + str(i) for i in range(cols)]
X_pd = pd.DataFrame(X, columns=feature_names)
X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int32)
X_pd[f"test_feature_{3}"] = X_pd.iloc[:, 3].astype(np.int32)
Xy = xgb.DMatrix(X_pd, y)
assert Xy.feature_types[3] == "int"

View File

@@ -75,7 +75,10 @@ class TestPandas:
np.testing.assert_array_equal(result, exp)
dm = xgb.DMatrix(dummies)
assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
assert dm.feature_types == ['int', 'int', 'int', 'int']
if int(pd.__version__[0]) >= 2:
assert dm.feature_types == ['int', 'i', 'i', 'i']
else:
assert dm.feature_types == ['int', 'int', 'int', 'int']
assert dm.num_row() == 3
assert dm.num_col() == 4