Compare commits

...

31 Commits

Author SHA1 Message Date
Jiaming Yuan
21d95f3d8f [backport] [doc][R] Update link. (#8998) (#9001) 2023-03-30 20:02:31 +08:00
Jiaming Yuan
5cd4015d70 [backport] Fill column size. (#8997) 2023-03-30 15:21:42 +08:00
Jiaming Yuan
b8c6b86792 Bump version to 1.7.5. (#8994) 2023-03-29 21:41:10 +08:00
Jiaming Yuan
1baebe231b [backport] [CI] Fix Windows wheel to be compatible with Poetry (#8991) (#8992)
* [CI] Fix Windows wheel to be compatible with Poetry

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2023-03-29 14:26:20 +08:00
Jiaming Yuan
365da0b8f4 [backport] [doc] Add missing document for pyspark ranker. (#8692) (#8990) 2023-03-29 12:02:51 +08:00
Jiaming Yuan
f5f03dfb61 [backport] Update dmlc-core to get C++17 deprecation warning (#8855) (#8982)
Co-authored-by: Rong Ou <rong.ou@gmail.com>
2023-03-27 21:31:30 +08:00
Jiaming Yuan
a1c209182d [backport] Update c++ requirement to 17 for the R package. (#8860) (#8983) 2023-03-27 18:24:25 +08:00
Jiaming Yuan
4be75d852c [backport] Fix scope of feature set pointers (#8850) (#8972)
---------

Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>
2023-03-27 00:33:08 +08:00
Jiaming Yuan
ba50e6eb62 [backport] [CI] Require C++17 + CMake 3.18; Use CUDA 11.8 in CI (#8853) (#8971)
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2023-03-26 00:10:03 +08:00
Jiaming Yuan
36ad160501 Bump version to 1.7.4. (#8805) 2023-02-16 06:40:01 +08:00
Jiaming Yuan
c22f6db4bf [backport] Fix CPU bin compression with categorical data. (#8809) (#8810)
* [backport] Fix CPU bin compression with categorical data. (#8809)

* Fix CPU bin compression with categorical data.

* The bug causes the maximum category to be lesser than 256 or the maximum number of bins when
the input data is dense.

* Avoid test symbol.
2023-02-16 06:39:25 +08:00
Jiaming Yuan
f15a6d2b19 [backport] Fix ranking with quantile dmatrix and group weight. (#8762) (#8800)
* [backport] Fix ranking with quantile dmatrix and group weight. (#8762)

* backport test utilities.
2023-02-15 02:45:09 +08:00
Jiaming Yuan
08a547f5c2 [backport] Fix feature types param (#8772) (#8801)
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
Co-authored-by: WeichenXu <weichen.xu@databricks.com>
2023-02-15 01:39:20 +08:00
Jiaming Yuan
60303db2ee [backport] Fix GPU L1 error. (#8749) (#8770)
* [backport] Fix GPU L1 error. (#8749)

* Fix backport.
2023-02-09 20:16:39 +08:00
Jiaming Yuan
df984f9c43 [backport] Fix different number of features in gpu_hist evaluator. (#8754) (#8769)
Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>
2023-02-09 18:31:49 +08:00
Jiaming Yuan
2f22f8d49b [backport] Make sure input numpy array is aligned. (#8690) (#8696) (#8734)
* [backport] Make sure input numpy array is aligned. (#8690)

- use `np.require` to specify that the alignment is required.
- scipy csr as well.
- validate input pointer in `ArrayInterface`.

* Workaround CUDA warning. (#8696)

* backport from half type support for alignment.

* fix import.
2023-02-06 16:58:15 +08:00
Jiaming Yuan
68d86336d7 [backport] [R] fix OpenMP detection on macOS (#8684) (#8732)
Co-authored-by: James Lamb <jaylamb20@gmail.com>
2023-01-29 12:43:10 +08:00
Jiaming Yuan
76bdca072a [R] Fix threads used to create DMatrix in predict. (#8681) (#8682) 2023-01-15 04:00:31 +08:00
Jiaming Yuan
021e6a842a [backport] [R] Get CXX flags from R CMD config. (#8669) (#8680) 2023-01-14 18:46:59 +08:00
Jiaming Yuan
e5bef4ffce [backport] Fix threads in DMatrix slice. (#8667) (#8679) 2023-01-14 18:46:04 +08:00
Jiaming Yuan
10bb0a74ef [backport] [CI] Skip pyspark sparse tests. (#8675) (#8678) 2023-01-14 06:40:17 +08:00
Jiaming Yuan
e803d06d8c [backport] [R] Remove unused assert definition. (#8526) (#8668) 2023-01-13 04:55:29 +08:00
Jiaming Yuan
ccf43d4ba0 Bump R package version to 1.7.3. (#8649) 2023-01-06 20:34:05 +08:00
Jiaming Yuan
dd58c2ac47 Bump version to 1.7.3. (#8646) 2023-01-06 17:55:51 +08:00
Jiaming Yuan
899e4c8988 [backport] Do not return internal value for get_params. (#8634) (#8642) 2023-01-06 02:28:39 +08:00
Jiaming Yuan
a2085bf223 [backport] Fix loading GPU pickle with a CPU-only xgboost distribution. (#8632) (#8641)
We can handle loading the pickle on a CPU-only machine if the XGBoost is built with CUDA
enabled (Linux and Windows PyPI package), but not if the distribution is CPU-only (macOS
PyPI package).
2023-01-06 02:28:21 +08:00
Jiaming Yuan
067b704e58 [backport] Fix inference with categorical feature. (#8591) (#8602) (#8638)
* Fix inference with categorical feature. (#8591)

* Fix windows build on buildkite. (#8602)

* workaround.
2023-01-06 01:17:49 +08:00
Jiaming Yuan
1a834b2b85 Fix linalg iterator. (#8603) (#8639) 2023-01-05 23:16:10 +08:00
Jiaming Yuan
162b48a1a4 [backport] [CI] Disable gtest with RMM (#8620) (#8640)
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2023-01-05 23:13:45 +08:00
Jiaming Yuan
83a078b7e5 [backport] Fix sklearn test that calls a removed field (#8579) (#8636)
Co-authored-by: Rong Ou <rong.ou@gmail.com>
2023-01-05 21:17:05 +08:00
Jiaming Yuan
575fba651b [backport] [CI] Fix CI with updated dependencies. (#8631) (#8635) 2023-01-05 19:10:58 +08:00
99 changed files with 1987 additions and 1329 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 1.7.2)
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 1.7.5)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW)
@@ -168,9 +168,6 @@ find_package(Threads REQUIRED)
if (USE_OPENMP)
if (APPLE)
# Require CMake 3.16+ on Mac OSX, as previous versions of CMake had trouble locating
# OpenMP on Mac. See https://github.com/dmlc/xgboost/pull/5146#issuecomment-568312706
cmake_minimum_required(VERSION 3.16)
find_package(OpenMP)
if (NOT OpenMP_FOUND)
# Try again with extra path info; required for libomp 15+ from Homebrew

View File

@@ -31,7 +31,7 @@ if (USE_OPENMP)
endif (USE_OPENMP)
set_target_properties(
xgboost-r PROPERTIES
CXX_STANDARD 14
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)

View File

@@ -1,8 +1,8 @@
Package: xgboost
Type: Package
Title: Extreme Gradient Boosting
Version: 1.7.2.1
Date: 2022-12-08
Version: 1.7.5.1
Date: 2023-03-29
Authors@R: c(
person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"),
@@ -66,5 +66,6 @@ Imports:
methods,
data.table (>= 1.9.6),
jsonlite (>= 1.0),
RoxygenNote: 7.2.1
SystemRequirements: GNU make, C++14
RoxygenNote: 7.2.3
Encoding: UTF-8
SystemRequirements: GNU make, C++17

View File

@@ -1,9 +1,9 @@
Copyright (c) 2014 by Tianqi Chen and Contributors
Copyright (c) 2014-2023, Tianqi Chen and XBGoost Contributors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software

View File

@@ -328,8 +328,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE)
if (!inherits(newdata, "xgb.DMatrix"))
newdata <- xgb.DMatrix(newdata, missing = missing)
newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
if (!is.null(object[["feature_names"]]) &&
!is.null(colnames(newdata)) &&
!identical(object[["feature_names"]], colnames(newdata)))

View File

@@ -34,7 +34,7 @@
#' The branches that also used for missing values are marked as bold
#' (as in "carrying extra capacity").
#'
#' This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#' This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
#'
#' @return
#'

1831
R-package/configure vendored

File diff suppressed because it is too large Load Diff

View File

@@ -2,10 +2,25 @@
AC_PREREQ(2.69)
AC_INIT([xgboost],[1.7.2],[],[xgboost],[])
AC_INIT([xgboost],[1.7.5],[],[xgboost],[])
# Use this line to set CC variable to a C compiler
AC_PROG_CC
: ${R_HOME=`R RHOME`}
if test -z "${R_HOME}"; then
echo "could not determine R_HOME"
exit 1
fi
CXX17=`"${R_HOME}/bin/R" CMD config CXX17`
CXX17STD=`"${R_HOME}/bin/R" CMD config CXX17STD`
CXX="${CXX17} ${CXX17STD}"
CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
CC=`"${R_HOME}/bin/R" CMD config CC`
CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
AC_LANG(C++)
### Check whether backtrace() is part of libc or the external lib libexecinfo
AC_MSG_CHECKING([Backtrace lib])
@@ -40,7 +55,7 @@ then
ac_pkg_openmp=no
AC_MSG_CHECKING([whether OpenMP will work in a package])
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
${CC} -o conftest conftest.c ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
${CXX} -o conftest conftest.cpp ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
AC_MSG_RESULT([${ac_pkg_openmp}])
if test "${ac_pkg_openmp}" = no; then
OPENMP_CXXFLAGS=''

View File

@@ -67,7 +67,7 @@ The "Yes" branches are marked by the "< split_value" label.
The branches that also used for missing values are marked as bold
(as in "carrying extra capacity").
This function uses \href{http://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
This function uses \href{https://www.graphviz.org/}{GraphViz} as a backend of DiagrammeR.
}
\examples{
data(agaricus.train, package='xgboost')

View File

@@ -3,7 +3,7 @@ PKGROOT=../../
ENABLE_STD_THREAD=1
# _*_ mode: Makefile; _*_
CXX_STD = CXX14
CXX_STD = CXX17
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -23,7 +23,6 @@ PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
OBJECTS= \
./xgboost_R.o \
./xgboost_custom.o \
./xgboost_assert.o \
./init.o \
$(PKGROOT)/src/metric/metric.o \
$(PKGROOT)/src/metric/elementwise_metric.o \

View File

@@ -3,7 +3,7 @@ PKGROOT=../../
ENABLE_STD_THREAD=0
# _*_ mode: Makefile; _*_
CXX_STD = CXX14
CXX_STD = CXX17
XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\
-DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\
@@ -23,7 +23,6 @@ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) -DDMLC_CMAKE_LITTLE_ENDIAN=1 $(SHLIB_PTHRE
OBJECTS= \
./xgboost_R.o \
./xgboost_custom.o \
./xgboost_assert.o \
./init.o \
$(PKGROOT)/src/metric/metric.o \
$(PKGROOT)/src/metric/elementwise_metric.o \

View File

@@ -1,26 +0,0 @@
// Copyright (c) 2014 by Contributors
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>
// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("AssertError:%s\n", buf);
}
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("%s\n", buf);
}
}

View File

@@ -178,17 +178,10 @@ function(xgboost_set_cuda_flags target)
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>)
endif (MSVC)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
else ()
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 14
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
endif (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CUDA_STANDARD 17
CUDA_STANDARD_REQUIRED ON
CUDA_SEPARABLE_COMPILATION OFF)
endfunction(xgboost_set_cuda_flags)
macro(xgboost_link_nccl target)
@@ -205,17 +198,10 @@ endmacro(xgboost_link_nccl)
# compile options
macro(xgboost_target_properties target)
if (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
else ()
set_target_properties(${target} PROPERTIES
CXX_STANDARD 14
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
endif (PLUGIN_RMM)
set_target_properties(${target} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
if (HIDE_CXX_SYMBOLS)
#-- Hide all C++ symbols

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(xgboost-c-examples)
add_subdirectory(basic)

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(external-memory-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(inference-demo LANGUAGES C VERSION 0.0.1)
find_package(xgboost REQUIRED)

View File

@@ -173,3 +173,13 @@ PySpark API
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: xgboost.spark.SparkXGBRanker
:members:
:inherited-members:
:show-inheritance:
.. autoclass:: xgboost.spark.SparkXGBRankerModel
:members:
:inherited-members:
:show-inheritance:

View File

@@ -45,7 +45,7 @@ Use ``find_package()`` and ``target_link_libraries()`` in your application's CMa
.. code-block:: cmake
cmake_minimum_required(VERSION 3.13)
cmake_minimum_required(VERSION 3.18)
project(your_project_name LANGUAGES C CXX VERSION your_project_version)
find_package(xgboost REQUIRED)
add_executable(your_project_name /path/to/project_file.c)

View File

@@ -138,11 +138,11 @@ Miscellaneous
By default, XGBoost assumes input categories are integers starting from 0 till the number
of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
values due to mistakes or missing values. It can be negative value, integer values that
can not be accurately represented by 32-bit floating point, or values that are larger than
actual number of unique categories. During training this is validated but for prediction
it's treated as the same as missing value for performance reasons. Lastly, missing values
are treated as the same as numerical features (using the learned split direction).
values due to mistakes or missing values in training dataset. It can be negative value,
integer values that can not be accurately represented by 32-bit floating point, or values
that are larger than actual number of unique categories. During training this is
validated but for prediction it's treated as the same as not-chosen category for
performance reasons.
**********

View File

@@ -43,10 +43,10 @@ in spark estimator, and some parameters are replaced with pyspark specific param
such as `weight_col`, `validation_indicator_col`, `use_gpu`, for details please see
`SparkXGBRegressor` doc.
The following code snippet shows how to train a spark xgboost regressor model,
first we need to prepare a training dataset as a spark dataframe contains
"label" column and "features" column(s), the "features" column(s) must be `pyspark.ml.linalg.Vector`
type or spark array type or a list of feature column names.
The following code snippet shows how to train a spark xgboost regressor model, first we
need to prepare a training dataset as a spark dataframe contains "label" column and
"features" column(s), the "features" column(s) must be ``pyspark.ml.linalg.Vector`` type
or spark array type or a list of feature column names.
.. code-block:: python
@@ -54,10 +54,10 @@ type or spark array type or a list of feature column names.
xgb_regressor_model = xgb_regressor.fit(train_spark_dataframe)
The following code snippet shows how to predict test data using a spark xgboost regressor model,
first we need to prepare a test dataset as a spark dataframe contains
"features" and "label" column, the "features" column must be `pyspark.ml.linalg.Vector`
type or spark array type.
The following code snippet shows how to predict test data using a spark xgboost regressor
model, first we need to prepare a test dataset as a spark dataframe contains "features"
and "label" column, the "features" column must be ``pyspark.ml.linalg.Vector`` type or
spark array type.
.. code-block:: python

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 7
#define XGBOOST_VER_PATCH 2
#define XGBOOST_VER_PATCH 5
#endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j-example_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
<build>
<plugins>
<plugin>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -1,9 +1,9 @@
from sklearn.datasets import load_iris
import numpy as np
import pandas
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
y = y.astype(np.int)
y = y.astype(np.int32)
df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
df['class'] = np.vectorize(class_id_to_name.get)(y)

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
</parent>
<artifactId>xgboost4j_2.12</artifactId>
<version>1.7.2</version>
<version>1.7.5</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -15,7 +15,7 @@ if (PLUGIN_UPDATER_ONEAPI)
target_link_libraries(oneapi_plugin PUBLIC -fsycl)
set_target_properties(oneapi_plugin PROPERTIES
COMPILE_FLAGS -fsycl
CXX_STANDARD 14
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
POSITION_INDEPENDENT_CODE ON)
if (USE_OPENMP)

View File

@@ -1 +1 @@
1.7.2
1.7.5

View File

@@ -36,7 +36,6 @@ try:
PANDAS_INSTALLED = True
except ImportError:
MultiIndex = object
DataFrame = object
Series = object
@@ -161,6 +160,7 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem
# `importlib.utils`, except it's unclear from its document on how to use it. This one
# seems to be easy to understand and works out of box.
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this

View File

@@ -2172,6 +2172,7 @@ class Booster:
)
return _prediction_output(shape, dims, preds, False)
# pylint: disable=too-many-statements
def inplace_predict(
self,
data: DataType,
@@ -2192,10 +2193,10 @@ class Booster:
.. code-block:: python
booster.set_param({'predictor': 'gpu_predictor'})
booster.set_param({"predictor": "gpu_predictor"})
booster.inplace_predict(cupy_array)
booster.set_param({'predictor': 'cpu_predictor})
booster.set_param({"predictor": "cpu_predictor"})
booster.inplace_predict(numpy_array)
.. versionadded:: 1.1.0
@@ -2301,14 +2302,16 @@ class Booster:
)
return _prediction_output(shape, dims, preds, False)
if isinstance(data, scipy.sparse.csr_matrix):
csr = data
from .data import _transform_scipy_csr
data = _transform_scipy_csr(data)
_check_call(
_LIB.XGBoosterPredictFromCSR(
self.handle,
_array_interface(csr.indptr),
_array_interface(csr.indices),
_array_interface(csr.data),
c_bst_ulong(csr.shape[1]),
_array_interface(data.indptr),
_array_interface(data.indices),
_array_interface(data.data),
c_bst_ulong(data.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
p_handle,
ctypes.byref(shape),

View File

@@ -30,6 +30,7 @@ from .core import (
c_array,
c_str,
from_pystr_to_cstr,
make_jcargs,
)
DispatchedDataBackendReturnType = Tuple[
@@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes:
return interface_str
def _transform_scipy_csr(data: DataType) -> DataType:
from scipy.sparse import csr_matrix
indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
values, _ = _ensure_np_dtype(data.data, data.data.dtype)
if (
indptr is not data.indptr
or indices is not data.indices
or values is not data.data
):
data = csr_matrix((values, indices, indptr), shape=data.shape)
return data
def _from_scipy_csr(
data: DataType,
missing: FloatCompatible,
@@ -93,18 +109,14 @@ def _from_scipy_csr(
f"length mismatch: {len(data.indices)} vs {len(data.data)}"
)
handle = ctypes.c_void_p()
args = {
"missing": float(missing),
"nthread": int(nthread),
}
config = bytes(json.dumps(args), "utf-8")
data = _transform_scipy_csr(data)
_check_call(
_LIB.XGDMatrixCreateFromCSR(
_array_interface(data.indptr),
_array_interface(data.indices),
_array_interface(data.data),
c_bst_ulong(data.shape[1]),
config,
make_jcargs(missing=float(missing), nthread=int(nthread)),
ctypes.byref(handle),
)
)
@@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool:
def _ensure_np_dtype(
data: DataType,
dtype: Optional[NumpyDType]
data: DataType, dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
data = data.astype(np.float32, copy=False)
dtype = np.float32
data = data.astype(dtype, copy=False)
if not data.flags.aligned:
data = np.require(data, requirements="A")
return data, dtype
@@ -1197,11 +1210,13 @@ def _proxy_transform(
data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types
if _is_scipy_csr(data):
data = _transform_scipy_csr(data)
return data, None, feature_names, feature_types
if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
arr, _ = _ensure_np_dtype(arr, arr.dtype)
return arr, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))

View File

@@ -674,7 +674,7 @@ class XGBModel(XGBModelBase):
self.kwargs = {}
self.kwargs[key] = value
if hasattr(self, "_Booster"):
if self.__sklearn_is_fitted__():
parameters = self.get_xgb_params()
self.get_booster().set_param(parameters)
@@ -701,39 +701,12 @@ class XGBModel(XGBModelBase):
np.iinfo(np.int32).max
)
def parse_parameter(value: Any) -> Optional[Union[int, float, str]]:
for t in (int, float, str):
try:
ret = t(value)
return ret
except ValueError:
continue
return None
# Get internal parameter values
try:
config = json.loads(self.get_booster().save_config())
stack = [config]
internal = {}
while stack:
obj = stack.pop()
for k, v in obj.items():
if k.endswith("_param"):
for p_k, p_v in v.items():
internal[p_k] = p_v
elif isinstance(v, dict):
stack.append(v)
for k, v in internal.items():
if k in params and params[k] is None:
params[k] = parse_parameter(v)
except ValueError:
pass
return params
def get_xgb_params(self) -> Dict[str, Any]:
"""Get xgboost specific parameters."""
params = self.get_params()
params: Dict[str, Any] = self.get_params()
# Parameters that should not go into native learner.
wrapper_specific = {
"importance_type",
@@ -750,6 +723,7 @@ class XGBModel(XGBModelBase):
for k, v in params.items():
if k not in wrapper_specific and not callable(v):
filtered[k] = v
return filtered
def get_num_boosting_rounds(self) -> int:
@@ -1070,7 +1044,7 @@ class XGBModel(XGBModelBase):
# error with incompatible data type.
# Inplace predict doesn't handle as many data types as DMatrix, but it's
# sufficient for dask interface where input is simpiler.
predictor = self.get_params().get("predictor", None)
predictor = self.get_xgb_params().get("predictor", None)
if predictor in ("auto", None) and self.booster != "gblinear":
return True
return False
@@ -1336,7 +1310,7 @@ class XGBModel(XGBModelBase):
-------
coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]``
"""
if self.get_params()["booster"] != "gblinear":
if self.get_xgb_params()["booster"] != "gblinear":
raise AttributeError(
f"Coefficients are not defined for Booster type {self.booster}"
)
@@ -1366,7 +1340,7 @@ class XGBModel(XGBModelBase):
-------
intercept_ : array of shape ``(1,)`` or ``[n_classes]``
"""
if self.get_params()["booster"] != "gblinear":
if self.get_xgb_params()["booster"] != "gblinear":
raise AttributeError(
f"Intercept (bias) is not defined for Booster type {self.booster}"
)

View File

@@ -1,6 +1,5 @@
# type: ignore
"""PySpark XGBoost integration interface
"""
"""PySpark XGBoost integration interface"""
try:
import pyspark
@@ -11,6 +10,7 @@ from .estimator import (
SparkXGBClassifier,
SparkXGBClassifierModel,
SparkXGBRanker,
SparkXGBRankerModel,
SparkXGBRegressor,
SparkXGBRegressorModel,
)
@@ -21,4 +21,5 @@ __all__ = [
"SparkXGBRegressor",
"SparkXGBRegressorModel",
"SparkXGBRanker",
"SparkXGBRankerModel",
]

View File

@@ -140,6 +140,13 @@ _unsupported_predict_params = {
}
# TODO: supply hint message for all other unsupported params.
_unsupported_params_hint_message = {
"enable_categorical": "`xgboost.spark` estimators do not have 'enable_categorical' param, "
"but you can set `feature_types` param and mark categorical features with 'c' string."
}
class _SparkXGBParams(
HasFeaturesCol,
HasLabelCol,
@@ -523,7 +530,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
or k in _unsupported_predict_params
or k in _unsupported_train_params
):
raise ValueError(f"Unsupported param '{k}'.")
err_msg = _unsupported_params_hint_message.get(
k, f"Unsupported param '{k}'."
)
raise ValueError(err_msg)
_extra_params[k] = v
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -749,6 +759,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
"feature_weights": self.getOrDefault(self.feature_weights),
"missing": float(self.getOrDefault(self.missing)),
}
if dmatrix_kwargs["feature_types"] is not None:
dmatrix_kwargs["enable_categorical"] = True
booster_params["nthread"] = cpu_per_task
use_gpu = self.getOrDefault(self.use_gpu)

View File

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.3)
cmake_minimum_required(VERSION 3.18)
find_package(Threads REQUIRED)

View File

@@ -48,20 +48,21 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
return cat < 0 || cat >= kMaxCat;
}
/* \brief Whether should it traverse to left branch of a tree.
/**
* \brief Whether should it traverse to left branch of a tree.
*
* For one hot split, go to left if it's NOT the matching category.
* Go to left if it's NOT the matching category, which matches one-hot encoding.
*/
template <bool validate = true>
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
KCatBitField const s_cats(cats);
// FIXME: Size() is not accurate since it represents the size of bit set instead of
// actual number of categories.
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
return dft_left;
if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
return true;
}
auto pos = KCatBitField::ToBitPos(cat);
// If the input category is larger than the size of the bit field, it implies that the
// category is not chosen. Otherwise the bit field would have the category instead of
// being smaller than the category value.
if (pos.int_pos >= cats.size()) {
return true;
}

View File

@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
feature_offsets_[fid] = accum_index;
}
SetTypeSize(gmat.max_num_bins);
SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0);

View File

@@ -62,7 +62,7 @@ void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t,
#endif // !defined(XGBOOST_USE_CUDA)
template <typename T, std::int32_t kDim>
auto cbegin(TensorView<T, kDim> v) { // NOLINT
auto cbegin(TensorView<T, kDim> const& v) { // NOLINT
auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
});
@@ -70,19 +70,19 @@ auto cbegin(TensorView<T, kDim> v) { // NOLINT
}
template <typename T, std::int32_t kDim>
auto cend(TensorView<T, kDim> v) { // NOLINT
auto cend(TensorView<T, kDim> const& v) { // NOLINT
return cbegin(v) + v.Size();
}
template <typename T, std::int32_t kDim>
auto begin(TensorView<T, kDim> v) { // NOLINT
auto begin(TensorView<T, kDim>& v) { // NOLINT
auto it = common::MakeIndexTransformIter(
[&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
return it;
}
template <typename T, std::int32_t kDim>
auto end(TensorView<T, kDim> v) { // NOLINT
auto end(TensorView<T, kDim>& v) { // NOLINT
return begin(v) + v.Size();
}
} // namespace linalg

View File

@@ -144,7 +144,7 @@ class PartitionBuilder {
auto gidx = gidx_calc(ridx);
bool go_left = default_left;
if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx], default_left);
go_left = Decision(node_cats, cut_values[gidx]);
}
return go_left;
} else {
@@ -157,7 +157,7 @@ class PartitionBuilder {
bool go_left = default_left;
if (gidx > -1) {
if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx], default_left);
go_left = Decision(node_cats, cut_values[gidx]);
} else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
}

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2019-2021 by Contributors
/**
* Copyright 2019-2023 by XGBoost Contributors
* \file array_interface.h
* \brief View of __array_interface__
*/
@@ -7,9 +7,11 @@
#define XGBOOST_DATA_ARRAY_INTERFACE_H_
#include <algorithm>
#include <cinttypes>
#include <cstddef> // std::size_t
#include <cstdint>
#include <map>
#include <string>
#include <type_traits> // std::alignment_of,std::remove_pointer_t
#include <utility>
#include <vector>
@@ -394,6 +396,11 @@ class ArrayInterface {
data = ArrayInterfaceHandler::ExtractData(array, n);
static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
auto alignment = this->ElementAlignment();
auto ptr = reinterpret_cast<uintptr_t>(this->data);
CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment.";
if (allow_mask) {
common::Span<RBitField8::value_type> s_mask;
size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
@@ -512,9 +519,15 @@ class ArrayInterface {
return func(reinterpret_cast<uint64_t const *>(data));
}
XGBOOST_DEVICE size_t ElementSize() {
return this->DispatchCall(
[](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); });
XGBOOST_DEVICE std::size_t ElementSize() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
});
}
XGBOOST_DEVICE std::size_t ElementAlignment() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
});
}
template <typename T = float, typename... Index>

View File

@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
common::Span<float> hess) {
common::Span<float> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in
// computation time (but higher memory usage).
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
max_num_bins = max_bins_per_feat;
const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0);
hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
: row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0),
cut{std::forward<common::HistogramCuts>(cuts)},
max_num_bins(max_bin_per_feat),
max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA)
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
}
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads) {
common::HistogramCuts cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads)
: cut{std::move(cuts)},
max_numeric_bins_per_feat{max_bins_per_feat},
base_rowid{batch.base_rowid},
isDense_{isDense} {
CHECK_GE(n_threads, 1);
base_rowid = batch.base_rowid;
isDense_ = isDense;
cut = cuts;
max_num_bins = max_bins_per_feat;
CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) {
// compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize);

View File

@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
EllpackPage const& in_page, BatchParam const& p)
: max_num_bins{p.max_bin} {
: max_numeric_bins_per_feat{p.max_bin} {
auto page = in_page.Impl();
isDense_ = page->is_dense;

View File

@@ -133,11 +133,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */
common::HistogramCuts cut;
/*! \brief max_bin for each feature. */
bst_bin_t max_num_bins;
/** \brief max_bin for each feature. */
bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix();
/**
* \brief Constrcutor for SimpleDMatrix.
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory.
*/
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.

View File

@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
if (!fi->Read(&page->hit_count)) {
return false;
}
if (!fi->Read(&page->max_num_bins)) {
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
return false;
}
if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t);
// max_bins, base row, is_dense
fo->Write(page.max_num_bins);
bytes += sizeof(page.max_num_bins);
fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense());

View File

@@ -58,6 +58,13 @@ void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, Bat
}
};
auto ellpack = [&]() {
// workaround ellpack being initialized from CPU.
if (p.gpu_id == Context::kCpuId) {
p.gpu_id = ref_->Ctx()->gpu_id;
}
if (p.gpu_id == Context::kCpuId) {
p.gpu_id = 0;
}
for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
GetCutsFromEllpack(page, p_cuts);
break;
@@ -103,6 +110,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
size_t n_threads = ctx_.Threads();
size_t n_features = column_sizes.size();
linalg::Tensor<size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
column_sizes_tloc.Data()->Fill(0);
auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
auto const& line = value.GetLine(i);
@@ -172,9 +180,9 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
size_t i = 0;
while (iter.Next()) {
if (!p_sketch) {
p_sketch.reset(new common::HostSketchContainer{batch_param_.max_bin,
proxy->Info().feature_types.ConstHostSpan(),
column_sizes, false, ctx_.Threads()});
p_sketch.reset(new common::HostSketchContainer{
batch_param_.max_bin, proxy->Info().feature_types.ConstHostSpan(), column_sizes,
!proxy->Info().group_ptr_.empty(), ctx_.Threads()});
}
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -42,6 +42,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
out->Info() = this->Info().Slice(ridxs);
out->Info().num_nonzero_ = h_offset.back();
}
out->ctx_ = this->ctx_;
return out;
}

View File

@@ -28,6 +28,7 @@
#include "xgboost/logging.h"
#include "xgboost/objective.h"
#include "xgboost/predictor.h"
#include "xgboost/string_view.h"
#include "xgboost/tree_updater.h"
namespace xgboost {
@@ -395,23 +396,36 @@ void GBTree::LoadConfig(Json const& in) {
tparam_.process_type = TreeProcessType::kDefault;
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
LOG(WARNING)
<< "Loading from a raw memory buffer on CPU only machine. "
"Changing predictor to auto.";
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
"Changing predictor to auto.";
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
}
auto msg = StringView{
R"(
Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
machine. Consider using `save_model/load_model` instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
for more details about differences between saving model and serializing.)"};
if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) {
tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}});
LOG(WARNING)
<< "Loading from a raw memory buffer on CPU only machine. "
"Changing tree_method to hist.";
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
}
auto const& j_updaters = get<Object const>(in["updater"]);
updaters_.clear();
for (auto const& kv : j_updaters) {
std::unique_ptr<TreeUpdater> up(
TreeUpdater::Create(kv.first, ctx_, model_.learner_model_param->task));
auto name = kv.first;
if (n_gpus == 0 && name == "grow_gpu_hist") {
name = "grow_quantile_histmaker";
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
}
std::unique_ptr<TreeUpdater> up{
TreeUpdater::Create(name, ctx_, model_.learner_model_param->task)};
up->LoadConfig(kv.second);
updaters_.push_back(std::move(up));
}

View File

@@ -18,9 +18,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
if (has_categorical && common::IsCat(cats.split_type, nid)) {
auto node_categories =
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
return common::Decision<true>(node_categories, fvalue, node.DefaultLeft())
? node.LeftChild()
: node.RightChild();
return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
} else {
return node.LeftChild() + !(fvalue < node.SplitCond());
}

View File

@@ -248,8 +248,10 @@ class EvaluateSplitAgent {
template <int kBlockSize>
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
const EvaluateSplitSharedInputs shared_inputs,
common::Span<bst_feature_t> sorted_idx,
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) {
// Aligned && shared storage for best_split
@@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
__syncthreads();
// Allocate blocks to one feature of one node
const auto input_idx = blockIdx.x / number_active_features;
const auto input_idx = blockIdx.x / max_active_features;
const EvaluateSplitInputs &inputs = d_inputs[input_idx];
// One block for each feature. Features are sampled, so fidx != blockIdx.x
int fidx = inputs.feature_set[blockIdx.x % number_active_features];
// Some blocks may not have any feature to work on, simply return
int feature_offset = blockIdx.x % max_active_features;
if (feature_offset >= inputs.feature_set.size()) {
return;
}
int fidx = inputs.feature_set[feature_offset];
using AgentT = EvaluateSplitAgent<kBlockSize>;
__shared__ typename AgentT::TempStorage temp_storage;
@@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
}
void GPUHistEvaluator::LaunchEvaluateSplits(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) {
@@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
this->SortHistogram(d_inputs, shared_inputs, evaluator);
}
size_t combined_num_features = number_active_features * d_inputs.size();
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
size_t combined_num_features = max_active_features * d_inputs.size();
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
combined_num_features, DeviceSplitCandidate());
// One block for each feature
uint32_t constexpr kBlockThreads = 32;
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
0}(
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
shared_inputs,
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
evaluator, dh::ToSpan(feature_best_splits));
// Reduce to get best candidate for left and right child over all features
auto reduce_offset = dh::MakeTransformIterator<size_t>(
thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
auto reduce_offset =
dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) -> size_t {
return idx * max_active_features;
});
size_t temp_storage_bytes = 0;
auto num_segments = out_splits.size();
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
@@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
}
void GPUHistEvaluator::EvaluateSplits(
const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
auto out_splits = dh::ToSpan(splits_out_storage);
this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
out_splits);
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
evaluator, out_splits);
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;

View File

@@ -170,13 +170,18 @@ class GPUHistEvaluator {
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
// impl of evaluate splits, contains CUDA kernels so it's public
void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
void LaunchEvaluateSplits(
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
/**
* \brief Evaluate splits for left and right nodes.
*/
void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_splits);
/**

View File

@@ -188,7 +188,8 @@ struct GPUHistMakerDevice {
common::Span<GradientPair> gpair;
dh::device_vector<int> monotone_constraints;
dh::device_vector<float> update_predictions;
// node idx for each sample
dh::device_vector<bst_node_t> positions;
TrainParam param;
@@ -305,6 +306,8 @@ struct GPUHistMakerDevice {
matrix.is_dense
};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
// Store the feature set ptrs so they dont go out of scope before the kernel is called
std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_sets;
for (size_t i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
@@ -313,29 +316,34 @@ struct GPUHistMakerDevice {
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(left_sampled_features);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
feature_sets.emplace_back(right_sampled_features);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
left_feature_set, hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
right_feature_set, hist.GetNodeHistogram(right_nidx)};
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
right_nidx);
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
candidate.split.left_sum, left_feature_set,
hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
candidate.split.right_sum, right_feature_set,
hist.GetNodeHistogram(right_nidx)};
}
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
bst_feature_t max_active_features = 0;
for (auto input : h_node_inputs) {
CHECK_EQ(input.feature_set.size(), number_active_features)
<< "Current implementation assumes that the number of active features "
"(after sampling) in any node is the same";
max_active_features = std::max(max_active_features,
bst_feature_t(input.feature_set.size()));
}
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs),
cudaMemcpyDefault));
dh::safe_cuda(cudaMemcpyAsync(
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
this->evaluator_.EvaluateSplits(nidx, max_active_features,
dh::ToSpan(d_node_inputs), shared_inputs,
dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
@@ -403,8 +411,7 @@ struct GPUHistMakerDevice {
go_left = data.split_node.DefaultLeft();
} else {
if (data.split_type == FeatureType::kCategorical) {
go_left = common::Decision<false>(data.node_cats.Bits(), cut_value,
data.split_node.DefaultLeft());
go_left = common::Decision(data.node_cats.Bits(), cut_value);
} else {
go_left = cut_value <= data.split_node.SplitCond();
}
@@ -424,7 +431,7 @@ struct GPUHistMakerDevice {
LOG(FATAL) << "Current objective function can not be used with external memory.";
}
p_out_position->Resize(0);
update_predictions.clear();
positions.clear();
return;
}
@@ -459,8 +466,6 @@ struct GPUHistMakerDevice {
HostDeviceVector<bst_node_t>* p_out_position) {
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto d_gpair = this->gpair;
update_predictions.resize(row_partitioner->GetRows().size());
auto d_update_predictions = dh::ToSpan(update_predictions);
p_out_position->SetDevice(ctx_->gpu_id);
p_out_position->Resize(row_partitioner->GetRows().size());
@@ -481,7 +486,7 @@ struct GPUHistMakerDevice {
if (common::IsCat(d_feature_types, position)) {
auto node_cats = categories.subspan(categories_segments[position].beg,
categories_segments[position].size);
go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
go_left = common::Decision(node_cats, element);
} else {
go_left = element <= node.SplitCond();
}
@@ -495,32 +500,45 @@ struct GPUHistMakerDevice {
node = d_nodes[position];
}
d_update_predictions[row_id] = node.LeafValue();
return position;
}; // NOLINT
auto d_out_position = p_out_position->DeviceSpan();
row_partitioner->FinalisePosition(d_out_position, new_position_op);
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice));
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx];
d_update_predictions[idx] = d_nodes[position].LeafValue();
bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
d_out_position[idx] = is_row_sampled ? ~position : position;
});
}
bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
if (update_predictions.empty()) {
if (positions.empty()) {
return false;
}
CHECK(p_tree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
auto d_update_predictions = dh::ToSpan(update_predictions);
CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
out_preds_d(idx) += d_update_predictions[idx];
auto d_position = dh::ToSpan(positions);
CHECK_EQ(out_preds_d.Size(), d_position.size());
auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
auto d_nodes = dh::ToSpan(nodes);
dh::LaunchN(d_position.size(), [=] XGBOOST_DEVICE(std::size_t idx) mutable {
bst_node_t nidx = d_position[idx];
auto weight = d_nodes[nidx].LeafValue();
out_preds_d(idx) += weight;
});
return true;
}
@@ -863,6 +881,7 @@ class GPUHistMaker : public TreeUpdater {
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
char const* Name() const override { return "grow_gpu_hist"; }
bool HasNodePosition() const override { return true; }
private:
bool initialised_{false};

View File

@@ -23,10 +23,15 @@ case "${container}" in
gpu|rmm)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
if [[ $container == "rmm" ]]
then
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
fi
;;
gpu_build_centos7|jvm_gpu_build)
BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION"
BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION"
;;
*)

View File

@@ -15,7 +15,8 @@ fi
command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION"
`"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \

View File

@@ -16,7 +16,8 @@ else
fi
command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"
`"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "`
`"NCCL_VERSION_ARG=$NCCL_VERSION"
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh

View File

@@ -14,5 +14,7 @@ else
fi
tests/ci_build/ci_build.sh jvm_gpu_build nvidia-docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} tests/ci_build/build_jvm_packages.sh \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
--build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
tests/ci_build/build_jvm_packages.sh \
${SPARK_VERSION} -Duse.cuda=ON ${arch_flag}

View File

@@ -12,10 +12,10 @@ if ( $is_release_branch -eq 0 ) {
}
mkdir build
cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_UNITY_BUILD=ON ${arch_flag}
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON `
-DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag}
$msbuild = -join @(
"C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\Community\\MSBuild\\15.0"
"C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current"
"\\Bin\\MSBuild.exe"
)
& $msbuild xgboost.sln /m /p:Configuration=Release /nodeReuse:false

View File

@@ -22,9 +22,10 @@ function set_buildkite_env_vars_in_container {
set -x
CUDA_VERSION=11.0.3
RAPIDS_VERSION=22.10
SPARK_VERSION=3.0.1
CUDA_VERSION=11.8.0
NCCL_VERSION=2.16.5-1
RAPIDS_VERSION=23.02
SPARK_VERSION=3.1.1
JDK_VERSION=8
if [[ -z ${BUILDKITE:-} ]]

View File

@@ -9,5 +9,6 @@ then
echo "--- Deploy JVM packages to xgboost-maven-repo S3 repo"
tests/ci_build/ci_build.sh jvm_gpu_build docker \
--build-arg CUDA_VERSION_ARG=${CUDA_VERSION} \
--build-arg NCCL_VERSION_ARG=${NCCL_VERSION} \
tests/ci_build/deploy_jvm_packages.sh ${SPARK_VERSION}
fi

View File

@@ -2,12 +2,16 @@ import argparse
import copy
import os
import re
import sys
import boto3
import botocore
from metadata import AMI_ID, COMMON_STACK_PARAMS, STACK_PARAMS
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from common_blocks.utils import create_or_update_stack, wait
TEMPLATE_URL = "https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@@ -68,72 +72,7 @@ def get_full_stack_id(stack_id):
return f"buildkite-{stack_id}-autoscaling-group"
def stack_exists(args, *, stack_name):
client = boto3.client("cloudformation", region_name=args.aws_region)
waiter = client.get_waiter("stack_exists")
try:
waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
return True
except botocore.exceptions.WaiterError as e:
return False
def create_or_update_stack(
args, *, stack_name, template_url=None, template_body=None, params=None
):
kwargs = {
"StackName": stack_name,
"Capabilities": [
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
}
if template_url:
kwargs["TemplateURL"] = template_url
if template_body:
kwargs["TemplateBody"] = template_body
if params:
kwargs["Parameters"] = params
client = boto3.client("cloudformation", region_name=args.aws_region)
if stack_exists(args, stack_name=stack_name):
print(f"Stack {stack_name} already exists. Updating...")
try:
response = client.update_stack(**kwargs)
return {"StackName": stack_name, "Action": "update"}
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "ValidationError" and re.search(
"No updates are to be performed", e.response["Error"]["Message"]
):
print(f"No update was made to {stack_name}")
return {"StackName": stack_name, "Action": "noop"}
else:
raise e
else:
kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
response = client.create_stack(**kwargs)
return {"StackName": stack_name, "Action": "create"}
def wait(promise):
client = boto3.client("cloudformation", region_name=args.aws_region)
stack_name = promise["StackName"]
print(f"Waiting for {stack_name}...")
if promise["Action"] == "create":
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_name)
print(f"Finished creating stack {stack_name}")
elif promise["Action"] == "update":
waiter = client.get_waiter("stack_update_complete")
waiter.wait(StackName=stack_name)
print(f"Finished updating stack {stack_name}")
elif promise["Action"] != "noop":
raise ValueError(f"Invalid promise {promise}")
def create_agent_iam_policy(args):
def create_agent_iam_policy(args, *, client):
policy_stack_name = "buildkite-agent-iam-policy"
print(f"Creating stack {policy_stack_name} for agent IAM policy...")
with open(
@@ -142,9 +81,9 @@ def create_agent_iam_policy(args):
) as f:
policy_template = f.read()
promise = create_or_update_stack(
args, stack_name=policy_stack_name, template_body=policy_template
args, client=client, stack_name=policy_stack_name, template_body=policy_template
)
wait(promise)
wait(promise, client=client)
cf = boto3.resource("cloudformation", region_name=args.aws_region)
policy = cf.StackResource(policy_stack_name, "BuildkiteAgentManagedPolicy")
@@ -152,10 +91,10 @@ def create_agent_iam_policy(args):
def main(args):
agent_iam_policy = create_agent_iam_policy(args)
client = boto3.client("cloudformation", region_name=args.aws_region)
agent_iam_policy = create_agent_iam_policy(args, client=client)
promises = []
for stack_id in AMI_ID:
@@ -167,13 +106,17 @@ def main(args):
)
promise = create_or_update_stack(
args, stack_name=stack_id_full, template_url=TEMPLATE_URL, params=params
args,
client=client,
stack_name=stack_id_full,
template_url=TEMPLATE_URL,
params=params,
)
promises.append(promise)
print(f"CI stack {stack_id_full} is in progress in the background")
for promise in promises:
wait(promise)
wait(promise, client=client)
if __name__ == "__main__":

View File

@@ -1,27 +1,27 @@
AMI_ID = {
# Managed by XGBoost team
"linux-amd64-gpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
"us-west-2": "ami-094271bed4788ddb5",
},
"linux-amd64-mgpu": {
"us-west-2": "ami-00ed92bd37f77bc33",
"us-west-2": "ami-094271bed4788ddb5",
},
"windows-gpu": {
"us-west-2": "ami-0a1a2ea551a07ad5f",
"us-west-2": "ami-0839681594a1d7627",
},
"windows-cpu": {
"us-west-2": "ami-0a1a2ea551a07ad5f",
"us-west-2": "ami-0839681594a1d7627",
},
# Managed by BuildKite
# from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml
"linux-amd64-cpu": {
"us-west-2": "ami-075d4c25d5f0c17c1",
"us-west-2": "ami-00f2127550cf03658",
},
"pipeline-loader": {
"us-west-2": "ami-075d4c25d5f0c17c1",
"us-west-2": "ami-00f2127550cf03658",
},
"linux-arm64-cpu": {
"us-west-2": "ami-0952c6fb6db9a9891",
"us-west-2": "ami-0c5789068f4a2d1b5",
},
}

View File

@@ -0,0 +1,97 @@
import re
import boto3
import botocore
def stack_exists(args, *, stack_name):
client = boto3.client("cloudformation", region_name=args.aws_region)
waiter = client.get_waiter("stack_exists")
try:
waiter.wait(StackName=stack_name, WaiterConfig={"MaxAttempts": 1})
return True
except botocore.exceptions.WaiterError as e:
return False
def create_or_update_stack(
args, *, client, stack_name, template_url=None, template_body=None, params=None
):
kwargs = {
"StackName": stack_name,
"Capabilities": [
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
}
if template_url:
kwargs["TemplateURL"] = template_url
if template_body:
kwargs["TemplateBody"] = template_body
if params:
kwargs["Parameters"] = params
if stack_exists(args, stack_name=stack_name):
print(f"Stack {stack_name} already exists. Updating...")
try:
response = client.update_stack(**kwargs)
return {"StackName": stack_name, "Action": "update"}
except botocore.exceptions.ClientError as e:
if e.response["Error"]["Code"] == "ValidationError" and re.search(
"No updates are to be performed", e.response["Error"]["Message"]
):
print(f"No update was made to {stack_name}")
return {"StackName": stack_name, "Action": "noop"}
else:
raise e
else:
kwargs.update({"OnFailure": "ROLLBACK", "EnableTerminationProtection": False})
response = client.create_stack(**kwargs)
return {"StackName": stack_name, "Action": "create"}
def replace_stack(
args, *, client, stack_name, template_url=None, template_body=None, params=None
):
"""Delete an existing stack and create a new stack with identical name"""
if not stack_exists(args, stack_name=stack_name):
raise ValueError(f"Stack {stack_name} does not exist")
r = client.delete_stack(StackName=stack_name)
delete_waiter = client.get_waiter("stack_delete_complete")
delete_waiter.wait(StackName=stack_name)
kwargs = {
"StackName": stack_name,
"Capabilities": [
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
"OnFailure": "ROLLBACK",
"EnableTerminationProtection": False,
}
if template_url:
kwargs["TemplateURL"] = template_url
if template_body:
kwargs["TemplateBody"] = template_body
if params:
kwargs["Parameters"] = params
response = client.create_stack(**kwargs)
return {"StackName": stack_name, "Action": "create"}
def wait(promise, *, client):
stack_name = promise["StackName"]
print(f"Waiting for {stack_name}...")
if promise["Action"] == "create":
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_name)
print(f"Finished creating stack {stack_name}")
elif promise["Action"] == "update":
waiter = client.get_waiter("stack_update_complete")
waiter.wait(StackName=stack_name)
print(f"Finished updating stack {stack_name}")
elif promise["Action"] != "noop":
raise ValueError(f"Invalid promise {promise}")

View File

@@ -2,6 +2,7 @@ import argparse
import copy
import json
import os
import sys
from urllib.request import urlopen
import boto3
@@ -9,6 +10,9 @@ import cfn_flip
from metadata import IMAGE_PARAMS
current_dir = os.path.dirname(__file__)
sys.path.append(os.path.join(current_dir, ".."))
from common_blocks.utils import replace_stack, wait
BUILDKITE_CF_TEMPLATE_URL = (
"https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml"
@@ -47,6 +51,9 @@ def main(args):
ami_mapping = get_ami_mapping()
client = boto3.client("cloudformation", region_name=args.aws_region)
promises = []
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
print(f"Creating EC2 image builder stack {stack_id_full}...")
@@ -55,28 +62,20 @@ def main(args):
stack_id=stack_id, aws_region=args.aws_region, ami_mapping=ami_mapping
)
client = boto3.client("cloudformation", region_name=args.aws_region)
response = client.create_stack(
StackName=stack_id_full,
TemplateBody=ec2_image_pipeline_template,
Capabilities=[
"CAPABILITY_IAM",
"CAPABILITY_NAMED_IAM",
"CAPABILITY_AUTO_EXPAND",
],
OnFailure="ROLLBACK",
EnableTerminationProtection=False,
Parameters=params,
promise = replace_stack(
args,
client=client,
stack_name=stack_id_full,
template_body=ec2_image_pipeline_template,
params=params,
)
promises.append(promise)
print(
f"EC2 image builder stack {stack_id_full} is in progress in the background"
)
for stack_id in IMAGE_PARAMS:
stack_id_full = get_full_stack_id(stack_id)
waiter = client.get_waiter("stack_create_complete")
waiter.wait(StackName=stack_id_full)
print(f"EC2 image builder stack {stack_id_full} is now finished.")
for promise in promises:
wait(promise, client=client)
if __name__ == "__main__":

View File

@@ -58,7 +58,7 @@ Resources:
BootstrapComponent:
Type: AWS::ImageBuilder::Component
Properties:
Name: !Sub "${AWS::StackName}-bootstrap-component"
Name: !Join ["-", [!Ref AWS::StackName, "bootstrap-component", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Platform: !Ref InstanceOperatingSystem
Version: "1.0.0"
Description: Execute a bootstrap script.
@@ -67,7 +67,7 @@ Resources:
Recipe:
Type: AWS::ImageBuilder::ImageRecipe
Properties:
Name: !Sub "${AWS::StackName}-image"
Name: !Join ["-", [!Ref AWS::StackName, "image", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Components:
- ComponentArn: !Ref BootstrapComponent
ParentImage: !Ref BaseImageId
@@ -83,7 +83,7 @@ Resources:
Infrastructure:
Type: AWS::ImageBuilder::InfrastructureConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-infrastructure"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-infrastructure", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
InstanceProfileName: !Ref InstanceProfile
InstanceTypes:
- !Ref InstanceType
@@ -93,7 +93,7 @@ Resources:
Distribution:
Type: AWS::ImageBuilder::DistributionConfiguration
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline-distribution-config"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline-distribution-config", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
Distributions:
- Region: !Ref AWS::Region
AmiDistributionConfiguration: {}
@@ -102,7 +102,7 @@ Resources:
Pipeline:
Type: AWS::ImageBuilder::ImagePipeline
Properties:
Name: !Sub "${AWS::StackName}-image-pipeline"
Name: !Join ["-", [!Ref AWS::StackName, "image-pipeline", !Select [2, !Split ['/', !Ref AWS::StackId]]]]
DistributionConfigurationArn: !Ref Distribution
ImageRecipeArn: !Ref Recipe
InfrastructureConfigurationArn: !Ref Infrastructure

View File

@@ -13,6 +13,6 @@ IMAGE_PARAMS = {
"BootstrapScript": "windows-gpu-bootstrap.yml",
"InstanceType": "g4dn.2xlarge",
"InstanceOperatingSystem": "Windows",
"VolumeSize": "80", # in GiBs
"VolumeSize": "120", # in GiBs
},
}

View File

@@ -15,9 +15,9 @@ phases:
choco --version
choco feature enable -n=allowGlobalConfirmation
# CMake 3.18
Write-Host '>>> Installing CMake 3.18...'
choco install cmake --version 3.18.0 --installargs "ADD_CMAKE_TO_PATH=System"
# CMake 3.25
Write-Host '>>> Installing CMake 3.25...'
choco install cmake --version 3.25.2 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Notepad++
@@ -45,18 +45,18 @@ phases:
choco install graphviz
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Visual Studio Community 2017 (15.9)
Write-Host '>>> Installing Visual Studio 2017 Community (15.9)...'
choco install visualstudio2017community --version 15.9.23.0 `
# Install Visual Studio 2022 Community
Write-Host '>>> Installing Visual Studio 2022 Community...'
choco install visualstudio2022community `
--params "--wait --passive --norestart"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install visualstudio2017-workload-nativedesktop --params `
choco install visualstudio2022-workload-nativedesktop --params `
"--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install CUDA 11.0
Write-Host '>>> Installing CUDA 11.0...'
choco install cuda --version 11.0.3
# Install CUDA 11.8
Write-Host '>>> Installing CUDA 11.8...'
choco install cuda --version=11.8.0.52206
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install Python packages

View File

@@ -22,11 +22,11 @@ steps:
queue: linux-amd64-cpu
- wait
#### -------- BUILD --------
- label: ":console: Run clang-tidy"
command: "tests/buildkite/run-clang-tidy.sh"
key: run-clang-tidy
agents:
queue: linux-amd64-cpu
# - label: ":console: Run clang-tidy"
# command: "tests/buildkite/run-clang-tidy.sh"
# key: run-clang-tidy
# agents:
# queue: linux-amd64-cpu
- wait
- label: ":console: Build CPU"
command: "tests/buildkite/build-cpu.sh"

View File

@@ -4,7 +4,7 @@ set -euo pipefail
source tests/buildkite/conftest.sh
echo "--- Run Google Tests with CUDA, using 4 GPUs"
echo "--- Run Google Tests with CUDA, using a GPU"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu nvidia-docker \
@@ -12,11 +12,13 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
build/testxgboost
echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled"
rm -rfv build/
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
chmod +x build/testxgboost
tests/ci_build/ci_build.sh rmm nvidia-docker \
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
"source activate gpu_test && build/testxgboost --use-rmm-pool"
# Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved
# echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
# rm -rfv build/
# buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
# chmod +x build/testxgboost
# tests/ci_build/ci_build.sh rmm nvidia-docker \
# --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
# --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
# --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \
# "source activate gpu_test && build/testxgboost --use-rmm-pool"

View File

@@ -15,8 +15,8 @@ RUN \
add-apt-repository -u 'deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main' && \
apt-get update && \
apt-get install -y llvm-11 clang-tidy-11 clang-11 && \
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
# Set default clang-tidy version
RUN \

View File

@@ -12,8 +12,8 @@ RUN \
apt-get update && \
apt-get install -y tar unzip wget git build-essential doxygen graphviz llvm libasan2 libidn11 ninja-build gcc-8 g++-8 openjdk-8-jdk-headless && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Python
wget -nv https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh && \
bash Mambaforge-Linux-x86_64.sh -b -p /opt/python

View File

@@ -22,10 +22,10 @@ ENV PATH=/opt/python/bin:$PATH
RUN \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
dask dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
pyspark cloudpickle cuda-python=11.7.0 && \
pyspark cloudpickle cuda-python && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector

View File

@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
# Install all basic requirements
RUN \
@@ -21,7 +22,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
wget -nv -nc https://developer.download.nvidia.com/compute/machine-learning/repos/rhel7/x86_64/nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
rpm -i nvidia-machine-learning-repo-rhel7-1.0.0-1.x86_64.rpm && \
yum -y update && \

View File

@@ -36,8 +36,8 @@ RUN \
bash Miniconda3.sh -b -p /opt/python && \
/opt/python/bin/python -m pip install auditwheel awscli && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr
ENV GOSU_VERSION 1.10

View File

@@ -12,8 +12,8 @@ RUN \
wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \

View File

@@ -1,6 +1,7 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7
ARG CUDA_VERSION_ARG
ARG NCCL_VERSION_ARG
# Install all basic requirements
RUN \
@@ -14,8 +15,8 @@ RUN \
wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
# CMake
wget -nv -nc https://cmake.org/files/v3.14/cmake-3.14.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.14.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
@@ -24,7 +25,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
yum-config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
yum -y update && \
yum install -y libnccl-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-devel-${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-static-${NCCL_VERSION}+cuda${CUDA_SHORT}

View File

@@ -1,7 +1,8 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu18.04
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04
ARG CUDA_VERSION_ARG
ARG RAPIDS_VERSION_ARG
ARG NCCL_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
@@ -19,7 +20,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \
export NCCL_VERSION=2.13.4-1 && \
export NCCL_VERSION=$NCCL_VERSION_ARG && \
apt-get update && \
apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT}
@@ -29,7 +30,7 @@ ENV PATH=/opt/python/bin:$PATH
RUN \
conda install -c conda-forge mamba && \
mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
python=3.9 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \
mamba clean --all
ENV GOSU_VERSION 1.10

View File

@@ -15,7 +15,7 @@ mv xgboost/ xgboost_rpack/
mkdir build
cd build
cmake .. -G"Visual Studio 15 2017 Win64" -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME="c:\\Program Files\\R\\R-3.6.3"
cmake --build . --config Release --parallel
cd ..

View File

@@ -36,7 +36,8 @@ dependencies:
- cloudpickle
- shap
- modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
- pyspark>=3.3.1
- pip:
- datatable
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
- https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz

View File

@@ -1,19 +1,102 @@
import sys
import re
import zipfile
import argparse
import base64
import glob
import hashlib
import os
import pathlib
import re
import shutil
import tempfile
if len(sys.argv) != 2:
print('Usage: {} [wheel]'.format(sys.argv[0]))
sys.exit(1)
VCOMP140_PATH = "C:\\Windows\\System32\\vcomp140.dll"
vcomp140_path = 'C:\\Windows\\System32\\vcomp140.dll'
for wheel_path in sorted(glob.glob(sys.argv[1])):
m = re.search(r'xgboost-(.*)-py3', wheel_path)
assert m, f'wheel_path = {wheel_path}'
version = m.group(1)
def get_sha256sum(path):
return (
base64.urlsafe_b64encode(hashlib.sha256(open(path, "rb").read()).digest())
.decode("latin1")
.rstrip("=")
)
print(f"Inserting vcomp140.dll into {wheel_path}...")
with zipfile.ZipFile(wheel_path, 'a') as f:
f.write(vcomp140_path, 'xgboost-{}.data/data/xgboost/vcomp140.dll'.format(version))
def update_record(*, wheel_content_dir, xgboost_version):
vcomp140_size = os.path.getsize(VCOMP140_PATH)
vcomp140_hash = get_sha256sum(VCOMP140_PATH)
record_path = wheel_content_dir / pathlib.Path(
f"xgboost-{xgboost_version}.dist-info/RECORD"
)
with open(record_path, "r") as f:
record_content = f.read()
record_content += f"xgboost-{xgboost_version}.data/data/xgboost/vcomp140.dll,"
record_content += f"sha256={vcomp140_hash},{vcomp140_size}\n"
with open(record_path, "w") as f:
f.write(record_content)
def main(args):
candidates = list(sorted(glob.glob(args.wheel_path)))
for wheel_path in candidates:
print(f"Processing wheel {wheel_path}")
m = re.search(r"xgboost-(.*)\+.*-py3", wheel_path)
if not m:
raise ValueError(f"Wheel {wheel_path} has unexpected name")
version = m.group(1)
print(f" Detected version for {wheel_path}: {version}")
print(f" Inserting vcomp140.dll into {wheel_path}...")
with tempfile.TemporaryDirectory() as tempdir:
wheel_content_dir = pathlib.Path(tempdir) / "wheel_content"
print(f" Extract {wheel_path} into {wheel_content_dir}")
shutil.unpack_archive(
wheel_path, extract_dir=wheel_content_dir, format="zip"
)
data_dir = wheel_content_dir / pathlib.Path(
f"xgboost-{version}.data/data/xgboost"
)
data_dir.mkdir(parents=True, exist_ok=True)
print(f" Copy {VCOMP140_PATH} -> {data_dir}")
shutil.copy(VCOMP140_PATH, data_dir)
print(f" Update RECORD")
update_record(wheel_content_dir=wheel_content_dir, xgboost_version=version)
print(f" Content of {wheel_content_dir}:")
for e in sorted(wheel_content_dir.rglob("*")):
if e.is_file():
r = e.relative_to(wheel_content_dir)
print(f" {r}")
print(f" Create new wheel...")
new_wheel_tmp_path = pathlib.Path(tempdir) / "new_wheel"
shutil.make_archive(
str(new_wheel_tmp_path.resolve()),
format="zip",
root_dir=wheel_content_dir,
)
new_wheel_tmp_path = new_wheel_tmp_path.resolve().with_suffix(".zip")
new_wheel_tmp_path = new_wheel_tmp_path.rename(
new_wheel_tmp_path.with_suffix(".whl")
)
print(f" Created new wheel {new_wheel_tmp_path}")
# Rename the old wheel with suffix .bak
# The new wheel takes the name of the old wheel
wheel_path_obj = pathlib.Path(wheel_path).resolve()
backup_path = wheel_path_obj.with_suffix(".whl.bak")
print(f" Rename {wheel_path_obj} -> {backup_path}")
wheel_path_obj.replace(backup_path)
print(f" Rename {new_wheel_tmp_path} -> {wheel_path_obj}")
new_wheel_tmp_path.replace(wheel_path_obj)
shutil.rmtree(wheel_content_dir)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"wheel_path", type=str, help="Path to wheel (wildcard permitted)"
)
args = parser.parse_args()
main(args)

View File

@@ -1,11 +1,14 @@
/*!
* Copyright 2021 by XGBoost Contributors
* Copyright 2021-2022 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/json.h>
#include <xgboost/learner.h>
#include <limits>
#include "../../../src/common/categorical.h"
#include "../helpers.h"
namespace xgboost {
namespace common {
@@ -15,29 +18,76 @@ TEST(Categorical, Decision) {
ASSERT_TRUE(common::InvalidCat(a));
std::vector<uint32_t> cats(256, 0);
ASSERT_TRUE(Decision(cats, a, true));
ASSERT_TRUE(Decision(cats, a));
// larger than size
a = 256;
ASSERT_TRUE(Decision(cats, a, true));
ASSERT_TRUE(Decision(cats, a));
// negative
a = -1;
ASSERT_TRUE(Decision(cats, a, true));
ASSERT_TRUE(Decision(cats, a));
CatBitField bits{cats};
bits.Set(0);
a = -0.5;
ASSERT_TRUE(Decision(cats, a, true));
ASSERT_TRUE(Decision(cats, a));
// round toward 0
a = 0.5;
ASSERT_FALSE(Decision(cats, a, true));
ASSERT_FALSE(Decision(cats, a));
// valid
a = 13;
bits.Set(a);
ASSERT_FALSE(Decision(bits.Bits(), a, true));
ASSERT_FALSE(Decision(bits.Bits(), a));
}
/**
* Test for running inference with input category greater than the one stored in tree.
*/
TEST(Categorical, MinimalSet) {
std::size_t constexpr kRows = 256, kCols = 1, kCat = 3;
std::vector<FeatureType> types{FeatureType::kCategorical};
auto Xy =
RandomDataGenerator{kRows, kCols, 0.0}.Type(types).MaxCategory(kCat).GenerateDMatrix(true);
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
learner->SetParam("max_depth", "1");
learner->SetParam("tree_method", "hist");
learner->Configure();
learner->UpdateOneIter(0, Xy);
Json model{Object{}};
learner->SaveModel(&model);
auto tree = model["learner"]["gradient_booster"]["model"]["trees"][0];
ASSERT_GE(get<I32Array const>(tree["categories"]).size(), 1);
auto v = get<I32Array const>(tree["categories"])[0];
HostDeviceVector<float> predt;
{
std::vector<float> data{static_cast<float>(kCat),
static_cast<float>(kCat + 1), 32.0f, 33.0f, 34.0f};
auto test = GetDMatrixFromData(data, data.size(), kCols);
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
ASSERT_EQ(predt.Size(), data.size());
auto const& h_predt = predt.ConstHostSpan();
for (auto v : h_predt) {
ASSERT_EQ(v, 1); // left child of root node
}
}
{
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
learner->LoadModel(model);
std::vector<float> data = {static_cast<float>(v)};
auto test = GetDMatrixFromData(data, data.size(), kCols);
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
auto const& h_predt = predt.ConstHostSpan();
for (auto v : h_predt) {
ASSERT_EQ(v, 2); // right child of root node
}
}
}
} // namespace common
} // namespace xgboost

View File

@@ -1,10 +1,12 @@
/*!
* Copyright 2020-2021 by XGBoost Contributors
/**
* Copyright 2020-2023 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include "../helpers.h"
#include "../../../src/data/array_interface.h"
#include "dmlc/logging.h"
#include "xgboost/json.h"
namespace xgboost {
TEST(ArrayInterface, Initialize) {
@@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) {
column["mask"]["data"] = Null{};
common::Span<RBitField8::value_type> s_mask;
EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error);
get<Object>(column).erase("mask");
// misaligned.
j_data = {Json(Integer(reinterpret_cast<Integer::Int>(
reinterpret_cast<char const*>(storage.ConstHostPointer()) + 1))),
Json(Boolean(false))};
column["data"] = j_data;
EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error);
}
TEST(ArrayInterface, GetElement) {

View File

@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
}
}
TEST(GradientIndex, FromCategoricalLarge) {
size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
bst_bin_t max_bins = 8;
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
auto m = GetDMatrixFromData(x, kRows, 1);
Context ctx;
auto &h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p{max_bins, 0.8};
{
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
}
{
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
common::HistogramCuts cut = page.cut;
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
}
}
}
TEST(GradientIndex, PushBatch) {
size_t constexpr kRows = 64, kCols = 4;
bst_bin_t max_bins = 64;

View File

@@ -1,13 +1,19 @@
// Copyright by Contributors
/**
* Copyright 2016-2023 by XGBoost Contributors
*/
#include <xgboost/data.h>
#include <array>
#include <array> // std::array
#include <limits> // std::numeric_limits
#include <memory> // std::unique_ptr
#include "../../../src/data/adapter.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h"
#include "../../../src/data/adapter.h" // ArrayAdapter
#include "../../../src/data/simple_dmatrix.h" // SimpleDMatrix
#include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" // RandomDataGenerator,CreateSimpleTestData
#include "xgboost/base.h"
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/string_view.h" // StringView
using namespace xgboost; // NOLINT
@@ -298,6 +304,17 @@ TEST(SimpleDMatrix, Slice) {
ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
ASSERT_EQ(out->Info().num_row_, ridxs.size());
ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols); // dense
{
HostDeviceVector<float> data;
auto arr_str = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&data);
auto adapter = data::ArrayAdapter{StringView{arr_str}};
auto n_threads = 2;
std::unique_ptr<DMatrix> p_fmat{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), n_threads, "")};
std::unique_ptr<DMatrix> slice{p_fmat->Slice(ridxs)};
ASSERT_LE(slice->Ctx()->Threads(), n_threads);
}
}
TEST(SimpleDMatrix, SaveLoadBinary) {

View File

@@ -0,0 +1,24 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/task.h>
#include <xgboost/tree_updater.h>
namespace xgboost {
TEST(Updater, HasNodePosition) {
Context ctx;
ObjInfo task{ObjInfo::kRegression, true, true};
std::unique_ptr<TreeUpdater> up{TreeUpdater::Create("grow_histmaker", &ctx, task)};
ASSERT_TRUE(up->HasNodePosition());
up.reset(TreeUpdater::Create("grow_quantile_histmaker", &ctx, task));
ASSERT_TRUE(up->HasNodePosition());
#if defined(XGBOOST_USE_CUDA)
ctx.gpu_id = 0;
up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, task));
ASSERT_TRUE(up->HasNodePosition());
#endif // defined(XGBOOST_USE_CUDA)
}
} // namespace xgboost

View File

@@ -139,3 +139,17 @@ class TestDeviceQuantileDMatrix:
booster.predict(xgb.DMatrix(d_m.get_data())),
atol=1e-6,
)
def test_ltr(self) -> None:
import cupy as cp
X, y, qid, w = tm.make_ltr(100, 3, 3, 5)
# make sure GPU is used to run sketching.
cpX = cp.array(X)
Xy_qdm = xgb.QuantileDMatrix(cpX, y, qid=qid, weight=w)
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
xgb.train({"tree_method": "gpu_hist", "objective": "rank:ndcg"}, Xy)
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
assert tm.predictor_equal(from_qdm, from_dm)

View File

@@ -1,8 +1,14 @@
import numpy as np
import sys
import numpy as np
import pandas as pd
import xgboost as xgb
sys.path.append("tests/python")
# Don't import the test class, otherwise they will run twice.
import test_interaction_constraints as test_ic # noqa
rng = np.random.RandomState(1994)
@@ -10,7 +16,34 @@ class TestGPUInteractionConstraints:
cputest = test_ic.TestInteractionConstraints()
def test_interaction_constraints(self):
self.cputest.run_interaction_constraints(tree_method='gpu_hist')
self.cputest.run_interaction_constraints(tree_method="gpu_hist")
def test_training_accuracy(self):
self.cputest.training_accuracy(tree_method='gpu_hist')
self.cputest.training_accuracy(tree_method="gpu_hist")
# case where different number of features can occur in the evaluator
def test_issue_8730(self):
X = pd.DataFrame(
zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
columns=["A", "B", "C", "D"],
)
y = np.array([*([0] * 50), *([1] * 50)])
dm = xgb.DMatrix(X, label=y)
params = {
"eta": 0.16095019509249486,
"min_child_weight": 1,
"subsample": 0.688567929338029,
"colsample_bynode": 0.7,
"gamma": 5.666579817418348e-06,
"lambda": 0.14943712232059794,
"grow_policy": "depthwise",
"max_depth": 3,
"tree_method": "gpu_hist",
"interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
"objective": "count:poisson",
"eval_metric": "poisson-nloglik",
"verbosity": 0,
}
xgb.train(params, dm, num_boost_round=100)

View File

@@ -216,6 +216,7 @@ class TestGPUPredict:
def test_inplace_predict_cupy(self):
self.run_inplace_predict_cupy(0)
@pytest.mark.xfail
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_inplace_predict_cupy_specified_device(self):
@@ -338,13 +339,21 @@ class TestGPUPredict:
@given(predict_parameter_strategy, tm.dataset_strategy)
@settings(deadline=None, max_examples=20, print_blob=True)
def test_predict_leaf_gbtree(self, param, dataset):
# Unsupported for random forest
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param['booster'] = 'gbtree'
param['tree_method'] = 'gpu_hist'
self.run_predict_leaf_booster(param, 10, dataset)
@given(predict_parameter_strategy, tm.dataset_strategy)
@settings(deadline=None, max_examples=20, print_blob=True)
def test_predict_leaf_dart(self, param, dataset):
def test_predict_leaf_dart(self, param: dict, dataset: tm.TestDataset) -> None:
# Unsupported for random forest
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param['booster'] = 'dart'
param['tree_method'] = 'gpu_hist'
self.run_predict_leaf_booster(param, 10, dataset)

View File

@@ -215,3 +215,34 @@ class TestGPUUpdaters:
@pytest.mark.parametrize("weighted", [True, False])
def test_adaptive(self, weighted) -> None:
self.cputest.run_adaptive("gpu_hist", weighted)
@pytest.mark.skipif(**tm.no_pandas())
def test_issue8824(self):
# column sampling by node crashes because shared pointers go out of scope
import pandas as pd
data = pd.DataFrame(np.random.rand(1024, 8))
data.columns = "x" + data.columns.astype(str)
features = data.columns
data["y"] = data.sum(axis=1) < 4
dtrain = xgb.DMatrix(data[features], label=data["y"])
model = xgb.train(
dtrain=dtrain,
params={
"max_depth": 5,
"learning_rate": 0.05,
"objective": "binary:logistic",
"tree_method": "gpu_hist",
"colsample_bytree": 0.5,
"colsample_bylevel": 0.5,
"colsample_bynode": 0.5, # Causes issues
"reg_alpha": 0.05,
"reg_lambda": 0.005,
"seed": 66,
"subsample": 0.5,
"gamma": 0.2,
"predictor": "auto",
"eval_metric": "auc",
},
num_boost_round=150,
)

View File

@@ -326,7 +326,7 @@ class TestDMatrix:
nrow = 100
ncol = 1000
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
assert x.indices.max() < ncol - 1
assert x.indices.max() < ncol
x.data[:] = 1
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)

View File

@@ -9,7 +9,9 @@ from testing import (
make_batches,
make_batches_sparse,
make_categorical,
make_ltr,
make_sparse_regression,
predictor_equal,
)
import xgboost as xgb
@@ -218,6 +220,16 @@ class TestQuantileDMatrix:
b = booster.predict(qXy)
np.testing.assert_allclose(a, b)
def test_ltr(self) -> None:
X, y, qid, w = make_ltr(100, 3, 3, 5)
Xy_qdm = xgb.QuantileDMatrix(X, y, qid=qid, weight=w)
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
xgb.train({"tree_method": "hist", "objective": "rank:ndcg"}, Xy)
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
assert predictor_equal(from_qdm, from_dm)
# we don't test empty Quantile DMatrix in single node construction.
@given(
strategies.integers(1, 1000),

View File

@@ -41,6 +41,16 @@ logging.getLogger("py4j").setLevel(logging.INFO)
pytestmark = testing.timeout(60)
def no_sparse_unwrap():
try:
from pyspark.sql.functions import unwrap_udt
except ImportError:
return {"reason": "PySpark<3.4", "condition": True}
return {"reason": "PySpark<3.4", "condition": False}
class XgboostLocalTest(SparkTestCase):
def setUp(self):
logging.getLogger().setLevel("INFO")
@@ -985,6 +995,7 @@ class XgboostLocalTest(SparkTestCase):
model = classifier.fit(self.cls_df_train)
model.transform(self.cls_df_test).collect()
@pytest.mark.skipif(**no_sparse_unwrap())
def test_regressor_with_sparse_optim(self):
regressor = SparkXGBRegressor(missing=0.0)
model = regressor.fit(self.reg_df_sparse_train)
@@ -1001,6 +1012,7 @@ class XgboostLocalTest(SparkTestCase):
for row1, row2 in zip(pred_result, pred_result2):
self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))
@pytest.mark.skipif(**no_sparse_unwrap())
def test_classifier_with_sparse_optim(self):
cls = SparkXGBClassifier(missing=0.0)
model = cls.fit(self.cls_df_sparse_train)

View File

@@ -458,6 +458,22 @@ class TestTreeMethod:
config_0 = json.loads(booster_0.save_config())
np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
evals_result: Dict[str, Dict[str, list]] = {}
xgb.train(
{
"tree_method": tree_method,
"objective": "reg:absoluteerror",
"subsample": 0.8
},
Xy,
num_boost_round=10,
evals=[(Xy, "Train")],
evals_result=evals_result,
)
mae = evals_result["Train"]["mae"]
assert mae[-1] < 20.0
assert tm.non_increasing(mae)
@pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize(
"tree_method,weighted", [

View File

@@ -112,7 +112,6 @@ class TestPandas:
# test Index as columns
df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
print(df.columns, isinstance(df.columns, pd.Index))
Xy = xgb.DMatrix(df)
np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))

View File

@@ -4,7 +4,7 @@ import pytest
try:
import shap
except ImportError:
except Exception:
shap = None
pass

View File

@@ -2,6 +2,7 @@ import collections
import importlib.util
import json
import os
import pickle
import random
import tempfile
from typing import Callable, Optional
@@ -636,26 +637,74 @@ def test_sklearn_n_jobs():
def test_parameters_access():
from sklearn import datasets
params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1}
params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
clf = xgb.XGBClassifier(n_estimators=1000, **params)
assert clf.get_params()['updater'] == 'grow_gpu_hist'
assert clf.get_params()['subsample'] == .5
assert clf.get_params()['n_estimators'] == 1000
assert clf.get_params()["updater"] == "grow_gpu_hist"
assert clf.get_params()["subsample"] == 0.5
assert clf.get_params()["n_estimators"] == 1000
clf = xgb.XGBClassifier(n_estimators=1, nthread=4)
X, y = datasets.load_iris(return_X_y=True)
clf.fit(X, y)
config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 4
assert int(config["learner"]["generic_param"]["nthread"]) == 4
clf.set_params(nthread=16)
config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 16
assert int(config["learner"]["generic_param"]["nthread"]) == 16
clf.predict(X)
config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 16
assert int(config["learner"]["generic_param"]["nthread"]) == 16
clf = xgb.XGBClassifier(n_estimators=2)
assert clf.tree_method is None
assert clf.get_params()["tree_method"] is None
clf.fit(X, y)
assert clf.get_params()["tree_method"] is None
def save_load(clf: xgb.XGBClassifier) -> xgb.XGBClassifier:
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "model.json")
clf.save_model(path)
clf = xgb.XGBClassifier()
clf.load_model(path)
return clf
def get_tm(clf: xgb.XGBClassifier) -> str:
tm = json.loads(clf.get_booster().save_config())["learner"]["gradient_booster"][
"gbtree_train_param"
]["tree_method"]
return tm
assert get_tm(clf) == "exact"
clf = pickle.loads(pickle.dumps(clf))
assert clf.tree_method is None
assert clf.n_estimators == 2
assert clf.get_params()["tree_method"] is None
assert clf.get_params()["n_estimators"] == 2
assert get_tm(clf) == "exact" # preserved for pickle
clf = save_load(clf)
assert clf.tree_method is None
assert clf.n_estimators == 2
assert clf.get_params()["tree_method"] is None
assert clf.get_params()["n_estimators"] == 2
assert get_tm(clf) == "auto" # discarded for save/load_model
clf.set_params(tree_method="hist")
assert clf.get_params()["tree_method"] == "hist"
clf = pickle.loads(pickle.dumps(clf))
assert clf.get_params()["tree_method"] == "hist"
clf = save_load(clf)
# FIXME(jiamingy): We should remove this behavior once we remove parameters
# serialization for skl save/load_model.
assert clf.get_params()["tree_method"] == "hist"
def test_kwargs_error():
@@ -695,13 +744,19 @@ def test_sklearn_clone():
def test_sklearn_get_default_params():
from sklearn.datasets import load_digits
digits_2class = load_digits(n_class=2)
X = digits_2class['data']
y = digits_2class['target']
X = digits_2class["data"]
y = digits_2class["target"]
cls = xgb.XGBClassifier()
assert cls.get_params()['base_score'] is None
assert cls.get_params()["base_score"] is None
cls.fit(X[:4, ...], y[:4, ...])
assert cls.get_params()['base_score'] is not None
base_score = float(
json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
"base_score"
]
)
np.testing.assert_equal(base_score, 0.5)
def run_validation_weights(model):
@@ -1029,9 +1084,9 @@ def test_pandas_input():
clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
clf_isotonic.fit(train, target)
assert isinstance(
clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier
)
clf = clf_isotonic.calibrated_classifiers_[0]
est = clf.estimator if hasattr(clf, "estimator") else clf.base_estimator
assert isinstance(est, xgb.XGBClassifier)
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))

View File

@@ -466,7 +466,22 @@ def make_categorical(
return df, label
def _cat_sampled_from():
def make_ltr(
n_samples: int, n_features: int, n_query_groups: int, max_rel: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Make a dataset for testing LTR."""
rng = np.random.default_rng(1994)
X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
y = rng.integers(0, max_rel, size=n_samples)
qid = rng.integers(0, n_query_groups, size=n_samples)
w = rng.normal(0, 1.0, size=n_query_groups)
w -= np.min(w)
w /= np.max(w)
qid = np.sort(qid)
return X, y, qid, w
def _cat_sampled_from() -> strategies.SearchStrategy:
@strategies.composite
def _make_cat(draw):
n_samples = draw(strategies.integers(2, 512))
@@ -775,6 +790,19 @@ class DirectoryExcursion:
os.remove(f)
def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
"""Assert whether two DMatrices contain the same predictors."""
lcsr = lhs.get_data()
rcsr = rhs.get_data()
return all(
(
np.array_equal(lcsr.data, rcsr.data),
np.array_equal(lcsr.indices, rcsr.indices),
np.array_equal(lcsr.indptr, rcsr.indptr),
)
)
@contextmanager
def captured_output():
"""Reassign stdout temporarily in order to test printed statements