Compare commits

..

22 Commits

Author SHA1 Message Date
Jiaming Yuan
36ad160501 Bump version to 1.7.4. (#8805) 2023-02-16 06:40:01 +08:00
Jiaming Yuan
c22f6db4bf [backport] Fix CPU bin compression with categorical data. (#8809) (#8810)
* [backport] Fix CPU bin compression with categorical data. (#8809)

* Fix CPU bin compression with categorical data.

* The bug causes the maximum category to be less than 256 or the maximum number of bins when
the input data is dense.

* Avoid test symbol.
2023-02-16 06:39:25 +08:00
Jiaming Yuan
f15a6d2b19 [backport] Fix ranking with quantile dmatrix and group weight. (#8762) (#8800)
* [backport] Fix ranking with quantile dmatrix and group weight. (#8762)

* backport test utilities.
2023-02-15 02:45:09 +08:00
Jiaming Yuan
08a547f5c2 [backport] Fix feature types param (#8772) (#8801)
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
Co-authored-by: WeichenXu <weichen.xu@databricks.com>
2023-02-15 01:39:20 +08:00
Jiaming Yuan
60303db2ee [backport] Fix GPU L1 error. (#8749) (#8770)
* [backport] Fix GPU L1 error. (#8749)

* Fix backport.
2023-02-09 20:16:39 +08:00
Jiaming Yuan
df984f9c43 [backport] Fix different number of features in gpu_hist evaluator. (#8754) (#8769)
Co-authored-by: Rory Mitchell <r.a.mitchell.nz@gmail.com>
2023-02-09 18:31:49 +08:00
Jiaming Yuan
2f22f8d49b [backport] Make sure input numpy array is aligned. (#8690) (#8696) (#8734)
* [backport] Make sure input numpy array is aligned. (#8690)

- use `np.require` to specify that the alignment is required.
- scipy csr as well.
- validate input pointer in `ArrayInterface`.

* Workaround CUDA warning. (#8696)

* backport from half type support for alignment.

* fix import.
2023-02-06 16:58:15 +08:00
Jiaming Yuan
68d86336d7 [backport] [R] fix OpenMP detection on macOS (#8684) (#8732)
Co-authored-by: James Lamb <jaylamb20@gmail.com>
2023-01-29 12:43:10 +08:00
Jiaming Yuan
76bdca072a [R] Fix threads used to create DMatrix in predict. (#8681) (#8682) 2023-01-15 04:00:31 +08:00
Jiaming Yuan
021e6a842a [backport] [R] Get CXX flags from R CMD config. (#8669) (#8680) 2023-01-14 18:46:59 +08:00
Jiaming Yuan
e5bef4ffce [backport] Fix threads in DMatrix slice. (#8667) (#8679) 2023-01-14 18:46:04 +08:00
Jiaming Yuan
10bb0a74ef [backport] [CI] Skip pyspark sparse tests. (#8675) (#8678) 2023-01-14 06:40:17 +08:00
Jiaming Yuan
e803d06d8c [backport] [R] Remove unused assert definition. (#8526) (#8668) 2023-01-13 04:55:29 +08:00
Jiaming Yuan
ccf43d4ba0 Bump R package version to 1.7.3. (#8649) 2023-01-06 20:34:05 +08:00
Jiaming Yuan
dd58c2ac47 Bump version to 1.7.3. (#8646) 2023-01-06 17:55:51 +08:00
Jiaming Yuan
899e4c8988 [backport] Do not return internal value for get_params. (#8634) (#8642) 2023-01-06 02:28:39 +08:00
Jiaming Yuan
a2085bf223 [backport] Fix loading GPU pickle with a CPU-only xgboost distribution. (#8632) (#8641)
We can handle loading the pickle on a CPU-only machine if the XGBoost is built with CUDA
enabled (Linux and Windows PyPI package), but not if the distribution is CPU-only (macOS
PyPI package).
2023-01-06 02:28:21 +08:00
Jiaming Yuan
067b704e58 [backport] Fix inference with categorical feature. (#8591) (#8602) (#8638)
* Fix inference with categorical feature. (#8591)

* Fix windows build on buildkite. (#8602)

* workaround.
2023-01-06 01:17:49 +08:00
Jiaming Yuan
1a834b2b85 Fix linalg iterator. (#8603) (#8639) 2023-01-05 23:16:10 +08:00
Jiaming Yuan
162b48a1a4 [backport] [CI] Disable gtest with RMM (#8620) (#8640)
Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2023-01-05 23:13:45 +08:00
Jiaming Yuan
83a078b7e5 [backport] Fix sklearn test that calls a removed field (#8579) (#8636)
Co-authored-by: Rong Ou <rong.ou@gmail.com>
2023-01-05 21:17:05 +08:00
Jiaming Yuan
575fba651b [backport] [CI] Fix CI with updated dependencies. (#8631) (#8635) 2023-01-05 19:10:58 +08:00
58 changed files with 1609 additions and 1120 deletions

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR) cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
project(xgboost LANGUAGES CXX C VERSION 1.7.2) project(xgboost LANGUAGES CXX C VERSION 1.7.4)
include(cmake/Utils.cmake) include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0022 NEW)

View File

@@ -1,8 +1,8 @@
Package: xgboost Package: xgboost
Type: Package Type: Package
Title: Extreme Gradient Boosting Title: Extreme Gradient Boosting
Version: 1.7.2.1 Version: 1.7.4.1
Date: 2022-12-08 Date: 2023-02-15
Authors@R: c( Authors@R: c(
person("Tianqi", "Chen", role = c("aut"), person("Tianqi", "Chen", role = c("aut"),
email = "tianqi.tchen@gmail.com"), email = "tianqi.tchen@gmail.com"),
@@ -66,5 +66,5 @@ Imports:
methods, methods,
data.table (>= 1.9.6), data.table (>= 1.9.6),
jsonlite (>= 1.0), jsonlite (>= 1.0),
RoxygenNote: 7.2.1 RoxygenNote: 7.2.2
SystemRequirements: GNU make, C++14 SystemRequirements: GNU make, C++14

View File

@@ -328,8 +328,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) { reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
object <- xgb.Booster.complete(object, saveraw = FALSE) object <- xgb.Booster.complete(object, saveraw = FALSE)
if (!inherits(newdata, "xgb.DMatrix")) if (!inherits(newdata, "xgb.DMatrix"))
newdata <- xgb.DMatrix(newdata, missing = missing) newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
if (!is.null(object[["feature_names"]]) && if (!is.null(object[["feature_names"]]) &&
!is.null(colnames(newdata)) && !is.null(colnames(newdata)) &&
!identical(object[["feature_names"]], colnames(newdata))) !identical(object[["feature_names"]], colnames(newdata)))

1831
R-package/configure vendored

File diff suppressed because it is too large Load Diff

View File

@@ -2,10 +2,25 @@
AC_PREREQ(2.69) AC_PREREQ(2.69)
AC_INIT([xgboost],[1.7.2],[],[xgboost],[]) AC_INIT([xgboost],[1.7.4],[],[xgboost],[])
# Use this line to set CC variable to a C compiler : ${R_HOME=`R RHOME`}
AC_PROG_CC if test -z "${R_HOME}"; then
echo "could not determine R_HOME"
exit 1
fi
CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
CXX="${CXX14} ${CXX14STD}"
CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
CC=`"${R_HOME}/bin/R" CMD config CC`
CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
AC_LANG(C++)
### Check whether backtrace() is part of libc or the external lib libexecinfo ### Check whether backtrace() is part of libc or the external lib libexecinfo
AC_MSG_CHECKING([Backtrace lib]) AC_MSG_CHECKING([Backtrace lib])
@@ -40,7 +55,7 @@ then
ac_pkg_openmp=no ac_pkg_openmp=no
AC_MSG_CHECKING([whether OpenMP will work in a package]) AC_MSG_CHECKING([whether OpenMP will work in a package])
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])]) AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
${CC} -o conftest conftest.c ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes ${CXX} -o conftest conftest.cpp ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
AC_MSG_RESULT([${ac_pkg_openmp}]) AC_MSG_RESULT([${ac_pkg_openmp}])
if test "${ac_pkg_openmp}" = no; then if test "${ac_pkg_openmp}" = no; then
OPENMP_CXXFLAGS='' OPENMP_CXXFLAGS=''

View File

@@ -23,7 +23,6 @@ PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
OBJECTS= \ OBJECTS= \
./xgboost_R.o \ ./xgboost_R.o \
./xgboost_custom.o \ ./xgboost_custom.o \
./xgboost_assert.o \
./init.o \ ./init.o \
$(PKGROOT)/src/metric/metric.o \ $(PKGROOT)/src/metric/metric.o \
$(PKGROOT)/src/metric/elementwise_metric.o \ $(PKGROOT)/src/metric/elementwise_metric.o \

View File

@@ -23,7 +23,6 @@ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) -DDMLC_CMAKE_LITTLE_ENDIAN=1 $(SHLIB_PTHRE
OBJECTS= \ OBJECTS= \
./xgboost_R.o \ ./xgboost_R.o \
./xgboost_custom.o \ ./xgboost_custom.o \
./xgboost_assert.o \
./init.o \ ./init.o \
$(PKGROOT)/src/metric/metric.o \ $(PKGROOT)/src/metric/metric.o \
$(PKGROOT)/src/metric/elementwise_metric.o \ $(PKGROOT)/src/metric/elementwise_metric.o \

View File

@@ -1,26 +0,0 @@
// Copyright (c) 2014 by Contributors
#include <stdio.h>
#include <stdarg.h>
#include <Rinternals.h>
// implements error handling
void XGBoostAssert_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("AssertError:%s\n", buf);
}
}
void XGBoostCheck_R(int exp, const char *fmt, ...) {
char buf[1024];
if (exp == 0) {
va_list args;
va_start(args, fmt);
vsprintf(buf, fmt, args);
va_end(args);
error("%s\n", buf);
}
}

View File

@@ -138,11 +138,11 @@ Miscellaneous
By default, XGBoost assumes input categories are integers starting from 0 till the number By default, XGBoost assumes input categories are integers starting from 0 till the number
of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
values due to mistakes or missing values. It can be negative value, integer values that values due to mistakes or missing values in training dataset. It can be negative value,
can not be accurately represented by 32-bit floating point, or values that are larger than integer values that can not be accurately represented by 32-bit floating point, or values
actual number of unique categories. During training this is validated but for prediction that are larger than actual number of unique categories. During training this is
it's treated as the same as missing value for performance reasons. Lastly, missing values validated but for prediction it's treated as the same as not-chosen category for
are treated as the same as numerical features (using the learned split direction). performance reasons.
********** **********

View File

@@ -6,6 +6,6 @@
#define XGBOOST_VER_MAJOR 1 #define XGBOOST_VER_MAJOR 1
#define XGBOOST_VER_MINOR 7 #define XGBOOST_VER_MINOR 7
#define XGBOOST_VER_PATCH 2 #define XGBOOST_VER_PATCH 4
#endif // XGBOOST_VERSION_CONFIG_H_ #endif // XGBOOST_VERSION_CONFIG_H_

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<name>XGBoost JVM Package</name> <name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description> <description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j-example_2.12</artifactId> <artifactId>xgboost4j-example_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<build> <build>
<plugins> <plugins>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId> <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId> <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j-flink_2.12</artifactId> <artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
<build> <build>
<plugins> <plugins>
<plugin> <plugin>
@@ -26,7 +26,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.commons</groupId> <groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j-gpu_2.12</artifactId> <artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId> <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId> <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j-spark_2.12</artifactId> <artifactId>xgboost4j-spark_2.12</artifactId>
<build> <build>
@@ -24,7 +24,7 @@
<dependency> <dependency>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId> <artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>

View File

@@ -1,9 +1,9 @@
from sklearn.datasets import load_iris
import numpy as np import numpy as np
import pandas import pandas
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
y = y.astype(np.int) y = y.astype(np.int32)
df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width']) df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'} class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
df['class'] = np.vectorize(class_id_to_name.get)(y) df['class'] = np.vectorize(class_id_to_name.get)(y)

View File

@@ -6,10 +6,10 @@
<parent> <parent>
<groupId>ml.dmlc</groupId> <groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId> <artifactId>xgboost-jvm_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
</parent> </parent>
<artifactId>xgboost4j_2.12</artifactId> <artifactId>xgboost4j_2.12</artifactId>
<version>1.7.2</version> <version>1.7.4</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<dependencies> <dependencies>

View File

@@ -1 +1 @@
1.7.2 1.7.4

View File

@@ -36,7 +36,6 @@ try:
PANDAS_INSTALLED = True PANDAS_INSTALLED = True
except ImportError: except ImportError:
MultiIndex = object MultiIndex = object
DataFrame = object DataFrame = object
Series = object Series = object
@@ -161,6 +160,7 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem
# `importlib.utils`, except it's unclear from its document on how to use it. This one # `importlib.utils`, except it's unclear from its document on how to use it. This one
# seems to be easy to understand and works out of box. # seems to be easy to understand and works out of box.
# Copyright 2015 The TensorFlow Authors. All Rights Reserved. # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this

View File

@@ -2172,6 +2172,7 @@ class Booster:
) )
return _prediction_output(shape, dims, preds, False) return _prediction_output(shape, dims, preds, False)
# pylint: disable=too-many-statements
def inplace_predict( def inplace_predict(
self, self,
data: DataType, data: DataType,
@@ -2192,10 +2193,10 @@ class Booster:
.. code-block:: python .. code-block:: python
booster.set_param({'predictor': 'gpu_predictor'}) booster.set_param({"predictor": "gpu_predictor"})
booster.inplace_predict(cupy_array) booster.inplace_predict(cupy_array)
booster.set_param({'predictor': 'cpu_predictor}) booster.set_param({"predictor": "cpu_predictor"})
booster.inplace_predict(numpy_array) booster.inplace_predict(numpy_array)
.. versionadded:: 1.1.0 .. versionadded:: 1.1.0
@@ -2301,14 +2302,16 @@ class Booster:
) )
return _prediction_output(shape, dims, preds, False) return _prediction_output(shape, dims, preds, False)
if isinstance(data, scipy.sparse.csr_matrix): if isinstance(data, scipy.sparse.csr_matrix):
csr = data from .data import _transform_scipy_csr
data = _transform_scipy_csr(data)
_check_call( _check_call(
_LIB.XGBoosterPredictFromCSR( _LIB.XGBoosterPredictFromCSR(
self.handle, self.handle,
_array_interface(csr.indptr), _array_interface(data.indptr),
_array_interface(csr.indices), _array_interface(data.indices),
_array_interface(csr.data), _array_interface(data.data),
c_bst_ulong(csr.shape[1]), c_bst_ulong(data.shape[1]),
from_pystr_to_cstr(json.dumps(args)), from_pystr_to_cstr(json.dumps(args)),
p_handle, p_handle,
ctypes.byref(shape), ctypes.byref(shape),

View File

@@ -30,6 +30,7 @@ from .core import (
c_array, c_array,
c_str, c_str,
from_pystr_to_cstr, from_pystr_to_cstr,
make_jcargs,
) )
DispatchedDataBackendReturnType = Tuple[ DispatchedDataBackendReturnType = Tuple[
@@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes:
return interface_str return interface_str
def _transform_scipy_csr(data: DataType) -> DataType:
from scipy.sparse import csr_matrix
indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
values, _ = _ensure_np_dtype(data.data, data.data.dtype)
if (
indptr is not data.indptr
or indices is not data.indices
or values is not data.data
):
data = csr_matrix((values, indices, indptr), shape=data.shape)
return data
def _from_scipy_csr( def _from_scipy_csr(
data: DataType, data: DataType,
missing: FloatCompatible, missing: FloatCompatible,
@@ -93,18 +109,14 @@ def _from_scipy_csr(
f"length mismatch: {len(data.indices)} vs {len(data.data)}" f"length mismatch: {len(data.indices)} vs {len(data.data)}"
) )
handle = ctypes.c_void_p() handle = ctypes.c_void_p()
args = { data = _transform_scipy_csr(data)
"missing": float(missing),
"nthread": int(nthread),
}
config = bytes(json.dumps(args), "utf-8")
_check_call( _check_call(
_LIB.XGDMatrixCreateFromCSR( _LIB.XGDMatrixCreateFromCSR(
_array_interface(data.indptr), _array_interface(data.indptr),
_array_interface(data.indices), _array_interface(data.indices),
_array_interface(data.data), _array_interface(data.data),
c_bst_ulong(data.shape[1]), c_bst_ulong(data.shape[1]),
config, make_jcargs(missing=float(missing), nthread=int(nthread)),
ctypes.byref(handle), ctypes.byref(handle),
) )
) )
@@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool:
def _ensure_np_dtype( def _ensure_np_dtype(
data: DataType, data: DataType, dtype: Optional[NumpyDType]
dtype: Optional[NumpyDType]
) -> Tuple[np.ndarray, Optional[NumpyDType]]: ) -> Tuple[np.ndarray, Optional[NumpyDType]]:
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
data = data.astype(np.float32, copy=False)
dtype = np.float32 dtype = np.float32
data = data.astype(dtype, copy=False)
if not data.flags.aligned:
data = np.require(data, requirements="A")
return data, dtype return data, dtype
@@ -1197,11 +1210,13 @@ def _proxy_transform(
data, _ = _ensure_np_dtype(data, data.dtype) data, _ = _ensure_np_dtype(data, data.dtype)
return data, None, feature_names, feature_types return data, None, feature_names, feature_types
if _is_scipy_csr(data): if _is_scipy_csr(data):
data = _transform_scipy_csr(data)
return data, None, feature_names, feature_types return data, None, feature_names, feature_types
if _is_pandas_df(data): if _is_pandas_df(data):
arr, feature_names, feature_types = _transform_pandas_df( arr, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types data, enable_categorical, feature_names, feature_types
) )
arr, _ = _ensure_np_dtype(arr, arr.dtype)
return arr, None, feature_names, feature_types return arr, None, feature_names, feature_types
raise TypeError("Value type is not supported for data iterator:" + str(type(data))) raise TypeError("Value type is not supported for data iterator:" + str(type(data)))

View File

@@ -674,7 +674,7 @@ class XGBModel(XGBModelBase):
self.kwargs = {} self.kwargs = {}
self.kwargs[key] = value self.kwargs[key] = value
if hasattr(self, "_Booster"): if self.__sklearn_is_fitted__():
parameters = self.get_xgb_params() parameters = self.get_xgb_params()
self.get_booster().set_param(parameters) self.get_booster().set_param(parameters)
@@ -701,39 +701,12 @@ class XGBModel(XGBModelBase):
np.iinfo(np.int32).max np.iinfo(np.int32).max
) )
def parse_parameter(value: Any) -> Optional[Union[int, float, str]]:
for t in (int, float, str):
try:
ret = t(value)
return ret
except ValueError:
continue
return None
# Get internal parameter values
try:
config = json.loads(self.get_booster().save_config())
stack = [config]
internal = {}
while stack:
obj = stack.pop()
for k, v in obj.items():
if k.endswith("_param"):
for p_k, p_v in v.items():
internal[p_k] = p_v
elif isinstance(v, dict):
stack.append(v)
for k, v in internal.items():
if k in params and params[k] is None:
params[k] = parse_parameter(v)
except ValueError:
pass
return params return params
def get_xgb_params(self) -> Dict[str, Any]: def get_xgb_params(self) -> Dict[str, Any]:
"""Get xgboost specific parameters.""" """Get xgboost specific parameters."""
params = self.get_params() params: Dict[str, Any] = self.get_params()
# Parameters that should not go into native learner. # Parameters that should not go into native learner.
wrapper_specific = { wrapper_specific = {
"importance_type", "importance_type",
@@ -750,6 +723,7 @@ class XGBModel(XGBModelBase):
for k, v in params.items(): for k, v in params.items():
if k not in wrapper_specific and not callable(v): if k not in wrapper_specific and not callable(v):
filtered[k] = v filtered[k] = v
return filtered return filtered
def get_num_boosting_rounds(self) -> int: def get_num_boosting_rounds(self) -> int:
@@ -1070,7 +1044,7 @@ class XGBModel(XGBModelBase):
# error with incompatible data type. # error with incompatible data type.
# Inplace predict doesn't handle as many data types as DMatrix, but it's # Inplace predict doesn't handle as many data types as DMatrix, but it's
# sufficient for dask interface where input is simpler. # sufficient for dask interface where input is simpler.
predictor = self.get_params().get("predictor", None) predictor = self.get_xgb_params().get("predictor", None)
if predictor in ("auto", None) and self.booster != "gblinear": if predictor in ("auto", None) and self.booster != "gblinear":
return True return True
return False return False
@@ -1336,7 +1310,7 @@ class XGBModel(XGBModelBase):
------- -------
coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]`` coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]``
""" """
if self.get_params()["booster"] != "gblinear": if self.get_xgb_params()["booster"] != "gblinear":
raise AttributeError( raise AttributeError(
f"Coefficients are not defined for Booster type {self.booster}" f"Coefficients are not defined for Booster type {self.booster}"
) )
@@ -1366,7 +1340,7 @@ class XGBModel(XGBModelBase):
------- -------
intercept_ : array of shape ``(1,)`` or ``[n_classes]`` intercept_ : array of shape ``(1,)`` or ``[n_classes]``
""" """
if self.get_params()["booster"] != "gblinear": if self.get_xgb_params()["booster"] != "gblinear":
raise AttributeError( raise AttributeError(
f"Intercept (bias) is not defined for Booster type {self.booster}" f"Intercept (bias) is not defined for Booster type {self.booster}"
) )

View File

@@ -140,6 +140,13 @@ _unsupported_predict_params = {
} }
# TODO: supply hint message for all other unsupported params.
_unsupported_params_hint_message = {
"enable_categorical": "`xgboost.spark` estimators do not have 'enable_categorical' param, "
"but you can set `feature_types` param and mark categorical features with 'c' string."
}
class _SparkXGBParams( class _SparkXGBParams(
HasFeaturesCol, HasFeaturesCol,
HasLabelCol, HasLabelCol,
@@ -523,7 +530,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
or k in _unsupported_predict_params or k in _unsupported_predict_params
or k in _unsupported_train_params or k in _unsupported_train_params
): ):
raise ValueError(f"Unsupported param '{k}'.") err_msg = _unsupported_params_hint_message.get(
k, f"Unsupported param '{k}'."
)
raise ValueError(err_msg)
_extra_params[k] = v _extra_params[k] = v
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict) _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params}) self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
@@ -749,6 +759,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
"feature_weights": self.getOrDefault(self.feature_weights), "feature_weights": self.getOrDefault(self.feature_weights),
"missing": float(self.getOrDefault(self.missing)), "missing": float(self.getOrDefault(self.missing)),
} }
if dmatrix_kwargs["feature_types"] is not None:
dmatrix_kwargs["enable_categorical"] = True
booster_params["nthread"] = cpu_per_task booster_params["nthread"] = cpu_per_task
use_gpu = self.getOrDefault(self.use_gpu) use_gpu = self.getOrDefault(self.use_gpu)

View File

@@ -48,20 +48,21 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
return cat < 0 || cat >= kMaxCat; return cat < 0 || cat >= kMaxCat;
} }
/* \brief Whether should it traverse to left branch of a tree. /**
* \brief Whether should it traverse to left branch of a tree.
* *
* For one hot split, go to left if it's NOT the matching category. * Go to left if it's NOT the matching category, which matches one-hot encoding.
*/ */
template <bool validate = true> inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
KCatBitField const s_cats(cats); KCatBitField const s_cats(cats);
// FIXME: Size() is not accurate since it represents the size of bit set instead of if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
// actual number of categories. return true;
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
return dft_left;
} }
auto pos = KCatBitField::ToBitPos(cat); auto pos = KCatBitField::ToBitPos(cat);
// If the input category is larger than the size of the bit field, it implies that the
// category is not chosen. Otherwise the bit field would have the category instead of
// being smaller than the category value.
if (pos.int_pos >= cats.size()) { if (pos.int_pos >= cats.size()) {
return true; return true;
} }

View File

@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
feature_offsets_[fid] = accum_index; feature_offsets_[fid] = accum_index;
} }
SetTypeSize(gmat.max_num_bins); SetTypeSize(gmat.MaxNumBinPerFeat());
auto storage_size = auto storage_size =
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_); feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
index_.resize(storage_size, 0); index_.resize(storage_size, 0);

View File

@@ -62,7 +62,7 @@ void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t,
#endif // !defined(XGBOOST_USE_CUDA) #endif // !defined(XGBOOST_USE_CUDA)
template <typename T, std::int32_t kDim> template <typename T, std::int32_t kDim>
auto cbegin(TensorView<T, kDim> v) { // NOLINT auto cbegin(TensorView<T, kDim> const& v) { // NOLINT
auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& { auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
}); });
@@ -70,19 +70,19 @@ auto cbegin(TensorView<T, kDim> v) { // NOLINT
} }
template <typename T, std::int32_t kDim> template <typename T, std::int32_t kDim>
auto cend(TensorView<T, kDim> v) { // NOLINT auto cend(TensorView<T, kDim> const& v) { // NOLINT
return cbegin(v) + v.Size(); return cbegin(v) + v.Size();
} }
template <typename T, std::int32_t kDim> template <typename T, std::int32_t kDim>
auto begin(TensorView<T, kDim> v) { // NOLINT auto begin(TensorView<T, kDim>& v) { // NOLINT
auto it = common::MakeIndexTransformIter( auto it = common::MakeIndexTransformIter(
[&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); }); [&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
return it; return it;
} }
template <typename T, std::int32_t kDim> template <typename T, std::int32_t kDim>
auto end(TensorView<T, kDim> v) { // NOLINT auto end(TensorView<T, kDim>& v) { // NOLINT
return begin(v) + v.Size(); return begin(v) + v.Size();
} }
} // namespace linalg } // namespace linalg

View File

@@ -144,7 +144,7 @@ class PartitionBuilder {
auto gidx = gidx_calc(ridx); auto gidx = gidx_calc(ridx);
bool go_left = default_left; bool go_left = default_left;
if (gidx > -1) { if (gidx > -1) {
go_left = Decision(node_cats, cut_values[gidx], default_left); go_left = Decision(node_cats, cut_values[gidx]);
} }
return go_left; return go_left;
} else { } else {
@@ -157,7 +157,7 @@ class PartitionBuilder {
bool go_left = default_left; bool go_left = default_left;
if (gidx > -1) { if (gidx > -1) {
if (is_cat) { if (is_cat) {
go_left = Decision(node_cats, cut_values[gidx], default_left); go_left = Decision(node_cats, cut_values[gidx]);
} else { } else {
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value; go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
} }

View File

@@ -1,5 +1,5 @@
/*! /**
* Copyright 2019-2021 by Contributors * Copyright 2019-2023 by XGBoost Contributors
* \file array_interface.h * \file array_interface.h
* \brief View of __array_interface__ * \brief View of __array_interface__
*/ */
@@ -7,9 +7,11 @@
#define XGBOOST_DATA_ARRAY_INTERFACE_H_ #define XGBOOST_DATA_ARRAY_INTERFACE_H_
#include <algorithm> #include <algorithm>
#include <cinttypes> #include <cstddef> // std::size_t
#include <cstdint>
#include <map> #include <map>
#include <string> #include <string>
#include <type_traits> // std::alignment_of,std::remove_pointer_t
#include <utility> #include <utility>
#include <vector> #include <vector>
@@ -394,6 +396,11 @@ class ArrayInterface {
data = ArrayInterfaceHandler::ExtractData(array, n); data = ArrayInterfaceHandler::ExtractData(array, n);
static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported."); static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
auto alignment = this->ElementAlignment();
auto ptr = reinterpret_cast<uintptr_t>(this->data);
CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment.";
if (allow_mask) { if (allow_mask) {
common::Span<RBitField8::value_type> s_mask; common::Span<RBitField8::value_type> s_mask;
size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask); size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
@@ -512,9 +519,15 @@ class ArrayInterface {
return func(reinterpret_cast<uint64_t const *>(data)); return func(reinterpret_cast<uint64_t const *>(data));
} }
XGBOOST_DEVICE size_t ElementSize() { XGBOOST_DEVICE std::size_t ElementSize() const {
return this->DispatchCall( return this->DispatchCall([](auto *typed_data_ptr) {
[](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); }); return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
});
}
XGBOOST_DEVICE std::size_t ElementAlignment() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
});
} }
template <typename T = float, typename... Index> template <typename T = float, typename... Index>

View File

@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat, GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
double sparse_thresh, bool sorted_sketch, int32_t n_threads, double sparse_thresh, bool sorted_sketch, int32_t n_threads,
common::Span<float> hess) { common::Span<float> hess)
: max_numeric_bins_per_feat{max_bins_per_feat} {
CHECK(p_fmat->SingleColBlock()); CHECK(p_fmat->SingleColBlock());
// We use sorted sketching for approx tree method since it's more efficient in // We use sorted sketching for approx tree method since it's more efficient in
// computation time (but higher memory usage). // computation time (but higher memory usage).
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess); cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
max_num_bins = max_bins_per_feat;
const uint32_t nbins = cut.Ptrs().back(); const uint32_t nbins = cut.Ptrs().back();
hit_count.resize(nbins, 0); hit_count.resize(nbins, 0);
hit_count_tloc_.resize(n_threads * nbins, 0); hit_count_tloc_.resize(n_threads * nbins, 0);
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
: row_ptr(info.num_row_ + 1, 0), : row_ptr(info.num_row_ + 1, 0),
hit_count(cuts.TotalBins(), 0), hit_count(cuts.TotalBins(), 0),
cut{std::forward<common::HistogramCuts>(cuts)}, cut{std::forward<common::HistogramCuts>(cuts)},
max_num_bins(max_bin_per_feat), max_numeric_bins_per_feat(max_bin_per_feat),
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {} isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
#if !defined(XGBOOST_USE_CUDA) #if !defined(XGBOOST_USE_CUDA)
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
} }
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft, GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
common::HistogramCuts const &cuts, int32_t max_bins_per_feat, common::HistogramCuts cuts, int32_t max_bins_per_feat,
bool isDense, double sparse_thresh, int32_t n_threads) { bool isDense, double sparse_thresh, int32_t n_threads)
: cut{std::move(cuts)},
max_numeric_bins_per_feat{max_bins_per_feat},
base_rowid{batch.base_rowid},
isDense_{isDense} {
CHECK_GE(n_threads, 1); CHECK_GE(n_threads, 1);
base_rowid = batch.base_rowid;
isDense_ = isDense;
cut = cuts;
max_num_bins = max_bins_per_feat;
CHECK_EQ(row_ptr.size(), 0); CHECK_EQ(row_ptr.size(), 0);
// The number of threads is pegged to the batch size. If the OMP // The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size, // block is parallelized on anything other than the batch/block size,
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
#undef INSTANTIATION_PUSH #undef INSTANTIATION_PUSH
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) { void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) { if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
isDense) {
// compress dense index to uint8 // compress dense index to uint8
index.SetBinTypeSize(common::kUint8BinsTypeSize); index.SetBinTypeSize(common::kUint8BinsTypeSize);
index.Resize((sizeof(uint8_t)) * n_index); index.Resize((sizeof(uint8_t)) * n_index);
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) && } else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
isDense) { isDense) {
// compress dense index to uint16 // compress dense index to uint16
index.SetBinTypeSize(common::kUint16BinsTypeSize); index.SetBinTypeSize(common::kUint16BinsTypeSize);

View File

@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
EllpackPage const& in_page, BatchParam const& p) EllpackPage const& in_page, BatchParam const& p)
: max_num_bins{p.max_bin} { : max_numeric_bins_per_feat{p.max_bin} {
auto page = in_page.Impl(); auto page = in_page.Impl();
isDense_ = page->is_dense; isDense_ = page->is_dense;

View File

@@ -133,11 +133,15 @@ class GHistIndexMatrix {
std::vector<size_t> hit_count; std::vector<size_t> hit_count;
/*! \brief The corresponding cuts */ /*! \brief The corresponding cuts */
common::HistogramCuts cut; common::HistogramCuts cut;
/*! \brief max_bin for each feature. */ /** \brief max_bin for each feature. */
bst_bin_t max_num_bins; bst_bin_t max_numeric_bins_per_feat;
/*! \brief base row index for current page (used by external memory) */ /*! \brief base row index for current page (used by external memory) */
size_t base_rowid{0}; size_t base_rowid{0};
bst_bin_t MaxNumBinPerFeat() const {
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
}
~GHistIndexMatrix(); ~GHistIndexMatrix();
/** /**
* \brief Constrcutor for SimpleDMatrix. * \brief Constrcutor for SimpleDMatrix.
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
* \brief Constructor for external memory. * \brief Constructor for external memory.
*/ */
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft, GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense, common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
double sparse_thresh, int32_t n_threads); double sparse_thresh, int32_t n_threads);
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back. GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.

View File

@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
if (!fi->Read(&page->hit_count)) { if (!fi->Read(&page->hit_count)) {
return false; return false;
} }
if (!fi->Read(&page->max_num_bins)) { if (!fi->Read(&page->max_numeric_bins_per_feat)) {
return false; return false;
} }
if (!fi->Read(&page->base_rowid)) { if (!fi->Read(&page->base_rowid)) {
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) + page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
sizeof(uint64_t); sizeof(uint64_t);
// max_bins, base row, is_dense // max_bins, base row, is_dense
fo->Write(page.max_num_bins); fo->Write(page.max_numeric_bins_per_feat);
bytes += sizeof(page.max_num_bins); bytes += sizeof(page.max_numeric_bins_per_feat);
fo->Write(page.base_rowid); fo->Write(page.base_rowid);
bytes += sizeof(page.base_rowid); bytes += sizeof(page.base_rowid);
fo->Write(page.IsDense()); fo->Write(page.IsDense());

View File

@@ -58,6 +58,13 @@ void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, Bat
} }
}; };
auto ellpack = [&]() { auto ellpack = [&]() {
// workaround ellpack being initialized from CPU.
if (p.gpu_id == Context::kCpuId) {
p.gpu_id = ref_->Ctx()->gpu_id;
}
if (p.gpu_id == Context::kCpuId) {
p.gpu_id = 0;
}
for (auto const& page : ref_->GetBatches<EllpackPage>(p)) { for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
GetCutsFromEllpack(page, p_cuts); GetCutsFromEllpack(page, p_cuts);
break; break;
@@ -172,9 +179,9 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
size_t i = 0; size_t i = 0;
while (iter.Next()) { while (iter.Next()) {
if (!p_sketch) { if (!p_sketch) {
p_sketch.reset(new common::HostSketchContainer{batch_param_.max_bin, p_sketch.reset(new common::HostSketchContainer{
proxy->Info().feature_types.ConstHostSpan(), batch_param_.max_bin, proxy->Info().feature_types.ConstHostSpan(), column_sizes,
column_sizes, false, ctx_.Threads()}); !proxy->Info().group_ptr_.empty(), ctx_.Threads()});
} }
HostAdapterDispatch(proxy, [&](auto const& batch) { HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i]; proxy->Info().num_nonzero_ = batch_nnz[i];

View File

@@ -42,6 +42,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
out->Info() = this->Info().Slice(ridxs); out->Info() = this->Info().Slice(ridxs);
out->Info().num_nonzero_ = h_offset.back(); out->Info().num_nonzero_ = h_offset.back();
} }
out->ctx_ = this->ctx_;
return out; return out;
} }

View File

@@ -28,6 +28,7 @@
#include "xgboost/logging.h" #include "xgboost/logging.h"
#include "xgboost/objective.h" #include "xgboost/objective.h"
#include "xgboost/predictor.h" #include "xgboost/predictor.h"
#include "xgboost/string_view.h"
#include "xgboost/tree_updater.h" #include "xgboost/tree_updater.h"
namespace xgboost { namespace xgboost {
@@ -395,23 +396,36 @@ void GBTree::LoadConfig(Json const& in) {
tparam_.process_type = TreeProcessType::kDefault; tparam_.process_type = TreeProcessType::kDefault;
int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) { if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
LOG(WARNING) LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
<< "Loading from a raw memory buffer on CPU only machine. " "Changing predictor to auto.";
"Changing predictor to auto.";
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}}); tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
} }
auto msg = StringView{
R"(
Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
machine. Consider using `save_model/load_model` instead. See:
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
for more details about differences between saving model and serializing.)"};
if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) { if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) {
tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}}); tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}});
LOG(WARNING) LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
<< "Loading from a raw memory buffer on CPU only machine. "
"Changing tree_method to hist.";
} }
auto const& j_updaters = get<Object const>(in["updater"]); auto const& j_updaters = get<Object const>(in["updater"]);
updaters_.clear(); updaters_.clear();
for (auto const& kv : j_updaters) { for (auto const& kv : j_updaters) {
std::unique_ptr<TreeUpdater> up( auto name = kv.first;
TreeUpdater::Create(kv.first, ctx_, model_.learner_model_param->task)); if (n_gpus == 0 && name == "grow_gpu_hist") {
name = "grow_quantile_histmaker";
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
}
std::unique_ptr<TreeUpdater> up{
TreeUpdater::Create(name, ctx_, model_.learner_model_param->task)};
up->LoadConfig(kv.second); up->LoadConfig(kv.second);
updaters_.push_back(std::move(up)); updaters_.push_back(std::move(up));
} }

View File

@@ -18,9 +18,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
if (has_categorical && common::IsCat(cats.split_type, nid)) { if (has_categorical && common::IsCat(cats.split_type, nid)) {
auto node_categories = auto node_categories =
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size); cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
return common::Decision<true>(node_categories, fvalue, node.DefaultLeft()) return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
? node.LeftChild()
: node.RightChild();
} else { } else {
return node.LeftChild() + !(fvalue < node.SplitCond()); return node.LeftChild() + !(fvalue < node.SplitCond());
} }

View File

@@ -248,8 +248,10 @@ class EvaluateSplitAgent {
template <int kBlockSize> template <int kBlockSize>
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs, bst_feature_t max_active_features,
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx, common::Span<const EvaluateSplitInputs> d_inputs,
const EvaluateSplitSharedInputs shared_inputs,
common::Span<bst_feature_t> sorted_idx,
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) { common::Span<DeviceSplitCandidate> out_candidates) {
// Aligned && shared storage for best_split // Aligned && shared storage for best_split
@@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
__syncthreads(); __syncthreads();
// Allocate blocks to one feature of one node // Allocate blocks to one feature of one node
const auto input_idx = blockIdx.x / number_active_features; const auto input_idx = blockIdx.x / max_active_features;
const EvaluateSplitInputs &inputs = d_inputs[input_idx]; const EvaluateSplitInputs &inputs = d_inputs[input_idx];
// One block for each feature. Features are sampled, so fidx != blockIdx.x // One block for each feature. Features are sampled, so fidx != blockIdx.x
// Some blocks may not have any feature to work on, simply return
int fidx = inputs.feature_set[blockIdx.x % number_active_features]; int feature_offset = blockIdx.x % max_active_features;
if (feature_offset >= inputs.feature_set.size()) {
return;
}
int fidx = inputs.feature_set[feature_offset];
using AgentT = EvaluateSplitAgent<kBlockSize>; using AgentT = EvaluateSplitAgent<kBlockSize>;
__shared__ typename AgentT::TempStorage temp_storage; __shared__ typename AgentT::TempStorage temp_storage;
@@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
} }
void GPUHistEvaluator::LaunchEvaluateSplits( void GPUHistEvaluator::LaunchEvaluateSplits(
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs, EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) { common::Span<DeviceSplitCandidate> out_splits) {
@@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
this->SortHistogram(d_inputs, shared_inputs, evaluator); this->SortHistogram(d_inputs, shared_inputs, evaluator);
} }
size_t combined_num_features = number_active_features * d_inputs.size(); size_t combined_num_features = max_active_features * d_inputs.size();
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features); dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
combined_num_features, DeviceSplitCandidate());
// One block for each feature // One block for each feature
uint32_t constexpr kBlockThreads = 32; uint32_t constexpr kBlockThreads = 32;
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}( dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs, 0}(
shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
shared_inputs,
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
evaluator, dh::ToSpan(feature_best_splits)); evaluator, dh::ToSpan(feature_best_splits));
// Reduce to get best candidate for left and right child over all features // Reduce to get best candidate for left and right child over all features
auto reduce_offset = dh::MakeTransformIterator<size_t>( auto reduce_offset =
thrust::make_counting_iterator(0llu), dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) -> size_t { return idx * number_active_features; }); [=] __device__(size_t idx) -> size_t {
return idx * max_active_features;
});
size_t temp_storage_bytes = 0; size_t temp_storage_bytes = 0;
auto num_segments = out_splits.size(); auto num_segments = out_splits.size();
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(), cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
@@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
} }
void GPUHistEvaluator::EvaluateSplits( void GPUHistEvaluator::EvaluateSplits(
const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features, const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs, common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) { common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>(); auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size()); dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
auto out_splits = dh::ToSpan(splits_out_storage); auto out_splits = dh::ToSpan(splits_out_storage);
this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator, this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
out_splits); evaluator, out_splits);
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries; auto d_entries = out_entries;

View File

@@ -170,13 +170,18 @@ class GPUHistEvaluator {
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator); TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
// impl of evaluate splits, contains CUDA kernels so it's public // impl of evaluate splits, contains CUDA kernels so it's public
void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs, void LaunchEvaluateSplits(
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator, bst_feature_t max_active_features,
common::Span<DeviceSplitCandidate> out_splits); common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
/** /**
* \brief Evaluate splits for left and right nodes. * \brief Evaluate splits for left and right nodes.
*/ */
void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs, void EvaluateSplits(const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs, EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_splits); common::Span<GPUExpandEntry> out_splits);
/** /**

View File

@@ -188,7 +188,8 @@ struct GPUHistMakerDevice {
common::Span<GradientPair> gpair; common::Span<GradientPair> gpair;
dh::device_vector<int> monotone_constraints; dh::device_vector<int> monotone_constraints;
dh::device_vector<float> update_predictions; // node idx for each sample
dh::device_vector<bst_node_t> positions;
TrainParam param; TrainParam param;
@@ -318,24 +319,27 @@ struct GPUHistMakerDevice {
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id); right_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> right_feature_set = common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); interaction_constraints.Query(right_sampled_features->DeviceSpan(),
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum, right_nidx);
left_feature_set, hist.GetNodeHistogram(left_nidx)}; h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum, candidate.split.left_sum, left_feature_set,
right_feature_set, hist.GetNodeHistogram(right_nidx)}; hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
candidate.split.right_sum, right_feature_set,
hist.GetNodeHistogram(right_nidx)};
} }
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); bst_feature_t max_active_features = 0;
for (auto input : h_node_inputs) { for (auto input : h_node_inputs) {
CHECK_EQ(input.feature_set.size(), number_active_features) max_active_features = std::max(max_active_features,
<< "Current implementation assumes that the number of active features " bst_feature_t(input.feature_set.size()));
"(after sampling) in any node is the same";
} }
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), dh::safe_cuda(cudaMemcpyAsync(
h_node_inputs.size() * sizeof(EvaluateSplitInputs), d_node_inputs.data().get(), h_node_inputs.data(),
cudaMemcpyDefault)); h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs), this->evaluator_.EvaluateSplits(nidx, max_active_features,
shared_inputs, dh::ToSpan(entries)); dh::ToSpan(d_node_inputs), shared_inputs,
dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
@@ -403,8 +407,7 @@ struct GPUHistMakerDevice {
go_left = data.split_node.DefaultLeft(); go_left = data.split_node.DefaultLeft();
} else { } else {
if (data.split_type == FeatureType::kCategorical) { if (data.split_type == FeatureType::kCategorical) {
go_left = common::Decision<false>(data.node_cats.Bits(), cut_value, go_left = common::Decision(data.node_cats.Bits(), cut_value);
data.split_node.DefaultLeft());
} else { } else {
go_left = cut_value <= data.split_node.SplitCond(); go_left = cut_value <= data.split_node.SplitCond();
} }
@@ -424,7 +427,7 @@ struct GPUHistMakerDevice {
LOG(FATAL) << "Current objective function can not be used with external memory."; LOG(FATAL) << "Current objective function can not be used with external memory.";
} }
p_out_position->Resize(0); p_out_position->Resize(0);
update_predictions.clear(); positions.clear();
return; return;
} }
@@ -459,8 +462,6 @@ struct GPUHistMakerDevice {
HostDeviceVector<bst_node_t>* p_out_position) { HostDeviceVector<bst_node_t>* p_out_position) {
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto d_gpair = this->gpair; auto d_gpair = this->gpair;
update_predictions.resize(row_partitioner->GetRows().size());
auto d_update_predictions = dh::ToSpan(update_predictions);
p_out_position->SetDevice(ctx_->gpu_id); p_out_position->SetDevice(ctx_->gpu_id);
p_out_position->Resize(row_partitioner->GetRows().size()); p_out_position->Resize(row_partitioner->GetRows().size());
@@ -481,7 +482,7 @@ struct GPUHistMakerDevice {
if (common::IsCat(d_feature_types, position)) { if (common::IsCat(d_feature_types, position)) {
auto node_cats = categories.subspan(categories_segments[position].beg, auto node_cats = categories.subspan(categories_segments[position].beg,
categories_segments[position].size); categories_segments[position].size);
go_left = common::Decision<false>(node_cats, element, node.DefaultLeft()); go_left = common::Decision(node_cats, element);
} else { } else {
go_left = element <= node.SplitCond(); go_left = element <= node.SplitCond();
} }
@@ -495,32 +496,45 @@ struct GPUHistMakerDevice {
node = d_nodes[position]; node = d_nodes[position];
} }
d_update_predictions[row_id] = node.LeafValue();
return position; return position;
}; // NOLINT }; // NOLINT
auto d_out_position = p_out_position->DeviceSpan(); auto d_out_position = p_out_position->DeviceSpan();
row_partitioner->FinalisePosition(d_out_position, new_position_op); row_partitioner->FinalisePosition(d_out_position, new_position_op);
auto s_position = p_out_position->ConstDeviceSpan();
positions.resize(s_position.size());
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
s_position.size_bytes(), cudaMemcpyDeviceToDevice));
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) { dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
bst_node_t position = d_out_position[idx]; bst_node_t position = d_out_position[idx];
d_update_predictions[idx] = d_nodes[position].LeafValue();
bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
d_out_position[idx] = is_row_sampled ? ~position : position; d_out_position[idx] = is_row_sampled ? ~position : position;
}); });
} }
bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) { bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
if (update_predictions.empty()) { if (positions.empty()) {
return false; return false;
} }
CHECK(p_tree); CHECK(p_tree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
auto d_update_predictions = dh::ToSpan(update_predictions);
CHECK_EQ(out_preds_d.Size(), d_update_predictions.size()); auto d_position = dh::ToSpan(positions);
dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { CHECK_EQ(out_preds_d.Size(), d_position.size());
out_preds_d(idx) += d_update_predictions[idx];
auto const& h_nodes = p_tree->GetNodes();
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
auto d_nodes = dh::ToSpan(nodes);
dh::LaunchN(d_position.size(), [=] XGBOOST_DEVICE(std::size_t idx) mutable {
bst_node_t nidx = d_position[idx];
auto weight = d_nodes[nidx].LeafValue();
out_preds_d(idx) += weight;
}); });
return true; return true;
} }
@@ -863,6 +877,7 @@ class GPUHistMaker : public TreeUpdater {
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
char const* Name() const override { return "grow_gpu_hist"; } char const* Name() const override { return "grow_gpu_hist"; }
bool HasNodePosition() const override { return true; }
private: private:
bool initialised_{false}; bool initialised_{false};

View File

@@ -4,7 +4,7 @@ set -euo pipefail
source tests/buildkite/conftest.sh source tests/buildkite/conftest.sh
echo "--- Run Google Tests with CUDA, using 4 GPUs" echo "--- Run Google Tests with CUDA, using a GPU"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda buildkite-agent artifact download "build/testxgboost" . --step build-cuda
chmod +x build/testxgboost chmod +x build/testxgboost
tests/ci_build/ci_build.sh gpu nvidia-docker \ tests/ci_build/ci_build.sh gpu nvidia-docker \
@@ -12,11 +12,12 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
build/testxgboost build/testxgboost
echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled" # Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved
rm -rfv build/ # echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm # rm -rfv build/
chmod +x build/testxgboost # buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
tests/ci_build/ci_build.sh rmm nvidia-docker \ # chmod +x build/testxgboost
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ # tests/ci_build/ci_build.sh rmm nvidia-docker \
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \ # --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
"source activate gpu_test && build/testxgboost --use-rmm-pool" # --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
# "source activate gpu_test && build/testxgboost --use-rmm-pool"

View File

@@ -36,7 +36,8 @@ dependencies:
- cloudpickle - cloudpickle
- shap - shap
- modin - modin
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
- pyspark>=3.3.1
- pip: - pip:
- datatable - datatable
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
- https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz

View File

@@ -1,11 +1,14 @@
/*! /*!
* Copyright 2021 by XGBoost Contributors * Copyright 2021-2022 by XGBoost Contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <xgboost/json.h>
#include <xgboost/learner.h>
#include <limits> #include <limits>
#include "../../../src/common/categorical.h" #include "../../../src/common/categorical.h"
#include "../helpers.h"
namespace xgboost { namespace xgboost {
namespace common { namespace common {
@@ -15,29 +18,76 @@ TEST(Categorical, Decision) {
ASSERT_TRUE(common::InvalidCat(a)); ASSERT_TRUE(common::InvalidCat(a));
std::vector<uint32_t> cats(256, 0); std::vector<uint32_t> cats(256, 0);
ASSERT_TRUE(Decision(cats, a, true)); ASSERT_TRUE(Decision(cats, a));
// larger than size // larger than size
a = 256; a = 256;
ASSERT_TRUE(Decision(cats, a, true)); ASSERT_TRUE(Decision(cats, a));
// negative // negative
a = -1; a = -1;
ASSERT_TRUE(Decision(cats, a, true)); ASSERT_TRUE(Decision(cats, a));
CatBitField bits{cats}; CatBitField bits{cats};
bits.Set(0); bits.Set(0);
a = -0.5; a = -0.5;
ASSERT_TRUE(Decision(cats, a, true)); ASSERT_TRUE(Decision(cats, a));
// round toward 0 // round toward 0
a = 0.5; a = 0.5;
ASSERT_FALSE(Decision(cats, a, true)); ASSERT_FALSE(Decision(cats, a));
// valid // valid
a = 13; a = 13;
bits.Set(a); bits.Set(a);
ASSERT_FALSE(Decision(bits.Bits(), a, true)); ASSERT_FALSE(Decision(bits.Bits(), a));
}
/**
* Test for running inference with input category greater than the one stored in tree.
*/
TEST(Categorical, MinimalSet) {
std::size_t constexpr kRows = 256, kCols = 1, kCat = 3;
std::vector<FeatureType> types{FeatureType::kCategorical};
auto Xy =
RandomDataGenerator{kRows, kCols, 0.0}.Type(types).MaxCategory(kCat).GenerateDMatrix(true);
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
learner->SetParam("max_depth", "1");
learner->SetParam("tree_method", "hist");
learner->Configure();
learner->UpdateOneIter(0, Xy);
Json model{Object{}};
learner->SaveModel(&model);
auto tree = model["learner"]["gradient_booster"]["model"]["trees"][0];
ASSERT_GE(get<I32Array const>(tree["categories"]).size(), 1);
auto v = get<I32Array const>(tree["categories"])[0];
HostDeviceVector<float> predt;
{
std::vector<float> data{static_cast<float>(kCat),
static_cast<float>(kCat + 1), 32.0f, 33.0f, 34.0f};
auto test = GetDMatrixFromData(data, data.size(), kCols);
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
ASSERT_EQ(predt.Size(), data.size());
auto const& h_predt = predt.ConstHostSpan();
for (auto v : h_predt) {
ASSERT_EQ(v, 1); // left child of root node
}
}
{
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
learner->LoadModel(model);
std::vector<float> data = {static_cast<float>(v)};
auto test = GetDMatrixFromData(data, data.size(), kCols);
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
auto const& h_predt = predt.ConstHostSpan();
for (auto v : h_predt) {
ASSERT_EQ(v, 2); // right child of root node
}
}
} }
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost

View File

@@ -1,10 +1,12 @@
/*! /**
* Copyright 2020-2021 by XGBoost Contributors * Copyright 2020-2023 by XGBoost Contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <xgboost/host_device_vector.h> #include <xgboost/host_device_vector.h>
#include "../helpers.h" #include "../helpers.h"
#include "../../../src/data/array_interface.h" #include "../../../src/data/array_interface.h"
#include "dmlc/logging.h"
#include "xgboost/json.h"
namespace xgboost { namespace xgboost {
TEST(ArrayInterface, Initialize) { TEST(ArrayInterface, Initialize) {
@@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) {
column["mask"]["data"] = Null{}; column["mask"]["data"] = Null{};
common::Span<RBitField8::value_type> s_mask; common::Span<RBitField8::value_type> s_mask;
EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error); EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error);
get<Object>(column).erase("mask");
// misaligned.
j_data = {Json(Integer(reinterpret_cast<Integer::Int>(
reinterpret_cast<char const*>(storage.ConstHostPointer()) + 1))),
Json(Boolean(false))};
column["data"] = j_data;
EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error);
} }
TEST(ArrayInterface, GetElement) { TEST(ArrayInterface, GetElement) {

View File

@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
} }
} }
TEST(GradientIndex, FromCategoricalLarge) {
size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
bst_bin_t max_bins = 8;
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
auto m = GetDMatrixFromData(x, kRows, 1);
Context ctx;
auto &h_ft = m->Info().feature_types.HostVector();
h_ft.resize(kCols, FeatureType::kCategorical);
BatchParam p{max_bins, 0.8};
{
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
}
{
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
common::HistogramCuts cut = page.cut;
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
}
}
}
TEST(GradientIndex, PushBatch) { TEST(GradientIndex, PushBatch) {
size_t constexpr kRows = 64, kCols = 4; size_t constexpr kRows = 64, kCols = 4;
bst_bin_t max_bins = 64; bst_bin_t max_bins = 64;

View File

@@ -1,13 +1,19 @@
// Copyright by Contributors /**
* Copyright 2016-2023 by XGBoost Contributors
*/
#include <xgboost/data.h> #include <xgboost/data.h>
#include <array> #include <array> // std::array
#include <limits> // std::numeric_limits
#include <memory> // std::unique_ptr
#include "../../../src/data/adapter.h" #include "../../../src/data/adapter.h" // ArrayAdapter
#include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/simple_dmatrix.h" // SimpleDMatrix
#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../filesystem.h" // dmlc::TemporaryDirectory
#include "../helpers.h" #include "../helpers.h" // RandomDataGenerator,CreateSimpleTestData
#include "xgboost/base.h" #include "xgboost/base.h"
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/string_view.h" // StringView
using namespace xgboost; // NOLINT using namespace xgboost; // NOLINT
@@ -298,6 +304,17 @@ TEST(SimpleDMatrix, Slice) {
ASSERT_EQ(out->Info().num_col_, out->Info().num_col_); ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
ASSERT_EQ(out->Info().num_row_, ridxs.size()); ASSERT_EQ(out->Info().num_row_, ridxs.size());
ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols); // dense ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols); // dense
{
HostDeviceVector<float> data;
auto arr_str = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&data);
auto adapter = data::ArrayAdapter{StringView{arr_str}};
auto n_threads = 2;
std::unique_ptr<DMatrix> p_fmat{
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), n_threads, "")};
std::unique_ptr<DMatrix> slice{p_fmat->Slice(ridxs)};
ASSERT_LE(slice->Ctx()->Threads(), n_threads);
}
} }
TEST(SimpleDMatrix, SaveLoadBinary) { TEST(SimpleDMatrix, SaveLoadBinary) {

View File

@@ -0,0 +1,24 @@
/**
* Copyright 2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/task.h>
#include <xgboost/tree_updater.h>
namespace xgboost {
TEST(Updater, HasNodePosition) {
Context ctx;
ObjInfo task{ObjInfo::kRegression, true, true};
std::unique_ptr<TreeUpdater> up{TreeUpdater::Create("grow_histmaker", &ctx, task)};
ASSERT_TRUE(up->HasNodePosition());
up.reset(TreeUpdater::Create("grow_quantile_histmaker", &ctx, task));
ASSERT_TRUE(up->HasNodePosition());
#if defined(XGBOOST_USE_CUDA)
ctx.gpu_id = 0;
up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, task));
ASSERT_TRUE(up->HasNodePosition());
#endif // defined(XGBOOST_USE_CUDA)
}
} // namespace xgboost

View File

@@ -139,3 +139,17 @@ class TestDeviceQuantileDMatrix:
booster.predict(xgb.DMatrix(d_m.get_data())), booster.predict(xgb.DMatrix(d_m.get_data())),
atol=1e-6, atol=1e-6,
) )
def test_ltr(self) -> None:
import cupy as cp
X, y, qid, w = tm.make_ltr(100, 3, 3, 5)
# make sure GPU is used to run sketching.
cpX = cp.array(X)
Xy_qdm = xgb.QuantileDMatrix(cpX, y, qid=qid, weight=w)
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
xgb.train({"tree_method": "gpu_hist", "objective": "rank:ndcg"}, Xy)
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
assert tm.predictor_equal(from_qdm, from_dm)

View File

@@ -1,8 +1,14 @@
import numpy as np
import sys import sys
import numpy as np
import pandas as pd
import xgboost as xgb
sys.path.append("tests/python") sys.path.append("tests/python")
# Don't import the test class, otherwise they will run twice. # Don't import the test class, otherwise they will run twice.
import test_interaction_constraints as test_ic # noqa import test_interaction_constraints as test_ic # noqa
rng = np.random.RandomState(1994) rng = np.random.RandomState(1994)
@@ -10,7 +16,34 @@ class TestGPUInteractionConstraints:
cputest = test_ic.TestInteractionConstraints() cputest = test_ic.TestInteractionConstraints()
def test_interaction_constraints(self): def test_interaction_constraints(self):
self.cputest.run_interaction_constraints(tree_method='gpu_hist') self.cputest.run_interaction_constraints(tree_method="gpu_hist")
def test_training_accuracy(self): def test_training_accuracy(self):
self.cputest.training_accuracy(tree_method='gpu_hist') self.cputest.training_accuracy(tree_method="gpu_hist")
# case where different number of features can occur in the evaluator
def test_issue_8730(self):
X = pd.DataFrame(
zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
columns=["A", "B", "C", "D"],
)
y = np.array([*([0] * 50), *([1] * 50)])
dm = xgb.DMatrix(X, label=y)
params = {
"eta": 0.16095019509249486,
"min_child_weight": 1,
"subsample": 0.688567929338029,
"colsample_bynode": 0.7,
"gamma": 5.666579817418348e-06,
"lambda": 0.14943712232059794,
"grow_policy": "depthwise",
"max_depth": 3,
"tree_method": "gpu_hist",
"interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
"objective": "count:poisson",
"eval_metric": "poisson-nloglik",
"verbosity": 0,
}
xgb.train(params, dm, num_boost_round=100)

View File

@@ -338,13 +338,21 @@ class TestGPUPredict:
@given(predict_parameter_strategy, tm.dataset_strategy) @given(predict_parameter_strategy, tm.dataset_strategy)
@settings(deadline=None, max_examples=20, print_blob=True) @settings(deadline=None, max_examples=20, print_blob=True)
def test_predict_leaf_gbtree(self, param, dataset): def test_predict_leaf_gbtree(self, param, dataset):
# Unsupported for random forest
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param['booster'] = 'gbtree' param['booster'] = 'gbtree'
param['tree_method'] = 'gpu_hist' param['tree_method'] = 'gpu_hist'
self.run_predict_leaf_booster(param, 10, dataset) self.run_predict_leaf_booster(param, 10, dataset)
@given(predict_parameter_strategy, tm.dataset_strategy) @given(predict_parameter_strategy, tm.dataset_strategy)
@settings(deadline=None, max_examples=20, print_blob=True) @settings(deadline=None, max_examples=20, print_blob=True)
def test_predict_leaf_dart(self, param, dataset): def test_predict_leaf_dart(self, param: dict, dataset: tm.TestDataset) -> None:
# Unsupported for random forest
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
return
param['booster'] = 'dart' param['booster'] = 'dart'
param['tree_method'] = 'gpu_hist' param['tree_method'] = 'gpu_hist'
self.run_predict_leaf_booster(param, 10, dataset) self.run_predict_leaf_booster(param, 10, dataset)

View File

@@ -326,7 +326,7 @@ class TestDMatrix:
nrow = 100 nrow = 100
ncol = 1000 ncol = 1000
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
assert x.indices.max() < ncol - 1 assert x.indices.max() < ncol
x.data[:] = 1 x.data[:] = 1
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)

View File

@@ -9,7 +9,9 @@ from testing import (
make_batches, make_batches,
make_batches_sparse, make_batches_sparse,
make_categorical, make_categorical,
make_ltr,
make_sparse_regression, make_sparse_regression,
predictor_equal,
) )
import xgboost as xgb import xgboost as xgb
@@ -218,6 +220,16 @@ class TestQuantileDMatrix:
b = booster.predict(qXy) b = booster.predict(qXy)
np.testing.assert_allclose(a, b) np.testing.assert_allclose(a, b)
def test_ltr(self) -> None:
X, y, qid, w = make_ltr(100, 3, 3, 5)
Xy_qdm = xgb.QuantileDMatrix(X, y, qid=qid, weight=w)
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
xgb.train({"tree_method": "hist", "objective": "rank:ndcg"}, Xy)
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
assert predictor_equal(from_qdm, from_dm)
# we don't test empty Quantile DMatrix in single node construction. # we don't test empty Quantile DMatrix in single node construction.
@given( @given(
strategies.integers(1, 1000), strategies.integers(1, 1000),

View File

@@ -41,6 +41,16 @@ logging.getLogger("py4j").setLevel(logging.INFO)
pytestmark = testing.timeout(60) pytestmark = testing.timeout(60)
def no_sparse_unwrap():
try:
from pyspark.sql.functions import unwrap_udt
except ImportError:
return {"reason": "PySpark<3.4", "condition": True}
return {"reason": "PySpark<3.4", "condition": False}
class XgboostLocalTest(SparkTestCase): class XgboostLocalTest(SparkTestCase):
def setUp(self): def setUp(self):
logging.getLogger().setLevel("INFO") logging.getLogger().setLevel("INFO")
@@ -985,6 +995,7 @@ class XgboostLocalTest(SparkTestCase):
model = classifier.fit(self.cls_df_train) model = classifier.fit(self.cls_df_train)
model.transform(self.cls_df_test).collect() model.transform(self.cls_df_test).collect()
@pytest.mark.skipif(**no_sparse_unwrap())
def test_regressor_with_sparse_optim(self): def test_regressor_with_sparse_optim(self):
regressor = SparkXGBRegressor(missing=0.0) regressor = SparkXGBRegressor(missing=0.0)
model = regressor.fit(self.reg_df_sparse_train) model = regressor.fit(self.reg_df_sparse_train)
@@ -1001,6 +1012,7 @@ class XgboostLocalTest(SparkTestCase):
for row1, row2 in zip(pred_result, pred_result2): for row1, row2 in zip(pred_result, pred_result2):
self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3)) self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))
@pytest.mark.skipif(**no_sparse_unwrap())
def test_classifier_with_sparse_optim(self): def test_classifier_with_sparse_optim(self):
cls = SparkXGBClassifier(missing=0.0) cls = SparkXGBClassifier(missing=0.0)
model = cls.fit(self.cls_df_sparse_train) model = cls.fit(self.cls_df_sparse_train)

View File

@@ -458,6 +458,22 @@ class TestTreeMethod:
config_0 = json.loads(booster_0.save_config()) config_0 = json.loads(booster_0.save_config())
np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1) np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
evals_result: Dict[str, Dict[str, list]] = {}
xgb.train(
{
"tree_method": tree_method,
"objective": "reg:absoluteerror",
"subsample": 0.8
},
Xy,
num_boost_round=10,
evals=[(Xy, "Train")],
evals_result=evals_result,
)
mae = evals_result["Train"]["mae"]
assert mae[-1] < 20.0
assert tm.non_increasing(mae)
@pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_sklearn())
@pytest.mark.parametrize( @pytest.mark.parametrize(
"tree_method,weighted", [ "tree_method,weighted", [

View File

@@ -112,7 +112,6 @@ class TestPandas:
# test Index as columns # test Index as columns
df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2])) df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
print(df.columns, isinstance(df.columns, pd.Index))
Xy = xgb.DMatrix(df) Xy = xgb.DMatrix(df)
np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"])) np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))

View File

@@ -4,7 +4,7 @@ import pytest
try: try:
import shap import shap
except ImportError: except Exception:
shap = None shap = None
pass pass

View File

@@ -2,6 +2,7 @@ import collections
import importlib.util import importlib.util
import json import json
import os import os
import pickle
import random import random
import tempfile import tempfile
from typing import Callable, Optional from typing import Callable, Optional
@@ -636,26 +637,74 @@ def test_sklearn_n_jobs():
def test_parameters_access(): def test_parameters_access():
from sklearn import datasets from sklearn import datasets
params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1}
params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
clf = xgb.XGBClassifier(n_estimators=1000, **params) clf = xgb.XGBClassifier(n_estimators=1000, **params)
assert clf.get_params()['updater'] == 'grow_gpu_hist' assert clf.get_params()["updater"] == "grow_gpu_hist"
assert clf.get_params()['subsample'] == .5 assert clf.get_params()["subsample"] == 0.5
assert clf.get_params()['n_estimators'] == 1000 assert clf.get_params()["n_estimators"] == 1000
clf = xgb.XGBClassifier(n_estimators=1, nthread=4) clf = xgb.XGBClassifier(n_estimators=1, nthread=4)
X, y = datasets.load_iris(return_X_y=True) X, y = datasets.load_iris(return_X_y=True)
clf.fit(X, y) clf.fit(X, y)
config = json.loads(clf.get_booster().save_config()) config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 4 assert int(config["learner"]["generic_param"]["nthread"]) == 4
clf.set_params(nthread=16) clf.set_params(nthread=16)
config = json.loads(clf.get_booster().save_config()) config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 16 assert int(config["learner"]["generic_param"]["nthread"]) == 16
clf.predict(X) clf.predict(X)
config = json.loads(clf.get_booster().save_config()) config = json.loads(clf.get_booster().save_config())
assert int(config['learner']['generic_param']['nthread']) == 16 assert int(config["learner"]["generic_param"]["nthread"]) == 16
clf = xgb.XGBClassifier(n_estimators=2)
assert clf.tree_method is None
assert clf.get_params()["tree_method"] is None
clf.fit(X, y)
assert clf.get_params()["tree_method"] is None
def save_load(clf: xgb.XGBClassifier) -> xgb.XGBClassifier:
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "model.json")
clf.save_model(path)
clf = xgb.XGBClassifier()
clf.load_model(path)
return clf
def get_tm(clf: xgb.XGBClassifier) -> str:
tm = json.loads(clf.get_booster().save_config())["learner"]["gradient_booster"][
"gbtree_train_param"
]["tree_method"]
return tm
assert get_tm(clf) == "exact"
clf = pickle.loads(pickle.dumps(clf))
assert clf.tree_method is None
assert clf.n_estimators == 2
assert clf.get_params()["tree_method"] is None
assert clf.get_params()["n_estimators"] == 2
assert get_tm(clf) == "exact" # preserved for pickle
clf = save_load(clf)
assert clf.tree_method is None
assert clf.n_estimators == 2
assert clf.get_params()["tree_method"] is None
assert clf.get_params()["n_estimators"] == 2
assert get_tm(clf) == "auto" # discarded for save/load_model
clf.set_params(tree_method="hist")
assert clf.get_params()["tree_method"] == "hist"
clf = pickle.loads(pickle.dumps(clf))
assert clf.get_params()["tree_method"] == "hist"
clf = save_load(clf)
# FIXME(jiamingy): We should remove this behavior once we remove parameters
# serialization for skl save/load_model.
assert clf.get_params()["tree_method"] == "hist"
def test_kwargs_error(): def test_kwargs_error():
@@ -695,13 +744,19 @@ def test_sklearn_clone():
def test_sklearn_get_default_params(): def test_sklearn_get_default_params():
from sklearn.datasets import load_digits from sklearn.datasets import load_digits
digits_2class = load_digits(n_class=2) digits_2class = load_digits(n_class=2)
X = digits_2class['data'] X = digits_2class["data"]
y = digits_2class['target'] y = digits_2class["target"]
cls = xgb.XGBClassifier() cls = xgb.XGBClassifier()
assert cls.get_params()['base_score'] is None assert cls.get_params()["base_score"] is None
cls.fit(X[:4, ...], y[:4, ...]) cls.fit(X[:4, ...], y[:4, ...])
assert cls.get_params()['base_score'] is not None base_score = float(
json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
"base_score"
]
)
np.testing.assert_equal(base_score, 0.5)
def run_validation_weights(model): def run_validation_weights(model):
@@ -1029,9 +1084,9 @@ def test_pandas_input():
clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic") clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
clf_isotonic.fit(train, target) clf_isotonic.fit(train, target)
assert isinstance( clf = clf_isotonic.calibrated_classifiers_[0]
clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier est = clf.estimator if hasattr(clf, "estimator") else clf.base_estimator
) assert isinstance(est, xgb.XGBClassifier)
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1])) np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))

View File

@@ -466,7 +466,22 @@ def make_categorical(
return df, label return df, label
def _cat_sampled_from(): def make_ltr(
n_samples: int, n_features: int, n_query_groups: int, max_rel: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Make a dataset for testing LTR."""
rng = np.random.default_rng(1994)
X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
y = rng.integers(0, max_rel, size=n_samples)
qid = rng.integers(0, n_query_groups, size=n_samples)
w = rng.normal(0, 1.0, size=n_query_groups)
w -= np.min(w)
w /= np.max(w)
qid = np.sort(qid)
return X, y, qid, w
def _cat_sampled_from() -> strategies.SearchStrategy:
@strategies.composite @strategies.composite
def _make_cat(draw): def _make_cat(draw):
n_samples = draw(strategies.integers(2, 512)) n_samples = draw(strategies.integers(2, 512))
@@ -775,6 +790,19 @@ class DirectoryExcursion:
os.remove(f) os.remove(f)
def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
"""Assert whether two DMatrices contain the same predictors."""
lcsr = lhs.get_data()
rcsr = rhs.get_data()
return all(
(
np.array_equal(lcsr.data, rcsr.data),
np.array_equal(lcsr.indices, rcsr.indices),
np.array_equal(lcsr.indptr, rcsr.indptr),
)
)
@contextmanager @contextmanager
def captured_output(): def captured_output():
"""Reassign stdout temporarily in order to test printed statements """Reassign stdout temporarily in order to test printed statements