Compare commits
22 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
36ad160501 | ||
|
|
c22f6db4bf | ||
|
|
f15a6d2b19 | ||
|
|
08a547f5c2 | ||
|
|
60303db2ee | ||
|
|
df984f9c43 | ||
|
|
2f22f8d49b | ||
|
|
68d86336d7 | ||
|
|
76bdca072a | ||
|
|
021e6a842a | ||
|
|
e5bef4ffce | ||
|
|
10bb0a74ef | ||
|
|
e803d06d8c | ||
|
|
ccf43d4ba0 | ||
|
|
dd58c2ac47 | ||
|
|
899e4c8988 | ||
|
|
a2085bf223 | ||
|
|
067b704e58 | ||
|
|
1a834b2b85 | ||
|
|
162b48a1a4 | ||
|
|
83a078b7e5 | ||
|
|
575fba651b |
@@ -1,5 +1,5 @@
|
|||||||
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
|
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
|
||||||
project(xgboost LANGUAGES CXX C VERSION 1.7.2)
|
project(xgboost LANGUAGES CXX C VERSION 1.7.4)
|
||||||
include(cmake/Utils.cmake)
|
include(cmake/Utils.cmake)
|
||||||
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
|
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
|
||||||
cmake_policy(SET CMP0022 NEW)
|
cmake_policy(SET CMP0022 NEW)
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
Package: xgboost
|
Package: xgboost
|
||||||
Type: Package
|
Type: Package
|
||||||
Title: Extreme Gradient Boosting
|
Title: Extreme Gradient Boosting
|
||||||
Version: 1.7.2.1
|
Version: 1.7.4.1
|
||||||
Date: 2022-12-08
|
Date: 2023-02-15
|
||||||
Authors@R: c(
|
Authors@R: c(
|
||||||
person("Tianqi", "Chen", role = c("aut"),
|
person("Tianqi", "Chen", role = c("aut"),
|
||||||
email = "tianqi.tchen@gmail.com"),
|
email = "tianqi.tchen@gmail.com"),
|
||||||
@@ -66,5 +66,5 @@ Imports:
|
|||||||
methods,
|
methods,
|
||||||
data.table (>= 1.9.6),
|
data.table (>= 1.9.6),
|
||||||
jsonlite (>= 1.0),
|
jsonlite (>= 1.0),
|
||||||
RoxygenNote: 7.2.1
|
RoxygenNote: 7.2.2
|
||||||
SystemRequirements: GNU make, C++14
|
SystemRequirements: GNU make, C++14
|
||||||
|
|||||||
@@ -328,8 +328,9 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
|
|||||||
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
|
predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE,
|
||||||
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
|
reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) {
|
||||||
object <- xgb.Booster.complete(object, saveraw = FALSE)
|
object <- xgb.Booster.complete(object, saveraw = FALSE)
|
||||||
|
|
||||||
if (!inherits(newdata, "xgb.DMatrix"))
|
if (!inherits(newdata, "xgb.DMatrix"))
|
||||||
newdata <- xgb.DMatrix(newdata, missing = missing)
|
newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1))
|
||||||
if (!is.null(object[["feature_names"]]) &&
|
if (!is.null(object[["feature_names"]]) &&
|
||||||
!is.null(colnames(newdata)) &&
|
!is.null(colnames(newdata)) &&
|
||||||
!identical(object[["feature_names"]], colnames(newdata)))
|
!identical(object[["feature_names"]], colnames(newdata)))
|
||||||
|
|||||||
1831
R-package/configure
vendored
1831
R-package/configure
vendored
File diff suppressed because it is too large
Load Diff
@@ -2,10 +2,25 @@
|
|||||||
|
|
||||||
AC_PREREQ(2.69)
|
AC_PREREQ(2.69)
|
||||||
|
|
||||||
AC_INIT([xgboost],[1.7.2],[],[xgboost],[])
|
AC_INIT([xgboost],[1.7.4],[],[xgboost],[])
|
||||||
|
|
||||||
# Use this line to set CC variable to a C compiler
|
: ${R_HOME=`R RHOME`}
|
||||||
AC_PROG_CC
|
if test -z "${R_HOME}"; then
|
||||||
|
echo "could not determine R_HOME"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
CXX14=`"${R_HOME}/bin/R" CMD config CXX14`
|
||||||
|
CXX14STD=`"${R_HOME}/bin/R" CMD config CXX14STD`
|
||||||
|
CXX="${CXX14} ${CXX14STD}"
|
||||||
|
CXXFLAGS=`"${R_HOME}/bin/R" CMD config CXXFLAGS`
|
||||||
|
|
||||||
|
CC=`"${R_HOME}/bin/R" CMD config CC`
|
||||||
|
CFLAGS=`"${R_HOME}/bin/R" CMD config CFLAGS`
|
||||||
|
CPPFLAGS=`"${R_HOME}/bin/R" CMD config CPPFLAGS`
|
||||||
|
|
||||||
|
LDFLAGS=`"${R_HOME}/bin/R" CMD config LDFLAGS`
|
||||||
|
AC_LANG(C++)
|
||||||
|
|
||||||
### Check whether backtrace() is part of libc or the external lib libexecinfo
|
### Check whether backtrace() is part of libc or the external lib libexecinfo
|
||||||
AC_MSG_CHECKING([Backtrace lib])
|
AC_MSG_CHECKING([Backtrace lib])
|
||||||
@@ -40,7 +55,7 @@ then
|
|||||||
ac_pkg_openmp=no
|
ac_pkg_openmp=no
|
||||||
AC_MSG_CHECKING([whether OpenMP will work in a package])
|
AC_MSG_CHECKING([whether OpenMP will work in a package])
|
||||||
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
|
AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <omp.h>]], [[ return (omp_get_max_threads() <= 1); ]])])
|
||||||
${CC} -o conftest conftest.c ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
${CXX} -o conftest conftest.cpp ${CPPFLAGS} ${LDFLAGS} ${OPENMP_LIB} ${OPENMP_CXXFLAGS} 2>/dev/null && ./conftest && ac_pkg_openmp=yes
|
||||||
AC_MSG_RESULT([${ac_pkg_openmp}])
|
AC_MSG_RESULT([${ac_pkg_openmp}])
|
||||||
if test "${ac_pkg_openmp}" = no; then
|
if test "${ac_pkg_openmp}" = no; then
|
||||||
OPENMP_CXXFLAGS=''
|
OPENMP_CXXFLAGS=''
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread
|
|||||||
OBJECTS= \
|
OBJECTS= \
|
||||||
./xgboost_R.o \
|
./xgboost_R.o \
|
||||||
./xgboost_custom.o \
|
./xgboost_custom.o \
|
||||||
./xgboost_assert.o \
|
|
||||||
./init.o \
|
./init.o \
|
||||||
$(PKGROOT)/src/metric/metric.o \
|
$(PKGROOT)/src/metric/metric.o \
|
||||||
$(PKGROOT)/src/metric/elementwise_metric.o \
|
$(PKGROOT)/src/metric/elementwise_metric.o \
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) -DDMLC_CMAKE_LITTLE_ENDIAN=1 $(SHLIB_PTHRE
|
|||||||
OBJECTS= \
|
OBJECTS= \
|
||||||
./xgboost_R.o \
|
./xgboost_R.o \
|
||||||
./xgboost_custom.o \
|
./xgboost_custom.o \
|
||||||
./xgboost_assert.o \
|
|
||||||
./init.o \
|
./init.o \
|
||||||
$(PKGROOT)/src/metric/metric.o \
|
$(PKGROOT)/src/metric/metric.o \
|
||||||
$(PKGROOT)/src/metric/elementwise_metric.o \
|
$(PKGROOT)/src/metric/elementwise_metric.o \
|
||||||
|
|||||||
@@ -1,26 +0,0 @@
|
|||||||
// Copyright (c) 2014 by Contributors
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <Rinternals.h>
|
|
||||||
|
|
||||||
// implements error handling
|
|
||||||
void XGBoostAssert_R(int exp, const char *fmt, ...) {
|
|
||||||
char buf[1024];
|
|
||||||
if (exp == 0) {
|
|
||||||
va_list args;
|
|
||||||
va_start(args, fmt);
|
|
||||||
vsprintf(buf, fmt, args);
|
|
||||||
va_end(args);
|
|
||||||
error("AssertError:%s\n", buf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void XGBoostCheck_R(int exp, const char *fmt, ...) {
|
|
||||||
char buf[1024];
|
|
||||||
if (exp == 0) {
|
|
||||||
va_list args;
|
|
||||||
va_start(args, fmt);
|
|
||||||
vsprintf(buf, fmt, args);
|
|
||||||
va_end(args);
|
|
||||||
error("%s\n", buf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -138,11 +138,11 @@ Miscellaneous
|
|||||||
|
|
||||||
By default, XGBoost assumes input categories are integers starting from 0 till the number
|
By default, XGBoost assumes input categories are integers starting from 0 till the number
|
||||||
of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
|
of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
|
||||||
values due to mistakes or missing values. It can be negative value, integer values that
|
values due to mistakes or missing values in training dataset. It can be negative value,
|
||||||
can not be accurately represented by 32-bit floating point, or values that are larger than
|
integer values that can not be accurately represented by 32-bit floating point, or values
|
||||||
actual number of unique categories. During training this is validated but for prediction
|
that are larger than actual number of unique categories. During training this is
|
||||||
it's treated as the same as missing value for performance reasons. Lastly, missing values
|
validated but for prediction it's treated as the same as not-chosen category for
|
||||||
are treated as the same as numerical features (using the learned split direction).
|
performance reasons.
|
||||||
|
|
||||||
|
|
||||||
**********
|
**********
|
||||||
|
|||||||
@@ -6,6 +6,6 @@
|
|||||||
|
|
||||||
#define XGBOOST_VER_MAJOR 1
|
#define XGBOOST_VER_MAJOR 1
|
||||||
#define XGBOOST_VER_MINOR 7
|
#define XGBOOST_VER_MINOR 7
|
||||||
#define XGBOOST_VER_PATCH 2
|
#define XGBOOST_VER_PATCH 4
|
||||||
|
|
||||||
#endif // XGBOOST_VERSION_CONFIG_H_
|
#endif // XGBOOST_VERSION_CONFIG_H_
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
<packaging>pom</packaging>
|
<packaging>pom</packaging>
|
||||||
<name>XGBoost JVM Package</name>
|
<name>XGBoost JVM Package</name>
|
||||||
<description>JVM Package for XGBoost</description>
|
<description>JVM Package for XGBoost</description>
|
||||||
|
|||||||
@@ -6,10 +6,10 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j-example_2.12</artifactId>
|
<artifactId>xgboost4j-example_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
@@ -26,7 +26,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
|
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
@@ -37,7 +37,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
|
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
|
|||||||
@@ -6,10 +6,10 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j-flink_2.12</artifactId>
|
<artifactId>xgboost4j-flink_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
<build>
|
<build>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
@@ -26,7 +26,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
|
|||||||
@@ -6,10 +6,10 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j-gpu_2.12</artifactId>
|
<artifactId>xgboost4j-gpu_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
|
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
|
||||||
<build>
|
<build>
|
||||||
@@ -24,7 +24,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
|
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j-spark_2.12</artifactId>
|
<artifactId>xgboost4j-spark_2.12</artifactId>
|
||||||
<build>
|
<build>
|
||||||
@@ -24,7 +24,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
from sklearn.datasets import load_iris
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas
|
import pandas
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
|
|
||||||
X, y = load_iris(return_X_y=True)
|
X, y = load_iris(return_X_y=True)
|
||||||
y = y.astype(np.int)
|
y = y.astype(np.int32)
|
||||||
df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
|
df = pandas.DataFrame(data=X, columns=['sepal length', 'sepal width', 'petal length', 'petal width'])
|
||||||
class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
|
class_id_to_name = {0:'Iris-setosa', 1:'Iris-versicolor', 2:'Iris-virginica'}
|
||||||
df['class'] = np.vectorize(class_id_to_name.get)(y)
|
df['class'] = np.vectorize(class_id_to_name.get)(y)
|
||||||
|
|||||||
@@ -6,10 +6,10 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<groupId>ml.dmlc</groupId>
|
<groupId>ml.dmlc</groupId>
|
||||||
<artifactId>xgboost-jvm_2.12</artifactId>
|
<artifactId>xgboost-jvm_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
</parent>
|
</parent>
|
||||||
<artifactId>xgboost4j_2.12</artifactId>
|
<artifactId>xgboost4j_2.12</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.4</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
1.7.2
|
1.7.4
|
||||||
|
|||||||
@@ -36,7 +36,6 @@ try:
|
|||||||
|
|
||||||
PANDAS_INSTALLED = True
|
PANDAS_INSTALLED = True
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
|
||||||
MultiIndex = object
|
MultiIndex = object
|
||||||
DataFrame = object
|
DataFrame = object
|
||||||
Series = object
|
Series = object
|
||||||
@@ -161,6 +160,7 @@ def concat(value: Sequence[_T]) -> _T: # pylint: disable=too-many-return-statem
|
|||||||
# `importlib.utils`, except it's unclear from its document on how to use it. This one
|
# `importlib.utils`, except it's unclear from its document on how to use it. This one
|
||||||
# seems to be easy to understand and works out of box.
|
# seems to be easy to understand and works out of box.
|
||||||
|
|
||||||
|
|
||||||
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
|
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this
|
||||||
|
|||||||
@@ -2172,6 +2172,7 @@ class Booster:
|
|||||||
)
|
)
|
||||||
return _prediction_output(shape, dims, preds, False)
|
return _prediction_output(shape, dims, preds, False)
|
||||||
|
|
||||||
|
# pylint: disable=too-many-statements
|
||||||
def inplace_predict(
|
def inplace_predict(
|
||||||
self,
|
self,
|
||||||
data: DataType,
|
data: DataType,
|
||||||
@@ -2192,10 +2193,10 @@ class Booster:
|
|||||||
|
|
||||||
.. code-block:: python
|
.. code-block:: python
|
||||||
|
|
||||||
booster.set_param({'predictor': 'gpu_predictor'})
|
booster.set_param({"predictor": "gpu_predictor"})
|
||||||
booster.inplace_predict(cupy_array)
|
booster.inplace_predict(cupy_array)
|
||||||
|
|
||||||
booster.set_param({'predictor': 'cpu_predictor})
|
booster.set_param({"predictor": "cpu_predictor"})
|
||||||
booster.inplace_predict(numpy_array)
|
booster.inplace_predict(numpy_array)
|
||||||
|
|
||||||
.. versionadded:: 1.1.0
|
.. versionadded:: 1.1.0
|
||||||
@@ -2301,14 +2302,16 @@ class Booster:
|
|||||||
)
|
)
|
||||||
return _prediction_output(shape, dims, preds, False)
|
return _prediction_output(shape, dims, preds, False)
|
||||||
if isinstance(data, scipy.sparse.csr_matrix):
|
if isinstance(data, scipy.sparse.csr_matrix):
|
||||||
csr = data
|
from .data import _transform_scipy_csr
|
||||||
|
|
||||||
|
data = _transform_scipy_csr(data)
|
||||||
_check_call(
|
_check_call(
|
||||||
_LIB.XGBoosterPredictFromCSR(
|
_LIB.XGBoosterPredictFromCSR(
|
||||||
self.handle,
|
self.handle,
|
||||||
_array_interface(csr.indptr),
|
_array_interface(data.indptr),
|
||||||
_array_interface(csr.indices),
|
_array_interface(data.indices),
|
||||||
_array_interface(csr.data),
|
_array_interface(data.data),
|
||||||
c_bst_ulong(csr.shape[1]),
|
c_bst_ulong(data.shape[1]),
|
||||||
from_pystr_to_cstr(json.dumps(args)),
|
from_pystr_to_cstr(json.dumps(args)),
|
||||||
p_handle,
|
p_handle,
|
||||||
ctypes.byref(shape),
|
ctypes.byref(shape),
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ from .core import (
|
|||||||
c_array,
|
c_array,
|
||||||
c_str,
|
c_str,
|
||||||
from_pystr_to_cstr,
|
from_pystr_to_cstr,
|
||||||
|
make_jcargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
DispatchedDataBackendReturnType = Tuple[
|
DispatchedDataBackendReturnType = Tuple[
|
||||||
@@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes:
|
|||||||
return interface_str
|
return interface_str
|
||||||
|
|
||||||
|
|
||||||
|
def _transform_scipy_csr(data: DataType) -> DataType:
|
||||||
|
from scipy.sparse import csr_matrix
|
||||||
|
|
||||||
|
indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype)
|
||||||
|
indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype)
|
||||||
|
values, _ = _ensure_np_dtype(data.data, data.data.dtype)
|
||||||
|
if (
|
||||||
|
indptr is not data.indptr
|
||||||
|
or indices is not data.indices
|
||||||
|
or values is not data.data
|
||||||
|
):
|
||||||
|
data = csr_matrix((values, indices, indptr), shape=data.shape)
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
def _from_scipy_csr(
|
def _from_scipy_csr(
|
||||||
data: DataType,
|
data: DataType,
|
||||||
missing: FloatCompatible,
|
missing: FloatCompatible,
|
||||||
@@ -93,18 +109,14 @@ def _from_scipy_csr(
|
|||||||
f"length mismatch: {len(data.indices)} vs {len(data.data)}"
|
f"length mismatch: {len(data.indices)} vs {len(data.data)}"
|
||||||
)
|
)
|
||||||
handle = ctypes.c_void_p()
|
handle = ctypes.c_void_p()
|
||||||
args = {
|
data = _transform_scipy_csr(data)
|
||||||
"missing": float(missing),
|
|
||||||
"nthread": int(nthread),
|
|
||||||
}
|
|
||||||
config = bytes(json.dumps(args), "utf-8")
|
|
||||||
_check_call(
|
_check_call(
|
||||||
_LIB.XGDMatrixCreateFromCSR(
|
_LIB.XGDMatrixCreateFromCSR(
|
||||||
_array_interface(data.indptr),
|
_array_interface(data.indptr),
|
||||||
_array_interface(data.indices),
|
_array_interface(data.indices),
|
||||||
_array_interface(data.data),
|
_array_interface(data.data),
|
||||||
c_bst_ulong(data.shape[1]),
|
c_bst_ulong(data.shape[1]),
|
||||||
config,
|
make_jcargs(missing=float(missing), nthread=int(nthread)),
|
||||||
ctypes.byref(handle),
|
ctypes.byref(handle),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def _ensure_np_dtype(
|
def _ensure_np_dtype(
|
||||||
data: DataType,
|
data: DataType, dtype: Optional[NumpyDType]
|
||||||
dtype: Optional[NumpyDType]
|
|
||||||
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
|
) -> Tuple[np.ndarray, Optional[NumpyDType]]:
|
||||||
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
|
if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
|
||||||
data = data.astype(np.float32, copy=False)
|
|
||||||
dtype = np.float32
|
dtype = np.float32
|
||||||
|
data = data.astype(dtype, copy=False)
|
||||||
|
if not data.flags.aligned:
|
||||||
|
data = np.require(data, requirements="A")
|
||||||
return data, dtype
|
return data, dtype
|
||||||
|
|
||||||
|
|
||||||
@@ -1197,11 +1210,13 @@ def _proxy_transform(
|
|||||||
data, _ = _ensure_np_dtype(data, data.dtype)
|
data, _ = _ensure_np_dtype(data, data.dtype)
|
||||||
return data, None, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_scipy_csr(data):
|
if _is_scipy_csr(data):
|
||||||
|
data = _transform_scipy_csr(data)
|
||||||
return data, None, feature_names, feature_types
|
return data, None, feature_names, feature_types
|
||||||
if _is_pandas_df(data):
|
if _is_pandas_df(data):
|
||||||
arr, feature_names, feature_types = _transform_pandas_df(
|
arr, feature_names, feature_types = _transform_pandas_df(
|
||||||
data, enable_categorical, feature_names, feature_types
|
data, enable_categorical, feature_names, feature_types
|
||||||
)
|
)
|
||||||
|
arr, _ = _ensure_np_dtype(arr, arr.dtype)
|
||||||
return arr, None, feature_names, feature_types
|
return arr, None, feature_names, feature_types
|
||||||
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
raise TypeError("Value type is not supported for data iterator:" + str(type(data)))
|
||||||
|
|
||||||
|
|||||||
@@ -674,7 +674,7 @@ class XGBModel(XGBModelBase):
|
|||||||
self.kwargs = {}
|
self.kwargs = {}
|
||||||
self.kwargs[key] = value
|
self.kwargs[key] = value
|
||||||
|
|
||||||
if hasattr(self, "_Booster"):
|
if self.__sklearn_is_fitted__():
|
||||||
parameters = self.get_xgb_params()
|
parameters = self.get_xgb_params()
|
||||||
self.get_booster().set_param(parameters)
|
self.get_booster().set_param(parameters)
|
||||||
|
|
||||||
@@ -701,39 +701,12 @@ class XGBModel(XGBModelBase):
|
|||||||
np.iinfo(np.int32).max
|
np.iinfo(np.int32).max
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_parameter(value: Any) -> Optional[Union[int, float, str]]:
|
|
||||||
for t in (int, float, str):
|
|
||||||
try:
|
|
||||||
ret = t(value)
|
|
||||||
return ret
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Get internal parameter values
|
|
||||||
try:
|
|
||||||
config = json.loads(self.get_booster().save_config())
|
|
||||||
stack = [config]
|
|
||||||
internal = {}
|
|
||||||
while stack:
|
|
||||||
obj = stack.pop()
|
|
||||||
for k, v in obj.items():
|
|
||||||
if k.endswith("_param"):
|
|
||||||
for p_k, p_v in v.items():
|
|
||||||
internal[p_k] = p_v
|
|
||||||
elif isinstance(v, dict):
|
|
||||||
stack.append(v)
|
|
||||||
|
|
||||||
for k, v in internal.items():
|
|
||||||
if k in params and params[k] is None:
|
|
||||||
params[k] = parse_parameter(v)
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
def get_xgb_params(self) -> Dict[str, Any]:
|
def get_xgb_params(self) -> Dict[str, Any]:
|
||||||
"""Get xgboost specific parameters."""
|
"""Get xgboost specific parameters."""
|
||||||
params = self.get_params()
|
params: Dict[str, Any] = self.get_params()
|
||||||
|
|
||||||
# Parameters that should not go into native learner.
|
# Parameters that should not go into native learner.
|
||||||
wrapper_specific = {
|
wrapper_specific = {
|
||||||
"importance_type",
|
"importance_type",
|
||||||
@@ -750,6 +723,7 @@ class XGBModel(XGBModelBase):
|
|||||||
for k, v in params.items():
|
for k, v in params.items():
|
||||||
if k not in wrapper_specific and not callable(v):
|
if k not in wrapper_specific and not callable(v):
|
||||||
filtered[k] = v
|
filtered[k] = v
|
||||||
|
|
||||||
return filtered
|
return filtered
|
||||||
|
|
||||||
def get_num_boosting_rounds(self) -> int:
|
def get_num_boosting_rounds(self) -> int:
|
||||||
@@ -1070,7 +1044,7 @@ class XGBModel(XGBModelBase):
|
|||||||
# error with incompatible data type.
|
# error with incompatible data type.
|
||||||
# Inplace predict doesn't handle as many data types as DMatrix, but it's
|
# Inplace predict doesn't handle as many data types as DMatrix, but it's
|
||||||
# sufficient for dask interface where input is simpiler.
|
# sufficient for dask interface where input is simpiler.
|
||||||
predictor = self.get_params().get("predictor", None)
|
predictor = self.get_xgb_params().get("predictor", None)
|
||||||
if predictor in ("auto", None) and self.booster != "gblinear":
|
if predictor in ("auto", None) and self.booster != "gblinear":
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
@@ -1336,7 +1310,7 @@ class XGBModel(XGBModelBase):
|
|||||||
-------
|
-------
|
||||||
coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]``
|
coef_ : array of shape ``[n_features]`` or ``[n_classes, n_features]``
|
||||||
"""
|
"""
|
||||||
if self.get_params()["booster"] != "gblinear":
|
if self.get_xgb_params()["booster"] != "gblinear":
|
||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
f"Coefficients are not defined for Booster type {self.booster}"
|
f"Coefficients are not defined for Booster type {self.booster}"
|
||||||
)
|
)
|
||||||
@@ -1366,7 +1340,7 @@ class XGBModel(XGBModelBase):
|
|||||||
-------
|
-------
|
||||||
intercept_ : array of shape ``(1,)`` or ``[n_classes]``
|
intercept_ : array of shape ``(1,)`` or ``[n_classes]``
|
||||||
"""
|
"""
|
||||||
if self.get_params()["booster"] != "gblinear":
|
if self.get_xgb_params()["booster"] != "gblinear":
|
||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
f"Intercept (bias) is not defined for Booster type {self.booster}"
|
f"Intercept (bias) is not defined for Booster type {self.booster}"
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -140,6 +140,13 @@ _unsupported_predict_params = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: supply hint message for all other unsupported params.
|
||||||
|
_unsupported_params_hint_message = {
|
||||||
|
"enable_categorical": "`xgboost.spark` estimators do not have 'enable_categorical' param, "
|
||||||
|
"but you can set `feature_types` param and mark categorical features with 'c' string."
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class _SparkXGBParams(
|
class _SparkXGBParams(
|
||||||
HasFeaturesCol,
|
HasFeaturesCol,
|
||||||
HasLabelCol,
|
HasLabelCol,
|
||||||
@@ -523,7 +530,10 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
|||||||
or k in _unsupported_predict_params
|
or k in _unsupported_predict_params
|
||||||
or k in _unsupported_train_params
|
or k in _unsupported_train_params
|
||||||
):
|
):
|
||||||
raise ValueError(f"Unsupported param '{k}'.")
|
err_msg = _unsupported_params_hint_message.get(
|
||||||
|
k, f"Unsupported param '{k}'."
|
||||||
|
)
|
||||||
|
raise ValueError(err_msg)
|
||||||
_extra_params[k] = v
|
_extra_params[k] = v
|
||||||
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
|
_existing_extra_params = self.getOrDefault(self.arbitrary_params_dict)
|
||||||
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
|
self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params})
|
||||||
@@ -749,6 +759,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
|||||||
"feature_weights": self.getOrDefault(self.feature_weights),
|
"feature_weights": self.getOrDefault(self.feature_weights),
|
||||||
"missing": float(self.getOrDefault(self.missing)),
|
"missing": float(self.getOrDefault(self.missing)),
|
||||||
}
|
}
|
||||||
|
if dmatrix_kwargs["feature_types"] is not None:
|
||||||
|
dmatrix_kwargs["enable_categorical"] = True
|
||||||
booster_params["nthread"] = cpu_per_task
|
booster_params["nthread"] = cpu_per_task
|
||||||
use_gpu = self.getOrDefault(self.use_gpu)
|
use_gpu = self.getOrDefault(self.use_gpu)
|
||||||
|
|
||||||
|
|||||||
@@ -48,20 +48,21 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) {
|
|||||||
return cat < 0 || cat >= kMaxCat;
|
return cat < 0 || cat >= kMaxCat;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* \brief Whether should it traverse to left branch of a tree.
|
/**
|
||||||
|
* \brief Whether should it traverse to left branch of a tree.
|
||||||
*
|
*
|
||||||
* For one hot split, go to left if it's NOT the matching category.
|
* Go to left if it's NOT the matching category, which matches one-hot encoding.
|
||||||
*/
|
*/
|
||||||
template <bool validate = true>
|
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat) {
|
||||||
inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, float cat, bool dft_left) {
|
|
||||||
KCatBitField const s_cats(cats);
|
KCatBitField const s_cats(cats);
|
||||||
// FIXME: Size() is not accurate since it represents the size of bit set instead of
|
if (XGBOOST_EXPECT(InvalidCat(cat), false)) {
|
||||||
// actual number of categories.
|
return true;
|
||||||
if (XGBOOST_EXPECT(validate && (InvalidCat(cat) || cat >= s_cats.Size()), false)) {
|
|
||||||
return dft_left;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto pos = KCatBitField::ToBitPos(cat);
|
auto pos = KCatBitField::ToBitPos(cat);
|
||||||
|
// If the input category is larger than the size of the bit field, it implies that the
|
||||||
|
// category is not chosen. Otherwise the bit field would have the category instead of
|
||||||
|
// being smaller than the category value.
|
||||||
if (pos.int_pos >= cats.size()) {
|
if (pos.int_pos >= cats.size()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
|
|||||||
feature_offsets_[fid] = accum_index;
|
feature_offsets_[fid] = accum_index;
|
||||||
}
|
}
|
||||||
|
|
||||||
SetTypeSize(gmat.max_num_bins);
|
SetTypeSize(gmat.MaxNumBinPerFeat());
|
||||||
auto storage_size =
|
auto storage_size =
|
||||||
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
|
||||||
index_.resize(storage_size, 0);
|
index_.resize(storage_size, 0);
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView<T, D> t,
|
|||||||
#endif // !defined(XGBOOST_USE_CUDA)
|
#endif // !defined(XGBOOST_USE_CUDA)
|
||||||
|
|
||||||
template <typename T, std::int32_t kDim>
|
template <typename T, std::int32_t kDim>
|
||||||
auto cbegin(TensorView<T, kDim> v) { // NOLINT
|
auto cbegin(TensorView<T, kDim> const& v) { // NOLINT
|
||||||
auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
|
auto it = common::MakeIndexTransformIter([&](size_t i) -> std::remove_cv_t<T> const& {
|
||||||
return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
|
return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape()));
|
||||||
});
|
});
|
||||||
@@ -70,19 +70,19 @@ auto cbegin(TensorView<T, kDim> v) { // NOLINT
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, std::int32_t kDim>
|
template <typename T, std::int32_t kDim>
|
||||||
auto cend(TensorView<T, kDim> v) { // NOLINT
|
auto cend(TensorView<T, kDim> const& v) { // NOLINT
|
||||||
return cbegin(v) + v.Size();
|
return cbegin(v) + v.Size();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, std::int32_t kDim>
|
template <typename T, std::int32_t kDim>
|
||||||
auto begin(TensorView<T, kDim> v) { // NOLINT
|
auto begin(TensorView<T, kDim>& v) { // NOLINT
|
||||||
auto it = common::MakeIndexTransformIter(
|
auto it = common::MakeIndexTransformIter(
|
||||||
[&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
|
[&](size_t i) -> T& { return linalg::detail::Apply(v, linalg::UnravelIndex(i, v.Shape())); });
|
||||||
return it;
|
return it;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, std::int32_t kDim>
|
template <typename T, std::int32_t kDim>
|
||||||
auto end(TensorView<T, kDim> v) { // NOLINT
|
auto end(TensorView<T, kDim>& v) { // NOLINT
|
||||||
return begin(v) + v.Size();
|
return begin(v) + v.Size();
|
||||||
}
|
}
|
||||||
} // namespace linalg
|
} // namespace linalg
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ class PartitionBuilder {
|
|||||||
auto gidx = gidx_calc(ridx);
|
auto gidx = gidx_calc(ridx);
|
||||||
bool go_left = default_left;
|
bool go_left = default_left;
|
||||||
if (gidx > -1) {
|
if (gidx > -1) {
|
||||||
go_left = Decision(node_cats, cut_values[gidx], default_left);
|
go_left = Decision(node_cats, cut_values[gidx]);
|
||||||
}
|
}
|
||||||
return go_left;
|
return go_left;
|
||||||
} else {
|
} else {
|
||||||
@@ -157,7 +157,7 @@ class PartitionBuilder {
|
|||||||
bool go_left = default_left;
|
bool go_left = default_left;
|
||||||
if (gidx > -1) {
|
if (gidx > -1) {
|
||||||
if (is_cat) {
|
if (is_cat) {
|
||||||
go_left = Decision(node_cats, cut_values[gidx], default_left);
|
go_left = Decision(node_cats, cut_values[gidx]);
|
||||||
} else {
|
} else {
|
||||||
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
|
go_left = cut_values[gidx] <= nodes[node_in_set].split.split_value;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
/*!
|
/**
|
||||||
* Copyright 2019-2021 by Contributors
|
* Copyright 2019-2023 by XGBoost Contributors
|
||||||
* \file array_interface.h
|
* \file array_interface.h
|
||||||
* \brief View of __array_interface__
|
* \brief View of __array_interface__
|
||||||
*/
|
*/
|
||||||
@@ -7,9 +7,11 @@
|
|||||||
#define XGBOOST_DATA_ARRAY_INTERFACE_H_
|
#define XGBOOST_DATA_ARRAY_INTERFACE_H_
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cinttypes>
|
#include <cstddef> // std::size_t
|
||||||
|
#include <cstdint>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <type_traits> // std::alignment_of,std::remove_pointer_t
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@@ -394,6 +396,11 @@ class ArrayInterface {
|
|||||||
|
|
||||||
data = ArrayInterfaceHandler::ExtractData(array, n);
|
data = ArrayInterfaceHandler::ExtractData(array, n);
|
||||||
static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
|
static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported.");
|
||||||
|
|
||||||
|
auto alignment = this->ElementAlignment();
|
||||||
|
auto ptr = reinterpret_cast<uintptr_t>(this->data);
|
||||||
|
CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment.";
|
||||||
|
|
||||||
if (allow_mask) {
|
if (allow_mask) {
|
||||||
common::Span<RBitField8::value_type> s_mask;
|
common::Span<RBitField8::value_type> s_mask;
|
||||||
size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
|
size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask);
|
||||||
@@ -512,9 +519,15 @@ class ArrayInterface {
|
|||||||
return func(reinterpret_cast<uint64_t const *>(data));
|
return func(reinterpret_cast<uint64_t const *>(data));
|
||||||
}
|
}
|
||||||
|
|
||||||
XGBOOST_DEVICE size_t ElementSize() {
|
XGBOOST_DEVICE std::size_t ElementSize() const {
|
||||||
return this->DispatchCall(
|
return this->DispatchCall([](auto *typed_data_ptr) {
|
||||||
[](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); });
|
return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
XGBOOST_DEVICE std::size_t ElementAlignment() const {
|
||||||
|
return this->DispatchCall([](auto *typed_data_ptr) {
|
||||||
|
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T = float, typename... Index>
|
template <typename T = float, typename... Index>
|
||||||
|
|||||||
@@ -20,13 +20,13 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique<common::ColumnM
|
|||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
|
GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
|
||||||
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
|
double sparse_thresh, bool sorted_sketch, int32_t n_threads,
|
||||||
common::Span<float> hess) {
|
common::Span<float> hess)
|
||||||
|
: max_numeric_bins_per_feat{max_bins_per_feat} {
|
||||||
CHECK(p_fmat->SingleColBlock());
|
CHECK(p_fmat->SingleColBlock());
|
||||||
// We use sorted sketching for approx tree method since it's more efficient in
|
// We use sorted sketching for approx tree method since it's more efficient in
|
||||||
// computation time (but higher memory usage).
|
// computation time (but higher memory usage).
|
||||||
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
|
cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
|
||||||
|
|
||||||
max_num_bins = max_bins_per_feat;
|
|
||||||
const uint32_t nbins = cut.Ptrs().back();
|
const uint32_t nbins = cut.Ptrs().back();
|
||||||
hit_count.resize(nbins, 0);
|
hit_count.resize(nbins, 0);
|
||||||
hit_count_tloc_.resize(n_threads * nbins, 0);
|
hit_count_tloc_.resize(n_threads * nbins, 0);
|
||||||
@@ -63,7 +63,7 @@ GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &
|
|||||||
: row_ptr(info.num_row_ + 1, 0),
|
: row_ptr(info.num_row_ + 1, 0),
|
||||||
hit_count(cuts.TotalBins(), 0),
|
hit_count(cuts.TotalBins(), 0),
|
||||||
cut{std::forward<common::HistogramCuts>(cuts)},
|
cut{std::forward<common::HistogramCuts>(cuts)},
|
||||||
max_num_bins(max_bin_per_feat),
|
max_numeric_bins_per_feat(max_bin_per_feat),
|
||||||
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
|
||||||
|
|
||||||
#if !defined(XGBOOST_USE_CUDA)
|
#if !defined(XGBOOST_USE_CUDA)
|
||||||
@@ -86,13 +86,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span<FeatureTy
|
|||||||
}
|
}
|
||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
|
GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span<FeatureType const> ft,
|
||||||
common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
|
common::HistogramCuts cuts, int32_t max_bins_per_feat,
|
||||||
bool isDense, double sparse_thresh, int32_t n_threads) {
|
bool isDense, double sparse_thresh, int32_t n_threads)
|
||||||
|
: cut{std::move(cuts)},
|
||||||
|
max_numeric_bins_per_feat{max_bins_per_feat},
|
||||||
|
base_rowid{batch.base_rowid},
|
||||||
|
isDense_{isDense} {
|
||||||
CHECK_GE(n_threads, 1);
|
CHECK_GE(n_threads, 1);
|
||||||
base_rowid = batch.base_rowid;
|
|
||||||
isDense_ = isDense;
|
|
||||||
cut = cuts;
|
|
||||||
max_num_bins = max_bins_per_feat;
|
|
||||||
CHECK_EQ(row_ptr.size(), 0);
|
CHECK_EQ(row_ptr.size(), 0);
|
||||||
// The number of threads is pegged to the batch size. If the OMP
|
// The number of threads is pegged to the batch size. If the OMP
|
||||||
// block is parallelized on anything other than the batch/block size,
|
// block is parallelized on anything other than the batch/block size,
|
||||||
@@ -127,12 +127,13 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
|
|||||||
#undef INSTANTIATION_PUSH
|
#undef INSTANTIATION_PUSH
|
||||||
|
|
||||||
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
|
||||||
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
|
if ((MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) &&
|
||||||
|
isDense) {
|
||||||
// compress dense index to uint8
|
// compress dense index to uint8
|
||||||
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
index.SetBinTypeSize(common::kUint8BinsTypeSize);
|
||||||
index.Resize((sizeof(uint8_t)) * n_index);
|
index.Resize((sizeof(uint8_t)) * n_index);
|
||||||
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
} else if ((MaxNumBinPerFeat() - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
|
||||||
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
MaxNumBinPerFeat() - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) &&
|
||||||
isDense) {
|
isDense) {
|
||||||
// compress dense index to uint16
|
// compress dense index to uint16
|
||||||
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
index.SetBinTypeSize(common::kUint16BinsTypeSize);
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page,
|
|||||||
|
|
||||||
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info,
|
||||||
EllpackPage const& in_page, BatchParam const& p)
|
EllpackPage const& in_page, BatchParam const& p)
|
||||||
: max_num_bins{p.max_bin} {
|
: max_numeric_bins_per_feat{p.max_bin} {
|
||||||
auto page = in_page.Impl();
|
auto page = in_page.Impl();
|
||||||
isDense_ = page->is_dense;
|
isDense_ = page->is_dense;
|
||||||
|
|
||||||
|
|||||||
@@ -133,11 +133,15 @@ class GHistIndexMatrix {
|
|||||||
std::vector<size_t> hit_count;
|
std::vector<size_t> hit_count;
|
||||||
/*! \brief The corresponding cuts */
|
/*! \brief The corresponding cuts */
|
||||||
common::HistogramCuts cut;
|
common::HistogramCuts cut;
|
||||||
/*! \brief max_bin for each feature. */
|
/** \brief max_bin for each feature. */
|
||||||
bst_bin_t max_num_bins;
|
bst_bin_t max_numeric_bins_per_feat;
|
||||||
/*! \brief base row index for current page (used by external memory) */
|
/*! \brief base row index for current page (used by external memory) */
|
||||||
size_t base_rowid{0};
|
size_t base_rowid{0};
|
||||||
|
|
||||||
|
bst_bin_t MaxNumBinPerFeat() const {
|
||||||
|
return std::max(static_cast<bst_bin_t>(cut.MaxCategory() + 1), max_numeric_bins_per_feat);
|
||||||
|
}
|
||||||
|
|
||||||
~GHistIndexMatrix();
|
~GHistIndexMatrix();
|
||||||
/**
|
/**
|
||||||
* \brief Constrcutor for SimpleDMatrix.
|
* \brief Constrcutor for SimpleDMatrix.
|
||||||
@@ -160,7 +164,7 @@ class GHistIndexMatrix {
|
|||||||
* \brief Constructor for external memory.
|
* \brief Constructor for external memory.
|
||||||
*/
|
*/
|
||||||
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
|
GHistIndexMatrix(SparsePage const& page, common::Span<FeatureType const> ft,
|
||||||
common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
|
common::HistogramCuts cuts, int32_t max_bins_per_feat, bool is_dense,
|
||||||
double sparse_thresh, int32_t n_threads);
|
double sparse_thresh, int32_t n_threads);
|
||||||
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
|
GHistIndexMatrix(); // also for ext mem, empty ctor so that we can read the cache back.
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
|||||||
if (!fi->Read(&page->hit_count)) {
|
if (!fi->Read(&page->hit_count)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!fi->Read(&page->max_num_bins)) {
|
if (!fi->Read(&page->max_numeric_bins_per_feat)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!fi->Read(&page->base_rowid)) {
|
if (!fi->Read(&page->base_rowid)) {
|
||||||
@@ -76,8 +76,8 @@ class GHistIndexRawFormat : public SparsePageFormat<GHistIndexMatrix> {
|
|||||||
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) +
|
||||||
sizeof(uint64_t);
|
sizeof(uint64_t);
|
||||||
// max_bins, base row, is_dense
|
// max_bins, base row, is_dense
|
||||||
fo->Write(page.max_num_bins);
|
fo->Write(page.max_numeric_bins_per_feat);
|
||||||
bytes += sizeof(page.max_num_bins);
|
bytes += sizeof(page.max_numeric_bins_per_feat);
|
||||||
fo->Write(page.base_rowid);
|
fo->Write(page.base_rowid);
|
||||||
bytes += sizeof(page.base_rowid);
|
bytes += sizeof(page.base_rowid);
|
||||||
fo->Write(page.IsDense());
|
fo->Write(page.IsDense());
|
||||||
|
|||||||
@@ -58,6 +58,13 @@ void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, Bat
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
auto ellpack = [&]() {
|
auto ellpack = [&]() {
|
||||||
|
// workaround ellpack being initialized from CPU.
|
||||||
|
if (p.gpu_id == Context::kCpuId) {
|
||||||
|
p.gpu_id = ref_->Ctx()->gpu_id;
|
||||||
|
}
|
||||||
|
if (p.gpu_id == Context::kCpuId) {
|
||||||
|
p.gpu_id = 0;
|
||||||
|
}
|
||||||
for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
|
for (auto const& page : ref_->GetBatches<EllpackPage>(p)) {
|
||||||
GetCutsFromEllpack(page, p_cuts);
|
GetCutsFromEllpack(page, p_cuts);
|
||||||
break;
|
break;
|
||||||
@@ -172,9 +179,9 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
|
|||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
while (iter.Next()) {
|
while (iter.Next()) {
|
||||||
if (!p_sketch) {
|
if (!p_sketch) {
|
||||||
p_sketch.reset(new common::HostSketchContainer{batch_param_.max_bin,
|
p_sketch.reset(new common::HostSketchContainer{
|
||||||
proxy->Info().feature_types.ConstHostSpan(),
|
batch_param_.max_bin, proxy->Info().feature_types.ConstHostSpan(), column_sizes,
|
||||||
column_sizes, false, ctx_.Threads()});
|
!proxy->Info().group_ptr_.empty(), ctx_.Threads()});
|
||||||
}
|
}
|
||||||
HostAdapterDispatch(proxy, [&](auto const& batch) {
|
HostAdapterDispatch(proxy, [&](auto const& batch) {
|
||||||
proxy->Info().num_nonzero_ = batch_nnz[i];
|
proxy->Info().num_nonzero_ = batch_nnz[i];
|
||||||
|
|||||||
@@ -42,6 +42,7 @@ DMatrix* SimpleDMatrix::Slice(common::Span<int32_t const> ridxs) {
|
|||||||
out->Info() = this->Info().Slice(ridxs);
|
out->Info() = this->Info().Slice(ridxs);
|
||||||
out->Info().num_nonzero_ = h_offset.back();
|
out->Info().num_nonzero_ = h_offset.back();
|
||||||
}
|
}
|
||||||
|
out->ctx_ = this->ctx_;
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@
|
|||||||
#include "xgboost/logging.h"
|
#include "xgboost/logging.h"
|
||||||
#include "xgboost/objective.h"
|
#include "xgboost/objective.h"
|
||||||
#include "xgboost/predictor.h"
|
#include "xgboost/predictor.h"
|
||||||
|
#include "xgboost/string_view.h"
|
||||||
#include "xgboost/tree_updater.h"
|
#include "xgboost/tree_updater.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
@@ -395,23 +396,36 @@ void GBTree::LoadConfig(Json const& in) {
|
|||||||
tparam_.process_type = TreeProcessType::kDefault;
|
tparam_.process_type = TreeProcessType::kDefault;
|
||||||
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
|
||||||
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
|
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
|
||||||
LOG(WARNING)
|
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
|
||||||
<< "Loading from a raw memory buffer on CPU only machine. "
|
"Changing predictor to auto.";
|
||||||
"Changing predictor to auto.";
|
|
||||||
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
|
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto msg = StringView{
|
||||||
|
R"(
|
||||||
|
Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
|
||||||
|
machine. Consider using `save_model/load_model` instead. See:
|
||||||
|
|
||||||
|
https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html
|
||||||
|
|
||||||
|
for more details about differences between saving model and serializing.)"};
|
||||||
|
|
||||||
if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) {
|
if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) {
|
||||||
tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}});
|
tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}});
|
||||||
LOG(WARNING)
|
LOG(WARNING) << msg << " Changing `tree_method` to `hist`.";
|
||||||
<< "Loading from a raw memory buffer on CPU only machine. "
|
|
||||||
"Changing tree_method to hist.";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto const& j_updaters = get<Object const>(in["updater"]);
|
auto const& j_updaters = get<Object const>(in["updater"]);
|
||||||
updaters_.clear();
|
updaters_.clear();
|
||||||
|
|
||||||
for (auto const& kv : j_updaters) {
|
for (auto const& kv : j_updaters) {
|
||||||
std::unique_ptr<TreeUpdater> up(
|
auto name = kv.first;
|
||||||
TreeUpdater::Create(kv.first, ctx_, model_.learner_model_param->task));
|
if (n_gpus == 0 && name == "grow_gpu_hist") {
|
||||||
|
name = "grow_quantile_histmaker";
|
||||||
|
LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`.";
|
||||||
|
}
|
||||||
|
std::unique_ptr<TreeUpdater> up{
|
||||||
|
TreeUpdater::Create(name, ctx_, model_.learner_model_param->task)};
|
||||||
up->LoadConfig(kv.second);
|
up->LoadConfig(kv.second);
|
||||||
updaters_.push_back(std::move(up));
|
updaters_.push_back(std::move(up));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,9 +18,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs
|
|||||||
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
if (has_categorical && common::IsCat(cats.split_type, nid)) {
|
||||||
auto node_categories =
|
auto node_categories =
|
||||||
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size);
|
||||||
return common::Decision<true>(node_categories, fvalue, node.DefaultLeft())
|
return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild();
|
||||||
? node.LeftChild()
|
|
||||||
: node.RightChild();
|
|
||||||
} else {
|
} else {
|
||||||
return node.LeftChild() + !(fvalue < node.SplitCond());
|
return node.LeftChild() + !(fvalue < node.SplitCond());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -248,8 +248,10 @@ class EvaluateSplitAgent {
|
|||||||
|
|
||||||
template <int kBlockSize>
|
template <int kBlockSize>
|
||||||
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
||||||
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
|
bst_feature_t max_active_features,
|
||||||
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
const EvaluateSplitSharedInputs shared_inputs,
|
||||||
|
common::Span<bst_feature_t> sorted_idx,
|
||||||
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
common::Span<DeviceSplitCandidate> out_candidates) {
|
common::Span<DeviceSplitCandidate> out_candidates) {
|
||||||
// Aligned && shared storage for best_split
|
// Aligned && shared storage for best_split
|
||||||
@@ -263,11 +265,15 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// Allocate blocks to one feature of one node
|
// Allocate blocks to one feature of one node
|
||||||
const auto input_idx = blockIdx.x / number_active_features;
|
const auto input_idx = blockIdx.x / max_active_features;
|
||||||
const EvaluateSplitInputs &inputs = d_inputs[input_idx];
|
const EvaluateSplitInputs &inputs = d_inputs[input_idx];
|
||||||
// One block for each feature. Features are sampled, so fidx != blockIdx.x
|
// One block for each feature. Features are sampled, so fidx != blockIdx.x
|
||||||
|
// Some blocks may not have any feature to work on, simply return
|
||||||
int fidx = inputs.feature_set[blockIdx.x % number_active_features];
|
int feature_offset = blockIdx.x % max_active_features;
|
||||||
|
if (feature_offset >= inputs.feature_set.size()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int fidx = inputs.feature_set[feature_offset];
|
||||||
|
|
||||||
using AgentT = EvaluateSplitAgent<kBlockSize>;
|
using AgentT = EvaluateSplitAgent<kBlockSize>;
|
||||||
__shared__ typename AgentT::TempStorage temp_storage;
|
__shared__ typename AgentT::TempStorage temp_storage;
|
||||||
@@ -338,7 +344,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GPUHistEvaluator::LaunchEvaluateSplits(
|
void GPUHistEvaluator::LaunchEvaluateSplits(
|
||||||
bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
|
bst_feature_t max_active_features,
|
||||||
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
EvaluateSplitSharedInputs shared_inputs,
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
common::Span<DeviceSplitCandidate> out_splits) {
|
common::Span<DeviceSplitCandidate> out_splits) {
|
||||||
@@ -346,20 +353,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
|
|||||||
this->SortHistogram(d_inputs, shared_inputs, evaluator);
|
this->SortHistogram(d_inputs, shared_inputs, evaluator);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t combined_num_features = number_active_features * d_inputs.size();
|
size_t combined_num_features = max_active_features * d_inputs.size();
|
||||||
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(combined_num_features);
|
dh::TemporaryArray<DeviceSplitCandidate> feature_best_splits(
|
||||||
|
combined_num_features, DeviceSplitCandidate());
|
||||||
|
|
||||||
// One block for each feature
|
// One block for each feature
|
||||||
uint32_t constexpr kBlockThreads = 32;
|
uint32_t constexpr kBlockThreads = 32;
|
||||||
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
|
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads,
|
||||||
EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
|
0}(
|
||||||
shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
|
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
|
||||||
|
shared_inputs,
|
||||||
|
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
|
||||||
evaluator, dh::ToSpan(feature_best_splits));
|
evaluator, dh::ToSpan(feature_best_splits));
|
||||||
|
|
||||||
// Reduce to get best candidate for left and right child over all features
|
// Reduce to get best candidate for left and right child over all features
|
||||||
auto reduce_offset = dh::MakeTransformIterator<size_t>(
|
auto reduce_offset =
|
||||||
thrust::make_counting_iterator(0llu),
|
dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
|
||||||
[=] __device__(size_t idx) -> size_t { return idx * number_active_features; });
|
[=] __device__(size_t idx) -> size_t {
|
||||||
|
return idx * max_active_features;
|
||||||
|
});
|
||||||
size_t temp_storage_bytes = 0;
|
size_t temp_storage_bytes = 0;
|
||||||
auto num_segments = out_splits.size();
|
auto num_segments = out_splits.size();
|
||||||
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
|
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
|
||||||
@@ -386,15 +398,16 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void GPUHistEvaluator::EvaluateSplits(
|
void GPUHistEvaluator::EvaluateSplits(
|
||||||
const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
|
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
|
||||||
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
common::Span<GPUExpandEntry> out_entries) {
|
common::Span<GPUExpandEntry> out_entries) {
|
||||||
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
|
||||||
|
|
||||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
|
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
|
||||||
auto out_splits = dh::ToSpan(splits_out_storage);
|
auto out_splits = dh::ToSpan(splits_out_storage);
|
||||||
this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator,
|
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
|
||||||
out_splits);
|
evaluator, out_splits);
|
||||||
|
|
||||||
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
|
||||||
auto d_entries = out_entries;
|
auto d_entries = out_entries;
|
||||||
|
|||||||
@@ -170,13 +170,18 @@ class GPUHistEvaluator {
|
|||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
|
||||||
|
|
||||||
// impl of evaluate splits, contains CUDA kernels so it's public
|
// impl of evaluate splits, contains CUDA kernels so it's public
|
||||||
void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,EvaluateSplitSharedInputs shared_inputs,
|
void LaunchEvaluateSplits(
|
||||||
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
bst_feature_t max_active_features,
|
||||||
common::Span<DeviceSplitCandidate> out_splits);
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
|
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
|
||||||
|
common::Span<DeviceSplitCandidate> out_splits);
|
||||||
/**
|
/**
|
||||||
* \brief Evaluate splits for left and right nodes.
|
* \brief Evaluate splits for left and right nodes.
|
||||||
*/
|
*/
|
||||||
void EvaluateSplits(const std::vector<bst_node_t> &nidx,bst_feature_t number_active_features,common::Span<const EvaluateSplitInputs> d_inputs,
|
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
|
||||||
|
bst_feature_t max_active_features,
|
||||||
|
common::Span<const EvaluateSplitInputs> d_inputs,
|
||||||
EvaluateSplitSharedInputs shared_inputs,
|
EvaluateSplitSharedInputs shared_inputs,
|
||||||
common::Span<GPUExpandEntry> out_splits);
|
common::Span<GPUExpandEntry> out_splits);
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -188,7 +188,8 @@ struct GPUHistMakerDevice {
|
|||||||
common::Span<GradientPair> gpair;
|
common::Span<GradientPair> gpair;
|
||||||
|
|
||||||
dh::device_vector<int> monotone_constraints;
|
dh::device_vector<int> monotone_constraints;
|
||||||
dh::device_vector<float> update_predictions;
|
// node idx for each sample
|
||||||
|
dh::device_vector<bst_node_t> positions;
|
||||||
|
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
|
|
||||||
@@ -318,24 +319,27 @@ struct GPUHistMakerDevice {
|
|||||||
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
|
||||||
right_sampled_features->SetDevice(ctx_->gpu_id);
|
right_sampled_features->SetDevice(ctx_->gpu_id);
|
||||||
common::Span<bst_feature_t> right_feature_set =
|
common::Span<bst_feature_t> right_feature_set =
|
||||||
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
|
interaction_constraints.Query(right_sampled_features->DeviceSpan(),
|
||||||
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
|
right_nidx);
|
||||||
left_feature_set, hist.GetNodeHistogram(left_nidx)};
|
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1,
|
||||||
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
|
candidate.split.left_sum, left_feature_set,
|
||||||
right_feature_set, hist.GetNodeHistogram(right_nidx)};
|
hist.GetNodeHistogram(left_nidx)};
|
||||||
|
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1,
|
||||||
|
candidate.split.right_sum, right_feature_set,
|
||||||
|
hist.GetNodeHistogram(right_nidx)};
|
||||||
}
|
}
|
||||||
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
|
bst_feature_t max_active_features = 0;
|
||||||
for (auto input : h_node_inputs) {
|
for (auto input : h_node_inputs) {
|
||||||
CHECK_EQ(input.feature_set.size(), number_active_features)
|
max_active_features = std::max(max_active_features,
|
||||||
<< "Current implementation assumes that the number of active features "
|
bst_feature_t(input.feature_set.size()));
|
||||||
"(after sampling) in any node is the same";
|
|
||||||
}
|
}
|
||||||
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
|
dh::safe_cuda(cudaMemcpyAsync(
|
||||||
h_node_inputs.size() * sizeof(EvaluateSplitInputs),
|
d_node_inputs.data().get(), h_node_inputs.data(),
|
||||||
cudaMemcpyDefault));
|
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
|
||||||
|
|
||||||
this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
|
this->evaluator_.EvaluateSplits(nidx, max_active_features,
|
||||||
shared_inputs, dh::ToSpan(entries));
|
dh::ToSpan(d_node_inputs), shared_inputs,
|
||||||
|
dh::ToSpan(entries));
|
||||||
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
|
||||||
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
|
||||||
cudaMemcpyDeviceToHost));
|
cudaMemcpyDeviceToHost));
|
||||||
@@ -403,8 +407,7 @@ struct GPUHistMakerDevice {
|
|||||||
go_left = data.split_node.DefaultLeft();
|
go_left = data.split_node.DefaultLeft();
|
||||||
} else {
|
} else {
|
||||||
if (data.split_type == FeatureType::kCategorical) {
|
if (data.split_type == FeatureType::kCategorical) {
|
||||||
go_left = common::Decision<false>(data.node_cats.Bits(), cut_value,
|
go_left = common::Decision(data.node_cats.Bits(), cut_value);
|
||||||
data.split_node.DefaultLeft());
|
|
||||||
} else {
|
} else {
|
||||||
go_left = cut_value <= data.split_node.SplitCond();
|
go_left = cut_value <= data.split_node.SplitCond();
|
||||||
}
|
}
|
||||||
@@ -424,7 +427,7 @@ struct GPUHistMakerDevice {
|
|||||||
LOG(FATAL) << "Current objective function can not be used with external memory.";
|
LOG(FATAL) << "Current objective function can not be used with external memory.";
|
||||||
}
|
}
|
||||||
p_out_position->Resize(0);
|
p_out_position->Resize(0);
|
||||||
update_predictions.clear();
|
positions.clear();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -459,8 +462,6 @@ struct GPUHistMakerDevice {
|
|||||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
||||||
auto d_gpair = this->gpair;
|
auto d_gpair = this->gpair;
|
||||||
update_predictions.resize(row_partitioner->GetRows().size());
|
|
||||||
auto d_update_predictions = dh::ToSpan(update_predictions);
|
|
||||||
p_out_position->SetDevice(ctx_->gpu_id);
|
p_out_position->SetDevice(ctx_->gpu_id);
|
||||||
p_out_position->Resize(row_partitioner->GetRows().size());
|
p_out_position->Resize(row_partitioner->GetRows().size());
|
||||||
|
|
||||||
@@ -481,7 +482,7 @@ struct GPUHistMakerDevice {
|
|||||||
if (common::IsCat(d_feature_types, position)) {
|
if (common::IsCat(d_feature_types, position)) {
|
||||||
auto node_cats = categories.subspan(categories_segments[position].beg,
|
auto node_cats = categories.subspan(categories_segments[position].beg,
|
||||||
categories_segments[position].size);
|
categories_segments[position].size);
|
||||||
go_left = common::Decision<false>(node_cats, element, node.DefaultLeft());
|
go_left = common::Decision(node_cats, element);
|
||||||
} else {
|
} else {
|
||||||
go_left = element <= node.SplitCond();
|
go_left = element <= node.SplitCond();
|
||||||
}
|
}
|
||||||
@@ -495,32 +496,45 @@ struct GPUHistMakerDevice {
|
|||||||
node = d_nodes[position];
|
node = d_nodes[position];
|
||||||
}
|
}
|
||||||
|
|
||||||
d_update_predictions[row_id] = node.LeafValue();
|
|
||||||
return position;
|
return position;
|
||||||
}; // NOLINT
|
}; // NOLINT
|
||||||
|
|
||||||
auto d_out_position = p_out_position->DeviceSpan();
|
auto d_out_position = p_out_position->DeviceSpan();
|
||||||
row_partitioner->FinalisePosition(d_out_position, new_position_op);
|
row_partitioner->FinalisePosition(d_out_position, new_position_op);
|
||||||
|
|
||||||
|
auto s_position = p_out_position->ConstDeviceSpan();
|
||||||
|
positions.resize(s_position.size());
|
||||||
|
dh::safe_cuda(cudaMemcpyAsync(positions.data().get(), s_position.data(),
|
||||||
|
s_position.size_bytes(), cudaMemcpyDeviceToDevice));
|
||||||
|
|
||||||
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
|
dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) {
|
||||||
bst_node_t position = d_out_position[idx];
|
bst_node_t position = d_out_position[idx];
|
||||||
d_update_predictions[idx] = d_nodes[position].LeafValue();
|
|
||||||
bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
|
bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f;
|
||||||
d_out_position[idx] = is_row_sampled ? ~position : position;
|
d_out_position[idx] = is_row_sampled ? ~position : position;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
|
bool UpdatePredictionCache(linalg::VectorView<float> out_preds_d, RegTree const* p_tree) {
|
||||||
if (update_predictions.empty()) {
|
if (positions.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
CHECK(p_tree);
|
CHECK(p_tree);
|
||||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
||||||
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
|
CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id);
|
||||||
auto d_update_predictions = dh::ToSpan(update_predictions);
|
|
||||||
CHECK_EQ(out_preds_d.Size(), d_update_predictions.size());
|
auto d_position = dh::ToSpan(positions);
|
||||||
dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable {
|
CHECK_EQ(out_preds_d.Size(), d_position.size());
|
||||||
out_preds_d(idx) += d_update_predictions[idx];
|
|
||||||
|
auto const& h_nodes = p_tree->GetNodes();
|
||||||
|
dh::caching_device_vector<RegTree::Node> nodes(h_nodes.size());
|
||||||
|
dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(),
|
||||||
|
h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice));
|
||||||
|
auto d_nodes = dh::ToSpan(nodes);
|
||||||
|
dh::LaunchN(d_position.size(), [=] XGBOOST_DEVICE(std::size_t idx) mutable {
|
||||||
|
bst_node_t nidx = d_position[idx];
|
||||||
|
auto weight = d_nodes[nidx].LeafValue();
|
||||||
|
out_preds_d(idx) += weight;
|
||||||
});
|
});
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -863,6 +877,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
|
std::unique_ptr<GPUHistMakerDevice<GradientSumT>> maker; // NOLINT
|
||||||
|
|
||||||
char const* Name() const override { return "grow_gpu_hist"; }
|
char const* Name() const override { return "grow_gpu_hist"; }
|
||||||
|
bool HasNodePosition() const override { return true; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool initialised_{false};
|
bool initialised_{false};
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ set -euo pipefail
|
|||||||
|
|
||||||
source tests/buildkite/conftest.sh
|
source tests/buildkite/conftest.sh
|
||||||
|
|
||||||
echo "--- Run Google Tests with CUDA, using 4 GPUs"
|
echo "--- Run Google Tests with CUDA, using a GPU"
|
||||||
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
|
buildkite-agent artifact download "build/testxgboost" . --step build-cuda
|
||||||
chmod +x build/testxgboost
|
chmod +x build/testxgboost
|
||||||
tests/ci_build/ci_build.sh gpu nvidia-docker \
|
tests/ci_build/ci_build.sh gpu nvidia-docker \
|
||||||
@@ -12,11 +12,12 @@ tests/ci_build/ci_build.sh gpu nvidia-docker \
|
|||||||
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
|
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \
|
||||||
build/testxgboost
|
build/testxgboost
|
||||||
|
|
||||||
echo "--- Run Google Tests with CUDA, using 4 GPUs, RMM enabled"
|
# Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved
|
||||||
rm -rfv build/
|
# echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled"
|
||||||
buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
|
# rm -rfv build/
|
||||||
chmod +x build/testxgboost
|
# buildkite-agent artifact download "build/testxgboost" . --step build-cuda-with-rmm
|
||||||
tests/ci_build/ci_build.sh rmm nvidia-docker \
|
# chmod +x build/testxgboost
|
||||||
--build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
|
# tests/ci_build/ci_build.sh rmm nvidia-docker \
|
||||||
--build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
|
# --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \
|
||||||
"source activate gpu_test && build/testxgboost --use-rmm-pool"
|
# --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION bash -c \
|
||||||
|
# "source activate gpu_test && build/testxgboost --use-rmm-pool"
|
||||||
|
|||||||
@@ -36,7 +36,8 @@ dependencies:
|
|||||||
- cloudpickle
|
- cloudpickle
|
||||||
- shap
|
- shap
|
||||||
- modin
|
- modin
|
||||||
|
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
|
||||||
|
# - https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
|
||||||
|
- pyspark>=3.3.1
|
||||||
- pip:
|
- pip:
|
||||||
- datatable
|
- datatable
|
||||||
# TODO: Replace it with pyspark>=3.4 once 3.4 released.
|
|
||||||
- https://ml-team-public-read.s3.us-west-2.amazonaws.com/pyspark-3.4.0.dev0.tar.gz
|
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
/*!
|
/*!
|
||||||
* Copyright 2021 by XGBoost Contributors
|
* Copyright 2021-2022 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
|
#include <xgboost/json.h>
|
||||||
|
#include <xgboost/learner.h>
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
#include "../../../src/common/categorical.h"
|
#include "../../../src/common/categorical.h"
|
||||||
|
#include "../helpers.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace common {
|
namespace common {
|
||||||
@@ -15,29 +18,76 @@ TEST(Categorical, Decision) {
|
|||||||
|
|
||||||
ASSERT_TRUE(common::InvalidCat(a));
|
ASSERT_TRUE(common::InvalidCat(a));
|
||||||
std::vector<uint32_t> cats(256, 0);
|
std::vector<uint32_t> cats(256, 0);
|
||||||
ASSERT_TRUE(Decision(cats, a, true));
|
ASSERT_TRUE(Decision(cats, a));
|
||||||
|
|
||||||
// larger than size
|
// larger than size
|
||||||
a = 256;
|
a = 256;
|
||||||
ASSERT_TRUE(Decision(cats, a, true));
|
ASSERT_TRUE(Decision(cats, a));
|
||||||
|
|
||||||
// negative
|
// negative
|
||||||
a = -1;
|
a = -1;
|
||||||
ASSERT_TRUE(Decision(cats, a, true));
|
ASSERT_TRUE(Decision(cats, a));
|
||||||
|
|
||||||
CatBitField bits{cats};
|
CatBitField bits{cats};
|
||||||
bits.Set(0);
|
bits.Set(0);
|
||||||
a = -0.5;
|
a = -0.5;
|
||||||
ASSERT_TRUE(Decision(cats, a, true));
|
ASSERT_TRUE(Decision(cats, a));
|
||||||
|
|
||||||
// round toward 0
|
// round toward 0
|
||||||
a = 0.5;
|
a = 0.5;
|
||||||
ASSERT_FALSE(Decision(cats, a, true));
|
ASSERT_FALSE(Decision(cats, a));
|
||||||
|
|
||||||
// valid
|
// valid
|
||||||
a = 13;
|
a = 13;
|
||||||
bits.Set(a);
|
bits.Set(a);
|
||||||
ASSERT_FALSE(Decision(bits.Bits(), a, true));
|
ASSERT_FALSE(Decision(bits.Bits(), a));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test for running inference with input category greater than the one stored in tree.
|
||||||
|
*/
|
||||||
|
TEST(Categorical, MinimalSet) {
|
||||||
|
std::size_t constexpr kRows = 256, kCols = 1, kCat = 3;
|
||||||
|
std::vector<FeatureType> types{FeatureType::kCategorical};
|
||||||
|
auto Xy =
|
||||||
|
RandomDataGenerator{kRows, kCols, 0.0}.Type(types).MaxCategory(kCat).GenerateDMatrix(true);
|
||||||
|
|
||||||
|
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
|
||||||
|
learner->SetParam("max_depth", "1");
|
||||||
|
learner->SetParam("tree_method", "hist");
|
||||||
|
learner->Configure();
|
||||||
|
learner->UpdateOneIter(0, Xy);
|
||||||
|
|
||||||
|
Json model{Object{}};
|
||||||
|
learner->SaveModel(&model);
|
||||||
|
auto tree = model["learner"]["gradient_booster"]["model"]["trees"][0];
|
||||||
|
ASSERT_GE(get<I32Array const>(tree["categories"]).size(), 1);
|
||||||
|
auto v = get<I32Array const>(tree["categories"])[0];
|
||||||
|
|
||||||
|
HostDeviceVector<float> predt;
|
||||||
|
{
|
||||||
|
std::vector<float> data{static_cast<float>(kCat),
|
||||||
|
static_cast<float>(kCat + 1), 32.0f, 33.0f, 34.0f};
|
||||||
|
auto test = GetDMatrixFromData(data, data.size(), kCols);
|
||||||
|
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
|
||||||
|
ASSERT_EQ(predt.Size(), data.size());
|
||||||
|
auto const& h_predt = predt.ConstHostSpan();
|
||||||
|
for (auto v : h_predt) {
|
||||||
|
ASSERT_EQ(v, 1); // left child of root node
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
std::unique_ptr<Learner> learner{Learner::Create({Xy})};
|
||||||
|
learner->LoadModel(model);
|
||||||
|
std::vector<float> data = {static_cast<float>(v)};
|
||||||
|
auto test = GetDMatrixFromData(data, data.size(), kCols);
|
||||||
|
learner->Predict(test, false, &predt, 0, 0, false, /*pred_leaf=*/true);
|
||||||
|
auto const& h_predt = predt.ConstHostSpan();
|
||||||
|
for (auto v : h_predt) {
|
||||||
|
ASSERT_EQ(v, 2); // right child of root node
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} // namespace common
|
} // namespace common
|
||||||
} // namespace xgboost
|
} // namespace xgboost
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
/*!
|
/**
|
||||||
* Copyright 2020-2021 by XGBoost Contributors
|
* Copyright 2020-2023 by XGBoost Contributors
|
||||||
*/
|
*/
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <xgboost/host_device_vector.h>
|
#include <xgboost/host_device_vector.h>
|
||||||
#include "../helpers.h"
|
#include "../helpers.h"
|
||||||
#include "../../../src/data/array_interface.h"
|
#include "../../../src/data/array_interface.h"
|
||||||
|
#include "dmlc/logging.h"
|
||||||
|
#include "xgboost/json.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
TEST(ArrayInterface, Initialize) {
|
TEST(ArrayInterface, Initialize) {
|
||||||
@@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) {
|
|||||||
column["mask"]["data"] = Null{};
|
column["mask"]["data"] = Null{};
|
||||||
common::Span<RBitField8::value_type> s_mask;
|
common::Span<RBitField8::value_type> s_mask;
|
||||||
EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error);
|
EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error);
|
||||||
|
|
||||||
|
get<Object>(column).erase("mask");
|
||||||
|
// misaligned.
|
||||||
|
j_data = {Json(Integer(reinterpret_cast<Integer::Int>(
|
||||||
|
reinterpret_cast<char const*>(storage.ConstHostPointer()) + 1))),
|
||||||
|
Json(Boolean(false))};
|
||||||
|
column["data"] = j_data;
|
||||||
|
EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(ArrayInterface, GetElement) {
|
TEST(ArrayInterface, GetElement) {
|
||||||
|
|||||||
@@ -68,6 +68,30 @@ TEST(GradientIndex, FromCategoricalBasic) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(GradientIndex, FromCategoricalLarge) {
|
||||||
|
size_t constexpr kRows = 1000, kCats = 512, kCols = 1;
|
||||||
|
bst_bin_t max_bins = 8;
|
||||||
|
auto x = GenerateRandomCategoricalSingleColumn(kRows, kCats);
|
||||||
|
auto m = GetDMatrixFromData(x, kRows, 1);
|
||||||
|
Context ctx;
|
||||||
|
|
||||||
|
auto &h_ft = m->Info().feature_types.HostVector();
|
||||||
|
h_ft.resize(kCols, FeatureType::kCategorical);
|
||||||
|
|
||||||
|
BatchParam p{max_bins, 0.8};
|
||||||
|
{
|
||||||
|
GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, Context{}.Threads(), {});
|
||||||
|
ASSERT_TRUE(gidx.index.GetBinTypeSize() == common::kUint16BinsTypeSize);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
for (auto const &page : m->GetBatches<GHistIndexMatrix>(p)) {
|
||||||
|
common::HistogramCuts cut = page.cut;
|
||||||
|
GHistIndexMatrix gidx{m->Info(), std::move(cut), max_bins};
|
||||||
|
ASSERT_EQ(gidx.MaxNumBinPerFeat(), kCats);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST(GradientIndex, PushBatch) {
|
TEST(GradientIndex, PushBatch) {
|
||||||
size_t constexpr kRows = 64, kCols = 4;
|
size_t constexpr kRows = 64, kCols = 4;
|
||||||
bst_bin_t max_bins = 64;
|
bst_bin_t max_bins = 64;
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
// Copyright by Contributors
|
/**
|
||||||
|
* Copyright 2016-2023 by XGBoost Contributors
|
||||||
|
*/
|
||||||
#include <xgboost/data.h>
|
#include <xgboost/data.h>
|
||||||
|
|
||||||
#include <array>
|
#include <array> // std::array
|
||||||
|
#include <limits> // std::numeric_limits
|
||||||
|
#include <memory> // std::unique_ptr
|
||||||
|
|
||||||
#include "../../../src/data/adapter.h"
|
#include "../../../src/data/adapter.h" // ArrayAdapter
|
||||||
#include "../../../src/data/simple_dmatrix.h"
|
#include "../../../src/data/simple_dmatrix.h" // SimpleDMatrix
|
||||||
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
#include "../filesystem.h" // dmlc::TemporaryDirectory
|
||||||
#include "../helpers.h"
|
#include "../helpers.h" // RandomDataGenerator,CreateSimpleTestData
|
||||||
#include "xgboost/base.h"
|
#include "xgboost/base.h"
|
||||||
|
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||||
|
#include "xgboost/string_view.h" // StringView
|
||||||
|
|
||||||
using namespace xgboost; // NOLINT
|
using namespace xgboost; // NOLINT
|
||||||
|
|
||||||
@@ -298,6 +304,17 @@ TEST(SimpleDMatrix, Slice) {
|
|||||||
ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
|
ASSERT_EQ(out->Info().num_col_, out->Info().num_col_);
|
||||||
ASSERT_EQ(out->Info().num_row_, ridxs.size());
|
ASSERT_EQ(out->Info().num_row_, ridxs.size());
|
||||||
ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols); // dense
|
ASSERT_EQ(out->Info().num_nonzero_, ridxs.size() * kCols); // dense
|
||||||
|
|
||||||
|
{
|
||||||
|
HostDeviceVector<float> data;
|
||||||
|
auto arr_str = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&data);
|
||||||
|
auto adapter = data::ArrayAdapter{StringView{arr_str}};
|
||||||
|
auto n_threads = 2;
|
||||||
|
std::unique_ptr<DMatrix> p_fmat{
|
||||||
|
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), n_threads, "")};
|
||||||
|
std::unique_ptr<DMatrix> slice{p_fmat->Slice(ridxs)};
|
||||||
|
ASSERT_LE(slice->Ctx()->Threads(), n_threads);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(SimpleDMatrix, SaveLoadBinary) {
|
TEST(SimpleDMatrix, SaveLoadBinary) {
|
||||||
|
|||||||
24
tests/cpp/tree/test_node_partition.cc
Normal file
24
tests/cpp/tree/test_node_partition.cc
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
/**
|
||||||
|
* Copyright 2023 by XGBoost contributors
|
||||||
|
*/
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
#include <xgboost/task.h>
|
||||||
|
#include <xgboost/tree_updater.h>
|
||||||
|
|
||||||
|
namespace xgboost {
|
||||||
|
TEST(Updater, HasNodePosition) {
|
||||||
|
Context ctx;
|
||||||
|
ObjInfo task{ObjInfo::kRegression, true, true};
|
||||||
|
std::unique_ptr<TreeUpdater> up{TreeUpdater::Create("grow_histmaker", &ctx, task)};
|
||||||
|
ASSERT_TRUE(up->HasNodePosition());
|
||||||
|
|
||||||
|
up.reset(TreeUpdater::Create("grow_quantile_histmaker", &ctx, task));
|
||||||
|
ASSERT_TRUE(up->HasNodePosition());
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_CUDA)
|
||||||
|
ctx.gpu_id = 0;
|
||||||
|
up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, task));
|
||||||
|
ASSERT_TRUE(up->HasNodePosition());
|
||||||
|
#endif // defined(XGBOOST_USE_CUDA)
|
||||||
|
}
|
||||||
|
} // namespace xgboost
|
||||||
@@ -139,3 +139,17 @@ class TestDeviceQuantileDMatrix:
|
|||||||
booster.predict(xgb.DMatrix(d_m.get_data())),
|
booster.predict(xgb.DMatrix(d_m.get_data())),
|
||||||
atol=1e-6,
|
atol=1e-6,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_ltr(self) -> None:
|
||||||
|
import cupy as cp
|
||||||
|
X, y, qid, w = tm.make_ltr(100, 3, 3, 5)
|
||||||
|
# make sure GPU is used to run sketching.
|
||||||
|
cpX = cp.array(X)
|
||||||
|
Xy_qdm = xgb.QuantileDMatrix(cpX, y, qid=qid, weight=w)
|
||||||
|
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
|
||||||
|
xgb.train({"tree_method": "gpu_hist", "objective": "rank:ndcg"}, Xy)
|
||||||
|
|
||||||
|
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
|
||||||
|
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
|
||||||
|
|
||||||
|
assert tm.predictor_equal(from_qdm, from_dm)
|
||||||
|
|||||||
@@ -1,8 +1,14 @@
|
|||||||
import numpy as np
|
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import xgboost as xgb
|
||||||
|
|
||||||
sys.path.append("tests/python")
|
sys.path.append("tests/python")
|
||||||
# Don't import the test class, otherwise they will run twice.
|
# Don't import the test class, otherwise they will run twice.
|
||||||
import test_interaction_constraints as test_ic # noqa
|
import test_interaction_constraints as test_ic # noqa
|
||||||
|
|
||||||
rng = np.random.RandomState(1994)
|
rng = np.random.RandomState(1994)
|
||||||
|
|
||||||
|
|
||||||
@@ -10,7 +16,34 @@ class TestGPUInteractionConstraints:
|
|||||||
cputest = test_ic.TestInteractionConstraints()
|
cputest = test_ic.TestInteractionConstraints()
|
||||||
|
|
||||||
def test_interaction_constraints(self):
|
def test_interaction_constraints(self):
|
||||||
self.cputest.run_interaction_constraints(tree_method='gpu_hist')
|
self.cputest.run_interaction_constraints(tree_method="gpu_hist")
|
||||||
|
|
||||||
def test_training_accuracy(self):
|
def test_training_accuracy(self):
|
||||||
self.cputest.training_accuracy(tree_method='gpu_hist')
|
self.cputest.training_accuracy(tree_method="gpu_hist")
|
||||||
|
|
||||||
|
# case where different number of features can occur in the evaluator
|
||||||
|
def test_issue_8730(self):
|
||||||
|
X = pd.DataFrame(
|
||||||
|
zip(range(0, 100), range(200, 300), range(300, 400), range(400, 500)),
|
||||||
|
columns=["A", "B", "C", "D"],
|
||||||
|
)
|
||||||
|
y = np.array([*([0] * 50), *([1] * 50)])
|
||||||
|
dm = xgb.DMatrix(X, label=y)
|
||||||
|
|
||||||
|
params = {
|
||||||
|
"eta": 0.16095019509249486,
|
||||||
|
"min_child_weight": 1,
|
||||||
|
"subsample": 0.688567929338029,
|
||||||
|
"colsample_bynode": 0.7,
|
||||||
|
"gamma": 5.666579817418348e-06,
|
||||||
|
"lambda": 0.14943712232059794,
|
||||||
|
"grow_policy": "depthwise",
|
||||||
|
"max_depth": 3,
|
||||||
|
"tree_method": "gpu_hist",
|
||||||
|
"interaction_constraints": [["A", "B"], ["B", "D", "C"], ["C", "D"]],
|
||||||
|
"objective": "count:poisson",
|
||||||
|
"eval_metric": "poisson-nloglik",
|
||||||
|
"verbosity": 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
xgb.train(params, dm, num_boost_round=100)
|
||||||
|
|||||||
@@ -338,13 +338,21 @@ class TestGPUPredict:
|
|||||||
@given(predict_parameter_strategy, tm.dataset_strategy)
|
@given(predict_parameter_strategy, tm.dataset_strategy)
|
||||||
@settings(deadline=None, max_examples=20, print_blob=True)
|
@settings(deadline=None, max_examples=20, print_blob=True)
|
||||||
def test_predict_leaf_gbtree(self, param, dataset):
|
def test_predict_leaf_gbtree(self, param, dataset):
|
||||||
|
# Unsupported for random forest
|
||||||
|
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
|
||||||
|
return
|
||||||
|
|
||||||
param['booster'] = 'gbtree'
|
param['booster'] = 'gbtree'
|
||||||
param['tree_method'] = 'gpu_hist'
|
param['tree_method'] = 'gpu_hist'
|
||||||
self.run_predict_leaf_booster(param, 10, dataset)
|
self.run_predict_leaf_booster(param, 10, dataset)
|
||||||
|
|
||||||
@given(predict_parameter_strategy, tm.dataset_strategy)
|
@given(predict_parameter_strategy, tm.dataset_strategy)
|
||||||
@settings(deadline=None, max_examples=20, print_blob=True)
|
@settings(deadline=None, max_examples=20, print_blob=True)
|
||||||
def test_predict_leaf_dart(self, param, dataset):
|
def test_predict_leaf_dart(self, param: dict, dataset: tm.TestDataset) -> None:
|
||||||
|
# Unsupported for random forest
|
||||||
|
if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"):
|
||||||
|
return
|
||||||
|
|
||||||
param['booster'] = 'dart'
|
param['booster'] = 'dart'
|
||||||
param['tree_method'] = 'gpu_hist'
|
param['tree_method'] = 'gpu_hist'
|
||||||
self.run_predict_leaf_booster(param, 10, dataset)
|
self.run_predict_leaf_booster(param, 10, dataset)
|
||||||
|
|||||||
@@ -326,7 +326,7 @@ class TestDMatrix:
|
|||||||
nrow = 100
|
nrow = 100
|
||||||
ncol = 1000
|
ncol = 1000
|
||||||
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
|
x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
|
||||||
assert x.indices.max() < ncol - 1
|
assert x.indices.max() < ncol
|
||||||
x.data[:] = 1
|
x.data[:] = 1
|
||||||
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
|
dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
|
||||||
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
|
||||||
|
|||||||
@@ -9,7 +9,9 @@ from testing import (
|
|||||||
make_batches,
|
make_batches,
|
||||||
make_batches_sparse,
|
make_batches_sparse,
|
||||||
make_categorical,
|
make_categorical,
|
||||||
|
make_ltr,
|
||||||
make_sparse_regression,
|
make_sparse_regression,
|
||||||
|
predictor_equal,
|
||||||
)
|
)
|
||||||
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
@@ -218,6 +220,16 @@ class TestQuantileDMatrix:
|
|||||||
b = booster.predict(qXy)
|
b = booster.predict(qXy)
|
||||||
np.testing.assert_allclose(a, b)
|
np.testing.assert_allclose(a, b)
|
||||||
|
|
||||||
|
def test_ltr(self) -> None:
|
||||||
|
X, y, qid, w = make_ltr(100, 3, 3, 5)
|
||||||
|
Xy_qdm = xgb.QuantileDMatrix(X, y, qid=qid, weight=w)
|
||||||
|
Xy = xgb.DMatrix(X, y, qid=qid, weight=w)
|
||||||
|
xgb.train({"tree_method": "hist", "objective": "rank:ndcg"}, Xy)
|
||||||
|
|
||||||
|
from_qdm = xgb.QuantileDMatrix(X, weight=w, ref=Xy_qdm)
|
||||||
|
from_dm = xgb.QuantileDMatrix(X, weight=w, ref=Xy)
|
||||||
|
assert predictor_equal(from_qdm, from_dm)
|
||||||
|
|
||||||
# we don't test empty Quantile DMatrix in single node construction.
|
# we don't test empty Quantile DMatrix in single node construction.
|
||||||
@given(
|
@given(
|
||||||
strategies.integers(1, 1000),
|
strategies.integers(1, 1000),
|
||||||
|
|||||||
@@ -41,6 +41,16 @@ logging.getLogger("py4j").setLevel(logging.INFO)
|
|||||||
pytestmark = testing.timeout(60)
|
pytestmark = testing.timeout(60)
|
||||||
|
|
||||||
|
|
||||||
|
def no_sparse_unwrap():
|
||||||
|
try:
|
||||||
|
from pyspark.sql.functions import unwrap_udt
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
return {"reason": "PySpark<3.4", "condition": True}
|
||||||
|
|
||||||
|
return {"reason": "PySpark<3.4", "condition": False}
|
||||||
|
|
||||||
|
|
||||||
class XgboostLocalTest(SparkTestCase):
|
class XgboostLocalTest(SparkTestCase):
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
logging.getLogger().setLevel("INFO")
|
logging.getLogger().setLevel("INFO")
|
||||||
@@ -985,6 +995,7 @@ class XgboostLocalTest(SparkTestCase):
|
|||||||
model = classifier.fit(self.cls_df_train)
|
model = classifier.fit(self.cls_df_train)
|
||||||
model.transform(self.cls_df_test).collect()
|
model.transform(self.cls_df_test).collect()
|
||||||
|
|
||||||
|
@pytest.mark.skipif(**no_sparse_unwrap())
|
||||||
def test_regressor_with_sparse_optim(self):
|
def test_regressor_with_sparse_optim(self):
|
||||||
regressor = SparkXGBRegressor(missing=0.0)
|
regressor = SparkXGBRegressor(missing=0.0)
|
||||||
model = regressor.fit(self.reg_df_sparse_train)
|
model = regressor.fit(self.reg_df_sparse_train)
|
||||||
@@ -1001,6 +1012,7 @@ class XgboostLocalTest(SparkTestCase):
|
|||||||
for row1, row2 in zip(pred_result, pred_result2):
|
for row1, row2 in zip(pred_result, pred_result2):
|
||||||
self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))
|
self.assertTrue(np.isclose(row1.prediction, row2.prediction, atol=1e-3))
|
||||||
|
|
||||||
|
@pytest.mark.skipif(**no_sparse_unwrap())
|
||||||
def test_classifier_with_sparse_optim(self):
|
def test_classifier_with_sparse_optim(self):
|
||||||
cls = SparkXGBClassifier(missing=0.0)
|
cls = SparkXGBClassifier(missing=0.0)
|
||||||
model = cls.fit(self.cls_df_sparse_train)
|
model = cls.fit(self.cls_df_sparse_train)
|
||||||
|
|||||||
@@ -458,6 +458,22 @@ class TestTreeMethod:
|
|||||||
config_0 = json.loads(booster_0.save_config())
|
config_0 = json.loads(booster_0.save_config())
|
||||||
np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
|
np.testing.assert_allclose(get_score(config_0), get_score(config_1) + 1)
|
||||||
|
|
||||||
|
evals_result: Dict[str, Dict[str, list]] = {}
|
||||||
|
xgb.train(
|
||||||
|
{
|
||||||
|
"tree_method": tree_method,
|
||||||
|
"objective": "reg:absoluteerror",
|
||||||
|
"subsample": 0.8
|
||||||
|
},
|
||||||
|
Xy,
|
||||||
|
num_boost_round=10,
|
||||||
|
evals=[(Xy, "Train")],
|
||||||
|
evals_result=evals_result,
|
||||||
|
)
|
||||||
|
mae = evals_result["Train"]["mae"]
|
||||||
|
assert mae[-1] < 20.0
|
||||||
|
assert tm.non_increasing(mae)
|
||||||
|
|
||||||
@pytest.mark.skipif(**tm.no_sklearn())
|
@pytest.mark.skipif(**tm.no_sklearn())
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"tree_method,weighted", [
|
"tree_method,weighted", [
|
||||||
|
|||||||
@@ -112,7 +112,6 @@ class TestPandas:
|
|||||||
|
|
||||||
# test Index as columns
|
# test Index as columns
|
||||||
df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
|
df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
|
||||||
print(df.columns, isinstance(df.columns, pd.Index))
|
|
||||||
Xy = xgb.DMatrix(df)
|
Xy = xgb.DMatrix(df)
|
||||||
np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))
|
np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import pytest
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
import shap
|
import shap
|
||||||
except ImportError:
|
except Exception:
|
||||||
shap = None
|
shap = None
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import collections
|
|||||||
import importlib.util
|
import importlib.util
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import pickle
|
||||||
import random
|
import random
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Callable, Optional
|
from typing import Callable, Optional
|
||||||
@@ -636,26 +637,74 @@ def test_sklearn_n_jobs():
|
|||||||
|
|
||||||
def test_parameters_access():
|
def test_parameters_access():
|
||||||
from sklearn import datasets
|
from sklearn import datasets
|
||||||
params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1}
|
|
||||||
|
params = {"updater": "grow_gpu_hist", "subsample": 0.5, "n_jobs": -1}
|
||||||
clf = xgb.XGBClassifier(n_estimators=1000, **params)
|
clf = xgb.XGBClassifier(n_estimators=1000, **params)
|
||||||
assert clf.get_params()['updater'] == 'grow_gpu_hist'
|
assert clf.get_params()["updater"] == "grow_gpu_hist"
|
||||||
assert clf.get_params()['subsample'] == .5
|
assert clf.get_params()["subsample"] == 0.5
|
||||||
assert clf.get_params()['n_estimators'] == 1000
|
assert clf.get_params()["n_estimators"] == 1000
|
||||||
|
|
||||||
clf = xgb.XGBClassifier(n_estimators=1, nthread=4)
|
clf = xgb.XGBClassifier(n_estimators=1, nthread=4)
|
||||||
X, y = datasets.load_iris(return_X_y=True)
|
X, y = datasets.load_iris(return_X_y=True)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
|
|
||||||
config = json.loads(clf.get_booster().save_config())
|
config = json.loads(clf.get_booster().save_config())
|
||||||
assert int(config['learner']['generic_param']['nthread']) == 4
|
assert int(config["learner"]["generic_param"]["nthread"]) == 4
|
||||||
|
|
||||||
clf.set_params(nthread=16)
|
clf.set_params(nthread=16)
|
||||||
config = json.loads(clf.get_booster().save_config())
|
config = json.loads(clf.get_booster().save_config())
|
||||||
assert int(config['learner']['generic_param']['nthread']) == 16
|
assert int(config["learner"]["generic_param"]["nthread"]) == 16
|
||||||
|
|
||||||
clf.predict(X)
|
clf.predict(X)
|
||||||
config = json.loads(clf.get_booster().save_config())
|
config = json.loads(clf.get_booster().save_config())
|
||||||
assert int(config['learner']['generic_param']['nthread']) == 16
|
assert int(config["learner"]["generic_param"]["nthread"]) == 16
|
||||||
|
|
||||||
|
clf = xgb.XGBClassifier(n_estimators=2)
|
||||||
|
assert clf.tree_method is None
|
||||||
|
assert clf.get_params()["tree_method"] is None
|
||||||
|
clf.fit(X, y)
|
||||||
|
assert clf.get_params()["tree_method"] is None
|
||||||
|
|
||||||
|
def save_load(clf: xgb.XGBClassifier) -> xgb.XGBClassifier:
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
path = os.path.join(tmpdir, "model.json")
|
||||||
|
clf.save_model(path)
|
||||||
|
clf = xgb.XGBClassifier()
|
||||||
|
clf.load_model(path)
|
||||||
|
return clf
|
||||||
|
|
||||||
|
def get_tm(clf: xgb.XGBClassifier) -> str:
|
||||||
|
tm = json.loads(clf.get_booster().save_config())["learner"]["gradient_booster"][
|
||||||
|
"gbtree_train_param"
|
||||||
|
]["tree_method"]
|
||||||
|
return tm
|
||||||
|
|
||||||
|
assert get_tm(clf) == "exact"
|
||||||
|
|
||||||
|
clf = pickle.loads(pickle.dumps(clf))
|
||||||
|
|
||||||
|
assert clf.tree_method is None
|
||||||
|
assert clf.n_estimators == 2
|
||||||
|
assert clf.get_params()["tree_method"] is None
|
||||||
|
assert clf.get_params()["n_estimators"] == 2
|
||||||
|
assert get_tm(clf) == "exact" # preserved for pickle
|
||||||
|
|
||||||
|
clf = save_load(clf)
|
||||||
|
|
||||||
|
assert clf.tree_method is None
|
||||||
|
assert clf.n_estimators == 2
|
||||||
|
assert clf.get_params()["tree_method"] is None
|
||||||
|
assert clf.get_params()["n_estimators"] == 2
|
||||||
|
assert get_tm(clf) == "auto" # discarded for save/load_model
|
||||||
|
|
||||||
|
clf.set_params(tree_method="hist")
|
||||||
|
assert clf.get_params()["tree_method"] == "hist"
|
||||||
|
clf = pickle.loads(pickle.dumps(clf))
|
||||||
|
assert clf.get_params()["tree_method"] == "hist"
|
||||||
|
clf = save_load(clf)
|
||||||
|
# FIXME(jiamingy): We should remove this behavior once we remove parameters
|
||||||
|
# serialization for skl save/load_model.
|
||||||
|
assert clf.get_params()["tree_method"] == "hist"
|
||||||
|
|
||||||
|
|
||||||
def test_kwargs_error():
|
def test_kwargs_error():
|
||||||
@@ -695,13 +744,19 @@ def test_sklearn_clone():
|
|||||||
|
|
||||||
def test_sklearn_get_default_params():
|
def test_sklearn_get_default_params():
|
||||||
from sklearn.datasets import load_digits
|
from sklearn.datasets import load_digits
|
||||||
|
|
||||||
digits_2class = load_digits(n_class=2)
|
digits_2class = load_digits(n_class=2)
|
||||||
X = digits_2class['data']
|
X = digits_2class["data"]
|
||||||
y = digits_2class['target']
|
y = digits_2class["target"]
|
||||||
cls = xgb.XGBClassifier()
|
cls = xgb.XGBClassifier()
|
||||||
assert cls.get_params()['base_score'] is None
|
assert cls.get_params()["base_score"] is None
|
||||||
cls.fit(X[:4, ...], y[:4, ...])
|
cls.fit(X[:4, ...], y[:4, ...])
|
||||||
assert cls.get_params()['base_score'] is not None
|
base_score = float(
|
||||||
|
json.loads(cls.get_booster().save_config())["learner"]["learner_model_param"][
|
||||||
|
"base_score"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
np.testing.assert_equal(base_score, 0.5)
|
||||||
|
|
||||||
|
|
||||||
def run_validation_weights(model):
|
def run_validation_weights(model):
|
||||||
@@ -1029,9 +1084,9 @@ def test_pandas_input():
|
|||||||
|
|
||||||
clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
|
clf_isotonic = CalibratedClassifierCV(model, cv="prefit", method="isotonic")
|
||||||
clf_isotonic.fit(train, target)
|
clf_isotonic.fit(train, target)
|
||||||
assert isinstance(
|
clf = clf_isotonic.calibrated_classifiers_[0]
|
||||||
clf_isotonic.calibrated_classifiers_[0].base_estimator, xgb.XGBClassifier
|
est = clf.estimator if hasattr(clf, "estimator") else clf.base_estimator
|
||||||
)
|
assert isinstance(est, xgb.XGBClassifier)
|
||||||
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
|
np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -466,7 +466,22 @@ def make_categorical(
|
|||||||
return df, label
|
return df, label
|
||||||
|
|
||||||
|
|
||||||
def _cat_sampled_from():
|
def make_ltr(
|
||||||
|
n_samples: int, n_features: int, n_query_groups: int, max_rel: int
|
||||||
|
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
|
||||||
|
"""Make a dataset for testing LTR."""
|
||||||
|
rng = np.random.default_rng(1994)
|
||||||
|
X = rng.normal(0, 1.0, size=n_samples * n_features).reshape(n_samples, n_features)
|
||||||
|
y = rng.integers(0, max_rel, size=n_samples)
|
||||||
|
qid = rng.integers(0, n_query_groups, size=n_samples)
|
||||||
|
w = rng.normal(0, 1.0, size=n_query_groups)
|
||||||
|
w -= np.min(w)
|
||||||
|
w /= np.max(w)
|
||||||
|
qid = np.sort(qid)
|
||||||
|
return X, y, qid, w
|
||||||
|
|
||||||
|
|
||||||
|
def _cat_sampled_from() -> strategies.SearchStrategy:
|
||||||
@strategies.composite
|
@strategies.composite
|
||||||
def _make_cat(draw):
|
def _make_cat(draw):
|
||||||
n_samples = draw(strategies.integers(2, 512))
|
n_samples = draw(strategies.integers(2, 512))
|
||||||
@@ -775,6 +790,19 @@ class DirectoryExcursion:
|
|||||||
os.remove(f)
|
os.remove(f)
|
||||||
|
|
||||||
|
|
||||||
|
def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool:
|
||||||
|
"""Assert whether two DMatrices contain the same predictors."""
|
||||||
|
lcsr = lhs.get_data()
|
||||||
|
rcsr = rhs.get_data()
|
||||||
|
return all(
|
||||||
|
(
|
||||||
|
np.array_equal(lcsr.data, rcsr.data),
|
||||||
|
np.array_equal(lcsr.indices, rcsr.indices),
|
||||||
|
np.array_equal(lcsr.indptr, rcsr.indptr),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def captured_output():
|
def captured_output():
|
||||||
"""Reassign stdout temporarily in order to test printed statements
|
"""Reassign stdout temporarily in order to test printed statements
|
||||||
|
|||||||
Reference in New Issue
Block a user