Compare commits
3 commits: v1.5.1 ... release_1.5.0

| SHA1 |
|---|
| e7decb9775 |
| 1920118bcb |
| 2032547426 |
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.5.1)
+project(xgboost LANGUAGES CXX C VERSION 1.5.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)

@@ -135,10 +135,6 @@ if (USE_CUDA)
   set(GEN_CODE "")
   format_gencode_flags("${GPU_COMPUTE_VER}" GEN_CODE)
   add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap)
-
-  if ((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 11.4) AND (NOT BUILD_WITH_CUDA_CUB))
-    message(SEND_ERROR "`BUILD_WITH_CUDA_CUB` should be set to `ON` for CUDA >= 11.4")
-  endif ()
 endif (USE_CUDA)

 if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND
@@ -4,3 +4,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 README.md
+CMakeLists.txt
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.5.1.1
-Date: 2021-10-13
+Version: 1.5.0.2
+Date: 2021-11-19
 Authors@R: c(
     person("Tianqi", "Chen", role = c("aut"),
            email = "tianqi.tchen@gmail.com"),
@@ -148,8 +148,7 @@ From the command line on Linux starting from the XGBoost directory:

   mkdir build
   cd build
-  # For CUDA toolkit >= 11.4, `BUILD_WITH_CUDA_CUB` is required.
-  cmake .. -DUSE_CUDA=ON -DBUILD_WITH_CUDA_CUB=ON
+  cmake .. -DUSE_CUDA=ON
   make -j4

.. note:: Specifying compute capability
@@ -95,13 +95,13 @@ XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as

   shap_interaction_values = model.predict(dtrain, pred_interactions=True)

 See examples `here
-<https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`__.
+<https://github.com/dmlc/xgboost/tree/master/demo/gpu_acceleration>`_.

 Multi-node Multi-GPU Training
 =============================
 XGBoost supports fully distributed GPU training using `Dask <https://dask.org/>`_. For
 getting started see our tutorial :doc:`/tutorials/dask` and worked examples `here
-<https://github.com/dmlc/xgboost/tree/master/demo/dask>`__, also Python documentation
+<https://github.com/dmlc/xgboost/tree/master/demo/dask>`_, also Python documentation
 :ref:`dask_api` for complete reference.
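For readers following the gpu/index.rst hunk above, here is a minimal sketch of the GPU SHAP workflow that the quoted passage documents. The synthetic data, the `gpu_hist` parameter choice, and the round count are illustrative assumptions, not part of the diff.

```python
# Hedged sketch: synthetic data and parameter values are assumptions.
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X, y = rng.random((256, 8)), rng.random(256)
dtrain = xgb.DMatrix(X, label=y)
model = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)

# Per-feature SHAP values and pairwise SHAP interaction values, computed on
# the GPU via GPUTreeShap as described in the documentation above.
shap_values = model.predict(dtrain, pred_contribs=True)
shap_interaction_values = model.predict(dtrain, pred_interactions=True)
```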
@@ -238,7 +238,7 @@ Working memory is allocated inside the algorithm proportional to the number of r

 The quantile finding algorithm also uses some amount of working device memory. It is able to operate in batches, but is not currently well optimised for sparse data.

-If you are getting out-of-memory errors on a big dataset, try the :doc:`external memory version </tutorials/external_memory>`.
+If you are getting out-of-memory errors on a big dataset, try the `external memory version <../tutorials/external_memory.html>`_.

 Developer notes
 ===============
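The out-of-memory advice above points at XGBoost's external memory mode. A hedged sketch of how it is typically invoked from the Python package in the 1.5 era; the file name is a placeholder assumption.

```python
# Hedged sketch: "train.libsvm" is a placeholder path. Appending
# "#dtrain.cache" asks XGBoost to stage parsed data in on-disk cache files
# instead of holding everything in memory.
import xgboost as xgb

dtrain = xgb.DMatrix("train.libsvm#dtrain.cache")
```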
@@ -79,7 +79,7 @@ The first thing in data transformation is to load the dataset as Spark's structu
       StructField("class", StringType, true)))
   val rawInput = spark.read.schema(schema).csv("input_path")

-At the first line, we create an instance of `SparkSession <https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession>`_, which is the entry point of any Spark program working with DataFrames. The ``schema`` variable defines the schema of the DataFrame wrapping the Iris data. With this explicitly set schema, we can define the columns' names as well as their types; otherwise the column names would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in CSV reader to load the Iris CSV file as a DataFrame named ``rawInput``.
+At the first line, we create an instance of `SparkSession <http://spark.apache.org/docs/latest/sql-programming-guide.html#starting-point-sparksession>`_, which is the entry point of any Spark program working with DataFrames. The ``schema`` variable defines the schema of the DataFrame wrapping the Iris data. With this explicitly set schema, we can define the columns' names as well as their types; otherwise the column names would be the default ones derived by Spark, such as ``_col0``, etc. Finally, we can use Spark's built-in CSV reader to load the Iris CSV file as a DataFrame named ``rawInput``.

 Spark also contains many built-in readers for other formats. The latest version of Spark supports CSV, JSON, Parquet, and LIBSVM.
@@ -130,7 +130,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri
 Dealing with missing values
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

-XGBoost supports missing values by default (`as described here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-values>`_).
+XGBoost supports missing values by default (`as described here <https://xgboost.readthedocs.io/en/latest/faq.html#how-to-deal-with-missing-value>`_).
 If given a SparseVector, XGBoost will treat any values absent from the SparseVector as missing. You are also able to
 specify to XGBoost to treat a specific value in your Dataset as if it was a missing value. By default XGBoost will treat NaN as the value representing missing.
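The passage above is about the JVM package, but the same missing-value semantics are easy to demonstrate from the Python binding. A hedged sketch; the array values are illustrative assumptions.

```python
# Hedged sketch: tell XGBoost to treat 0.0 as missing instead of the
# default NaN.
import numpy as np
import xgboost as xgb

X = np.array([[1.0, 0.0], [0.0, 2.0]])
dtrain = xgb.DMatrix(X, label=np.array([0.0, 1.0]), missing=0.0)
```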
@@ -369,7 +369,7 @@ Then we can load this model with single node Python XGBoost:

 When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following:

-1. Build XGBoost4J-Spark with the steps described in :ref:`here <install_jvm_packages>`, but turning the `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc., in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with an HDFS path.
+1. Build XGBoost4J-Spark with the steps described in `here <https://xgboost.readthedocs.io/en/latest/jvm/index.html#installation-from-source>`_, but turning the `USE_HDFS <https://github.com/dmlc/xgboost/blob/e939192978a0c152ad7b49b744630e99d54cffa8/jvm-packages/create_jni.py#L18>`_ (or USE_S3, etc., in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with an HDFS path.

   - However, if you build with USE_HDFS, etc., you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option.
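The hunk header above references loading a Spark-trained model with single-node Python XGBoost. A hedged sketch of that flow; the path is a placeholder standing in for the tutorial's ``nativeModelPath``.

```python
# Hedged sketch: "/tmp/nativeModel" is an assumed path for the model file
# written out by XGBoost4J-Spark in native format.
import xgboost as xgb

bst = xgb.Booster(model_file="/tmp/nativeModel")
```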
@@ -366,8 +366,8 @@ Specify the learning task and the corresponding learning objective. The objectiv
 - ``rank:pairwise``: Use LambdaMART to perform pairwise ranking where the pairwise loss is minimized
 - ``rank:ndcg``: Use LambdaMART to perform list-wise ranking where `Normalized Discounted Cumulative Gain (NDCG) <http://en.wikipedia.org/wiki/NDCG>`_ is maximized
 - ``rank:map``: Use LambdaMART to perform list-wise ranking where `Mean Average Precision (MAP) <http://en.wikipedia.org/wiki/Mean_average_precision#Mean_average_precision>`_ is maximized
-- ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Occurrence_and_applications>`_.
-- ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Occurrence_and_applications>`_.
+- ``reg:gamma``: gamma regression with log-link. Output is a mean of gamma distribution. It might be useful, e.g., for modeling insurance claims severity, or for any outcome that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Applications>`_.
+- ``reg:tweedie``: Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be `Tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Applications>`_.

 * ``base_score`` [default=0.5]
@@ -390,7 +390,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - ``error@t``: a binary classification threshold different from 0.5 can be specified by providing a numerical value through 't'.
   - ``merror``: Multiclass classification error rate. It is calculated as ``#(wrong cases)/#(all cases)``.
   - ``mlogloss``: `Multiclass logloss <http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html>`_.
-  - ``auc``: `Receiver Operating Characteristic Area under the Curve <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
+  - ``auc``: `Receiver Operating Characteristic Area under the Curve <http://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_curve>`_.
     Available for classification and learning-to-rank tasks.

   - When used with binary classification, the objective should be ``binary:logistic`` or similar functions that work on probability.
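As a small illustration of how the objectives and metrics listed above are passed in practice, a hedged Python sketch follows; the specific choices are arbitrary assumptions.

```python
# Hedged sketch: parameter values chosen only to exercise the listed options.
params = {
    "objective": "rank:ndcg",             # list-wise ranking, maximizes NDCG
    "eval_metric": ["auc", "error@0.6"],  # AUC plus a 0.6-threshold error rate
}
```

The ``error@t`` form embeds the threshold directly in the metric name, which is why it is supplied as a string rather than a separate parameter.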
@@ -11,7 +11,7 @@ In order to run a XGBoost job in a Kubernetes cluster, perform the following ste

 1. Install XGBoost Operator on the Kubernetes cluster.

-   a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#install-xgboost-operator>`_ to install XGBoost Operator.
+   a. XGBoost Operator is designed to manage the scheduling and monitoring of XGBoost jobs. Follow `this installation guide <https://github.com/kubeflow/xgboost-operator#installing-xgboost-operator>`_ to install XGBoost Operator.

 2. Write application code that will be executed by the XGBoost Operator.
@@ -227,15 +227,15 @@ XGBoost has a function called ``dump_model`` in Booster object, which lets you t
 the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary
 use case for it is for model interpretation or visualization, and is not supposed to be
 loaded back to XGBoost. The JSON version has a `schema
-<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`__. See next section for
+<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`_. See next section for
 more info.

 ***********
 JSON Schema
 ***********

-Another important feature of JSON format is a documented `schema
-<https://json-schema.org/>`__, based on which one can easily reuse the output model from
+Another important feature of JSON format is a documented `Schema
+<https://json-schema.org/>`_, based on which one can easily reuse the output model from
 XGBoost. Here is the initial draft of JSON schema for the output model (not
 serialization, which will not be stable as noted above). It's subject to change due to
 the beta status. For an example of parsing XGBoost tree model, see ``/demo/json-model``.
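A brief hedged sketch of the ``dump_model`` call discussed above; the booster ``bst`` and the output file name are assumptions.

```python
# Hedged sketch: dump a trained booster for inspection only; as the
# documentation above notes, the dump is not meant to be loaded back.
bst.dump_model("model_dump.json", dump_format="json")
trees = bst.get_dump(dump_format="json")  # one JSON string per tree
```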
@@ -6,7 +6,7 @@

   <groupId>ml.dmlc</groupId>
   <artifactId>xgboost-jvm_2.12</artifactId>
-  <version>1.5.1</version>
+  <version>1.5.0</version>
   <packaging>pom</packaging>
   <name>XGBoost JVM Package</name>
   <description>JVM Package for XGBoost</description>

@@ -6,10 +6,10 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j-example_2.12</artifactId>
-  <version>1.5.1</version>
+  <version>1.5.0</version>
   <packaging>jar</packaging>
   <build>
     <plugins>

@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-      <version>1.5.1</version>
+      <version>1.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>

@@ -37,7 +37,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-      <version>1.5.1</version>
+      <version>1.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>

@@ -6,10 +6,10 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j-flink_2.12</artifactId>
-  <version>1.5.1</version>
+  <version>1.5.0</version>
   <build>
     <plugins>
       <plugin>

@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.5.1</version>
+      <version>1.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>

@@ -6,10 +6,10 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j-gpu_2.12</artifactId>
-  <version>1.5.1</version>
+  <version>1.5.0</version>
   <packaging>jar</packaging>

   <properties>

@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j-spark-gpu_2.12</artifactId>
   <build>

@@ -24,7 +24,7 @@
     <dependency>
      <groupId>ml.dmlc</groupId>
      <artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
-      <version>1.5.1</version>
+      <version>1.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>

@@ -6,7 +6,7 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j-spark_2.12</artifactId>
   <build>

@@ -24,7 +24,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.5.1</version>
+      <version>1.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -17,13 +17,11 @@
 package ml.dmlc.xgboost4j.scala.spark.params

 import org.apache.hadoop.fs.Path

 import org.apache.spark.SparkContext
 import org.apache.spark.ml.param.{ParamPair, Params}
+import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
-import org.json4s.{JArray, JBool, JDouble, JField, JInt, JNothing, JObject, JString, JValue}
-
-import JsonDSLXGBoost._
+import org.json4s.{JObject, _}

 // This originates from apache-spark DefaultPramsWriter copy paste
 private[spark] object DefaultXGBoostParamsWriter {

@@ -89,62 +87,3 @@ private[spark] object DefaultXGBoostParamsWriter {
     metadataJson
   }
 }
-
-// Fix json4s bin-incompatible issue.
-// This originates from org.json4s.JsonDSL of 3.6.6
-object JsonDSLXGBoost {
-
-  implicit def seq2jvalue[A](s: Iterable[A])(implicit ev: A => JValue): JArray =
-    JArray(s.toList.map(ev))
-
-  implicit def map2jvalue[A](m: Map[String, A])(implicit ev: A => JValue): JObject =
-    JObject(m.toList.map { case (k, v) => JField(k, ev(v)) })
-
-  implicit def option2jvalue[A](opt: Option[A])(implicit ev: A => JValue): JValue = opt match {
-    case Some(x) => ev(x)
-    case None => JNothing
-  }
-
-  implicit def short2jvalue(x: Short): JValue = JInt(x)
-  implicit def byte2jvalue(x: Byte): JValue = JInt(x)
-  implicit def char2jvalue(x: Char): JValue = JInt(x)
-  implicit def int2jvalue(x: Int): JValue = JInt(x)
-  implicit def long2jvalue(x: Long): JValue = JInt(x)
-  implicit def bigint2jvalue(x: BigInt): JValue = JInt(x)
-  implicit def double2jvalue(x: Double): JValue = JDouble(x)
-  implicit def float2jvalue(x: Float): JValue = JDouble(x.toDouble)
-  implicit def bigdecimal2jvalue(x: BigDecimal): JValue = JDouble(x.doubleValue)
-  implicit def boolean2jvalue(x: Boolean): JValue = JBool(x)
-  implicit def string2jvalue(x: String): JValue = JString(x)
-
-  implicit def symbol2jvalue(x: Symbol): JString = JString(x.name)
-  implicit def pair2jvalue[A](t: (String, A))(implicit ev: A => JValue): JObject =
-    JObject(List(JField(t._1, ev(t._2))))
-  implicit def list2jvalue(l: List[JField]): JObject = JObject(l)
-  implicit def jobject2assoc(o: JObject): JsonListAssoc = new JsonListAssoc(o.obj)
-  implicit def pair2Assoc[A](t: (String, A))(implicit ev: A => JValue): JsonAssoc[A] =
-    new JsonAssoc(t)
-}
-
-final class JsonAssoc[A](private val left: (String, A)) extends AnyVal {
-  def ~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject = {
-    val l: JValue = ev1(left._2)
-    val r: JValue = ev2(right._2)
-    JObject(JField(left._1, l) :: JField(right._1, r) :: Nil)
-  }
-
-  def ~(right: JObject)(implicit ev: A => JValue): JObject = {
-    val l: JValue = ev(left._2)
-    JObject(JField(left._1, l) :: right.obj)
-  }
-  def ~~[B](right: (String, B))(implicit ev1: A => JValue, ev2: B => JValue): JObject =
-    this.~(right)
-  def ~~(right: JObject)(implicit ev: A => JValue): JObject = this.~(right)
-}
-
-final class JsonListAssoc(private val left: List[JField]) extends AnyVal {
-  def ~(right: (String, JValue)): JObject = JObject(left ::: List(JField(right._1, right._2)))
-  def ~(right: JObject): JObject = JObject(left ::: right.obj)
-  def ~~(right: (String, JValue)): JObject = this.~(right)
-  def ~~(right: JObject): JObject = this.~(right)
-}
@@ -6,10 +6,10 @@
   <parent>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.5.1</version>
+    <version>1.5.0</version>
   </parent>
   <artifactId>xgboost4j_2.12</artifactId>
-  <version>1.5.1</version>
+  <version>1.5.0</version>
   <packaging>jar</packaging>

   <dependencies>
@@ -1 +1 @@
-1.5.1
+1.5.0
@@ -386,7 +386,7 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
             raise exc  # pylint: disable=raising-bad-type

     def __del__(self) -> None:
-        assert self._temporary_data is None
+        assert self._temporary_data is None, self._temporary_data
         assert self._exception is None

     def _reset_wrapper(self, this: None) -> None:  # pylint: disable=unused-argument

@@ -410,19 +410,19 @@ class DataIter:  # pylint: disable=too-many-instance-attributes
         feature_names: Optional[List[str]] = None,
         feature_types: Optional[List[str]] = None,
         **kwargs: Any,
-    ) -> None:
+    ):
         from .data import dispatch_proxy_set_data
         from .data import _proxy_transform

-        new, cat_codes, feature_names, feature_types = _proxy_transform(
+        transformed, feature_names, feature_types = _proxy_transform(
            data,
            feature_names,
            feature_types,
            self._enable_categorical,
        )
        # Stage the data, meta info are copied inside C++ MetaInfo.
-        self._temporary_data = (new, cat_codes)
-        dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
+        self._temporary_data = transformed
+        dispatch_proxy_set_data(self.proxy, transformed, self._allow_host)
        self.proxy.set_info(
            feature_names=feature_names,
            feature_types=feature_types,

@@ -1103,7 +1103,7 @@ class _ProxyDMatrix(DMatrix):
         self.handle = ctypes.c_void_p()
         _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

-    def _set_data_from_cuda_interface(self, data) -> None:
+    def _set_data_from_cuda_interface(self, data):
         """Set data from CUDA array interface."""
         interface = data.__cuda_array_interface__
         interface_str = bytes(json.dumps(interface, indent=2), "utf-8")

@@ -1111,11 +1111,11 @@ class _ProxyDMatrix(DMatrix):
             _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
         )

-    def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
+    def _set_data_from_cuda_columnar(self, data):
         """Set data from CUDA columnar format."""
         from .data import _cudf_array_interfaces

-        interfaces_str = _cudf_array_interfaces(data, cat_codes)
+        _, interfaces_str = _cudf_array_interfaces(data)
         _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))

     def _set_data_from_array(self, data: np.ndarray):

@@ -1805,7 +1805,7 @@ class Booster(object):
         .. note::

             See `Prediction
-            <https://xgboost.readthedocs.io/en/latest/prediction.html>`_
+            <https://xgboost.readthedocs.io/en/latest/tutorials/prediction.html>`_
             for issues like thread safety and a summary of outputs from this function.

         Parameters

@@ -1986,6 +1986,13 @@ class Booster(object):
         preds = ctypes.POINTER(ctypes.c_float)()

         # once caching is supported, we can pass id(data) as cache id.
+        try:
+            import pandas as pd
+
+            if isinstance(data, pd.DataFrame):
+                data = data.values
+        except ImportError:
+            pass
         args = {
             "type": 0,
             "training": False,

@@ -2020,20 +2027,7 @@ class Booster(object):
                 f"got {data.shape[1]}"
             )

-        from .data import _is_pandas_df, _transform_pandas_df
         from .data import _array_interface
-        if (
-            _is_pandas_df(data)
-            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
-        ):
-            ft = self.feature_types
-            if ft is None:
-                enable_categorical = False
-            else:
-                enable_categorical = any(f == "c" for f in ft)
-        if _is_pandas_df(data):
-            data, _, _ = _transform_pandas_df(data, enable_categorical)
-
         if isinstance(data, np.ndarray):
             from .data import _ensure_np_dtype
             data, _ = _ensure_np_dtype(data, data.dtype)

@@ -2086,11 +2080,9 @@ class Booster(object):
             )
             return _prediction_output(shape, dims, preds, True)
         if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
-            from .data import _cudf_array_interfaces, _transform_cudf_df
-            data, cat_codes, _, _ = _transform_cudf_df(
-                data, None, None, enable_categorical
-            )
-            interfaces_str = _cudf_array_interfaces(data, cat_codes)
+            from .data import _cudf_array_interfaces
+
+            _, interfaces_str = _cudf_array_interfaces(data)
             _check_call(
                 _LIB.XGBoosterPredictFromCudaColumnar(
                     self.handle,
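The core.py hunks above change how ``inplace_predict`` handles pandas inputs: release_1.5.0 silently converted a DataFrame via ``data.values``, while v1.5.1 routes it through ``_transform_pandas_df`` so categorical dtypes are honoured. A hedged sketch of the user-facing call; the data and round count are illustrative assumptions.

```python
# Hedged sketch: inplace prediction straight from a pandas DataFrame.
import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"a": [0.0, 1.0], "b": [2.0, 3.0]})
dtrain = xgb.DMatrix(df, label=np.array([0.0, 1.0]))
bst = xgb.train({}, dtrain, num_boost_round=2)
preds = bst.inplace_predict(df)  # no DMatrix construction on the predict path
```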
@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-branches, too-many-lines
+# pylint: disable=too-many-arguments, too-many-branches
 # pylint: disable=too-many-return-statements, import-error
 '''Data dispatching for DMatrix.'''
 import ctypes

@@ -12,7 +12,7 @@ import numpy as np
 from .core import c_array, _LIB, _check_call, c_str
 from .core import _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix
-from .compat import lazy_isinstance, DataFrame
+from .compat import lazy_isinstance

 c_bst_ulong = ctypes.c_uint64  # pylint: disable=invalid-name

@@ -217,48 +217,36 @@ _pandas_dtype_mapper = {
 }


-def _invalid_dataframe_dtype(data) -> None:
-    # pandas series has `dtypes` but it's just a single object
-    # cudf series doesn't have `dtypes`.
-    if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
-        bad_fields = [
-            str(data.columns[i])
-            for i, dtype in enumerate(data.dtypes)
-            if dtype.name not in _pandas_dtype_mapper
-        ]
-        err = " Invalid columns:" + ", ".join(bad_fields)
-    else:
-        err = ""
-
-    msg = """DataFrame.dtypes for data must be int, float, bool or category.  When
-    categorical type is supplied, DMatrix parameter `enable_categorical` must
-    be set to `True`.""" + err
-    raise ValueError(msg)
-
-
 def _transform_pandas_df(
-    data: DataFrame,
+    data,
     enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
-    meta: Optional[str] = None,
-    meta_type: Optional[str] = None,
-) -> Tuple[np.ndarray, Optional[List[str]], Optional[List[str]]]:
+    meta=None,
+    meta_type=None,
+):
     import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype

-    if not all(
-        dtype.name in _pandas_dtype_mapper
-        or is_sparse(dtype)
-        or (is_categorical_dtype(dtype) and enable_categorical)
-        for dtype in data.dtypes
-    ):
-        _invalid_dataframe_dtype(data)
+    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
+               (is_categorical_dtype(dtype) and enable_categorical)
+               for dtype in data.dtypes):
+        bad_fields = [
+            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
+            if dtype.name not in _pandas_dtype_mapper
+        ]
+
+        msg = """DataFrame.dtypes for data must be int, float, bool or category.  When
+        categorical type is supplied, DMatrix parameter `enable_categorical` must
+        be set to `True`."""
+        raise ValueError(msg + ', '.join(bad_fields))

     # handle feature names
     if feature_names is None and meta is None:
         if isinstance(data.columns, pd.MultiIndex):
-            feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
+            feature_names = [
+                ' '.join([str(x) for x in i]) for i in data.columns
+            ]
         elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
@@ -275,24 +263,21 @@ def _transform_pandas_df(
         else:
             feature_types.append(_pandas_dtype_mapper[dtype.name])

-    # handle category codes.
+    # handle categorical codes.
     transformed = pd.DataFrame()
     if enable_categorical:
         for i, dtype in enumerate(data.dtypes):
             if is_categorical_dtype(dtype):
-                # pandas uses -1 as default missing value for categorical data
-                transformed[data.columns[i]] = (
-                    data[data.columns[i]]
-                    .cat.codes.astype(np.float32)
-                    .replace(-1.0, np.NaN)
-                )
+                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
             else:
                 transformed[data.columns[i]] = data[data.columns[i]]
     else:
         transformed = data

     if meta and len(data.columns) > 1:
-        raise ValueError(f"DataFrame for {meta} cannot have multiple columns")
+        raise ValueError(
+            f"DataFrame for {meta} cannot have multiple columns"
+        )

     dtype = meta_type if meta_type else np.float32
     arr = transformed.values

@@ -302,7 +287,7 @@ def _transform_pandas_df(


 def _from_pandas_df(
-    data: DataFrame,
+    data,
     enable_categorical: bool,
     missing,
     nthread,

@@ -315,7 +300,6 @@ def _from_pandas_df(
                            feature_types)


-
 def _is_pandas_series(data):
     try:
         import pandas as pd

@@ -334,26 +318,13 @@ def _is_modin_series(data):

 def _from_pandas_series(
     data,
-    missing: float,
-    nthread: int,
-    enable_categorical: bool,
+    missing,
+    nthread,
     feature_names: Optional[List[str]],
     feature_types: Optional[List[str]],
 ):
-    from pandas.api.types import is_categorical_dtype
-
-    if (data.dtype.name not in _pandas_dtype_mapper) and not (
-        is_categorical_dtype(data.dtype) and enable_categorical
-    ):
-        _invalid_dataframe_dtype(data)
-    if enable_categorical and is_categorical_dtype(data.dtype):
-        data = data.cat.codes
     return _from_numpy_array(
-        data.values.reshape(data.shape[0], 1).astype("float"),
-        missing,
-        nthread,
-        feature_names,
-        feature_types,
+        data.values.astype("float"), missing, nthread, feature_names, feature_types
     )
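The ``_transform_pandas_df`` hunk above appears to be the heart of the 1.5.1 categorical fix: pandas encodes a missing category as code ``-1``, so v1.5.1 casts the codes to float and maps ``-1`` back to NaN instead of feeding it to the booster as a real category. A hedged standalone illustration of just that behaviour:

```python
# Hedged sketch: reproduces the -1 -> NaN conversion shown in the hunk above.
import numpy as np
import pandas as pd

s = pd.Series(["a", "b", np.nan], dtype="category")
print(s.cat.codes.tolist())  # [0, 1, -1]; -1 marks the missing entry
codes = s.cat.codes.astype(np.float32).replace(-1.0, np.nan)
print(codes.tolist())        # [0.0, 1.0, nan]
```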
@@ -457,7 +428,7 @@ def _is_cudf_df(data):
     return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame)


-def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
+def _cudf_array_interfaces(data) -> Tuple[list, bytes]:
     """Extract CuDF __cuda_array_interface__.  This is special as it returns a new list of
     data and a list of array interfaces.  The data is list of categorical codes that
     caller can safely ignore, but have to keep their reference alive until usage of array

@@ -469,27 +440,23 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

+    cat_codes = []
     interfaces = []
     if _is_cudf_ser(data):
-        if is_categorical_dtype(data.dtype):
-            interface = cat_codes[0].__cuda_array_interface__
-        else:
-            interface = data.__cuda_array_interface__
-        if "mask" in interface:
-            interface["mask"] = interface["mask"].__cuda_array_interface__
-        interfaces.append(interface)
+        interfaces.append(data.__cuda_array_interface__)
     else:
-        for i, col in enumerate(data):
+        for col in data:
             if is_categorical_dtype(data[col].dtype):
-                codes = cat_codes[i]
+                codes = data[col].cat.codes
                 interface = codes.__cuda_array_interface__
+                cat_codes.append(codes)
             else:
                 interface = data[col].__cuda_array_interface__
             if "mask" in interface:
                 interface["mask"] = interface["mask"].__cuda_array_interface__
             interfaces.append(interface)
     interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8")
-    return interfaces_str
+    return cat_codes, interfaces_str


 def _transform_cudf_df(
@@ -503,57 +470,25 @@ def _transform_cudf_df(
     except ImportError:
         from cudf.utils.dtypes import is_categorical_dtype

-    if _is_cudf_ser(data):
-        dtypes = [data.dtype]
-    else:
-        dtypes = data.dtypes
-
-    if not all(
-        dtype.name in _pandas_dtype_mapper
-        or (is_categorical_dtype(dtype) and enable_categorical)
-        for dtype in dtypes
-    ):
-        _invalid_dataframe_dtype(data)
-
     # handle feature names
     if feature_names is None:
         if _is_cudf_ser(data):
             feature_names = [data.name]
         elif lazy_isinstance(data.columns, "cudf.core.multiindex", "MultiIndex"):
             feature_names = [" ".join([str(x) for x in i]) for i in data.columns]
-        elif (
-            lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex")
-            or lazy_isinstance(data.columns, "cudf.core.index", "Int64Index")
-            # Unique to cuDF, no equivalence in pandas 1.3.3
-            or lazy_isinstance(data.columns, "cudf.core.index", "Int32Index")
-        ):
+        elif lazy_isinstance(data.columns, "cudf.core.index", "RangeIndex"):
             feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()

     # handle feature types
     if feature_types is None:
         feature_types = []
         if _is_cudf_ser(data):
             dtypes = [data.dtype]
         else:
             dtypes = data.dtypes
         for dtype in dtypes:
             if is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])

-    # handle categorical data
-    cat_codes = []
-    if _is_cudf_ser(data):
-        # unlike pandas, cuDF uses NA for missing data.
-        if is_categorical_dtype(data.dtype) and enable_categorical:
-            codes = data.cat.codes
-            cat_codes.append(codes)
-    else:
-        for col in data:
-            if is_categorical_dtype(data[col].dtype) and enable_categorical:
-                codes = data[col].cat.codes
-                cat_codes.append(codes)
-
-    return data, cat_codes, feature_names, feature_types
+    return data, feature_names, feature_types


 def _from_cudf_df(
@@ -564,10 +499,10 @@ def _from_cudf_df(
     feature_types: Optional[List[str]],
     enable_categorical: bool,
 ) -> Tuple[ctypes.c_void_p, Any, Any]:
-    data, cat_codes, feature_names, feature_types = _transform_cudf_df(
+    data, feature_names, feature_types = _transform_cudf_df(
         data, feature_names, feature_types, enable_categorical
     )
-    interfaces_str = _cudf_array_interfaces(data, cat_codes)
+    _, interfaces_str = _cudf_array_interfaces(data)
     handle = ctypes.c_void_p()
     config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
     _check_call(

@@ -772,9 +707,8 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_pandas_series(data):
-        return _from_pandas_series(
-            data, missing, threads, enable_categorical, feature_names, feature_types
-        )
+        return _from_pandas_series(data, missing, threads, feature_names,
+                                   feature_types)
     if _is_cudf_df(data) or _is_cudf_ser(data):
         return _from_cudf_df(
             data, missing, threads, feature_names, feature_types, enable_categorical

@@ -798,9 +732,8 @@ def dispatch_data_backend(
         return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_modin_series(data):
-        return _from_pandas_series(
-            data, missing, threads, enable_categorical, feature_names, feature_types
-        )
+        return _from_pandas_series(data, missing, threads, feature_names,
+                                   feature_types)
     if _has_array_protocol(data):
         array = np.asarray(data)
         return _from_numpy_array(array, missing, threads, feature_names, feature_types)

@@ -933,7 +866,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_dt(data, name, dtype, handle)
         return
     if _is_modin_df(data):
-        data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(
+            data, False, meta=name, meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_modin_series(data):

@@ -983,38 +917,30 @@ def _proxy_transform(
         )
     if _is_cupy_array(data):
         data = _transform_cupy_array(data)
-        return data, None, feature_names, feature_types
+        return data, feature_names, feature_types
     if _is_dlpack(data):
-        return _transform_dlpack(data), None, feature_names, feature_types
+        return _transform_dlpack(data), feature_names, feature_types
     if _is_numpy_array(data):
-        return data, None, feature_names, feature_types
+        return data, feature_names, feature_types
     if _is_scipy_csr(data):
-        return data, None, feature_names, feature_types
+        return data, feature_names, feature_types
     if _is_pandas_df(data):
         arr, feature_names, feature_types = _transform_pandas_df(
             data, enable_categorical, feature_names, feature_types
         )
-        return arr, None, feature_names, feature_types
+        return arr, feature_names, feature_types
     raise TypeError("Value type is not supported for data iterator:" + str(type(data)))


-def dispatch_proxy_set_data(
-    proxy: _ProxyDMatrix,
-    data: Any,
-    cat_codes: Optional[list],
-    allow_host: bool,
-) -> None:
+def dispatch_proxy_set_data(proxy: _ProxyDMatrix, data: Any, allow_host: bool) -> None:
     """Dispatch for DeviceQuantileDMatrix."""
     if not _is_cudf_ser(data) and not _is_pandas_series(data):
         _check_data_shape(data)

     if _is_cudf_df(data):
-        # pylint: disable=W0212
-        proxy._set_data_from_cuda_columnar(data, cat_codes)
+        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
         return
     if _is_cudf_ser(data):
-        # pylint: disable=W0212
-        proxy._set_data_from_cuda_columnar(data, cat_codes)
+        proxy._set_data_from_cuda_columnar(data)  # pylint: disable=W0212
         return
     if _is_cupy_array(data):
         proxy._set_data_from_cuda_interface(data)  # pylint: disable=W0212
@@ -144,7 +144,7 @@ class RabitTracker(object):
     """

     def __init__(
-        self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = False
+        self, hostIP, nslave, port=9091, port_end=9999, use_logger: bool = True
     ) -> None:
         """A Python implementation of RABIT tracker.

@@ -384,17 +384,16 @@ def start_rabit_tracker(args):
     ----------
     args: arguments to start the rabit tracker.
     """
-    envs = {"DMLC_NUM_WORKER": args.num_workers, "DMLC_NUM_SERVER": args.num_servers}
-    rabit = RabitTracker(
-        hostIP=get_host_ip(args.host_ip), nslave=args.num_workers, use_logger=True
-    )
+    envs = {'DMLC_NUM_WORKER': args.num_workers,
+            'DMLC_NUM_SERVER': args.num_servers}
+    rabit = RabitTracker(hostIP=get_host_ip(args.host_ip), nslave=args.num_workers)
     envs.update(rabit.slave_envs())
     rabit.start(args.num_workers)
-    sys.stdout.write("DMLC_TRACKER_ENV_START\n")
+    sys.stdout.write('DMLC_TRACKER_ENV_START\n')
     # simply write configuration to stdout
     for k, v in envs.items():
         sys.stdout.write(f"{k}={v}\n")
-    sys.stdout.write("DMLC_TRACKER_ENV_END\n")
+    sys.stdout.write('DMLC_TRACKER_ENV_END\n')
     sys.stdout.flush()
     rabit.join()
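For context, a hedged sketch of how the tracker code above is driven. It only uses names visible in the hunk (``RabitTracker``, ``slave_envs``, ``start``, ``join``); the import path, host address, and worker count are assumptions.

```python
# Hedged sketch: stand up a RABIT tracker for two workers on localhost.
from xgboost.tracker import RabitTracker

tracker = RabitTracker(hostIP="127.0.0.1", nslave=2)
envs = tracker.slave_envs()  # DMLC_TRACKER_URI / DMLC_TRACKER_PORT for workers
tracker.start(2)
# ... launch the two workers with `envs` in their environment ...
tracker.join()
```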
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2020-2021 by XGBoost Contributors
+ * Copyright 2020 by XGBoost Contributors
  * \file categorical.h
  */
 #ifndef XGBOOST_COMMON_CATEGORICAL_H_

@@ -42,11 +42,6 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
   return !s_cats.Check(cat);
 }

-inline void CheckCat(bst_cat_t cat) {
-  CHECK_GE(cat, 0) << "Invalid categorical value detected.  Categorical value "
-                      "should be non-negative.";
-}
-
 struct IsCatOp {
   XGBOOST_DEVICE bool operator()(FeatureType ft) {
     return ft == FeatureType::kCategorical;
@@ -711,12 +711,6 @@ constexpr std::pair<int, int> CUDAVersion() {
 constexpr std::pair<int32_t, int32_t> ThrustVersion() {
   return std::make_pair(THRUST_MAJOR_VERSION, THRUST_MINOR_VERSION);
 }
-// Whether do we have thrust 1.x with x >= minor
-template <int32_t minor>
-constexpr bool HasThrustMinorVer() {
-  return (ThrustVersion().first == 1 && ThrustVersion().second >= minor) ||
-         ThrustVersion().first > 1;
-}

 namespace detail {
 template <typename T>

@@ -731,8 +725,10 @@ class TypedDiscard : public thrust::discard_iterator<T> {

 template <typename T>
 using TypedDiscard =
-    std::conditional_t<HasThrustMinorVer<12>(), detail::TypedDiscardCTK114<T>,
-                       detail::TypedDiscard<T>>;
+    std::conditional_t<((ThrustVersion().first == 1 &&
+                         ThrustVersion().second >= 12) ||
+                        ThrustVersion().first > 1),
+                       detail::TypedDiscardCTK114<T>, detail::TypedDiscard<T>>;

 /**
  * \class AllReducer

@@ -1446,39 +1442,24 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
 namespace detail {
 // Wrapper around cub sort for easier `descending` sort.
 template <bool descending, typename KeyT, typename ValueT,
-          typename BeginOffsetIteratorT, typename EndOffsetIteratorT>
+          typename OffsetIteratorT>
 void DeviceSegmentedRadixSortPair(
     void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in,  // NOLINT
     KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out,
-    size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets,
-    EndOffsetIteratorT d_end_offsets, int begin_bit = 0,
+    size_t num_items, size_t num_segments, OffsetIteratorT d_begin_offsets,
+    OffsetIteratorT d_end_offsets, int begin_bit = 0,
     int end_bit = sizeof(KeyT) * 8) {
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in),
                                      d_values_out);
-  // In old version of cub, num_items in dispatch is also int32_t, no way to change.
-  using OffsetT =
-      std::conditional_t<BuildWithCUDACub() && HasThrustMinorVer<13>(), size_t,
-                         int32_t>;
-  CHECK_LE(num_items, std::numeric_limits<OffsetT>::max());
-  safe_cuda((cub::DispatchSegmentedRadixSort<
-             descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT,
-             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
-                                d_values, num_items, num_segments,
-                                d_begin_offsets, d_end_offsets, begin_bit,
-                                end_bit, false, nullptr, false)));
+  // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation
+
+#if (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) || THRUST_MAJOR_VERSION > 1
+  using OffsetT = int32_t;  // num items in dispatch is also int32_t, no way to change.
+  CHECK_LE(num_items, std::numeric_limits<int32_t>::max());
+  safe_cuda((cub::DispatchSegmentedRadixSort<
+             descending, KeyT, ValueT, OffsetIteratorT,
+             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
+                                d_values, num_items, num_segments,
+                                d_begin_offsets, d_end_offsets, begin_bit,
+                                end_bit, false, nullptr, false)));
+#else
+  safe_cuda((cub::DispatchSegmentedRadixSort<
+             descending, KeyT, ValueT, BeginOffsetIteratorT,
+             OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys,
+                                d_values, num_items, num_segments,
+                                d_begin_offsets, d_end_offsets, begin_bit,
+                                end_bit, false, nullptr, false)));
+#endif
 }
 }  // namespace detail
@@ -133,7 +133,6 @@ void RemoveDuplicatedCategories(
     int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
     dh::device_vector<Entry> *p_sorted_entries,
     dh::caching_device_vector<size_t> *p_column_sizes_scan) {
-  info.feature_types.SetDevice(device);
   auto d_feature_types = info.feature_types.ConstDeviceSpan();
   CHECK(!d_feature_types.empty());
   auto &column_sizes_scan = *p_column_sizes_scan;
@@ -124,11 +124,6 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,

 void SortByWeight(dh::device_vector<float>* weights,
                   dh::device_vector<Entry>* sorted_entries);
-
-void RemoveDuplicatedCategories(
-    int32_t device, MetaInfo const &info, Span<bst_row_t> d_cuts_ptr,
-    dh::device_vector<Entry> *p_sorted_entries,
-    dh::caching_device_vector<size_t> *p_column_sizes_scan);
 }  // namespace detail

 // Compute sketch on DMatrix.
@@ -137,10 +132,9 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
                            size_t sketch_batch_num_elements = 0);

 template <typename AdapterBatch>
-void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
-                          int device, size_t columns, size_t begin, size_t end,
-                          float missing, SketchContainer *sketch_container,
-                          int num_cuts) {
+void ProcessSlidingWindow(AdapterBatch const& batch, int device, size_t columns,
+                          size_t begin, size_t end, float missing,
+                          SketchContainer* sketch_container, int num_cuts) {
   // Copy current subset of valid elements into temporary storage and sort
   dh::device_vector<Entry> sorted_entries;
   dh::caching_device_vector<size_t> column_sizes_scan;

@@ -148,7 +142,6 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
       thrust::make_counting_iterator(0llu),
       [=] __device__(size_t idx) { return batch.GetElement(idx); });
   HostDeviceVector<SketchContainer::OffsetT> cuts_ptr;
-  cuts_ptr.SetDevice(device);
   detail::MakeEntriesFromAdapter(batch, batch_iter, {begin, end}, missing,
                                  columns, num_cuts, device,
                                  &cuts_ptr,

@@ -158,14 +151,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());

-  if (sketch_container->HasCategorical()) {
-    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
-  }
-
-  auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();
+  auto const &h_cuts_ptr = cuts_ptr.HostVector();
   // Extract the cuts from all columns concurrently
   sketch_container->Push(dh::ToSpan(sorted_entries),
                          dh::ToSpan(column_sizes_scan), d_cuts_ptr,

@@ -235,12 +222,6 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,

   detail::SortByWeight(&temp_weights, &sorted_entries);

-  if (sketch_container->HasCategorical()) {
-    auto d_cuts_ptr = cuts_ptr.DeviceSpan();
-    detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr,
-                                       &sorted_entries, &column_sizes_scan);
-  }
-
   auto const& h_cuts_ptr = cuts_ptr.ConstHostVector();
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();

@@ -293,8 +274,8 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
                          device, num_cuts_per_feature, false);
   for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
     size_t end = std::min(batch.Size(), size_t(begin + sketch_batch_num_elements));
-    ProcessSlidingWindow(batch, info, device, num_cols, begin, end, missing,
-                         sketch_container, num_cuts_per_feature);
+    ProcessSlidingWindow(batch, device, num_cols,
+                         begin, end, missing, sketch_container, num_cuts_per_feature);
   }
 }
 }
@@ -21,7 +21,6 @@

 #include "array_interface.h"
-#include "../c_api/c_api_error.h"
 #include "../common/math.h"

 namespace xgboost {
 namespace data {

@@ -81,24 +80,6 @@ struct COOTuple {
   float value{0};
 };

-struct IsValidFunctor {
-  float missing;
-
-  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
-
-  XGBOOST_DEVICE bool operator()(float value) const {
-    return !(common::CheckNAN(value) || value == missing);
-  }
-
-  XGBOOST_DEVICE bool operator()(const data::COOTuple& e) const {
-    return !(common::CheckNAN(e.value) || e.value == missing);
-  }
-
-  XGBOOST_DEVICE bool operator()(const Entry& e) const {
-    return !(common::CheckNAN(e.fvalue) || e.fvalue == missing);
-  }
-};
-
 namespace detail {

 /**
@@ -987,19 +987,18 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread

   // Second pass over batch, placing elements in correct position

-  auto is_valid = data::IsValidFunctor{missing};
 #pragma omp parallel num_threads(nthread)
   {
     exec.Run([&]() {
       int tid = omp_get_thread_num();
-      size_t begin = tid * thread_size;
-      size_t end = tid != (nthread - 1) ? (tid + 1) * thread_size : batch_size;
+      size_t begin = tid*thread_size;
+      size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size;
       for (size_t i = begin; i < end; ++i) {
         auto line = batch.GetLine(i);
         for (auto j = 0ull; j < line.Size(); j++) {
           auto element = line.GetElement(j);
           const size_t key = (element.row_idx - base_rowid);
-          if (is_valid(element)) {
+          if (!common::CheckNAN(element.value) && element.value != missing) {
             builder.Push(key, Entry(element.column_idx, element.value), tid);
           }
         }
@@ -15,6 +15,29 @@
 namespace xgboost {
 namespace data {

+struct IsValidFunctor : public thrust::unary_function<Entry, bool> {
+  float missing;
+
+  XGBOOST_DEVICE explicit IsValidFunctor(float missing) : missing(missing) {}
+
+  __device__ bool operator()(float value) const {
+    return !(common::CheckNAN(value) || value == missing);
+  }
+
+  __device__ bool operator()(const data::COOTuple& e) const {
+    if (common::CheckNAN(e.value) || e.value == missing) {
+      return false;
+    }
+    return true;
+  }
+  __device__ bool operator()(const Entry& e) const {
+    if (common::CheckNAN(e.fvalue) || e.fvalue == missing) {
+      return false;
+    }
+    return true;
+  }
+};
+
 class CudfAdapterBatch : public detail::NoMetaInfo {
   friend class CudfAdapter;
@@ -152,7 +152,6 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin

   if (batches == 1) {
     this->info_ = std::move(proxy->Info());
-    this->info_.num_nonzero_ = nnz;
     CHECK_EQ(proxy->Info().labels_.Size(), 0);
   }
@@ -273,7 +273,6 @@ class GBTree : public GradientBooster {
     uint32_t tree_begin, tree_end;
     std::tie(tree_begin, tree_end) =
         detail::LayerToTree(model_, tparam_, layer_begin, layer_end);
-    CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
     std::vector<Predictor const *> predictors{
       cpu_predictor_.get(),
 #if defined(XGBOOST_USE_CUDA)
@@ -585,7 +585,6 @@ struct GPUHistMakerDevice {
     CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
         << "Categorical feature value too large.";
     auto cat = common::AsCat(candidate.split.fvalue);
-    common::CheckCat(cat);
     std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
     LBitField32 cats_bits(split_cats);
     cats_bits.Set(cat);
@@ -392,52 +392,6 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
   EXPECT_GE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required);
 }

-void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
-                                  int32_t num_bins, bool weighted) {
-  auto h_x = GenerateRandomCategoricalSingleColumn(n, num_categories);
-  thrust::device_vector<float> x(h_x);
-  auto adapter = AdapterFromData(x, n, 1);
-  MetaInfo info;
-  info.num_row_ = n;
-  info.num_col_ = 1;
-  info.feature_types.HostVector().push_back(FeatureType::kCategorical);
-
-  if (weighted) {
-    std::vector<float> weights(n, 0);
-    SimpleLCG lcg;
-    SimpleRealUniformDistribution<float> dist(0, 1);
-    for (auto& v : weights) {
-      v = dist(&lcg);
-    }
-    info.weights_.HostVector() = weights;
-  }
-
-  ASSERT_EQ(info.feature_types.Size(), 1);
-  SketchContainer container(info.feature_types, num_bins, 1, n, 0);
-  AdapterDeviceSketch(adapter.Value(), num_bins, info,
-                      std::numeric_limits<float>::quiet_NaN(), &container);
-  HistogramCuts cuts;
-  container.MakeCuts(&cuts);
-
-  thrust::sort(x.begin(), x.end());
-  auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin();
-  ASSERT_NE(n_uniques, x.size());
-  ASSERT_EQ(cuts.TotalBins(), n_uniques);
-  ASSERT_EQ(n_uniques, num_categories);
-
-  auto& values = cuts.cut_values_.HostVector();
-  ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
-  auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
-  ASSERT_TRUE(is_unique);
-
-  x.resize(n_uniques);
-  h_x.resize(n_uniques);
-  thrust::copy(x.begin(), x.end(), h_x.begin());
-  for (decltype(n_uniques) i = 0; i < n_uniques; ++i) {
-    ASSERT_EQ(h_x[i], values[i]);
-  }
-}
-
 TEST(HistUtil, AdapterDeviceSketchCategorical) {
   int categorical_sizes[] = {2, 6, 8, 12};
   int num_bins = 256;

@@ -450,8 +404,6 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) {
     auto adapter = AdapterFromData(x_device, n, 1);
     ValidateBatchedCuts(adapter, num_bins, adapter.NumColumns(),
                         adapter.NumRows(), dmat.get());
-    TestCategoricalSketchAdapter(n, num_categories, num_bins, true);
-    TestCategoricalSketchAdapter(n, num_categories, num_bins, false);
   }
 }
 }
@@ -452,47 +452,4 @@ TEST(GBTree, FeatureScore) {
   test_eq("gain");
   test_eq("cover");
 }
-
-TEST(GBTree, PredictRange) {
-  size_t n_samples = 1000, n_features = 10, n_classes = 4;
-  auto m = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
-
-  std::unique_ptr<Learner> learner{Learner::Create({m})};
-  learner->SetParam("num_class", std::to_string(n_classes));
-
-  learner->Configure();
-  for (size_t i = 0; i < 2; ++i) {
-    learner->UpdateOneIter(i, m);
-  }
-  HostDeviceVector<float> out_predt;
-  ASSERT_THROW(learner->Predict(m, false, &out_predt, 0, 3), dmlc::Error);
-
-  auto m_1 =
-      RandomDataGenerator{n_samples, n_features, 0.5}.GenerateDMatrix(true, false, n_classes);
-  HostDeviceVector<float> out_predt_full;
-  learner->Predict(m_1, false, &out_predt_full, 0, 0);
-  ASSERT_TRUE(std::equal(out_predt.HostVector().begin(), out_predt.HostVector().end(),
-                         out_predt_full.HostVector().begin()));
-
-  {
-    // inplace predict
-    HostDeviceVector<float> raw_storage;
-    auto raw = RandomDataGenerator{n_samples, n_features, 0.5}.GenerateArrayInterface(&raw_storage);
-    std::shared_ptr<data::ArrayAdapter> x{new data::ArrayAdapter{StringView{raw}}};
-
-    HostDeviceVector<float>* out_predt;
-    learner->InplacePredict(x, nullptr, PredictionType::kValue,
-                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 2);
-    auto h_out_predt = out_predt->HostVector();
-    learner->InplacePredict(x, nullptr, PredictionType::kValue,
-                            std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 0);
-    auto h_out_predt_full = out_predt->HostVector();
-
-    ASSERT_TRUE(std::equal(h_out_predt.begin(), h_out_predt.end(), h_out_predt_full.begin()));
-
-    ASSERT_THROW(learner->InplacePredict(x, nullptr, PredictionType::kValue,
-                                         std::numeric_limits<float>::quiet_NaN(), &out_predt, 0, 3),
-                 dmlc::Error);
-  }
-}
 }  // namespace xgboost
@@ -186,37 +186,6 @@ Arrow specification.'''
     assert len(Xy.feature_types) == X.shape[1]
     assert all(t == "c" for t in Xy.feature_types)

-    # test missing value
-    X = cudf.DataFrame({"f0": ["a", "b", np.NaN]})
-    X["f0"] = X["f0"].astype("category")
-    df, cat_codes, _, _ = xgb.data._transform_cudf_df(
-        X, None, None, enable_categorical=True
-    )
-    for col in cat_codes:
-        assert col.has_nulls
-
-    y = [0, 1, 2]
-    with pytest.raises(ValueError):
-        xgb.DMatrix(X, y)
-    Xy = xgb.DMatrix(X, y, enable_categorical=True)
-    assert Xy.num_row() == 3
-    assert Xy.num_col() == 1
-
-    with pytest.raises(ValueError):
-        xgb.DeviceQuantileDMatrix(X, y)
-
-    Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)
-    assert Xy.num_row() == 3
-    assert Xy.num_col() == 1
-
-    X = X["f0"]
-    with pytest.raises(ValueError):
-        xgb.DMatrix(X, y)
-
-    Xy = xgb.DMatrix(X, y, enable_categorical=True)
-    assert Xy.num_row() == 3
-    assert Xy.num_col() == 1
-

 @pytest.mark.skipif(**tm.no_cudf())
 @pytest.mark.skipif(**tm.no_cupy())
@@ -138,22 +138,9 @@ class TestPandas:
             X, enable_categorical=True
         )

         assert np.issubdtype(transformed[:, 0].dtype, np.integer)
         assert transformed[:, 0].min() == 0

-        # test missing value
-        X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
-        X["f0"] = X["f0"].astype("category")
-        arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
-        assert not np.any(arr == -1.0)
-
-        X = X["f0"]
-        with pytest.raises(ValueError):
-            xgb.DMatrix(X, y)
-
-        Xy = xgb.DMatrix(X, y, enable_categorical=True)
-        assert Xy.num_row() == 3
-        assert Xy.num_col() == 1
-
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100