Compare commits

...

10 Commits

Author SHA1 Message Date
Hyunsu Cho
1220024442 Release 1.4.0 2021-04-10 17:42:00 -07:00
Philip Hyunsu Cho
964ee6b605 [CI] Pack R package tarball with pre-built xgboost.so (with GPU support) (#6827) (#6836)
* Add scripts for packaging R package with GPU-enabled libxgboost.so

* [CI] Automatically build R package tarball

* Add comments

* Don't build tarball for pull requests

* Update the installation doc
2021-04-07 22:47:54 -07:00
Jiaming Yuan
04fedefd4d [back port] Use batched copy if. (#6826) (#6834) 2021-04-07 04:50:52 +08:00
Jiaming Yuan
f814d4027a [back port] Remove unnecessary calls to iota. (#6797) (#6833) 2021-04-07 04:47:29 +08:00
Jiaming Yuan
2cc37370e2 [back port] Fix approximated predict contribution. (#6811) (#6832) 2021-04-07 04:47:07 +08:00
Jiaming Yuan
c6a0bdbb5a [back port] More general predict proba. (#6817) (#6831)
* Use `output_margin` for `softmax`.
* Add test for dask binary cls.

Co-authored-by: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2021-04-07 04:46:11 +08:00
Jiaming Yuan
357a78b3de [back port] Optimize dart inplace predict perf. (#6804) (#6829) 2021-04-07 00:21:12 +08:00
Jiaming Yuan
d231e7c35f [back port] Don't estimate sketch batch size when rmm is used. (#6807) (#6830) 2021-04-07 00:16:39 +08:00
Jiaming Yuan
604ae01b7a [back port] Use CPU input for test_boost_from_prediction. (#6818) (#6824) 2021-04-05 18:32:04 +08:00
Hyunsu Cho
43f52ed33c Release 1.4.0 RC1 2021-03-28 01:10:20 +00:00
32 changed files with 437 additions and 138 deletions

Jenkinsfile
View File

@@ -65,6 +65,7 @@ pipeline {
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) },
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
'build-gpu-rpkg': { BuildRPackageWithCUDA(cuda_version: '10.0') },
'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
'build-jvm-doc': { BuildJVMDoc() }
@@ -264,6 +265,24 @@ def BuildCUDA(args) {
}
}
def BuildRPackageWithCUDA(args) {
node('linux && cpu_build') {
unstash name: 'srcs'
def container_type = 'gpu_build_r_centos6'
def docker_binary = "docker"
def docker_args = "--build-arg CUDA_VERSION_ARG=10.0"
if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
sh """
${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_r_pkg_with_cuda.sh ${commit_id}
"""
echo 'Uploading R tarball...'
path = ("${BRANCH_NAME}" == 'master') ? '' : "${BRANCH_NAME}/"
s3Upload bucket: 'xgboost-nightly-builds', path: path, acl: 'PublicRead', includePathPattern:'xgboost_r_gpu_linux_*.tar.gz'
}
deleteDir()
}
}
def BuildJVMPackagesWithCUDA(args) {
node('linux && mgpu') {
unstash name: 'srcs'

View File

@@ -1 +1 @@
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@-SNAPSHOT
@xgboost_VERSION_MAJOR@.@xgboost_VERSION_MINOR@.@xgboost_VERSION_PATCH@

View File

@@ -2,18 +2,15 @@
Installation Guide
##################
.. note:: Pre-built binary wheel for Python
.. note:: Pre-built binary wheel for Python: now with GPU support
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, available from Python Package Index (PyPI). You may download and install it by running
If you are planning to use Python, consider installing XGBoost from a pre-built binary wheel, to avoid the trouble of building XGBoost from the source. You may download and install it by running
.. code-block:: bash
# Ensure that you are downloading one of the following:
# * xgboost-{version}-py2.py3-none-manylinux1_x86_64.whl
# * xgboost-{version}-py2.py3-none-win_amd64.whl
pip3 install xgboost
* The binary wheel will support GPU algorithms (`gpu_hist`) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
* The binary wheel will support the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
* Currently, we provide binary wheels for 64-bit Linux, macOS and Windows.
* Nightly builds are available. You can go to `this page
<https://s3-us-west-2.amazonaws.com/xgboost-nightly-builds/list.html>`_, find the
@@ -23,6 +20,21 @@ Installation Guide
pip install <url to the wheel>
.. note:: (EXPERIMENTAL) Pre-built binary package for R: now with GPU support
If you are planning to use R, consider installing ``{xgboost}`` from a pre-built binary package, to avoid the trouble of building XGBoost from the source. The binary package will let you use the GPU algorithm (``gpu_hist``) out of the box, as long as your machine has NVIDIA GPUs.
Download the binary package from the Releases page. The file name will be of the form ``xgboost_r_gpu_linux_[version].tar.gz``. Then install XGBoost by running:
.. code-block:: bash
# Install dependencies
R -q -e "install.packages(c('data.table', 'magrittr', 'jsonlite', 'remotes'))"
# Install XGBoost
R CMD INSTALL ./xgboost_r_gpu_linux.tar.gz
Currently, we provide the binary package for 64-bit Linux.
****************************
Building XGBoost from source

View File

@@ -740,15 +740,17 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
*
* \param handle Booster handle
* \param dmat DMatrix handle
* \param c_json_config String encoded predict configuration in JSON format.
* \param c_json_config String encoded predict configuration in JSON format, with
* following available fields in the JSON object:
*
* "type": [0, 5]
* "type": [0, 6]
* 0: normal prediction
* 1: output margin
* 2: predict contribution
* 3: predict approxmated contribution
* 3: predict approximated contribution
* 4: predict feature interaction
* 5: predict leaf
* 5: predict approximated feature interaction
* 6: predict leaf
* "training": bool
* Whether the prediction function is used as part of a training loop. **Not used
* for inplace prediction**.
@@ -764,7 +766,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
* "iteration_begin": int
* Beginning iteration of prediction.
* "iteration_end": int
* End iteration of prediction. Set to 0 this will become the size of tree model.
* End iteration of prediction. Set to 0 this will become the size of tree model (all the trees).
* "strict_shape": bool
* Whether we should reshape the output with stricter rules. If set to true,
* normal/margin/contrib/interaction predict will output consistent shape
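As a concrete illustration of the renumbered codes (an editor's sketch, not part of the diff), a predict configuration built in Python might look like this, where 5 now denotes approximated feature interaction and 6 denotes leaf prediction:

import json

# Sketch of the JSON configuration consumed by XGBoosterPredictFromDMatrix,
# using the renumbered type codes from this change.
config = json.dumps({
    "type": 6,             # predict leaf indices
    "training": False,     # not called from inside a training loop
    "iteration_begin": 0,
    "iteration_end": 0,    # 0 means use all trees
    "strict_shape": False,
})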

View File

@@ -36,7 +36,8 @@ enum class PredictionType : std::uint8_t { // NOLINT
kContribution = 2,
kApproxContribution = 3,
kInteraction = 4,
kLeaf = 5
kApproxInteraction = 5,
kLeaf = 6
};
/*! \brief entry to easily hold returned information */

View File

@@ -6,7 +6,7 @@
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
<packaging>pom</packaging>
<name>XGBoost JVM Package</name>
<description>JVM Package for XGBoost</description>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j-example_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
<packaging>jar</packaging>
<build>
<plugins>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j-flink_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
<build>
<plugins>
<plugin>
@@ -26,7 +26,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j-gpu_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j-spark-gpu_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j-gpu_${scala.binary.version}</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j-spark_2.12</artifactId>
<build>
@@ -24,7 +24,7 @@
<dependency>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost4j_${scala.binary.version}</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>

View File

@@ -6,10 +6,10 @@
<parent>
<groupId>ml.dmlc</groupId>
<artifactId>xgboost-jvm_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
</parent>
<artifactId>xgboost4j_2.12</artifactId>
<version>1.4.0-SNAPSHOT</version>
<version>1.4.0</version>
<packaging>jar</packaging>
<dependencies>

View File

@@ -1 +1 @@
1.4.0-SNAPSHOT
1.4.0

View File

@@ -1648,7 +1648,9 @@ class Booster(object):
prediction. Note the final column is the bias term.
approx_contribs :
Approximate the contributions of each feature
Approximate the contributions of each feature. Used when ``pred_contribs`` or
``pred_interactions`` is set to True. Changing the default of this parameter
(False) is not recommended.
pred_interactions :
When this is True the output will be a matrix of size (nsample,
@@ -1720,9 +1722,9 @@ class Booster(object):
if pred_contribs:
assign_type(2 if not approx_contribs else 3)
if pred_interactions:
assign_type(4)
assign_type(4 if not approx_contribs else 5)
if pred_leaf:
assign_type(5)
assign_type(6)
preds = ctypes.POINTER(ctypes.c_float)()
shape = ctypes.POINTER(c_bst_ulong)()
dims = c_bst_ulong()
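As a usage sketch mirroring the new tests later in this compare (assuming a trained Booster named booster and a DMatrix named dmat, both hypothetical names):

# approx_contribs now also applies to pred_interactions (type 5), while
# pred_leaf maps to the new type code 6.
contrib = booster.predict(dmat, pred_contribs=True, approx_contribs=True)
interaction = booster.predict(dmat, pred_interactions=True, approx_contribs=True)
leaves = booster.predict(dmat, pred_leaf=True)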

View File

@@ -21,6 +21,7 @@ from contextlib import contextmanager
from collections import defaultdict
from collections.abc import Sequence
from threading import Thread
from functools import partial, update_wrapper
from typing import TYPE_CHECKING, List, Tuple, Callable, Optional, Any, Union, Dict, Set
from typing import Awaitable, Generator, TypeVar
@@ -967,7 +968,7 @@ def _can_output_df(is_df: bool, output_shape: Tuple) -> bool:
return is_df and len(output_shape) <= 2
async def _direct_predict_impl(
async def _direct_predict_impl( # pylint: disable=too-many-branches
mapped_predict: Callable,
booster: "distributed.Future",
data: _DaskCollection,
@@ -1022,6 +1023,14 @@ async def _direct_predict_impl(
new_axis = list(range(len(output_shape) - 2))
else:
new_axis = [i + 2 for i in range(len(output_shape) - 2)]
if len(output_shape) == 2:
# Somehow dask fails to infer the output shape change for 2-dim prediction,
# and `chunks = (None, output_shape[1])` doesn't work because None is not
# supported in map_blocks.
chunks = list(data.chunks)
chunks[1] = (output_shape[1], )
else:
chunks = None
predictions = da.map_blocks(
mapped_predict,
booster,
@@ -1029,6 +1038,8 @@ async def _direct_predict_impl(
False,
columns,
base_margin_array,
chunks=chunks,
drop_axis=drop_axis,
new_axis=new_axis,
dtype=numpy.float32,
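A standalone sketch of the chunk workaround above, with hypothetical data: when map_blocks changes the width of a 2-dim output, the new chunks are spelled out per input block because dask cannot infer them and None is not accepted:

import numpy as np
import dask.array as da

# Hypothetical example: the mapped function changes each block's width from
# 10 to 3 columns, so the second dimension's chunks are set explicitly.
X = da.random.random((100, 10), chunks=(25, 10))
out_cols = 3  # e.g. number of classes in the prediction output
chunks = list(X.chunks)
chunks[1] = (out_cols,)
pred = da.map_blocks(
    lambda blk: np.zeros((blk.shape[0], out_cols), dtype=np.float32),
    X,
    chunks=chunks,
    dtype=np.float32,
)
assert pred.compute().shape == (100, out_cols)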
@@ -1776,20 +1787,20 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
self,
X: _DaskCollection,
validate_features: bool,
output_margin: bool,
base_margin: Optional[_DaskCollection],
iteration_range: Optional[Tuple[int, int]],
) -> _DaskCollection:
if iteration_range is None:
iteration_range = (0, 0)
predts = await super()._predict_async(
data=X,
output_margin=output_margin,
output_margin=self.objective == "multi:softmax",
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range,
)
return _cls_predict_proba(self.objective, predts, da.vstack)
vstack = update_wrapper(
partial(da.vstack, allow_unknown_chunksizes=True), da.vstack
)
return _cls_predict_proba(getattr(self, "n_classes_", None), predts, vstack)
# pylint: disable=missing-function-docstring
def predict_proba(
@@ -1797,7 +1808,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
X: _DaskCollection,
ntree_limit: Optional[int] = None,
validate_features: bool = True,
output_margin: bool = False,
base_margin: Optional[_DaskCollection] = None,
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
@@ -1808,7 +1818,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
self._predict_proba_async,
X=X,
validate_features=validate_features,
output_margin=output_margin,
base_margin=base_margin,
iteration_range=iteration_range,
)

View File

@@ -4,7 +4,7 @@
import copy
import warnings
import json
from typing import Union, Optional, List, Dict, Callable, Tuple, Any
from typing import Union, Optional, List, Dict, Callable, Tuple, Any, TypeVar
import numpy as np
from .core import Booster, DMatrix, XGBoostError
from .core import _deprecate_positional_args, _convert_ntree_limit
@@ -561,6 +561,8 @@ class XGBModel(XGBModelBase):
self._Booster.load_model(fname)
meta = self._Booster.attr('scikit_learn')
if meta is None:
# FIXME(jiaming): This doesn't have to be a problem as most of the needed
# information like num_class and objective is in Learner class.
warnings.warn(
'Loading a native XGBoost model with Scikit-Learn interface.')
return
@@ -571,6 +573,8 @@ class XGBModel(XGBModelBase):
self._le = XGBoostLabelEncoder()
self._le.from_json(v)
continue
# FIXME(jiaming): This can be removed once label encoder is gone since we can
# generate it from `np.arange(self.n_classes_)`
if k == 'classes_':
self.classes_ = np.array(v)
continue
@@ -1024,17 +1028,14 @@ class XGBModel(XGBModelBase):
return np.array(json.loads(b.get_dump(dump_format='json')[0])['bias'])
def _cls_predict_proba(
objective: Union[str, Callable], prediction: Any, vstack: Callable
) -> Any:
if objective == 'multi:softmax':
raise ValueError('multi:softmax objective does not support predict_proba,'
' use `multi:softprob` or `binary:logistic` instead.')
if objective == 'multi:softprob' or callable(objective):
# Return prediction directly if if objective is defined by user since we don't
# know how to perform the transformation
PredtT = TypeVar("PredtT")
def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) -> PredtT:
assert len(prediction.shape) <= 2
if len(prediction.shape) == 2 and prediction.shape[1] == n_classes:
return prediction
# Lastly the binary logistic function
# binary logistic function
classone_probs = prediction
classzero_probs = 1.0 - classone_probs
return vstack((classzero_probs, classone_probs)).transpose()
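A tiny NumPy illustration of the binary expansion above, with hypothetical probabilities:

import numpy as np

# A 1-d vector of positive-class probabilities becomes an (n_samples, 2)
# matrix whose rows sum to 1.
classone_probs = np.array([0.2, 0.9, 0.5], dtype=np.float32)
classzero_probs = 1.0 - classone_probs
proba = np.vstack((classzero_probs, classone_probs)).transpose()
# proba[:, 0] is P(y=0), proba[:, 1] is P(y=1)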
@@ -1218,8 +1219,10 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
return class_probs
if len(class_probs.shape) > 1:
# turns softprob into softmax
column_indexes = np.argmax(class_probs, axis=1)
else:
# turns soft logit into class label
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
@@ -1262,15 +1265,23 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
a numpy array of shape (n_samples, n_classes) with the
probability of each data example being of a given class.
"""
# custom obj: Do nothing as we don't know what to do.
# softprob: Do nothing, output is proba.
# softmax: Use output margin to remove the argmax in PredTransform.
# binary:logistic: Expand the prob vector into 2-class matrix after predict.
# binary:logitraw: Unsupported by predict_proba()
class_probs = super().predict(
X=X,
output_margin=False,
output_margin=self.objective == "multi:softmax",
ntree_limit=ntree_limit,
validate_features=validate_features,
base_margin=base_margin,
iteration_range=iteration_range
)
return _cls_predict_proba(self.objective, class_probs, np.vstack)
# If model is loaded from a raw booster there's no `n_classes_`
return _cls_predict_proba(
getattr(self, "n_classes_", None), class_probs, np.vstack
)
def evals_result(self):
"""Return the evaluation results.

View File

@@ -651,13 +651,17 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle,
auto type = PredictionType(get<Integer const>(config["type"]));
auto iteration_begin = get<Integer const>(config["iteration_begin"]);
auto iteration_end = get<Integer const>(config["iteration_end"]);
learner->Predict(
*static_cast<std::shared_ptr<DMatrix> *>(dmat),
type == PredictionType::kMargin, &entry.predictions, iteration_begin,
iteration_end, get<Boolean const>(config["training"]),
type == PredictionType::kLeaf, type == PredictionType::kContribution,
type == PredictionType::kApproxContribution,
type == PredictionType::kInteraction);
bool approximate = type == PredictionType::kApproxContribution ||
type == PredictionType::kApproxInteraction;
bool contribs = type == PredictionType::kContribution ||
type == PredictionType::kApproxContribution;
bool interactions = type == PredictionType::kInteraction ||
type == PredictionType::kApproxInteraction;
bool training = get<Boolean const>(config["training"]);
learner->Predict(p_m, type == PredictionType::kMargin, &entry.predictions,
iteration_begin, iteration_end, training,
type == PredictionType::kLeaf, contribs, approximate,
interactions);
*out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
auto &shape = learner->GetThreadLocal().prediction_shape;
auto chunksize = p_m->Info().num_row_ == 0 ? 0 : entry.predictions.Size() / p_m->Info().num_row_;

View File

@@ -56,7 +56,6 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
}
case PredictionType::kApproxContribution:
case PredictionType::kContribution: {
auto groups = chunksize / (cols + 1);
if (groups == 1 && !strict_shape) {
*out_dim = 2;
shape.resize(*out_dim);
@@ -71,6 +70,7 @@ inline void CalcPredictShape(bool strict_shape, PredictionType type, size_t rows
}
break;
}
case PredictionType::kApproxInteraction:
case PredictionType::kInteraction: {
if (groups == 1 && !strict_shape) {
*out_dim = 3;

View File

@@ -1290,6 +1290,21 @@ void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
num_items, nullptr, false)));
}
template <typename InIt, typename OutIt, typename Predicate>
void CopyIf(InIt in_first, InIt in_second, OutIt out_first, Predicate pred) {
// We loop over batches because thrust::copy_if can't deal with sizes > 2^31
// See thrust issue #1302, #6822
size_t max_copy_size = std::numeric_limits<int>::max() / 2;
size_t length = std::distance(in_first, in_second);
XGBCachingDeviceAllocator<char> alloc;
for (size_t offset = 0; offset < length; offset += max_copy_size) {
auto begin_input = in_first + offset;
auto end_input = in_first + std::min(offset + max_copy_size, length);
out_first = thrust::copy_if(thrust::cuda::par(alloc), begin_input,
end_input, out_first, pred);
}
}
template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
InclusiveScan(d_in, d_out, cub::Sum(), num_items);
@@ -1311,14 +1326,14 @@ void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_i
if (accending) {
void *d_temp_storage = nullptr;
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false);
sizeof(KeyT) * 8, false, nullptr, false)));
dh::TemporaryArray<char> storage(bytes);
d_temp_storage = storage.data().get();
cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, size_t>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false);
sizeof(KeyT) * 8, false, nullptr, false)));
} else {
void *d_temp_storage = nullptr;
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, size_t>::Dispatch(
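For intuition, a pure-Python analogue of the batched dh::CopyIf above (an editor's sketch, not the library API):

import numpy as np

# Filter in batches of at most INT_MAX / 2 elements so that each underlying
# call stays within a 32-bit element count, mirroring dh::CopyIf.
MAX_COPY_SIZE = np.iinfo(np.int32).max // 2

def batched_copy_if(src, pred, max_copy_size=MAX_COPY_SIZE):
    out = []
    for offset in range(0, len(src), max_copy_size):
        batch = src[offset:offset + max_copy_size]
        out.append(batch[pred(batch)])  # vectorized predicate per batch
    return np.concatenate(out) if out else src[:0]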

View File

@@ -93,6 +93,11 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
bst_row_t num_rows, bst_feature_t columns,
size_t nnz, int device,
size_t num_cuts, bool has_weight) {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
// Available device memory is not accurate when RMM is used.
return nnz;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
if (sketch_batch_num_elements == 0) {
auto required_memory = RequiredMemory(num_rows, columns, nnz, num_cuts, has_weight);
// use up to 80% of available space

View File

@@ -118,9 +118,8 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter,
size_t num_valid = column_sizes_scan->back();
// Copy current subset of valid elements into temporary storage and sort
sorted_entries->resize(num_valid);
dh::XGBCachingDeviceAllocator<char> alloc;
thrust::copy_if(thrust::cuda::par(alloc), entry_iter + range.begin(),
entry_iter + range.end(), sorted_entries->begin(), is_valid);
dh::CopyIf(entry_iter + range.begin(), entry_iter + range.end(),
sorted_entries->begin(), is_valid);
}
void SortByWeight(dh::device_vector<float>* weights,

View File

@@ -55,18 +55,9 @@ void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
COOToEntryOp<decltype(batch)> transform_op{batch};
thrust::transform_iterator<decltype(transform_op), decltype(counting)>
transform_iter(counting, transform_op);
// We loop over batches because thrust::copy_if cant deal with sizes > 2^31
// See thrust issue #1302
size_t max_copy_size = std::numeric_limits<int>::max() / 2;
auto begin_output = thrust::device_pointer_cast(data.data());
for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) {
auto begin_input = transform_iter + offset;
auto end_input =
transform_iter + std::min(offset + max_copy_size, batch.Size());
begin_output =
thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input,
begin_output, IsValidFunctor(missing));
}
dh::CopyIf(transform_iter, transform_iter + batch.Size(), begin_output,
IsValidFunctor(missing));
}
// Does not currently support metainfo as no on-device data source contains this

View File

@@ -575,6 +575,20 @@ void GPUDartPredictInc(common::Span<float> out_predts,
}
#endif
void GPUDartInplacePredictInc(common::Span<float> out_predts,
common::Span<float> predts, float tree_w,
size_t n_rows, float base_score,
bst_group_t n_groups,
bst_group_t group)
#if defined(XGBOOST_USE_CUDA)
; // NOLINT
#else
{
common::AssertGPUSupport();
}
#endif
class Dart : public GBTree {
public:
explicit Dart(LearnerModelParam const* booster_config) :
@@ -728,13 +742,14 @@ class Dart : public GBTree {
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
Predictor const * predictor {nullptr};
MetaInfo info;
StringView msg{"Unsupported data type for inplace predict."};
int32_t device = GenericParameter::kCpuId;
PredictionCacheEntry predts;
// Inplace predict is not used for training, so no need to drop tree.
for (size_t i = tree_begin; i < tree_end; ++i) {
PredictionCacheEntry predts;
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
bool success = false;
@@ -742,6 +757,7 @@ class Dart : public GBTree {
if (p && p->InplacePredict(x, nullptr, model_, missing, &predts, i,
i + 1)) {
success = true;
predictor = p;
#if defined(XGBOOST_USE_CUDA)
device = predts.predictions.DeviceIdx();
#endif // defined(XGBOOST_USE_CUDA)
@@ -750,45 +766,52 @@ class Dart : public GBTree {
}
CHECK(success) << msg;
} else {
// No base margin for each tree
bool success = this->GetPredictor()->InplacePredict(
x, nullptr, model_, missing, &predts, i, i + 1);
// No base margin from meta info for each tree
predictor = this->GetPredictor().get();
bool success = predictor->InplacePredict(x, nullptr, model_, missing,
&predts, i, i + 1);
device = predts.predictions.DeviceIdx();
CHECK(success) << msg;
}
auto w = this->weight_drop_.at(i);
auto &h_predts = predts.predictions.HostVector();
auto &h_out_predts = out_preds->predictions.HostVector();
size_t n_groups = model_.learner_model_param->num_output_group;
auto n_rows = predts.predictions.Size() / n_groups;
if (i == tree_begin) {
auto n_rows =
h_predts.size() / model_.learner_model_param->num_output_group;
// base margin is added here.
if (p_m) {
p_m->Info().num_row_ = n_rows;
cpu_predictor_->InitOutPredictions(p_m->Info(),
&out_preds->predictions, model_);
predictor->InitOutPredictions(p_m->Info(), &out_preds->predictions,
model_);
} else {
info.num_row_ = n_rows;
cpu_predictor_->InitOutPredictions(info, &out_preds->predictions,
model_);
predictor->InitOutPredictions(info, &out_preds->predictions, model_);
}
}
// Multiply by the tree weight
CHECK_EQ(h_predts.size(), h_out_predts.size());
CHECK_EQ(predts.predictions.Size(), out_preds->predictions.Size());
auto group = model_.tree_info.at(i);
if (device == GenericParameter::kCpuId) {
auto &h_predts = predts.predictions.HostVector();
auto &h_out_predts = out_preds->predictions.HostVector();
#pragma omp parallel for
for (omp_ulong i = 0; i < h_out_predts.size(); ++i) {
for (omp_ulong ridx = 0; ridx < n_rows; ++ridx) {
const size_t offset = ridx * n_groups + group;
// Need to remove the base margin from individual trees.
h_out_predts[i] +=
(h_predts[i] - model_.learner_model_param->base_score) * w;
h_out_predts[offset] +=
(h_predts[offset] - model_.learner_model_param->base_score) * w;
}
}
if (device != GenericParameter::kCpuId) {
} else {
out_preds->predictions.SetDevice(device);
out_preds->predictions.DeviceSpan();
predts.predictions.SetDevice(device);
GPUDartInplacePredictInc(out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows,
model_.learner_model_param->base_score,
n_groups, group);
}
}
}

View File

@@ -14,5 +14,15 @@ void GPUDartPredictInc(common::Span<float> out_predts,
out_predts[offset] += (predts[offset] * tree_w);
});
}
void GPUDartInplacePredictInc(common::Span<float> out_predts,
common::Span<float> predts, float tree_w,
size_t n_rows, float base_score,
bst_group_t n_groups, bst_group_t group) {
dh::LaunchN(dh::CurrentDevice(), n_rows, [=] XGBOOST_DEVICE(size_t ridx) {
const size_t offset = ridx * n_groups + group;
out_predts[offset] += (predts[offset] - base_score) * tree_w;
});
}
} // namespace gbm
} // namespace xgboost
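A CPU-side NumPy sketch of the same accumulation (assuming the flat row-major prediction layout used above):

import numpy as np

# Each tree's inplace prediction already includes base_score, so it is
# subtracted before the tree weight is applied, as in GPUDartInplacePredictInc.
def dart_inplace_inc(out_predts, predts, tree_w, base_score, n_groups, group):
    n_rows = predts.size // n_groups
    offsets = np.arange(n_rows) * n_groups + group
    out_predts[offsets] += (predts[offsets] - base_score) * tree_w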

View File

@@ -219,8 +219,6 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
/**
* Create sorted index for each class
*/
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
dh::Iota(d_sorted_idx, device);
auto d_predts_t = dh::ToSpan(cache->predts_t);
Transpose(predts, d_predts_t, n_samples, n_classes, device);
@@ -231,6 +229,7 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
});
// no out-of-place sort for thrust, and cub sort doesn't accept a general
// iterator, so we can't use a transform iterator in sorting.
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
dh::SegmentedArgSort<false>(d_predts_t, d_class_ptr, d_sorted_idx);
/**
@@ -447,10 +446,9 @@ GPURankingAUC(common::Span<float const> predts, MetaInfo const &info,
/**
* Sort the labels
*/
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
auto d_labels = info.labels_.ConstDeviceSpan();
dh::Iota(d_sorted_idx, device);
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
dh::SegmentedArgSort<false>(d_labels, d_group_ptr, d_sorted_idx);
auto d_weights = info.weights_.ConstDeviceSpan();

View File

@@ -0,0 +1,112 @@
ARG CUDA_VERSION_ARG
FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos6
ARG CUDA_VERSION_ARG
# Environment
ENV DEBIAN_FRONTEND noninteractive
ENV DEVTOOLSET_URL_ROOT http://vault.centos.org/6.9/sclo/x86_64/rh/devtoolset-4/
COPY CentOS-Base.repo /etc/yum.repos.d/
# Install all basic requirements
RUN \
yum install -y epel-release && \
yum -y update && \
yum install -y tar unzip wget xz git patchelf readline-devel libX11-devel libXt-devel \
xorg-x11-server-devel openssl-devel texlive-* && \
yum install -y $DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-gfortran-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-libquadmath-devel-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-gcc-c++-5.3.1-6.1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-binutils-2.25.1-8.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-runtime-4.1-3.sc1.el6.x86_64.rpm \
$DEVTOOLSET_URL_ROOT/devtoolset-4-libstdc++-devel-5.3.1-6.1.el6.x86_64.rpm
ENV PATH=/opt/python/bin:/usr/local/ninja:/opt/software/packages/bin:/opt/R/3.3.0/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/software/packages/lib:/opt/R/3.3.0/lib64:$LD_LIBRARY_PATH
ENV CC=/opt/rh/devtoolset-4/root/usr/bin/gcc
ENV CXX=/opt/rh/devtoolset-4/root/usr/bin/c++
ENV CPP=/opt/rh/devtoolset-4/root/usr/bin/cpp
ENV F77=/opt/rh/devtoolset-4/root/usr/bin/gfortran
# A few packages have to be built from the source because CentOS 6 is a very old distribution and
# the system packages are not sufficiently up-to-date to build R 3.3.0. We'll want to update to
# CentOS 7 after the 1.4.0 release. Tracking issue: dmlc/xgboost#6791.
#
# Why choose an old Linux distro? This is so that the resulting xgboost.so is compatible with a
# wide range of Linux OSes currently in operation. See https://www.python.org/dev/peps/pep-0571/
RUN \
wget https://zlib.net/fossils/zlib-1.2.5.tar.gz && \
wget https://sourceware.org/pub/bzip2/bzip2-1.0.6.tar.gz && \
wget http://tukaani.org/xz/xz-5.2.2.tar.gz && \
wget https://ftp.pcre.org/pub/pcre/pcre-8.40.tar.gz && \
wget https://www.openssl.org/source/old/1.0.0/openssl-1.0.0k.tar.gz && \
wget --no-check-certificate https://curl.se/download/curl-7.47.1.tar.gz && \
tar xf zlib-1.2.5.tar.gz && \
tar xf bzip2-1.0.6.tar.gz && \
tar xf xz-5.2.2.tar.gz && \
tar xf pcre-8.40.tar.gz && \
tar xf openssl-1.0.0k.tar.gz && \
tar xf curl-7.47.1.tar.gz && \
cd zlib-1.2.5 && \
./configure --prefix=/opt/software/packages && \
make -j$(nproc) && \
make install && \
cd ../bzip2-1.0.6 && \
sed -i 's/CFLAGS=-Wall/CFLAGS=-fPIC -Wall/g' Makefile && \
make -f Makefile-libbz2_so && \
make clean && \
make -j$(nproc) && \
make -n install PREFIX=/opt/software/packages && \
make install PREFIX=/opt/software/packages && \
cd ../xz-5.2.2 && \
./configure --prefix=/opt/software/packages && \
make -j$(nproc) && \
make install && \
cd ../pcre-8.40 && \
./configure --enable-utf8 --prefix=/opt/software/packages && \
make -j$(nproc) && \
make install && \
cd ../curl-7.47.1 && \
./configure --prefix=/opt/software/packages --with-ssl && \
make -j$(nproc) && \
make install && \
export CFLAGS="-I/opt/software/packages/include" && \
export LDFLAGS="-L/opt/software/packages/lib" && \
cd .. && \
# R 3.3.0
wget -nv -nc https://cran.r-project.org/src/base/R-3/R-3.3.0.tar.gz && \
tar xf R-3.3.0.tar.gz && \
cd R-3.3.0 && \
./configure --prefix=/opt/R/3.3.0 --enable-R-shlib && \
make -j$(nproc) && \
make install && \
# Python
wget -nv -nc -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3.sh -b -p /opt/python && \
/opt/python/bin/python -m pip install auditwheel && \
# CMake
wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Ninja
mkdir -p /usr/local && \
cd /usr/local/ && \
wget -nv -nc https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz --no-check-certificate && \
tar xf v1.10.0.tar.gz && mv ninja-1.10.0 ninja && rm -v v1.10.0.tar.gz && \
cd ninja && \
/opt/python/bin/python ./configure.py --bootstrap
ENV GOSU_VERSION 1.10
# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true
# Default entry-point to use if running locally
# It will preserve attributes of created files
COPY entrypoint.sh /scripts/
WORKDIR /workspace
ENTRYPOINT ["/scripts/entrypoint.sh"]

View File

@@ -0,0 +1,33 @@
#!/bin/bash
set -e
set -x
if [ "$#" -ne 1 ]
then
echo "Build the R package tarball with CUDA code. Usage: $0 [commit hash]"
exit 1
fi
commit_hash="$1"
make Rpack
mv xgboost/ xgboost_rpack/
mkdir build
cd build
cmake .. -GNinja -DUSE_CUDA=ON -DR_LIB=ON
ninja
cd ..
rm xgboost
# This super wacky hack is found in cmake/RPackageInstall.cmake.in and
# cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R
# and have R use xgboost.so that we've already built.
rm -v xgboost_rpack/configure
rm -rfv xgboost_rpack/src
mkdir -p xgboost_rpack/src
cp -v lib/xgboost.so xgboost_rpack/src/
echo 'all:' > xgboost_rpack/src/Makefile
echo 'all:' > xgboost_rpack/src/Makefile.win
mv xgboost_rpack/ xgboost/
tar cvzf xgboost_r_gpu_linux_${commit_hash}.tar.gz xgboost/

View File

@@ -45,6 +45,10 @@ TEST(HistUtil, DeviceSketch) {
}
TEST(HistUtil, SketchBatchNumElements) {
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
LOG(WARNING) << "Test not runnable with RMM enabled.";
return;
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
size_t constexpr kCols = 10000;
int device;
dh::safe_cuda(cudaGetDevice(&device));

View File

@@ -332,27 +332,44 @@ class TestGPUPredict:
rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)
def test_predict_dart(self):
@pytest.mark.parametrize("n_classes", [2, 3])
def test_predict_dart(self, n_classes):
from sklearn.datasets import make_classification
import cupy as cp
rng = cp.random.RandomState(1994)
n_samples = 1000
X = rng.randn(n_samples, 10)
y = rng.randn(n_samples)
X_, y_ = make_classification(
n_samples=n_samples, n_informative=5, n_classes=n_classes
)
X, y = cp.array(X_), cp.array(y_)
Xy = xgb.DMatrix(X, y)
booster = xgb.train(
{
if n_classes == 2:
params = {
"tree_method": "gpu_hist",
"booster": "dart",
"rate_drop": 0.5,
},
Xy,
num_boost_round=32
)
"objective": "binary:logistic"
}
else:
params = {
"tree_method": "gpu_hist",
"booster": "dart",
"rate_drop": 0.5,
"objective": "multi:softprob",
"num_class": n_classes
}
booster = xgb.train(params, Xy, num_boost_round=32)
# predictor=auto
inplace = booster.inplace_predict(X)
copied = booster.predict(Xy)
cpu_inplace = booster.inplace_predict(X_)
booster.set_param({"predictor": "cpu_predictor"})
cpu_copied = booster.predict(Xy)
copied = cp.array(copied)
cp.testing.assert_allclose(cpu_inplace, copied, atol=1e-6)
cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6)
cp.testing.assert_allclose(inplace, copied, atol=1e-6)
booster.set_param({"predictor": "gpu_predictor"})

View File

@@ -173,13 +173,13 @@ def run_gpu_hist(
assert tm.non_increasing(history["train"][dataset.metric])
@pytest.mark.skipif(**tm.no_cudf())
def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None:
import cudf
from sklearn.datasets import load_breast_cancer
with Client(local_cuda_cluster) as client:
X_, y_ = load_breast_cancer(return_X_y=True)
X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas)
y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas)
X = dd.from_array(X_, chunksize=100)
y = dd.from_array(y_, chunksize=100)
run_boost_from_prediction(X, y, "gpu_hist", client)
@@ -202,6 +202,7 @@ class TestDistributedGPU:
@settings(deadline=duration(seconds=120), suppress_health_check=suppress)
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.parametrize(
"local_cuda_cluster", [{"n_workers": 2}], indirect=["local_cuda_cluster"]
)
@@ -276,7 +277,7 @@ class TestDistributedGPU:
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
run_dask_classifier(X, y, w, model, client)
run_dask_classifier(X, y, w, model, client, 10)
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@@ -454,6 +455,7 @@ async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainRetur
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.mgpu
def test_with_asyncio(local_cuda_cluster: LocalCUDACluster) -> None:
with Client(local_cuda_cluster) as client:

View File

@@ -98,6 +98,27 @@ def test_predict_shape():
assert len(contrib.shape) == 3
assert contrib.shape[1] == 1
contrib = reg.get_booster().predict(
xgb.DMatrix(X), pred_contribs=True, approx_contribs=True
)
assert len(contrib.shape) == 2
assert contrib.shape[1] == X.shape[1] + 1
interaction = reg.get_booster().predict(
xgb.DMatrix(X), pred_interactions=True, approx_contribs=True
)
assert len(interaction.shape) == 3
assert interaction.shape[1] == X.shape[1] + 1
assert interaction.shape[2] == X.shape[1] + 1
interaction = reg.get_booster().predict(
xgb.DMatrix(X), pred_interactions=True, approx_contribs=True, strict_shape=True
)
assert len(interaction.shape) == 4
assert interaction.shape[1] == 1
assert interaction.shape[2] == X.shape[1] + 1
assert interaction.shape[3] == X.shape[1] + 1
class TestInplacePredict:
'''Tests for running inplace prediction'''

View File

@@ -318,14 +318,17 @@ def run_dask_classifier(
w: xgb.dask._DaskCollection,
model: str,
client: "Client",
n_classes,
) -> None:
metric = "merror" if n_classes > 2 else "logloss"
if model == "boosting":
classifier = xgb.dask.DaskXGBClassifier(
verbosity=1, n_estimators=2, eval_metric="merror"
verbosity=1, n_estimators=2, eval_metric=metric
)
else:
classifier = xgb.dask.DaskXGBRFClassifier(
verbosity=1, n_estimators=2, eval_metric="merror"
verbosity=1, n_estimators=2, eval_metric=metric
)
assert classifier._estimator_type == "classifier"
@@ -343,7 +346,7 @@ def run_dask_classifier(
assert isinstance(history, dict)
assert list(history.keys())[0] == "validation_0"
assert list(history["validation_0"].keys())[0] == "merror"
assert list(history["validation_0"].keys())[0] == metric
assert len(list(history["validation_0"])) == 1
forest = int(
json.loads(classifier.get_booster().save_config())["learner"][
@@ -351,19 +354,20 @@ def run_dask_classifier(
]["gbtree_train_param"]["num_parallel_tree"]
)
if model == "boosting":
assert len(history["validation_0"]["merror"]) == 2
assert len(history["validation_0"][metric]) == 2
assert forest == 1
else:
assert len(history["validation_0"]["merror"]) == 1
assert len(history["validation_0"][metric]) == 1
assert forest == 2
# Test .predict_proba()
probas = classifier.predict_proba(X).compute()
assert classifier.n_classes_ == 10
assert classifier.n_classes_ == n_classes
assert probas.ndim == 2
assert probas.shape[0] == kRows
assert probas.shape[1] == 10
assert probas.shape[1] == n_classes
if n_classes > 2:
cls_booster = classifier.get_booster()
single_node_proba = cls_booster.inplace_predict(X.compute())
@@ -375,10 +379,10 @@ def run_dask_classifier(
cupy.testing.assert_allclose(single_node_proba, probas)
# Test with dataframe, not shared with GPU as cupy doesn't work well with da.unique.
if isinstance(X, da.Array):
if isinstance(X, da.Array) and n_classes > 2:
X_d: dd.DataFrame = X.to_dask_dataframe()
assert classifier.n_classes_ == 10
assert classifier.n_classes_ == n_classes
prediction_df = classifier.predict(X_d).compute()
assert prediction_df.ndim == 1
@@ -393,7 +397,12 @@ def run_dask_classifier(
def test_dask_classifier(model: str, client: "Client") -> None:
X, y, w = generate_array(with_weights=True)
y = (y * 10).astype(np.int32)
run_dask_classifier(X, y, w, model, client)
run_dask_classifier(X, y, w, model, client, 10)
y_bin = y.copy()
y_bin[y > 5] = 1.0
y_bin[y <= 5] = 0.0
run_dask_classifier(X, y_bin, w, model, client, 2)
@pytest.mark.skipif(**tm.no_sklearn())