[breaking] Remove the predictor param, allow fallback to prediction using DMatrix. (#9129)

- A `DeviceOrd` struct is implemented to indicate the device. It will eventually replace the `gpu_id` parameter.
- The `predictor` parameter is removed.
- Fallback to `DMatrix` when `inplace_predict` is not available.
- The heuristic for choosing a predictor is only used during training.
This commit is contained in:
Jiaming Yuan
2023-07-03 19:23:54 +08:00
committed by GitHub
parent 3a0f787703
commit 39390cc2ee
54 changed files with 1049 additions and 778 deletions

View File

@@ -1023,7 +1023,6 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
const float **out_result) {
xgboost_CHECK_C_ARG_PTR(c_json_config);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
HostDeviceVector<float> *p_predt{nullptr};
auto type = PredictionType(RequiredArg<Integer>(config, "type", __func__));
@@ -1042,6 +1041,7 @@ void InplacePredictImpl(std::shared_ptr<DMatrix> p_m, char const *c_json_config,
xgboost_CHECK_C_ARG_PTR(out_dim);
CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(),
learner->BoostedRounds(), &shape, out_dim);
CHECK_GE(p_predt->Size(), n_samples);
xgboost_CHECK_C_ARG_PTR(out_result);
xgboost_CHECK_C_ARG_PTR(out_shape);

View File

@@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
API_END();
}
int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
char const *c_json_config, std::shared_ptr<DMatrix> p_m,
xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim,
const float **out_result) {
@@ -107,7 +107,6 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
proxy->SetCUDAArray(c_array_interface);
auto config = Json::Load(StringView{c_json_config});
CHECK_EQ(get<Integer const>(config["cache_id"]), 0) << "Cache ID is not supported yet";
auto *learner = static_cast<Learner *>(handle);
HostDeviceVector<float> *p_predt{nullptr};
@@ -118,7 +117,13 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface,
RequiredArg<Integer>(config, "iteration_begin", __func__),
RequiredArg<Integer>(config, "iteration_end", __func__));
CHECK(p_predt);
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
if (learner->Ctx()->IsCPU()) {
// Prediction using DMatrix as fallback.
CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead());
} else {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
}
p_predt->SetDevice(proxy->DeviceIdx());
auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_;
@@ -146,7 +151,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c
if (m) {
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}
@@ -159,6 +164,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js
p_m = *static_cast<std::shared_ptr<DMatrix> *>(m);
}
xgboost_CHECK_C_ARG_PTR(out_result);
return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim,
out_result);
}

View File

@@ -6,6 +6,11 @@
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
#define XGBOOST_COMMON_ERROR_MSG_H_
#include <cinttypes> // for uint64_t
#include <limits> // for numeric_limits
#include "xgboost/base.h" // for bst_feature_t
#include "xgboost/logging.h"
#include "xgboost/string_view.h" // for StringView
namespace xgboost::error {
@@ -33,5 +38,14 @@ constexpr StringView InconsistentMaxBin() {
return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, "
"and consistent with the Booster being trained.";
}
constexpr StringView UnknownDevice() { return "Unknown device type."; }
inline void MaxFeatureSize(std::uint64_t n_features) {
auto max_n_features = std::numeric_limits<bst_feature_t>::max();
CHECK_LE(n_features, max_n_features)
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<bst_feature_t>::max() << " features or greater";
}
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -7,7 +7,7 @@
#include <dmlc/data.h>
#include <algorithm>
#include <cstddef> // std::size_t
#include <cstddef> // for size_t
#include <functional>
#include <limits>
#include <map>
@@ -17,6 +17,7 @@
#include <vector>
#include "../c_api/c_api_error.h"
#include "../common/error_msg.h" // for MaxFeatureSize
#include "../common/math.h"
#include "array_interface.h"
#include "arrow-cdi.h"
@@ -300,9 +301,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter<ArrayAdapterBatch> {
array_interface_ = ArrayInterface<2>(get<Object const>(j));
batch_ = ArrayAdapterBatch{array_interface_};
}
ArrayAdapterBatch const& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
private:
ArrayAdapterBatch batch_;

View File

@@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
dh::XGBCachingDeviceAllocator<char> alloc;
auto num_rows = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); });
};
auto num_cols = [&]() {
return Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); });
};
size_t row_stride = 0;
@@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
get_device());
auto* p_sketch = &sketch_containers.back();
proxy->Info().weights_.SetDevice(get_device());
Dispatch(proxy, [&](auto const& value) {
cuda_impl::Dispatch(proxy, [&](auto const& value) {
common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch);
});
}
@@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
accumulated_rows += batch_rows;
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
}));
nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
@@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
cuda_impl::Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
});
auto is_dense = this->IsDense();
proxy->Info().feature_types.SetDevice(get_device());
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = Dispatch(proxy, [&](auto const& value) {
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span,
d_feature_types, row_stride, rows, cuts);
});

View File

@@ -1,14 +1,13 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
* \file proxy_dmatrix.cc
*/
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
void DMatrixProxy::SetArrayData(char const *c_interface) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter(StringView{c_interface})};
namespace xgboost::data {
void DMatrixProxy::SetArrayData(StringView interface_str) {
std::shared_ptr<ArrayAdapter> adapter{new ArrayAdapter{interface_str}};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
@@ -25,5 +24,36 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId;
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
#if !defined(XGBOOST_USE_CUDA)
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *, std::shared_ptr<DMatrixProxy>,
float) {
return nullptr;
}
#endif // XGBOOST_USE_CUDA
} // namespace cuda_impl
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const *ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
bool type_error{false};
std::shared_ptr<DMatrix> p_fmat{nullptr};
if (proxy->Ctx()->IsCPU()) {
p_fmat = data::HostAdapterDispatch<false>(
proxy.get(),
[&](auto const &adapter) {
auto p_fmat =
std::shared_ptr<DMatrix>(DMatrix::Create(adapter.get(), missing, ctx->Threads()));
return p_fmat;
},
&type_error);
} else {
p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing);
}
return p_fmat;
}
} // namespace xgboost::data

View File

@@ -1,12 +1,11 @@
/*!
* Copyright 2020-2022, XGBoost contributors
/**
* Copyright 2020-2023, XGBoost contributors
*/
#include "proxy_dmatrix.h"
#include "device_adapter.cuh"
#include "proxy_dmatrix.cuh"
#include "proxy_dmatrix.h"
namespace xgboost {
namespace data {
namespace xgboost::data {
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
@@ -31,5 +30,15 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
ctx_.gpu_id = dh::CurrentDevice();
}
}
} // namespace data
} // namespace xgboost
namespace cuda_impl {
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy,
float missing) {
return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
return p_fmat;
});
}
} // namespace cuda_impl
} // namespace xgboost::data

View File

@@ -6,19 +6,34 @@
#include "device_adapter.cuh"
#include "proxy_dmatrix.h"
namespace xgboost::data {
template <typename Fn>
namespace xgboost::data::cuda_impl {
template <bool get_value = true, typename Fn>
decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CupyAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CupyAdapter>>(proxy->Adapter());
return fn(value);
}
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<CudfAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CudfAdapter>>(proxy->Adapter());
return fn(value);
}
}
}
} // namespace xgboost::data
} // namespace xgboost::data::cuda_impl

View File

@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA)
}
void SetArrayData(char const* c_interface);
void SetArrayData(StringView interface_str);
void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
bst_feature_t n_features, bool on_host);
@@ -114,28 +114,62 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
return typed;
}
template <typename Fn>
/**
* @brief Dispatch function call based on input type.
*
* @tparam get_value Whether the funciton Fn accept an adapter batch or the adapter itself.
* @tparam Fn The type of the function to be dispatched.
*
* @param proxy The proxy object holding the reference to the input.
* @param fn The function to be dispatched.
* @param type_error[out] Set to ture if it's not null and the input data is not recognized by
* the host.
*
* @return The return value of the function being dispatched.
*/
template <bool get_value = true, typename Fn>
decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) {
if (proxy->Adapter().type() == typeid(std::shared_ptr<CSRArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<CSRArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else if (proxy->Adapter().type() == typeid(std::shared_ptr<ArrayAdapter>)) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
if constexpr (get_value) {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter())->Value();
return fn(value);
} else {
auto value = std::any_cast<std::shared_ptr<ArrayAdapter>>(proxy->Adapter());
return fn(value);
}
if (type_error) {
*type_error = false;
}
return fn(value);
} else {
if (type_error) {
*type_error = true;
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
}
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
if constexpr (get_value) {
return std::result_of_t<Fn(
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
} else {
return std::result_of_t<Fn(decltype(std::declval<std::shared_ptr<ArrayAdapter>>()))>();
}
}
}
/**
* @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`.
*/
std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
std::shared_ptr<DMatrixProxy> proxy, float missing);
} // namespace xgboost::data
#endif // XGBOOST_DATA_PROXY_DMATRIX_H_

View File

@@ -1,33 +1,31 @@
/*!
* Copyright 2021 XGBoost contributors
/**
* Copyright 2021-2023, XGBoost contributors
*/
#include "../common/device_helpers.cuh" // for CurrentDevice
#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy
#include "simple_dmatrix.cuh" // for CopyToSparsePage
#include "sparse_page_source.h"
#include "proxy_dmatrix.cuh"
#include "simple_dmatrix.cuh"
namespace xgboost {
namespace data {
#include "xgboost/data.h" // for SparsePage
namespace xgboost::data {
namespace detail {
std::size_t NSamplesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
}
std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
}
} // namespace detail
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
}
CHECK_GE(device, 0);
Dispatch(proxy, [&](auto const &value) {
CopyToSparsePage(value, device, missing, page);
});
cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
}
} // namespace data
} // namespace xgboost
} // namespace xgboost::data

View File

@@ -172,8 +172,7 @@ class GBLinear : public GradientBooster {
}
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t /*layer_end*/, bool, int,
unsigned) override {
bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override {
model_.LazyInitModel();
LinearCheckLayer(layer_begin);
auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId);
@@ -210,8 +209,8 @@ class GBLinear : public GradientBooster {
}
}
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned /*layer_end*/,
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t /*layer_end*/,
bool) override {
LinearCheckLayer(layer_begin);
std::vector<bst_float>& contribs = out_contribs->HostVector();

View File

@@ -18,9 +18,11 @@
#include <vector>
#include "../common/common.h"
#include "../common/error_msg.h" // for UnknownDevice
#include "../common/random.h"
#include "../common/threading_utils.h"
#include "../common/timer.h"
#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch
#include "gbtree_model.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
@@ -58,9 +60,8 @@ void GBTree::Configure(Args const& cfg) {
cpu_predictor_->Configure(cfg);
#if defined(XGBOOST_USE_CUDA)
auto n_gpus = common::AllVisibleGPUs();
if (!gpu_predictor_ && n_gpus != 0) {
gpu_predictor_ = std::unique_ptr<Predictor>(
Predictor::Create("gpu_predictor", this->ctx_));
if (!gpu_predictor_) {
gpu_predictor_ = std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", this->ctx_));
}
if (n_gpus != 0) {
gpu_predictor_->Configure(cfg);
@@ -374,12 +375,7 @@ void GBTree::LoadConfig(Json const& in) {
// This would cause all trees to be pushed to trees_to_update
// e.g. updating a model, then saving and loading it would result in an empty model
tparam_.process_type = TreeProcessType::kDefault;
int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) {
LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. "
"Changing predictor to auto.";
tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}});
}
std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs();
auto msg = StringView{
R"(
@@ -505,8 +501,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien
out_model.param.num_parallel_tree = model_.param.num_parallel_tree;
}
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool,
bst_layer_t layer_begin, bst_layer_t layer_end) {
void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const {
CHECK(configured_);
if (layer_end == 0) {
layer_end = this->BoostedRounds();
@@ -526,7 +522,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
CHECK_EQ(out_preds->version, 0);
}
auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat);
auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat);
if (out_preds->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any
// tree is built at the 0^th iterator.
@@ -546,68 +542,69 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool
}
}
std::unique_ptr<Predictor> const &
GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
DMatrix *f_dmat) const {
void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) {
// dispatch to const function.
this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end);
}
void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
PredictionCacheEntry* out_preds, bst_layer_t layer_begin,
bst_layer_t layer_end) const {
CHECK(configured_);
if (tparam_.predictor != PredictorType::kAuto) {
if (tparam_.predictor == PredictorType::kGPUPredictor) {
#if defined(XGBOOST_USE_CUDA)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
#endif // defined(XGBOOST_USE_CUDA)
}
if (tparam_.predictor == PredictorType::kOneAPIPredictor) {
#if defined(XGBOOST_USE_ONEAPI)
CHECK(oneapi_predictor_);
return oneapi_predictor_;
#else
common::AssertOneAPISupport();
#endif // defined(XGBOOST_USE_ONEAPI)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (p_m->Ctx()->Device() != this->ctx_->Device()) {
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
<< "is running on: " << this->ctx_->DeviceName()
<< ", while the input data is on: " << p_m->Ctx()->DeviceName() << ".";
CHECK_EQ(out_preds->version, 0);
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
auto any_adapter = proxy->Adapter();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end);
return;
}
if (this->ctx_->IsCPU()) {
this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end);
} else if (p_m->Ctx()->IsCUDA()) {
CHECK(this->gpu_predictor_);
this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end);
} else {
LOG(FATAL) << error::UnknownDevice();
}
}
[[nodiscard]] std::unique_ptr<Predictor> const& GBTree::GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred, DMatrix* f_dmat) const {
CHECK(configured_);
// Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to
// prevent data copy.
if (f_dmat && !f_dmat->SingleColBlock()) {
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
#if defined(XGBOOST_USE_CUDA)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
return gpu_predictor_;
#else
common::AssertGPUSupport();
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA)
CHECK(gpu_predictor_);
return gpu_predictor_;
}
}
// Data comes from Device DMatrix.
auto is_ellpack = f_dmat && f_dmat->PageExists<EllpackPage>() &&
!f_dmat->PageExists<SparsePage>();
auto is_ellpack =
f_dmat && f_dmat->PageExists<EllpackPage>() && !f_dmat->PageExists<SparsePage>();
// Data comes from device memory, like CuDF or CuPy.
auto is_from_device =
f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto is_from_device = f_dmat && f_dmat->PageExists<SparsePage>() &&
(*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
auto on_device = is_ellpack || is_from_device;
// Use GPU Predictor if data is already on device and gpu_id is set.
if (on_device && ctx_->gpu_id >= 0) {
#if defined(XGBOOST_USE_CUDA)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (on_device && ctx_->IsCUDA()) {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with "
"CUDA support.";
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA)
}
// GPU_Hist by default has prediction cache calculated from quantile values,
@@ -619,23 +616,19 @@ GBTree::GetPredictor(HostDeviceVector<float> const *out_pred,
if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
// FIXME(trivialfis): Implement a better method for testing whether data
// is on device after DMatrix refactoring is done.
!on_device) {
!on_device && is_training) {
CHECK(cpu_predictor_);
return cpu_predictor_;
}
if (tparam_.tree_method == TreeMethod::kGPUHist) {
#if defined(XGBOOST_USE_CUDA)
CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost.";
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
#else
common::AssertGPUSupport();
return cpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA)
}
CHECK(cpu_predictor_);
return cpu_predictor_;
}
@@ -750,7 +743,7 @@ class Dart : public GBTree {
bool training, unsigned layer_begin,
unsigned layer_end) const {
CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented();
auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat);
auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat);
CHECK(predictor);
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
@@ -814,49 +807,46 @@ class Dart : public GBTree {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
auto n_groups = model_.learner_model_param->num_output_group;
std::vector<Predictor const*> predictors {
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
Predictor const* predictor{nullptr};
StringView msg{"Unsupported data type for inplace predict."};
if (ctx_->Device() != p_fmat->Ctx()->Device()) {
LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost "
<< "is running on: " << this->ctx_->DeviceName()
<< ", while the input data is on: " << p_fmat->Ctx()->DeviceName() << ".";
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_fmat);
auto any_adapter = proxy->Adapter();
auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing);
this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end);
return;
}
StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) {
predts.predictions.SetDevice(ctx_->gpu_id);
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
auto get_predictor = [&]() -> Predictor const* {
if (ctx_->IsCPU()) {
return cpu_predictor_.get();
} else if (ctx_->IsCUDA()) {
CHECK(this->gpu_predictor_);
return gpu_predictor_.get();
} else {
LOG(FATAL) << error::UnknownDevice();
return nullptr;
}
};
auto predict_impl = [&](size_t i) {
predts.predictions.Fill(0);
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
bool success = false;
for (auto const& p : predictors) {
if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) {
success = true;
predictor = p;
break;
}
}
CHECK(success) << msg;
} else {
predictor = this->GetPredictor().get();
bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor"
: "gpu_predictor");
}
bool success{get_predictor()->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)};
CHECK(success) << msg;
};
// Inplace predict is not used for training, so no need to drop tree.
for (bst_tree_t i = tree_begin; i < tree_end; ++i) {
predict_impl(i);
if (i == tree_begin) {
predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
get_predictor()->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_);
}
// Multiple the tree weight
auto w = this->weight_drop_.at(i);
@@ -886,25 +876,24 @@ class Dart : public GBTree {
std::vector<bst_float> *out_preds,
unsigned layer_begin, unsigned layer_end) override {
DropTrees(false);
auto &predictor = this->GetPredictor();
auto &predictor = this->GetPredictor(false);
uint32_t _, tree_end;
std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end);
predictor->PredictInstance(inst, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate, int,
unsigned) override {
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
unsigned layer_begin, unsigned layer_end, bool approximate) override {
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end,

View File

@@ -1,14 +1,11 @@
/*!
* Copyright 2021 by Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#include "../common/device_helpers.cuh"
#include "xgboost/context.h"
#include "xgboost/linalg.h"
#include "xgboost/span.h"
namespace xgboost {
namespace gbm {
namespace xgboost::gbm {
void GPUCopyGradient(HostDeviceVector<GradientPair> const *in_gpair,
bst_group_t n_groups, bst_group_t group_id,
HostDeviceVector<GradientPair> *out_gpair) {
@@ -41,5 +38,4 @@ void GPUDartInplacePredictInc(common::Span<float> out_predts, common::Span<float
out_predts[offset] += (predts[offset] - base_score(0)) * tree_w;
});
}
} // namespace gbm
} // namespace xgboost
} // namespace xgboost::gbm

View File

@@ -43,18 +43,10 @@ enum class TreeProcessType : int {
kDefault = 0,
kUpdate = 1
};
enum class PredictorType : int {
kAuto = 0,
kCPUPredictor,
kGPUPredictor,
kOneAPIPredictor
};
} // namespace xgboost
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
namespace xgboost::gbm {
/*! \brief training parameters */
@@ -63,8 +55,6 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
std::string updater_seq;
/*! \brief type of boosting process to run */
TreeProcessType process_type;
// predictor type
PredictorType predictor;
// tree construction method
TreeMethod tree_method;
// declare parameters
@@ -79,13 +69,6 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
.describe("Whether to run the normal boosting process that creates new trees,"\
" or to update the trees in an existing model.");
DMLC_DECLARE_ALIAS(updater_seq, updater);
DMLC_DECLARE_FIELD(predictor)
.set_default(PredictorType::kAuto)
.add_enum("auto", PredictorType::kAuto)
.add_enum("cpu_predictor", PredictorType::kCPUPredictor)
.add_enum("gpu_predictor", PredictorType::kGPUPredictor)
.add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor)
.describe("Predictor algorithm type");
DMLC_DECLARE_FIELD(tree_method)
.set_default(TreeMethod::kAuto)
.add_enum("auto", TreeMethod::kAuto)
@@ -206,15 +189,9 @@ class GBTree : public GradientBooster {
void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry* predt, ObjFunction const* obj) override;
bool UseGPU() const override {
return
tparam_.predictor == PredictorType::kGPUPredictor ||
tparam_.tree_method == TreeMethod::kGPUHist;
}
[[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; }
GBTreeTrainParam const& GetTrainParam() const {
return tparam_;
}
[[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
void Load(dmlc::Stream* fi) override { model_.Load(fi); }
void Save(dmlc::Stream* fo) const override {
@@ -236,39 +213,14 @@ class GBTree : public GradientBooster {
return !model_.trees.empty() || !model_.trees_to_update.empty();
}
void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training,
bst_layer_t layer_begin, bst_layer_t layer_end) const;
void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training,
bst_layer_t layer_begin, bst_layer_t layer_end) override;
void InplacePredict(std::shared_ptr<DMatrix> p_m, float missing, PredictionCacheEntry* out_preds,
bst_layer_t layer_begin, bst_layer_t layer_end) const override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
std::vector<Predictor const *> predictors{
cpu_predictor_.get(),
#if defined(XGBOOST_USE_CUDA)
gpu_predictor_.get()
#endif // defined(XGBOOST_USE_CUDA)
};
StringView msg{"Unsupported data type for inplace predict."};
if (tparam_.predictor == PredictorType::kAuto) {
// Try both predictor implementations
for (auto const &p : predictors) {
if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) {
return;
}
}
LOG(FATAL) << msg;
} else {
bool success = this->GetPredictor()->InplacePredict(p_m, model_, missing, out_preds,
tree_begin, tree_end);
CHECK(success) << msg << std::endl
<< "Current Predictor: "
<< (tparam_.predictor == PredictorType::kCPUPredictor
? "cpu_predictor"
: "gpu_predictor");
}
}
bst_layer_t layer_begin, bst_layer_t layer_end) const override;
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
@@ -349,32 +301,29 @@ class GBTree : public GradientBooster {
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, "
"n_iteration), use model slicing instead.";
this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end);
this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end);
}
void PredictContribution(DMatrix* p_fmat,
HostDeviceVector<bst_float>* out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate,
int, unsigned) override {
void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictContribution(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr,
approximate);
}
void PredictInteractionContributions(
DMatrix *p_fmat, HostDeviceVector<bst_float> *out_contribs,
uint32_t layer_begin, uint32_t layer_end, bool approximate) override {
void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) override {
CHECK(configured_);
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_EQ(tree_begin, 0)
<< "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor()->PredictInteractionContributions(
p_fmat, out_contribs, model_, tree_end, nullptr, approximate);
CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, "
"n_iteration), using model slicing instead.";
this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_,
tree_end, nullptr, approximate);
}
[[nodiscard]] std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
@@ -390,8 +339,9 @@ class GBTree : public GradientBooster {
std::vector<HostDeviceVector<bst_node_t>>* out_position,
std::vector<std::unique_ptr<RegTree>>* ret);
std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
[[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
bool is_training, HostDeviceVector<float> const* out_pred = nullptr,
DMatrix* f_dmat = nullptr) const;
// commit new trees all at once
virtual void CommitModel(TreesOneIter&& new_trees);
@@ -410,9 +360,7 @@ class GBTree : public GradientBooster {
std::vector<std::unique_ptr<TreeUpdater>> updaters_;
// Predictors
std::unique_ptr<Predictor> cpu_predictor_;
#if defined(XGBOOST_USE_CUDA)
std::unique_ptr<Predictor> gpu_predictor_;
#endif // defined(XGBOOST_USE_CUDA)
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
#if defined(XGBOOST_USE_ONEAPI)
std::unique_ptr<Predictor> oneapi_predictor_;
#endif // defined(XGBOOST_USE_ONEAPI)

View File

@@ -40,6 +40,7 @@
#include "common/api_entry.h" // for XGBAPIThreadLocalEntry
#include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_...
#include "common/common.h" // for ToString, Split
#include "common/error_msg.h" // for MaxFeatureSize
#include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem...
#include "common/observer.h" // for TrainingObserver
#include "common/random.h" // for GlobalRandom
@@ -763,9 +764,7 @@ class LearnerConfiguration : public Learner {
CHECK(matrix.first.ptr);
CHECK(!matrix.second.ref.expired());
const uint64_t num_col = matrix.first.ptr->Info().num_col_;
CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
<< "Unfortunately, XGBoost does not support data matrices with "
<< std::numeric_limits<unsigned>::max() << " features or greater";
error::MaxFeatureSize(num_col);
num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
}
@@ -1413,6 +1412,8 @@ class LearnerImpl : public LearnerIO {
this->CheckModelInitialized();
auto& out_predictions = this->GetThreadLocal().prediction_entry;
out_predictions.version = 0;
this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end);
if (type == PredictionType::kValue) {
obj_->PredTransform(&out_predictions.predictions);

View File

@@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
if (lj(0) >= Eps64()) {
tj_minus(i) = std::pow(lj(i) / lj(0), regularizer);
}
assert(!std::isinf(ti_plus(i)));
assert(!std::isinf(tj_minus(i)));
assert(!isinf(ti_plus(i)));
assert(!isinf(tj_minus(i)));
});
}
} // namespace cuda_impl

View File

@@ -883,9 +883,8 @@ class CPUPredictor : public Predictor {
for (const auto &batch : p_fmat->GetBatches<SparsePage>()) {
auto page = batch.GetView();
// parallel over local batch
const auto nsize = static_cast<bst_omp_uint>(batch.Size());
common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) {
auto row_idx = static_cast<size_t>(batch.base_rowid + i);
common::ParallelFor(batch.Size(), n_threads, [&](auto i) {
auto row_idx = batch.base_rowid + i;
RegTree::FVec &feats = feat_vecs[omp_get_thread_num()];
if (feats.Size() == 0) {
feats.Init(num_feature);

View File

@@ -226,9 +226,7 @@ struct GPUHistMakerDevice {
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
}
~GPUHistMakerDevice() { // NOLINT
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
}
~GPUHistMakerDevice() = default;
void InitFeatureGroupsOnce() {
if (!feature_groups) {