Remove internal use of gpu_id. (#9568)

Jiaming Yuan 2023-09-20 23:29:51 +08:00 committed by GitHub
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions


@ -190,7 +190,7 @@ jobs:
run: | run: |
mkdir build_msvc mkdir build_msvc
cd build_msvc cd build_msvc
cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
cmake --build . --config Release --parallel $(nproc) cmake --build . --config Release --parallel $(nproc)
- name: Install Python package - name: Install Python package


@ -29,31 +29,37 @@ struct DeviceSym {
* viewing types like `linalg::TensorView`. * viewing types like `linalg::TensorView`.
*/ */
struct DeviceOrd { struct DeviceOrd {
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; }
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
// CUDA device ordinal. // CUDA device ordinal.
bst_d_ordinal_t ordinal{-1}; bst_d_ordinal_t ordinal{CPUOrdinal()};
[[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
[[nodiscard]] bool IsCPU() const { return device == kCPU; } [[nodiscard]] bool IsCPU() const { return device == kCPU; }
DeviceOrd() = default; constexpr DeviceOrd() = default;
constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
DeviceOrd(DeviceOrd const& that) = default; constexpr DeviceOrd(DeviceOrd const& that) = default;
DeviceOrd& operator=(DeviceOrd const& that) = default; constexpr DeviceOrd& operator=(DeviceOrd const& that) = default;
DeviceOrd(DeviceOrd&& that) = default; constexpr DeviceOrd(DeviceOrd&& that) = default;
DeviceOrd& operator=(DeviceOrd&& that) = default; constexpr DeviceOrd& operator=(DeviceOrd&& that) = default;
/** /**
* @brief Constructor for CPU. * @brief Constructor for CPU.
*/ */
[[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; }
/** /**
* @brief Constructor for CUDA device. * @brief Constructor for CUDA device.
* *
* @param ordinal CUDA device ordinal. * @param ordinal CUDA device ordinal.
*/ */
[[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) {
return DeviceOrd{kCUDA, ordinal};
}
[[nodiscard]] bool operator==(DeviceOrd const& that) const { [[nodiscard]] bool operator==(DeviceOrd const& that) const {
return device == that.device && ordinal == that.ordinal; return device == that.device && ordinal == that.ordinal;
@ -78,25 +84,26 @@ struct DeviceOrd {
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
std::ostream& operator<<(std::ostream& os, DeviceOrd ord);
/** /**
* @brief Runtime context for XGBoost. Contains information like threads and device. * @brief Runtime context for XGBoost. Contains information like threads and device.
*/ */
struct Context : public XGBoostParameter<Context> { struct Context : public XGBoostParameter<Context> {
private: private:
// User interfacing parameter for device ordinal
std::string device{DeviceSym::CPU()}; // NOLINT std::string device{DeviceSym::CPU()}; // NOLINT
// The device object for the current context. We are in the middle of replacing the // The device ordinal set by user
// `gpu_id` with this device field.
DeviceOrd device_{DeviceOrd::CPU()}; DeviceOrd device_{DeviceOrd::CPU()};
public: public:
// Constant representing the device ID of CPU.
static bst_d_ordinal_t constexpr kCpuId = -1;
static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
static std::int64_t constexpr kDefaultSeed = 0; static std::int64_t constexpr kDefaultSeed = 0;
public: public:
Context(); Context();
void Init(Args const& kwargs);
template <typename Container> template <typename Container>
Args UpdateAllowUnknown(Container const& kwargs) { Args UpdateAllowUnknown(Container const& kwargs) {
auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs); auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
@ -104,7 +111,6 @@ struct Context : public XGBoostParameter<Context> {
return args; return args;
} }
std::int32_t gpu_id{kCpuId};
// The number of threads to use if OpenMP is enabled. If equals 0, use the system default. // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
std::int32_t nthread{0}; // NOLINT std::int32_t nthread{0}; // NOLINT
// stored random seed // stored random seed
@ -116,7 +122,8 @@ struct Context : public XGBoostParameter<Context> {
bool validate_parameters{false}; bool validate_parameters{false};
/** /**
* @brief Configure the parameter `gpu_id'. * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is
* removed.
* *
* @param require_gpu Whether GPU is explicitly required by the user through other * @param require_gpu Whether GPU is explicitly required by the user through other
* configurations. * configurations.
@ -212,9 +219,7 @@ struct Context : public XGBoostParameter<Context> {
private: private:
void SetDeviceOrdinal(Args const& kwargs); void SetDeviceOrdinal(Args const& kwargs);
Context& SetDevice(DeviceOrd d) { Context& SetDevice(DeviceOrd d) {
this->device_ = d; this->device = (this->device_ = d).Name();
this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`.
this->device = d.Name();
return *this; return *this;
} }
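Taken together, the header now exposes the device exclusively through `DeviceOrd`: the CPU/invalid ordinals live on `DeviceOrd`, and `SetDevice` keeps the string `device` parameter and the `device_` object in sync. A minimal sketch of what a call site looks like after this change; the helper below and its name are illustrative only, not part of the patch:

```cpp
#include "xgboost/context.h"  // Context, DeviceOrd

namespace xgboost {
// Illustrative helper: branch on the device object instead of comparing the
// removed gpu_id field against -1.
void ConfigureForDevice(Context const* ctx) {
  if (ctx->Device().IsCUDA()) {
    // The raw CUDA ordinal is still reachable where an integer is required.
    bst_d_ordinal_t ordinal = ctx->Device().ordinal;
    (void)ordinal;
  } else {
    // The CPU is an explicit device rather than the magic value -1.
    bool on_cpu = ctx->Device() == DeviceOrd::CPU();
    (void)on_cpu;
  }
}
}  // namespace xgboost
```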


@ -106,10 +106,10 @@ class MetaInfo {
MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo&& that) = default;
MetaInfo& operator=(MetaInfo const& that) = delete; MetaInfo& operator=(MetaInfo const& that) = delete;
/*! /**
* \brief Validate all metainfo. * @brief Validate all metainfo.
*/ */
void Validate(int32_t device) const; void Validate(DeviceOrd device) const;
MetaInfo Slice(common::Span<int32_t const> ridxs) const; MetaInfo Slice(common::Span<int32_t const> ridxs) const;
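Call sites pass the device object through; a one-line sketch, assuming a `DMatrix* p_fmat` and a configured `Context* ctx` in scope:

```cpp
p_fmat->Info().Validate(ctx->Device());  // the overload taking a raw int ordinal is gone
```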


@ -88,9 +88,9 @@ class HostDeviceVector {
static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types"); static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types");
public: public:
explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
HostDeviceVector(std::initializer_list<T> init, int device = -1); HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
explicit HostDeviceVector(const std::vector<T>& init, int device = -1); explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
~HostDeviceVector(); ~HostDeviceVector();
HostDeviceVector(const HostDeviceVector<T>&) = delete; HostDeviceVector(const HostDeviceVector<T>&) = delete;
@ -99,17 +99,9 @@ class HostDeviceVector {
HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete; HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
HostDeviceVector<T>& operator=(HostDeviceVector<T>&&); HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
bool Empty() const { return Size() == 0; } [[nodiscard]] bool Empty() const { return Size() == 0; }
size_t Size() const; [[nodiscard]] std::size_t Size() const;
int DeviceIdx() const; [[nodiscard]] DeviceOrd Device() const;
DeviceOrd Device() const {
auto idx = this->DeviceIdx();
if (idx == DeviceOrd::CPU().ordinal) {
return DeviceOrd::CPU();
} else {
return DeviceOrd::CUDA(idx);
}
}
common::Span<T> DeviceSpan(); common::Span<T> DeviceSpan();
common::Span<const T> ConstDeviceSpan() const; common::Span<const T> ConstDeviceSpan() const;
common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); } common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
@ -135,13 +127,12 @@ class HostDeviceVector {
const std::vector<T>& ConstHostVector() const; const std::vector<T>& ConstHostVector() const;
const std::vector<T>& HostVector() const {return ConstHostVector(); } const std::vector<T>& HostVector() const {return ConstHostVector(); }
bool HostCanRead() const; [[nodiscard]] bool HostCanRead() const;
bool HostCanWrite() const; [[nodiscard]] bool HostCanWrite() const;
bool DeviceCanRead() const; [[nodiscard]] bool DeviceCanRead() const;
bool DeviceCanWrite() const; [[nodiscard]] bool DeviceCanWrite() const;
GPUAccess DeviceAccess() const; [[nodiscard]] GPUAccess DeviceAccess() const;
void SetDevice(int device) const;
void SetDevice(DeviceOrd device) const; void SetDevice(DeviceOrd device) const;
void Resize(size_t new_size, T v = T()); void Resize(size_t new_size, T v = T());
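On `HostDeviceVector`, the integer-based constructors, `DeviceIdx()`, and `SetDevice(int)` are gone; `DeviceOrd` is the only placement type. A small usage sketch (the function name is illustrative):

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

void PlaceOnDevice() {
  using xgboost::DeviceOrd;
  // Construction takes a DeviceOrd instead of an int ordinal.
  xgboost::HostDeviceVector<float> values(/*size=*/16, /*v=*/0.0f, DeviceOrd::CUDA(0));
  values.SetDevice(DeviceOrd::CUDA(0));  // SetDevice(int) no longer exists
  if (values.Device().IsCUDA()) {        // replaces DeviceIdx() == -1 checks
    auto d_span = values.ConstDeviceSpan();
    (void)d_span;
  }
}
```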


@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {
template <typename T> template <typename T>
auto MakeVec(HostDeviceVector<T> *data) { auto MakeVec(HostDeviceVector<T> *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
data->Size(), data->Device()); data->Device());
} }
template <typename T> template <typename T>
auto MakeVec(HostDeviceVector<T> const *data) { auto MakeVec(HostDeviceVector<T> const *data) {
return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
data->Size(), data->Device()); data->Size(), data->Device());
} }
@ -757,13 +757,13 @@ class Tensor {
Order order_{Order::kC}; Order order_{Order::kC};
template <typename I, std::int32_t D> template <typename I, std::int32_t D>
void Initialize(I const (&shape)[D], std::int32_t device) { void Initialize(I const (&shape)[D], DeviceOrd device) {
static_assert(D <= kDim, "Invalid shape."); static_assert(D <= kDim, "Invalid shape.");
std::copy(shape, shape + D, shape_); std::copy(shape, shape + D, shape_);
for (auto i = D; i < kDim; ++i) { for (auto i = D; i < kDim; ++i) {
shape_[i] = 1; shape_[i] = 1;
} }
if (device >= 0) { if (device.IsCUDA()) {
data_.SetDevice(device); data_.SetDevice(device);
data_.ConstDevicePointer(); // Pull to device; data_.ConstDevicePointer(); // Pull to device;
} }
@ -780,14 +780,11 @@ class Tensor {
* See \ref TensorView for parameters of this constructor. * See \ref TensorView for parameters of this constructor.
*/ */
template <typename I, int32_t D> template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
: Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {} : Tensor{common::Span<I const, D>{shape}, device, order} {}
template <typename I, size_t D> template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC) explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
: order_{order} { : order_{order} {
// No device unroll as this is a host only function. // No device unroll as this is a host only function.
std::copy(shape.data(), shape.data() + D, shape_); std::copy(shape.data(), shape.data() + D, shape_);
@ -795,11 +792,11 @@ class Tensor {
shape_[i] = 1; shape_[i] = 1;
} }
auto size = detail::CalcSize(shape_); auto size = detail::CalcSize(shape_);
if (device >= 0) { if (device.IsCUDA()) {
data_.SetDevice(device); data_.SetDevice(device);
} }
data_.Resize(size); data_.Resize(size);
if (device >= 0) { if (device.IsCUDA()) {
data_.DevicePointer(); // Pull to device data_.DevicePointer(); // Pull to device
} }
} }
@ -807,7 +804,7 @@ class Tensor {
* Initialize from 2 host iterators. * Initialize from 2 host iterators.
*/ */
template <typename It, typename I, int32_t D> template <typename It, typename I, int32_t D>
explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC) explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
: order_{order} { : order_{order} {
auto &h_vec = data_.HostVector(); auto &h_vec = data_.HostVector();
h_vec.insert(h_vec.begin(), begin, end); h_vec.insert(h_vec.begin(), begin, end);
@ -816,7 +813,7 @@ class Tensor {
} }
template <typename I, int32_t D> template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device, explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
Order order = kC) Order order = kC)
: order_{order} { : order_{order} {
auto &h_vec = data_.HostVector(); auto &h_vec = data_.HostVector();
@ -824,10 +821,6 @@ class Tensor {
// shape // shape
this->Initialize(shape, device); this->Initialize(shape, device);
} }
template <typename I, int32_t D>
explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
Order order = kC)
: Tensor{data, shape, device.ordinal, order} {}
/** /**
* \brief Index operator. Not thread safe, should not be used in performance critical * \brief Index operator. Not thread safe, should not be used in performance critical
* region. For more efficient indexing, consider getting a view first. * region. For more efficient indexing, consider getting a view first.
@ -944,9 +937,7 @@ class Tensor {
/** /**
* \brief Set device ordinal for this tensor. * \brief Set device ordinal for this tensor.
*/ */
void SetDevice(int32_t device) const { data_.SetDevice(device); }
void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
[[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
[[nodiscard]] DeviceOrd Device() const { return data_.Device(); } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
}; };
@ -962,7 +953,7 @@ using Vector = Tensor<T, 1>;
template <typename T, typename... Index> template <typename T, typename... Index>
auto Empty(Context const *ctx, Index &&...index) { auto Empty(Context const *ctx, Index &&...index) {
Tensor<T, sizeof...(Index)> t; Tensor<T, sizeof...(Index)> t;
t.SetDevice(ctx->gpu_id); t.SetDevice(ctx->Device());
t.Reshape(index...); t.Reshape(index...);
return t; return t;
} }
@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) {
template <typename T, typename... Index> template <typename T, typename... Index>
auto Constant(Context const *ctx, T v, Index &&...index) { auto Constant(Context const *ctx, T v, Index &&...index) {
Tensor<T, sizeof...(Index)> t; Tensor<T, sizeof...(Index)> t;
t.SetDevice(ctx->gpu_id); t.SetDevice(ctx->Device());
t.Reshape(index...); t.Reshape(index...);
t.Data()->Fill(std::move(v)); t.Data()->Fill(std::move(v));
return t; return t;
@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) {
// Only first axis is supported for now. // Only first axis is supported for now.
template <typename T, int32_t D> template <typename T, int32_t D>
void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) { void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
if (r.DeviceIdx() >= 0) { if (r.Device().IsCUDA()) {
l->SetDevice(r.DeviceIdx()); l->SetDevice(r.Device());
} }
l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) { l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
for (size_t i = 1; i < D; ++i) { for (size_t i = 1; i < D; ++i) {
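With the `std::int32_t` overloads dropped from `linalg`, tensors are created and placed through the context's `DeviceOrd`. A sketch of the updated factory and constructor calls; local names are illustrative:

```cpp
#include "xgboost/context.h"  // Context, DeviceOrd
#include "xgboost/linalg.h"   // Tensor, Empty, Constant

void MakeTensors(xgboost::Context const* ctx) {
  // Factories read the placement from the context instead of ctx->gpu_id.
  auto t = xgboost::linalg::Empty<float>(ctx, 4, 3);
  auto c = xgboost::linalg::Constant<float>(ctx, 1.0f, 4);
  // Direct construction now takes a DeviceOrd.
  std::size_t shape[] = {4, 3};
  xgboost::linalg::Tensor<float, 2> u{shape, ctx->Device()};
  u.SetDevice(ctx->Device());  // the SetDevice(int32_t) overload is removed
  (void)t;
  (void)c;
}
```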


@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
public: public:
PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {} PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) { PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, DeviceOrd device) {
auto p_cache = this->CacheItem(m); auto p_cache = this->CacheItem(m);
if (device != Context::kCpuId) { if (device.IsCUDA()) {
p_cache->predictions.SetDevice(device); p_cache->predictions.SetDevice(device);
} }
return *p_cache; return *p_cache;
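A sketch of the updated cache lookup, assuming a `PredictionContainer prediction_container`, a `std::shared_ptr<DMatrix> p_fmat`, and a `Context* ctx` in scope:

```cpp
// The entry's predictions vector is placed on the device only when it is CUDA.
auto& entry = prediction_container.Cache(p_fmat, ctx->Device());
entry.predictions.Resize(p_fmat->Info().num_row_);
```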


@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
auto hess_dev = dh::CudaGetPointerDevice(hess.data); auto hess_dev = dh::CudaGetPointerDevice(hess.data);
CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
auto &gpair = *out_gpair; auto &gpair = *out_gpair;
gpair.SetDevice(grad_dev); gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
gpair.Reshape(grad.Shape(0), grad.Shape(1)); gpair.Reshape(grad.Shape(0), grad.Shape(1));
auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
auto cuctx = ctx->CUDACtx(); auto cuctx = ctx->CUDACtx();
@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
if (learner->Ctx()->IsCUDA()) { if (learner->Ctx()->IsCUDA()) {
CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
} }
p_predt->SetDevice(proxy->DeviceIdx()); p_predt->SetDevice(proxy->Device());
auto &shape = learner->GetThreadLocal().prediction_shape; auto &shape = learner->GetThreadLocal().prediction_shape;
size_t n_samples = p_m->Info().num_row_; size_t n_samples = p_m->Info().num_row_;


@ -15,8 +15,7 @@
#include "communicator-inl.cuh" #include "communicator-inl.cuh"
namespace xgboost { namespace xgboost::collective {
namespace collective {
/** /**
* @brief Find the global sum of the given values across all workers. * @brief Find the global sum of the given values across all workers.
@ -31,10 +30,9 @@ namespace collective {
* @param size Number of values to sum. * @param size Number of values to sum.
*/ */
template <typename T> template <typename T>
void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) { void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
if (info.IsRowSplit()) { if (info.IsRowSplit()) {
collective::AllReduce<collective::Operation::kSum>(device, values, size); collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
} }
} }
} // namespace collective } // namespace xgboost::collective
} // namespace xgboost


@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
} }
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr, void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries, dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights, dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan) { dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size.
}); });
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
&column_sizes_scan); &column_sizes_scan);
auto d_cuts_ptr = cuts_ptr.DeviceSpan(); auto d_cuts_ptr = cuts_ptr.DeviceSpan();
if (sketch_container->HasCategorical()) { if (sketch_container->HasCategorical()) {
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
&column_sizes_scan); &column_sizes_scan);
} }
@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
HistogramCuts cuts; HistogramCuts cuts;
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
ctx->Ordinal()); ctx->Device());
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
for (const auto& page : p_fmat->GetBatches<SparsePage>()) { for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
std::size_t page_nnz = page.data.Size(); std::size_t page_nnz = page.data.Size();


@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
} }
template <std::uint32_t kBlockThreads, typename Kernel> template <std::uint32_t kBlockThreads, typename Kernel>
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
int n_mps = 0; int n_mps = 0;
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
int n_blocks_per_mp = 0; int n_blocks_per_mp = 0;
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
kBlockThreads, shared_mem)); kBlockThreads, shared_mem));
@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
* \param out_column_size Output buffer for the size of each column. * \param out_column_size Output buffer for the size of each column.
*/ */
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false> template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter, void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) { data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
std::size_t max_shared_memory = dh::MaxSharedMemory(device); std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
// Not strictly correct as we should use number of samples to determine the type of // Not strictly correct as we should use number of samples to determine the type of
// counter. However, the sample size is not known due to sliding window on number of // counter. However, the sample size is not known due to sliding window on number of
// elements. // elements.
@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
} }
template <typename BatchIt> template <typename BatchIt>
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid, IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr, HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
dh::caching_device_vector<size_t>* column_sizes_scan) { dh::caching_device_vector<size_t>* column_sizes_scan) {
@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
// Count the valid entries in each column and copy them out. // Count the valid entries in each column and copy them out.
template <typename AdapterBatch, typename BatchIter> template <typename AdapterBatch, typename BatchIter>
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
float missing, size_t columns, size_t cuts_per_feature, int device, float missing, size_t columns, size_t cuts_per_feature,
DeviceOrd device,
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan, HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
dh::caching_device_vector<size_t>* column_sizes_scan, dh::caching_device_vector<size_t>* column_sizes_scan,
dh::device_vector<Entry>* sorted_entries) { dh::device_vector<Entry>* sorted_entries) {
@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
void SortByWeight(dh::device_vector<float>* weights, void SortByWeight(dh::device_vector<float>* weights,
dh::device_vector<Entry>* sorted_entries); dh::device_vector<Entry>* sorted_entries);
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr, void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
dh::device_vector<Entry>* p_sorted_entries, dh::device_vector<Entry>* p_sorted_entries,
dh::device_vector<float>* p_sorted_weights, dh::device_vector<float>* p_sorted_weights,
dh::caching_device_vector<size_t>* p_column_sizes_scan); dh::caching_device_vector<size_t>* p_column_sizes_scan);
@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
template <typename AdapterBatch> template <typename AdapterBatch>
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
int device, size_t columns, size_t begin, size_t end, DeviceOrd device, size_t columns, size_t begin, size_t end,
float missing, SketchContainer *sketch_container, float missing, SketchContainer *sketch_container,
int num_cuts) { int num_cuts) {
// Copy current subset of valid elements into temporary storage and sort // Copy current subset of valid elements into temporary storage and sort
@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
template <typename Batch> template <typename Batch>
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
int num_cuts_per_feature, int num_cuts_per_feature,
bool is_ranking, float missing, int device, bool is_ranking, float missing, DeviceOrd device,
size_t columns, size_t begin, size_t end, size_t columns, size_t begin, size_t end,
SketchContainer *sketch_container) { SketchContainer *sketch_container) {
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
info.weights_.SetDevice(device); info.weights_.SetDevice(device);
auto weights = info.weights_.ConstDeviceSpan(); auto weights = info.weights_.ConstDeviceSpan();
@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
size_t num_rows = batch.NumRows(); size_t num_rows = batch.NumRows();
size_t num_cols = batch.NumCols(); size_t num_cols = batch.NumCols();
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
int32_t device = sketch_container->DeviceIdx(); auto device = sketch_container->DeviceIdx();
bool weighted = !info.weights_.Empty(); bool weighted = !info.weights_.Empty();
if (weighted) { if (weighted) {
sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(), num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, true); device.ordinal, num_cuts_per_feature, true);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements)); std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements = detail::SketchBatchNumElements(
sketch_batch_num_elements, sketch_batch_num_elements,
num_rows, num_cols, std::numeric_limits<size_t>::max(), num_rows, num_cols, std::numeric_limits<size_t>::max(),
device, num_cuts_per_feature, false); device.ordinal, num_cuts_per_feature, false);
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
size_t end = size_t end =
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements)); std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));


@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
}; };
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int) HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(size, v); impl_ = new HostDeviceVectorImpl<T>(size, v);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int) HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int) HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
: impl_(nullptr) { : impl_(nullptr) {
impl_ = new HostDeviceVectorImpl<T>(init); impl_ = new HostDeviceVectorImpl<T>(init);
} }
@ -81,7 +81,7 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
template <typename T> template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return -1; } DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer() { return nullptr; } T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
return false; return false;
} }
template <typename T>
void HostDeviceVector<T>::SetDevice(int) const {}
template <typename T> template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {} void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}


@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
template <typename T> template <typename T>
class HostDeviceVectorImpl { class HostDeviceVectorImpl {
public: public:
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
if (device >= 0) { if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
data_d_->resize(size, v); data_d_->resize(size, v);
@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
// Initializer can be std::vector<T> or std::initializer_list<T> // Initializer can be std::vector<T> or std::initializer_list<T>
template <class Initializer> template <class Initializer>
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
if (device >= 0) { if (device.IsCUDA()) {
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
LazyResizeDevice(init.size()); LazyResizeDevice(init.size());
Copy(init); Copy(init);
@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
gpu_access_{that.gpu_access_} {} gpu_access_{that.gpu_access_} {}
~HostDeviceVectorImpl() { ~HostDeviceVectorImpl() {
if (device_ >= 0) { if (device_.IsCUDA()) {
SetDevice(); SetDevice();
} }
} }
size_t Size() const { [[nodiscard]] size_t Size() const {
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0; return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
} }
int DeviceIdx() const { return device_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
T* DevicePointer() { T* DevicePointer() {
LazySyncDevice(GPUAccess::kWrite); LazySyncDevice(GPUAccess::kWrite);
@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
} else { } else {
auto ptr = other->ConstDevicePointer(); auto ptr = other->ConstDevicePointer();
SetDevice(); SetDevice();
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); CHECK_EQ(this->Device(), other->Device());
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
ptr, ptr,
other->Size() * sizeof(T), other->Size() * sizeof(T),
@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
return data_h_; return data_h_;
} }
void SetDevice(int device) { void SetDevice(DeviceOrd device) {
if (device_ == device) { return; } if (device_ == device) { return; }
if (device_ >= 0) { if (device_.IsCUDA()) {
LazySyncHost(GPUAccess::kNone); LazySyncHost(GPUAccess::kNone);
} }
if (device_ >= 0 && device >= 0) { if (device_.IsCUDA() && device.IsCUDA()) {
CHECK_EQ(device_, device) << "New device ordinal is different from previous one."; CHECK_EQ(device_.ordinal, device.ordinal)
<< "New device ordinal is different from previous one.";
} }
device_ = device; device_ = device;
if (device_ >= 0) { if (device_.IsCUDA()) {
LazyResizeDevice(data_h_.size()); LazyResizeDevice(data_h_.size());
} }
} }
void Resize(size_t new_size, T v) { void Resize(size_t new_size, T v) {
if (new_size == Size()) { return; } if (new_size == Size()) { return; }
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) { if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
// fast on-device resize // fast on-device resize
gpu_access_ = GPUAccess::kWrite; gpu_access_ = GPUAccess::kWrite;
SetDevice(); SetDevice();
@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
gpu_access_ = access; gpu_access_ = access;
} }
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
GPUAccess Access() const { return gpu_access_; } [[nodiscard]] GPUAccess Access() const { return gpu_access_; }
private: private:
int device_{-1}; DeviceOrd device_{DeviceOrd::CPU()};
std::vector<T> data_h_{}; std::vector<T> data_h_{};
std::unique_ptr<dh::device_vector<T>> data_d_{}; std::unique_ptr<dh::device_vector<T>> data_d_{};
GPUAccess gpu_access_{GPUAccess::kNone}; GPUAccess gpu_access_{GPUAccess::kNone};
@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
} }
void SetDevice() { void SetDevice() {
CHECK_GE(device_, 0); CHECK_GE(device_.ordinal, 0);
if (cudaSetDeviceHandler == nullptr) { if (cudaSetDeviceHandler == nullptr) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
} else { } else {
(*cudaSetDeviceHandler)(device_); (*cudaSetDeviceHandler)(device_.ordinal);
} }
if (!data_d_) { if (!data_d_) {
@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
}; };
template<typename T> template<typename T>
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device) HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {} : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device) HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {} : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T> template <typename T>
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device) HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
: impl_(new HostDeviceVectorImpl<T>(init, device)) {} : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
template <typename T> template <typename T>
@ -309,7 +310,9 @@ template <typename T>
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); } size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
template <typename T> template <typename T>
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); } DeviceOrd HostDeviceVector<T>::Device() const {
return impl_->Device();
}
template <typename T> template <typename T>
T* HostDeviceVector<T>::DevicePointer() { T* HostDeviceVector<T>::DevicePointer() {
@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
return impl_->Access(); return impl_->Access();
} }
template <typename T>
void HostDeviceVector<T>::SetDevice(int device) const {
impl_->SetDevice(device);
}
template <typename T> template <typename T>
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const { void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
impl_->SetDevice(device.ordinal); impl_->SetDevice(device);
} }
template <typename T> template <typename T>
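The placement rules enforced by `SetDevice` are unchanged, only expressed in terms of `DeviceOrd`: moving between the host and a CUDA device is fine, while switching directly between two different CUDA ordinals still hits the mismatch check. A sketch, assuming a CUDA-enabled build (the function name is illustrative):

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

void PlacementRules() {
  using xgboost::DeviceOrd;
  xgboost::HostDeviceVector<float> v(8, 0.0f, DeviceOrd::CPU());
  v.SetDevice(DeviceOrd::CUDA(0));    // host -> cuda:0 is allowed
  // v.SetDevice(DeviceOrd::CUDA(1)); // cuda:0 -> cuda:1 would trip the ordinal check above
  v.SetDevice(DeviceOrd::CPU());      // the data is synced back to the host
}
```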


@ -8,16 +8,12 @@
#include "xgboost/context.h" // Context #include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
namespace xgboost { namespace xgboost::common::cuda_impl {
namespace common {
namespace cuda_impl {
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) { double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
values.SetDevice(ctx->gpu_id); values.SetDevice(ctx->Device());
auto const d_values = values.ConstDeviceSpan(); auto const d_values = values.ConstDeviceSpan();
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0, return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
thrust::plus<float>{}); thrust::plus<float>{});
} }
} // namespace cuda_impl } // namespace xgboost::common::cuda_impl
} // namespace common
} // namespace xgboost


@ -24,7 +24,7 @@ struct OptionalWeights {
inline OptionalWeights MakeOptionalWeights(Context const* ctx, inline OptionalWeights MakeOptionalWeights(Context const* ctx,
HostDeviceVector<float> const& weights) { HostDeviceVector<float> const& weights) {
if (ctx->IsCUDA()) { if (ctx->IsCUDA()) {
weights.SetDevice(ctx->gpu_id); weights.SetDevice(ctx->Device());
} }
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
} }
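A one-line sketch of the helper in use, assuming a `Context* ctx` and a `MetaInfo info` in scope:

```cpp
// Weights are moved to ctx->Device() before the device span is taken.
auto weights = xgboost::common::MakeOptionalWeights(ctx, info.weights_);
```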


@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
// summary does the output element come from) result by definition of merged rank. So we // summary does the output element come from) result by definition of merged rank. So we
// run it in 2 passes to obtain the merge path and then customize the standard merge // run it in 2 passes to obtain the merge path and then customize the standard merge
// algorithm. // algorithm.
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x, void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y, Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) { Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(d_x.size() + d_y.size(), out.size());
CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(x_ptr.size(), out_ptr.size());
CHECK_EQ(y_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size());
@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr, void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, common::Span<OffsetT> cuts_ptr,
size_t total_cuts, Span<float> weights) { size_t total_cuts, Span<float> weights) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
Span<SketchEntry> out; Span<SketchEntry> out;
dh::device_vector<SketchEntry> cuts; dh::device_vector<SketchEntry> cuts;
bool first_window = this->Current().empty(); bool first_window = this->Current().empty();
@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
* pruning or merging. We preserve the first type and remove the second type. * pruning or merging. We preserve the first type and remove the second type.
*/ */
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
void SketchContainer::Prune(size_t to) { void SketchContainer::Prune(size_t to) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
OffsetT to_total = 0; OffsetT to_total = 0;
auto& h_columns_ptr = columns_ptr_b_.HostVector(); auto& h_columns_ptr = columns_ptr_b_.HostVector();
@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr, void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
Span<SketchEntry const> that) { Span<SketchEntry const> that) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
timer_.Start(__func__); timer_.Start(__func__);
if (this->Current().size() == 0) { if (this->Current().size() == 0) {
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
} }
void SketchContainer::FixError() { void SketchContainer::FixError() {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
auto in = dh::ToSpan(this->Current()); auto in = dh::ToSpan(this->Current());
dh::LaunchN(in.size(), [=] __device__(size_t idx) { dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@ -502,7 +502,7 @@ void SketchContainer::FixError() {
} }
void SketchContainer::AllReduce(bool is_column_split) { void SketchContainer::AllReduce(bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize(); auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) { if (world == 1 || is_column_split) {
return; return;
@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
auto offset = rank * d_columns_ptr.size(); auto offset = rank * d_columns_ptr.size();
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(), thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
gathered_ptrs.begin() + offset); gathered_ptrs.begin() + offset);
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(), collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
gathered_ptrs.size()); gathered_ptrs.size());
// Get the data from all workers. // Get the data from all workers.
std::vector<size_t> recv_lengths; std::vector<size_t> recv_lengths;
dh::caching_device_vector<char> recvbuf; dh::caching_device_vector<char> recvbuf;
collective::AllGatherV(device_, this->Current().data().get(), collective::AllGatherV(device_.ordinal, this->Current().data().get(),
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf); dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
collective::Synchronize(device_); collective::Synchronize(device_.ordinal);
// Segment the received data. // Segment the received data.
auto s_recvbuf = dh::ToSpan(recvbuf); auto s_recvbuf = dh::ToSpan(recvbuf);
@ -584,7 +584,7 @@ struct InvalidCatOp {
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_); p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers. // Sync between workers.


@ -41,7 +41,7 @@ class SketchContainer {
bst_row_t num_rows_; bst_row_t num_rows_;
bst_feature_t num_columns_; bst_feature_t num_columns_;
int32_t num_bins_; int32_t num_bins_;
int32_t device_; DeviceOrd device_;
// Double buffer as neither prune nor merge can be performed inplace. // Double buffer as neither prune nor merge can be performed inplace.
dh::device_vector<SketchEntry> entries_a_; dh::device_vector<SketchEntry> entries_a_;
@ -93,35 +93,32 @@ class SketchContainer {
* \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param num_rows Total number of rows in known dataset (typically the rows in current worker).
* \param device GPU ID. * \param device GPU ID.
*/ */
SketchContainer(HostDeviceVector<FeatureType> const &feature_types, SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
int32_t max_bin, bst_feature_t num_columns, bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
bst_row_t num_rows, int32_t device) : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
: num_rows_{num_rows}, CHECK(device.IsCUDA());
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { // Initialize Sketches for this dmatrix
CHECK_GE(device, 0); this->columns_ptr_.SetDevice(device_);
// Initialize Sketches for this dmatrix this->columns_ptr_.Resize(num_columns + 1);
this->columns_ptr_.SetDevice(device_); this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_.Resize(num_columns + 1); this->columns_ptr_b_.Resize(num_columns + 1);
this->columns_ptr_b_.SetDevice(device_);
this->columns_ptr_b_.Resize(num_columns + 1);
this->feature_types_.Resize(feature_types.Size()); this->feature_types_.Resize(feature_types.Size());
this->feature_types_.Copy(feature_types); this->feature_types_.Copy(feature_types);
// Pull to device. // Pull to device.
this->feature_types_.SetDevice(device); this->feature_types_.SetDevice(device);
this->feature_types_.ConstDeviceSpan(); this->feature_types_.ConstDeviceSpan();
this->feature_types_.ConstHostSpan(); this->feature_types_.ConstHostSpan();
auto d_feature_types = feature_types_.ConstDeviceSpan(); auto d_feature_types = feature_types_.ConstDeviceSpan();
has_categorical_ = has_categorical_ =
!d_feature_types.empty() && !d_feature_types.empty() &&
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
common::IsCatOp{});
timer_.Init(__func__); timer_.Init(__func__);
} }
/* \brief Return GPU ID for this container. */ /* \brief Return GPU ID for this container. */
int32_t DeviceIdx() const { return device_; } [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
/* \brief Whether the predictor matrix contains categorical features. */ /* \brief Whether the predictor matrix contains categorical features. */
bool HasCategorical() const { return has_categorical_; } bool HasCategorical() const { return has_categorical_; }
/* \brief Accumulate weights of duplicated entries in input. */ /* \brief Accumulate weights of duplicated entries in input. */
@ -175,7 +172,7 @@ class SketchContainer {
template <typename KeyComp = thrust::equal_to<size_t>> template <typename KeyComp = thrust::equal_to<size_t>>
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) { size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
timer_.Start(__func__); timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
this->columns_ptr_.SetDevice(device_); this->columns_ptr_.SetDevice(device_);
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan(); Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
CHECK_EQ(d_column_scan.size(), num_columns_ + 1); CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
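The container now requires a CUDA device up front (`CHECK(device.IsCUDA())`) instead of a non-negative integer. A construction sketch, assuming a populated `HostDeviceVector<FeatureType> feature_types` and at least one visible GPU:

```cpp
using xgboost::DeviceOrd;
xgboost::common::SketchContainer sketch(feature_types, /*max_bin=*/256,
                                        /*num_columns=*/16, /*num_rows=*/1024,
                                        DeviceOrd::CUDA(0));
// DeviceIdx() now reports a DeviceOrd rather than an int ordinal.
CHECK(sketch.DeviceIdx().IsCUDA());
```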


@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply #include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
namespace xgboost { namespace xgboost::common {
namespace common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t, void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) { HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) { if (!ctx->IsCPU()) {
@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
} }
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) { void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id); v.SetDevice(ctx->Device());
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(1); out->Reshape(1);
if (ctx->IsCPU()) { if (ctx->IsCPU()) {
@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device())); cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
} }
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost


@ -15,14 +15,12 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply #include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
namespace xgboost { namespace xgboost::common::cuda_impl {
namespace common {
namespace cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t, void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) { common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1); CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0); HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id); segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan(); auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(), dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; }); [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
}); });
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1)); out->Reshape(t.Shape(1));
if (weights.Empty()) { if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it, common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes}; dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s); cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
} }
} // namespace cuda_impl } // namespace xgboost::common::cuda_impl
} // namespace common
} // namespace xgboost


@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx); auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx)); auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
quantiles->SetDevice(ctx->gpu_id); quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments); quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan(); auto d_results = quantiles->DeviceSpan();
@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
scan_val, weights_cdf.begin()); scan_val, weights_cdf.begin());
auto n_segments = std::distance(seg_beg, seg_end) - 1; auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id); quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments); quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan(); auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf); auto d_weight_cdf = dh::ToSpan(weights_cdf);


@ -60,8 +60,8 @@ class Transform {
template <typename Functor> template <typename Functor>
struct Evaluator { struct Evaluator {
public: public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx) Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {} : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
/*! /*!
* \brief Evaluate the functor with input pointers to HostDeviceVector. * \brief Evaluate the functor with input pointers to HostDeviceVector.
@ -71,7 +71,7 @@ class Transform {
*/ */
template <typename... HDV> template <typename... HDV>
void Eval(HDV... vectors) const { void Eval(HDV... vectors) const {
bool on_device = device_ >= 0; bool on_device = device_.IsCUDA();
if (on_device) { if (on_device) {
LaunchCUDA(func_, vectors...); LaunchCUDA(func_, vectors...);
@ -116,11 +116,11 @@ class Transform {
} }
// Recursive unpack for Shard. // Recursive unpack for Shard.
template <typename T> template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const { void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device); vector->SetDevice(device);
} }
template <typename Head, typename... Rest> template <typename Head, typename... Rest>
void UnpackShard(int device, void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector, const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const { const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device); _vector->SetDevice(device);
@ -140,7 +140,7 @@ class Transform {
// granularity is used in data vector. // granularity is used in data vector.
size_t shard_size = range_size; size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)}; Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids = const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads)); static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) { if (kGrids == 0) {
@ -174,7 +174,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */ /*! \brief Range object specifying parallel threads index range. */
Range range_; Range range_;
int32_t n_threads_; int32_t n_threads_;
int32_t device_; DeviceOrd device_;
}; };
public: public:
@ -192,8 +192,8 @@ class Transform {
*/ */
template <typename Functor> template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads, static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) { DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx}; return Evaluator<Functor>{func, std::move(range), n_threads, device};
} }
}; };
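For callers of common::Transform, only the last argument of Init changes. A hedged sketch of a call under the new signature, assuming this code lives inside namespace xgboost like the surrounding sources, with a Context const* ctx and a HostDeviceVector<float> out_vec in scope (all illustrative):

// Sketch only: assumes common/transform.h and common/span.h from this repository.
common::Transform<>::Init(
    [=] XGBOOST_DEVICE(std::size_t idx, common::Span<float> out) { out[idx] = 0.0f; },
    common::Range{0, static_cast<common::Range::DifferenceType>(out_vec.Size())},
    ctx->Threads(),
    ctx->Device())  // previously an int32_t device ordinal, with -1 selecting the CPU path
    .Eval(&out_vec);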
@ -20,7 +20,6 @@ namespace xgboost {
DMLC_REGISTER_PARAMETER(Context); DMLC_REGISTER_PARAMETER(Context);
bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed; std::int64_t constexpr Context::kDefaultSeed;
Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}
@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
return std::nullopt; return std::nullopt;
} }
std::int32_t parsed_id{Context::kCpuId}; std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) { if (res.ec != std::errc()) {
return std::nullopt; return std::nullopt;
@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device; DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) { if (split_it == s_device.cend()) {
// no ordinal. // no ordinal.
if (s_device == DeviceSym::CPU()) { if (s_device == DeviceSym::CPU()) {
@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CUDA(opt_id.value()); device = DeviceOrd::CUDA(opt_id.value());
} }
if (device.ordinal < Context::kCpuId) { if (device.ordinal < DeviceOrd::CPUOrdinal()) {
fatal(); fatal();
} }
device = CUDAOrdinal(device, fail_on_invalid_gpu_id); device = CUDAOrdinal(device, fail_on_invalid_gpu_id);
@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
} }
} // namespace } // namespace
std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
os << ord.Name();
return os;
}
void Context::Init(Args const& kwargs) {
auto unknown = this->UpdateAllowUnknown(kwargs);
if (!unknown.empty()) {
std::stringstream ss;
std::size_t i = 0;
ss << "[Internal Error] Unknown parameters passed to the Context {";
for (auto const& [k, _] : unknown) {
ss << '"' << k << '"';
if (++i != unknown.size()) {
ss << ", ";
}
}
ss << "}\n";
LOG(FATAL) << ss.str();
}
}
void Context::ConfigureGpuId(bool require_gpu) { void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) { if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});
@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
error::WarnDeprecatedGPUId(); error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second}); auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second; CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) { if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else { } else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});
@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
this->SetDevice(new_d); this->SetDevice(new_d);
if (this->IsCPU()) { if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId); CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} else { } else {
CHECK_GT(this->device_.ordinal, kCpuId); CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} }
} }
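With kCpuId gone from Context, the sentinel values live on DeviceOrd and user-facing configuration goes through the device string parsed above. A small usage sketch, assuming xgboost/context.h; the chosen ordinal is illustrative:

#include "xgboost/context.h"

xgboost::Context MakeCudaContext() {
  xgboost::Context ctx;
  ctx.Init(xgboost::Args{{"device", "cuda:1"}});  // parsed into DeviceOrd::CUDA(1)
  // ctx.Device().IsCUDA() is true and ctx.Ordinal() == 1; passing "cpu" instead
  // yields DeviceOrd::CPU(), whose ordinal equals DeviceOrd::CPUOrdinal().
  return ctx;
}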
@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
linalg::Stack(&this->labels, that.labels); linalg::Stack(&this->labels, that.labels);
this->weights_.SetDevice(that.weights_.DeviceIdx()); this->weights_.SetDevice(that.weights_.Device());
this->weights_.Extend(that.weights_); this->weights_.Extend(that.weights_);
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
this->labels_lower_bound_.Extend(that.labels_lower_bound_); this->labels_lower_bound_.Extend(that.labels_lower_bound_);
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
this->labels_upper_bound_.Extend(that.labels_upper_bound_); this->labels_upper_bound_.Extend(that.labels_upper_bound_);
linalg::Stack(&this->base_margin_, that.base_margin_); linalg::Stack(&this->base_margin_, that.base_margin_);
@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
} }
if (!that.feature_weights.Empty()) { if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size()); this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); this->feature_weights.SetDevice(that.feature_weights.Device());
this->feature_weights.Copy(that.feature_weights); this->feature_weights.Copy(that.feature_weights);
} }
} }
@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {
namespace { namespace {
template <typename T> template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) { void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
if (!valid) { if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: " "the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; << v.Device() << "; the device ordinal of the Booster is: " << device;
} }
} }
template <typename T, std::int32_t D> template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) { void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data()); CheckDevice(device, *v.Data());
} }
} // anonymous namespace } // anonymous namespace
void MetaInfo::Validate(std::int32_t device) const { void MetaInfo::Validate(DeviceOrd device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) { if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight(); CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
return; return;
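Callers of MetaInfo::Validate now hand over the booster's typed device rather than an integer ordinal. A one-function sketch, assuming xgboost/context.h and xgboost/data.h; the wrapper name is illustrative:

#include "xgboost/context.h"
#include "xgboost/data.h"

// Check that the data lives on the same device as the booster (or on the CPU).
void ValidateAgainstBooster(xgboost::MetaInfo const& info, xgboost::Context const* ctx) {
  info.Validate(ctx->Device());  // previously info.Validate(ctx->gpu_id)
}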
@ -29,13 +29,13 @@ template <typename T, int32_t D>
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) { void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface); ArrayInterface<D> array(arr_interface);
if (array.n == 0) { if (array.n == 0) {
p_out->SetDevice(0); p_out->SetDevice(DeviceOrd::CUDA(0));
p_out->Reshape(array.shape); p_out->Reshape(array.shape);
return; return;
} }
CHECK_EQ(array.valid.Capacity(), 0) CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value."; << "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data); auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
p_out->SetDevice(ptr_device); p_out->SetDevice(ptr_device);
if (array.is_contiguous && array.type == ToDType<T>::kType) { if (array.is_contiguous && array.type == ToDType<T>::kType) {
@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return; return;
} }
p_out->Reshape(array.shape); p_out->Reshape(array.shape);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice( linalg::ElementWiseTransformDevice(
t, t,
[=] __device__(size_t i, T) { [=] __device__(size_t i, T) {
@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
}); });
dh::caching_device_vector<bool> flag(1); dh::caching_device_vector<bool> flag(1);
auto d_flag = dh::ToSpan(flag); auto d_flag = dh::ToSpan(flag);
auto d = SetDeviceToPtr(array_interface.data); auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
auto typed = TypedIndex<uint32_t, 1>{array_interface}; auto typed = TypedIndex<uint32_t, 1>{array_interface};
@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows) CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
: columns_(columns), : columns_(columns),
num_rows_(num_rows) {} num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); } [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
__device__ __forceinline__ COOTuple GetElement(size_t idx) const { [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size(); size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size(); size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx]; auto const& column = columns_[column_idx];
@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value}; return {row_idx, column_idx, value};
} }
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx]; auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx) float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx) ? column(ridx)
@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return value; return value;
} }
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
private: private:
common::Span<ArrayInterface<1>> columns_; common::Span<ArrayInterface<1>> columns_;
@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return; return;
} }
device_idx_ = dh::CudaGetPointerDevice(first_column.data); device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
CHECK_NE(device_idx_, Context::kCpuId); CHECK(device_.IsCUDA());
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
for (auto& json_col : json_columns) { for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col)); auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column); columns.push_back(column);
num_rows_ = std::max(num_rows_, column.Shape(0)); num_rows_ = std::max(num_rows_, column.Shape(0));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device."; << "All columns should use the same device.";
CHECK_EQ(num_rows_, column.Shape(0)) CHECK_EQ(num_rows_, column.Shape(0))
<< "All columns should have same number of rows."; << "All columns should have same number of rows.";
@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return batch_; return batch_;
} }
size_t NumRows() const { return num_rows_; } [[nodiscard]] std::size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); } [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
int32_t DeviceIdx() const { return device_idx_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
private: private:
CudfAdapterBatch batch_; CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface<1>> columns_; dh::device_vector<ArrayInterface<1>> columns_;
size_t num_rows_{0}; size_t num_rows_{0};
int32_t device_idx_{Context::kCpuId}; DeviceOrd device_{DeviceOrd::CPU()};
}; };
class CupyAdapterBatch : public detail::NoMetaInfo { class CupyAdapterBatch : public detail::NoMetaInfo {
@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
CupyAdapterBatch() = default; CupyAdapterBatch() = default;
explicit CupyAdapterBatch(ArrayInterface<2> array_interface) explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_(std::move(array_interface)) {} : array_interface_(std::move(array_interface)) {}
size_t Size() const { [[nodiscard]] std::size_t Size() const {
return array_interface_.Shape(0) * array_interface_.Shape(1); return array_interface_.Shape(0) * array_interface_.Shape(1);
} }
__device__ COOTuple GetElement(size_t idx) const { [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.Shape(1); size_t column_idx = idx % array_interface_.Shape(1);
size_t row_idx = idx / array_interface_.Shape(1); size_t row_idx = idx / array_interface_.Shape(1);
float value = array_interface_(row_idx, column_idx); float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value}; return {row_idx, column_idx, value};
} }
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx); float value = array_interface_(ridx, fidx);
return value; return value;
} }
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
private: private:
ArrayInterface<2> array_interface_; ArrayInterface<2> array_interface_;
@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
if (array_interface_.Shape(0) == 0) { if (array_interface_.Shape(0) == 0) {
return; return;
} }
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
CHECK_NE(device_idx_, Context::kCpuId); CHECK(device_.IsCUDA());
} }
explicit CupyAdapter(std::string cuda_interface_str) explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {} : CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; } [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }
size_t NumRows() const { return array_interface_.Shape(0); } [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); } [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
int32_t DeviceIdx() const { return device_idx_; } [[nodiscard]] DeviceOrd Device() const { return device_; }
private: private:
ArrayInterface<2> array_interface_; ArrayInterface<2> array_interface_;
CupyAdapterBatch batch_; CupyAdapterBatch batch_;
int32_t device_idx_ {Context::kCpuId}; DeviceOrd device_{DeviceOrd::CPU()};
}; };
// Returns maximum row length // Returns maximum row length
template <typename AdapterBatchT> template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx, std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) { float missing) {
dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing); IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
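The device adapters now report a DeviceOrd instead of an integer, with the default CPU ordinal doubling as the marker for empty input. A hedged sketch of consuming that API, assuming the repository's device_adapter.cuh and a __cuda_array_interface__ JSON string arr (hypothetical variable):

xgboost::data::CupyAdapter adapter{arr};
xgboost::DeviceOrd device = adapter.Device();  // replaces the old int DeviceIdx()
if (device.IsCPU()) {
  // Default-constructed ordinal: the array was empty, so the adapter owns no GPU data.
}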
@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel(
} }
// Construct an ELLPACK matrix with the given number of empty rows. // Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
bool is_dense, size_t row_stride, size_t row_stride, size_t n_rows)
size_t n_rows) : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
: is_dense(is_dense),
cuts_(std::move(cuts)),
row_stride(row_stride),
n_rows(n_rows) {
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
InitCompressedData(device); InitCompressedData(device);
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
} }
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense, const SparsePage &page, bool is_dense,
size_t row_stride, size_t row_stride,
common::Span<FeatureType const> feature_types) common::Span<FeatureType const> feature_types)
@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) { : is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
n_rows = dmat->Info().num_row_; n_rows = dmat->Info().num_row_;
@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Stop("Quantiles"); monitor_.Stop("Quantiles");
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
this->InitCompressedData(ctx->gpu_id); this->InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
dmat->Info().feature_types.SetDevice(ctx->gpu_id); dmat->Info().feature_types.SetDevice(ctx->Device());
auto ft = dmat->Info().feature_types.ConstDeviceSpan(); auto ft = dmat->Info().feature_types.ConstDeviceSpan();
monitor_.Start("BinningCompression"); monitor_.Start("BinningCompression");
CHECK(dmat->SingleColBlock()); CHECK(dmat->SingleColBlock());
for (const auto& batch : dmat->GetBatches<SparsePage>()) { for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(ctx->gpu_id, batch, ft); CreateHistIndices(ctx->Device(), batch, ft);
} }
monitor_.Stop("BinningCompression"); monitor_.Stop("BinningCompression");
} }
@ -209,7 +205,7 @@ struct TupleScanOp {
// to remove missing data // to remove missing data
template <typename AdapterBatchT> template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types, void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) { EllpackPageImpl* dst, DeviceOrd device, float missing) {
// Some witchcraft happens here // Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix // The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory Standard stream // with a given row stride, using no extra working memory Standard stream
@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Tuple[2] = The index in the input data // Tuple[2] = The index in the input data
using Tuple = thrust::tuple<size_t, size_t, size_t>; using Tuple = thrust::tuple<size_t, size_t, size_t>;
auto device_accessor = dst->GetDeviceAccessor(device_idx); auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols()); common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
#endif #endif
} }
void WriteNullValues(EllpackPageImpl* dst, int device_idx, void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
common::Span<size_t> row_counts) {
// Write the null values // Write the null values
auto device_accessor = dst->GetDeviceAccessor(device_idx); auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols()); common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
auto row_stride = dst->row_stride; auto row_stride = dst->row_stride;
@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
} }
template <typename AdapterBatch> template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span, common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride, common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) { size_t n_rows, common::HistogramCuts const& cuts) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing); CopyDataToEllpack(batch, feature_types, this, device, missing);
@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,
#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
template EllpackPageImpl::EllpackPageImpl( \ template EllpackPageImpl::EllpackPageImpl( \
__BATCH_T batch, float missing, int device, bool is_dense, \ __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \ common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);
@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
row_stride = *std::max_element(it, it + page.Size()); row_stride = *std::max_element(it, it + page.Size());
CHECK_GE(ctx->gpu_id, 0); CHECK(ctx->IsCUDA());
monitor_.Start("InitCompressedData"); monitor_.Start("InitCompressedData");
InitCompressedData(ctx->gpu_id); InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData"); monitor_.Stop("InitCompressedData");
// copy gidx // copy gidx
@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));
auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
auto null = accessor.NullValue(); auto null = accessor.NullValue();
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
} }
@ -407,8 +402,7 @@ struct CopyPage {
}; };
// Copy the data from the given EllpackPage to the current page. // Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
size_t offset) {
monitor_.Start("Copy"); monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride; size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
@ -468,7 +462,7 @@ struct CompactPage {
}; };
// Compacts the data from the given EllpackPage into the current page. // Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) { common::Span<size_t> row_indexes) {
monitor_.Start("Compact"); monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
} }
// Initialize the buffer to stored compressed features. // Initialize the buffer to stored compressed features.
void EllpackPageImpl::InitCompressedData(int device) { void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
size_t num_symbols = NumSymbols(); size_t num_symbols = NumSymbols();
// Required buffer size for storing data matrix in ELLPack format. // Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes = size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
num_symbols);
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
// Don't call fill unnecessarily // Don't call fill unnecessarily
if (gidx_buffer.Size() == 0) { if (gidx_buffer.Size() == 0) {
@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
} }
// Compress a CSR page into ELLPACK. // Compress a CSR page into ELLPACK.
void EllpackPageImpl::CreateHistIndices(int device, void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types) { common::Span<FeatureType const> feature_types) {
if (row_batch.Size() == 0) return; if (row_batch.Size() == 0) return;
@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device,
// bin and compress entries in batches of rows // bin and compress entries in batches of rows
size_t gpu_batch_nrows = size_t gpu_batch_nrows =
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(row_batch.Size())); static_cast<size_t>(row_batch.Size()));
size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);
@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
} }
EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
int device, common::Span<FeatureType const> feature_types) const { DeviceOrd device, common::Span<FeatureType const> feature_types) const {
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
return {device, return {device,
cuts_, cuts_,
@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
} }
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
common::Span<FeatureType const> feature_types) const { common::Span<FeatureType const> feature_types) const {
return {Context::kCpuId, return {DeviceOrd::CPU(),
cuts_, cuts_,
is_dense, is_dense,
row_stride, row_stride,
@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types; common::Span<const FeatureType> feature_types;
EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
bool is_dense, size_t row_stride, size_t base_rowid, size_t row_stride, size_t base_rowid, size_t n_rows,
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter, common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types) common::Span<FeatureType const> feature_types)
: is_dense(is_dense), : is_dense(is_dense),
row_stride(row_stride), row_stride(row_stride),
base_rowid(base_rowid), base_rowid(base_rowid),
n_rows(n_rows) ,gidx_iter(gidx_iter), n_rows(n_rows),
gidx_iter(gidx_iter),
feature_types{feature_types} { feature_types{feature_types} {
if (device == Context::kCpuId) { if (device.IsCPU()) {
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan(); gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
feature_segments = cuts.cut_ptrs_.ConstHostSpan(); feature_segments = cuts.cut_ptrs_.ConstHostSpan();
min_fvalue = cuts.min_vals_.ConstHostSpan(); min_fvalue = cuts.min_vals_.ConstHostSpan();
@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
} }
// Get a matrix element, uses binary search for look up Return NaN if missing // Get a matrix element, uses binary search for look up Return NaN if missing
// Given a row index and a feature index, returns the corresponding cut value // Given a row index and a feature index, returns the corresponding cut value
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
ridx -= base_rowid; ridx -= base_rowid;
auto row_begin = row_stride * ridx; auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride; auto row_end = row_begin + row_stride;
@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
} }
template <bool is_cat> template <bool is_cat>
__device__ uint32_t SearchBin(float value, size_t column_id) const { [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
auto beg = feature_segments[column_id]; auto beg = feature_segments[column_id];
auto end = feature_segments[column_id + 1]; auto end = feature_segments[column_id + 1];
uint32_t idx = 0; uint32_t idx = 0;
@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
return idx; return idx;
} }
__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx); auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) { if (gidx == -1) {
return nan(""); return nan("");
@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
} }
// Check if the row id is within range of the current batch. // Check if the row id is within range of the current batch.
__device__ bool IsInRange(size_t row_id) const { [[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows; return row_id >= base_rowid && row_id < base_rowid + n_rows;
} }
/*! \brief Return the total number of symbols (total number of bins plus 1 for /*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */ * not found). */
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
}; };
@ -141,14 +142,13 @@ class EllpackPageImpl {
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
* and the given number of rows. * and the given number of rows.
*/ */
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
size_t row_stride, size_t n_rows); size_t n_rows);
/*! /*!
* \brief Constructor used for external memory. * \brief Constructor used for external memory.
*/ */
EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
const SparsePage &page, bool is_dense, size_t row_stride, bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);
common::Span<FeatureType const> feature_types);
/*! /*!
* \brief Constructor from an existing DMatrix. * \brief Constructor from an existing DMatrix.
@ -159,7 +159,7 @@ class EllpackPageImpl {
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);
template <typename AdapterBatch> template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span, common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride, common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts); size_t n_rows, common::HistogramCuts const& cuts);
@ -176,7 +176,7 @@ class EllpackPageImpl {
* @param offset The number of elements to skip before copying. * @param offset The number of elements to skip before copying.
* @returns The number of elements copied. * @returns The number of elements copied.
*/ */
size_t Copy(int device, EllpackPageImpl const *page, size_t offset); size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);
/*! \brief Compact the given ELLPACK page into the current page. /*! \brief Compact the given ELLPACK page into the current page.
* *
@ -184,11 +184,10 @@ class EllpackPageImpl {
* @param page The ELLPACK page to compact from. * @param page The ELLPACK page to compact from.
* @param row_indexes Row indexes for the compacted page. * @param row_indexes Row indexes for the compacted page.
*/ */
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes); void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
/*! \return Number of instances in the page. */ /*! \return Number of instances in the page. */
size_t Size() const; [[nodiscard]] size_t Size() const;
/*! \brief Set the base row id for this page. */ /*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id) { void SetBaseRowId(std::size_t row_id) {
@ -204,12 +203,12 @@ class EllpackPageImpl {
/*! \brief Return the total number of symbols (total number of bins plus 1 for /*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */ * not found). */
size_t NumSymbols() const { return cuts_.TotalBins() + 1; } [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
EllpackDeviceAccessor [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
GetDeviceAccessor(int device, DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
common::Span<FeatureType const> feature_types = {}) const; [[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const; common::Span<FeatureType const> feature_types = {}) const;
private: private:
/*! /*!
@ -218,13 +217,13 @@ class EllpackPageImpl {
* @param device The GPU device to use. * @param device The GPU device to use.
* @param row_batch The CSR page. * @param row_batch The CSR page.
*/ */
void CreateHistIndices(int device, void CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types); common::Span<FeatureType const> feature_types);
/*! /*!
* \brief Initialize the buffer to store compressed features. * \brief Initialize the buffer to store compressed features.
*/ */
void InitCompressedData(int device); void InitCompressedData(DeviceOrd device);
public: public:
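Accessor construction follows the same scheme: the device accessor is requested with the context's typed device, while the host accessor keeps its signature and is now built internally from DeviceOrd::CPU() rather than kCpuId. A short sketch, assuming an EllpackPageImpl* page, a Context const* ctx, and a feature_types span (illustrative names):

auto d_acc = page->GetDeviceAccessor(ctx->Device(), feature_types);  // was an int ordinal
auto h_acc = page->GetHostAccessor(feature_types);                   // unchanged call site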
@ -10,7 +10,7 @@
namespace xgboost::data { namespace xgboost::data {
void EllpackPageSource::Fetch() { void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
if (!this->ReadCache()) { if (!this->ReadCache()) {
if (count_ != 0 && !sync_) { if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0 // source is initialized to be the 0th page during construction, so when count_ is 0
@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
BatchParam param_; BatchParam param_;
common::Span<FeatureType const> feature_types_; common::Span<FeatureType const> feature_types_;
std::unique_ptr<common::HistogramCuts> cuts_; std::unique_ptr<common::HistogramCuts> cuts_;
std::int32_t device_; DeviceOrd device_;
public: public:
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
std::shared_ptr<Cache> cache, BatchParam param, std::shared_ptr<Cache> cache, BatchParam param,
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride, std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types, common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source, std::int32_t device) std::shared_ptr<SparsePageSource> source, DeviceOrd device)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false), : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
is_dense_{is_dense}, is_dense_{is_dense},
row_stride_{row_stride}, row_stride_{row_stride},
@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
auto pctx = MakeProxy(proxy_)->Ctx(); auto pctx = MakeProxy(proxy_)->Ctx();
Context ctx; Context ctx;
ctx.UpdateAllowUnknown( ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter. // hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};
@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
return HostAdapterDispatch(proxy, [&](auto const& value) { return HostAdapterDispatch(proxy, [&](auto const& value) {
size_t n_threads = ctx->Threads(); size_t n_threads = ctx->Threads();
size_t n_features = column_sizes.size(); size_t n_features = column_sizes.size();
linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId); linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
column_sizes_tloc.Data()->Fill(0ul); column_sizes_tloc.Data()->Fill(0ul);
auto view = column_sizes_tloc.HostView(); auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) { common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
int32_t current_device; int32_t current_device;
dh::safe_cuda(cudaGetDevice(&current_device)); dh::safe_cuda(cudaGetDevice(&current_device));
auto get_device = [&]() -> int32_t { auto get_device = [&]() {
std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id; auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
CHECK_NE(d, Context::kCpuId); CHECK(!d.IsCPU());
return d; return d;
}; };
@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
common::HistogramCuts cuts; common::HistogramCuts cuts;
do { do {
// We use do while here as the first batch is fetched in ctor // We use do while here as the first batch is fetched in ctor
// ctx_.gpu_id = proxy->DeviceIdx(); CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs()); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
dh::safe_cuda(cudaSetDevice(get_device()));
if (cols == 0) { if (cols == 0) {
cols = num_cols(); cols = num_cols();
collective::Allreduce<collective::Operation::kMax>(&cols, 1); collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols; auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column."; CHECK_GE(n_features, 1) << "Data must has at least 1 column.";
dh::safe_cuda(cudaSetDevice(get_device())); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (!ref) { if (!ref) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch( common::SketchContainer final_sketch(
@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
size_t n_batches_for_verification = 0; size_t n_batches_for_verification = 0;
while (iter.Next()) { while (iter.Next()) {
init_page(); init_page();
dh::safe_cuda(cudaSetDevice(get_device())); dh::safe_cuda(cudaSetDevice(get_device().ordinal));
auto rows = num_rows(); auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0); dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size()); common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
@ -184,18 +183,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
if (!ellpack_) { if (!ellpack_) {
ellpack_.reset(new EllpackPage()); ellpack_.reset(new EllpackPage());
if (ctx->IsCUDA()) { if (ctx->IsCUDA()) {
this->Info().feature_types.SetDevice(ctx->gpu_id); this->Info().feature_types.SetDevice(ctx->Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else if (fmat_ctx_.IsCUDA()) { } else if (fmat_ctx_.IsCUDA()) {
this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id); this->Info().feature_types.SetDevice(fmat_ctx_.Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else { } else {
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
// for cut reference. // for cut reference.
auto cuda_ctx = ctx->MakeCUDA(); auto cuda_ctx = ctx->MakeCUDA();
this->Info().feature_types.SetDevice(cuda_ctx.gpu_id); this->Info().feature_types.SetDevice(cuda_ctx.Device());
*ellpack_->Impl() = *ellpack_->Impl() =
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} }
@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId; this->ctx_.Init(Args{{"device", "cpu"}});
} }
void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
char const *c_values, bst_feature_t n_features, bool on_host) { bst_feature_t n_features, bool on_host) {
CHECK(on_host) << "Not implemented on device."; CHECK(on_host) << "Not implemented on device.";
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter( std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)}; StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
this->ctx_.gpu_id = Context::kCpuId; this->ctx_.Init(Args{{"device", "cpu"}});
} }
namespace cuda_impl { namespace cuda_impl {
@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) { if (adapter->Device().IsCPU()) {
// empty data // empty data
CHECK_EQ(this->Info().num_row_, 0); CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return; return;
} }
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
} }
void DMatrixProxy::FromCudaArray(StringView interface_str) { void DMatrixProxy::FromCudaArray(StringView interface_str) {
@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
this->batch_ = adapter; this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns(); this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows(); this->Info().num_row_ = adapter->NumRows();
if (adapter->DeviceIdx() < 0) { if (adapter->Device().IsCPU()) {
// empty data // empty data
CHECK_EQ(this->Info().num_row_, 0); CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return; return;
} }
ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
} }
namespace cuda_impl { namespace cuda_impl {
@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
public: public:
int DeviceIdx() const { return ctx_.gpu_id; } DeviceOrd Device() const { return ctx_.Device(); }
void SetCUDAArray(char const* c_interface) { void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport(); common::AssertGPUSupport();
@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
} }
if (batch.BaseMargin() != nullptr) { if (batch.BaseMargin() != nullptr) {
info_.base_margin_ = decltype(info_.base_margin_){ info_.base_margin_ = decltype(info_.base_margin_){
batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId}; batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
} }
if (batch.Qid() != nullptr) { if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
@ -10,9 +10,7 @@
#include "xgboost/context.h" // for Context #include "xgboost/context.h" // for Context
#include "xgboost/data.h" #include "xgboost/data.h"
namespace xgboost { namespace xgboost::data {
namespace data {
// Does not currently support metainfo as no on-device data source contains this // Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can // Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size // be supported in future. Does not currently support inferring row/column size
@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
DataSplitMode data_split_mode) { DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol) CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU."; << "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
: adapter->DeviceIdx(); ? DeviceOrd::CUDA(dh::CurrentDevice())
CHECK_GE(device, 0); : adapter->Device();
dh::safe_cuda(cudaSetDevice(device)); CHECK(device.IsCUDA());
dh::safe_cuda(cudaSetDevice(device.ordinal));
Context ctx; Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode); int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode); int nthread, DataSplitMode data_split_mode);
} // namespace data } // namespace xgboost::data
} // namespace xgboost
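The fallback above (and the similar one in the sparse-page path further down) boils down to: prefer the adapter's device, and only when the adapter holds no device data use the current CUDA device. A standalone restatement, assuming the repository's device_helpers.cuh for dh::CurrentDevice(); the function name is illustrative:

xgboost::DeviceOrd ChooseBuildDevice(xgboost::DeviceOrd adapter_device, std::size_t n_rows) {
  if (adapter_device.IsCPU() || n_rows == 0) {
    return xgboost::DeviceOrd::CUDA(dh::CurrentDevice());  // adapter carries no GPU data
  }
  return adapter_device;
}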
@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
int device_idx, float missing) { float missing) {
dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing); IsValidFunctor is_valid(missing);
// Count elements per row // Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
}); });
dh::XGBCachingDeviceAllocator<char> alloc; dh::XGBCachingDeviceAllocator<char> alloc;
thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data() + offset.size()),
thrust::device_pointer_cast(offset.data() + offset.size()), thrust::device_pointer_cast(offset.data()));
thrust::device_pointer_cast(offset.data()));
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
SparsePage* page) { SparsePage* page) {
bool valid = NoInfInData(batch, IsValidFunctor{missing}); bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData(); CHECK(valid) << error::InfInData();
@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
ellpack_page_source_.reset(); // make sure resource is released before making new ones. ellpack_page_source_.reset(); // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>( ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
ctx->Device());
} else { } else {
CHECK(sparse_page_source_); CHECK(sparse_page_source_);
ellpack_page_source_->Reset(); ellpack_page_source_->Reset();
@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
} // namespace detail } // namespace detail
void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
auto device = proxy->DeviceIdx(); auto device = proxy->Device();
if (device < 0) { if (device.IsCPU()) {
device = dh::CurrentDevice(); device = DeviceOrd::CUDA(dh::CurrentDevice());
} }
CHECK_GE(device, 0); CHECK(device.IsCUDA());
cuda_impl::Dispatch(proxy, cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); [&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
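
DevicePush above replaces the old `device < 0` test with DeviceOrd::IsCPU() and falls back to the current CUDA ordinal before dispatching. A sketch of that fallback under the same interface (ResolveDevice and its ordinal argument are illustrative; the commit itself queries dh::CurrentDevice() directly):

    #include <cstdint>
    #include "xgboost/context.h"

    xgboost::DeviceOrd ResolveDevice(xgboost::DeviceOrd device, std::int32_t current_cuda_ordinal) {
      if (device.IsCPU()) {
        // Data handed to the GPU path without an explicit device adopts the
        // currently active CUDA device, mirroring DevicePush above.
        device = xgboost::DeviceOrd::CUDA(current_cuda_ordinal);
      }
      return device;
    }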

View File

@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength(); bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees"); monitor_.Start("BoostNewTrees");
predt->predictions.SetDevice(ctx_->Ordinal()); predt->predictions.SetDevice(ctx_->Device());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength()); model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0); CHECK_NE(n_groups, 0);
@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
} else { } else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)}, linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
ctx_->Ordinal()}; ctx_->Device()};
bool update_predict = true; bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) { for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear(); node_position.clear();
@ -736,7 +736,7 @@ class Dart : public GBTree {
PredictionCacheEntry predts; // temporary storage for prediction PredictionCacheEntry predts; // temporary storage for prediction
if (ctx_->IsCUDA()) { if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id); predts.predictions.SetDevice(ctx_->Device());
} }
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
// multi-target is not yet supported. // multi-target is not yet supported.
@ -761,8 +761,8 @@ class Dart : public GBTree {
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size()); CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());
size_t n_rows = p_fmat->Info().num_row_; size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) { if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); p_out_preds->predictions.SetDevice(predts.predictions.Device());
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(), GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, n_groups, predts.predictions.DeviceSpan(), w, n_rows, n_groups,
group); group);
@ -801,8 +801,8 @@ class Dart : public GBTree {
StringView msg{"Unsupported data type for inplace predict."}; StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts; PredictionCacheEntry predts;
if (ctx_->gpu_id != Context::kCpuId) { if (ctx_->IsCUDA()) {
predts.predictions.SetDevice(ctx_->gpu_id); predts.predictions.SetDevice(ctx_->Device());
} }
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
@ -838,8 +838,8 @@ class Dart : public GBTree {
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());
size_t n_rows = p_fmat->Info().num_row_; size_t n_rows = p_fmat->Info().num_row_;
if (predts.predictions.DeviceIdx() != Context::kCpuId) { if (predts.predictions.Device().IsCUDA()) {
p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); p_out_preds->predictions.SetDevice(predts.predictions.Device());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,

View File

@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c
void LearnerModelParam::Copy(LearnerModelParam const& that) { void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape()); base_score_.Reshape(that.base_score_.Shape());
base_score_.Data()->SetDevice(that.base_score_.DeviceIdx()); base_score_.Data()->SetDevice(that.base_score_.Device());
base_score_.Data()->Copy(*that.base_score_.Data()); base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView(); std::as_const(base_score_).HostView();
if (that.base_score_.DeviceIdx() != Context::kCpuId) { if (!that.base_score_.Device().IsCPU()) {
std::as_const(base_score_).View(that.base_score_.Device()); std::as_const(base_score_).View(that.base_score_.Device());
} }
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) { if (p_fmat) {
auto const& info = p_fmat->Info(); auto const& info = p_fmat->Info();
info.Validate(Ctx()->Ordinal()); info.Validate(Ctx()->Device());
// We estimate it from input data. // We estimate it from input data.
linalg::Tensor<float, 1> base_score; linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score); InitEstimation(info, &base_score);
@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
monitor_.Init("Learner"); monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) { for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) { if (d) {
prediction_container_.Cache(d, Context::kCpuId); prediction_container_.Cache(d, DeviceOrd::CPU());
} }
} }
} }
@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration {
? std::numeric_limits<float>::quiet_NaN() ? std::numeric_limits<float>::quiet_NaN()
: obj_->ProbToMargin(mparam_.base_score)}, : obj_->ProbToMargin(mparam_.base_score)},
{1}, {1},
Context::kCpuId}, DeviceOrd::CPU()},
obj_->Task(), tparam_.multi_strategy); obj_->Task(), tparam_.multi_strategy);
if (attributes_.find("objective") != attributes_.cend()) { if (attributes_.find("objective") != attributes_.cend()) {
@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO {
this->ValidateDMatrix(train.get(), true); this->ValidateDMatrix(train.get(), true);
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); auto& predt = prediction_container_.Cache(train, ctx_.Device());
monitor_.Start("PredictRaw"); monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0); this->PredictRaw(train.get(), &predt, true, 0, 0);
@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO {
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in " << "The number of columns in gradient should be equal to the number of targets/classes in "
"the model."; "the model.";
auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); auto& predt = prediction_container_.Cache(train, ctx_.Device());
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter"); monitor_.Stop("BoostOneIter");
} }
@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO {
for (size_t i = 0; i < data_sets.size(); ++i) { for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i]; std::shared_ptr<DMatrix> m = data_sets[i];
auto &predt = prediction_container_.Cache(m, ctx_.gpu_id); auto &predt = prediction_container_.Cache(m, ctx_.Device());
this->ValidateDMatrix(m.get(), false); this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0); this->PredictRaw(m.get(), &predt, false, 0, 0);
auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions; auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
out.Resize(predt.predictions.Size()); out.Resize(predt.predictions.Size());
out.Copy(predt.predictions); out.Copy(predt.predictions);
@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) { } else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end); gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else { } else {
auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); auto& prediction = prediction_container_.Cache(data, ctx_.Device());
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API // Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.Device()); out_preds->SetDevice(ctx_.Device());
@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO {
void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
MetaInfo const& info = p_fmat->Info(); MetaInfo const& info = p_fmat->Info();
info.Validate(ctx_.gpu_id); info.Validate(ctx_.Device());
if (is_training) { if (is_training) {
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)

View File

@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
} }
void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) { void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
if (ctx_->gpu_id < 0) return; if (ctx_->IsCPU()) return;
num_row_ = static_cast<size_t>(p_fmat->Info().num_row_); num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);
@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
return; return;
} }
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
// The begin and end indices for the section of each column associated with // The begin and end indices for the section of each column associated with
// this device // this device
std::vector<std::pair<bst_uint, bst_uint>> column_segments; std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
++group_idx) { ++group_idx) {
// Get gradient // Get gradient
auto grad = GradientPair(0, 0); auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group); grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
} }
auto dbias = static_cast<float>( auto dbias = static_cast<float>(
@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
model->Bias()[group_idx] += dbias; model->Bias()[group_idx] += dbias;
// Update residual // Update residual
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group); UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
} }
} }
@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
bst_float &w = (*model)[fidx][group_idx]; bst_float &w = (*model)[fidx][group_idx];
// Get gradient // Get gradient
auto grad = GradientPair(0, 0); auto grad = GradientPair(0, 0);
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx); grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
} }
auto dw = static_cast<float>(tparam_.learning_rate * auto dw = static_cast<float>(tparam_.learning_rate *
@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
tparam_.reg_lambda_denorm)); tparam_.reg_lambda_denorm));
w += dw; w += dw;
if (ctx_->gpu_id >= 0) { if (ctx_->IsCUDA()) {
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx); UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
} }
} }
// This needs to be public because of the __device__ lambda. // This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) { GradientPair GetBiasGradient(int group_idx, int num_group) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto counting = thrust::make_counting_iterator(0ull); auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) { auto f = [=] __device__(size_t idx) {
return idx * num_group + group_idx; return idx * num_group + group_idx;
@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT
// This needs to be public because of the __device__ lambda. // This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) { GradientPair GetGradient(int group_idx, int num_group, int fidx) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]); common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_); common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);

View File

@ -23,8 +23,7 @@
#include "xgboost/linalg.h" #include "xgboost/linalg.h"
#include "xgboost/metric.h" #include "xgboost/metric.h"
namespace xgboost { namespace xgboost::metric {
namespace metric {
// tag the this file, used by force static link later. // tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc); DMLC_REGISTRY_FILE_TAG(auc);
/** /**
@ -257,10 +256,10 @@ template <typename Curve>
class EvalAUC : public MetricNoCache { class EvalAUC : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override { double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
double auc {0}; double auc {0};
if (ctx_->gpu_id != Context::kCpuId) { if (ctx_->Device().IsCUDA()) {
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
info.labels.SetDevice(ctx_->gpu_id); info.labels.SetDevice(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
} }
// We use the global size to handle empty dataset. // We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()}; std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0}; double auc{0};
uint32_t valid_groups = 0; uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(auc, valid_groups) = std::tie(auc, valid_groups) =
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads); RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else { } else {
@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0}; double auc{0};
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0); CHECK_NE(n_classes, 0);
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else { } else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
std::tuple<double, double, double> std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) { EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc; double fp, tp, auc;
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()}); common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
} }
public: public:
char const* Name() const override { [[nodiscard]] char const* Name() const override {
return "auc"; return "auc";
} }
}; };
@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
std::tuple<double, double, double> std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) { EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double pr, re, auc; double pr, re, auc;
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
std::tie(pr, re, auc) = std::tie(pr, re, auc) =
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()}); common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info, double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
size_t n_classes) { size_t n_classes) {
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auto n_threads = this->ctx_->Threads(); auto n_threads = this->ctx_->Threads();
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else { } else {
@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double auc{0}; double auc{0};
uint32_t valid_groups = 0; uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads(); auto n_threads = ctx_->Threads();
if (ctx_->gpu_id == Context::kCpuId) { if (ctx_->IsCPU()) {
auto labels = info.labels.Data()->ConstHostSpan(); auto labels = info.labels.Data()->ConstHostSpan();
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
InvalidLabels(); InvalidLabels();
@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
} }
public: public:
const char *Name() const override { return "aucpr"; } [[nodiscard]] const char *Name() const override { return "aucpr"; }
}; };
XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
return {}; return {};
} }
#endif #endif
} // namespace metric } // namespace xgboost::metric
} // namespace xgboost

View File

@ -824,7 +824,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts, common::Span<float const> predts,
MetaInfo const &info, MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) { std::shared_ptr<DeviceAUCCache> *p_cache) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
if (predts.empty()) { if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0)); return std::make_pair(0.0, static_cast<uint32_t>(0));
} }

View File

@ -127,24 +127,24 @@ class MultiClassMetricsReduction {
#endif // XGBOOST_USE_CUDA #endif // XGBOOST_USE_CUDA
PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class, PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
const HostDeviceVector<bst_float>& weights, const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels, const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (device < 0) { if (device.IsCPU()) {
result = result =
CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads()); CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
device_ = tparam.gpu_id; device_ = ctx.Device();
preds.SetDevice(device_); preds.SetDevice(device_);
labels.SetDevice(device_); labels.SetDevice(device_);
weights.SetDevice(device_); weights.SetDevice(device_);
dh::safe_cuda(cudaSetDevice(device_)); dh::safe_cuda(cudaSetDevice(device_.ordinal));
result = DeviceReduceMetrics(weights, labels, preds, n_class); result = DeviceReduceMetrics(weights, labels, preds, n_class);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
@ -154,7 +154,7 @@ class MultiClassMetricsReduction {
private: private:
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_; dh::PinnedMemory label_error_;
int device_{-1}; DeviceOrd device_{DeviceOrd::CPU()};
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
}; };
@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache {
CHECK_GE(nclass, 1U) CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification," << "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification"; << " use logloss for binary classification";
int device = ctx_->gpu_id; auto device = ctx_->Device();
auto result = auto result =
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds); reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
dat[0] = result.Residue(); dat[0] = result.Residue();
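
The reduction helper above also changes its stored device from a raw `int device_{-1}` to a DeviceOrd defaulting to CPU, so the -1 sentinel disappears. A one-line sketch of that member migration (ReductionState is an illustrative name):

    #include "xgboost/context.h"

    struct ReductionState {
      // Was: int device_{-1};  the typed default makes the CPU case explicit.
      xgboost::DeviceOrd device_{xgboost::DeviceOrd::CPU()};
    };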

View File

@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
auto d_gptr = p_cache->DataGroupPtr(ctx); auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK(); auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
} }
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());
auto d_group_ptr = p_cache->DataGroupPtr(ctx); auto d_group_ptr = p_cache->DataGroupPtr(ctx);

View File

@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction {
const HostDeviceVector<bst_float>& preds) { const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result; PackedReduceResult result;
if (ctx.gpu_id < 0) { if (ctx.IsCPU()) {
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
preds, ctx.Threads()); preds, ctx.Threads());
} }
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
else { // NOLINT else { // NOLINT
preds.SetDevice(ctx.gpu_id); preds.SetDevice(ctx.Device());
labels_lower_bound.SetDevice(ctx.gpu_id); labels_lower_bound.SetDevice(ctx.Device());
labels_upper_bound.SetDevice(ctx.gpu_id); labels_upper_bound.SetDevice(ctx.Device());
weights.SetDevice(ctx.gpu_id); weights.SetDevice(ctx.Device());
dh::safe_cuda(cudaSetDevice(ctx.gpu_id)); dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds); result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)

View File

@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree); predt, alpha, p_tree);
} else { } else {
position.SetDevice(ctx->gpu_id); position.SetDevice(ctx->Device());
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate, detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree); predt, alpha, p_tree);
} }

View File

@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {
template <typename Distribution> template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info, void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device, linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
bool is_null_weight, float aft_loss_distribution_scale) { bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, [=] XGBOOST_DEVICE(size_t _idx,
@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
CHECK_EQ(info.labels_upper_bound_.Size(), ndata); CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1); out_gpair->Reshape(ndata, 1);
const int device = ctx_->gpu_id; const auto device = ctx_->Device();
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0; const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) { if (!is_null_weight) {
@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
_preds[_idx] = exp(_preds[_idx]); _preds[_idx] = exp(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2018-2022 by XGBoost Contributors * Copyright 2018-2023, XGBoost Contributors
* \file hinge.cc * \file hinge.cc
* \brief Provides an implementation of the hinge loss function * \brief Provides an implementation of the hinge loss function
* \author Henry Gouk * \author Henry Gouk
@ -13,8 +13,7 @@
#include "../common/transform.h" #include "../common/transform.h"
#include "../common/common.h" #include "../common/common.h"
namespace xgboost { namespace xgboost::obj {
namespace obj {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu); DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h); _out_gpair[_idx] = GradientPair(g, h);
}, },
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
ctx_->gpu_id).Eval( ctx_->Device()).Eval(
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
} }
@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
const char* DefaultEvalMetric() const override { [[nodiscard]] const char* DefaultEvalMetric() const override {
return "error"; return "error";
} }
@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
.describe("Hinge loss. Expects labels to be in [0,1f]") .describe("Hinge loss. Expects labels to be in [0,1f]")
.set_body([]() { return new HingeObj(); }); .set_body([]() { return new HingeObj(); });
} // namespace obj } // namespace xgboost::obj
} // namespace xgboost

View File

@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
CheckInitInputs(info); CheckInitInputs(info);
} }
// Avoid altering any state in child objective. // Avoid altering any state in child objective.
HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id); linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());
Json config{Object{}}; Json config{Object{}};
this->SaveConfig(&config); this->SaveConfig(&config);

View File

@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept {
// Update position biased for unbiased click data // Update position biased for unbiased click data
void UpdatePositionBias() { void UpdatePositionBias() {
li_full_.SetDevice(ctx_->gpu_id); li_full_.SetDevice(ctx_->Device());
lj_full_.SetDevice(ctx_->gpu_id); lj_full_.SetDevice(ctx_->Device());
li_.SetDevice(ctx_->gpu_id); li_.SetDevice(ctx_->Device());
lj_.SetDevice(ctx_->gpu_id); lj_.SetDevice(ctx_->Device());
if (ctx_->IsCPU()) { if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),

View File

@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::VectorView<double> li, linalg::VectorView<double> lj, linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) { linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate // boilerplate
std::int32_t device_id = ctx->gpu_id; auto device = ctx->Device();
dh::safe_cuda(cudaSetDevice(device_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
auto n_groups = p_cache->Groups(); auto n_groups = p_cache->Groups();
info.labels.SetDevice(device_id); info.labels.SetDevice(device);
preds.SetDevice(device_id); preds.SetDevice(device);
out_gpair->SetDevice(ctx->Device()); out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(preds.Size(), 1); out_gpair->Reshape(preds.Size(), 1);

View File

@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class; const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass); const auto ndata = static_cast<int64_t>(preds.Size() / nclass);
auto device = ctx_->gpu_id; auto device = ctx_->Device();
out_gpair->SetDevice(device); out_gpair->SetDevice(device);
info.labels.SetDevice(device); info.labels.SetDevice(device);
info.weights_.SetDevice(device); info.weights_.SetDevice(device);
@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class; const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass); const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);
auto device = io_preds->DeviceIdx(); auto device = io_preds->Device();
if (prob) { if (prob) {
common::Transform<>::Init( common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) { [=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {

View File

@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction {
out_gpair->Reshape(info.num_row_, n_targets); out_gpair->Reshape(info.num_row_, n_targets);
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_; auto n_samples = info.num_row_;
alpha_.SetDevice(ctx_->gpu_id); alpha_.SetDevice(ctx_->Device());
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
linalg::ElementWiseKernel( linalg::ElementWiseKernel(
@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction {
CHECK(!alpha_.Empty()); CHECK(!alpha_.Empty());
auto n_targets = this->Targets(info); auto n_targets = this->Targets(info);
base_score->SetDevice(ctx_->gpu_id); base_score->SetDevice(ctx_->Device());
base_score->Reshape(n_targets); base_score->Reshape(n_targets);
double sw{0}; double sw{0};
@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction {
} }
} else { } else {
#if defined(XGBOOST_USE_CUDA) #if defined(XGBOOST_USE_CUDA)
alpha_.SetDevice(ctx_->gpu_id); alpha_.SetDevice(ctx_->Device());
auto d_alpha = alpha_.ConstDeviceSpan(); auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device()); auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>( auto seg_it = dh::MakeTransformIterator<std::size_t>(
@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction {
val_it + n, base_score->Data()); val_it + n, base_score->Data());
sw = info.num_row_; sw = info.num_row_;
} else { } else {
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
auto d_weights = info.weights_.ConstDeviceSpan(); auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul), auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { [=] XGBOOST_DEVICE(std::size_t i) {

View File

@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {
size_t const ndata = preds.Size(); size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
auto device = ctx_->gpu_id; auto device = ctx_->Device();
bool is_null_weight = info.weights_.Size() == 0; bool is_null_weight = info.weights_.Size() == 0;
auto scale_pos_weight = param_.scale_pos_weight; auto scale_pos_weight = param_.scale_pos_weight;
@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
additional_input_.HostVector().begin()[1] = is_null_weight; additional_input_.HostVector().begin()[1] = is_null_weight;
const size_t nthreads = ctx_->Threads(); const size_t nthreads = ctx_->Threads();
bool on_device = device >= 0; bool on_device = device.IsCUDA();
// On CPU we run the transformation each thread processing a contigious block of data // On CPU we run the transformation each thread processing a contigious block of data
// for better performance. // for better performance.
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads)); const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
_preds[_idx] = Loss::PredTransform(_preds[_idx]); _preds[_idx] = Loss::PredTransform(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept {
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
auto labels = info.labels.View(ctx_->Device()); auto labels = info.labels.View(ctx_->Device());
out_gpair->SetDevice(ctx_->gpu_id); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept {
size_t const ndata = preds.Size(); size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id; auto device = ctx_->Device();
label_correct_.Resize(1); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size(); const size_t ndata = preds.Size();
auto device = ctx_->gpu_id; auto device = ctx_->Device();
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
label_correct_.Resize(1); label_correct_.Resize(1);
@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override { void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept {
out_gpair->SetDevice(ctx_->Device()); out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto device = ctx_->gpu_id; auto device = ctx_->Device();
label_correct_.Resize(1); label_correct_.Resize(1);
label_correct_.Fill(1); label_correct_.Fill(1);
@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept {
_preds[_idx] = expf(_preds[_idx]); _preds[_idx] = expf(_preds[_idx]);
}, },
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(), common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->DeviceIdx()) io_preds->Device())
.Eval(io_preds); .Eval(io_preds);
} }
@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction {
out_gpair->Reshape(info.num_row_, this->Targets(info)); out_gpair->Reshape(info.num_row_, this->Targets(info));
auto gpair = out_gpair->View(ctx_->Device()); auto gpair = out_gpair->View(ctx_->Device());
preds.SetDevice(ctx_->gpu_id); preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds); auto predt = linalg::MakeVec(&preds);
info.weights_.SetDevice(ctx_->gpu_id); info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()}; : info.weights_.ConstDeviceSpan()};
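
The objective hunks repeat one idiom: pin a HostDeviceVector to ctx_->Device() and then pick the host or device span based on the context. A sketch of that idiom, assuming the HostDeviceVector interface used above (AccessWeights is illustrative):

    #include "xgboost/context.h"
    #include "xgboost/host_device_vector.h"
    #include "xgboost/span.h"

    xgboost::common::Span<float const> AccessWeights(
        xgboost::Context const* ctx, xgboost::HostDeviceVector<float> const& weights) {
      weights.SetDevice(ctx->Device());  // pin to the context's device before taking a span
      return ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan();
    }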

View File

@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows, bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) : size_t entry_start, float missing)
batch{batch}, : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
columns{num_features}, extern __shared__ float _smem[];
use_shared{use_shared}, smem = _smem;
is_valid{missing} { if (use_shared) {
extern __shared__ float _smem[]; uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
smem = _smem; size_t shared_elements = blockDim.x * num_features;
if (use_shared) { dh::BlockFill(smem, shared_elements, nanf(""));
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; __syncthreads();
size_t shared_elements = blockDim.x * num_features; if (global_idx < num_rows) {
dh::BlockFill(smem, shared_elements, nanf("")); auto beg = global_idx * columns;
__syncthreads(); auto end = (global_idx + 1) * columns;
if (global_idx < num_rows) { for (size_t i = beg; i < end; ++i) {
auto beg = global_idx * columns; auto value = batch.GetElement(i).value;
auto end = (global_idx + 1) * columns; if (is_valid(value)) {
for (size_t i = beg; i < end; ++i) { smem[threadIdx.x * num_features + (i - beg)] = value;
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
} }
} }
} }
__syncthreads();
} }
__syncthreads();
}
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) { if (use_shared) {
return smem[threadIdx.x * columns + fidx]; return smem[threadIdx.x * columns + fidx];
} }
@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT size_t tree_end_; // NOLINT
int num_group; int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
// Copy decision trees to device // Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id); tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector(); auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1); h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0; size_t sum = 0;
@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum); h_tree_segments.push_back(sum);
} }
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id); nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id); stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer(); auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer(); auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@ -369,12 +366,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
} }
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id); tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector(); auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
// Initialize categorical splits. // Initialize categorical splits.
split_types.SetDevice(gpu_id); split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector(); std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back()); h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -383,8 +380,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]); h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
} }
categories = HostDeviceVector<uint32_t>({}, gpu_id); categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id); categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector(); std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector(); std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -397,7 +394,7 @@ class DeviceModel {
} }
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>( categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id); h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments = std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector(); categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@ -485,8 +482,8 @@ struct PathInfo {
void ExtractPaths( void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths, dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories, DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) { DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id)); dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model; auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size()); dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@ -773,12 +770,12 @@ class ColumnSplitHelper {
template <bool predict_leaf> template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model, void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const { bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{}; dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{}; dh::caching_device_vector<BitType> missing_storage{};
auto constexpr kBlockThreads = 128; auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id); auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes = auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes); SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0; auto const use_shared = shared_memory_bytes != 0;
@ -791,8 +788,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)}; BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)}; BitVector missing_bits{dh::ToSpan(missing_storage)};
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0; std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@ -823,9 +820,9 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage, void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const { dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>( collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size()); ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>( collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size()); ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
} }
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage, void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features, size_t num_features,
HostDeviceVector<bst_float>* predictions, HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const { size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128; const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size(); size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS)); auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes = size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes); SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0; bool use_shared = shared_memory_bytes != 0;
@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) { if (tree_end - tree_begin == 0) {
return; return;
} }
out_preds->SetDevice(ctx_->gpu_id); out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info(); auto const& info = dmat->Info();
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); d_model.Init(model, tree_begin, tree_end, ctx_->Device());
if (info.IsColumnSplit()) { if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor {
} else { } else {
size_t batch_offset = 0; size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) { for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id); dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal( this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types), page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model, d_model,
out_preds, out_preds,
batch_offset); batch_offset);
@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {} : Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override { ~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
} }
} }
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin, const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override { uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id; CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
auto* out_preds = &predts->predictions; auto* out_preds = &predts->predictions;
if (tree_end == 0) { if (tree_end == 0) {
tree_end = model.trees.size(); tree_end = model.trees.size();
@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x); auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model."; << "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", " << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->DeviceIdx(); << "but data is on: " << m->Device().Name();
if (p_m) { if (p_m) {
p_m->Info().num_row_ = m->NumRows(); p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model); this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows(); info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model); this->InitOutPredictions(info, &(out_preds->predictions), model);
} }
out_preds->predictions.SetDevice(m->DeviceIdx()); out_preds->predictions.SetDevice(m->Device());
const uint32_t BLOCK_THREADS = 128; const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS)); auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx()); auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes = size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes); SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx()); d_model.Init(model, tree_begin, tree_end, m->Device());
bool use_shared = shared_memory_bytes != 0; bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0; size_t entry_start = 0;
@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor {
} }
CHECK(!p_fmat->Info().IsColumnSplit()) CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented."; << "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id); out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths; device_paths;
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id); d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories; dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) { for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature); model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis)); dh::tend(phis));
} }
// Add the base margin term to last column // Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_); auto base_score = model.learner_model_param->BaseScore(ctx_);
@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) { if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented; LOG(FATAL) << "Dart booster feature " << not_implemented;
} }
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id); out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths; device_paths;
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id); d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories; dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) { for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature); model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis)); dh::tend(phis));
} }
// Add the base margin term to last column // Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_); auto base_score = model.learner_model_param->BaseScore(ctx_);
@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions, void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model, const gbm::GBTreeModel &model,
unsigned tree_end) const override { unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
const MetaInfo& info = p_fmat->Info(); const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_; bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) { if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size()); tree_end = static_cast<uint32_t>(model.trees.size());
} }
predictions->SetDevice(ctx_->gpu_id); predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end); predictions->Resize(num_rows * tree_end);
DeviceModel d_model; DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id); d_model.Init(model, 0, tree_end, this->ctx_->Device());
if (info.IsColumnSplit()) { if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model); column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return; return;
} }
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
constexpr uint32_t kBlockThreads = 128; constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>( size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes); info.num_col_, max_shared_memory_bytes);
@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor {
if (p_fmat->PageExists<SparsePage>()) { if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) { for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id); batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id); batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0; bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature}; model.learner_model_param->num_feature};
@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor {
} else { } else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) { for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0; bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)}; EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size(); size_t num_rows = batch.Size();
auto grid = auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads)); static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor {
private: private:
/*! \brief Reconfigure the device when GPU is changed. */ /*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) { static size_t ConfigureDevice(DeviceOrd device) {
if (device >= 0) { if (device.IsCUDA()) {
return dh::MaxSharedMemory(device); return dh::MaxSharedMemory(device.ordinal);
} }
return 0; return 0;
} }
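
As a rough illustration of the pattern applied throughout this predictor file, helpers that used to take a raw int ordinal and test device >= 0 now take a typed device descriptor and branch on its kind. The sketch below is a minimal, self-contained approximation of that shape; Dev and MaxSharedMemoryFor are hypothetical stand-ins invented for this example, not the real XGBoost types.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical typed device descriptor used only for this sketch.
    struct Dev {
      enum Kind : std::int16_t { kCPU, kCUDA } kind{kCPU};
      std::int16_t ordinal{-1};
      bool IsCUDA() const { return kind == kCUDA; }
    };

    // Hypothetical query; a real build would ask the CUDA runtime for this value.
    std::size_t MaxSharedMemoryFor(std::int16_t /*ordinal*/) { return 48 * 1024; }

    // Old shape: static size_t ConfigureDevice(int device) { if (device >= 0) ... }
    // New shape: branch on the descriptor instead of comparing against a sentinel.
    std::size_t ConfigureDevice(Dev device) {
      return device.IsCUDA() ? MaxSharedMemoryFor(device.ordinal) : 0;
    }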

View File

@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_}; std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data(); const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) { if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->gpu_id); out_preds->SetDevice(ctx_->Device());
} }
if (!base_margin->Empty()) { if (!base_margin->Empty()) {
out_preds->Resize(n); out_preds->Resize(n);

View File

@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant #include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
namespace xgboost { namespace xgboost::tree {
namespace tree {
namespace cpu_impl { namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info, void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::TensorView<GradientPair const, 2> gpair,
@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien
void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair, void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) { bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id); out->SetDevice(ctx->Device());
out->Reshape(n_targets); out->Reshape(n_targets);
gpair.SetDevice(ctx->Device()); gpair.SetDevice(ctx->Device());
@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
} }
} // namespace tree } // namespace xgboost::tree
} // namespace xgboost

View File

@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ #include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span #include "xgboost/span.h" // span
namespace xgboost { namespace xgboost::tree::cuda_impl {
namespace tree {
namespace cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info, void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) { linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size(); auto n_targets = out.Size();
@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()), collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2); d_sum.Size() * 2);
thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
}); });
} }
} // namespace cuda_impl } // namespace xgboost::tree::cuda_impl
} // namespace tree
} // namespace xgboost
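
Several of the files touched here also collapse the old two-level namespace blocks into a single C++17 nested-namespace definition, which is what the changed closing comments above reflect. A generic before/after sketch with placeholder names:

    // Before: two nested blocks, two closing braces to keep in sync.
    namespace outer { namespace inner {
    int BeforeStyle() { return 1; }
    }  // namespace inner
    }  // namespace outer

    // After (C++17): one nested-namespace definition, one closing brace.
    namespace outer::inner {
    int AfterStyle() { return 2; }
    }  // namespace outer::inner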

View File

@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
auto const world_size = collective::GetWorldSize(); auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size); dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage); auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(), collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate)); out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers. // Reduce to get the best candidate from all workers.

View File

@ -85,7 +85,7 @@ class GPUHistEvaluator {
std::size_t node_categorical_storage_size_ = 0; std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise? // Is the data split column-wise?
bool is_column_split_ = false; bool is_column_split_ = false;
int32_t device_; DeviceOrd device_;
// Copy the categories from device to host asynchronously. // Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx); void CopyToHost( const std::vector<bst_node_t>& nidx);
@ -133,14 +133,14 @@ class GPUHistEvaluator {
} }
public: public:
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device) GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
: tree_evaluator_{param, n_features, device}, param_{param} {} : tree_evaluator_{param, n_features, device}, param_{param} {}
/** /**
* \brief Reset the evaluator, should be called before any use. * \brief Reset the evaluator, should be called before any use.
*/ */
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bool is_column_split, bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device); DeviceOrd device);
/** /**
* \brief Get host category storage for nidx. Different from the internal version, this * \brief Get host category storage for nidx. Different from the internal version, this

View File

@ -1,5 +1,5 @@
/*! /*!
* Copyright 2022 by XGBoost Contributors * Copyright 2022-2023 by XGBoost Contributors
* *
* \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
* compilation time. * compilation time.
@ -12,11 +12,10 @@
#include "evaluate_splits.cuh" #include "evaluate_splits.cuh"
#include "xgboost/data.h" #include "xgboost/data.h"
namespace xgboost { namespace xgboost::tree {
namespace tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) { bool is_column_split, DeviceOrd device) {
param_ = param; param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device}; tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical(); has_categoricals_ = cuts.HasCategorical();
@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
}); });
return dh::ToSpan(cat_sorted_idx_); return dh::ToSpan(cat_sorted_idx_);
} }
} // namespace xgboost::tree
} // namespace tree
} // namespace xgboost

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2020 by XGBoost Contributors * Copyright 2020-2023 by XGBoost Contributors
*/ */
#ifndef FEATURE_GROUPS_CUH_ #ifndef FEATURE_GROUPS_CUH_
#define FEATURE_GROUPS_CUH_ #define FEATURE_GROUPS_CUH_
@ -102,11 +102,10 @@ struct FeatureGroups {
InitSingle(cuts); InitSingle(cuts);
} }
FeatureGroupsAccessor DeviceAccessor(int device) const { [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
feature_segments.SetDevice(device); feature_segments.SetDevice(device);
bin_segments.SetDevice(device); bin_segments.SetDevice(device);
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
max_group_bins};
} }
private: private:
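
The DeviceAccessor change above also adds [[nodiscard]], so a caller that silently drops the returned view gets a compiler warning. A small generic illustration; Accessor and Table are made-up names for this sketch:

    #include <cstddef>
    #include <vector>

    struct Accessor {
      const int* data{nullptr};
      std::size_t size{0};
    };

    class Table {
     public:
      // Discarding the returned view is almost certainly a bug, so flag it.
      [[nodiscard]] Accessor DeviceAccessor() const { return {values_.data(), values_.size()}; }

     private:
      std::vector<int> values_{1, 2, 3};
    };

    // Table{}.DeviceAccessor();              // warns: result of nodiscard call ignored
    // auto view = Table{}.DeviceAccessor();  // intended usage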

View File

@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) { for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl(); auto page = batch.Impl();
if (!page_) { if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense, page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_); page->row_stride, dmat->Info().num_row_);
} }
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset); size_t num_elements = page_->Copy(ctx->Device(), page, offset);
offset += num_elements; offset += num_elements;
} }
page_concatenated_ = true; page_concatenated_ = true;
@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
auto first_page = (*batch_iterator.begin()).Impl(); auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows. // Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows)); first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page. // Compact the ELLPACK pages into the single sample page.
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) { for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
} }
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
auto first_page = (*batch_iterator.begin()).Impl(); auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows. // Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows)); first_page->row_stride, sample_rows));
// Compact the ELLPACK pages into the single sample page. // Compact the ELLPACK pages into the single sample page.
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) { for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
} }
return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
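
Both sampling paths above keep the comment about releasing the old ELLPACK page before allocating its replacement; on the GPU this matters because holding the old and new pages at the same time can double peak device memory. A minimal host-side sketch of the same idiom with std::unique_ptr, where Page is a hypothetical placeholder for a large buffer:

    #include <cstddef>
    #include <memory>
    #include <vector>

    // Hypothetical large buffer standing in for an ELLPACK page.
    struct Page {
      explicit Page(std::size_t n) : storage(n) {}
      std::vector<float> storage;
    };

    void Rebuild(std::unique_ptr<Page>* page, std::size_t new_size) {
      page->reset();                    // free the old buffer first ...
      page->reset(new Page(new_size));  // ... so only one allocation is live at a time
    }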

View File

@ -13,15 +13,15 @@
namespace xgboost { namespace xgboost {
namespace tree { namespace tree {
RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
} }
RowPartitioner::~RowPartitioner() { RowPartitioner::~RowPartitioner() {
dh::safe_cuda(cudaSetDevice(device_idx_)); dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
} }
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) { common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
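
The constructor above puts every partition into the same initial state: one root segment covering all rows and a row-index buffer holding the identity permutation (filled with thrust::sequence on the device). A host-side sketch of that initial state, using hypothetical stand-ins for Segment and NodePositionInfo:

    #include <cstddef>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    struct Segment { std::size_t begin{0}, end{0}; };
    struct NodePositionInfo { Segment segment; };

    struct RowPartitionSketch {
      std::vector<NodePositionInfo> ridx_segments;
      std::vector<std::uint32_t> ridx;

      explicit RowPartitionSketch(std::size_t num_rows) : ridx(num_rows) {
        // A single root node owns every row to start with.
        ridx_segments.push_back(NodePositionInfo{Segment{0, num_rows}});
        // Identity permutation: row i initially sits at position i.
        std::iota(ridx.begin(), ridx.end(), 0u);
      }
    };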

View File

@ -199,7 +199,7 @@ class RowPartitioner {
static constexpr bst_node_t kIgnoredTreePosition = -1; static constexpr bst_node_t kIgnoredTreePosition = -1;
private: private:
int device_idx_; DeviceOrd device_idx_;
/*! \brief In here if you want to find the rows belong to a node nid, first you need to /*! \brief In here if you want to find the rows belong to a node nid, first you need to
* get the indices segment from ridx_segments[nid], then get the row index that * get the indices segment from ridx_segments[nid], then get the row index that
* represents position of row in input data X. `RowPartitioner::GetRows` would be a * represents position of row in input data X. `RowPartitioner::GetRows` would be a
@ -223,7 +223,7 @@ class RowPartitioner {
dh::PinnedMemory pinned2_; dh::PinnedMemory pinned2_;
public: public:
RowPartitioner(int device_idx, size_t num_rows); RowPartitioner(DeviceOrd device_idx, size_t num_rows);
~RowPartitioner(); ~RowPartitioner();
RowPartitioner(const RowPartitioner&) = delete; RowPartitioner(const RowPartitioner&) = delete;
RowPartitioner& operator=(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete;

View File

@ -477,7 +477,7 @@ class HistEvaluator {
: ctx_{ctx}, : ctx_{ctx},
param_{param}, param_{param},
column_sampler_{std::move(sampler)}, column_sampler_{std::move(sampler)},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId}, tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
is_col_split_{info.IsColumnSplit()} { is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_); interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),
@ -696,7 +696,7 @@ class HistMultiEvaluator {
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1); gain_.resize(1);
linalg::Vector<float> weight({n_targets}, ctx_->gpu_id); linalg::Vector<float> weight({n_targets}, ctx_->Device());
CalcWeight(*param_, root_sum, weight.HostView()); CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain; gain_.front() = root_gain;

View File

@ -1,5 +1,5 @@
/*! /**
* Copyright 2018-2020 by Contributors * Copyright 2018-2023 by Contributors
* \file split_evaluator.h * \file split_evaluator.h
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
* \author Henry Gouk * \author Henry Gouk
@ -23,8 +23,7 @@
#include "xgboost/host_device_vector.h" #include "xgboost/host_device_vector.h"
#include "xgboost/tree_model.h" #include "xgboost/tree_model.h"
namespace xgboost { namespace xgboost::tree {
namespace tree {
class TreeEvaluator { class TreeEvaluator {
// hist and exact use parent id to calculate constraints. // hist and exact use parent id to calculate constraints.
static constexpr bst_node_t kRootParentId = static constexpr bst_node_t kRootParentId =
@ -33,13 +32,13 @@ class TreeEvaluator {
HostDeviceVector<float> lower_bounds_; HostDeviceVector<float> lower_bounds_;
HostDeviceVector<float> upper_bounds_; HostDeviceVector<float> upper_bounds_;
HostDeviceVector<int32_t> monotone_; HostDeviceVector<int32_t> monotone_;
int32_t device_; DeviceOrd device_;
bool has_constraint_; bool has_constraint_;
public: public:
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) { TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
device_ = device; device_ = device;
if (device != Context::kCpuId) { if (device.IsCUDA()) {
lower_bounds_.SetDevice(device); lower_bounds_.SetDevice(device);
upper_bounds_.SetDevice(device); upper_bounds_.SetDevice(device);
monotone_.SetDevice(device); monotone_.SetDevice(device);
@ -59,7 +58,7 @@ class TreeEvaluator {
has_constraint_ = true; has_constraint_ = true;
} }
if (device_ != Context::kCpuId) { if (device_.IsCUDA()) {
// Pull to device early. // Pull to device early.
lower_bounds_.ConstDeviceSpan(); lower_bounds_.ConstDeviceSpan();
upper_bounds_.ConstDeviceSpan(); upper_bounds_.ConstDeviceSpan();
@ -122,7 +121,7 @@ class TreeEvaluator {
} }
// Fast floating point division instruction on device // Fast floating point division instruction on device
XGBOOST_DEVICE float Divide(float a, float b) const { [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
return __fdividef(a, b); return __fdividef(a, b);
#else #else
@ -154,7 +153,7 @@ class TreeEvaluator {
public: public:
/* Get a view to the evaluator that can be passed down to device. */ /* Get a view to the evaluator that can be passed down to device. */
template <typename ParamT = TrainParam> auto GetEvaluator() const { template <typename ParamT = TrainParam> auto GetEvaluator() const {
if (device_ != Context::kCpuId) { if (device_.IsCUDA()) {
auto constraints = monotone_.ConstDevicePointer(); auto constraints = monotone_.ConstDevicePointer();
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(), return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
upper_bounds_.ConstDevicePointer(), has_constraint_}; upper_bounds_.ConstDevicePointer(), has_constraint_};
@ -215,7 +214,6 @@ enum SplitType {
// partition-based categorical split // partition-based categorical split
kPart = 2 kPart = 2
}; };
} // namespace tree } // namespace xgboost::tree
} // namespace xgboost
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_ #endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_
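
The Divide helper shown above is a host/device function: when nvcc compiles the device pass, __CUDA_ARCH__ is defined and the fast (lower-precision) __fdividef intrinsic is used, while a host compiler falls through to ordinary division. A simplified stand-alone sketch of that dispatch; MY_HOST_DEVICE is a stand-in macro for this example, not the project's own qualifier:

    #if defined(__CUDACC__)
    #define MY_HOST_DEVICE __host__ __device__
    #else
    #define MY_HOST_DEVICE
    #endif

    // The same source compiles for host and device; only the device pass takes the intrinsic.
    MY_HOST_DEVICE inline float FastDivide(float a, float b) {
    #if defined(__CUDA_ARCH__)
      return __fdividef(a, b);  // fast device intrinsic, reduced precision
    #else
      return a / b;
    #endif
    }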

View File

@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
: param_(param), : param_(param),
colmaker_train_param_{colmaker_train_param}, colmaker_train_param_{colmaker_train_param},
ctx_{ctx}, ctx_{ctx},
tree_evaluator_(param_, column_densities.size(), Context::kCpuId), tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
interaction_constraints_{std::move(_interaction_constraints)}, interaction_constraints_{std::move(_interaction_constraints)},
column_densities_(column_densities) {} column_densities_(column_densities) {}
// update one tree, growing // update one tree, growing

View File

@ -74,7 +74,7 @@ class DeviceHistogramStorage {
dh::device_vector<typename GradientSumT::ValueT> overflow_; dh::device_vector<typename GradientSumT::ValueT> overflow_;
std::map<int, size_t> overflow_nidx_map_; std::map<int, size_t> overflow_nidx_map_;
int n_bins_; int n_bins_;
int device_id_; DeviceOrd device_id_;
static constexpr size_t kNumItemsInGradientSum = static constexpr size_t kNumItemsInGradientSum =
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
@ -82,7 +82,7 @@ class DeviceHistogramStorage {
public: public:
// Start with about 16mb // Start with about 16mb
DeviceHistogramStorage() { data_.reserve(1 << 22); } DeviceHistogramStorage() { data_.reserve(1 << 22); }
void Init(int device_id, int n_bins) { void Init(DeviceOrd device_id, int n_bins) {
this->n_bins_ = n_bins; this->n_bins_ = n_bins;
this->device_id_ = device_id; this->device_id_ = device_id;
} }
@ -196,7 +196,7 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows, common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler, TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
uint32_t n_features, BatchParam batch_param, MetaInfo const& info) uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
: evaluator_{_param, n_features, ctx->gpu_id}, : evaluator_{_param, n_features, ctx->Device()},
ctx_(ctx), ctx_(ctx),
feature_types{_feature_types}, feature_types{_feature_types},
param(std::move(_param)), param(std::move(_param)),
@ -211,7 +211,7 @@ struct GPUHistMakerDevice {
} }
CHECK(column_sampler_); CHECK(column_sampler_);
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
} }
~GPUHistMakerDevice() = default; ~GPUHistMakerDevice() = default;
@ -220,7 +220,7 @@ struct GPUHistMakerDevice {
if (!feature_groups) { if (!feature_groups) {
CHECK(page); CHECK(page);
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense, feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
dh::MaxSharedMemoryOptin(ctx_->gpu_id), dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
sizeof(GradientPairPrecise)); sizeof(GradientPairPrecise));
} }
} }
@ -231,7 +231,7 @@ struct GPUHistMakerDevice {
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
param.colsample_bynode, param.colsample_bylevel, param.colsample_bynode, param.colsample_bylevel,
param.colsample_bytree); param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
this->interaction_constraints.Reset(); this->interaction_constraints.Reset();
@ -246,15 +246,15 @@ struct GPUHistMakerDevice {
gpair = sample.gpair; gpair = sample.gpair;
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->gpu_id); dmat->Info().IsColumnSplit(), ctx_->Device());
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info()); quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows); row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
// Init histogram // Init histogram
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); hist.Init(ctx_->Device(), page->Cuts().TotalBins());
hist.Reset(); hist.Reset();
this->InitFeatureGroupsOnce(); this->InitFeatureGroupsOnce();
@ -267,7 +267,7 @@ struct GPUHistMakerDevice {
sampled_features->SetDevice(ctx_->Device()); sampled_features->SetDevice(ctx_->Device());
common::Span<bst_feature_t> feature_set = common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
EvaluateSplitSharedInputs shared_inputs{ EvaluateSplitSharedInputs shared_inputs{
gpu_param, gpu_param,
@ -289,7 +289,7 @@ struct GPUHistMakerDevice {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size()); dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
std::vector<bst_node_t> nidx(2 * candidates.size()); std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size()); auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto matrix = page->GetDeviceAccessor(ctx_->Device());
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types, EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
matrix.feature_segments, matrix.gidx_fvalue_map, matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue, matrix.min_fvalue,
@ -342,9 +342,9 @@ struct GPUHistMakerDevice {
void BuildHist(int nidx) { void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx);
auto d_ridx = row_partitioner->GetRows(nidx); auto d_ridx = row_partitioner->GetRows(nidx);
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id), BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
*quantiser); d_node_hist, *quantiser);
} }
// Attempt to do subtraction trick // Attempt to do subtraction trick
@ -413,10 +413,10 @@ struct GPUHistMakerDevice {
}); });
collective::AllReduce<collective::Operation::kBitwiseOR>( collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size()); ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
collective::AllReduce<collective::Operation::kBitwiseAND>( collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
collective::Synchronize(ctx_->gpu_id); collective::Synchronize(ctx_->Ordinal());
row_partitioner->UpdatePositionBatch( row_partitioner->UpdatePositionBatch(
nidx, left_nidx, right_nidx, split_data, nidx, left_nidx, right_nidx, split_data,
@ -454,7 +454,7 @@ struct GPUHistMakerDevice {
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
} }
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
if (info_.IsColumnSplit()) { if (info_.IsColumnSplit()) {
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
@ -524,9 +524,9 @@ struct GPUHistMakerDevice {
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories, common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments, common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
HostDeviceVector<bst_node_t>* p_out_position) { HostDeviceVector<bst_node_t>* p_out_position) {
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
auto d_gpair = this->gpair; auto d_gpair = this->gpair;
p_out_position->SetDevice(ctx_->gpu_id); p_out_position->SetDevice(ctx_->Device());
p_out_position->Resize(row_partitioner->GetRows().size()); p_out_position->Resize(row_partitioner->GetRows().size());
auto new_position_op = [=] __device__(size_t row_id, int position) { auto new_position_op = [=] __device__(size_t row_id, int position) {
@ -613,7 +613,7 @@ struct GPUHistMakerDevice {
monitor.Start("AllReduce"); monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data(); auto d_node_hist = hist.GetNodeHistogram(nidx).data();
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT; using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist), collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
page->Cuts().TotalBins() * 2 * num_histograms); page->Cuts().TotalBins() * 2 * num_histograms);
monitor.Stop("AllReduce"); monitor.Stop("AllReduce");
@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater {
} }
void InitDataOnce(TrainParam const* param, DMatrix* dmat) { void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
info_ = &dmat->Info(); info_ = &dmat->Info();
// Synchronise the column sampling seed // Synchronise the column sampling seed
@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater {
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed); this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
info_->feature_types.SetDevice(ctx_->gpu_id); info_->feature_types.SetDevice(ctx_->Device());
maker = std::make_unique<GPUHistMakerDevice>( maker = std::make_unique<GPUHistMakerDevice>(
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); *param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater {
this->InitData(param, p_fmat, p_tree); this->InitData(param, p_fmat, p_tree);
monitor_.Stop("InitData"); monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id); gpair->SetDevice(ctx_->Device());
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
} }
@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
this->InitData(p_fmat, p_tree); this->InitData(p_fmat, p_tree);
monitor_.Stop("InitData"); monitor_.Stop("InitData");
gpair->SetDevice(ctx_->gpu_id); gpair->SetDevice(ctx_->Device());
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
} }

View File

@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
if (need_copy()) { if (need_copy()) {
// allocate buffer // allocate buffer
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
h_sample_out = sample_out.HostView(); h_sample_out = sample_out.HostView();
} }

View File

@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{}; std::bitset<64> original{};
original[rank] = true; original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], 0ULL); EXPECT_EQ(buffer.HostVector()[0], 0ULL);
@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{}; std::bitset<64> original{};
original[rank] = true; original[rank] = true;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
std::bitset<64> original{~0ULL}; std::bitset<64> original{~0ULL};
original[rank] = false; original[rank] = false;
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank); HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1); collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
collective::Synchronize(rank); collective::Synchronize(rank);
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
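
The expected values in these bitwise all-reduce tests follow directly from the operator semantics: each rank contributes a 64-bit word with only its own bit set, so OR across world_size ranks sets exactly the low world_size bits, i.e. (1ULL << world_size) - 1, while AND across them clears everything. A tiny host-side simulation of just that arithmetic, with no collective library involved:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int world_size = 4;
      std::uint64_t or_acc = 0;
      std::uint64_t and_acc = ~0ULL;
      for (int rank = 0; rank < world_size; ++rank) {
        std::uint64_t contribution = 1ULL << rank;  // only this rank's bit is set
        or_acc |= contribution;
        and_acc &= contribution;
      }
      assert(or_acc == (1ULL << world_size) - 1);  // the low world_size bits are all set
      assert(and_acc == 0ULL);                     // no bit is set on every rank
      return 0;
    }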

View File

@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
p_mat->Info().Validate(-1); p_mat->Info().Validate(DeviceOrd::CPU());
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17), EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
dmlc::Error); dmlc::Error);
@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
void TestSketchFromWeights(bool with_group) { void TestSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10; size_t constexpr kGroups = 10;
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix(); auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
Context ctx; Context ctx;
common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins); common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);

View File

@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
ASSERT_EQ(info.feature_types.Size(), n_features); ASSERT_EQ(info.feature_types.Size(), n_features);
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
cuts_ptr.SetDevice(0); cuts_ptr.SetDevice(DeviceOrd::CUDA(0));
dh::device_vector<float> weight(n_samples * n_features, 0); dh::device_vector<float> weight(n_samples * n_features, 0);
dh::Iota(dh::ToSpan(weight)); dh::Iota(dh::ToSpan(weight));
@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
detail::EntryCompareOp()); detail::EntryCompareOp());
detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
&weight, &columns_ptr); &weight, &columns_ptr);
auto const& h_cptr = cuts_ptr.ConstHostVector(); auto const& h_cptr = cuts_ptr.ConstHostVector();
@ -363,7 +363,8 @@ template <typename Adapter>
auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
DeviceOrd::CUDA(0));
MetaInfo info; MetaInfo info;
AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}}); ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
HistogramCuts cuts; HistogramCuts cuts;
@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
ConsoleLogger::Configure({{"verbosity", "3"}}); ConsoleLogger::Configure({{"verbosity", "3"}});
common::HistogramCuts batched_cuts; common::HistogramCuts batched_cuts;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
} }
ASSERT_EQ(info.feature_types.Size(), 1); ASSERT_EQ(info.feature_types.Size(), 1);
SketchContainer container(info.feature_types, num_bins, 1, n, 0); SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), num_bins, info, AdapterDeviceSketch(adapter.Value(), num_bins, info,
std::numeric_limits<float>::quiet_NaN(), &container); std::numeric_limits<float>::quiet_NaN(), &container);
HistogramCuts cuts; HistogramCuts cuts;
@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
namespace { namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
auto n = n_samples * n_features; auto n = n_samples * n_features;
std::vector<float> x; std::vector<float> x;
x.resize(n); x.resize(n);
@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) {
std::vector<std::size_t> h_column_size_1(column_sizes_scan.size()); std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>( detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
ASSERT_EQ(h_column_size, h_column_size_1); ASSERT_EQ(h_column_size, h_column_size_1);
} }
@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kRows = 300, kCols = 20, kBins = 256;
size_t constexpr kGroups = 10; size_t constexpr kGroups = 10;
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string m = std::string m = RandomDataGenerator{kRows, kCols, 0}
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( .Device(DeviceOrd::CUDA(0))
&storage); .GenerateArrayInterface(&storage);
MetaInfo info; MetaInfo info;
Context ctx; Context ctx;
auto& h_weights = info.weights_.HostVector(); auto& h_weights = info.weights_.HostVector();
@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
} }
info.weights_.SetDevice(0); info.weights_.SetDevice(DeviceOrd::CUDA(0));
info.num_row_ = kRows; info.num_row_ = kRows;
info.num_col_ = kCols; info.num_col_ = kCols;
data::CupyAdapter adapter(m); data::CupyAdapter adapter(m);
auto const& batch = adapter.Value(); auto const& batch = adapter.Value();
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
// https://github.com/dmlc/xgboost/issues/7946 // https://github.com/dmlc/xgboost/issues/7946
h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups); h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
} }
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(), AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
&sketch_container); &sketch_container);
sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); sketch_container.MakeCuts(&weighted, info.IsColumnSplit());

View File

@ -1,7 +1,6 @@
/*! /**
* Copyright 2018 XGBoost contributors * Copyright 2018-2023 XGBoost contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <thrust/equal.h> #include <thrust/equal.h>
#include <thrust/iterator/counting_iterator.h> #include <thrust/iterator/counting_iterator.h>
@ -9,14 +8,13 @@
#include "../../../src/common/device_helpers.cuh" #include "../../../src/common/device_helpers.cuh"
#include <xgboost/host_device_vector.h> #include <xgboost/host_device_vector.h>
namespace xgboost { namespace xgboost::common {
namespace common {
namespace { namespace {
void SetDeviceForTest(int device) { void SetDeviceForTest(DeviceOrd device) {
int n_devices; int n_devices;
dh::safe_cuda(cudaGetDeviceCount(&n_devices)); dh::safe_cuda(cudaGetDeviceCount(&n_devices));
device %= n_devices; device.ordinal %= n_devices;
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
} }
} // namespace } // namespace
@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler {
} }
}; };
void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) { void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
// create the vector // create the vector
v->SetDevice(device); v->SetDevice(device);
v->Resize(n); v->Resize(n);
ASSERT_EQ(v->Size(), n); ASSERT_EQ(v->Size(), n);
ASSERT_EQ(v->DeviceIdx(), device); ASSERT_EQ(v->Device(), device);
// ensure that the device have read-write access // ensure that the device have read-write access
ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanRead());
ASSERT_TRUE(v->DeviceCanWrite()); ASSERT_TRUE(v->DeviceCanWrite());
@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
} }
void PlusOne(HostDeviceVector<int> *v) { void PlusOne(HostDeviceVector<int> *v) {
int device = v->DeviceIdx(); auto device = v->Device();
SetDeviceForTest(device); SetDeviceForTest(device);
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
[=]__device__(unsigned int a){ return a + 1; }); [=]__device__(unsigned int a){ return a + 1; });
@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector<int>* v,
unsigned int first, unsigned int first,
GPUAccess access) { GPUAccess access) {
ASSERT_EQ(v->Size(), size); ASSERT_EQ(v->Size(), size);
SetDeviceForTest(v->DeviceIdx()); SetDeviceForTest(v->Device());
ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
thrust::make_counting_iterator(first))); thrust::make_counting_iterator(first)));
@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
ASSERT_FALSE(v->DeviceCanWrite()); ASSERT_FALSE(v->DeviceCanWrite());
} }
void TestHostDeviceVector(size_t n, int device) { void TestHostDeviceVector(size_t n, DeviceOrd device) {
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
InitHostDeviceVector(n, device, &v); InitHostDeviceVector(n, device, &v);
@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) {
TEST(HostDeviceVector, Basic) { TEST(HostDeviceVector, Basic) {
size_t n = 1001; size_t n = 1001;
int device = 0; DeviceOrd device = DeviceOrd::CUDA(0);
TestHostDeviceVector(n, device); TestHostDeviceVector(n, device);
} }
TEST(HostDeviceVector, Copy) { TEST(HostDeviceVector, Copy) {
size_t n = 1001; size_t n = 1001;
int device = 0; auto device = DeviceOrd::CUDA(0);
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
HostDeviceVector<int> v; HostDeviceVector<int> v;
@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) {
h_vec[i] = i; h_vec[i] = i;
} }
HostDeviceVector<int> vec (h_vec); HostDeviceVector<int> vec (h_vec);
auto device = 0; auto device = DeviceOrd::CUDA(0);
vec.SetDevice(device); vec.SetDevice(device);
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
auto span = vec.DeviceSpan(); // sync to device auto span = vec.DeviceSpan(); // sync to device
vec.SetDevice(-1); // pull back to cpu. vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu.
ASSERT_EQ(vec.Size(), h_vec.size()); ASSERT_EQ(vec.Size(), h_vec.size());
ASSERT_EQ(vec.DeviceIdx(), -1); ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
auto h_vec_1 = vec.HostVector(); auto h_vec_1 = vec.HostVector();
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) {
TEST(HostDeviceVector, Span) { TEST(HostDeviceVector, Span) {
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f}; HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
vec.SetDevice(0); vec.SetDevice(DeviceOrd::CUDA(0));
auto span = vec.DeviceSpan(); auto span = vec.DeviceSpan();
ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.Size(), span.size());
ASSERT_EQ(vec.DevicePointer(), span.data()); ASSERT_EQ(vec.DevicePointer(), span.data());
@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) {
ASSERT_FALSE(another.Empty()); ASSERT_FALSE(another.Empty());
ASSERT_TRUE(vec.Empty()); ASSERT_TRUE(vec.Empty());
} }
} // namespace common } // namespace xgboost::common
} // namespace xgboost
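As the hunks above show, HostDeviceVector now reports placement through Device(), which returns a DeviceOrd, replacing the integer DeviceIdx(). A minimal sketch of the new accessor usage, restricted to calls that appear in this diff:

#include <xgboost/context.h>
#include <xgboost/host_device_vector.h>

void DeviceRoundTrip() {
  xgboost::HostDeviceVector<int> v;
  v.SetDevice(xgboost::DeviceOrd::CUDA(0));  // place the data on the first CUDA device
  v.Resize(16);
  bool on_gpu = v.Device().IsCUDA();         // Device() replaces the old DeviceIdx()
  v.SetDevice(xgboost::DeviceOrd::CPU());    // pull the vector back to the host
  (void)on_gpu;
}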

View File

@ -12,7 +12,7 @@ namespace xgboost::linalg {
namespace { namespace {
void TestElementWiseKernel() { void TestElementWiseKernel() {
auto device = DeviceOrd::CUDA(0); auto device = DeviceOrd::CUDA(0);
Tensor<float, 3> l{{2, 3, 4}, 0}; Tensor<float, 3> l{{2, 3, 4}, device};
{ {
/** /**
* Non-contiguous * Non-contiguous

View File

@ -9,9 +9,7 @@
#include "../../../src/data/adapter.h" #include "../../../src/data/adapter.h"
#include "xgboost/context.h" #include "xgboost/context.h"
namespace xgboost { namespace xgboost::common {
namespace common {
TEST(Quantile, LoadBalance) { TEST(Quantile, LoadBalance) {
size_t constexpr kRows = 1000, kCols = 100; size_t constexpr kRows = 1000, kCols = 100;
auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
} }
auto m = RandomDataGenerator{kRows, kCols, 0} auto m = RandomDataGenerator{kRows, kCols, 0}
.Device(Context::kCpuId) .Device(DeviceOrd::CPU())
.Type(ft) .Type(ft)
.MaxCategory(17) .MaxCategory(17)
.Seed(rank + seed) .Seed(rank + seed)
@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
auto constexpr kWorkers = 4; auto constexpr kWorkers = 4;
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers); RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
} }
} // namespace xgboost::common
} // namespace common
} // namespace xgboost

View File

@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
TEST(GPUQuantile, Basic) { TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256; constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, kBins, kCols, kRows, 0); SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
dh::caching_device_vector<Entry> entries; dh::caching_device_vector<Entry> entries;
dh::device_vector<bst_row_t> cuts_ptr(kCols+1); dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
.Seed(seed) .Seed(seed)
.Device(0) .Device(FstCU())
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) {
thrust::make_counting_iterator(0llu), thrust::make_counting_iterator(0llu),
[=] __device__(size_t idx) { return batch.GetElement(idx); }); [=] __device__(size_t idx) { return batch.GetElement(idx); });
auto end = kCols * kRows; auto end = kCols * kRows;
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
&cut_sizes_scan, &column_sizes_scan); &cut_sizes_scan, &column_sizes_scan);
auto const& cut_sizes = cut_sizes_scan.HostVector(); auto const& cut_sizes = cut_sizes_scan.HostVector();
ASSERT_LE(sketch.Data().size(), cut_sizes.back()); ASSERT_LE(sketch.Data().size(), cut_sizes.back());
@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) {
} }
// if with_error is true, the test tolerates floating point error // if with_error is true, the test tolerates floating point error
void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in, void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
Span<bst_row_t const> d_columns_ptr, bool with_error = false) { Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device.ordinal));
std::vector<SketchEntry> h_in(in.size()); std::vector<SketchEntry> h_in(in.size());
dh::CopyDeviceSpanToVector(&h_in, in); dh::CopyDeviceSpanToVector(&h_in, in);
std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size()); std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0} std::string interface_str =
.Device(0) RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage);
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch); std::numeric_limits<float>::quiet_NaN(), &sketch);
@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) {
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
sketch.Data().data() + sketch.Data().size(), sketch.Data().data() + sketch.Data().size(),
detail::SketchUnique{})); detail::SketchUnique{}));
TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
}); });
} }
@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
size_t n_bins = 10; size_t n_bins = 10;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = std::string interface_str_0 =
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
&storage_0); &storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
MetaInfo info; MetaInfo info;
@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
.Device(0) .Device(FstCU())
.Seed(seed) .Seed(seed)
.GenerateArrayInterface(&storage_0); .GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info, AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), &sketch_0); std::numeric_limits<float>::quiet_NaN(), &sketch_0);
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0); SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
HostDeviceVector<float> storage_1; HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} std::string interface_str_1 =
.Device(0) RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_1);
.GenerateArrayInterface(&storage_1);
data::CupyAdapter adapter_1(interface_str_1); data::CupyAdapter adapter_1(interface_str_1);
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
std::numeric_limits<float>::quiet_NaN(), &sketch_1); &sketch_1);
size_t size_before_merge = sketch_0.Data().size(); size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
if (info.weights_.Size() != 0) { if (info.weights_.Size() != 0) {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
sketch_0.FixError(); sketch_0.FixError();
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
} else { } else {
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
} }
auto columns_ptr = sketch_0.ColumnsPtr(); auto columns_ptr = sketch_0.ColumnsPtr();
@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
MetaInfo info; MetaInfo info;
int32_t seed = 0; int32_t seed = 0;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, cols, rows, 0); SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
HostDeviceVector<float> storage_0; HostDeviceVector<float> storage_0;
std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} std::string interface_str_0 =
.Device(0) RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_0);
.GenerateArrayInterface(&storage_0);
data::CupyAdapter adapter_0(interface_str_0); data::CupyAdapter adapter_0(interface_str_0);
AdapterDeviceSketch(adapter_0.Value(), n_bins, info, AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&sketch_0); &sketch_0);
size_t f_rows = rows * frac; size_t f_rows = rows * frac;
SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0); SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
HostDeviceVector<float> storage_1; HostDeviceVector<float> storage_1;
std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} std::string interface_str_1 =
.Device(0) RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
.Seed(seed) &storage_1);
.GenerateArrayInterface(&storage_1);
auto data_1 = storage_1.DeviceSpan(); auto data_1 = storage_1.DeviceSpan();
auto tuple_it = thrust::make_tuple( auto tuple_it = thrust::make_tuple(
thrust::make_counting_iterator<size_t>(0ul), data_1.data()); thrust::make_counting_iterator<size_t>(0ul), data_1.data());
@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
size_t size_before_merge = sketch_0.Data().size(); size_t size_before_merge = sketch_0.Data().size();
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
auto columns_ptr = sketch_0.ColumnsPtr(); auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size()); std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) {
TEST(GPUQuantile, MultiMerge) { TEST(GPUQuantile, MultiMerge) {
constexpr size_t kRows = 20, kCols = 1; constexpr size_t kRows = 20, kCols = 1;
int32_t world = 2; int32_t world = 2;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
MetaInfo const &info) {
// Set up single node version // Set up single node version
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());
size_t intermediate_num_cuts = std::min( size_t intermediate_num_cuts = std::min(
kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor)); kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) {
for (auto rank = 0; rank < world; ++rank) { for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0} std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0) .Device(FstCU())
.Seed(rank + seed) .Seed(rank + seed)
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str); data::CupyAdapter adapter(interface_str);
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
containers.emplace_back(ft, n_bins, kCols, kRows, 0); containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
AdapterDeviceSketch(adapter.Value(), n_bins, info, AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(), std::numeric_limits<float>::quiet_NaN(),
&containers.back()); &containers.back());
@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) {
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError(); sketch_on_single_node.FixError();
} }
TestQuantileElemRank(0, sketch_on_single_node.Data(), TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.Unique(); sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(), TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());
}); });
} }
@ -351,7 +344,7 @@ void TestAllReduceBasic() {
auto const world = collective::GetWorldSize(); auto const world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100; constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
auto const device = GPUIDX; auto const device = DeviceOrd::CUDA(GPUIDX);
// Set up single node version; // Set up single node version;
HostDeviceVector<FeatureType> ft({}, device); HostDeviceVector<FeatureType> ft({}, device);
@ -483,7 +476,7 @@ void TestSameOnAllWorkers() {
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) { MetaInfo const &info) {
auto const rank = collective::GetRank(); auto const rank = collective::GetRank();
auto const device = GPUIDX; auto const device = DeviceOrd::CUDA(GPUIDX);
HostDeviceVector<FeatureType> ft({}, device); HostDeviceVector<FeatureType> ft({}, device);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
HostDeviceVector<float> storage({}, device); HostDeviceVector<float> storage({}, device);
@ -514,9 +507,9 @@ void TestSameOnAllWorkers() {
thrust::copy(thrust::device, local_data.data(), thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(), local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank); all_workers.begin() + local_data.size() * rank);
collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(), collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
all_workers.size()); all_workers.size());
collective::Synchronize(device); collective::Synchronize(device.ordinal);
auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size()); std::vector<float> h_base_line(base_line.size());
@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) {
columns_ptr[1] = kRows; columns_ptr[1] = kRows;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});
auto sketch_data = sketch.Data(); auto sketch_data = sketch.Data();
@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) {
int32_t n_bins = 16; int32_t n_bins = 16;
HostDeviceVector<FeatureType> ft; HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0); SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
dh::device_vector<Entry> d_entries {entries}; dh::device_vector<Entry> d_entries {entries};
dh::device_vector<size_t> columns_ptr(kCols + 1, 0); dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
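FstCU() in the hunks above is a shared test helper; judging from its call sites it names the first CUDA device, roughly equivalent to the sketch below (the real definition lives in the test helpers and is not part of this diff):

#include <xgboost/context.h>

inline xgboost::DeviceOrd FstCU() { return xgboost::DeviceOrd::CUDA(0); }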

View File

@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0); HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector(); auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f); std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto rank_idx = auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
// empty label // empty label
ASSERT_THROW(fail(), dmlc::Error); ASSERT_THROW(fail(), dmlc::Error);
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
// invalid label // invalid label
ASSERT_THROW(fail(), dmlc::Error); ASSERT_THROW(fail(), dmlc::Error);
auto h_labels = info.labels.HostView(); auto h_labels = info.labels.HostView();

View File

@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() {
auto d_scores = dh::ToSpan(scores); auto d_scores = dh::ToSpan(scores);
common::SegmentedSequence(&ctx, d_group_ptr, d_scores); common::SegmentedSequence(&ctx, d_group_ptr, d_scores);
linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id); linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());
ltr::LambdaRankParam p; ltr::LambdaRankParam p;
p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0); HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector(); auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f); std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id); predt.SetDevice(ctx->Device());
auto rank_idx = auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());

View File

@ -9,12 +9,11 @@
#include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../helpers.h" #include "../helpers.h"
namespace xgboost { namespace xgboost::common {
namespace common {
TEST(Stats, Quantile) { TEST(Stats, Quantile) {
Context ctx; Context ctx;
{ {
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
std::vector<size_t> index{0, 2, 3, 4, 6}; std::vector<size_t> index{0, 2, 3, 4, 6};
auto h_arr = arr.HostView(); auto h_arr = arr.HostView();
auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@ -40,8 +39,8 @@ TEST(Stats, Quantile) {
TEST(Stats, WeightedQuantile) { TEST(Stats, WeightedQuantile) {
Context ctx; Context ctx;
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());
auto h_arr = arr.HostView(); auto h_arr = arr.HostView();
auto h_weight = weight.HostView(); auto h_weight = weight.HostView();
@ -64,7 +63,7 @@ TEST(Stats, Median) {
Context ctx; Context ctx;
{ {
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId}; linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
HostDeviceVector<float> weights; HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out; linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out); Median(&ctx, values, weights, &out);
@ -83,7 +82,7 @@ TEST(Stats, Median) {
{ {
ctx = ctx.MakeCPU(); ctx = ctx.MakeCPU();
// 4x2 matrix // 4x2 matrix
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
HostDeviceVector<float> weights; HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out; linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out); Median(&ctx, values, weights, &out);
@ -102,14 +101,14 @@ TEST(Stats, Median) {
namespace { namespace {
void TestMean(Context const* ctx) { void TestMean(Context const* ctx) {
std::size_t n{128}; std::size_t n{128};
linalg::Vector<float> data({n}, ctx->gpu_id); linalg::Vector<float> data({n}, ctx->Device());
auto h_v = data.HostView().Values(); auto h_v = data.HostView().Values();
std::iota(h_v.begin(), h_v.end(), .0f); std::iota(h_v.begin(), h_v.end(), .0f);
auto nf = static_cast<float>(n); auto nf = static_cast<float>(n);
float mean = nf * (nf - 1) / 2 / n; float mean = nf * (nf - 1) / 2 / n;
linalg::Vector<float> res{{1}, ctx->gpu_id}; linalg::Vector<float> res{{1}, ctx->Device()};
Mean(ctx, data, &res); Mean(ctx, data, &res);
auto h_res = res.HostView(); auto h_res = res.HostView();
ASSERT_EQ(h_res.Size(), 1); ASSERT_EQ(h_res.Size(), 1);
@ -128,5 +127,4 @@ TEST(Stats, GPUMean) {
TestMean(&ctx); TestMean(&ctx);
} }
#endif // defined(XGBOOST_USE_CUDA) #endif // defined(XGBOOST_USE_CUDA)
} // namespace common } // namespace xgboost::common
} // namespace xgboost
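The stats tests above show the companion change for linalg containers: the trailing device argument is now a DeviceOrd, typically DeviceOrd::CPU() or ctx->Device(), instead of Context::kCpuId or a gpu_id. A hedged sketch of that construction pattern (the xgboost/linalg.h header path is an assumption):

#include <xgboost/context.h>
#include <xgboost/linalg.h>

void MakeTensors(xgboost::Context const* ctx) {
  // Placed on whichever device the runtime context selects.
  xgboost::linalg::Vector<float> on_ctx({4}, ctx->Device());
  // Explicit CPU placement replaces the old Context::kCpuId sentinel.
  xgboost::linalg::Tensor<float, 1> on_cpu({1.f, 2.f, 3.f}, {3}, xgboost::DeviceOrd::CPU());
  (void)on_ctx;
  (void)on_cpu;
}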

View File

@ -20,8 +20,8 @@ namespace common {
namespace { namespace {
class StatsGPU : public ::testing::Test { class StatsGPU : public ::testing::Test {
private: private:
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0}; linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0}; linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
HostDeviceVector<float> results_; HostDeviceVector<float> results_;
using TestSet = std::vector<std::pair<float, float>>; using TestSet = std::vector<std::pair<float, float>>;
Context ctx_; Context ctx_;
@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0}; linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>( auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment // one alpha for each segment
HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f}; HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
alphas.SetDevice(0); alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan(); auto d_alphas = alphas.ConstDeviceSpan();
auto w_it = thrust::make_constant_iterator(0.1f); auto w_it = thrust::make_constant_iterator(0.1f);
SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test {
auto val_it = auto val_it =
dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul), dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
linalg::Tensor<float, 1> weights{{10}, 0}; linalg::Tensor<float, 1> weights{{10}, FstCU()};
linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
[=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
auto w_it = weights.Data()->ConstDevicePointer(); auto w_it = weights.Data()->ConstDevicePointer();
@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0}; linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto d_arr = arr.View(DeviceOrd::CUDA(0));
auto key_it = dh::MakeTransformIterator<std::size_t>( auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test {
// one alpha for each segment // one alpha for each segment
HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f}; HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
alphas.SetDevice(0); alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan(); auto d_alphas = alphas.ConstDeviceSpan();
SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
val_it + d_arr.Size(), &results_); val_it + d_arr.Size(), &results_);

View File

@ -11,63 +11,59 @@
#include "../../../src/common/transform.h" #include "../../../src/common/transform.h"
#include "../helpers.h" #include "../helpers.h"
namespace xgboost::common {
namespace {
constexpr DeviceOrd TransformDevice() {
#if defined(__CUDACC__) #if defined(__CUDACC__)
return DeviceOrd::CUDA(0);
#define TRANSFORM_GPU 0
#else #else
return DeviceOrd::CPU();
#define TRANSFORM_GPU -1
#endif #endif
}
namespace xgboost { } // namespace
namespace common {
template <typename T> template <typename T>
struct TestTransformRange { struct TestTransformRange {
void XGBOOST_DEVICE operator()(size_t _idx, void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
Span<bst_float> _out, Span<const bst_float> _in) {
_out[_idx] = _in[_idx]; _out[_idx] = _in[_idx];
} }
}; };
TEST(Transform, DeclareUnifiedTest(Basic)) { TEST(Transform, DeclareUnifiedTest(Basic)) {
const size_t size {256}; const size_t size{256};
std::vector<bst_float> h_in(size); std::vector<float> h_in(size);
std::vector<bst_float> h_out(size); std::vector<float> h_out(size);
std::iota(h_in.begin(), h_in.end(), 0); std::iota(h_in.begin(), h_in.end(), 0);
std::vector<bst_float> h_sol(size); std::vector<float> h_sol(size);
std::iota(h_sol.begin(), h_sol.end(), 0); std::iota(h_sol.begin(), h_sol.end(), 0);
const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU}; auto device = TransformDevice();
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU}; HostDeviceVector<float> const in_vec{h_in, device};
HostDeviceVector<float> out_vec{h_out, device};
out_vec.Fill(0); out_vec.Fill(0);
Transform<>::Init(TestTransformRange<bst_float>{}, Transform<>::Init(TestTransformRange<float>{},
Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(), Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
TRANSFORM_GPU) TransformDevice())
.Eval(&out_vec, &in_vec); .Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector(); std::vector<float> res = out_vec.HostVector();
ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
} }
#if !defined(__CUDACC__) #if !defined(__CUDACC__)
TEST(TransformDeathTest, Exception) { TEST(TransformDeathTest, Exception) {
size_t const kSize {16}; size_t const kSize{16};
std::vector<bst_float> h_in(kSize); std::vector<float> h_in(kSize);
const HostDeviceVector<bst_float> in_vec{h_in, -1}; const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
EXPECT_DEATH( EXPECT_DEATH(
{ {
Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; }, Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(), Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-1) DeviceOrd::CPU())
.Eval(&in_vec); .Eval(&in_vec);
}, },
""); "");
} }
#endif #endif
} // namespace xgboost::common
} // namespace common
} // namespace xgboost

View File

@ -0,0 +1,5 @@
/**
* Copyright 2023 XGBoost contributors
*/
// Dummy file to keep the CUDA tests.
#include "test_transform_range.cc"

View File

@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) {
for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto str_arr = RandomDataGenerator{8192, n_features, 0.0} auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
.Device(ctx.gpu_id) .Device(ctx.Device())
.GenerateArrayInterface(&storage); .GenerateArrayInterface(&storage);
auto adapter = CupyAdapter{str_arr}; auto adapter = CupyAdapter{str_arr};
HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0); HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
offset.SetDevice(ctx.gpu_id); offset.SetDevice(ctx.Device());
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
std::numeric_limits<float>::quiet_NaN()); std::numeric_limits<float>::quiet_NaN());
ASSERT_EQ(rstride, n_features); ASSERT_EQ(rstride, n_features);
} }

View File

@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)}; Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p); auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(0); auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
ASSERT_EQ(kCats, accessor.NumBins()); ASSERT_EQ(kCats, accessor.NumBins());
auto x_copy = x; auto x_copy = x;
@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page. // Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows);
kRows);
// Copy batch pages into the result page. // Copy batch pages into the result page.
size_t offset = 0; size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
size_t num_elements = result.Copy(0, batch.Impl(), offset); size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
offset += num_elements; offset += num_elements;
} }
@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row); EXPECT_EQ(impl->base_rowid, current_row);
for (size_t i = 0; i < impl->Size(); i++) { for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
EXPECT_EQ(row, row_result); EXPECT_EQ(row, row_result);
@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl(); auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
// Create an empty result page. // Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows);
kCompactedRows);
// Compact batch pages into the result page. // Compact batch pages into the result page.
std::vector<size_t> row_indexes_h { std::vector<size_t> row_indexes_h {
@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) {
thrust::device_vector<size_t> row_indexes_d = row_indexes_h; thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows); common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
result.Compact(0, batch.Impl(), row_indexes_span); result.Compact(FstCU(), batch.Impl(), row_indexes_span);
} }
size_t current_row = 0; size_t current_row = 0;
@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) {
continue; continue;
} }
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
current_row, row_d.data().get())); current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize()); dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, dh::LaunchN(kCols,
ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
row_result_d.data().get())); row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

View File

@ -30,7 +30,7 @@ namespace xgboost::data {
TEST(GradientIndex, ExternalMemoryBaseRowID) { TEST(GradientIndex, ExternalMemoryBaseRowID) {
Context ctx; Context ctx;
auto p_fmat = RandomDataGenerator{4096, 256, 0.5} auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
.Device(ctx.gpu_id) .Device(ctx.Device())
.Batches(8) .Batches(8)
.GenerateSparsePageDMatrix("cache", true); .GenerateSparsePageDMatrix("cache", true);

View File

@ -11,9 +11,7 @@
#include "../helpers.h" #include "../helpers.h"
#include "test_iterative_dmatrix.h" #include "test_iterative_dmatrix.h"
namespace xgboost { namespace xgboost::data {
namespace data {
void TestEquivalent(float sparsity) { void TestEquivalent(float sparsity) {
Context ctx{MakeCUDACtx(0)}; Context ctx{MakeCUDACtx(0)};
@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) {
std::size_t offset = 0; std::size_t offset = 0;
auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
std::unique_ptr<EllpackPageImpl> page_concatenated { std::unique_ptr<EllpackPageImpl> page_concatenated {
new EllpackPageImpl(0, first->Cuts(), first->is_dense, new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense,
first->row_stride, 1000 * 100)}; first->row_stride, 1000 * 100)};
for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) { for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
auto page = batch.Impl(); auto page = batch.Impl();
size_t num_elements = page_concatenated->Copy(0, page, offset); size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
offset += num_elements; offset += num_elements;
} }
auto from_iter = page_concatenated->GetDeviceAccessor(0); auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols());
ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows());
@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)}; DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) { for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
auto from_data = ellpack.Impl()->GetDeviceAccessor(0); auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size()); std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
auto impl = ellpack.Impl(); auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator( common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
// null values get placed after valid values in a row // null values get placed after valid values in a row
EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue()); EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_col_, cols);
EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_row_, rows);
EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) {
TestRefDMatrix<EllpackPage, CudaArrayIterForTest>( TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
&ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); }); &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost

View File

@ -12,6 +12,7 @@
#include "../helpers.h" #include "../helpers.h"
#include "xgboost/base.h" #include "xgboost/base.h"
namespace xgboost {
TEST(MetaInfo, GetSet) { TEST(MetaInfo, GetSet) {
xgboost::Context ctx; xgboost::Context ctx;
xgboost::MetaInfo info; xgboost::MetaInfo info;
@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) {
info.num_nonzero_ = 12; info.num_nonzero_ = 12;
info.num_col_ = 3; info.num_col_ = 3;
std::vector<xgboost::bst_group_t> groups (11); std::vector<xgboost::bst_group_t> groups (11);
xgboost::Context ctx; Context ctx;
info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
EXPECT_THROW(info.Validate(0), dmlc::Error); EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);
std::vector<float> labels(info.num_row_ + 1); std::vector<float> labels(info.num_row_ + 1);
EXPECT_THROW( EXPECT_THROW(
@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) {
info.group_ptr_.clear(); info.group_ptr_.clear();
labels.resize(info.num_row_); labels.resize(info.num_row_);
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
info.labels.SetDevice(0); info.labels.SetDevice(FstCU());
EXPECT_THROW(info.Validate(1), dmlc::Error); EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);
xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups}; xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
d_groups.SetDevice(0); d_groups.SetDevice(FstCU());
d_groups.DevicePointer(); // pull to device d_groups.DevicePointer(); // pull to device
std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec(
d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))};
@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) {
} }
} }
namespace xgboost {
TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); }
} // namespace xgboost } // namespace xgboost

View File

@ -1,31 +1,27 @@
/*! /**
* Copyright 2021 XGBoost contributors * Copyright 2021-2023, XGBoost contributors
*/ */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/data/adapter.h"
namespace xgboost { #include "../../../src/data/adapter.h"
namespace data { #include "../../../src/data/proxy_dmatrix.h"
#include "../helpers.h"
namespace xgboost::data {
TEST(ProxyDMatrix, HostData) { TEST(ProxyDMatrix, HostData) {
DMatrixProxy proxy; DMatrixProxy proxy;
size_t constexpr kRows = 100, kCols = 10; size_t constexpr kRows = 100, kCols = 10;
std::vector<HostDeviceVector<float>> label_storage(1); std::vector<HostDeviceVector<float>> label_storage(1);
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5) auto data =
.Device(0) RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
.GenerateArrayInterface(&storage);
proxy.SetArrayData(data.c_str()); proxy.SetArrayData(data.c_str());
auto n_samples = HostAdapterDispatch( auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); });
&proxy, [](auto const &value) { return value.Size(); });
ASSERT_EQ(n_samples, kRows); ASSERT_EQ(n_samples, kRows);
auto n_features = HostAdapterDispatch( auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); });
&proxy, [](auto const &value) { return value.NumCols(); });
ASSERT_EQ(n_features, kCols); ASSERT_EQ(n_features, kCols);
} }
} // namespace data } // namespace xgboost::data
} // namespace xgboost

View File

@ -15,10 +15,12 @@ namespace xgboost::data {
TEST(ProxyDMatrix, DeviceData) { TEST(ProxyDMatrix, DeviceData) {
constexpr size_t kRows{100}, kCols{100}; constexpr size_t kRows{100}, kCols{100};
HostDeviceVector<float> storage; HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); auto data =
RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
std::vector<HostDeviceVector<float>> label_storage(1); std::vector<HostDeviceVector<float>> label_storage(1);
auto labels = auto labels = RandomDataGenerator(kRows, 1, 0)
RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); .Device(FstCU())
.GenerateColumnarArrayInterface(&label_storage);
DMatrixProxy proxy; DMatrixProxy proxy;
proxy.SetCUDAArray(data.c_str()); proxy.SetCUDAArray(data.c_str());
@ -31,7 +33,7 @@ TEST(ProxyDMatrix, DeviceData) {
std::vector<HostDeviceVector<float>> columnar_storage(kCols); std::vector<HostDeviceVector<float>> columnar_storage(kCols);
data = RandomDataGenerator(kRows, kCols, 0) data = RandomDataGenerator(kRows, kCols, 0)
.Device(0) .Device(FstCU())
.GenerateColumnarArrayInterface(&columnar_storage); .GenerateColumnarArrayInterface(&columnar_storage);
proxy.SetCUDAArray(data.c_str()); proxy.SetCUDAArray(data.c_str());
ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>)); ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));

View File

@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) {
std::iota(upper.begin(), upper.end(), 1.0f); std::iota(upper.begin(), upper.end(), 1.0f);
auto& margin = p_m->Info().base_margin_; auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
std::array<int32_t, 3> ridxs {1, 3, 5}; std::array<int32_t, 3> ridxs {1, 3, 5};
std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) }; std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) };
@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) {
std::iota(upper.begin(), upper.end(), 1.0f); std::iota(upper.begin(), upper.end(), 1.0f);
auto& margin = p_m->Info().base_margin_; auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};
auto constexpr kSlices {2}; auto constexpr kSlices {2};
auto constexpr kSliceSize {4}; auto constexpr kSliceSize {4};

View File

@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
size_t offset = 0; size_t offset = 0;
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) { for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
if (!impl_ext) { if (!impl_ext) {
impl_ext.reset(new EllpackPageImpl( impl_ext = std::make_unique<EllpackPageImpl>(batch.Impl()->gidx_buffer.Device(),
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), batch.Impl()->Cuts(), batch.Impl()->is_dense,
batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); batch.Impl()->row_stride, kRows);
} }
auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
offset += n_elems; offset += n_elems;
} }
EXPECT_EQ(impl_ext->base_rowid, 0); EXPECT_EQ(impl_ext->base_rowid, 0);
@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
EXPECT_EQ(impl_ext->base_rowid, current_row); EXPECT_EQ(impl_ext->base_rowid, current_row);
for (size_t i = 0; i < impl_ext->Size(); i++) { for (size_t i = 0; i < impl_ext->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin()); thrust::copy(row_d.begin(), row_d.end(), row.begin());
dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get())); dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row,
row_ext_d.data().get()));
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());
EXPECT_EQ(row, row_ext); EXPECT_EQ(row, row_ext);

View File

@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) {
gbtree.Configure({{"tree_method", "hist"}}); gbtree.Configure({{"tree_method", "hist"}});
auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal()); linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows)); gpair.Data()->Copy(GenerateRandomGradients(kRows));
PredictionCacheEntry out_predictions; PredictionCacheEntry out_predictions;
@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) {
// pull data into device. // pull data into device.
data.HostVector(); data.HostVector();
data.SetDevice(0); data.SetDevice(DeviceOrd::CUDA(0));
data.DeviceSpan(); data.DeviceSpan();
ASSERT_FALSE(data.HostCanWrite()); ASSERT_FALSE(data.HostCanWrite());
@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) {
} }
learner->Configure(); learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) { for (std::int32_t i = 0; i < 3; ++i) {
linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, Context::kCpuId}; linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()};
gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_));
learner->BoostOneIter(0, Xy, &gpair); learner->BoostOneIter(0, Xy, &gpair);
} }
@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam<char const*> {
if (device == "GPU") { if (device == "GPU") {
ctx = MakeCUDACtx(0); ctx = MakeCUDACtx(0);
} }
auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device());
auto array_str = rng.GenerateArrayInterface(&data); auto array_str = rng.GenerateArrayInterface(&data);
auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);
@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) {
auto test_qdm_err = [&](std::string booster, Context const* ctx) { auto test_qdm_err = [&](std::string booster, Context const* ctx) {
std::shared_ptr<DMatrix> p_fmat; std::shared_ptr<DMatrix> p_fmat;
bst_bin_t max_bins = 16; bst_bin_t max_bins = 16;
auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins);
if (ctx->IsCPU()) { if (ctx->IsCPU()) {
p_fmat = rng.GenerateQuantileDMatrix(true); p_fmat = rng.GenerateQuantileDMatrix(true);
} else { } else {

View File

@@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) {
bst_feature_t n_features{32};
HostDeviceVector<float> X_storage;
// use a different device than the learner
-   std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
+   auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
auto X = RandomDataGenerator{n_samples, n_features, 0.0}
.Device(data_ordinal)
.GenerateArrayInterface(&X_storage);
@@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) {
auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);
std::shared_ptr<DMatrix> Xy;
-   if (data_ordinal == Context::kCpuId) {
+   if (data_ordinal.IsCPU()) {
auto X_adapter = data::ArrayAdapter{StringView{X}};
Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
} else {
@@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) {
std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
-   if (data_ordinal == Context::kCpuId) {
+   if (data_ordinal.IsCPU()) {
proxy->SetArrayData(StringView{X});
} else {
proxy->SetCUDAArray(X.c_str());
@@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) {
// test when the contexts match
Context new_ctx = *proxy->Ctx();
-   ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
+   ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal());
learner->SetParam("device", new_ctx.DeviceName());
HostDeviceVector<float>* out_predt_1{nullptr};
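
The hunks above all make the same substitution: comparisons against raw integer ordinals, where -1 (Context::kCpuId) stood for the CPU, become queries on a typed DeviceOrd value. A minimal sketch of the pattern follows; the header path is an assumption, and the snippet is illustrative only, not part of the diff:

    #include <xgboost/context.h>  // assumed location of Context and DeviceOrd

    // Sketch: pick a data device different from the learner's and branch on it.
    void DevicePickSketch(xgboost::Context const* ctx) {
      using xgboost::DeviceOrd;
      // Old style, for contrast:
      //   std::int32_t ord = ctx->IsCPU() ? 0 : -1;
      //   if (ord == xgboost::Context::kCpuId) { ... }
      auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
      if (data_ordinal.IsCPU()) {
        // host path: build the DMatrix from an ArrayAdapter over the array interface
      } else {
        // device path: hand the CUDA array interface to the DMatrixProxy
      }
    }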


@@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
-   info.labels = xgboost::linalg::Tensor<float, 2>{
-       labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+   info.labels = xgboost::linalg::Tensor<float, 2>{labels.cbegin(),
+                                                   labels.cend(),
+                                                   {labels.size(), static_cast<std::size_t>(1)},
+                                                   xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;
CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
@@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
-   info.labels = xgboost::linalg::Matrix<float>{
-       labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
+   info.labels = xgboost::linalg::Matrix<float>{labels.cbegin(),
+                                                labels.cend(),
+                                                {labels.size(), static_cast<std::size_t>(1)},
+                                                xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;
info.group_ptr_ = groups;
@@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
xgboost::DataSplitMode data_split_mode) {
return GetMultiMetricEval(
metric, preds,
-   xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
-   groups, data_split_mode);
+   xgboost::linalg::Tensor<float, 2>{
+       labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()},
+   weights, groups, data_split_mode);
}
double GetMultiMetricEval(xgboost::Metric* metric,
@@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const
p_fmat->Info().labels.Data());
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
-   if (device_ != Context::kCpuId) {
+   if (device_.IsCUDA()) {
p_fmat->Info().labels.SetDevice(device_);
}
}
@@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
v = dist(&lcg);
}
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
out->SetDevice(device_);
out->DeviceSpan();
}
@@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface(
std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
HostDeviceVector<float> const* storage, std::size_t n_samples, bst_feature_t n_features,
-   std::size_t batches, std::int32_t device) {
+   std::size_t batches, DeviceOrd device) {
std::vector<std::string> result(batches);
std::vector<Json> objects;
@@ -267,7 +272,7 @@ std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) {
Json array_interface{Object()};
array_interface["data"] = std::vector<Json>(2);
-   if (device >= 0) {
+   if (device.IsCUDA()) {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
array_interface["stream"] = Null{};
@@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR(
h_rptr.emplace_back(rptr);
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
value->SetDevice(device_);
value->DeviceSpan();
row_ptr->SetDevice(device_);
@@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR(
out->Info().labels.Reshape(this->rows_, this->n_targets_);
}
}
-   if (device_ >= 0) {
+   if (device_.IsCUDA()) {
out->Info().labels.SetDevice(device_);
out->Info().feature_types.SetDevice(device_);
for (auto const& page : out->GetBatches<SparsePage>()) {
@@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR(
CHECK_GE(this->n_batches_, 1)
<< "Must set the n_batches before generating an external memory DMatrix.";
std::unique_ptr<ArrayIterForTest> iter;
-   if (device_ == Context::kCpuId) {
+   if (device_.IsCPU()) {
iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
} else {
#if defined(XGBOOST_USE_CUDA)
@@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() {
NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
-   rng_->Device(Context::kCpuId);
+   rng_->Device(DeviceOrd::CPU());
std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
}
@@ -644,8 +649,8 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
labels[i] = i;
}
p_dmat->Info().labels =
-   linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
-   linalg::Matrix<GradientPair> gpair({kRows}, ctx->Ordinal());
+   linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()};
+   linalg::Matrix<GradientPair> gpair({kRows}, ctx->Device());
auto h_gpair = gpair.HostView();
for (size_t i = 0; i < kRows; ++i) {
h_gpair(i) = GradientPair{static_cast<float>(i), 1};
@@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector<float> c
CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches);
this->data_.Copy(data);
std::tie(batches_, interface_) =
-   MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id);
+   MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device());
}
ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }


@@ -9,7 +9,7 @@ namespace xgboost {
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
size_t cols, size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
-   rng_->Device(0);
+   rng_->Device(FstCU());
std::tie(batches_, interface_) =
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();


@@ -231,7 +231,7 @@ class RandomDataGenerator {
bst_target_t n_targets_{1};
-   std::int32_t device_{Context::kCpuId};
+   DeviceOrd device_{DeviceOrd::CPU()};
std::size_t n_batches_{0};
std::uint64_t seed_{0};
SimpleLCG lcg_;
@@ -256,7 +256,7 @@ class RandomDataGenerator {
upper_ = v;
return *this;
}
-   RandomDataGenerator& Device(int32_t d) {
+   RandomDataGenerator& Device(DeviceOrd d) {
device_ = d;
return *this;
}
@@ -391,7 +391,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
* \brief Make a context that uses CUDA if device >= 0.
*/
inline Context MakeCUDACtx(std::int32_t device) {
-   if (device == Context::kCpuId) {
+   if (device == DeviceOrd::CPUOrdinal()) {
return Context{};
}
return Context{}.MakeCUDA(device);
@@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
* \brief Make learner model param
*/
inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups,
-   int32_t device = Context::kCpuId) {
+   DeviceOrd device = DeviceOrd::CPU()) {
size_t shape[1]{1};
LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
n_groups, 1, MultiStrategy::kOneOutputPerTree);
@@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test {
class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};
+   inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
} // namespace xgboost
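
For orientation, a short usage sketch of the updated test helpers; the header path and the concrete values are assumptions, and the snippet is not part of the diff:

    #include "helpers.h"  // assumed path of the test-helper header shown above

    namespace xgboost {
    inline void ExampleHelperUsage() {
      Context cuda_ctx = MakeCUDACtx(0);   // ordinal >= 0 selects CUDA device 0
      Context cpu_ctx = MakeCUDACtx(-1);   // DeviceOrd::CPUOrdinal() keeps the default CPU context
      // FstCU() is shorthand for DeviceOrd::CUDA(0), as defined above.
      auto rng = RandomDataGenerator{256, 16, 0.5f}.Device(FstCU()).Bins(16);
      LearnerModelParam mparam = MakeMP(16, 0.5, 1);  // device defaults to DeviceOrd::CPU()
      (void)cuda_ctx; (void)cpu_ctx; (void)rng; (void)mparam;
    }
    }  // namespace xgboost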


@@ -1,3 +1,8 @@
+   /**
+    * Copyright 2020-2023, XGBoost contributors
+    */
+   #pragma once
#if defined(__CUDACC__)
#include "../../src/data/ellpack_page.cuh"
#endif
@@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts {
};
} // anonymous namespace
-   inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
-       int n_rows, int n_cols, bst_float sparsity= 0) {
+   inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
+                                                            bst_float sparsity = 0) {
auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@@ -49,7 +54,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
}
auto page = std::unique_ptr<EllpackPageImpl>(
-   new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {}));
+   new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));
return page;
}


@@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow)
// Invalid dataset
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
-   info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
+   info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()};
float auc = metric->Evaluate({1, 1}, p_fmat);
ASSERT_TRUE(std::isnan(auc));
*info.labels.Data() = HostDeviceVector<float>{};

Some files were not shown because too many files have changed in this diff.