Remove internal use of gpu_id. (#9568)
This commit is contained in:
parent 38ac52dd87
commit 8c676c889d

Changed file: .github/workflows/python_tests.yml (vendored) — 2 changed lines
@@ -190,7 +190,7 @@ jobs:
       run: |
         mkdir build_msvc
         cd build_msvc
-        cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON
+        cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON
         cmake --build . --config Release --parallel $(nproc)
 
     - name: Install Python package
@@ -29,31 +29,37 @@ struct DeviceSym {
  * viewing types like `linalg::TensorView`.
  */
 struct DeviceOrd {
+  // Constant representing the device ID of CPU.
+  static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; }
+  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
+
   enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
   // CUDA device ordinal.
-  bst_d_ordinal_t ordinal{-1};
+  bst_d_ordinal_t ordinal{CPUOrdinal()};
 
   [[nodiscard]] bool IsCUDA() const { return device == kCUDA; }
   [[nodiscard]] bool IsCPU() const { return device == kCPU; }
 
-  DeviceOrd() = default;
+  constexpr DeviceOrd() = default;
   constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {}
 
-  DeviceOrd(DeviceOrd const& that) = default;
-  DeviceOrd& operator=(DeviceOrd const& that) = default;
-  DeviceOrd(DeviceOrd&& that) = default;
-  DeviceOrd& operator=(DeviceOrd&& that) = default;
+  constexpr DeviceOrd(DeviceOrd const& that) = default;
+  constexpr DeviceOrd& operator=(DeviceOrd const& that) = default;
+  constexpr DeviceOrd(DeviceOrd&& that) = default;
+  constexpr DeviceOrd& operator=(DeviceOrd&& that) = default;
 
   /**
    * @brief Constructor for CPU.
    */
-  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; }
+  [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; }
   /**
    * @brief Constructor for CUDA device.
    *
   * @param ordinal CUDA device ordinal.
   */
-  [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; }
+  [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) {
+    return DeviceOrd{kCUDA, ordinal};
+  }
 
   [[nodiscard]] bool operator==(DeviceOrd const& that) const {
     return device == that.device && ordinal == that.ordinal;
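A compact way to see what the new value type buys: the following stand-alone sketch mirrors the struct introduced above (names simplified, `bst_d_ordinal_t` replaced by `std::int16_t`), so the constexpr factories, the queries, and the 32-bit size guarantee can be exercised in isolation. It is illustrative only, not the real header.

```cpp
#include <cstdint>
#include <iostream>

// Simplified, self-contained mirror of DeviceOrd for illustration only.
struct DeviceOrd {
  static constexpr std::int16_t CPUOrdinal() { return -1; }

  enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU};
  std::int16_t ordinal{CPUOrdinal()};  // CUDA device ordinal, -1 when on CPU

  constexpr DeviceOrd() = default;
  constexpr DeviceOrd(Type type, std::int16_t ord) : device{type}, ordinal{ord} {}

  [[nodiscard]] constexpr static DeviceOrd CPU() { return {kCPU, CPUOrdinal()}; }
  [[nodiscard]] constexpr static DeviceOrd CUDA(std::int16_t ord) { return {kCUDA, ord}; }

  [[nodiscard]] constexpr bool IsCUDA() const { return device == kCUDA; }
  [[nodiscard]] constexpr bool IsCPU() const { return device == kCPU; }
  [[nodiscard]] constexpr bool operator==(DeviceOrd const& that) const {
    return device == that.device && ordinal == that.ordinal;
  }
};

// Two 16-bit fields: the descriptor fits in 32 bits and is cheap to pass by value.
static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));

int main() {
  constexpr auto cpu = DeviceOrd::CPU();
  constexpr auto gpu = DeviceOrd::CUDA(1);
  static_assert(cpu.IsCPU() && gpu.IsCUDA());        // usable in constant expressions
  std::cout << (gpu == DeviceOrd::CUDA(1)) << '\n';  // prints 1
}
```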
@@ -78,25 +84,26 @@ struct DeviceOrd {
 
 static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t));
 
+std::ostream& operator<<(std::ostream& os, DeviceOrd ord);
+
 /**
  * @brief Runtime context for XGBoost. Contains information like threads and device.
  */
 struct Context : public XGBoostParameter<Context> {
  private:
+  // User interfacing parameter for device ordinal
   std::string device{DeviceSym::CPU()};  // NOLINT
-  // The device object for the current context. We are in the middle of replacing the
-  // `gpu_id` with this device field.
+  // The device ordinal set by user
   DeviceOrd device_{DeviceOrd::CPU()};
 
  public:
-  // Constant representing the device ID of CPU.
-  static bst_d_ordinal_t constexpr kCpuId = -1;
-  static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; }
   static std::int64_t constexpr kDefaultSeed = 0;
 
  public:
   Context();
 
+  void Init(Args const& kwargs);
+
   template <typename Container>
   Args UpdateAllowUnknown(Container const& kwargs) {
     auto args = XGBoostParameter<Context>::UpdateAllowUnknown(kwargs);
@@ -104,7 +111,6 @@ struct Context : public XGBoostParameter<Context> {
     return args;
   }
 
-  std::int32_t gpu_id{kCpuId};
   // The number of threads to use if OpenMP is enabled. If equals 0, use the system default.
   std::int32_t nthread{0};  // NOLINT
   // stored random seed
@@ -116,7 +122,8 @@ struct Context : public XGBoostParameter<Context> {
   bool validate_parameters{false};
 
   /**
-   * @brief Configure the parameter `gpu_id'.
+   * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is
+   *        removed.
    *
    * @param require_gpu Whether GPU is explicitly required by the user through other
    *                    configurations.
@@ -212,9 +219,7 @@ struct Context : public XGBoostParameter<Context> {
  private:
   void SetDeviceOrdinal(Args const& kwargs);
   Context& SetDevice(DeviceOrd d) {
-    this->device_ = d;
-    this->gpu_id = d.ordinal;  // this can be removed once we move away from `gpu_id`.
-    this->device = d.Name();
+    this->device = (this->device_ = d).Name();
     return *this;
   }
 
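The net effect of the `Context` hunks: the public `gpu_id` integer is gone, the private `device_` descriptor is the single source of truth, and the user-facing `device` string is kept in sync inside `SetDevice()`. A minimal sketch of how calling code migrates, assuming a caller that previously read `ctx->gpu_id` (the function name below is hypothetical):

```cpp
#include "xgboost/context.h"             // Context, DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

namespace xgboost {
// Hypothetical caller, for illustration only.
void PlaceStorage(Context const* ctx, HostDeviceVector<float>* data) {
  // Before: data->SetDevice(ctx->gpu_id); if (ctx->gpu_id >= 0) { ... }
  // After: the descriptor is passed through as-is and the intent is explicit.
  data->SetDevice(ctx->Device());
  if (ctx->Device().IsCUDA()) {
    data->ConstDevicePointer();  // pull the buffer to the selected GPU
  }
}
}  // namespace xgboost
```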
@@ -106,10 +106,10 @@ class MetaInfo {
   MetaInfo& operator=(MetaInfo&& that) = default;
   MetaInfo& operator=(MetaInfo const& that) = delete;
 
-  /*!
-   * \brief Validate all metainfo.
+  /**
+   * @brief Validate all metainfo.
    */
-  void Validate(int32_t device) const;
+  void Validate(DeviceOrd device) const;
 
   MetaInfo Slice(common::Span<int32_t const> ridxs) const;
 
@@ -88,9 +88,9 @@ class HostDeviceVector {
   static_assert(std::is_standard_layout<T>::value, "HostDeviceVector admits only POD types");
 
  public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
+  explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU());
+  HostDeviceVector(std::initializer_list<T> init, DeviceOrd device = DeviceOrd::CPU());
+  explicit HostDeviceVector(const std::vector<T>& init, DeviceOrd device = DeviceOrd::CPU());
   ~HostDeviceVector();
 
   HostDeviceVector(const HostDeviceVector<T>&) = delete;
@@ -99,17 +99,9 @@ class HostDeviceVector {
   HostDeviceVector<T>& operator=(const HostDeviceVector<T>&) = delete;
   HostDeviceVector<T>& operator=(HostDeviceVector<T>&&);
 
-  bool Empty() const { return Size() == 0; }
-  size_t Size() const;
-  int DeviceIdx() const;
-  DeviceOrd Device() const {
-    auto idx = this->DeviceIdx();
-    if (idx == DeviceOrd::CPU().ordinal) {
-      return DeviceOrd::CPU();
-    } else {
-      return DeviceOrd::CUDA(idx);
-    }
-  }
+  [[nodiscard]] bool Empty() const { return Size() == 0; }
+  [[nodiscard]] std::size_t Size() const;
+  [[nodiscard]] DeviceOrd Device() const;
   common::Span<T> DeviceSpan();
   common::Span<const T> ConstDeviceSpan() const;
   common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
@@ -135,13 +127,12 @@ class HostDeviceVector {
   const std::vector<T>& ConstHostVector() const;
   const std::vector<T>& HostVector() const { return ConstHostVector(); }
 
-  bool HostCanRead() const;
-  bool HostCanWrite() const;
-  bool DeviceCanRead() const;
-  bool DeviceCanWrite() const;
-  GPUAccess DeviceAccess() const;
+  [[nodiscard]] bool HostCanRead() const;
+  [[nodiscard]] bool HostCanWrite() const;
+  [[nodiscard]] bool DeviceCanRead() const;
+  [[nodiscard]] bool DeviceCanWrite() const;
+  [[nodiscard]] GPUAccess DeviceAccess() const;
 
-  void SetDevice(int device) const;
   void SetDevice(DeviceOrd device) const;
 
   void Resize(size_t new_size, T v = T());
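With the header hunks above, `HostDeviceVector` speaks only `DeviceOrd`: construction, `SetDevice()`, and the `Device()` query all take or return the descriptor, and the old `DeviceIdx()` / `-1` convention disappears. A small usage sketch (assumes a CUDA-enabled build and that device 0 exists):

```cpp
#include "xgboost/context.h"             // DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector

void Sketch() {
  using xgboost::DeviceOrd;
  xgboost::HostDeviceVector<float> values{{1.0f, 2.0f, 3.0f}, DeviceOrd::CPU()};

  values.SetDevice(DeviceOrd::CUDA(0));  // pick the target device by descriptor
  auto d_span = values.DeviceSpan();     // first device access migrates the data

  if (values.Device().IsCUDA()) {
    // d_span now refers to device memory; no `idx == -1` comparison anywhere.
  }
  (void)d_span;
}
```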
@@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) {
 
 template <typename T>
 auto MakeVec(HostDeviceVector<T> *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(),
-                 data->Size(), data->Device());
+  return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(),
+                 data->Device());
 }
 
 template <typename T>
 auto MakeVec(HostDeviceVector<T> const *data) {
-  return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(),
+  return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(),
                  data->Size(), data->Device());
 }
 
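The `MakeVec` helpers show the recurring pattern of this commit: decisions that used to compare an ordinal against `-1` now ask the descriptor directly. The same dispatch, written as a tiny free-standing template for illustration (any container exposing `Device()`, `HostPointer()` and `DevicePointer()` would fit):

```cpp
// Illustrative only: picks the host or device pointer based on the descriptor,
// mirroring the MakeVec change above.
template <typename Container>
auto* ActivePointer(Container* data) {
  // Before: data->DeviceIdx() == -1 ? host : device
  return data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer();
}
```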
@@ -757,13 +757,13 @@ class Tensor {
   Order order_{Order::kC};
 
   template <typename I, std::int32_t D>
-  void Initialize(I const (&shape)[D], std::int32_t device) {
+  void Initialize(I const (&shape)[D], DeviceOrd device) {
     static_assert(D <= kDim, "Invalid shape.");
     std::copy(shape, shape + D, shape_);
     for (auto i = D; i < kDim; ++i) {
       shape_[i] = 1;
     }
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
       data_.ConstDevicePointer();  // Pull to device;
     }
@@ -780,14 +780,11 @@ class Tensor {
    * See \ref TensorView for parameters of this constructor.
    */
   template <typename I, int32_t D>
-  explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device, order} {}
-  template <typename I, int32_t D>
   explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC)
-      : Tensor{common::Span<I const, D>{shape}, device.ordinal, order} {}
+      : Tensor{common::Span<I const, D>{shape}, device, order} {}
 
   template <typename I, size_t D>
-  explicit Tensor(common::Span<I const, D> shape, std::int32_t device, Order order = kC)
+  explicit Tensor(common::Span<I const, D> shape, DeviceOrd device, Order order = kC)
       : order_{order} {
     // No device unroll as this is a host only function.
     std::copy(shape.data(), shape.data() + D, shape_);
@@ -795,11 +792,11 @@ class Tensor {
       shape_[i] = 1;
     }
     auto size = detail::CalcSize(shape_);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.SetDevice(device);
     }
     data_.Resize(size);
-    if (device >= 0) {
+    if (device.IsCUDA()) {
       data_.DevicePointer();  // Pull to device
     }
   }
@@ -807,7 +804,7 @@ class Tensor {
    * Initialize from 2 host iterators.
    */
   template <typename It, typename I, int32_t D>
-  explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC)
+  explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC)
      : order_{order} {
     auto &h_vec = data_.HostVector();
     h_vec.insert(h_vec.begin(), begin, end);
@@ -816,7 +813,7 @@ class Tensor {
   }
 
   template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], std::int32_t device,
+  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
                   Order order = kC)
       : order_{order} {
     auto &h_vec = data_.HostVector();
@@ -824,10 +821,6 @@ class Tensor {
     // shape
     this->Initialize(shape, device);
   }
-  template <typename I, int32_t D>
-  explicit Tensor(std::initializer_list<T> data, I const (&shape)[D], DeviceOrd device,
-                  Order order = kC)
-      : Tensor{data, shape, device.ordinal, order} {}
   /**
    * \brief Index operator. Not thread safe, should not be used in performance critical
    * region. For more efficient indexing, consider getting a view first.
@@ -944,9 +937,7 @@ class Tensor {
   /**
    * \brief Set device ordinal for this tensor.
   */
-  void SetDevice(int32_t device) const { data_.SetDevice(device); }
   void SetDevice(DeviceOrd device) const { data_.SetDevice(device); }
-  [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); }
   [[nodiscard]] DeviceOrd Device() const { return data_.Device(); }
 };
 
@@ -962,7 +953,7 @@ using Vector = Tensor<T, 1>;
 template <typename T, typename... Index>
 auto Empty(Context const *ctx, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   return t;
 }
@@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) {
 template <typename T, typename... Index>
 auto Constant(Context const *ctx, T v, Index &&...index) {
   Tensor<T, sizeof...(Index)> t;
-  t.SetDevice(ctx->gpu_id);
+  t.SetDevice(ctx->Device());
   t.Reshape(index...);
   t.Data()->Fill(std::move(v));
   return t;
@@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) {
 // Only first axis is supported for now.
 template <typename T, int32_t D>
 void Stack(Tensor<T, D> *l, Tensor<T, D> const &r) {
-  if (r.DeviceIdx() >= 0) {
-    l->SetDevice(r.DeviceIdx());
+  if (r.Device().IsCUDA()) {
+    l->SetDevice(r.Device());
   }
   l->ModifyInplace([&](HostDeviceVector<T> *data, common::Span<size_t, D> shape) {
     for (size_t i = 1; i < D; ++i) {
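The `linalg` factory helpers (`Empty`, `Constant`, `Zeros`) now read the placement straight from the context, so callers never touch an ordinal. A short sketch, assuming a `Context` already configured for the desired device (the function and variable names are illustrative):

```cpp
#include "xgboost/context.h"  // Context
#include "xgboost/linalg.h"   // Empty, Constant

namespace xgboost {
void MakeBuffers(Context const* ctx) {
  // Both tensors are created directly on ctx->Device(); no gpu_id is threaded through.
  auto scratch = linalg::Empty<float>(ctx, 16, 8);        // 16x8, uninitialised
  auto ones    = linalg::Constant<float>(ctx, 1.0f, 16);  // 16 values, filled with 1.0
  if (ctx->IsCUDA()) {
    // scratch and ones already live on the configured CUDA device here.
  }
  (void)scratch;
  (void)ones;
}
}  // namespace xgboost
```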
@@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache<PredictionCacheEntry> {
 
  public:
   PredictionContainer() : DMatrixCache<PredictionCacheEntry>{DefaultSize()} {}
-  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, std::int32_t device) {
+  PredictionCacheEntry& Cache(std::shared_ptr<DMatrix> m, DeviceOrd device) {
     auto p_cache = this->CacheItem(m);
-    if (device != Context::kCpuId) {
+    if (device.IsCUDA()) {
       p_cache->predictions.SetDevice(device);
     }
     return *p_cache;
@@ -66,7 +66,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con
   auto hess_dev = dh::CudaGetPointerDevice(hess.data);
   CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device.";
   auto &gpair = *out_gpair;
-  gpair.SetDevice(grad_dev);
+  gpair.SetDevice(DeviceOrd::CUDA(grad_dev));
   gpair.Reshape(grad.Shape(0), grad.Shape(1));
   auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev));
   auto cuctx = ctx->CUDACtx();
@@ -144,7 +144,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface,
   if (learner->Ctx()->IsCUDA()) {
     CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead());
   }
-  p_predt->SetDevice(proxy->DeviceIdx());
+  p_predt->SetDevice(proxy->Device());
 
   auto &shape = learner->GetThreadLocal().prediction_shape;
   size_t n_samples = p_m->Info().num_row_;
@@ -15,8 +15,7 @@
 
 #include "communicator-inl.cuh"
 
-namespace xgboost {
-namespace collective {
+namespace xgboost::collective {
 
 /**
  * @brief Find the global sum of the given values across all workers.
@@ -31,10 +30,9 @@ namespace collective {
  * @param size Number of values to sum.
  */
 template <typename T>
-void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
+void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) {
   if (info.IsRowSplit()) {
-    collective::AllReduce<collective::Operation::kSum>(device, values, size);
+    collective::AllReduce<collective::Operation::kSum>(device.ordinal, values, size);
   }
 }
-}  // namespace collective
-}  // namespace xgboost
+}  // namespace xgboost::collective
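The collective layer illustrates the boundary rule adopted throughout the commit: the typed `DeviceOrd` travels through XGBoost code, and `.ordinal` is unwrapped only at the last call into an API that still wants a raw integer. A minimal sketch of that shape, with a hypothetical legacy-style function standing in for the communicator:

```cpp
#include <cstddef>
#include "xgboost/context.h"  // DeviceOrd

// Hypothetical stand-in for an API that still takes a plain integer device id.
void LegacyAllReduceSum(int device_ordinal, double* values, std::size_t size);

void SumAcrossWorkers(xgboost::DeviceOrd device, double* values, std::size_t size) {
  // Everything above this call stays typed; the ordinal escapes exactly once.
  LegacyAllReduceSum(device.ordinal, values, size);
}
```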
@@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
                     [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
 }
 
-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                 dh::device_vector<Entry>* p_sorted_entries,
                                 dh::device_vector<float>* p_sorted_weights,
                                 dh::caching_device_vector<size_t>* p_column_sizes_scan) {
@@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
       sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
         return {0, e.index, e.fvalue};  // row_idx is not needed for scaning column size.
       });
-  detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
+  detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
                              IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
                              &column_sizes_scan);
   auto d_cuts_ptr = cuts_ptr.DeviceSpan();
   if (sketch_container->HasCategorical()) {
     auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
-    detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
+    detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
                                        &column_sizes_scan);
   }
 
@@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
 
   HistogramCuts cuts;
   SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
-                                   ctx->Ordinal());
+                                   ctx->Device());
   CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
   for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
     std::size_t page_nnz = page.data.Size();
@@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
 }
 
 template <std::uint32_t kBlockThreads, typename Kernel>
-std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
+std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
   int n_mps = 0;
-  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
+  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
   int n_blocks_per_mp = 0;
   dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                               kBlockThreads, shared_mem));
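The CUDA runtime is another such boundary: its calls keep their integer device parameter, so wrappers like `EstimateGridSize` unwrap the descriptor at the call site. A hedged sketch of the same query outside XGBoost (error handling via `dh::safe_cuda` omitted):

```cpp
#include <cuda_runtime_api.h>
#include "xgboost/context.h"  // DeviceOrd

int MultiprocessorCount(xgboost::DeviceOrd device) {
  int n_mps = 0;
  // The runtime API expects the plain ordinal carried inside the descriptor.
  cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal);
  return n_mps;
}
```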
@@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
 * \param out_column_size Output buffer for the size of each column.
 */
 template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
-void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
+void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
                                data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
   thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
 
-  std::size_t max_shared_memory = dh::MaxSharedMemory(device);
+  std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
   // Not strictly correct as we should use number of samples to determine the type of
   // counter. However, the sample size is not known due to sliding window on number of
   // elements.
@@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
 }
 
 template <typename BatchIt>
-void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
+void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
                         IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
                         HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
                         dh::caching_device_vector<size_t>* column_sizes_scan) {
@@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
 // Count the valid entries in each column and copy them out.
 template <typename AdapterBatch, typename BatchIter>
 void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
-                            float missing, size_t columns, size_t cuts_per_feature, int device,
+                            float missing, size_t columns, size_t cuts_per_feature,
+                            DeviceOrd device,
                             HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
                             dh::caching_device_vector<size_t>* column_sizes_scan,
                             dh::device_vector<Entry>* sorted_entries) {
@@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
 void SortByWeight(dh::device_vector<float>* weights,
                   dh::device_vector<Entry>* sorted_entries);
 
-void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
+void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
                                 dh::device_vector<Entry>* p_sorted_entries,
                                 dh::device_vector<float>* p_sorted_weights,
                                 dh::caching_device_vector<size_t>* p_column_sizes_scan);
@@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
 
 template <typename AdapterBatch>
 void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
-                          int device, size_t columns, size_t begin, size_t end,
+                          DeviceOrd device, size_t columns, size_t begin, size_t end,
                           float missing, SketchContainer *sketch_container,
                           int num_cuts) {
   // Copy current subset of valid elements into temporary storage and sort
@@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
 template <typename Batch>
 void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
                                   int num_cuts_per_feature,
-                                  bool is_ranking, float missing, int device,
+                                  bool is_ranking, float missing, DeviceOrd device,
                                   size_t columns, size_t begin, size_t end,
                                   SketchContainer *sketch_container) {
   dh::XGBCachingDeviceAllocator<char> alloc;
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   info.weights_.SetDevice(device);
   auto weights = info.weights_.ConstDeviceSpan();
 
@@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
   size_t num_rows = batch.NumRows();
   size_t num_cols = batch.NumCols();
   size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
-  int32_t device = sketch_container->DeviceIdx();
+  auto device = sketch_container->DeviceIdx();
   bool weighted = !info.weights_.Empty();
 
   if (weighted) {
     sketch_batch_num_elements = detail::SketchBatchNumElements(
         sketch_batch_num_elements,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, true);
+        device.ordinal, num_cuts_per_feature, true);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end =
           std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
     sketch_batch_num_elements = detail::SketchBatchNumElements(
         sketch_batch_num_elements,
         num_rows, num_cols, std::numeric_limits<size_t>::max(),
-        device, num_cuts_per_feature, false);
+        device.ordinal, num_cuts_per_feature, false);
     for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
       size_t end =
          std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
 };
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
     : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(size, v);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
     : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
     : impl_(nullptr) {
   impl_ = new HostDeviceVectorImpl<T>(init);
 }
@@ -81,7 +81,7 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return -1; }
+DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
   return false;
 }
 
-template <typename T>
-void HostDeviceVector<T>::SetDevice(int) const {}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
 
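In the CPU-only translation unit above (compiled when CUDA support is off), `Device()` now reports `DeviceOrd::CPU()` unconditionally and `SetDevice(DeviceOrd)` is a no-op, so code written against the descriptor degrades gracefully. A tiny sketch of what callers can rely on:

```cpp
#include "xgboost/host_device_vector.h"  // HostDeviceVector

// Illustrative helper: in a CPU-only build this is always false, because the
// implementation above hard-codes Device() to DeviceOrd::CPU().
template <typename T>
bool OnDevice(xgboost::HostDeviceVector<T> const& vec) {
  return vec.Device().IsCUDA();
}
```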
@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
 template <typename T>
 class HostDeviceVectorImpl {
  public:
-  HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       data_d_->resize(size, v);
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
 
   // Initializer can be std::vector<T> or std::initializer_list<T>
   template <class Initializer>
-  HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
-    if (device >= 0) {
+  HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
+    if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       LazyResizeDevice(init.size());
       Copy(init);
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
         gpu_access_{that.gpu_access_} {}
 
   ~HostDeviceVectorImpl() {
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
       SetDevice();
     }
   }
 
-  size_t Size() const {
+  [[nodiscard]] size_t Size() const {
     return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
   }
 
-  int DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd Device() const { return device_; }
 
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
@@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
     } else {
       auto ptr = other->ConstDevicePointer();
       SetDevice();
-      CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
+      CHECK_EQ(this->Device(), other->Device());
       dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
                                     ptr,
                                     other->Size() * sizeof(T),
@@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
     return data_h_;
   }
 
-  void SetDevice(int device) {
+  void SetDevice(DeviceOrd device) {
     if (device_ == device) { return; }
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
       LazySyncHost(GPUAccess::kNone);
     }
 
-    if (device_ >= 0 && device >= 0) {
-      CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
+    if (device_.IsCUDA() && device.IsCUDA()) {
+      CHECK_EQ(device_.ordinal, device.ordinal)
+          << "New device ordinal is different from previous one.";
     }
     device_ = device;
-    if (device_ >= 0) {
+    if (device_.IsCUDA()) {
      LazyResizeDevice(data_h_.size());
    }
  }
 
   void Resize(size_t new_size, T v) {
     if (new_size == Size()) { return; }
-    if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
+    if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
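The rewritten `SetDevice()` above encodes three rules: leaving the GPU syncs data back to the host first, switching directly between two different CUDA ordinals is rejected, and moving onto a GPU only reserves the device buffer lazily. A condensed, stand-alone mirror of that state transition (illustration only, not the real class):

```cpp
#include <cassert>

struct Placement { bool on_gpu{false}; int ordinal{-1}; };

void SetDevice(Placement* current, Placement target) {
  if (current->on_gpu == target.on_gpu && current->ordinal == target.ordinal) {
    return;  // same placement, nothing to do
  }
  if (current->on_gpu) {
    // LazySyncHost(): data is pulled back to the host before the placement changes.
  }
  if (current->on_gpu && target.on_gpu) {
    // Switching between two different GPUs is not supported.
    assert(current->ordinal == target.ordinal && "new device ordinal differs from previous one");
  }
  *current = target;
  if (current->on_gpu) {
    // LazyResizeDevice(): the device buffer is (re)allocated lazily on next access.
  }
}
```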
@@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
   }
 
-  bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
-  bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
-  bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
-  bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
-  bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
-  bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
-  GPUAccess Access() const { return gpu_access_; }
+  [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
+  [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
+  [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
+  [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
+  [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
+  [[nodiscard]] GPUAccess Access() const { return gpu_access_; }
 
  private:
-  int device_{-1};
+  DeviceOrd device_{DeviceOrd::CPU()};
   std::vector<T> data_h_{};
   std::unique_ptr<dh::device_vector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
@@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
   }
 
   void SetDevice() {
-    CHECK_GE(device_, 0);
+    CHECK_GE(device_.ordinal, 0);
     if (cudaSetDeviceHandler == nullptr) {
-      dh::safe_cuda(cudaSetDevice(device_));
+      dh::safe_cuda(cudaSetDevice(device_.ordinal));
     } else {
-      (*cudaSetDeviceHandler)(device_);
+      (*cudaSetDeviceHandler)(device_.ordinal);
     }
 
     if (!data_d_) {
@@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
 };
 
 template<typename T>
-HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
+HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
+HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
-HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
+HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
     : impl_(new HostDeviceVectorImpl<T>(init, device)) {}
 
 template <typename T>
@@ -309,7 +310,9 @@ template <typename T>
 size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
 
 template <typename T>
-int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
+DeviceOrd HostDeviceVector<T>::Device() const {
+  return impl_->Device();
+}
 
 template <typename T>
 T* HostDeviceVector<T>::DevicePointer() {
@@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
   return impl_->Access();
 }
 
-template <typename T>
-void HostDeviceVector<T>::SetDevice(int device) const {
-  impl_->SetDevice(device);
-}
-
 template <typename T>
 void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
-  impl_->SetDevice(device.ordinal);
+  impl_->SetDevice(device);
 }
 
 template <typename T>
@@ -8,16 +8,12 @@
 #include "xgboost/context.h"             // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
-namespace xgboost {
-namespace common {
-namespace cuda_impl {
+namespace xgboost::common::cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
-  values.SetDevice(ctx->gpu_id);
+  values.SetDevice(ctx->Device());
   auto const d_values = values.ConstDeviceSpan();
   dh::XGBCachingDeviceAllocator<char> alloc;
   return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                     thrust::plus<float>{});
 }
-}  // namespace cuda_impl
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common::cuda_impl
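Alongside the device work, this and a few later files collapse the old nested namespace blocks into the C++17 nested-namespace form. The two spellings below declare exactly the same scope; the shorter one needs a single closing brace:

```cpp
// Pre-C++17 style: three blocks, three closing braces.
namespace xgboost { namespace common { namespace cuda_impl {
void OldStyle();
}  // namespace cuda_impl
}  // namespace common
}  // namespace xgboost

// C++17 nested namespace definition: same scope, one closing brace.
namespace xgboost::common::cuda_impl {
void NewStyle();
}  // namespace xgboost::common::cuda_impl
```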
@@ -24,7 +24,7 @@ struct OptionalWeights {
 inline OptionalWeights MakeOptionalWeights(Context const* ctx,
                                            HostDeviceVector<float> const& weights) {
   if (ctx->IsCUDA()) {
-    weights.SetDevice(ctx->gpu_id);
+    weights.SetDevice(ctx->Device());
   }
   return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
 }
@@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
 // summary does the output element come from) result by definition of merged rank. So we
 // run it in 2 passes to obtain the merge path and then customize the standard merge
 // algorithm.
-void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
+void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
                Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
                Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
-  dh::safe_cuda(cudaSetDevice(device));
+  dh::safe_cuda(cudaSetDevice(device.ordinal));
   CHECK_EQ(d_x.size() + d_y.size(), out.size());
   CHECK_EQ(x_ptr.size(), out_ptr.size());
   CHECK_EQ(y_ptr.size(), out_ptr.size());
@@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
 void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
                            common::Span<OffsetT> cuts_ptr,
                            size_t total_cuts, Span<float> weights) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   Span<SketchEntry> out;
   dh::device_vector<SketchEntry> cuts;
   bool first_window = this->Current().empty();
@@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
    * pruning or merging. We preserve the first type and remove the second type.
    */
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
   dh::XGBCachingDeviceAllocator<char> alloc;
 
@@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
 
 void SketchContainer::Prune(size_t to) {
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
 
   OffsetT to_total = 0;
   auto& h_columns_ptr = columns_ptr_b_.HostVector();
@@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
 
 void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
                             Span<SketchEntry const> that) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   timer_.Start(__func__);
   if (this->Current().size() == 0) {
     CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
@@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
 }
 
 void SketchContainer::FixError() {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
   auto in = dh::ToSpan(this->Current());
   dh::LaunchN(in.size(), [=] __device__(size_t idx) {
@@ -502,7 +502,7 @@ void SketchContainer::FixError() {
 }
 
 void SketchContainer::AllReduce(bool is_column_split) {
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   auto world = collective::GetWorldSize();
   if (world == 1 || is_column_split) {
     return;
@@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
     auto offset = rank * d_columns_ptr.size();
     thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
                  gathered_ptrs.begin() + offset);
-    collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
+    collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
                                                        gathered_ptrs.size());
 
     // Get the data from all workers.
     std::vector<size_t> recv_lengths;
     dh::caching_device_vector<char> recvbuf;
-    collective::AllGatherV(device_, this->Current().data().get(),
+    collective::AllGatherV(device_.ordinal, this->Current().data().get(),
                            dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
-    collective::Synchronize(device_);
+    collective::Synchronize(device_.ordinal);
 
     // Segment the received data.
     auto s_recvbuf = dh::ToSpan(recvbuf);
@@ -584,7 +584,7 @@ struct InvalidCatOp {
 
 void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   timer_.Start(__func__);
-  dh::safe_cuda(cudaSetDevice(device_));
+  dh::safe_cuda(cudaSetDevice(device_.ordinal));
   p_cuts->min_vals_.Resize(num_columns_);
 
   // Sync between workers.
@@ -41,7 +41,7 @@ class SketchContainer {
   bst_row_t num_rows_;
   bst_feature_t num_columns_;
   int32_t num_bins_;
-  int32_t device_;
+  DeviceOrd device_;
 
   // Double buffer as neither prune nor merge can be performed inplace.
   dh::device_vector<SketchEntry> entries_a_;
@@ -93,35 +93,32 @@ class SketchContainer {
    * \param num_rows Total number of rows in known dataset (typically the rows in current worker).
    * \param device GPU ID.
    */
-  SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
-                  int32_t max_bin, bst_feature_t num_columns,
-                  bst_row_t num_rows, int32_t device)
-      : num_rows_{num_rows},
-        num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
-    CHECK_GE(device, 0);
+  SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
+                  bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
+      : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
+    CHECK(device.IsCUDA());
     // Initialize Sketches for this dmatrix
     this->columns_ptr_.SetDevice(device_);
     this->columns_ptr_.Resize(num_columns + 1);
     this->columns_ptr_b_.SetDevice(device_);
     this->columns_ptr_b_.Resize(num_columns + 1);
 
     this->feature_types_.Resize(feature_types.Size());
     this->feature_types_.Copy(feature_types);
     // Pull to device.
     this->feature_types_.SetDevice(device);
     this->feature_types_.ConstDeviceSpan();
     this->feature_types_.ConstHostSpan();
 
     auto d_feature_types = feature_types_.ConstDeviceSpan();
     has_categorical_ =
         !d_feature_types.empty() &&
-        thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
-                       common::IsCatOp{});
+        thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
 
     timer_.Init(__func__);
   }
   /* \brief Return GPU ID for this container. */
-  int32_t DeviceIdx() const { return device_; }
+  [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
   /* \brief Whether the predictor matrix contains categorical features. */
   bool HasCategorical() const { return has_categorical_; }
   /* \brief Accumulate weights of duplicated entries in input. */
@ -175,7 +172,7 @@ class SketchContainer {
|
|||||||
template <typename KeyComp = thrust::equal_to<size_t>>
|
template <typename KeyComp = thrust::equal_to<size_t>>
|
||||||
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
||||||
timer_.Start(__func__);
|
timer_.Start(__func__);
|
||||||
dh::safe_cuda(cudaSetDevice(device_));
|
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||||
this->columns_ptr_.SetDevice(device_);
|
this->columns_ptr_.SetDevice(device_);
|
||||||
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
||||||
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
|
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
|
||||||
|
|||||||
@ -15,8 +15,7 @@
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
#include "xgboost/logging.h" // CHECK_EQ

namespace xgboost {
namespace common {
namespace xgboost::common {
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
if (!ctx->IsCPU()) {

@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
}

void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
v.SetDevice(ctx->gpu_id);
v.SetDevice(ctx->Device());
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(1);

if (ctx->IsCPU()) {

@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
}
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

@ -15,14 +15,12 @@
#include "xgboost/host_device_vector.h" // HostDeviceVector
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply

namespace xgboost {
namespace common {
namespace cuda_impl {
namespace xgboost::common::cuda_impl {
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
CHECK_GE(t.Shape(1), 1);
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
segments.SetDevice(ctx->gpu_id);
segments.SetDevice(ctx->Device());
auto d_segments = segments.DeviceSpan();
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });

@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
});

out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(t.Shape(1));
if (weights.Empty()) {
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,

@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
dh::TemporaryArray<char> temp{bytes};
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
}
} // namespace cuda_impl
} // namespace common
} // namespace xgboost
} // namespace xgboost::common::cuda_impl

@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
auto d_sorted_idx = dh::ToSpan(sorted_idx);
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));

quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();

@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
scan_val, weights_cdf.begin());

auto n_segments = std::distance(seg_beg, seg_end) - 1;
quantiles->SetDevice(ctx->gpu_id);
quantiles->SetDevice(ctx->Device());
quantiles->Resize(n_segments);
auto d_results = quantiles->DeviceSpan();
auto d_weight_cdf = dh::ToSpan(weights_cdf);

@ -60,8 +60,8 @@ class Transform {
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}

/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.

@ -71,7 +71,7 @@ class Transform {
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = device_ >= 0;
bool on_device = device_.IsCUDA();

if (on_device) {
LaunchCUDA(func_, vectors...);

@ -116,11 +116,11 @@ class Transform {
}
// Recursive unpack for Shard.
template <typename T>
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
vector->SetDevice(device);
}
template <typename Head, typename... Rest>
void UnpackShard(int device,
void UnpackShard(DeviceOrd device,
const HostDeviceVector<Head> *_vector,
const HostDeviceVector<Rest> *... _vectors) const {
_vector->SetDevice(device);

@ -140,7 +140,7 @@ class Transform {
// granularity is used in data vector.
size_t shard_size = range_size;
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
const int kGrids =
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
if (kGrids == 0) {

@ -174,7 +174,7 @@ class Transform {
/*! \brief Range object specifying parallel threads index range. */
Range range_;
int32_t n_threads_;
int32_t device_;
DeviceOrd device_;
};

public:

@ -192,8 +192,8 @@ class Transform {
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
int32_t device_idx) {
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
DeviceOrd device) {
return Evaluator<Functor>{func, std::move(range), n_threads, device};
}
};

@ -20,7 +20,6 @@ namespace xgboost {

DMLC_REGISTER_PARAMETER(Context);

bst_d_ordinal_t constexpr Context::kCpuId;
std::int64_t constexpr Context::kDefaultSeed;

Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {}

@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
return std::nullopt;
}

std::int32_t parsed_id{Context::kCpuId};
std::int32_t parsed_id{DeviceOrd::CPUOrdinal()};
auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id);
if (res.ec != std::errc()) {
return std::nullopt;

@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {

auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':');
DeviceOrd device;
device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check.
device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check.
if (split_it == s_device.cend()) {
// no ordinal.
if (s_device == DeviceSym::CPU()) {

@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
device = DeviceOrd::CUDA(opt_id.value());
}

if (device.ordinal < Context::kCpuId) {
if (device.ordinal < DeviceOrd::CPUOrdinal()) {
fatal();
}
device = CUDAOrdinal(device, fail_on_invalid_gpu_id);

@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) {
}
} // namespace

std::ostream& operator<<(std::ostream& os, DeviceOrd ord) {
os << ord.Name();
return os;
}

void Context::Init(Args const& kwargs) {
auto unknown = this->UpdateAllowUnknown(kwargs);
if (!unknown.empty()) {
std::stringstream ss;
std::size_t i = 0;
ss << "[Internal Error] Unknown parameters passed to the Context {";
for (auto const& [k, _] : unknown) {
ss << '"' << k << '"';
if (++i != unknown.size()) {
ss << ", ";
}
}
ss << "}\n";
LOG(FATAL) << ss.str();
}
}

void Context::ConfigureGpuId(bool require_gpu) {
if (this->IsCPU() && require_gpu) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}});

@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
error::WarnDeprecatedGPUId();
auto opt_id = ParseInt(StringView{gpu_id_it->second});
CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. Got:" << gpu_id_it->second;
if (opt_id.value() > Context::kCpuId) {
if (opt_id.value() > DeviceOrd::CPUOrdinal()) {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}});
} else {
this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}});

@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) {
this->SetDevice(new_d);

if (this->IsCPU()) {
CHECK_EQ(this->device_.ordinal, kCpuId);
CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal());
} else {
CHECK_GT(this->device_.ordinal, kCpuId);
CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal());
}
}

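A minimal usage sketch of the device handling shown above, not part of this commit: it relies only on the `Context` and `DeviceOrd` members that appear in the diff (`Init`, `Device`, `Ordinal`, `DeviceOrd::CUDA`, `Name`), assumes the public headers `xgboost/context.h` and `xgboost/logging.h`, and assumes a machine with at least two CUDA devices.

#include <xgboost/context.h>  // Context, DeviceOrd
#include <xgboost/logging.h>  // CHECK, CHECK_EQ

void ConfigureForCuda1() {
  xgboost::Context ctx;
  // The "device" parameter replaces the deprecated gpu_id; "cuda:1" selects CUDA ordinal 1.
  ctx.Init({{"device", "cuda:1"}});
  CHECK(ctx.Device().IsCUDA());
  CHECK_EQ(ctx.Ordinal(), 1);

  // Equivalent spelling, built from a DeviceOrd instead of a raw string.
  ctx.Init({{"device", xgboost::DeviceOrd::CUDA(1).Name()}});
}
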
@ -687,13 +687,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col

linalg::Stack(&this->labels, that.labels);

this->weights_.SetDevice(that.weights_.DeviceIdx());
this->weights_.SetDevice(that.weights_.Device());
this->weights_.Extend(that.weights_);

this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx());
this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device());
this->labels_lower_bound_.Extend(that.labels_lower_bound_);

this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx());
this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device());
this->labels_upper_bound_.Extend(that.labels_upper_bound_);

linalg::Stack(&this->base_margin_, that.base_margin_);

@ -723,7 +723,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());
this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
this->feature_weights.SetDevice(that.feature_weights.Device());
this->feature_weights.Copy(that.feature_weights);
}
}

@ -738,22 +738,22 @@ void MetaInfo::SynchronizeNumberOfColumns() {

namespace {
template <typename T>
void CheckDevice(std::int32_t device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device;
void CheckDevice(DeviceOrd device, HostDeviceVector<T> const& v) {
bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device;
if (!valid) {
LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than "
"the booster. The device ordinal of the data is: "
<< v.DeviceIdx() << "; the device ordinal of the Booster is: " << device;
<< v.Device() << "; the device ordinal of the Booster is: " << device;
}
}

template <typename T, std::int32_t D>
void CheckDevice(std::int32_t device, linalg::Tensor<T, D> const& v) {
void CheckDevice(DeviceOrd device, linalg::Tensor<T, D> const& v) {
CheckDevice(device, *v.Data());
}
} // anonymous namespace

void MetaInfo::Validate(std::int32_t device) const {
void MetaInfo::Validate(DeviceOrd device) const {
if (group_ptr_.size() != 0 && weights_.Size() != 0) {
CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight();
return;

@ -29,13 +29,13 @@ template <typename T, int32_t D>
void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor<T, D>* p_out) {
ArrayInterface<D> array(arr_interface);
if (array.n == 0) {
p_out->SetDevice(0);
p_out->SetDevice(DeviceOrd::CUDA(0));
p_out->Reshape(array.shape);
return;
}
CHECK_EQ(array.valid.Capacity(), 0)
<< "Meta info like label or weight can not have missing value.";
auto ptr_device = SetDeviceToPtr(array.data);
auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data));
p_out->SetDevice(ptr_device);

if (array.is_contiguous && array.type == ToDType<T>::kType) {

@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens
return;
}
p_out->Reshape(array.shape);
auto t = p_out->View(DeviceOrd::CUDA(ptr_device));
auto t = p_out->View(ptr_device);
linalg::ElementWiseTransformDevice(
t,
[=] __device__(size_t i, T) {

@ -86,7 +86,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
});
dh::caching_device_vector<bool> flag(1);
auto d_flag = dh::ToSpan(flag);
auto d = SetDeviceToPtr(array_interface.data);
auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data));
dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; });
dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) {
auto typed = TypedIndex<uint32_t, 1>{array_interface};

@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
CudfAdapterBatch(common::Span<ArrayInterface<1>> columns, size_t num_rows)
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
[[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); }
[[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];

@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}

__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx)

@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return value;
}

XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }

private:
common::Span<ArrayInterface<1>> columns_;

@ -120,14 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return;
}

device_idx_ = dh::CudaGetPointerDevice(first_column.data);
CHECK_NE(device_idx_, Context::kCpuId);
dh::safe_cuda(cudaSetDevice(device_idx_));
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data));
CHECK(device_.IsCUDA());
dh::safe_cuda(cudaSetDevice(device_.ordinal));
for (auto& json_col : json_columns) {
auto column = ArrayInterface<1>(get<Object const>(json_col));
columns.push_back(column);
num_rows_ = std::max(num_rows_, column.Shape(0));
CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data))
CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data))
<< "All columns should use the same device.";
CHECK_EQ(num_rows_, column.Shape(0))
<< "All columns should have same number of rows.";

@ -143,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
return batch_;
}

size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return num_rows_; }
[[nodiscard]] std::size_t NumColumns() const { return columns_.size(); }
[[nodiscard]] DeviceOrd Device() const { return device_; }

private:
CudfAdapterBatch batch_;
dh::device_vector<ArrayInterface<1>> columns_;
size_t num_rows_{0};
int32_t device_idx_{Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};

class CupyAdapterBatch : public detail::NoMetaInfo {

@ -159,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
CupyAdapterBatch() = default;
explicit CupyAdapterBatch(ArrayInterface<2> array_interface)
: array_interface_(std::move(array_interface)) {}
size_t Size() const {
[[nodiscard]] std::size_t Size() const {
return array_interface_.Shape(0) * array_interface_.Shape(1);
}
__device__ COOTuple GetElement(size_t idx) const {
[[nodiscard]]__device__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % array_interface_.Shape(1);
size_t row_idx = idx / array_interface_.Shape(1);
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
[[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx);
return value;
}

XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }

private:
ArrayInterface<2> array_interface_;

@ -189,28 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
if (array_interface_.Shape(0) == 0) {
return;
}
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
CHECK_NE(device_idx_, Context::kCpuId);
device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data));
CHECK(device_.IsCUDA());
}
explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; }
[[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; }

size_t NumRows() const { return array_interface_.Shape(0); }
size_t NumColumns() const { return array_interface_.Shape(1); }
int32_t DeviceIdx() const { return device_idx_; }
[[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); }
[[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); }
[[nodiscard]] DeviceOrd Device() const { return device_; }

private:
ArrayInterface<2> array_interface_;
CupyAdapterBatch batch_;
int32_t device_idx_ {Context::kCpuId};
DeviceOrd device_{DeviceOrd::CPU()};
};

// Returns maximum row length
template <typename AdapterBatchT>
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, DeviceOrd device,
float missing) {
dh::safe_cuda(cudaSetDevice(device_idx));
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));

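The adapter hunks above swap the raw ordinal accessor `DeviceIdx()` for `Device()` returning a `DeviceOrd`, with `DeviceOrd::CPU()` as the default in place of `Context::kCpuId`. A hedged caller-side sketch of that migration, not part of the diff: `AdapterT` is a placeholder for an adapter such as `CudfAdapter` or `CupyAdapter`, and the include paths are assumptions based on the internal headers these classes already use.

#include <cuda_runtime_api.h>            // cudaSetDevice
#include "xgboost/context.h"             // DeviceOrd
#include "../common/device_helpers.cuh"  // dh::safe_cuda (internal header, path assumed)

template <typename AdapterT>
void BindToAdapterDevice(AdapterT const& adapter) {
  // Previously: std::int32_t ordinal = adapter.DeviceIdx(); compared against Context::kCpuId.
  xgboost::DeviceOrd device = adapter.Device();
  if (device.IsCUDA()) {
    // The raw CUDA ordinal remains available for CUDA runtime calls.
    dh::safe_cuda(cudaSetDevice(device.ordinal));
  }
}
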
@ -94,22 +94,18 @@ __global__ void CompressBinEllpackKernel(
}

// Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
bool is_dense, size_t row_stride,
size_t n_rows)
: is_dense(is_dense),
cuts_(std::move(cuts)),
row_stride(row_stride),
n_rows(n_rows) {
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows)
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));

monitor_.Start("InitCompressedData");
InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}

EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense,
size_t row_stride,
common::Span<FeatureType const> feature_types)

@ -123,7 +119,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param)
: is_dense(dmat->IsDense()) {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));

n_rows = dmat->Info().num_row_;

@ -138,15 +134,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP
monitor_.Stop("Quantiles");

monitor_.Start("InitCompressedData");
this->InitCompressedData(ctx->gpu_id);
this->InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");

dmat->Info().feature_types.SetDevice(ctx->gpu_id);
dmat->Info().feature_types.SetDevice(ctx->Device());
auto ft = dmat->Info().feature_types.ConstDeviceSpan();
monitor_.Start("BinningCompression");
CHECK(dmat->SingleColBlock());
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(ctx->gpu_id, batch, ft);
CreateHistIndices(ctx->Device(), batch, ft);
}
monitor_.Stop("BinningCompression");
}

@ -209,7 +205,7 @@ struct TupleScanOp {
// to remove missing data
template <typename AdapterBatchT>
void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType const> feature_types,
EllpackPageImpl* dst, int device_idx, float missing) {
EllpackPageImpl* dst, DeviceOrd device, float missing) {
// Some witchcraft happens here
// The goal is to copy valid elements out of the input to an ELLPACK matrix
// with a given row stride, using no extra working memory Standard stream

@ -241,7 +237,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// Tuple[2] = The index in the input data
using Tuple = thrust::tuple<size_t, size_t, size_t>;

auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();

@ -280,10 +276,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
#endif
}

void WriteNullValues(EllpackPageImpl* dst, int device_idx,
common::Span<size_t> row_counts) {
void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span<size_t> row_counts) {
// Write the null values
auto device_accessor = dst->GetDeviceAccessor(device_idx);
auto device_accessor = dst->GetDeviceAccessor(device);
common::CompressedBufferWriter writer(device_accessor.NumSymbols());
auto d_compressed_buffer = dst->gidx_buffer.DevicePointer();
auto row_stride = dst->row_stride;

@ -300,11 +295,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx,
}

template <typename AdapterBatch>
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts) {
dh::safe_cuda(cudaSetDevice(device));
dh::safe_cuda(cudaSetDevice(device.ordinal));

*this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows);
CopyDataToEllpack(batch, feature_types, this, device, missing);

@ -313,7 +308,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device,

#define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \
template EllpackPageImpl::EllpackPageImpl( \
__BATCH_T batch, float missing, int device, bool is_dense, \
__BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \
common::Span<size_t> row_counts_span, common::Span<FeatureType const> feature_types, \
size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts);

@ -370,9 +365,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
[&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; });
row_stride = *std::max_element(it, it + page.Size());

CHECK_GE(ctx->gpu_id, 0);
CHECK(ctx->IsCUDA());
monitor_.Start("InitCompressedData");
InitCompressedData(ctx->gpu_id);
InitCompressedData(ctx->Device());
monitor_.Stop("InitCompressedData");

// copy gidx

@ -382,7 +377,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag
dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(),
cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream()));

auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft);
auto accessor = this->GetDeviceAccessor(ctx->Device(), ft);
auto null = accessor.NullValue();
CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null);
}

@ -407,8 +402,7 @@ struct CopyPage {
};

// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page,
size_t offset) {
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);

@ -468,7 +462,7 @@ struct CompactPage {
};

// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) {
monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride);

@ -481,13 +475,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page,
}

// Initialize the buffer to stored compressed features.
void EllpackPageImpl::InitCompressedData(int device) {
void EllpackPageImpl::InitCompressedData(DeviceOrd device) {
size_t num_symbols = NumSymbols();

// Required buffer size for storing data matrix in ELLPack format.
size_t compressed_size_bytes =
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows,
num_symbols);
common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols);
gidx_buffer.SetDevice(device);
// Don't call fill unnecessarily
if (gidx_buffer.Size() == 0) {

@ -499,7 +492,7 @@ void EllpackPageImpl::InitCompressedData(int device) {
}

// Compress a CSR page into ELLPACK.
void EllpackPageImpl::CreateHistIndices(int device,
void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types) {
if (row_batch.Size() == 0) return;

@ -509,7 +502,7 @@ void EllpackPageImpl::CreateHistIndices(int device,

// bin and compress entries in batches of rows
size_t gpu_batch_nrows =
std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)),
std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(row_batch.Size()));

size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows);

@ -572,7 +565,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,
}

EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
int device, common::Span<FeatureType const> feature_types) const {
DeviceOrd device, common::Span<FeatureType const> feature_types) const {
gidx_buffer.SetDevice(device);
return {device,
cuts_,

@ -586,7 +579,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor(
}
EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor(
common::Span<FeatureType const> feature_types) const {
return {Context::kCpuId,
return {DeviceOrd::CPU(),
cuts_,
is_dense,
row_stride,

@ -35,16 +35,17 @@ struct EllpackDeviceAccessor {

common::Span<const FeatureType> feature_types;

EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts,
bool is_dense, size_t row_stride, size_t base_rowid,
size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter,
EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense,
size_t row_stride, size_t base_rowid, size_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows) ,gidx_iter(gidx_iter),
n_rows(n_rows),
gidx_iter(gidx_iter),
feature_types{feature_types} {
if (device == Context::kCpuId) {
if (device.IsCPU()) {
gidx_fvalue_map = cuts.cut_values_.ConstHostSpan();
feature_segments = cuts.cut_ptrs_.ConstHostSpan();
min_fvalue = cuts.min_vals_.ConstHostSpan();

@ -59,7 +60,7 @@ struct EllpackDeviceAccessor {
}
// Get a matrix element, uses binary search for look up Return NaN if missing
// Given a row index and a feature index, returns the corresponding cut value
__device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const {
ridx -= base_rowid;
auto row_begin = row_stride * ridx;
auto row_end = row_begin + row_stride;

@ -77,7 +78,7 @@ struct EllpackDeviceAccessor {
}

template <bool is_cat>
__device__ uint32_t SearchBin(float value, size_t column_id) const {
[[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const {
auto beg = feature_segments[column_id];
auto end = feature_segments[column_id + 1];
uint32_t idx = 0;

@ -99,7 +100,7 @@ struct EllpackDeviceAccessor {
return idx;
}

__device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const {
auto gidx = GetBinIndex(ridx, fidx);
if (gidx == -1) {
return nan("");

@ -108,18 +109,18 @@ struct EllpackDeviceAccessor {
}

// Check if the row id is withing range of the current batch.
__device__ bool IsInRange(size_t row_id) const {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }

XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); }

XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); }

XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
[[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); }
};

@ -141,14 +142,13 @@ class EllpackPageImpl {
* This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo
* and the given number of rows.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense,
size_t row_stride, size_t n_rows);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride,
size_t n_rows);
/*!
* \brief Constructor used for external memory.
*/
EllpackPageImpl(int device, common::HistogramCuts cuts,
const SparsePage &page, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types);
EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page,
bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types);

/*!
* \brief Constructor from an existing DMatrix.

@ -159,7 +159,7 @@ class EllpackPageImpl {
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

template <typename AdapterBatch>
explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense,
explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, common::HistogramCuts const& cuts);

@ -176,7 +176,7 @@ class EllpackPageImpl {
* @param offset The number of elements to skip before copying.
* @returns The number of elements copied.
*/
size_t Copy(int device, EllpackPageImpl const *page, size_t offset);
size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset);

/*! \brief Compact the given ELLPACK page into the current page.
*

@ -184,11 +184,10 @@ class EllpackPageImpl {
* @param page The ELLPACK page to compact from.
* @param row_indexes Row indexes for the compacted page.
*/
void Compact(int device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);
void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes);

/*! \return Number of instances in the page. */
size_t Size() const;
[[nodiscard]] size_t Size() const;

/*! \brief Set the base row id for this page. */
void SetBaseRowId(std::size_t row_id) {

@ -204,12 +203,12 @@ class EllpackPageImpl {

/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
size_t NumSymbols() const { return cuts_.TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; }

EllpackDeviceAccessor
GetDeviceAccessor(int device,
common::Span<FeatureType const> feature_types = {}) const;
EllpackDeviceAccessor GetHostAccessor(common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
[[nodiscard]] EllpackDeviceAccessor GetHostAccessor(
common::Span<FeatureType const> feature_types = {}) const;

private:
/*!

@ -218,13 +217,13 @@ class EllpackPageImpl {
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(int device,
void CreateHistIndices(DeviceOrd device,
const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/*!
* \brief Initialize the buffer to store compressed features.
*/
void InitCompressedData(int device);
void InitCompressedData(DeviceOrd device);

public:

@ -10,7 +10,7 @@

namespace xgboost::data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(device_));
dh::safe_cuda(cudaSetDevice(device_.ordinal));
if (!this->ReadCache()) {
if (count_ != 0 && !sync_) {
// source is initialized to be the 0th page during construction, so when count_ is 0

@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> {
BatchParam param_;
common::Span<FeatureType const> feature_types_;
std::unique_ptr<common::HistogramCuts> cuts_;
std::int32_t device_;
DeviceOrd device_;

public:
EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches,
std::shared_ptr<Cache> cache, BatchParam param,
std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride,
common::Span<FeatureType const> feature_types,
std::shared_ptr<SparsePageSource> source, std::int32_t device)
std::shared_ptr<SparsePageSource> source, DeviceOrd device)
: PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false),
is_dense_{is_dense},
row_stride_{row_stride},

@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro
auto pctx = MakeProxy(proxy_)->Ctx();

Context ctx;
-ctx.UpdateAllowUnknown(
-Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
+ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}});
// hardcoded parameter.
BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()};

@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
return HostAdapterDispatch(proxy, [&](auto const& value) {
size_t n_threads = ctx->Threads();
size_t n_features = column_sizes.size();
-linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId);
+linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU());
column_sizes_tloc.Data()->Fill(0ul);
auto view = column_sizes_tloc.HostView();
common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) {
@ -47,9 +47,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,

int32_t current_device;
dh::safe_cuda(cudaGetDevice(&current_device));
-auto get_device = [&]() -> int32_t {
-std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id;
-CHECK_NE(d, Context::kCpuId);
+auto get_device = [&]() {
+auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device();
+CHECK(!d.IsCPU());
return d;
};

@ -59,9 +59,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
common::HistogramCuts cuts;
do {
// We use do while here as the first batch is fetched in ctor
-// ctx_.gpu_id = proxy->DeviceIdx();
-CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs());
-dh::safe_cuda(cudaSetDevice(get_device()));
+CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
+dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (cols == 0) {
cols = num_cols();
collective::Allreduce<collective::Operation::kMax>(&cols, 1);
@ -93,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
auto n_features = cols;
CHECK_GE(n_features, 1) << "Data must has at least 1 column.";

-dh::safe_cuda(cudaSetDevice(get_device()));
+dh::safe_cuda(cudaSetDevice(get_device().ordinal));
if (!ref) {
HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch(
@ -132,7 +131,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
size_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
-dh::safe_cuda(cudaSetDevice(get_device()));
+dh::safe_cuda(cudaSetDevice(get_device().ordinal));
auto rows = num_rows();
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
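The get_device lambda in the hunk above falls back to the active CUDA ordinal when the incoming context is CPU-only. Below is a rough restatement of that selection logic as a free function; the name is invented, error handling is elided, and it assumes the same Context and DeviceOrd API shown in the diff.

// --- illustration only, not part of the diff ---
#include <cstdint>
#include <xgboost/context.h>

// Hypothetical helper mirroring the pattern used by InitFromCUDA above.
xgboost::DeviceOrd ChooseCudaDevice(xgboost::Context const* ctx, std::int32_t current_device) {
  // A CPU context means "use whichever CUDA device is currently active";
  // otherwise honour the device already named by the context.
  auto d = ctx->IsCPU() ? xgboost::DeviceOrd::CUDA(current_device) : ctx->Device();
  return d;  // expected to be a CUDA device at this point
}
// --- end illustration ---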
@ -184,18 +183,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,
if (!ellpack_) {
ellpack_.reset(new EllpackPage());
if (ctx->IsCUDA()) {
-this->Info().feature_types.SetDevice(ctx->gpu_id);
+this->Info().feature_types.SetDevice(ctx->Device());
*ellpack_->Impl() =
EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else if (fmat_ctx_.IsCUDA()) {
-this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id);
+this->Info().feature_types.SetDevice(fmat_ctx_.Device());
*ellpack_->Impl() =
EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
} else {
// Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM
// for cut reference.
auto cuda_ctx = ctx->MakeCUDA();
-this->Info().feature_types.SetDevice(cuda_ctx.gpu_id);
+this->Info().feature_types.SetDevice(cuda_ctx.Device());
*ellpack_->Impl() =
EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
}
@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
-this->ctx_.gpu_id = Context::kCpuId;
+this->ctx_.Init(Args{{"device", "cpu"}});
}

-void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices,
-char const *c_values, bst_feature_t n_features, bool on_host) {
+void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values,
+bst_feature_t n_features, bool on_host) {
CHECK(on_host) << "Not implemented on device.";
std::shared_ptr<CSRArrayAdapter> adapter{new CSRArrayAdapter(
StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)};
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
-this->ctx_.gpu_id = Context::kCpuId;
+this->ctx_.Init(Args{{"device", "cpu"}});
}

namespace cuda_impl {
@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
-if (adapter->DeviceIdx() < 0) {
+if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
-ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}

void DMatrixProxy::FromCudaArray(StringView interface_str) {
@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
this->batch_ = adapter;
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
-if (adapter->DeviceIdx() < 0) {
+if (adapter->Device().IsCPU()) {
// empty data
CHECK_EQ(this->Info().num_row_, 0);
ctx_ = ctx_.MakeCUDA(dh::CurrentDevice());
return;
}
-ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx());
+ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal);
}

namespace cuda_impl {
@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix {
#endif  // defined(XGBOOST_USE_CUDA)

public:
-int DeviceIdx() const { return ctx_.gpu_id; }
+DeviceOrd Device() const { return ctx_.Device(); }

void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport();
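The proxy hunks above stop writing gpu_id directly and instead configure the Context. Below is a hedged sketch of the two configuration paths used there, with an illustrative function name; it assumes xgboost::Args and the Context methods already shown in the diff.

// --- illustration only, not part of the diff ---
#include <xgboost/context.h>

void ConfigureContextsSketch() {
  xgboost::Context ctx;
  // String-based initialisation replaces assigning Context::kCpuId to gpu_id.
  ctx.Init(xgboost::Args{{"device", "cpu"}});

  // Deriving a CUDA context from an ordinal replaces assigning a raw ordinal.
  xgboost::Context cuda_ctx = ctx.MakeCUDA(0);
  // cuda_ctx.Device() now reports a CUDA device and cuda_ctx.Ordinal() its ordinal.
}
// --- end illustration ---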
@ -253,7 +253,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
}
if (batch.BaseMargin() != nullptr) {
info_.base_margin_ = decltype(info_.base_margin_){
-batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId};
+batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()};
}
if (batch.Qid() != nullptr) {
qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size());
@ -10,9 +10,7 @@
#include "xgboost/context.h"  // for Context
#include "xgboost/data.h"

-namespace xgboost {
-namespace data {
+namespace xgboost::data {

// Does not currently support metainfo as no on-device data source contains this
// Current implementation assumes a single batch. More batches can
// be supported in future. Does not currently support inferring row/column size
@ -21,13 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
-auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice()
-: adapter->DeviceIdx();
-CHECK_GE(device, 0);
-dh::safe_cuda(cudaSetDevice(device));
+auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
+? DeviceOrd::CUDA(dh::CurrentDevice())
+: adapter->Device();
+CHECK(device.IsCUDA());
+dh::safe_cuda(cudaSetDevice(device.ordinal));

Context ctx;
-ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}});
+ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});

CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@ -52,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing,
int nthread, DataSplitMode data_split_mode);
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
@ -40,9 +40,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span<Entry> data,
}

template <typename AdapterBatchT>
-void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
-int device_idx, float missing) {
-dh::safe_cuda(cudaSetDevice(device_idx));
+void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset, DeviceOrd device,
+float missing) {
+dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
@ -55,14 +55,13 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
});

dh::XGBCachingDeviceAllocator<char> alloc;
-thrust::exclusive_scan(thrust::cuda::par(alloc),
-thrust::device_pointer_cast(offset.data()),
-thrust::device_pointer_cast(offset.data() + offset.size()),
-thrust::device_pointer_cast(offset.data()));
+thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
+thrust::device_pointer_cast(offset.data() + offset.size()),
+thrust::device_pointer_cast(offset.data()));
}

template <typename AdapterBatchT>
-size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
+size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing,
SparsePage* page) {
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();
@ -45,7 +45,8 @@ BatchSet<EllpackPage> SparsePageDMatrix::GetEllpackBatches(Context const* ctx,
ellpack_page_source_.reset();  // make sure resource is released before making new ones.
ellpack_page_source_ = std::make_shared<EllpackPageSource>(
this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id),
-param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id);
+param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_,
+ctx->Device());
} else {
CHECK(sparse_page_source_);
ellpack_page_source_->Reset();
@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) {
}  // namespace detail

void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) {
-auto device = proxy->DeviceIdx();
-if (device < 0) {
-device = dh::CurrentDevice();
+auto device = proxy->Device();
+if (device.IsCPU()) {
+device = DeviceOrd::CUDA(dh::CurrentDevice());
}
-CHECK_GE(device, 0);
+CHECK(device.IsCUDA());

cuda_impl::Dispatch(proxy,
[&](auto const &value) { CopyToSparsePage(value, device, missing, page); });
@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
bst_target_t const n_groups = model_.learner_model_param->OutputLength();
monitor_.Start("BoostNewTrees");

-predt->predictions.SetDevice(ctx_->Ordinal());
+predt->predictions.SetDevice(ctx_->Device());
auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_,
model_.learner_model_param->OutputLength());
CHECK_NE(n_groups, 0);
@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
} else {
CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs";
linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)},
-ctx_->Ordinal()};
+ctx_->Device()};
bool update_predict = true;
for (bst_target_t gid = 0; gid < n_groups; ++gid) {
node_position.clear();
@ -736,7 +736,7 @@ class Dart : public GBTree {

PredictionCacheEntry predts;  // temporary storage for prediction
if (ctx_->IsCUDA()) {
-predts.predictions.SetDevice(ctx_->gpu_id);
+predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);
// multi-target is not yet supported.
@ -761,8 +761,8 @@ class Dart : public GBTree {
CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size());

size_t n_rows = p_fmat->Info().num_row_;
-if (predts.predictions.DeviceIdx() != Context::kCpuId) {
-p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
+if (predts.predictions.Device().IsCUDA()) {
+p_out_preds->predictions.SetDevice(predts.predictions.Device());
GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, n_groups,
group);
@ -801,8 +801,8 @@ class Dart : public GBTree {

StringView msg{"Unsupported data type for inplace predict."};
PredictionCacheEntry predts;
-if (ctx_->gpu_id != Context::kCpuId) {
-predts.predictions.SetDevice(ctx_->gpu_id);
+if (ctx_->IsCUDA()) {
+predts.predictions.SetDevice(ctx_->Device());
}
predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0);

@ -838,8 +838,8 @@ class Dart : public GBTree {
CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size());

size_t n_rows = p_fmat->Info().num_row_;
-if (predts.predictions.DeviceIdx() != Context::kCpuId) {
-p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx());
+if (predts.predictions.Device().IsCUDA()) {
+p_out_preds->predictions.SetDevice(predts.predictions.Device());
auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device());
GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(),
predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups,
@ -305,10 +305,10 @@ linalg::TensorView<float const, 1> LearnerModelParam::BaseScore(Context const* c

void LearnerModelParam::Copy(LearnerModelParam const& that) {
base_score_.Reshape(that.base_score_.Shape());
-base_score_.Data()->SetDevice(that.base_score_.DeviceIdx());
+base_score_.Data()->SetDevice(that.base_score_.Device());
base_score_.Data()->Copy(*that.base_score_.Data());
std::as_const(base_score_).HostView();
-if (that.base_score_.DeviceIdx() != Context::kCpuId) {
+if (!that.base_score_.Device().IsCPU()) {
std::as_const(base_score_).View(that.base_score_.Device());
}
CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead());
@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner {
if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) {
if (p_fmat) {
auto const& info = p_fmat->Info();
-info.Validate(Ctx()->Ordinal());
+info.Validate(Ctx()->Device());
// We estimate it from input data.
linalg::Tensor<float, 1> base_score;
InitEstimation(info, &base_score);
@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner {
monitor_.Init("Learner");
for (std::shared_ptr<DMatrix> const& d : cache) {
if (d) {
-prediction_container_.Cache(d, Context::kCpuId);
+prediction_container_.Cache(d, DeviceOrd::CPU());
}
}
}
@ -1046,7 +1046,7 @@ class LearnerIO : public LearnerConfiguration {
? std::numeric_limits<float>::quiet_NaN()
: obj_->ProbToMargin(mparam_.base_score)},
{1},
-Context::kCpuId},
+DeviceOrd::CPU()},
obj_->Task(), tparam_.multi_strategy);

if (attributes_.find("objective") != attributes_.cend()) {
@ -1271,7 +1271,7 @@ class LearnerImpl : public LearnerIO {

this->ValidateDMatrix(train.get(), true);

-auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
+auto& predt = prediction_container_.Cache(train, ctx_.Device());

monitor_.Start("PredictRaw");
this->PredictRaw(train.get(), &predt, true, 0, 0);
@ -1301,7 +1301,7 @@ class LearnerImpl : public LearnerIO {
CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
<< "The number of columns in gradient should be equal to the number of targets/classes in "
"the model.";
-auto& predt = prediction_container_.Cache(train, ctx_.gpu_id);
+auto& predt = prediction_container_.Cache(train, ctx_.Device());
gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get());
monitor_.Stop("BoostOneIter");
}
@ -1327,11 +1327,11 @@ class LearnerImpl : public LearnerIO {

for (size_t i = 0; i < data_sets.size(); ++i) {
std::shared_ptr<DMatrix> m = data_sets[i];
-auto &predt = prediction_container_.Cache(m, ctx_.gpu_id);
+auto &predt = prediction_container_.Cache(m, ctx_.Device());
this->ValidateDMatrix(m.get(), false);
this->PredictRaw(m.get(), &predt, false, 0, 0);

-auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions;
+auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions;
out.Resize(predt.predictions.Size());
out.Copy(predt.predictions);

@ -1367,7 +1367,7 @@ class LearnerImpl : public LearnerIO {
} else if (pred_leaf) {
gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end);
} else {
-auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id);
+auto& prediction = prediction_container_.Cache(data, ctx_.Device());
this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end);
// Copy the prediction cache to output prediction. out_preds comes from C API
out_preds->SetDevice(ctx_.Device());
@ -1447,7 +1447,7 @@ class LearnerImpl : public LearnerIO {

void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const {
MetaInfo const& info = p_fmat->Info();
-info.Validate(ctx_.gpu_id);
+info.Validate(ctx_.Device());

if (is_training) {
CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_)
@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
}

void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) {
-if (ctx_->gpu_id < 0) return;
+if (ctx_->IsCPU()) return;

num_row_ = static_cast<size_t>(p_fmat->Info().num_row_);

@ -60,7 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
return;
}

-dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
// The begin and end indices for the section of each column associated with
// this device
std::vector<std::pair<bst_uint, bst_uint>> column_segments;
@ -133,7 +133,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
++group_idx) {
// Get gradient
auto grad = GradientPair(0, 0);
-if (ctx_->gpu_id >= 0) {
+if (ctx_->IsCUDA()) {
grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group);
}
auto dbias = static_cast<float>(
@ -142,7 +142,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
model->Bias()[group_idx] += dbias;

// Update residual
-if (ctx_->gpu_id >= 0) {
+if (ctx_->IsCUDA()) {
UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group);
}
}
@ -153,7 +153,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
bst_float &w = (*model)[fidx][group_idx];
// Get gradient
auto grad = GradientPair(0, 0);
-if (ctx_->gpu_id >= 0) {
+if (ctx_->IsCUDA()) {
grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx);
}
auto dw = static_cast<float>(tparam_.learning_rate *
@ -162,14 +162,14 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT
tparam_.reg_lambda_denorm));
w += dw;

-if (ctx_->gpu_id >= 0) {
+if (ctx_->IsCUDA()) {
UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx);
}
}

// This needs to be public because of the __device__ lambda.
GradientPair GetBiasGradient(int group_idx, int num_group) {
-dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto counting = thrust::make_counting_iterator(0ull);
auto f = [=] __device__(size_t idx) {
return idx * num_group + group_idx;
@ -193,7 +193,7 @@ class GPUCoordinateUpdater : public LinearUpdater {  // NOLINT

// This needs to be public because of the __device__ lambda.
GradientPair GetGradient(int group_idx, int num_group, int fidx) {
-dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
common::Span<xgboost::Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]);
size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx];
common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_);
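The updater hunks above repeat one small idiom: sign tests on gpu_id become named predicates, and raw CUDA calls receive the ordinal. Below is a compressed, hypothetical before-and-after of that idiom; the function name is invented and the CUDA call is left as a comment rather than compiled.

// --- illustration only, not part of the diff ---
#include <xgboost/context.h>

void DeviceGuardSketch(xgboost::Context const* ctx) {
  // Before: if (ctx->gpu_id >= 0) { dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); }
  if (ctx->IsCUDA()) {
    // cudaSetDevice still takes a plain integer, so the ordinal is used:
    // dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
  }
}
// --- end illustration ---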
@ -23,8 +23,7 @@
#include "xgboost/linalg.h"
#include "xgboost/metric.h"

-namespace xgboost {
-namespace metric {
+namespace xgboost::metric {
// tag the this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc);
/**
@ -257,10 +256,10 @@ template <typename Curve>
class EvalAUC : public MetricNoCache {
double Eval(const HostDeviceVector<bst_float> &preds, const MetaInfo &info) override {
double auc {0};
-if (ctx_->gpu_id != Context::kCpuId) {
-preds.SetDevice(ctx_->gpu_id);
-info.labels.SetDevice(ctx_->gpu_id);
-info.weights_.SetDevice(ctx_->gpu_id);
+if (ctx_->Device().IsCUDA()) {
+preds.SetDevice(ctx_->Device());
+info.labels.SetDevice(ctx_->Device());
+info.weights_.SetDevice(ctx_->Device());
}
// We use the global size to handle empty dataset.
std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
std::tie(auc, valid_groups) =
RankingAUC<true>(ctx_, predts.ConstHostVector(), info, n_threads);
} else {
@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
double auc{0};
auto n_threads = ctx_->Threads();
CHECK_NE(n_classes, 0);
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC);
} else {
auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes);
@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double fp, tp, auc;
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(),
info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
}

public:
-char const* Name() const override {
+[[nodiscard]] char const* Name() const override {
return "auc";
}
};
@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
std::tuple<double, double, double>
EvalBinary(HostDeviceVector<float> const &predts, MetaInfo const &info) {
double pr, re, auc;
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
std::tie(pr, re, auc) =
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {

double EvalMultiClass(HostDeviceVector<float> const &predts, MetaInfo const &info,
size_t n_classes) {
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
auto n_threads = this->ctx_->Threads();
return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC);
} else {
@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
double auc{0};
uint32_t valid_groups = 0;
auto n_threads = ctx_->Threads();
-if (ctx_->gpu_id == Context::kCpuId) {
+if (ctx_->IsCPU()) {
auto labels = info.labels.Data()->ConstHostSpan();
if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) {
InvalidLabels();
@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
}

public:
-const char *Name() const override { return "aucpr"; }
+[[nodiscard]] const char *Name() const override { return "aucpr"; }
};

XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
@ -473,5 +472,4 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *, common::Span<f
return {};
}
#endif
-}  // namespace metric
-}  // namespace xgboost
+}  // namespace xgboost::metric
@ -824,7 +824,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
-dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
+dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
if (predts.empty()) {
return std::make_pair(0.0, static_cast<uint32_t>(0));
}
@ -127,24 +127,24 @@ class MultiClassMetricsReduction {

#endif  // XGBOOST_USE_CUDA

-PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class,
+PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class,
const HostDeviceVector<bst_float>& weights,
const HostDeviceVector<bst_float>& labels,
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;

-if (device < 0) {
+if (device.IsCPU()) {
result =
-CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads());
+CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA)
else {  // NOLINT
-device_ = tparam.gpu_id;
+device_ = ctx.Device();
preds.SetDevice(device_);
labels.SetDevice(device_);
weights.SetDevice(device_);

-dh::safe_cuda(cudaSetDevice(device_));
+dh::safe_cuda(cudaSetDevice(device_.ordinal));
result = DeviceReduceMetrics(weights, labels, preds, n_class);
}
#endif  // defined(XGBOOST_USE_CUDA)
@ -154,7 +154,7 @@ class MultiClassMetricsReduction {
private:
#if defined(XGBOOST_USE_CUDA)
dh::PinnedMemory label_error_;
-int device_{-1};
+DeviceOrd device_{DeviceOrd::CPU()};
#endif  // defined(XGBOOST_USE_CUDA)
};

@ -176,7 +176,7 @@ struct EvalMClassBase : public MetricNoCache {
CHECK_GE(nclass, 1U)
<< "mlogloss and merror are only used for multi-class classification,"
<< " use logloss for binary classification";
-int device = ctx_->gpu_id;
+auto device = ctx_->Device();
auto result =
reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds);
dat[0] = result.Residue();
@ -35,7 +35,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
auto d_gptr = p_cache->DataGroupPtr(ctx);
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);

-predt.SetDevice(ctx->gpu_id);
+predt.SetDevice(ctx->Device());
auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan());
auto topk = p_cache->Param().TopK();
auto d_weight = common::MakeOptionalWeights(ctx, info.weights_);
@ -90,7 +90,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info,
CHECK_EQ(d_weight.weights.size(), p_cache->Groups());
}
auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0);
-predt.SetDevice(ctx->gpu_id);
+predt.SetDevice(ctx->Device());
auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size());

auto d_group_ptr = p_cache->DataGroupPtr(ctx);
@ -130,18 +130,18 @@ class ElementWiseSurvivalMetricsReduction {
const HostDeviceVector<bst_float>& preds) {
PackedReduceResult result;

-if (ctx.gpu_id < 0) {
+if (ctx.IsCPU()) {
result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound,
preds, ctx.Threads());
}
#if defined(XGBOOST_USE_CUDA)
else {  // NOLINT
-preds.SetDevice(ctx.gpu_id);
-labels_lower_bound.SetDevice(ctx.gpu_id);
-labels_upper_bound.SetDevice(ctx.gpu_id);
-weights.SetDevice(ctx.gpu_id);
+preds.SetDevice(ctx.Device());
+labels_lower_bound.SetDevice(ctx.Device());
+labels_upper_bound.SetDevice(ctx.Device());
+weights.SetDevice(ctx.Device());

-dh::safe_cuda(cudaSetDevice(ctx.gpu_id));
+dh::safe_cuda(cudaSetDevice(ctx.Ordinal()));
result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds);
}
#endif  // defined(XGBOOST_USE_CUDA)
@ -100,7 +100,7 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector<bst_node_t> cons
detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate,
predt, alpha, p_tree);
} else {
-position.SetDevice(ctx->gpu_id);
+position.SetDevice(ctx->Device());
detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate,
predt, alpha, p_tree);
}
@ -42,7 +42,7 @@ class AFTObj : public ObjFunction {

template <typename Distribution>
void GetGradientImpl(const HostDeviceVector<bst_float>& preds, const MetaInfo& info,
-linalg::Matrix<GradientPair>* out_gpair, size_t ndata, int device,
+linalg::Matrix<GradientPair>* out_gpair, size_t ndata, DeviceOrd device,
bool is_null_weight, float aft_loss_distribution_scale) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
@ -75,7 +75,7 @@ class AFTObj : public ObjFunction {
CHECK_EQ(info.labels_upper_bound_.Size(), ndata);
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(ndata, 1);
-const int device = ctx_->gpu_id;
+const auto device = ctx_->Device();
const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale;
const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) {
@ -108,7 +108,7 @@ class AFTObj : public ObjFunction {
_preds[_idx] = exp(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
-io_preds->DeviceIdx())
+io_preds->Device())
.Eval(io_preds);
}

@ -1,5 +1,5 @@
-/*!
- * Copyright 2018-2022 by XGBoost Contributors
+/**
+ * Copyright 2018-2023, XGBoost Contributors
* \file hinge.cc
* \brief Provides an implementation of the hinge loss function
* \author Henry Gouk
@ -13,8 +13,7 @@
#include "../common/transform.h"
#include "../common/common.h"

-namespace xgboost {
-namespace obj {
+namespace xgboost::obj {

#if defined(XGBOOST_USE_CUDA)
DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu);
@ -63,7 +62,7 @@ class HingeObj : public ObjFunction {
_out_gpair[_idx] = GradientPair(g, h);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(),
-ctx_->gpu_id).Eval(
+ctx_->Device()).Eval(
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
}

@ -73,11 +72,11 @@ class HingeObj : public ObjFunction {
_preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0;
},
common::Range{0, static_cast<int64_t>(io_preds->Size()), 1}, this->ctx_->Threads(),
-io_preds->DeviceIdx())
+io_preds->Device())
.Eval(io_preds);
}

-const char* DefaultEvalMetric() const override {
+[[nodiscard]] const char* DefaultEvalMetric() const override {
return "error";
}

@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge")
.describe("Hinge loss. Expects labels to be in [0,1f]")
.set_body([]() { return new HingeObj(); });

-}  // namespace obj
-}  // namespace xgboost
+}  // namespace xgboost::obj
@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector<float>* b
CheckInitInputs(info);
}
// Avoid altering any state in child objective.
-HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id);
-linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->gpu_id);
+HostDeviceVector<float> dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device());
+linalg::Matrix<GradientPair> gpair(info.labels.Shape(), this->ctx_->Device());

Json config{Object{}};
this->SaveConfig(&config);
@ -103,10 +103,10 @@ class LambdaRankObj : public FitIntercept {

// Update position biased for unbiased click data
void UpdatePositionBias() {
-li_full_.SetDevice(ctx_->gpu_id);
-lj_full_.SetDevice(ctx_->gpu_id);
-li_.SetDevice(ctx_->gpu_id);
-lj_.SetDevice(ctx_->gpu_id);
+li_full_.SetDevice(ctx_->Device());
+lj_full_.SetDevice(ctx_->Device());
+li_.SetDevice(ctx_->Device());
+lj_.SetDevice(ctx_->Device());

if (ctx_->IsCPU()) {
cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()),
@ -290,12 +290,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector<float> const
linalg::VectorView<double> li, linalg::VectorView<double> lj,
linalg::Matrix<GradientPair>* out_gpair) {
// boilerplate
-std::int32_t device_id = ctx->gpu_id;
-dh::safe_cuda(cudaSetDevice(device_id));
+auto device = ctx->Device();
+dh::safe_cuda(cudaSetDevice(device.ordinal));
auto n_groups = p_cache->Groups();

-info.labels.SetDevice(device_id);
-preds.SetDevice(device_id);
+info.labels.SetDevice(device);
+preds.SetDevice(device);
out_gpair->SetDevice(ctx->Device());
out_gpair->Reshape(preds.Size(), 1);

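Across the metric and objective hunks the recurring move is SetDevice(ctx->Device()) on HostDeviceVector and linalg containers, followed by picking host or device spans off IsCPU(). Below is a hedged usage sketch of that pattern; the function and the weights argument are invented for illustration, and it assumes only the methods already visible in the diff.

// --- illustration only, not part of the diff ---
#include <xgboost/context.h>
#include <xgboost/host_device_vector.h>

void PlaceWeightsSketch(xgboost::Context const* ctx, xgboost::HostDeviceVector<float>* weights) {
  // Attach the container to whatever device the context names; a CPU
  // DeviceOrd simply keeps the data on the host.
  weights->SetDevice(ctx->Device());
  auto span = ctx->IsCPU() ? weights->ConstHostSpan() : weights->ConstDeviceSpan();
  (void)span;  // the span would feed the CPU or CUDA kernel respectively
}
// --- end illustration ---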
@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(preds.Size() / nclass);

-auto device = ctx_->gpu_id;
+auto device = ctx_->Device();
out_gpair->SetDevice(device);
info.labels.SetDevice(device);
info.weights_.SetDevice(device);
@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction {
const int nclass = param_.num_class;
const auto ndata = static_cast<int64_t>(io_preds->Size() / nclass);

-auto device = io_preds->DeviceIdx();
+auto device = io_preds->Device();
if (prob) {
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
@ -70,15 +70,15 @@ class QuantileRegression : public ObjFunction {
out_gpair->Reshape(info.num_row_, n_targets);
auto gpair = out_gpair->View(ctx_->Device());

-info.weights_.SetDevice(ctx_->gpu_id);
+info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};

-preds.SetDevice(ctx_->gpu_id);
+preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeVec(&preds);
auto n_samples = info.num_row_;

-alpha_.SetDevice(ctx_->gpu_id);
+alpha_.SetDevice(ctx_->Device());
auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();

linalg::ElementWiseKernel(
@ -103,7 +103,7 @@ class QuantileRegression : public ObjFunction {
CHECK(!alpha_.Empty());

auto n_targets = this->Targets(info);
-base_score->SetDevice(ctx_->gpu_id);
+base_score->SetDevice(ctx_->Device());
base_score->Reshape(n_targets);

double sw{0};
@ -129,7 +129,7 @@ class QuantileRegression : public ObjFunction {
}
} else {
#if defined(XGBOOST_USE_CUDA)
-alpha_.SetDevice(ctx_->gpu_id);
+alpha_.SetDevice(ctx_->Device());
auto d_alpha = alpha_.ConstDeviceSpan();
auto d_labels = info.labels.View(ctx_->Device());
auto seg_it = dh::MakeTransformIterator<std::size_t>(
@ -148,7 +148,7 @@ class QuantileRegression : public ObjFunction {
val_it + n, base_score->Data());
sw = info.num_row_;
} else {
-info.weights_.SetDevice(ctx_->gpu_id);
+info.weights_.SetDevice(ctx_->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
auto weight_it = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) {
@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept {

size_t const ndata = preds.Size();
out_gpair->SetDevice(ctx_->Device());
||||||
auto device = ctx_->gpu_id;
|
auto device = ctx_->Device();
|
||||||
|
|
||||||
bool is_null_weight = info.weights_.Size() == 0;
|
bool is_null_weight = info.weights_.Size() == 0;
|
||||||
auto scale_pos_weight = param_.scale_pos_weight;
|
auto scale_pos_weight = param_.scale_pos_weight;
|
||||||
@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept {
|
|||||||
additional_input_.HostVector().begin()[1] = is_null_weight;
|
additional_input_.HostVector().begin()[1] = is_null_weight;
|
||||||
|
|
||||||
const size_t nthreads = ctx_->Threads();
|
const size_t nthreads = ctx_->Threads();
|
||||||
bool on_device = device >= 0;
|
bool on_device = device.IsCUDA();
|
||||||
// On CPU we run the transformation each thread processing a contigious block of data
|
// On CPU we run the transformation each thread processing a contigious block of data
|
||||||
// for better performance.
|
// for better performance.
|
||||||
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
|
const size_t n_data_blocks = std::max(static_cast<size_t>(1), (on_device ? ndata : nthreads));
|
||||||
@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept {
|
|||||||
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
|
_preds[_idx] = Loss::PredTransform(_preds[_idx]);
|
||||||
},
|
},
|
||||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||||
io_preds->DeviceIdx())
|
io_preds->Device())
|
||||||
.Eval(io_preds);
|
.Eval(io_preds);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -246,14 +246,14 @@ class PseudoHuberRegression : public FitIntercept {
|
|||||||
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
|
CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0.";
|
||||||
auto labels = info.labels.View(ctx_->Device());
|
auto labels = info.labels.View(ctx_->Device());
|
||||||
|
|
||||||
out_gpair->SetDevice(ctx_->gpu_id);
|
out_gpair->SetDevice(ctx_->Device());
|
||||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||||
auto gpair = out_gpair->View(ctx_->Device());
|
auto gpair = out_gpair->View(ctx_->Device());
|
||||||
|
|
||||||
preds.SetDevice(ctx_->gpu_id);
|
preds.SetDevice(ctx_->Device());
|
||||||
auto predt = linalg::MakeVec(&preds);
|
auto predt = linalg::MakeVec(&preds);
|
||||||
|
|
||||||
info.weights_.SetDevice(ctx_->gpu_id);
|
info.weights_.SetDevice(ctx_->Device());
|
||||||
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||||
: info.weights_.ConstDeviceSpan()};
|
: info.weights_.ConstDeviceSpan()};
|
||||||
|
|
||||||
@ -327,7 +327,7 @@ class PoissonRegression : public FitIntercept {
|
|||||||
size_t const ndata = preds.Size();
|
size_t const ndata = preds.Size();
|
||||||
out_gpair->SetDevice(ctx_->Device());
|
out_gpair->SetDevice(ctx_->Device());
|
||||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||||
auto device = ctx_->gpu_id;
|
auto device = ctx_->Device();
|
||||||
label_correct_.Resize(1);
|
label_correct_.Resize(1);
|
||||||
label_correct_.Fill(1);
|
label_correct_.Fill(1);
|
||||||
|
|
||||||
@ -369,7 +369,7 @@ class PoissonRegression : public FitIntercept {
|
|||||||
_preds[_idx] = expf(_preds[_idx]);
|
_preds[_idx] = expf(_preds[_idx]);
|
||||||
},
|
},
|
||||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||||
io_preds->DeviceIdx())
|
io_preds->Device())
|
||||||
.Eval(io_preds);
|
.Eval(io_preds);
|
||||||
}
|
}
|
||||||
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||||
@ -512,7 +512,7 @@ class GammaRegression : public FitIntercept {
|
|||||||
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
|
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
|
||||||
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
|
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
|
||||||
const size_t ndata = preds.Size();
|
const size_t ndata = preds.Size();
|
||||||
auto device = ctx_->gpu_id;
|
auto device = ctx_->Device();
|
||||||
out_gpair->SetDevice(ctx_->Device());
|
out_gpair->SetDevice(ctx_->Device());
|
||||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||||
label_correct_.Resize(1);
|
label_correct_.Resize(1);
|
||||||
@ -555,7 +555,7 @@ class GammaRegression : public FitIntercept {
|
|||||||
_preds[_idx] = expf(_preds[_idx]);
|
_preds[_idx] = expf(_preds[_idx]);
|
||||||
},
|
},
|
||||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||||
io_preds->DeviceIdx())
|
io_preds->Device())
|
||||||
.Eval(io_preds);
|
.Eval(io_preds);
|
||||||
}
|
}
|
||||||
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
|
||||||
@ -613,7 +613,7 @@ class TweedieRegression : public FitIntercept {
|
|||||||
out_gpair->SetDevice(ctx_->Device());
|
out_gpair->SetDevice(ctx_->Device());
|
||||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||||
|
|
||||||
auto device = ctx_->gpu_id;
|
auto device = ctx_->Device();
|
||||||
label_correct_.Resize(1);
|
label_correct_.Resize(1);
|
||||||
label_correct_.Fill(1);
|
label_correct_.Fill(1);
|
||||||
|
|
||||||
@ -660,7 +660,7 @@ class TweedieRegression : public FitIntercept {
|
|||||||
_preds[_idx] = expf(_preds[_idx]);
|
_preds[_idx] = expf(_preds[_idx]);
|
||||||
},
|
},
|
||||||
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
|
||||||
io_preds->DeviceIdx())
|
io_preds->Device())
|
||||||
.Eval(io_preds);
|
.Eval(io_preds);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -711,9 +711,9 @@ class MeanAbsoluteError : public ObjFunction {
|
|||||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||||
auto gpair = out_gpair->View(ctx_->Device());
|
auto gpair = out_gpair->View(ctx_->Device());
|
||||||
|
|
||||||
preds.SetDevice(ctx_->gpu_id);
|
preds.SetDevice(ctx_->Device());
|
||||||
auto predt = linalg::MakeVec(&preds);
|
auto predt = linalg::MakeVec(&preds);
|
||||||
info.weights_.SetDevice(ctx_->gpu_id);
|
info.weights_.SetDevice(ctx_->Device());
|
||||||
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
|
||||||
: info.weights_.ConstDeviceSpan()};
|
: info.weights_.ConstDeviceSpan()};
|
||||||
|
|
||||||
|
|||||||
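All of the objective hunks above follow the same migration: buffers are routed with SetDevice(ctx_->Device()) and the host or device span is then chosen from the typed context instead of a signed gpu_id. A minimal sketch of that pattern, assuming a hypothetical GetWeightSpan helper that is not part of this commit:

// Sketch only: GetWeightSpan is a hypothetical helper illustrating the pattern,
// not code from this commit.
#include "xgboost/context.h"             // Context, DeviceOrd
#include "xgboost/host_device_vector.h"  // HostDeviceVector
#include "xgboost/span.h"                // common::Span

namespace xgboost {
inline common::Span<float const> GetWeightSpan(Context const* ctx,
                                               HostDeviceVector<float>* weights) {
  // Route the buffer to the typed device instead of a raw gpu_id ordinal.
  weights->SetDevice(ctx->Device());
  // CPU contexts read the host copy; CUDA contexts read the device copy.
  return ctx->IsCPU() ? weights->ConstHostSpan() : weights->ConstDeviceSpan();
}
}  // namespace xgboost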
@ -180,33 +180,30 @@ struct DeviceAdapterLoader {

XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) :
size_t entry_start, float missing)
batch{batch},
columns{num_features},
use_shared{use_shared},
is_valid{missing} {
: batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
}
}
__syncthreads();
}
}

XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * columns + fidx];
}
@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT
int num_group;

void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id));
dh::safe_cuda(cudaSetDevice(device.ordinal));

// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;

@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum);
}

nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {

@ -369,12 +366,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
}

tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());

// Initialize categorical splits.
split_types.SetDevice(gpu_id);
split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {

@ -383,8 +380,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
}

categories = HostDeviceVector<uint32_t>({}, gpu_id);
categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {

@ -397,7 +394,7 @@ class DeviceModel {
}

categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id);
h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {

@ -485,8 +482,8 @@ struct PathInfo {
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(gpu_id));
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model;

dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
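DeviceModel::Init and ExtractPaths now accept a DeviceOrd and only unwrap the raw integer at the CUDA runtime boundary. A hedged sketch of that convention, using a hypothetical SwitchToDevice helper that is not added by this commit:

// Hypothetical helper, shown only to illustrate the DeviceOrd convention.
#include <cuda_runtime_api.h>

#include "xgboost/context.h"  // DeviceOrd
#include "xgboost/logging.h"  // LOG

namespace xgboost {
inline void SwitchToDevice(DeviceOrd device) {
  if (!device.IsCUDA()) {
    return;  // CPU ordinals never reach the CUDA runtime.
  }
  // The integer ordinal is only extracted where the CUDA API requires it.
  auto status = cudaSetDevice(device.ordinal);
  if (status != cudaSuccess) {
    LOG(FATAL) << "cudaSetDevice failed for " << device.Name();
  }
}
}  // namespace xgboost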
@ -773,12 +770,12 @@ class ColumnSplitHelper {
template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};

auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0;

@ -791,8 +788,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)};

batch.offset.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);

@ -823,9 +820,9 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
}

void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,

@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features,
HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;

@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) {
return;
}
out_preds->SetDevice(ctx_->gpu_id);
out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info();
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
d_model.Init(model, tree_begin, tree_end, ctx_->Device());

if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);

@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor {
} else {
size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model,
out_preds,
batch_offset);

@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}

~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
}
}

void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id;
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
auto* out_preds = &predts->predictions;
if (tree_end == 0) {
tree_end = model.trees.size();

@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
<< "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->DeviceIdx();
<< "but data is on: " << m->Device().Name();
if (p_m) {
p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);

@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
}
out_preds->predictions.SetDevice(m->DeviceIdx());
out_preds->predictions.SetDevice(m->Device());

const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));

auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
d_model.Init(model, tree_begin, tree_end, m->Device());

bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0;

@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor {
}
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id);
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}

@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;

@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();

auto base_score = model.learner_model_param->BaseScore(ctx_);

@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->gpu_id);
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}

@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;

@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();

auto base_score = model.learner_model_param->BaseScore(ctx_);

@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());

const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->gpu_id);
predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
d_model.Init(model, 0, tree_end, this->ctx_->Device());

if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return;
}

auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes);

@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor {

if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature};

@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size();
auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));

@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor {

private:
/*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) {
static size_t ConfigureDevice(DeviceOrd device) {
if (device >= 0) {
if (device.IsCUDA()) {
return dh::MaxSharedMemory(device);
return dh::MaxSharedMemory(device.ordinal);
}
return 0;
}
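For reference, the typed ordinal that replaces gpu_id throughout the predictor can be built and inspected directly. A small hedged sketch of that vocabulary (illustrative only; the xgboost/context.h header path is assumed):

// Hedged sketch of the DeviceOrd vocabulary relied on by the hunks above.
#include <iostream>
#include "xgboost/context.h"  // DeviceOrd

int main() {
  auto cpu = xgboost::DeviceOrd::CPU();
  auto gpu = xgboost::DeviceOrd::CUDA(0);  // CUDA device 0
  std::cout << cpu.IsCPU() << " " << gpu.IsCUDA() << "\n";  // prints: 1 1
  std::cout << gpu.ordinal << "\n";                         // prints: 0
  std::cout << (cpu == gpu) << "\n";                        // prints: 0
  return 0;
}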
@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};

const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) {
if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->gpu_id);
out_preds->SetDevice(ctx_->Device());
}
if (!base_margin->Empty()) {
out_preds->Resize(n);
@ -19,8 +19,7 @@
#include "xgboost/linalg.h" // TensorView, Tensor, Constant
#include "xgboost/logging.h" // CHECK_EQ

namespace xgboost {
namespace xgboost::tree {
namespace tree {
namespace cpu_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair,

@ -68,7 +67,7 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView<Gradien

void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientPair> const& gpair,
bst_target_t n_targets, linalg::Vector<float>* out) {
out->SetDevice(ctx->gpu_id);
out->SetDevice(ctx->Device());
out->Reshape(n_targets);

gpair.SetDevice(ctx->Device());

@ -76,5 +75,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix<GradientP
ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView())
: cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device()));
}
} // namespace tree
} // namespace xgboost::tree
} // namespace xgboost

@ -21,9 +21,7 @@
#include "xgboost/logging.h" // CHECK_EQ
#include "xgboost/span.h" // span

namespace xgboost {
namespace xgboost::tree::cuda_impl {
namespace tree {
namespace cuda_impl {
void FitStump(Context const* ctx, MetaInfo const& info,
linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
auto n_targets = out.Size();

@ -50,7 +48,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));

collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast<double*>(d_sum.Values().data()),
collective::GlobalSum(info, ctx->Device(), reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2);

thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,

@ -59,6 +57,4 @@ void FitStump(Context const* ctx, MetaInfo const& info,
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
});
}
} // namespace cuda_impl
} // namespace xgboost::tree::cuda_impl
} // namespace tree
} // namespace xgboost
@ -413,7 +413,7 @@ void GPUHistEvaluator::EvaluateSplits(
auto const world_size = collective::GetWorldSize();
dh::TemporaryArray<DeviceSplitCandidate> all_candidate_storage(out_splits.size() * world_size);
auto all_candidates = dh::ToSpan(all_candidate_storage);
collective::AllGather(device_, out_splits.data(), all_candidates.data(),
collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(),
out_splits.size() * sizeof(DeviceSplitCandidate));

// Reduce to get the best candidate from all workers.

@ -85,7 +85,7 @@ class GPUHistEvaluator {
std::size_t node_categorical_storage_size_ = 0;
// Is the data split column-wise?
bool is_column_split_ = false;
int32_t device_;
DeviceOrd device_;

// Copy the categories from device to host asynchronously.
void CopyToHost( const std::vector<bst_node_t>& nidx);

@ -133,14 +133,14 @@ class GPUHistEvaluator {
}

public:
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, int32_t device)
GPUHistEvaluator(TrainParam const &param, bst_feature_t n_features, DeviceOrd device)
: tree_evaluator_{param, n_features, device}, param_{param} {}
/**
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, bool is_column_split,
int32_t device);
DeviceOrd device);

/**
* \brief Get host category storage for nidx. Different from the internal version, this

@ -1,5 +1,5 @@
/*!
/**
* Copyright 2022 by XGBoost Contributors
* Copyright 2022-2023 by XGBoost Contributors
*
* \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc
* compilation time.

@ -12,11 +12,10 @@
#include "evaluate_splits.cuh"
#include "xgboost/data.h"

namespace xgboost {
namespace xgboost::tree {
namespace tree {
void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
bool is_column_split, int32_t device) {
bool is_column_split, DeviceOrd device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();

@ -127,6 +126,4 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
});
return dh::ToSpan(cat_sorted_idx_);
}
} // namespace xgboost::tree
} // namespace tree
} // namespace xgboost
@ -1,5 +1,5 @@
/*!
/**
* Copyright 2020 by XGBoost Contributors
* Copyright 2020-2023 by XGBoost Contributors
*/
#ifndef FEATURE_GROUPS_CUH_
#define FEATURE_GROUPS_CUH_

@ -102,11 +102,10 @@ struct FeatureGroups {
InitSingle(cuts);
}

FeatureGroupsAccessor DeviceAccessor(int device) const {
[[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const {
feature_segments.SetDevice(device);
bin_segments.SetDevice(device);
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(),
return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins};
max_group_bins};
}

private:
@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
auto page = batch.Impl();
if (!page_) {
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
page_ = std::make_unique<EllpackPageImpl>(ctx->Device(), page->Cuts(), page->is_dense,
page->row_stride, dmat->Info().num_row_);
}
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
size_t num_elements = page_->Copy(ctx->Device(), page, offset);
offset += num_elements;
}
page_concatenated_ = true;

@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows));

// Compact the ELLPACK pages into the single sample page.
thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
}

return {sample_rows, page_.get(), dh::ToSpan(gpair_)};

@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
auto first_page = (*batch_iterator.begin()).Impl();
// Create a new ELLPACK page with empty rows.
page_.reset(); // Release the device memory first before reallocating
page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense,
page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense,
first_page->row_stride, sample_rows));

// Compact the ELLPACK pages into the single sample page.
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0);
for (auto& batch : batch_iterator) {
page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_));
page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_));
}

return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
@ -13,15 +13,15 @@
namespace xgboost {
namespace tree {

RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) {
dh::safe_cuda(cudaSetDevice(device_idx_));
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
}

RowPartitioner::~RowPartitioner() {
dh::safe_cuda(cudaSetDevice(device_idx_));
dh::safe_cuda(cudaSetDevice(device_idx_.ordinal));
}

common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {

@ -199,7 +199,7 @@ class RowPartitioner {
static constexpr bst_node_t kIgnoredTreePosition = -1;

private:
int device_idx_;
DeviceOrd device_idx_;
/*! \brief In here if you want to find the rows belong to a node nid, first you need to
* get the indices segment from ridx_segments[nid], then get the row index that
* represents position of row in input data X. `RowPartitioner::GetRows` would be a

@ -223,7 +223,7 @@ class RowPartitioner {
dh::PinnedMemory pinned2_;

public:
RowPartitioner(int device_idx, size_t num_rows);
RowPartitioner(DeviceOrd device_idx, size_t num_rows);
~RowPartitioner();
RowPartitioner(const RowPartitioner&) = delete;
RowPartitioner& operator=(const RowPartitioner&) = delete;
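RowPartitioner construction now names the device through DeviceOrd as well. A hedged call-site sketch (MakePartitioner is illustrative, and the include path is an assumption rather than something shown in this excerpt):

// Hedged call-site sketch; the row_partitioner header path is assumed.
#include <cstddef>
#include "xgboost/context.h"                          // Context, DeviceOrd
#include "../../src/tree/gpu_hist/row_partitioner.cuh"  // assumed path

void MakePartitioner(xgboost::Context const* ctx, std::size_t n_rows) {
  // Before this patch the first argument was the integer ctx->gpu_id.
  xgboost::tree::RowPartitioner partitioner{ctx->Device(), n_rows};
}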
@ -477,7 +477,7 @@ class HistEvaluator {
: ctx_{ctx},
param_{param},
column_sampler_{std::move(sampler)},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), Context::kCpuId},
tree_evaluator_{*param, static_cast<bst_feature_t>(info.num_col_), DeviceOrd::CPU()},
is_col_split_{info.IsColumnSplit()} {
interaction_constraints_.Configure(*param, info.num_col_);
column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(),

@ -696,7 +696,7 @@ class HistMultiEvaluator {
stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets);
gain_.resize(1);

linalg::Vector<float> weight({n_targets}, ctx_->gpu_id);
linalg::Vector<float> weight({n_targets}, ctx_->Device());
CalcWeight(*param_, root_sum, weight.HostView());
auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView());
gain_.front() = root_gain;
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/**
|
||||||
* Copyright 2018-2020 by Contributors
|
* Copyright 2018-2023 by Contributors
|
||||||
* \file split_evaluator.h
|
* \file split_evaluator.h
|
||||||
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
|
* \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation.
|
||||||
* \author Henry Gouk
|
* \author Henry Gouk
|
||||||
@ -23,8 +23,7 @@
|
|||||||
#include "xgboost/host_device_vector.h"
|
#include "xgboost/host_device_vector.h"
|
||||||
#include "xgboost/tree_model.h"
|
#include "xgboost/tree_model.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost::tree {
|
||||||
namespace tree {
|
|
||||||
class TreeEvaluator {
|
class TreeEvaluator {
|
||||||
// hist and exact use parent id to calculate constraints.
|
// hist and exact use parent id to calculate constraints.
|
||||||
static constexpr bst_node_t kRootParentId =
|
static constexpr bst_node_t kRootParentId =
|
||||||
@ -33,13 +32,13 @@ class TreeEvaluator {
|
|||||||
HostDeviceVector<float> lower_bounds_;
|
HostDeviceVector<float> lower_bounds_;
|
||||||
HostDeviceVector<float> upper_bounds_;
|
HostDeviceVector<float> upper_bounds_;
|
||||||
HostDeviceVector<int32_t> monotone_;
|
HostDeviceVector<int32_t> monotone_;
|
||||||
int32_t device_;
|
DeviceOrd device_;
|
||||||
bool has_constraint_;
|
bool has_constraint_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) {
|
TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) {
|
||||||
device_ = device;
|
device_ = device;
|
||||||
if (device != Context::kCpuId) {
|
if (device.IsCUDA()) {
|
||||||
lower_bounds_.SetDevice(device);
|
lower_bounds_.SetDevice(device);
|
||||||
upper_bounds_.SetDevice(device);
|
upper_bounds_.SetDevice(device);
|
||||||
monotone_.SetDevice(device);
|
monotone_.SetDevice(device);
|
||||||
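Device checks move from ordinal comparisons to the DeviceOrd predicates (IsCPU/IsCUDA). A minimal sketch of the pattern, not taken from the patch itself:

    DeviceOrd device = DeviceOrd::CUDA(0);
    HostDeviceVector<float> bounds;
    if (device.IsCUDA()) {        // replaces: if (device != Context::kCpuId)
      bounds.SetDevice(device);   // SetDevice takes the DeviceOrd directly
    }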
@ -59,7 +58,7 @@ class TreeEvaluator {
|
|||||||
has_constraint_ = true;
|
has_constraint_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (device_ != Context::kCpuId) {
|
if (device_.IsCUDA()) {
|
||||||
// Pull to device early.
|
// Pull to device early.
|
||||||
lower_bounds_.ConstDeviceSpan();
|
lower_bounds_.ConstDeviceSpan();
|
||||||
upper_bounds_.ConstDeviceSpan();
|
upper_bounds_.ConstDeviceSpan();
|
||||||
@ -122,7 +121,7 @@ class TreeEvaluator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fast floating point division instruction on device
|
// Fast floating point division instruction on device
|
||||||
XGBOOST_DEVICE float Divide(float a, float b) const {
|
[[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const {
|
||||||
#ifdef __CUDA_ARCH__
|
#ifdef __CUDA_ARCH__
|
||||||
return __fdividef(a, b);
|
return __fdividef(a, b);
|
||||||
#else
|
#else
|
||||||
@ -154,7 +153,7 @@ class TreeEvaluator {
|
|||||||
public:
|
public:
|
||||||
/* Get a view to the evaluator that can be passed down to device. */
|
/* Get a view to the evaluator that can be passed down to device. */
|
||||||
template <typename ParamT = TrainParam> auto GetEvaluator() const {
|
template <typename ParamT = TrainParam> auto GetEvaluator() const {
|
||||||
if (device_ != Context::kCpuId) {
|
if (device_.IsCUDA()) {
|
||||||
auto constraints = monotone_.ConstDevicePointer();
|
auto constraints = monotone_.ConstDevicePointer();
|
||||||
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
|
return SplitEvaluator<ParamT>{constraints, lower_bounds_.ConstDevicePointer(),
|
||||||
upper_bounds_.ConstDevicePointer(), has_constraint_};
|
upper_bounds_.ConstDevicePointer(), has_constraint_};
|
||||||
@ -215,7 +214,6 @@ enum SplitType {
|
|||||||
// partition-based categorical split
|
// partition-based categorical split
|
||||||
kPart = 2
|
kPart = 2
|
||||||
};
|
};
|
||||||
} // namespace tree
|
} // namespace xgboost::tree
|
||||||
} // namespace xgboost
|
|
||||||
|
|
||||||
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_
|
#endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_
|
||||||
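Besides the device plumbing, split_evaluator.h (and several test files further down) switch to the C++17 nested-namespace form, so the pair of closing braces collapses into one:

    namespace xgboost::tree {
    // declarations ...
    }  // namespace xgboost::tree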
|
|||||||
@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater {
|
|||||||
: param_(param),
|
: param_(param),
|
||||||
colmaker_train_param_{colmaker_train_param},
|
colmaker_train_param_{colmaker_train_param},
|
||||||
ctx_{ctx},
|
ctx_{ctx},
|
||||||
tree_evaluator_(param_, column_densities.size(), Context::kCpuId),
|
tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()),
|
||||||
interaction_constraints_{std::move(_interaction_constraints)},
|
interaction_constraints_{std::move(_interaction_constraints)},
|
||||||
column_densities_(column_densities) {}
|
column_densities_(column_densities) {}
|
||||||
// update one tree, growing
|
// update one tree, growing
|
||||||
|
|||||||
@ -74,7 +74,7 @@ class DeviceHistogramStorage {
|
|||||||
dh::device_vector<typename GradientSumT::ValueT> overflow_;
|
dh::device_vector<typename GradientSumT::ValueT> overflow_;
|
||||||
std::map<int, size_t> overflow_nidx_map_;
|
std::map<int, size_t> overflow_nidx_map_;
|
||||||
int n_bins_;
|
int n_bins_;
|
||||||
int device_id_;
|
DeviceOrd device_id_;
|
||||||
static constexpr size_t kNumItemsInGradientSum =
|
static constexpr size_t kNumItemsInGradientSum =
|
||||||
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
|
sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT);
|
||||||
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
|
static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2.");
|
||||||
@ -82,7 +82,7 @@ class DeviceHistogramStorage {
|
|||||||
public:
|
public:
|
||||||
// Start with about 16 MB
|
// Start with about 16 MB
|
||||||
DeviceHistogramStorage() { data_.reserve(1 << 22); }
|
DeviceHistogramStorage() { data_.reserve(1 << 22); }
|
||||||
void Init(int device_id, int n_bins) {
|
void Init(DeviceOrd device_id, int n_bins) {
|
||||||
this->n_bins_ = n_bins;
|
this->n_bins_ = n_bins;
|
||||||
this->device_id_ = device_id;
|
this->device_id_ = device_id;
|
||||||
}
|
}
|
||||||
@ -196,7 +196,7 @@ struct GPUHistMakerDevice {
|
|||||||
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
|
||||||
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
|
TrainParam _param, std::shared_ptr<common::ColumnSampler> column_sampler,
|
||||||
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
|
uint32_t n_features, BatchParam batch_param, MetaInfo const& info)
|
||||||
: evaluator_{_param, n_features, ctx->gpu_id},
|
: evaluator_{_param, n_features, ctx->Device()},
|
||||||
ctx_(ctx),
|
ctx_(ctx),
|
||||||
feature_types{_feature_types},
|
feature_types{_feature_types},
|
||||||
param(std::move(_param)),
|
param(std::move(_param)),
|
||||||
@ -211,7 +211,7 @@ struct GPUHistMakerDevice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
CHECK(column_sampler_);
|
CHECK(column_sampler_);
|
||||||
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
|
monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name());
|
||||||
}
|
}
|
||||||
|
|
||||||
~GPUHistMakerDevice() = default;
|
~GPUHistMakerDevice() = default;
|
||||||
@ -220,7 +220,7 @@ struct GPUHistMakerDevice {
|
|||||||
if (!feature_groups) {
|
if (!feature_groups) {
|
||||||
CHECK(page);
|
CHECK(page);
|
||||||
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
|
feature_groups = std::make_unique<FeatureGroups>(page->Cuts(), page->is_dense,
|
||||||
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
|
dh::MaxSharedMemoryOptin(ctx_->Ordinal()),
|
||||||
sizeof(GradientPairPrecise));
|
sizeof(GradientPairPrecise));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -231,7 +231,7 @@ struct GPUHistMakerDevice {
|
|||||||
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(),
|
||||||
param.colsample_bynode, param.colsample_bylevel,
|
param.colsample_bynode, param.colsample_bylevel,
|
||||||
param.colsample_bytree);
|
param.colsample_bytree);
|
||||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||||
|
|
||||||
this->interaction_constraints.Reset();
|
this->interaction_constraints.Reset();
|
||||||
|
|
||||||
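Context now exposes the device in two forms and this patch uses them consistently: Device() returns a DeviceOrd for XGBoost containers and accessors, while Ordinal() returns the raw CUDA ordinal for CUDA runtime calls. A hedged fragment mirroring the lines above, where `ctx`, `page`, and `info` stand in for the members used in this struct:

    dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));        // raw integer ordinal
    auto acc = page->GetDeviceAccessor(ctx->Device());   // DeviceOrd
    info.feature_types.SetDevice(ctx->Device());         // DeviceOrd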
@ -246,15 +246,15 @@ struct GPUHistMakerDevice {
|
|||||||
gpair = sample.gpair;
|
gpair = sample.gpair;
|
||||||
|
|
||||||
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
|
||||||
dmat->Info().IsColumnSplit(), ctx_->gpu_id);
|
dmat->Info().IsColumnSplit(), ctx_->Device());
|
||||||
|
|
||||||
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
|
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
|
||||||
|
|
||||||
row_partitioner.reset(); // Release the device memory first before reallocating
|
row_partitioner.reset(); // Release the device memory first before reallocating
|
||||||
row_partitioner = std::make_unique<RowPartitioner>(ctx_->gpu_id, sample.sample_rows);
|
row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
|
||||||
|
|
||||||
// Init histogram
|
// Init histogram
|
||||||
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
|
hist.Init(ctx_->Device(), page->Cuts().TotalBins());
|
||||||
hist.Reset();
|
hist.Reset();
|
||||||
|
|
||||||
this->InitFeatureGroupsOnce();
|
this->InitFeatureGroupsOnce();
|
||||||
@ -267,7 +267,7 @@ struct GPUHistMakerDevice {
|
|||||||
sampled_features->SetDevice(ctx_->Device());
|
sampled_features->SetDevice(ctx_->Device());
|
||||||
common::Span<bst_feature_t> feature_set =
|
common::Span<bst_feature_t> feature_set =
|
||||||
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
|
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
|
||||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
auto matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||||
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
|
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
|
||||||
EvaluateSplitSharedInputs shared_inputs{
|
EvaluateSplitSharedInputs shared_inputs{
|
||||||
gpu_param,
|
gpu_param,
|
||||||
@ -289,7 +289,7 @@ struct GPUHistMakerDevice {
|
|||||||
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
|
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
|
||||||
std::vector<bst_node_t> nidx(2 * candidates.size());
|
std::vector<bst_node_t> nidx(2 * candidates.size());
|
||||||
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
|
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
|
||||||
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
auto matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||||
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
|
EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types,
|
||||||
matrix.feature_segments, matrix.gidx_fvalue_map,
|
matrix.feature_segments, matrix.gidx_fvalue_map,
|
||||||
matrix.min_fvalue,
|
matrix.min_fvalue,
|
||||||
@ -342,9 +342,9 @@ struct GPUHistMakerDevice {
|
|||||||
void BuildHist(int nidx) {
|
void BuildHist(int nidx) {
|
||||||
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
auto d_node_hist = hist.GetNodeHistogram(nidx);
|
||||||
auto d_ridx = row_partitioner->GetRows(nidx);
|
auto d_ridx = row_partitioner->GetRows(nidx);
|
||||||
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id),
|
BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()),
|
||||||
feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist,
|
feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx,
|
||||||
*quantiser);
|
d_node_hist, *quantiser);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Attempt to do subtraction trick
|
// Attempt to do subtraction trick
|
||||||
@ -413,10 +413,10 @@ struct GPUHistMakerDevice {
|
|||||||
});
|
});
|
||||||
|
|
||||||
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
collective::AllReduce<collective::Operation::kBitwiseOR>(
|
||||||
ctx_->gpu_id, decision_storage.data().get(), decision_storage.size());
|
ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size());
|
||||||
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
collective::AllReduce<collective::Operation::kBitwiseAND>(
|
||||||
ctx_->gpu_id, missing_storage.data().get(), missing_storage.size());
|
ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size());
|
||||||
collective::Synchronize(ctx_->gpu_id);
|
collective::Synchronize(ctx_->Ordinal());
|
||||||
|
|
||||||
row_partitioner->UpdatePositionBatch(
|
row_partitioner->UpdatePositionBatch(
|
||||||
nidx, left_nidx, right_nidx, split_data,
|
nidx, left_nidx, right_nidx, split_data,
|
||||||
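The collective layer still takes the integer ordinal in this patch, so call sites unwrap the DeviceOrd with Ordinal(). Sketch mirroring the calls above, with `decision_storage` as a stand-in device vector:

    collective::AllReduce<collective::Operation::kBitwiseOR>(
        ctx->Ordinal(), decision_storage.data().get(), decision_storage.size());
    collective::Synchronize(ctx->Ordinal());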
@ -454,7 +454,7 @@ struct GPUHistMakerDevice {
|
|||||||
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
|
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||||
|
|
||||||
if (info_.IsColumnSplit()) {
|
if (info_.IsColumnSplit()) {
|
||||||
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
|
UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx);
|
||||||
@ -524,9 +524,9 @@ struct GPUHistMakerDevice {
|
|||||||
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
|
common::Span<FeatureType const> d_feature_types, common::Span<uint32_t const> categories,
|
||||||
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
|
common::Span<RegTree::CategoricalSplitMatrix::Segment> categories_segments,
|
||||||
HostDeviceVector<bst_node_t>* p_out_position) {
|
HostDeviceVector<bst_node_t>* p_out_position) {
|
||||||
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
|
auto d_matrix = page->GetDeviceAccessor(ctx_->Device());
|
||||||
auto d_gpair = this->gpair;
|
auto d_gpair = this->gpair;
|
||||||
p_out_position->SetDevice(ctx_->gpu_id);
|
p_out_position->SetDevice(ctx_->Device());
|
||||||
p_out_position->Resize(row_partitioner->GetRows().size());
|
p_out_position->Resize(row_partitioner->GetRows().size());
|
||||||
|
|
||||||
auto new_position_op = [=] __device__(size_t row_id, int position) {
|
auto new_position_op = [=] __device__(size_t row_id, int position) {
|
||||||
@ -613,7 +613,7 @@ struct GPUHistMakerDevice {
|
|||||||
monitor.Start("AllReduce");
|
monitor.Start("AllReduce");
|
||||||
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
|
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
|
||||||
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
|
using ReduceT = typename std::remove_pointer<decltype(d_node_hist)>::type::ValueT;
|
||||||
collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast<ReduceT*>(d_node_hist),
|
collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast<ReduceT*>(d_node_hist),
|
||||||
page->Cuts().TotalBins() * 2 * num_histograms);
|
page->Cuts().TotalBins() * 2 * num_histograms);
|
||||||
|
|
||||||
monitor.Stop("AllReduce");
|
monitor.Stop("AllReduce");
|
||||||
@ -855,7 +855,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
|
void InitDataOnce(TrainParam const* param, DMatrix* dmat) {
|
||||||
CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device";
|
CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device";
|
||||||
info_ = &dmat->Info();
|
info_ = &dmat->Info();
|
||||||
|
|
||||||
// Synchronise the column sampling seed
|
// Synchronise the column sampling seed
|
||||||
@ -864,8 +864,8 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
this->column_sampler_ = std::make_shared<common::ColumnSampler>(column_sampling_seed);
|
||||||
|
|
||||||
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
|
||||||
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
|
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
|
||||||
info_->feature_types.SetDevice(ctx_->gpu_id);
|
info_->feature_types.SetDevice(ctx_->Device());
|
||||||
maker = std::make_unique<GPUHistMakerDevice>(
|
maker = std::make_unique<GPUHistMakerDevice>(
|
||||||
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
|
||||||
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
|
*param, column_sampler_, info_->num_col_, batch_param, dmat->Info());
|
||||||
@ -890,7 +890,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
this->InitData(param, p_fmat, p_tree);
|
this->InitData(param, p_fmat, p_tree);
|
||||||
monitor_.Stop("InitData");
|
monitor_.Stop("InitData");
|
||||||
|
|
||||||
gpair->SetDevice(ctx_->gpu_id);
|
gpair->SetDevice(ctx_->Device());
|
||||||
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1023,7 +1023,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
|
|||||||
this->InitData(p_fmat, p_tree);
|
this->InitData(p_fmat, p_tree);
|
||||||
monitor_.Stop("InitData");
|
monitor_.Stop("InitData");
|
||||||
|
|
||||||
gpair->SetDevice(ctx_->gpu_id);
|
gpair->SetDevice(ctx_->Device());
|
||||||
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater {
|
|||||||
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
|
auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; };
|
||||||
if (need_copy()) {
|
if (need_copy()) {
|
||||||
// allocate buffer
|
// allocate buffer
|
||||||
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF};
|
sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF};
|
||||||
h_sample_out = sample_out.HostView();
|
h_sample_out = sample_out.HostView();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,7 +34,7 @@ void VerifyAllReduceBitwiseAND() {
|
|||||||
auto const rank = collective::GetRank();
|
auto const rank = collective::GetRank();
|
||||||
std::bitset<64> original{};
|
std::bitset<64> original{};
|
||||||
original[rank] = true;
|
original[rank] = true;
|
||||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
|
||||||
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
|
collective::AllReduce<collective::Operation::kBitwiseAND>(rank, buffer.DevicePointer(), 1);
|
||||||
collective::Synchronize(rank);
|
collective::Synchronize(rank);
|
||||||
EXPECT_EQ(buffer.HostVector()[0], 0ULL);
|
EXPECT_EQ(buffer.HostVector()[0], 0ULL);
|
||||||
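Test code that placed a HostDeviceVector on device `rank` now wraps the rank in DeviceOrd::CUDA. A minimal sketch with an arbitrary payload:

    HostDeviceVector<std::uint64_t> buffer({0xFFULL}, DeviceOrd::CUDA(0));
    auto* d_ptr = buffer.DevicePointer();   // resident on device 0
    buffer.SetDevice(DeviceOrd::CPU());     // migrate the data back to the host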
@ -56,7 +56,7 @@ void VerifyAllReduceBitwiseOR() {
|
|||||||
auto const rank = collective::GetRank();
|
auto const rank = collective::GetRank();
|
||||||
std::bitset<64> original{};
|
std::bitset<64> original{};
|
||||||
original[rank] = true;
|
original[rank] = true;
|
||||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
|
||||||
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
|
collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
|
||||||
collective::Synchronize(rank);
|
collective::Synchronize(rank);
|
||||||
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
||||||
@ -78,7 +78,7 @@ void VerifyAllReduceBitwiseXOR() {
|
|||||||
auto const rank = collective::GetRank();
|
auto const rank = collective::GetRank();
|
||||||
std::bitset<64> original{~0ULL};
|
std::bitset<64> original{~0ULL};
|
||||||
original[rank] = false;
|
original[rank] = false;
|
||||||
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
|
HostDeviceVector<uint64_t> buffer({original.to_ullong()}, DeviceOrd::CUDA(rank));
|
||||||
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
|
collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
|
||||||
collective::Synchronize(rank);
|
collective::Synchronize(rank);
|
||||||
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
|
||||||
|
|||||||
@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) {
|
|||||||
|
|
||||||
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
|
EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17));
|
||||||
|
|
||||||
p_mat->Info().Validate(-1);
|
p_mat->Info().Validate(DeviceOrd::CPU());
|
||||||
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
|
EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17),
|
||||||
dmlc::Error);
|
dmlc::Error);
|
||||||
|
|
||||||
@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) {
|
|||||||
void TestSketchFromWeights(bool with_group) {
|
void TestSketchFromWeights(bool with_group) {
|
||||||
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
|
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
|
||||||
size_t constexpr kGroups = 10;
|
size_t constexpr kGroups = 10;
|
||||||
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix();
|
auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
|
||||||
Context ctx;
|
Context ctx;
|
||||||
common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);
|
common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins);
|
||||||
|
|
||||||
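The RandomDataGenerator test helper takes a DeviceOrd now as well. Sketch of generating a device-resident DMatrix with placeholder sizes:

    auto m = RandomDataGenerator{/*rows=*/300, /*cols=*/20, /*sparsity=*/0.0f}
                 .Device(DeviceOrd::CUDA(0))
                 .GenerateDMatrix();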
|
|||||||
@ -208,7 +208,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
|
|||||||
ASSERT_EQ(info.feature_types.Size(), n_features);
|
ASSERT_EQ(info.feature_types.Size(), n_features);
|
||||||
|
|
||||||
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
|
HostDeviceVector<bst_row_t> cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3};
|
||||||
cuts_ptr.SetDevice(0);
|
cuts_ptr.SetDevice(DeviceOrd::CUDA(0));
|
||||||
|
|
||||||
dh::device_vector<float> weight(n_samples * n_features, 0);
|
dh::device_vector<float> weight(n_samples * n_features, 0);
|
||||||
dh::Iota(dh::ToSpan(weight));
|
dh::Iota(dh::ToSpan(weight));
|
||||||
@ -221,7 +221,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) {
|
|||||||
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
|
thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(),
|
||||||
detail::EntryCompareOp());
|
detail::EntryCompareOp());
|
||||||
|
|
||||||
detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries,
|
detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries,
|
||||||
&weight, &columns_ptr);
|
&weight, &columns_ptr);
|
||||||
|
|
||||||
auto const& h_cptr = cuts_ptr.ConstHostVector();
|
auto const& h_cptr = cuts_ptr.ConstHostVector();
|
||||||
@ -363,7 +363,8 @@ template <typename Adapter>
|
|||||||
auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
|
auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) {
|
||||||
common::HistogramCuts batched_cuts;
|
common::HistogramCuts batched_cuts;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0);
|
SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(),
|
||||||
|
DeviceOrd::CUDA(0));
|
||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
|
AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size);
|
||||||
sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
|
sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit());
|
||||||
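SketchContainer's trailing device parameter changes from int to DeviceOrd in the same way. A hedged sketch with placeholder bin, column, and row counts:

    HostDeviceVector<FeatureType> ft;
    SketchContainer sketch_container(ft, /*max_bin=*/256, /*columns=*/20, /*rows=*/300,
                                     DeviceOrd::CUDA(0));  // previously a bare 0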
@ -430,7 +431,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) {
|
|||||||
ConsoleLogger::Configure({{"verbosity", "3"}});
|
ConsoleLogger::Configure({{"verbosity", "3"}});
|
||||||
common::HistogramCuts batched_cuts;
|
common::HistogramCuts batched_cuts;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
|
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
|
||||||
AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
|
AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits<float>::quiet_NaN(),
|
||||||
&sketch_container);
|
&sketch_container);
|
||||||
HistogramCuts cuts;
|
HistogramCuts cuts;
|
||||||
@ -458,7 +459,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) {
|
|||||||
ConsoleLogger::Configure({{"verbosity", "3"}});
|
ConsoleLogger::Configure({{"verbosity", "3"}});
|
||||||
common::HistogramCuts batched_cuts;
|
common::HistogramCuts batched_cuts;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0);
|
SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0));
|
||||||
AdapterDeviceSketch(adapter.Value(), num_bins, info,
|
AdapterDeviceSketch(adapter.Value(), num_bins, info,
|
||||||
std::numeric_limits<float>::quiet_NaN(),
|
std::numeric_limits<float>::quiet_NaN(),
|
||||||
&sketch_container);
|
&sketch_container);
|
||||||
@ -493,7 +494,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ASSERT_EQ(info.feature_types.Size(), 1);
|
ASSERT_EQ(info.feature_types.Size(), 1);
|
||||||
SketchContainer container(info.feature_types, num_bins, 1, n, 0);
|
SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0));
|
||||||
AdapterDeviceSketch(adapter.Value(), num_bins, info,
|
AdapterDeviceSketch(adapter.Value(), num_bins, info,
|
||||||
std::numeric_limits<float>::quiet_NaN(), &container);
|
std::numeric_limits<float>::quiet_NaN(), &container);
|
||||||
HistogramCuts cuts;
|
HistogramCuts cuts;
|
||||||
@ -566,7 +567,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
|
||||||
dh::safe_cuda(cudaSetDevice(ctx->gpu_id));
|
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||||
auto n = n_samples * n_features;
|
auto n = n_samples * n_features;
|
||||||
std::vector<float> x;
|
std::vector<float> x;
|
||||||
x.resize(n);
|
x.resize(n);
|
||||||
@ -606,21 +607,21 @@ void TestGetColumnSize(std::size_t n_samples) {
|
|||||||
std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
|
std::vector<std::size_t> h_column_size_1(column_sizes_scan.size());
|
||||||
|
|
||||||
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
|
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, true>(
|
||||||
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
||||||
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
|
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin());
|
||||||
|
|
||||||
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
|
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), true, false>(
|
||||||
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
||||||
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
||||||
ASSERT_EQ(h_column_size, h_column_size_1);
|
ASSERT_EQ(h_column_size, h_column_size_1);
|
||||||
|
|
||||||
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
|
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, true>(
|
||||||
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
||||||
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
||||||
ASSERT_EQ(h_column_size, h_column_size_1);
|
ASSERT_EQ(h_column_size, h_column_size_1);
|
||||||
|
|
||||||
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
|
detail::LaunchGetColumnSizeKernel<decltype(batch_iter), false, false>(
|
||||||
ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan));
|
||||||
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin());
|
||||||
ASSERT_EQ(h_column_size, h_column_size_1);
|
ASSERT_EQ(h_column_size, h_column_size_1);
|
||||||
}
|
}
|
||||||
@ -697,9 +698,9 @@ void TestAdapterSketchFromWeights(bool with_group) {
|
|||||||
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
|
size_t constexpr kRows = 300, kCols = 20, kBins = 256;
|
||||||
size_t constexpr kGroups = 10;
|
size_t constexpr kGroups = 10;
|
||||||
HostDeviceVector<float> storage;
|
HostDeviceVector<float> storage;
|
||||||
std::string m =
|
std::string m = RandomDataGenerator{kRows, kCols, 0}
|
||||||
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
|
.Device(DeviceOrd::CUDA(0))
|
||||||
&storage);
|
.GenerateArrayInterface(&storage);
|
||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
Context ctx;
|
Context ctx;
|
||||||
auto& h_weights = info.weights_.HostVector();
|
auto& h_weights = info.weights_.HostVector();
|
||||||
@ -718,14 +719,14 @@ void TestAdapterSketchFromWeights(bool with_group) {
|
|||||||
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
|
info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups);
|
||||||
}
|
}
|
||||||
|
|
||||||
info.weights_.SetDevice(0);
|
info.weights_.SetDevice(DeviceOrd::CUDA(0));
|
||||||
info.num_row_ = kRows;
|
info.num_row_ = kRows;
|
||||||
info.num_col_ = kCols;
|
info.num_col_ = kCols;
|
||||||
|
|
||||||
data::CupyAdapter adapter(m);
|
data::CupyAdapter adapter(m);
|
||||||
auto const& batch = adapter.Value();
|
auto const& batch = adapter.Value();
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
|
SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0));
|
||||||
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
|
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
|
||||||
&sketch_container);
|
&sketch_container);
|
||||||
|
|
||||||
@ -769,7 +770,7 @@ void TestAdapterSketchFromWeights(bool with_group) {
|
|||||||
// https://github.com/dmlc/xgboost/issues/7946
|
// https://github.com/dmlc/xgboost/issues/7946
|
||||||
h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
|
h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast<float>(kGroups);
|
||||||
}
|
}
|
||||||
SketchContainer sketch_container(ft, kBins, kCols, kRows, 0);
|
SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)};
|
||||||
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
|
AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits<float>::quiet_NaN(),
|
||||||
&sketch_container);
|
&sketch_container);
|
||||||
sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
|
sketch_container.MakeCuts(&weighted, info.IsColumnSplit());
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
/*!
|
/**
|
||||||
* Copyright 2018 XGBoost contributors
|
* Copyright 2018-2023 XGBoost contributors
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <gtest/gtest.h>
|
#include <gtest/gtest.h>
|
||||||
#include <thrust/equal.h>
|
#include <thrust/equal.h>
|
||||||
#include <thrust/iterator/counting_iterator.h>
|
#include <thrust/iterator/counting_iterator.h>
|
||||||
@ -9,14 +8,13 @@
|
|||||||
#include "../../../src/common/device_helpers.cuh"
|
#include "../../../src/common/device_helpers.cuh"
|
||||||
#include <xgboost/host_device_vector.h>
|
#include <xgboost/host_device_vector.h>
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost::common {
|
||||||
namespace common {
|
|
||||||
namespace {
|
namespace {
|
||||||
void SetDeviceForTest(int device) {
|
void SetDeviceForTest(DeviceOrd device) {
|
||||||
int n_devices;
|
int n_devices;
|
||||||
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
|
dh::safe_cuda(cudaGetDeviceCount(&n_devices));
|
||||||
device %= n_devices;
|
device.ordinal %= n_devices;
|
||||||
dh::safe_cuda(cudaSetDevice(device));
|
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
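Where a raw CUDA index is still required, code reaches into the public ordinal member of DeviceOrd, as SetDeviceForTest above now does. Small sketch:

    DeviceOrd device = DeviceOrd::CUDA(3);
    bst_d_ordinal_t raw = device.ordinal;   // public member of DeviceOrd
    dh::safe_cuda(cudaSetDevice(raw));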
@ -31,13 +29,13 @@ struct HostDeviceVectorSetDeviceHandler {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
|
void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector<int> *v) {
|
||||||
// create the vector
|
// create the vector
|
||||||
v->SetDevice(device);
|
v->SetDevice(device);
|
||||||
v->Resize(n);
|
v->Resize(n);
|
||||||
|
|
||||||
ASSERT_EQ(v->Size(), n);
|
ASSERT_EQ(v->Size(), n);
|
||||||
ASSERT_EQ(v->DeviceIdx(), device);
|
ASSERT_EQ(v->Device(), device);
|
||||||
// ensure that the device has read-write access
|
// ensure that the device has read-write access
|
||||||
ASSERT_TRUE(v->DeviceCanRead());
|
ASSERT_TRUE(v->DeviceCanRead());
|
||||||
ASSERT_TRUE(v->DeviceCanWrite());
|
ASSERT_TRUE(v->DeviceCanWrite());
|
||||||
@ -57,7 +55,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector<int> *v) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void PlusOne(HostDeviceVector<int> *v) {
|
void PlusOne(HostDeviceVector<int> *v) {
|
||||||
int device = v->DeviceIdx();
|
auto device = v->Device();
|
||||||
SetDeviceForTest(device);
|
SetDeviceForTest(device);
|
||||||
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
|
thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v),
|
||||||
[=]__device__(unsigned int a){ return a + 1; });
|
[=]__device__(unsigned int a){ return a + 1; });
|
||||||
@ -69,7 +67,7 @@ void CheckDevice(HostDeviceVector<int>* v,
|
|||||||
unsigned int first,
|
unsigned int first,
|
||||||
GPUAccess access) {
|
GPUAccess access) {
|
||||||
ASSERT_EQ(v->Size(), size);
|
ASSERT_EQ(v->Size(), size);
|
||||||
SetDeviceForTest(v->DeviceIdx());
|
SetDeviceForTest(v->Device());
|
||||||
|
|
||||||
ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
|
ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v),
|
||||||
thrust::make_counting_iterator(first)));
|
thrust::make_counting_iterator(first)));
|
||||||
@ -100,7 +98,7 @@ void CheckHost(HostDeviceVector<int> *v, GPUAccess access) {
|
|||||||
ASSERT_FALSE(v->DeviceCanWrite());
|
ASSERT_FALSE(v->DeviceCanWrite());
|
||||||
}
|
}
|
||||||
|
|
||||||
void TestHostDeviceVector(size_t n, int device) {
|
void TestHostDeviceVector(size_t n, DeviceOrd device) {
|
||||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||||
HostDeviceVector<int> v;
|
HostDeviceVector<int> v;
|
||||||
InitHostDeviceVector(n, device, &v);
|
InitHostDeviceVector(n, device, &v);
|
||||||
@ -113,13 +111,13 @@ void TestHostDeviceVector(size_t n, int device) {
|
|||||||
|
|
||||||
TEST(HostDeviceVector, Basic) {
|
TEST(HostDeviceVector, Basic) {
|
||||||
size_t n = 1001;
|
size_t n = 1001;
|
||||||
int device = 0;
|
DeviceOrd device = DeviceOrd::CUDA(0);
|
||||||
TestHostDeviceVector(n, device);
|
TestHostDeviceVector(n, device);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(HostDeviceVector, Copy) {
|
TEST(HostDeviceVector, Copy) {
|
||||||
size_t n = 1001;
|
size_t n = 1001;
|
||||||
int device = 0;
|
auto device = DeviceOrd::CUDA(0);
|
||||||
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice);
|
||||||
|
|
||||||
HostDeviceVector<int> v;
|
HostDeviceVector<int> v;
|
||||||
@ -143,15 +141,15 @@ TEST(HostDeviceVector, SetDevice) {
|
|||||||
h_vec[i] = i;
|
h_vec[i] = i;
|
||||||
}
|
}
|
||||||
HostDeviceVector<int> vec (h_vec);
|
HostDeviceVector<int> vec (h_vec);
|
||||||
auto device = 0;
|
auto device = DeviceOrd::CUDA(0);
|
||||||
|
|
||||||
vec.SetDevice(device);
|
vec.SetDevice(device);
|
||||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||||
auto span = vec.DeviceSpan(); // sync to device
|
auto span = vec.DeviceSpan(); // sync to device
|
||||||
|
|
||||||
vec.SetDevice(-1); // pull back to cpu.
|
vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu.
|
||||||
ASSERT_EQ(vec.Size(), h_vec.size());
|
ASSERT_EQ(vec.Size(), h_vec.size());
|
||||||
ASSERT_EQ(vec.DeviceIdx(), -1);
|
ASSERT_EQ(vec.Device(), DeviceOrd::CPU());
|
||||||
|
|
||||||
auto h_vec_1 = vec.HostVector();
|
auto h_vec_1 = vec.HostVector();
|
||||||
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
|
ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin()));
|
||||||
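HostDeviceVector::DeviceIdx() is replaced by Device(), which returns a DeviceOrd and composes with the IsCPU/IsCUDA predicates. Sketch of the host/device round trip the test above exercises:

    HostDeviceVector<int> vec{1, 2, 3};
    vec.SetDevice(DeviceOrd::CUDA(0));
    auto span = vec.DeviceSpan();           // sync to device
    vec.SetDevice(DeviceOrd::CPU());        // pull back to the CPU
    bool on_cpu = vec.Device().IsCPU();     // true after the pull-back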
@ -159,7 +157,7 @@ TEST(HostDeviceVector, SetDevice) {
|
|||||||
|
|
||||||
TEST(HostDeviceVector, Span) {
|
TEST(HostDeviceVector, Span) {
|
||||||
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
|
HostDeviceVector<float> vec {1.0f, 2.0f, 3.0f, 4.0f};
|
||||||
vec.SetDevice(0);
|
vec.SetDevice(DeviceOrd::CUDA(0));
|
||||||
auto span = vec.DeviceSpan();
|
auto span = vec.DeviceSpan();
|
||||||
ASSERT_EQ(vec.Size(), span.size());
|
ASSERT_EQ(vec.Size(), span.size());
|
||||||
ASSERT_EQ(vec.DevicePointer(), span.data());
|
ASSERT_EQ(vec.DevicePointer(), span.data());
|
||||||
@ -183,5 +181,4 @@ TEST(HostDeviceVector, Empty) {
|
|||||||
ASSERT_FALSE(another.Empty());
|
ASSERT_FALSE(another.Empty());
|
||||||
ASSERT_TRUE(vec.Empty());
|
ASSERT_TRUE(vec.Empty());
|
||||||
}
|
}
|
||||||
} // namespace common
|
} // namespace xgboost::common
|
||||||
} // namespace xgboost
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ namespace xgboost::linalg {
|
|||||||
namespace {
|
namespace {
|
||||||
void TestElementWiseKernel() {
|
void TestElementWiseKernel() {
|
||||||
auto device = DeviceOrd::CUDA(0);
|
auto device = DeviceOrd::CUDA(0);
|
||||||
Tensor<float, 3> l{{2, 3, 4}, 0};
|
Tensor<float, 3> l{{2, 3, 4}, device};
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Non-contiguous
|
* Non-contiguous
|
||||||
|
|||||||
@ -9,9 +9,7 @@
|
|||||||
#include "../../../src/data/adapter.h"
|
#include "../../../src/data/adapter.h"
|
||||||
#include "xgboost/context.h"
|
#include "xgboost/context.h"
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost::common {
|
||||||
namespace common {
|
|
||||||
|
|
||||||
TEST(Quantile, LoadBalance) {
|
TEST(Quantile, LoadBalance) {
|
||||||
size_t constexpr kRows = 1000, kCols = 100;
|
size_t constexpr kRows = 1000, kCols = 100;
|
||||||
auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
|
auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
|
||||||
@ -314,7 +312,7 @@ void TestSameOnAllWorkers() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto m = RandomDataGenerator{kRows, kCols, 0}
|
auto m = RandomDataGenerator{kRows, kCols, 0}
|
||||||
.Device(Context::kCpuId)
|
.Device(DeviceOrd::CPU())
|
||||||
.Type(ft)
|
.Type(ft)
|
||||||
.MaxCategory(17)
|
.MaxCategory(17)
|
||||||
.Seed(rank + seed)
|
.Seed(rank + seed)
|
||||||
@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) {
|
|||||||
auto constexpr kWorkers = 4;
|
auto constexpr kWorkers = 4;
|
||||||
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
|
RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers);
|
||||||
}
|
}
|
||||||
|
} // namespace xgboost::common
|
||||||
} // namespace common
|
|
||||||
} // namespace xgboost
|
|
||||||
|
|||||||
@ -25,7 +25,7 @@ class MGPUQuantileTest : public BaseMGPUTest {};
|
|||||||
TEST(GPUQuantile, Basic) {
|
TEST(GPUQuantile, Basic) {
|
||||||
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
|
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch(ft, kBins, kCols, kRows, 0);
|
SketchContainer sketch(ft, kBins, kCols, kRows, FstCU());
|
||||||
dh::caching_device_vector<Entry> entries;
|
dh::caching_device_vector<Entry> entries;
|
||||||
dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
|
dh::device_vector<bst_row_t> cuts_ptr(kCols+1);
|
||||||
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
|
thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0);
|
||||||
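FstCU() is a test helper used throughout these quantile tests; judging by the substitutions in this patch it denotes the first CUDA device, presumably equivalent to the following (assumed definition, not part of this diff):

    inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }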
@ -38,12 +38,12 @@ void TestSketchUnique(float sparsity) {
|
|||||||
constexpr size_t kRows = 1000, kCols = 100;
|
constexpr size_t kRows = 1000, kCols = 100;
|
||||||
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
|
RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) {
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
|
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
|
||||||
|
|
||||||
HostDeviceVector<float> storage;
|
HostDeviceVector<float> storage;
|
||||||
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
|
std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity}
|
||||||
.Seed(seed)
|
.Seed(seed)
|
||||||
.Device(0)
|
.Device(FstCU())
|
||||||
.GenerateArrayInterface(&storage);
|
.GenerateArrayInterface(&storage);
|
||||||
data::CupyAdapter adapter(interface_str);
|
data::CupyAdapter adapter(interface_str);
|
||||||
AdapterDeviceSketch(adapter.Value(), n_bins, info,
|
AdapterDeviceSketch(adapter.Value(), n_bins, info,
|
||||||
@ -58,7 +58,7 @@ void TestSketchUnique(float sparsity) {
|
|||||||
thrust::make_counting_iterator(0llu),
|
thrust::make_counting_iterator(0llu),
|
||||||
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
[=] __device__(size_t idx) { return batch.GetElement(idx); });
|
||||||
auto end = kCols * kRows;
|
auto end = kCols * kRows;
|
||||||
detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
|
detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid,
|
||||||
&cut_sizes_scan, &column_sizes_scan);
|
&cut_sizes_scan, &column_sizes_scan);
|
||||||
auto const& cut_sizes = cut_sizes_scan.HostVector();
|
auto const& cut_sizes = cut_sizes_scan.HostVector();
|
||||||
ASSERT_LE(sketch.Data().size(), cut_sizes.back());
|
ASSERT_LE(sketch.Data().size(), cut_sizes.back());
|
||||||
@ -86,9 +86,9 @@ TEST(GPUQuantile, Unique) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// if with_error is true, the test tolerates floating point error
|
// if with_error is true, the test tolerates floating point error
|
||||||
void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
|
void TestQuantileElemRank(DeviceOrd device, Span<SketchEntry const> in,
|
||||||
Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
|
Span<bst_row_t const> d_columns_ptr, bool with_error = false) {
|
||||||
dh::safe_cuda(cudaSetDevice(device));
|
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||||
std::vector<SketchEntry> h_in(in.size());
|
std::vector<SketchEntry> h_in(in.size());
|
||||||
dh::CopyDeviceSpanToVector(&h_in, in);
|
dh::CopyDeviceSpanToVector(&h_in, in);
|
||||||
std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
|
std::vector<bst_row_t> h_columns_ptr(d_columns_ptr.size());
|
||||||
@ -123,13 +123,12 @@ TEST(GPUQuantile, Prune) {
|
|||||||
constexpr size_t kRows = 1000, kCols = 100;
|
constexpr size_t kRows = 1000, kCols = 100;
|
||||||
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
|
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
|
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
|
||||||
|
|
||||||
HostDeviceVector<float> storage;
|
HostDeviceVector<float> storage;
|
||||||
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
|
std::string interface_str =
|
||||||
.Device(0)
|
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
|
||||||
.Seed(seed)
|
&storage);
|
||||||
.GenerateArrayInterface(&storage);
|
|
||||||
data::CupyAdapter adapter(interface_str);
|
data::CupyAdapter adapter(interface_str);
|
||||||
AdapterDeviceSketch(adapter.Value(), n_bins, info,
|
AdapterDeviceSketch(adapter.Value(), n_bins, info,
|
||||||
std::numeric_limits<float>::quiet_NaN(), &sketch);
|
std::numeric_limits<float>::quiet_NaN(), &sketch);
|
||||||
@ -145,7 +144,7 @@ TEST(GPUQuantile, Prune) {
|
|||||||
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
|
ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
|
||||||
sketch.Data().data() + sketch.Data().size(),
|
sketch.Data().data() + sketch.Data().size(),
|
||||||
detail::SketchUnique{}));
|
detail::SketchUnique{}));
|
||||||
TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr());
|
TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr());
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -153,10 +152,10 @@ TEST(GPUQuantile, MergeEmpty) {
|
|||||||
constexpr size_t kRows = 1000, kCols = 100;
|
constexpr size_t kRows = 1000, kCols = 100;
|
||||||
size_t n_bins = 10;
|
size_t n_bins = 10;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
|
SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
|
||||||
HostDeviceVector<float> storage_0;
|
HostDeviceVector<float> storage_0;
|
||||||
std::string interface_str_0 =
|
std::string interface_str_0 =
|
||||||
RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface(
|
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface(
|
||||||
&storage_0);
|
&storage_0);
|
||||||
data::CupyAdapter adapter_0(interface_str_0);
|
data::CupyAdapter adapter_0(interface_str_0);
|
||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
@ -193,34 +192,33 @@ TEST(GPUQuantile, MergeBasic) {
|
|||||||
constexpr size_t kRows = 1000, kCols = 100;
|
constexpr size_t kRows = 1000, kCols = 100;
|
||||||
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
|
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
|
SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU());
|
||||||
HostDeviceVector<float> storage_0;
|
HostDeviceVector<float> storage_0;
|
||||||
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
|
std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0}
|
||||||
.Device(0)
|
.Device(FstCU())
|
||||||
.Seed(seed)
|
.Seed(seed)
|
||||||
.GenerateArrayInterface(&storage_0);
|
.GenerateArrayInterface(&storage_0);
|
||||||
data::CupyAdapter adapter_0(interface_str_0);
|
data::CupyAdapter adapter_0(interface_str_0);
|
||||||
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
|
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
|
||||||
std::numeric_limits<float>::quiet_NaN(), &sketch_0);
|
std::numeric_limits<float>::quiet_NaN(), &sketch_0);
|
||||||
|
|
||||||
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0);
|
SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU());
|
||||||
HostDeviceVector<float> storage_1;
|
HostDeviceVector<float> storage_1;
|
||||||
std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0}
|
std::string interface_str_1 =
|
||||||
.Device(0)
|
RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
|
||||||
.Seed(seed)
|
&storage_1);
|
||||||
.GenerateArrayInterface(&storage_1);
|
|
||||||
data::CupyAdapter adapter_1(interface_str_1);
|
data::CupyAdapter adapter_1(interface_str_1);
|
||||||
AdapterDeviceSketch(adapter_1.Value(), n_bins, info,
|
AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits<float>::quiet_NaN(),
|
||||||
std::numeric_limits<float>::quiet_NaN(), &sketch_1);
|
&sketch_1);
|
||||||
|
|
||||||
size_t size_before_merge = sketch_0.Data().size();
|
size_t size_before_merge = sketch_0.Data().size();
|
||||||
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
|
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
|
||||||
if (info.weights_.Size() != 0) {
|
if (info.weights_.Size() != 0) {
|
||||||
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true);
|
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true);
|
||||||
sketch_0.FixError();
|
sketch_0.FixError();
|
||||||
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false);
|
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false);
|
||||||
} else {
|
} else {
|
||||||
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
|
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
|
||||||
}
|
}
|
||||||
|
|
||||||
auto columns_ptr = sketch_0.ColumnsPtr();
|
auto columns_ptr = sketch_0.ColumnsPtr();
|
||||||
@ -240,24 +238,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
|
|||||||
MetaInfo info;
|
MetaInfo info;
|
||||||
int32_t seed = 0;
|
int32_t seed = 0;
|
||||||
HostDeviceVector<FeatureType> ft;
|
HostDeviceVector<FeatureType> ft;
|
||||||
SketchContainer sketch_0(ft, n_bins, cols, rows, 0);
|
SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU());
|
||||||
HostDeviceVector<float> storage_0;
|
HostDeviceVector<float> storage_0;
|
||||||
std::string interface_str_0 = RandomDataGenerator{rows, cols, 0}
|
std::string interface_str_0 =
|
||||||
.Device(0)
|
RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
|
||||||
.Seed(seed)
|
&storage_0);
|
||||||
.GenerateArrayInterface(&storage_0);
|
|
||||||
data::CupyAdapter adapter_0(interface_str_0);
|
data::CupyAdapter adapter_0(interface_str_0);
|
||||||
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
|
AdapterDeviceSketch(adapter_0.Value(), n_bins, info,
|
||||||
std::numeric_limits<float>::quiet_NaN(),
|
std::numeric_limits<float>::quiet_NaN(),
|
||||||
&sketch_0);
|
&sketch_0);
|
||||||
|
|
||||||
size_t f_rows = rows * frac;
|
size_t f_rows = rows * frac;
|
||||||
SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0);
|
SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU());
|
||||||
HostDeviceVector<float> storage_1;
|
HostDeviceVector<float> storage_1;
|
||||||
std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0}
|
std::string interface_str_1 =
|
||||||
.Device(0)
|
RandomDataGenerator{f_rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface(
|
||||||
.Seed(seed)
|
&storage_1);
|
||||||
.GenerateArrayInterface(&storage_1);
|
|
||||||
auto data_1 = storage_1.DeviceSpan();
|
auto data_1 = storage_1.DeviceSpan();
|
||||||
auto tuple_it = thrust::make_tuple(
|
auto tuple_it = thrust::make_tuple(
|
||||||
thrust::make_counting_iterator<size_t>(0ul), data_1.data());
|
thrust::make_counting_iterator<size_t>(0ul), data_1.data());
|
||||||
@ -279,7 +275,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
|
|||||||
|
|
||||||
size_t size_before_merge = sketch_0.Data().size();
|
size_t size_before_merge = sketch_0.Data().size();
|
||||||
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
|
sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data());
|
||||||
TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr());
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr());
auto columns_ptr = sketch_0.ColumnsPtr();
std::vector<bst_row_t> h_columns_ptr(columns_ptr.size());
@ -310,11 +306,10 @@ TEST(GPUQuantile, MergeDuplicated) {
TEST(GPUQuantile, MultiMerge) {
constexpr size_t kRows = 20, kCols = 1;
int32_t world = 2;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
MetaInfo const &info) {
// Set up single node version
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU());

size_t intermediate_num_cuts = std::min(
kRows * world, static_cast<size_t>(n_bins * WQSketch::kFactor));
@ -322,12 +317,12 @@ TEST(GPUQuantile, MultiMerge) {
for (auto rank = 0; rank < world; ++rank) {
HostDeviceVector<float> storage;
std::string interface_str = RandomDataGenerator{kRows, kCols, 0}
.Device(0)
.Device(FstCU())
.Seed(rank + seed)
.GenerateArrayInterface(&storage);
data::CupyAdapter adapter(interface_str);
HostDeviceVector<FeatureType> ft;
containers.emplace_back(ft, n_bins, kCols, kRows, 0);
containers.emplace_back(ft, n_bins, kCols, kRows, FstCU());
AdapterDeviceSketch(adapter.Value(), n_bins, info,
std::numeric_limits<float>::quiet_NaN(),
&containers.back());
@ -337,12 +332,10 @@ TEST(GPUQuantile, MultiMerge) {
sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data());
sketch_on_single_node.FixError();
}
TestQuantileElemRank(0, sketch_on_single_node.Data(),
TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());

sketch_on_single_node.Unique();
TestQuantileElemRank(0, sketch_on_single_node.Data(),
TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr());
sketch_on_single_node.ColumnsPtr());
});
}

@ -351,7 +344,7 @@ void TestAllReduceBasic() {
auto const world = collective::GetWorldSize();
constexpr size_t kRows = 1000, kCols = 100;
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
auto const device = GPUIDX;
auto const device = DeviceOrd::CUDA(GPUIDX);

// Set up single node version;
HostDeviceVector<FeatureType> ft({}, device);
@ -483,7 +476,7 @@ void TestSameOnAllWorkers() {
RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins,
MetaInfo const &info) {
auto const rank = collective::GetRank();
auto const device = GPUIDX;
auto const device = DeviceOrd::CUDA(GPUIDX);
HostDeviceVector<FeatureType> ft({}, device);
SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device);
HostDeviceVector<float> storage({}, device);
@ -514,9 +507,9 @@ void TestSameOnAllWorkers() {
thrust::copy(thrust::device, local_data.data(),
local_data.data() + local_data.size(),
all_workers.begin() + local_data.size() * rank);
collective::AllReduce<collective::Operation::kSum>(device, all_workers.data().get(),
collective::AllReduce<collective::Operation::kSum>(device.ordinal, all_workers.data().get(),
all_workers.size());
collective::Synchronize(device);
collective::Synchronize(device.ordinal);

auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float);
std::vector<float> h_base_line(base_line.size());
@ -562,7 +555,7 @@ TEST(GPUQuantile, Push) {
columns_ptr[1] = kRows;

HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {});

auto sketch_data = sketch.Data();
@ -602,7 +595,7 @@ TEST(GPUQuantile, MultiColPush) {

int32_t n_bins = 16;
HostDeviceVector<FeatureType> ft;
SketchContainer sketch(ft, n_bins, kCols, kRows, 0);
SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU());
dh::device_vector<Entry> d_entries {entries};

dh::device_vector<size_t> columns_ptr(kCols + 1, 0);
@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());

auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) {
auto fail = [&]() { NDCGCache cache{ctx, info, param}; };
// empty label
ASSERT_THROW(fail(), dmlc::Error);
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId};
info.labels = linalg::Matrix<float>{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()};
// invalid label
ASSERT_THROW(fail(), dmlc::Error);
auto h_labels = info.labels.HostView();
@ -35,7 +35,7 @@ void TestCalcQueriesInvIDCG() {
auto d_scores = dh::ToSpan(scores);
common::SegmentedSequence(&ctx, d_group_ptr, d_scores);

linalg::Vector<double> inv_IDCG({n_groups}, ctx.gpu_id);
linalg::Vector<double> inv_IDCG({n_groups}, ctx.Device());

ltr::LambdaRankParam p;
p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}});
@ -70,7 +70,7 @@ void TestRankingCache(Context const* ctx) {
HostDeviceVector<float> predt(info.num_row_, 0);
auto& h_predt = predt.HostVector();
std::iota(h_predt.begin(), h_predt.end(), 0.0f);
predt.SetDevice(ctx->gpu_id);
predt.SetDevice(ctx->Device());

auto rank_idx =
cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
@ -9,12 +9,11 @@
#include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter
#include "../helpers.h"

namespace xgboost {
namespace xgboost::common {
namespace common {
TEST(Stats, Quantile) {
Context ctx;
{
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId);
linalg::Tensor<float, 1> arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU());
std::vector<size_t> index{0, 2, 3, 4, 6};
auto h_arr = arr.HostView();
auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); });
@ -40,8 +39,8 @@ TEST(Stats, Quantile) {

TEST(Stats, WeightedQuantile) {
Context ctx;
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId);
linalg::Tensor<float, 1> arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU());
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId);
linalg::Tensor<float, 1> weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU());

auto h_arr = arr.HostView();
auto h_weight = weight.HostView();
@ -64,7 +63,7 @@ TEST(Stats, Median) {
Context ctx;

{
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId};
linalg::Tensor<float, 2> values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()};
HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out);
@ -83,7 +82,7 @@ TEST(Stats, Median) {
{
ctx = ctx.MakeCPU();
// 4x2 matrix
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id};
linalg::Tensor<float, 2> values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()};
HostDeviceVector<float> weights;
linalg::Tensor<float, 1> out;
Median(&ctx, values, weights, &out);
@ -102,14 +101,14 @@ TEST(Stats, Median) {
namespace {
void TestMean(Context const* ctx) {
std::size_t n{128};
linalg::Vector<float> data({n}, ctx->gpu_id);
linalg::Vector<float> data({n}, ctx->Device());
auto h_v = data.HostView().Values();
std::iota(h_v.begin(), h_v.end(), .0f);

auto nf = static_cast<float>(n);
float mean = nf * (nf - 1) / 2 / n;

linalg::Vector<float> res{{1}, ctx->gpu_id};
linalg::Vector<float> res{{1}, ctx->Device()};
Mean(ctx, data, &res);
auto h_res = res.HostView();
ASSERT_EQ(h_res.Size(), 1);
@ -128,5 +127,4 @@ TEST(Stats, GPUMean) {
TestMean(&ctx);
}
#endif // defined(XGBOOST_USE_CUDA)
} // namespace common
} // namespace xgboost::common
} // namespace xgboost
@ -20,8 +20,8 @@ namespace common {
namespace {
class StatsGPU : public ::testing::Test {
private:
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0};
linalg::Tensor<float, 1> arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, 0};
linalg::Tensor<std::size_t, 1> indptr_{{0, 5, 10}, {3}, FstCU()};
HostDeviceVector<float> results_;
using TestSet = std::vector<std::pair<float, float>>;
Context ctx_;
@ -46,7 +46,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0));

auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -58,7 +58,7 @@ class StatsGPU : public ::testing::Test {

// one alpha for each segment
HostDeviceVector<float> alphas{0.0f, 0.5f, 1.0f};
alphas.SetDevice(0);
alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan();
auto w_it = thrust::make_constant_iterator(0.1f);
SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
@ -80,7 +80,7 @@ class StatsGPU : public ::testing::Test {
auto val_it =
dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); });
linalg::Tensor<float, 1> weights{{10}, 0};
linalg::Tensor<float, 1> weights{{10}, FstCU()};
linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)),
[=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; });
auto w_it = weights.Data()->ConstDevicePointer();
@ -101,7 +101,7 @@ class StatsGPU : public ::testing::Test {
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
data.insert(data.cend(), seg.begin(), seg.end());
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, 0};
linalg::Tensor<float, 1> arr{data.cbegin(), data.cend(), {data.size()}, FstCU()};
auto d_arr = arr.View(DeviceOrd::CUDA(0));

auto key_it = dh::MakeTransformIterator<std::size_t>(
@ -113,7 +113,7 @@ class StatsGPU : public ::testing::Test {

// one alpha for each segment
HostDeviceVector<float> alphas{0.1f, 0.2f, 0.4f};
alphas.SetDevice(0);
alphas.SetDevice(FstCU());
auto d_alphas = alphas.ConstDeviceSpan();
SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it,
val_it + d_arr.Size(), &results_);
@ -11,63 +11,59 @@
#include "../../../src/common/transform.h"
#include "../helpers.h"

namespace xgboost::common {
namespace {
constexpr DeviceOrd TransformDevice() {
#if defined(__CUDACC__)
return DeviceOrd::CUDA(0);
#define TRANSFORM_GPU 0

#else
return DeviceOrd::CPU();
#define TRANSFORM_GPU -1

#endif
}
namespace xgboost {
} // namespace
namespace common {

template <typename T>
struct TestTransformRange {
void XGBOOST_DEVICE operator()(size_t _idx,
void XGBOOST_DEVICE operator()(std::size_t _idx, Span<float> _out, Span<const float> _in) {
Span<bst_float> _out, Span<const bst_float> _in) {
_out[_idx] = _in[_idx];
}
};

TEST(Transform, DeclareUnifiedTest(Basic)) {
const size_t size {256};
const size_t size{256};
std::vector<bst_float> h_in(size);
std::vector<float> h_in(size);
std::vector<bst_float> h_out(size);
std::vector<float> h_out(size);
std::iota(h_in.begin(), h_in.end(), 0);
std::vector<bst_float> h_sol(size);
std::vector<float> h_sol(size);
std::iota(h_sol.begin(), h_sol.end(), 0);

const HostDeviceVector<bst_float> in_vec{h_in, TRANSFORM_GPU};
auto device = TransformDevice();
HostDeviceVector<bst_float> out_vec{h_out, TRANSFORM_GPU};
HostDeviceVector<float> const in_vec{h_in, device};
HostDeviceVector<float> out_vec{h_out, device};
out_vec.Fill(0);

Transform<>::Init(TestTransformRange<bst_float>{},
Transform<>::Init(TestTransformRange<float>{},
Range{0, static_cast<Range::DifferenceType>(size)}, AllThreadsForTest(),
TRANSFORM_GPU)
TransformDevice())
.Eval(&out_vec, &in_vec);
std::vector<bst_float> res = out_vec.HostVector();
std::vector<float> res = out_vec.HostVector();

ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin()));
}

#if !defined(__CUDACC__)
TEST(TransformDeathTest, Exception) {
size_t const kSize {16};
size_t const kSize{16};
std::vector<bst_float> h_in(kSize);
std::vector<float> h_in(kSize);
const HostDeviceVector<bst_float> in_vec{h_in, -1};
const HostDeviceVector<float> in_vec{h_in, DeviceOrd::CPU()};
EXPECT_DEATH(
{
Transform<>::Init([](size_t idx, common::Span<float const> _in) { _in[idx + 1]; },
Range(0, static_cast<Range::DifferenceType>(kSize)), AllThreadsForTest(),
-1)
DeviceOrd::CPU())
.Eval(&in_vec);
},
"");
}
#endif
} // namespace xgboost::common
} // namespace common
} // namespace xgboost
5
tests/cpp/common/test_transform_range.cu
Normal file
@ -0,0 +1,5 @@
/**
* Copyright 2023 XGBoost contributors
*/
// Dummy file to keep the CUDA tests.
#include "test_transform_range.cc"
@ -59,12 +59,12 @@ TEST(DeviceAdapter, GetRowCounts) {
for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) {
HostDeviceVector<float> storage;
auto str_arr = RandomDataGenerator{8192, n_features, 0.0}
.Device(ctx.gpu_id)
.Device(ctx.Device())
.GenerateArrayInterface(&storage);
auto adapter = CupyAdapter{str_arr};
HostDeviceVector<bst_row_t> offset(adapter.NumRows() + 1, 0);
offset.SetDevice(ctx.gpu_id);
offset.SetDevice(ctx.Device());
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id,
auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(),
std::numeric_limits<float>::quiet_NaN());
ASSERT_EQ(rstride, n_features);
}
@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(0);
auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU());
ASSERT_EQ(kCats, accessor.NumBins());

auto x_copy = x;
@ -152,13 +152,12 @@ TEST(EllpackPage, Copy) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();

// Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows);
kRows);

// Copy batch pages into the result page.
size_t offset = 0;
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
size_t num_elements = result.Copy(0, batch.Impl(), offset);
size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset);
offset += num_elements;
}

@ -172,10 +171,12 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row);

for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get()));
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

EXPECT_EQ(row, row_result);
@ -199,8 +200,7 @@ TEST(EllpackPage, Compact) {
auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();

// Create an empty result page.
EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride,
EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows);
kCompactedRows);

// Compact batch pages into the result page.
std::vector<size_t> row_indexes_h {
@ -209,7 +209,7 @@ TEST(EllpackPage, Compact) {
thrust::device_vector<size_t> row_indexes_d = row_indexes_h;
common::Span<size_t> row_indexes_span(row_indexes_d.data().get(), kRows);
for (auto& batch : dmat->GetBatches<EllpackPage>(&ctx, param)) {
result.Compact(0, batch.Impl(), row_indexes_span);
result.Compact(FstCU(), batch.Impl(), row_indexes_span);
}

size_t current_row = 0;
@ -228,13 +228,13 @@ TEST(EllpackPage, Compact) {
continue;
}

dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0),
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()),
current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols,
ReadRowFunction(result.GetDeviceAccessor(0), compacted_row,
ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
@ -30,7 +30,7 @@ namespace xgboost::data {
TEST(GradientIndex, ExternalMemoryBaseRowID) {
Context ctx;
auto p_fmat = RandomDataGenerator{4096, 256, 0.5}
.Device(ctx.gpu_id)
.Device(ctx.Device())
.Batches(8)
.GenerateSparsePageDMatrix("cache", true);
@ -11,9 +11,7 @@
#include "../helpers.h"
#include "test_iterative_dmatrix.h"

namespace xgboost {
namespace xgboost::data {
namespace data {

void TestEquivalent(float sparsity) {
Context ctx{MakeCUDACtx(0)};

@ -23,14 +21,14 @@ void TestEquivalent(float sparsity) {
std::size_t offset = 0;
auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl();
std::unique_ptr<EllpackPageImpl> page_concatenated {
new EllpackPageImpl(0, first->Cuts(), first->is_dense,
new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense,
first->row_stride, 1000 * 100)};
for (auto& batch : m.GetBatches<EllpackPage>(&ctx, {})) {
auto page = batch.Impl();
size_t num_elements = page_concatenated->Copy(0, page, offset);
size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset);
offset += num_elements;
}
auto from_iter = page_concatenated->GetDeviceAccessor(0);
auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device());
ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols());
ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows());

@ -40,7 +38,7 @@ void TestEquivalent(float sparsity) {
DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 0)};
auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
for (auto& ellpack : dm->GetBatches<EllpackPage>(&ctx, bp)) {
auto from_data = ellpack.Impl()->GetDeviceAccessor(0);
auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device());

std::vector<float> cuts_from_iter(from_iter.gidx_fvalue_map.size());
std::vector<float> min_fvalues_iter(from_iter.min_fvalue.size());
@ -152,10 +150,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) {
auto impl = ellpack.Impl();
common::CompressedIterator<uint32_t> iterator(
impl->gidx_buffer.HostVector().data(), impl->NumSymbols());
EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue());
EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(0).NullValue());
EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue());
// null values get placed after valid values in a row
EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue());
EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue());
EXPECT_EQ(m.Info().num_col_, cols);
EXPECT_EQ(m.Info().num_row_, rows);
EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3);
@ -183,5 +181,4 @@ TEST(IterativeDeviceDMatrix, Ref) {
TestRefDMatrix<EllpackPage, CudaArrayIterForTest>(
&ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); });
}
} // namespace data
} // namespace xgboost::data
} // namespace xgboost
@ -12,6 +12,7 @@
#include "../helpers.h"
#include "xgboost/base.h"

namespace xgboost {
TEST(MetaInfo, GetSet) {
xgboost::Context ctx;
xgboost::MetaInfo info;
@ -236,9 +237,9 @@ TEST(MetaInfo, Validate) {
info.num_nonzero_ = 12;
info.num_col_ = 3;
std::vector<xgboost::bst_group_t> groups (11);
xgboost::Context ctx;
Context ctx;
info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11);
EXPECT_THROW(info.Validate(0), dmlc::Error);
EXPECT_THROW(info.Validate(FstCU()), dmlc::Error);

std::vector<float> labels(info.num_row_ + 1);
EXPECT_THROW(
@ -261,11 +262,11 @@ TEST(MetaInfo, Validate) {
info.group_ptr_.clear();
labels.resize(info.num_row_);
info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_);
info.labels.SetDevice(0);
info.labels.SetDevice(FstCU());
EXPECT_THROW(info.Validate(1), dmlc::Error);
EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error);

xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
d_groups.SetDevice(0);
d_groups.SetDevice(FstCU());
d_groups.DevicePointer(); // pull to device
std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec(
d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))};
@ -306,6 +307,5 @@ TEST(MetaInfo, HostExtend) {
}
}

namespace xgboost {
TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); }
} // namespace xgboost
@ -1,31 +1,27 @@
/*!
/**
* Copyright 2021 XGBoost contributors
* Copyright 2021-2023, XGBoost contributors
*/
#include <gtest/gtest.h>
#include "../helpers.h"
#include "../../../src/data/proxy_dmatrix.h"
#include "../../../src/data/adapter.h"

namespace xgboost {
#include "../../../src/data/adapter.h"
namespace data {
#include "../../../src/data/proxy_dmatrix.h"
#include "../helpers.h"

namespace xgboost::data {
TEST(ProxyDMatrix, HostData) {
DMatrixProxy proxy;
size_t constexpr kRows = 100, kCols = 10;
std::vector<HostDeviceVector<float>> label_storage(1);

HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5)
auto data =
.Device(0)
RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
.GenerateArrayInterface(&storage);

proxy.SetArrayData(data.c_str());

auto n_samples = HostAdapterDispatch(
auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); });
&proxy, [](auto const &value) { return value.Size(); });
ASSERT_EQ(n_samples, kRows);
auto n_features = HostAdapterDispatch(
auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); });
&proxy, [](auto const &value) { return value.NumCols(); });
ASSERT_EQ(n_features, kCols);
}
} // namespace data
} // namespace xgboost::data
} // namespace xgboost
@ -15,10 +15,12 @@ namespace xgboost::data {
TEST(ProxyDMatrix, DeviceData) {
constexpr size_t kRows{100}, kCols{100};
HostDeviceVector<float> storage;
auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage);
auto data =
RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage);
std::vector<HostDeviceVector<float>> label_storage(1);
auto labels =
auto labels = RandomDataGenerator(kRows, 1, 0)
RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage);
.Device(FstCU())
.GenerateColumnarArrayInterface(&label_storage);

DMatrixProxy proxy;
proxy.SetCUDAArray(data.c_str());
@ -31,7 +33,7 @@ TEST(ProxyDMatrix, DeviceData) {

std::vector<HostDeviceVector<float>> columnar_storage(kCols);
data = RandomDataGenerator(kRows, kCols, 0)
.Device(0)
.Device(FstCU())
.GenerateColumnarArrayInterface(&columnar_storage);
proxy.SetCUDAArray(data.c_str());
ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr<CudfAdapter>));
@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) {
std::iota(upper.begin(), upper.end(), 1.0f);

auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId};
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};

std::array<int32_t, 3> ridxs {1, 3, 5};
std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) };
@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) {
std::iota(upper.begin(), upper.end(), 1.0f);

auto& margin = p_m->Info().base_margin_;
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId};
margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()};

auto constexpr kSlices {2};
auto constexpr kSliceSize {4};
@ -134,11 +134,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) {
size_t offset = 0;
for (auto& batch : dmat_ext->GetBatches<EllpackPage>(&ctx, param)) {
if (!impl_ext) {
impl_ext.reset(new EllpackPageImpl(
impl_ext = std::make_unique<EllpackPageImpl>(batch.Impl()->gidx_buffer.Device(),
batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(),
batch.Impl()->Cuts(), batch.Impl()->is_dense,
batch.Impl()->is_dense, batch.Impl()->row_stride, kRows));
batch.Impl()->row_stride, kRows);
}
auto n_elems = impl_ext->Copy(0, batch.Impl(), offset);
auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset);
offset += n_elems;
}
EXPECT_EQ(impl_ext->base_rowid, 0);
@ -198,10 +198,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) {
EXPECT_EQ(impl_ext->base_rowid, current_row);

for (size_t i = 0; i < impl_ext->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get()));
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get()));
dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row,
row_ext_d.data().get()));
thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin());

EXPECT_EQ(row, row_ext);
@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) {

gbtree.Configure({{"tree_method", "hist"}});
auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
gpair.Data()->Copy(GenerateRandomGradients(kRows));

PredictionCacheEntry out_predictions;
@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) {

// pull data into device.
data.HostVector();
data.SetDevice(0);
data.SetDevice(DeviceOrd::CUDA(0));
data.DeviceSpan();
ASSERT_FALSE(data.HostCanWrite());

@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) {
}
learner->Configure();
for (std::int32_t i = 0; i < 3; ++i) {
linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, Context::kCpuId};
linalg::Matrix<GradientPair> gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()};
gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_));
learner->BoostOneIter(0, Xy, &gpair);
}
@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam<char const*> {
if (device == "GPU") {
ctx = MakeCUDACtx(0);
}
auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id);
auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device());
auto array_str = rng.GenerateArrayInterface(&data);
auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols);

@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) {
auto test_qdm_err = [&](std::string booster, Context const* ctx) {
std::shared_ptr<DMatrix> p_fmat;
bst_bin_t max_bins = 16;
auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins);
auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins);
if (ctx->IsCPU()) {
p_fmat = rng.GenerateQuantileDMatrix(true);
} else {
@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) {
bst_feature_t n_features{32};
HostDeviceVector<float> X_storage;
// use a different device than the learner
std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1;
auto data_ordinal = ctx->IsCPU() ? DeviceOrd::CUDA(0) : DeviceOrd::CPU();
auto X = RandomDataGenerator{n_samples, n_features, 0.0}
.Device(data_ordinal)
.GenerateArrayInterface(&X_storage);
@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) {
auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage);

std::shared_ptr<DMatrix> Xy;
if (data_ordinal == Context::kCpuId) {
if (data_ordinal.IsCPU()) {
auto X_adapter = data::ArrayAdapter{StringView{X}};
Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits<float>::quiet_NaN(), ctx->Threads()));
} else {
@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) {

std::shared_ptr<DMatrix> p_m{new data::DMatrixProxy};
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
if (data_ordinal == Context::kCpuId) {
if (data_ordinal.IsCPU()) {
proxy->SetArrayData(StringView{X});
} else {
proxy->SetCUDAArray(X.c_str());
@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) {

// test when the contexts match
Context new_ctx = *proxy->Ctx();
ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id);
ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal());

learner->SetParam("device", new_ctx.DeviceName());
HostDeviceVector<float>* out_predt_1{nullptr};
@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels = xgboost::linalg::Tensor<float, 2>{
info.labels = xgboost::linalg::Tensor<float, 2>{labels.cbegin(),
labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
labels.cend(),
{labels.size(), static_cast<std::size_t>(1)},
xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;

CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
std::vector<xgboost::bst_float> out_hess) {
xgboost::MetaInfo info;
info.num_row_ = labels.size();
info.labels = xgboost::linalg::Matrix<float>{
info.labels = xgboost::linalg::Matrix<float>{labels.cbegin(),
labels.cbegin(), labels.cend(), {labels.size(), static_cast<std::size_t>(1)}, -1};
labels.cend(),
{labels.size(), static_cast<std::size_t>(1)},
xgboost::DeviceOrd::CPU()};
info.weights_.HostVector() = weights;
info.group_ptr_ = groups;

@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric,
xgboost::DataSplitMode data_split_mode) {
return GetMultiMetricEval(
metric, preds,
xgboost::linalg::Tensor<float, 2>{labels.begin(), labels.end(), {labels.size()}, -1}, weights,
xgboost::linalg::Tensor<float, 2>{
groups, data_split_mode);
labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()},
weights, groups, data_split_mode);
}

double GetMultiMetricEval(xgboost::Metric* metric,
@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr<DMatrix> p_fmat) const
p_fmat->Info().labels.Data());
CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_);
p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_);
if (device_ != Context::kCpuId) {
if (device_.IsCUDA()) {
p_fmat->Info().labels.SetDevice(device_);
}
}
@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
v = dist(&lcg);
}
}
if (device_ >= 0) {
if (device_.IsCUDA()) {
out->SetDevice(device_);
out->DeviceSpan();
}
@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface(

std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
HostDeviceVector<float> const* storage, std::size_t n_samples, bst_feature_t n_features,
std::size_t batches, std::int32_t device) {
std::size_t batches, DeviceOrd device) {
std::vector<std::string> result(batches);
std::vector<Json> objects;

@ -267,7 +272,7 @@ std::pair<std::vector<std::string>, std::string> MakeArrayInterfaceBatch(
auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) {
Json array_interface{Object()};
array_interface["data"] = std::vector<Json>(2);
if (device >= 0) {
if (device.IsCUDA()) {
array_interface["data"][0] =
Integer(reinterpret_cast<int64_t>(storage->DevicePointer() + offset));
array_interface["stream"] = Null{};
@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR(
h_rptr.emplace_back(rptr);
}

if (device_ >= 0) {
if (device_.IsCUDA()) {
value->SetDevice(device_);
value->DeviceSpan();
row_ptr->SetDevice(device_);
@ -400,7 +405,7 @@ void RandomDataGenerator::GenerateCSR(
out->Info().labels.Reshape(this->rows_, this->n_targets_);
}
}
if (device_ >= 0) {
if (device_.IsCUDA()) {
out->Info().labels.SetDevice(device_);
out->Info().feature_types.SetDevice(device_);
for (auto const& page : out->GetBatches<SparsePage>()) {
@ -423,7 +428,7 @@ void RandomDataGenerator::GenerateCSR(
CHECK_GE(this->n_batches_, 1)
<< "Must set the n_batches before generating an external memory DMatrix.";
std::unique_ptr<ArrayIterForTest> iter;
if (device_ == Context::kCpuId) {
if (device_.IsCPU()) {
iter = std::make_unique<NumpyArrayIterForTest>(this->sparsity_, rows_, cols_, n_batches_);
} else {
#if defined(XGBOOST_USE_CUDA)
@ -487,7 +492,7 @@ int CudaArrayIterForTest::Next() {
NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols,
size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
rng_->Device(Context::kCpuId);
rng_->Device(DeviceOrd::CPU());
std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
}
@ -644,8 +649,8 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
labels[i] = i;
}
p_dmat->Info().labels =
linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, -1};
linalg::Tensor<float, 2>{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()};
linalg::Matrix<GradientPair> gpair({kRows}, ctx->Ordinal());
linalg::Matrix<GradientPair> gpair({kRows}, ctx->Device());
auto h_gpair = gpair.HostView();
for (size_t i = 0; i < kRows; ++i) {
h_gpair(i) = GradientPair{static_cast<float>(i), 1};
@ -674,7 +679,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector<float> c
CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches);
this->data_.Copy(data);
std::tie(batches_, interface_) =
MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id);
MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->Device());
}

ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); }
@ -9,7 +9,7 @@ namespace xgboost {
CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows,
size_t cols, size_t batches)
: ArrayIterForTest{sparsity, rows, cols, batches} {
rng_->Device(0);
rng_->Device(FstCU());
std::tie(batches_, interface_) =
rng_->GenerateArrayInterfaceBatch(&data_, n_batches_);
this->Reset();
@ -231,7 +231,7 @@ class RandomDataGenerator {
|
|||||||
|
|
||||||
bst_target_t n_targets_{1};
|
bst_target_t n_targets_{1};
|
||||||
|
|
||||||
std::int32_t device_{Context::kCpuId};
|
DeviceOrd device_{DeviceOrd::CPU()};
|
||||||
std::size_t n_batches_{0};
|
std::size_t n_batches_{0};
|
||||||
std::uint64_t seed_{0};
|
std::uint64_t seed_{0};
|
||||||
SimpleLCG lcg_;
|
SimpleLCG lcg_;
|
||||||
@ -256,7 +256,7 @@ class RandomDataGenerator {
|
|||||||
upper_ = v;
|
upper_ = v;
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
RandomDataGenerator& Device(int32_t d) {
|
RandomDataGenerator& Device(DeviceOrd d) {
|
||||||
device_ = d;
|
device_ = d;
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
@ -391,7 +391,7 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(std::string name, Args kwargs,
|
|||||||
* \brief Make a context that uses CUDA if device >= 0.
|
* \brief Make a context that uses CUDA if device >= 0.
|
||||||
*/
|
*/
|
||||||
inline Context MakeCUDACtx(std::int32_t device) {
|
inline Context MakeCUDACtx(std::int32_t device) {
|
||||||
if (device == Context::kCpuId) {
|
if (device == DeviceOrd::CPUOrdinal()) {
|
||||||
return Context{};
|
return Context{};
|
||||||
}
|
}
|
||||||
return Context{}.MakeCUDA(device);
|
return Context{}.MakeCUDA(device);
|
||||||
@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv);
* \brief Make learner model param
*/
inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups,
int32_t device = Context::kCpuId) {
DeviceOrd device = DeviceOrd::CPU()) {
size_t shape[1]{1};
LearnerModelParam mparam(n_features, linalg::Tensor<float, 1>{{base_score}, shape, device},
n_groups, 1, MultiStrategy::kOneOutputPerTree);
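For illustration only, a sketch of calling MakeMP with the new default; the argument values are arbitrary and not taken from the commit.

// MakeMP now defaults to DeviceOrd::CPU() instead of the integer Context::kCpuId.
auto mp_cpu  = xgboost::MakeMP(/*n_features=*/4, /*base_score=*/0.5f, /*n_groups=*/1);
auto mp_cuda = xgboost::MakeMP(4, 0.5f, 1, xgboost::DeviceOrd::CUDA(0));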
@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test {

class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{};

inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); }
} // namespace xgboost
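A small sketch, not part of this diff, of how the new FstCU helper reads in test code; the assert and the function name are purely illustrative.

#include <cassert>

// FstCU() names the first CUDA device and replaces bare `0` ordinals in the tests,
// e.g. rng_->Device(FstCU()) instead of rng_->Device(0).
void ExampleFstCU() {  // hypothetical function name
  assert(xgboost::FstCU() == xgboost::DeviceOrd::CUDA(0));
}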
@ -1,3 +1,8 @@
/**
 * Copyright 2020-2023, XGBoost contributors
 */
#pragma once

#if defined(__CUDACC__)
#include "../../src/data/ellpack_page.cuh"
#endif
@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts {
};
} // anonymous namespace

inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(int n_rows, int n_cols,
int n_rows, int n_cols, bst_float sparsity= 0) {
bst_float sparsity = 0) {
auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix();
const SparsePage& batch = *dmat->GetBatches<xgboost::SparsePage>().begin();
@ -49,7 +54,7 @@ inline std::unique_ptr<EllpackPageImpl> BuildEllpackPage(
}

auto page = std::unique_ptr<EllpackPageImpl>(
new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {}));
new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {}));

return page;
}
@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow)
// Invalid dataset
auto p_fmat = EmptyDMatrix();
MetaInfo& info = p_fmat->Info();
info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, -1};
info.labels = linalg::Tensor<float, 2>{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()};
float auc = metric->Evaluate({1, 1}, p_fmat);
ASSERT_TRUE(std::isnan(auc));
*info.labels.Data() = HostDeviceVector<float>{};