Remove internal use of gpu_id. (#9568)
This commit is contained in:
@@ -123,7 +123,7 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
|
||||
[=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
|
||||
}
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan) {
|
||||
@@ -240,13 +240,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c
|
||||
sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple {
|
||||
return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size.
|
||||
});
|
||||
detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature,
|
||||
detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature,
|
||||
IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr,
|
||||
&column_sizes_scan);
|
||||
auto d_cuts_ptr = cuts_ptr.DeviceSpan();
|
||||
if (sketch_container->HasCategorical()) {
|
||||
auto p_weight = entry_weight.empty() ? nullptr : &entry_weight;
|
||||
detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight,
|
||||
detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight,
|
||||
&column_sizes_scan);
|
||||
}
|
||||
|
||||
@@ -347,7 +347,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
|
||||
|
||||
HistogramCuts cuts;
|
||||
SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_,
|
||||
ctx->Ordinal());
|
||||
ctx->Device());
|
||||
CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty());
|
||||
for (const auto& page : p_fmat->GetBatches<SparsePage>()) {
|
||||
std::size_t page_nnz = page.data.Size();
|
||||
|
||||
@@ -82,9 +82,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan<BatchIt> batch_iter,
|
||||
}
|
||||
|
||||
template <std::uint32_t kBlockThreads, typename Kernel>
|
||||
std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) {
|
||||
std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) {
|
||||
int n_mps = 0;
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
|
||||
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal));
|
||||
int n_blocks_per_mp = 0;
|
||||
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
|
||||
kBlockThreads, shared_mem));
|
||||
@@ -106,11 +106,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s
|
||||
* \param out_column_size Output buffer for the size of each column.
|
||||
*/
|
||||
template <typename BatchIt, bool force_use_global_memory = false, bool force_use_u64 = false>
|
||||
void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter,
|
||||
void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan<BatchIt> batch_iter,
|
||||
data::IsValidFunctor is_valid, Span<std::size_t> out_column_size) {
|
||||
thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0);
|
||||
|
||||
std::size_t max_shared_memory = dh::MaxSharedMemory(device);
|
||||
std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal);
|
||||
// Not strictly correct as we should use number of samples to determine the type of
|
||||
// counter. However, the sample size is not known due to sliding window on number of
|
||||
// elements.
|
||||
@@ -154,7 +154,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan<BatchIt> batch_iter
|
||||
}
|
||||
|
||||
template <typename BatchIt>
|
||||
void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature,
|
||||
void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature,
|
||||
IterSpan<BatchIt> batch_iter, data::IsValidFunctor is_valid,
|
||||
HostDeviceVector<SketchContainer::OffsetT>* cuts_ptr,
|
||||
dh::caching_device_vector<size_t>* column_sizes_scan) {
|
||||
@@ -215,7 +215,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
|
||||
// Count the valid entries in each column and copy them out.
|
||||
template <typename AdapterBatch, typename BatchIter>
|
||||
void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range,
|
||||
float missing, size_t columns, size_t cuts_per_feature, int device,
|
||||
float missing, size_t columns, size_t cuts_per_feature,
|
||||
DeviceOrd device,
|
||||
HostDeviceVector<SketchContainer::OffsetT>* cut_sizes_scan,
|
||||
dh::caching_device_vector<size_t>* column_sizes_scan,
|
||||
dh::device_vector<Entry>* sorted_entries) {
|
||||
@@ -239,7 +240,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran
|
||||
void SortByWeight(dh::device_vector<float>* weights,
|
||||
dh::device_vector<Entry>* sorted_entries);
|
||||
|
||||
void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
|
||||
dh::device_vector<Entry>* p_sorted_entries,
|
||||
dh::device_vector<float>* p_sorted_weights,
|
||||
dh::caching_device_vector<size_t>* p_column_sizes_scan);
|
||||
@@ -277,7 +278,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t
|
||||
|
||||
template <typename AdapterBatch>
|
||||
void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
int device, size_t columns, size_t begin, size_t end,
|
||||
DeviceOrd device, size_t columns, size_t begin, size_t end,
|
||||
float missing, SketchContainer *sketch_container,
|
||||
int num_cuts) {
|
||||
// Copy current subset of valid elements into temporary storage and sort
|
||||
@@ -316,11 +317,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
|
||||
template <typename Batch>
|
||||
void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
|
||||
int num_cuts_per_feature,
|
||||
bool is_ranking, float missing, int device,
|
||||
bool is_ranking, float missing, DeviceOrd device,
|
||||
size_t columns, size_t begin, size_t end,
|
||||
SketchContainer *sketch_container) {
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
info.weights_.SetDevice(device);
|
||||
auto weights = info.weights_.ConstDeviceSpan();
|
||||
|
||||
@@ -412,14 +413,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
size_t num_rows = batch.NumRows();
|
||||
size_t num_cols = batch.NumCols();
|
||||
size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows);
|
||||
int32_t device = sketch_container->DeviceIdx();
|
||||
auto device = sketch_container->DeviceIdx();
|
||||
bool weighted = !info.weights_.Empty();
|
||||
|
||||
if (weighted) {
|
||||
sketch_batch_num_elements = detail::SketchBatchNumElements(
|
||||
sketch_batch_num_elements,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, true);
|
||||
device.ordinal, num_cuts_per_feature, true);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
@@ -432,7 +433,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins,
|
||||
sketch_batch_num_elements = detail::SketchBatchNumElements(
|
||||
sketch_batch_num_elements,
|
||||
num_rows, num_cols, std::numeric_limits<size_t>::max(),
|
||||
device, num_cuts_per_feature, false);
|
||||
device.ordinal, num_cuts_per_feature, false);
|
||||
for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) {
|
||||
size_t end =
|
||||
std::min(batch.Size(), static_cast<std::size_t>(begin + sketch_batch_num_elements));
|
||||
|
||||
@@ -33,19 +33,19 @@ struct HostDeviceVectorImpl {
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(size, v);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int)
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd)
|
||||
: impl_(nullptr) {
|
||||
impl_ = new HostDeviceVectorImpl<T>(init);
|
||||
}
|
||||
@@ -81,7 +81,7 @@ template <typename T>
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return -1; }
|
||||
DeviceOrd HostDeviceVector<T>::Device() const { return DeviceOrd::CPU(); }
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer() { return nullptr; }
|
||||
@@ -165,9 +165,6 @@ bool HostDeviceVector<T>::DeviceCanWrite() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(int) const {}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(DeviceOrd) const {}
|
||||
|
||||
|
||||
@@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
|
||||
template <typename T>
|
||||
class HostDeviceVectorImpl {
|
||||
public:
|
||||
HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) {
|
||||
if (device >= 0) {
|
||||
HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) {
|
||||
if (device.IsCUDA()) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
data_d_->resize(size, v);
|
||||
@@ -37,8 +37,8 @@ class HostDeviceVectorImpl {
|
||||
|
||||
// Initializer can be std::vector<T> or std::initializer_list<T>
|
||||
template <class Initializer>
|
||||
HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) {
|
||||
if (device >= 0) {
|
||||
HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) {
|
||||
if (device.IsCUDA()) {
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
LazyResizeDevice(init.size());
|
||||
Copy(init);
|
||||
@@ -54,16 +54,16 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_{that.gpu_access_} {}
|
||||
|
||||
~HostDeviceVectorImpl() {
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
SetDevice();
|
||||
}
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
[[nodiscard]] size_t Size() const {
|
||||
return HostCanRead() ? data_h_.size() : data_d_ ? data_d_->size() : 0;
|
||||
}
|
||||
|
||||
int DeviceIdx() const { return device_; }
|
||||
[[nodiscard]] DeviceOrd Device() const { return device_; }
|
||||
|
||||
T* DevicePointer() {
|
||||
LazySyncDevice(GPUAccess::kWrite);
|
||||
@@ -138,7 +138,7 @@ class HostDeviceVectorImpl {
|
||||
} else {
|
||||
auto ptr = other->ConstDevicePointer();
|
||||
SetDevice();
|
||||
CHECK_EQ(this->DeviceIdx(), other->DeviceIdx());
|
||||
CHECK_EQ(this->Device(), other->Device());
|
||||
dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
|
||||
ptr,
|
||||
other->Size() * sizeof(T),
|
||||
@@ -156,24 +156,25 @@ class HostDeviceVectorImpl {
|
||||
return data_h_;
|
||||
}
|
||||
|
||||
void SetDevice(int device) {
|
||||
void SetDevice(DeviceOrd device) {
|
||||
if (device_ == device) { return; }
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
LazySyncHost(GPUAccess::kNone);
|
||||
}
|
||||
|
||||
if (device_ >= 0 && device >= 0) {
|
||||
CHECK_EQ(device_, device) << "New device ordinal is different from previous one.";
|
||||
if (device_.IsCUDA() && device.IsCUDA()) {
|
||||
CHECK_EQ(device_.ordinal, device.ordinal)
|
||||
<< "New device ordinal is different from previous one.";
|
||||
}
|
||||
device_ = device;
|
||||
if (device_ >= 0) {
|
||||
if (device_.IsCUDA()) {
|
||||
LazyResizeDevice(data_h_.size());
|
||||
}
|
||||
}
|
||||
|
||||
void Resize(size_t new_size, T v) {
|
||||
if (new_size == Size()) { return; }
|
||||
if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) {
|
||||
if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
|
||||
// fast on-device resize
|
||||
gpu_access_ = GPUAccess::kWrite;
|
||||
SetDevice();
|
||||
@@ -218,16 +219,16 @@ class HostDeviceVectorImpl {
|
||||
gpu_access_ = access;
|
||||
}
|
||||
|
||||
bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
|
||||
bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
|
||||
bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
|
||||
bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
|
||||
bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
|
||||
bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
|
||||
GPUAccess Access() const { return gpu_access_; }
|
||||
[[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; }
|
||||
[[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); }
|
||||
[[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); }
|
||||
[[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; }
|
||||
[[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); }
|
||||
[[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); }
|
||||
[[nodiscard]] GPUAccess Access() const { return gpu_access_; }
|
||||
|
||||
private:
|
||||
int device_{-1};
|
||||
DeviceOrd device_{DeviceOrd::CPU()};
|
||||
std::vector<T> data_h_{};
|
||||
std::unique_ptr<dh::device_vector<T>> data_d_{};
|
||||
GPUAccess gpu_access_{GPUAccess::kNone};
|
||||
@@ -259,11 +260,11 @@ class HostDeviceVectorImpl {
|
||||
}
|
||||
|
||||
void SetDevice() {
|
||||
CHECK_GE(device_, 0);
|
||||
CHECK_GE(device_.ordinal, 0);
|
||||
if (cudaSetDeviceHandler == nullptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
} else {
|
||||
(*cudaSetDeviceHandler)(device_);
|
||||
(*cudaSetDeviceHandler)(device_.ordinal);
|
||||
}
|
||||
|
||||
if (!data_d_) {
|
||||
@@ -273,15 +274,15 @@ class HostDeviceVectorImpl {
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(size_t size, T v, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(size, v, device)) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(std::initializer_list<T> init, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
|
||||
|
||||
template <typename T>
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, int device)
|
||||
HostDeviceVector<T>::HostDeviceVector(const std::vector<T>& init, DeviceOrd device)
|
||||
: impl_(new HostDeviceVectorImpl<T>(init, device)) {}
|
||||
|
||||
template <typename T>
|
||||
@@ -309,7 +310,9 @@ template <typename T>
|
||||
size_t HostDeviceVector<T>::Size() const { return impl_->Size(); }
|
||||
|
||||
template <typename T>
|
||||
int HostDeviceVector<T>::DeviceIdx() const { return impl_->DeviceIdx(); }
|
||||
DeviceOrd HostDeviceVector<T>::Device() const {
|
||||
return impl_->Device();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* HostDeviceVector<T>::DevicePointer() {
|
||||
@@ -389,14 +392,9 @@ GPUAccess HostDeviceVector<T>::DeviceAccess() const {
|
||||
return impl_->Access();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(int device) const {
|
||||
impl_->SetDevice(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
|
||||
impl_->SetDevice(device.ordinal);
|
||||
impl_->SetDevice(device);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
@@ -8,16 +8,12 @@
|
||||
#include "xgboost/context.h" // Context
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace cuda_impl {
|
||||
namespace xgboost::common::cuda_impl {
|
||||
double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
|
||||
values.SetDevice(ctx->gpu_id);
|
||||
values.SetDevice(ctx->Device());
|
||||
auto const d_values = values.ConstDeviceSpan();
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
|
||||
thrust::plus<float>{});
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common::cuda_impl
|
||||
|
||||
@@ -24,7 +24,7 @@ struct OptionalWeights {
|
||||
inline OptionalWeights MakeOptionalWeights(Context const* ctx,
|
||||
HostDeviceVector<float> const& weights) {
|
||||
if (ctx->IsCUDA()) {
|
||||
weights.SetDevice(ctx->gpu_id);
|
||||
weights.SetDevice(ctx->Device());
|
||||
}
|
||||
return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()};
|
||||
}
|
||||
|
||||
@@ -207,10 +207,10 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
|
||||
// summary does the output element come from) result by definition of merged rank. So we
|
||||
// run it in 2 passes to obtain the merge path and then customize the standard merge
|
||||
// algorithm.
|
||||
void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
void MergeImpl(DeviceOrd device, Span<SketchEntry const> const &d_x,
|
||||
Span<bst_row_t const> const &x_ptr, Span<SketchEntry const> const &d_y,
|
||||
Span<bst_row_t const> const &y_ptr, Span<SketchEntry> out, Span<bst_row_t> out_ptr) {
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
dh::safe_cuda(cudaSetDevice(device.ordinal));
|
||||
CHECK_EQ(d_x.size() + d_y.size(), out.size());
|
||||
CHECK_EQ(x_ptr.size(), out_ptr.size());
|
||||
CHECK_EQ(y_ptr.size(), out_ptr.size());
|
||||
@@ -308,7 +308,7 @@ void MergeImpl(int32_t device, Span<SketchEntry const> const &d_x,
|
||||
void SketchContainer::Push(Span<Entry const> entries, Span<size_t> columns_ptr,
|
||||
common::Span<OffsetT> cuts_ptr,
|
||||
size_t total_cuts, Span<float> weights) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
Span<SketchEntry> out;
|
||||
dh::device_vector<SketchEntry> cuts;
|
||||
bool first_window = this->Current().empty();
|
||||
@@ -367,7 +367,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
|
||||
* pruning or merging. We preserve the first type and remove the second type.
|
||||
*/
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1);
|
||||
dh::XGBCachingDeviceAllocator<char> alloc;
|
||||
|
||||
@@ -407,7 +407,7 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
|
||||
|
||||
void SketchContainer::Prune(size_t to) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
|
||||
OffsetT to_total = 0;
|
||||
auto& h_columns_ptr = columns_ptr_b_.HostVector();
|
||||
@@ -442,7 +442,7 @@ void SketchContainer::Prune(size_t to) {
|
||||
|
||||
void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
Span<SketchEntry const> that) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
timer_.Start(__func__);
|
||||
if (this->Current().size() == 0) {
|
||||
CHECK_EQ(this->columns_ptr_.HostVector().back(), 0);
|
||||
@@ -477,7 +477,7 @@ void SketchContainer::Merge(Span<OffsetT const> d_that_columns_ptr,
|
||||
}
|
||||
|
||||
void SketchContainer::FixError() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan();
|
||||
auto in = dh::ToSpan(this->Current());
|
||||
dh::LaunchN(in.size(), [=] __device__(size_t idx) {
|
||||
@@ -502,7 +502,7 @@ void SketchContainer::FixError() {
|
||||
}
|
||||
|
||||
void SketchContainer::AllReduce(bool is_column_split) {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
auto world = collective::GetWorldSize();
|
||||
if (world == 1 || is_column_split) {
|
||||
return;
|
||||
@@ -529,15 +529,15 @@ void SketchContainer::AllReduce(bool is_column_split) {
|
||||
auto offset = rank * d_columns_ptr.size();
|
||||
thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(),
|
||||
gathered_ptrs.begin() + offset);
|
||||
collective::AllReduce<collective::Operation::kSum>(device_, gathered_ptrs.data().get(),
|
||||
collective::AllReduce<collective::Operation::kSum>(device_.ordinal, gathered_ptrs.data().get(),
|
||||
gathered_ptrs.size());
|
||||
|
||||
// Get the data from all workers.
|
||||
std::vector<size_t> recv_lengths;
|
||||
dh::caching_device_vector<char> recvbuf;
|
||||
collective::AllGatherV(device_, this->Current().data().get(),
|
||||
collective::AllGatherV(device_.ordinal, this->Current().data().get(),
|
||||
dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf);
|
||||
collective::Synchronize(device_);
|
||||
collective::Synchronize(device_.ordinal);
|
||||
|
||||
// Segment the received data.
|
||||
auto s_recvbuf = dh::ToSpan(recvbuf);
|
||||
@@ -584,7 +584,7 @@ struct InvalidCatOp {
|
||||
|
||||
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
p_cuts->min_vals_.Resize(num_columns_);
|
||||
|
||||
// Sync between workers.
|
||||
|
||||
@@ -41,7 +41,7 @@ class SketchContainer {
|
||||
bst_row_t num_rows_;
|
||||
bst_feature_t num_columns_;
|
||||
int32_t num_bins_;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
|
||||
// Double buffer as neither prune nor merge can be performed inplace.
|
||||
dh::device_vector<SketchEntry> entries_a_;
|
||||
@@ -93,35 +93,32 @@ class SketchContainer {
|
||||
* \param num_rows Total number of rows in known dataset (typically the rows in current worker).
|
||||
* \param device GPU ID.
|
||||
*/
|
||||
SketchContainer(HostDeviceVector<FeatureType> const &feature_types,
|
||||
int32_t max_bin, bst_feature_t num_columns,
|
||||
bst_row_t num_rows, int32_t device)
|
||||
: num_rows_{num_rows},
|
||||
num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
||||
CHECK_GE(device, 0);
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(num_columns + 1);
|
||||
this->columns_ptr_b_.SetDevice(device_);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1);
|
||||
SketchContainer(HostDeviceVector<FeatureType> const& feature_types, int32_t max_bin,
|
||||
bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device)
|
||||
: num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} {
|
||||
CHECK(device.IsCUDA());
|
||||
// Initialize Sketches for this dmatrix
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
this->columns_ptr_.Resize(num_columns + 1);
|
||||
this->columns_ptr_b_.SetDevice(device_);
|
||||
this->columns_ptr_b_.Resize(num_columns + 1);
|
||||
|
||||
this->feature_types_.Resize(feature_types.Size());
|
||||
this->feature_types_.Copy(feature_types);
|
||||
// Pull to device.
|
||||
this->feature_types_.SetDevice(device);
|
||||
this->feature_types_.ConstDeviceSpan();
|
||||
this->feature_types_.ConstHostSpan();
|
||||
this->feature_types_.Resize(feature_types.Size());
|
||||
this->feature_types_.Copy(feature_types);
|
||||
// Pull to device.
|
||||
this->feature_types_.SetDevice(device);
|
||||
this->feature_types_.ConstDeviceSpan();
|
||||
this->feature_types_.ConstHostSpan();
|
||||
|
||||
auto d_feature_types = feature_types_.ConstDeviceSpan();
|
||||
has_categorical_ =
|
||||
!d_feature_types.empty() &&
|
||||
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types),
|
||||
common::IsCatOp{});
|
||||
auto d_feature_types = feature_types_.ConstDeviceSpan();
|
||||
has_categorical_ =
|
||||
!d_feature_types.empty() &&
|
||||
thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{});
|
||||
|
||||
timer_.Init(__func__);
|
||||
}
|
||||
timer_.Init(__func__);
|
||||
}
|
||||
/* \brief Return GPU ID for this container. */
|
||||
int32_t DeviceIdx() const { return device_; }
|
||||
[[nodiscard]] DeviceOrd DeviceIdx() const { return device_; }
|
||||
/* \brief Whether the predictor matrix contains categorical features. */
|
||||
bool HasCategorical() const { return has_categorical_; }
|
||||
/* \brief Accumulate weights of duplicated entries in input. */
|
||||
@@ -175,7 +172,7 @@ class SketchContainer {
|
||||
template <typename KeyComp = thrust::equal_to<size_t>>
|
||||
size_t Unique(KeyComp key_comp = thrust::equal_to<size_t>{}) {
|
||||
timer_.Start(__func__);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
this->columns_ptr_.SetDevice(device_);
|
||||
Span<OffsetT> d_column_scan = this->columns_ptr_.DeviceSpan();
|
||||
CHECK_EQ(d_column_scan.size(), num_columns_ + 1);
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
#include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply
|
||||
#include "xgboost/logging.h" // CHECK_EQ
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace xgboost::common {
|
||||
void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
|
||||
HostDeviceVector<float> const& weights, linalg::Tensor<float, 1>* out) {
|
||||
if (!ctx->IsCPU()) {
|
||||
@@ -46,8 +45,8 @@ void Median(Context const* ctx, linalg::Tensor<float, 2> const& t,
|
||||
}
|
||||
|
||||
void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<float>* out) {
|
||||
v.SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
v.SetDevice(ctx->Device());
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(1);
|
||||
|
||||
if (ctx->IsCPU()) {
|
||||
@@ -62,5 +61,4 @@ void Mean(Context const* ctx, linalg::Vector<float> const& v, linalg::Vector<flo
|
||||
cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device()));
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common
|
||||
|
||||
@@ -15,14 +15,12 @@
|
||||
#include "xgboost/host_device_vector.h" // HostDeviceVector
|
||||
#include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
namespace cuda_impl {
|
||||
namespace xgboost::common::cuda_impl {
|
||||
void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
|
||||
common::OptionalWeights weights, linalg::Tensor<float, 1>* out) {
|
||||
CHECK_GE(t.Shape(1), 1);
|
||||
HostDeviceVector<std::size_t> segments(t.Shape(1) + 1, 0);
|
||||
segments.SetDevice(ctx->gpu_id);
|
||||
segments.SetDevice(ctx->Device());
|
||||
auto d_segments = segments.DeviceSpan();
|
||||
dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(),
|
||||
[=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; });
|
||||
@@ -31,7 +29,7 @@ void Median(Context const* ctx, linalg::TensorView<float const, 2> t,
|
||||
return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape()));
|
||||
});
|
||||
|
||||
out->SetDevice(ctx->gpu_id);
|
||||
out->SetDevice(ctx->Device());
|
||||
out->Reshape(t.Shape(1));
|
||||
if (weights.Empty()) {
|
||||
common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it,
|
||||
@@ -60,6 +58,4 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
|
||||
dh::TemporaryArray<char> temp{bytes};
|
||||
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
|
||||
}
|
||||
} // namespace cuda_impl
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::common::cuda_impl
|
||||
|
||||
@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
|
||||
auto d_sorted_idx = dh::ToSpan(sorted_idx);
|
||||
auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
|
||||
|
||||
quantiles->SetDevice(ctx->gpu_id);
|
||||
quantiles->SetDevice(ctx->Device());
|
||||
quantiles->Resize(n_segments);
|
||||
auto d_results = quantiles->DeviceSpan();
|
||||
|
||||
@@ -220,7 +220,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
|
||||
scan_val, weights_cdf.begin());
|
||||
|
||||
auto n_segments = std::distance(seg_beg, seg_end) - 1;
|
||||
quantiles->SetDevice(ctx->gpu_id);
|
||||
quantiles->SetDevice(ctx->Device());
|
||||
quantiles->Resize(n_segments);
|
||||
auto d_results = quantiles->DeviceSpan();
|
||||
auto d_weight_cdf = dh::ToSpan(weights_cdf);
|
||||
|
||||
@@ -60,8 +60,8 @@ class Transform {
|
||||
template <typename Functor>
|
||||
struct Evaluator {
|
||||
public:
|
||||
Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx)
|
||||
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {}
|
||||
Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device)
|
||||
: func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {}
|
||||
|
||||
/*!
|
||||
* \brief Evaluate the functor with input pointers to HostDeviceVector.
|
||||
@@ -71,7 +71,7 @@ class Transform {
|
||||
*/
|
||||
template <typename... HDV>
|
||||
void Eval(HDV... vectors) const {
|
||||
bool on_device = device_ >= 0;
|
||||
bool on_device = device_.IsCUDA();
|
||||
|
||||
if (on_device) {
|
||||
LaunchCUDA(func_, vectors...);
|
||||
@@ -116,11 +116,11 @@ class Transform {
|
||||
}
|
||||
// Recursive unpack for Shard.
|
||||
template <typename T>
|
||||
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
|
||||
void UnpackShard(DeviceOrd device, const HostDeviceVector<T> *vector) const {
|
||||
vector->SetDevice(device);
|
||||
}
|
||||
template <typename Head, typename... Rest>
|
||||
void UnpackShard(int device,
|
||||
void UnpackShard(DeviceOrd device,
|
||||
const HostDeviceVector<Head> *_vector,
|
||||
const HostDeviceVector<Rest> *... _vectors) const {
|
||||
_vector->SetDevice(device);
|
||||
@@ -140,7 +140,7 @@ class Transform {
|
||||
// granularity is used in data vector.
|
||||
size_t shard_size = range_size;
|
||||
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
dh::safe_cuda(cudaSetDevice(device_.ordinal));
|
||||
const int kGrids =
|
||||
static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
|
||||
if (kGrids == 0) {
|
||||
@@ -174,7 +174,7 @@ class Transform {
|
||||
/*! \brief Range object specifying parallel threads index range. */
|
||||
Range range_;
|
||||
int32_t n_threads_;
|
||||
int32_t device_;
|
||||
DeviceOrd device_;
|
||||
};
|
||||
|
||||
public:
|
||||
@@ -192,8 +192,8 @@ class Transform {
|
||||
*/
|
||||
template <typename Functor>
|
||||
static Evaluator<Functor> Init(Functor func, Range const range, int32_t n_threads,
|
||||
int32_t device_idx) {
|
||||
return Evaluator<Functor>{func, std::move(range), n_threads, device_idx};
|
||||
DeviceOrd device) {
|
||||
return Evaluator<Functor>{func, std::move(range), n_threads, device};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user