add namespace aliases to reduce code duplication

commit 4a4b528d54
parent e00131c465
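The whole commit rests on one C++ trick: rocThrust ships the same algorithms as NVIDIA Thrust, but exposes its execution policies under thrust::hip rather than thrust::cuda. Declaring a namespace alias once per translation unit lets every call site keep the thrust::cuda spelling, so the per-call #elif defined(XGBOOST_USE_HIP) branches removed below become unnecessary. A minimal standalone sketch of the idea (XGBOOST_USE_HIP comes from the build system; SortValues is invented for illustration):

    #include <thrust/execution_policy.h>
    #include <thrust/device_vector.h>
    #include <thrust/sort.h>

    #ifdef XGBOOST_USE_HIP
    namespace thrust {
    // On ROCm builds thrust::cuda does not exist, so the alias is unambiguous;
    // on CUDA builds this block is compiled out entirely.
    namespace cuda = thrust::hip;
    }  // namespace thrust
    #endif

    void SortValues(thrust::device_vector<float>* values) {
      // One spelling for both backends: thrust::cuda::par resolves to
      // thrust::hip::par when the alias above is active.
      thrust::sort(thrust::cuda::par, values->begin(), values->end());
    }

On CUDA builds the alias block compiles away and thrust::cuda refers to the real namespace, so both backends share a single spelling at every call site.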
@@ -6,6 +6,12 @@
 #include <thrust/execution_policy.h>
 #include "device_helpers.cuh"
 
+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 struct CUDAContext {
  private:
@@ -17,37 +23,21 @@ struct CUDAContext {
    * \brief Caching thrust policy.
    */
   auto CTP() const {
-#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
     return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream());
 #else
     return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream());
 #endif  // THRUST_MAJOR_VERSION >= 2
-#elif defined(XGBOOST_USE_HIP)
-#if THRUST_MAJOR_VERSION >= 2
-    return thrust::hip::par_nosync(caching_alloc_).on(dh::DefaultStream());
-#else
-    return thrust::hip::par(caching_alloc_).on(dh::DefaultStream());
-#endif  // THRUST_MAJOR_VERSION >= 2
-#endif
   }
   /**
    * \brief Thrust policy without caching allocator.
    */
   auto TP() const {
-#if defined(XGBOOST_USE_CUDA)
 #if THRUST_MAJOR_VERSION >= 2
     return thrust::cuda::par_nosync(alloc_).on(dh::DefaultStream());
 #else
     return thrust::cuda::par(alloc_).on(dh::DefaultStream());
 #endif  // THRUST_MAJOR_VERSION >= 2
-#elif defined(XGBOOST_USE_HIP)
-#if THRUST_MAJOR_VERSION >= 2
-    return thrust::hip::par_nosync(alloc_).on(dh::DefaultStream());
-#else
-    return thrust::hip::par(alloc_).on(dh::DefaultStream());
-#endif  // THRUST_MAJOR_VERSION >= 2
-#endif
   }
   auto Stream() const { return dh::DefaultStream(); }
 };
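For orientation, a hedged sketch of how a caller might consume these policy getters; SumAbs and its arguments are invented, while CTP() is the method defined above (building device lambdas requires nvcc/hipcc with extended-lambda support):

    #include <thrust/device_vector.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/functional.h>
    #include <cmath>

    double SumAbs(xgboost::CUDAContext const& cu_ctx,
                  thrust::device_vector<double> const& v) {
      // The policy returned by CTP() already carries the caching allocator and
      // the default stream, so call sites stay one-liners on CUDA and HIP alike.
      return thrust::transform_reduce(
          cu_ctx.CTP(), v.begin(), v.end(),
          [] __device__(double x) { return fabs(x); }, 0.0,
          thrust::plus<double>{});
    }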
@@ -26,6 +26,12 @@
 #include "quantile.h"
 #include "xgboost/host_device_vector.h"
 
+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::common {
 constexpr float SketchContainer::kFactor;
 
@@ -112,7 +118,6 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
   // Sort both entries and weights.
   dh::XGBDeviceAllocator<char> alloc;
   CHECK_EQ(weights->size(), sorted_entries->size());
-#if defined(XGBOOST_USE_CUDA)
   thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(),
                       weights->begin(), detail::EntryCompareOp());
 
@@ -122,17 +127,6 @@ void SortByWeight(dh::device_vector<float>* weights, dh::device_vector<Entry>* s
       thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
       weights->begin(),
       [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::sort_by_key(thrust::hip::par(alloc), sorted_entries->begin(), sorted_entries->end(),
-                      weights->begin(), detail::EntryCompareOp());
-
-  // Scan weights
-  dh::XGBCachingDeviceAllocator<char> caching;
-  thrust::inclusive_scan_by_key(
-      thrust::hip::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(),
-      weights->begin(),
-      [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; });
-#endif
 }
 
 void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span<bst_row_t> d_cuts_ptr,
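SortByWeight pairs a key sort with a segmented inclusive scan over the weights. A self-contained illustration of that pattern on Thrust's host backend (toy data, no GPU needed):

    #include <thrust/host_vector.h>
    #include <thrust/sort.h>
    #include <thrust/scan.h>
    #include <cstddef>
    #include <cstdio>

    int main() {
      int k[] = {2, 1, 1, 2};
      float w[] = {1.f, 2.f, 3.f, 4.f};
      thrust::host_vector<int> keys(k, k + 4);
      thrust::host_vector<float> weights(w, w + 4);
      // Sort weights alongside their keys, as SortByWeight sorts weights
      // alongside entries.
      thrust::sort_by_key(keys.begin(), keys.end(), weights.begin());
      // Accumulate weights within each run of equal keys (the "scan weights" step).
      thrust::inclusive_scan_by_key(keys.begin(), keys.end(), weights.begin(),
                                    weights.begin());
      for (std::size_t i = 0; i < weights.size(); ++i) {
        std::printf("%g ", static_cast<double>(weights[i]));  // prints: 2 5 1 5
      }
      std::printf("\n");
    }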
@@ -19,6 +19,10 @@
 
 #if defined(XGBOOST_USE_HIP)
 namespace cub = hipcub;
+
+namespace thrust {
+namespace cuda = thrust::hip;
+}
 #endif
 
 namespace xgboost::common {
@@ -175,18 +179,10 @@ void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cu
     return thrust::min(num_cuts_per_feature, column_size);
   });
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan(thrust::cuda::par(alloc), cut_ptr_it,
                          cut_ptr_it + column_sizes_scan->size(), cuts_ptr->DevicePointer());
   thrust::exclusive_scan(thrust::cuda::par(alloc), column_sizes_scan->begin(),
                          column_sizes_scan->end(), column_sizes_scan->begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan(thrust::hip::par(alloc), cut_ptr_it,
-                         cut_ptr_it + column_sizes_scan->size(),
-                         cuts_ptr->DevicePointer());
-  thrust::exclusive_scan(thrust::hip::par(alloc), column_sizes_scan->begin(),
-                         column_sizes_scan->end(), column_sizes_scan->begin());
-#endif
 }
 
 inline size_t constexpr BytesPerElement(bool has_weight) {
@@ -309,13 +305,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info,
                           &sorted_entries);
   dh::XGBDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::sort(thrust::hip::par(alloc), sorted_entries.begin(),
-               sorted_entries.end(), detail::EntryCompareOp());
-#endif
 
   if (sketch_container->HasCategorical()) {
     auto d_cuts_ptr = cuts_ptr.DeviceSpan();
@@ -374,19 +365,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[group_idx];
     });
 
-#if defined(XGBOOST_USE_CUDA)
     auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
-#endif
 
     CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
   } else {
@@ -397,19 +380,11 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info,
       return weights[batch.GetElement(idx).row_idx];
     });
 
-#if defined(XGBOOST_USE_CUDA)
     auto retit = thrust::copy_if(thrust::cuda::par(alloc),
                                  weight_iter + begin, weight_iter + end,
                                  batch_iter + begin,
                                  d_temp_weights.data(),  // output
                                  is_valid);
-#elif defined(XGBOOST_USE_HIP)
-    auto retit = thrust::copy_if(thrust::hip::par(alloc),
-                                 weight_iter + begin, weight_iter + end,
-                                 batch_iter + begin,
-                                 d_temp_weights.data(),  // output
-                                 is_valid);
-#endif
 
     CHECK_EQ(retit - d_temp_weights.data(), d_temp_weights.size());
   }
@@ -8,18 +8,19 @@
 #include "xgboost/context.h"  // Context
 #include "xgboost/host_device_vector.h"  // HostDeviceVector
 
+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::common::cuda_impl {
 double Reduce(Context const* ctx, HostDeviceVector<float> const& values) {
   values.SetDevice(ctx->Device());
   auto const d_values = values.ConstDeviceSpan();
   dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
   return dh::Reduce(thrust::cuda::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
                     thrust::plus<float>{});
-#elif defined(XGBOOST_USE_HIP)
-  return dh::Reduce(thrust::hip::par(alloc), dh::tcbegin(d_values), dh::tcend(d_values), 0.0,
-                    thrust::plus<float>{});
-#endif
 }
 }  // namespace xgboost::common::cuda_impl
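The recurring thrust::cuda::par(alloc) spelling attaches one of XGBoost's device allocators to the execution policy, so Thrust's internal scratch memory is requested through it. A hedged sketch of the mechanism with an invented pass-through allocator standing in for dh::XGBCachingDeviceAllocator (the real one serves allocations from a pool instead of the driver):

    #include <thrust/device_malloc.h>
    #include <thrust/device_free.h>
    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <cstddef>

    // Minimal allocator shape accepted by par(...); invented for illustration.
    struct PassThroughAlloc {
      using value_type = char;
      char* allocate(std::ptrdiff_t num_bytes) {
        // A caching allocator would serve this from a pool instead.
        return thrust::device_malloc<char>(num_bytes).get();
      }
      void deallocate(char* ptr, std::size_t) {
        thrust::device_free(thrust::device_pointer_cast(ptr));
      }
    };

    double Sum(thrust::device_vector<float> const& v) {
      PassThroughAlloc alloc;
      // Scratch buffers needed by the reduction come through `alloc`;
      // the algorithm call itself is unchanged.
      return thrust::reduce(thrust::cuda::par(alloc), v.begin(), v.end(), 0.0,
                            thrust::plus<double>{});
    }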
@@ -22,6 +22,12 @@
 #include "transform_iterator.h"  // MakeIndexTransformIter
 #include "xgboost/span.h"
 
+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {
 
@@ -147,7 +153,6 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
   // We reuse the memory for storing merge path.
   common::Span<Tuple> merge_path{reinterpret_cast<Tuple *>(out.data()), out.size()};
   // Determine the merge path, 0 if element is from x, 1 if it's from y.
-#if defined(XGBOOST_USE_CUDA)
   thrust::merge_by_key(
       thrust::cuda::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(),
       y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it,
@@ -160,36 +165,15 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
         }
         return l_column_id < r_column_id;
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::merge_by_key(
-      thrust::hip::par(alloc), x_merge_key_it, x_merge_key_it + d_x.size(),
-      y_merge_key_it, y_merge_key_it + d_y.size(), x_merge_val_it,
-      y_merge_val_it, thrust::make_discard_iterator(), merge_path.data(),
-      [=] __device__(auto const &l, auto const &r) -> bool {
-        auto l_column_id = thrust::get<0>(l);
-        auto r_column_id = thrust::get<0>(r);
-        if (l_column_id == r_column_id) {
-          return thrust::get<1>(l).value < thrust::get<1>(r).value;
-        }
-        return l_column_id < r_column_id;
-      });
-#endif
 
   // Compute output ptr
   auto transform_it =
       thrust::make_zip_iterator(thrust::make_tuple(x_ptr.data(), y_ptr.data()));
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::transform(
       thrust::cuda::par(alloc), transform_it, transform_it + x_ptr.size(),
       out_ptr.data(),
       [] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::transform(
-      thrust::hip::par(alloc), transform_it, transform_it + x_ptr.size(),
-      out_ptr.data(),
-      [] __device__(auto const& t) { return thrust::get<0>(t) + thrust::get<1>(t); });
-#endif
 
   // 0^th is the indicator, 1^th is placeholder
   auto get_ind = []XGBOOST_DEVICE(Tuple const& t) { return thrust::get<0>(t); };
@@ -215,7 +199,6 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
   // comparison, index of y is incremented by 1 from y_0 to y_1, and at the same time, y_0
   // is landed into output as the first element in merge result. The scan result is the
   // subscript of x and y.
-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan_by_key(
       thrust::cuda::par(alloc), scan_key_it, scan_key_it + merge_path.size(),
       scan_val_it, merge_path.data(),
@@ -224,16 +207,6 @@ common::Span<thrust::tuple<uint64_t, uint64_t>> MergePath(
       [=] __device__(Tuple const &l, Tuple const &r) -> Tuple {
        return thrust::make_tuple(get_x(l) + get_x(r), get_y(l) + get_y(r));
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan_by_key(
-      thrust::hip::par(alloc), scan_key_it, scan_key_it + merge_path.size(),
-      scan_val_it, merge_path.data(),
-      thrust::make_tuple<uint64_t, uint64_t>(0ul, 0ul),
-      thrust::equal_to<size_t>{},
-      [=] __device__(Tuple const &l, Tuple const &r) -> Tuple {
-        return thrust::make_tuple(get_x(l) + get_x(r), get_y(l) + get_y(r));
-      });
-#endif
 
   return merge_path;
 }
@@ -414,7 +387,6 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
   // Reverse scan to accumulate weights into first duplicated element on left.
   auto val_it = thrust::make_reverse_iterator(dh::tend(entries));
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(
       thrust::cuda::par(alloc), key_it, key_it + entries.size(),
       val_it, val_it,
@@ -428,21 +400,6 @@ size_t SketchContainer::ScanInput(Span<SketchEntry> entries, Span<OffsetT> d_col
         }
         return l;
       });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(
-      thrust::hip::par(alloc), key_it, key_it + entries.size(),
-      val_it, val_it,
-      thrust::equal_to<size_t>{},
-      [] __device__(SketchEntry const &r, SketchEntry const &l) {
-        // Only accumulate for the first type of duplication.
-        if (l.value - r.value == 0 && l.rmin - r.rmin != 0) {
-          auto w = l.wmin + r.wmin;
-          SketchEntry v{l.rmin, l.rmin + w, w, l.value};
-          return v;
-        }
-        return l;
-      });
-#endif
 
   auto d_columns_ptr_out = columns_ptr_b_.DeviceSpan();
   // thrust unique_by_key preserves the first element.
@@ -691,7 +648,6 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
   // track of the unique keys (feature indices) after the `thrust::reduce_by_key` call.
   dh::caching_device_vector<size_t> d_max_keys(d_in_columns_ptr.size() - 1);
   dh::caching_device_vector<SketchEntry> d_max_values(d_in_columns_ptr.size() - 1);
-#if defined(XGBOOST_USE_CUDA)
   auto new_end = thrust::reduce_by_key(
       thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
       d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
@@ -705,21 +661,6 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
                                                        default_entry);
   thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(),
                   d_max_keys.begin(), d_max_results.begin());
-#elif defined(XGBOOST_USE_HIP)
-  auto new_end = thrust::reduce_by_key(
-      thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(),
-      d_max_values.begin(), thrust::equal_to<bst_feature_t>{},
-      [] __device__(auto l, auto r) { return l.value > r.value ? l : r; });
-  d_max_keys.erase(new_end.first, d_max_keys.end());
-  d_max_values.erase(new_end.second, d_max_values.end());
-
-  // The device vector needs to be initialized explicitly since we may have some missing columns.
-  SketchEntry default_entry{};
-  dh::caching_device_vector<SketchEntry> d_max_results(d_in_columns_ptr.size() - 1,
-                                                       default_entry);
-  thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(),
-                  d_max_keys.begin(), d_max_results.begin());
-#endif
   dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results));
   auto max_it = MakeIndexTransformIter([&](auto i) {
     if (IsCat(h_feature_types, i)) {
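MergePath above tags each merged element with its source sequence via merge_by_key, discarding the merged keys themselves. A self-contained host-backend miniature of that tagging step (toy integers stand in for sketch entries):

    #include <thrust/merge.h>
    #include <thrust/host_vector.h>
    #include <cstdio>

    int main() {
      int xk[] = {1, 3, 5};
      int yk[] = {2, 3, 6};
      int xv[] = {0, 0, 0};  // tag: element came from x
      int yv[] = {1, 1, 1};  // tag: element came from y
      thrust::host_vector<int> keys_out(6), tags_out(6);
      // Stable merge: on equal keys, the x element precedes the y element.
      thrust::merge_by_key(xk, xk + 3, yk, yk + 3, xv, yv,
                           keys_out.begin(), tags_out.begin());
      for (int i = 0; i < 6; ++i) std::printf("%d:%d ", keys_out[i], tags_out[i]);
      std::printf("\n");  // 1:0 2:1 3:0 3:1 5:0 6:1
    }

The 0/1 tags are exactly the "merge path" that the subsequent exclusive scan turns into per-input subscripts.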
@@ -10,6 +10,12 @@
 #include "timer.h"
 #include "categorical.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {
 
@@ -184,19 +190,11 @@ class SketchContainer {
 
     d_column_scan = this->columns_ptr_.DeviceSpan();
 
-#if defined(XGBOOST_USE_CUDA)
     size_t n_uniques = dh::SegmentedUnique(
         thrust::cuda::par(alloc), d_column_scan.data(),
         d_column_scan.data() + d_column_scan.size(), entries.data(),
         entries.data() + entries.size(), scan_out.DevicePointer(),
         entries.data(), detail::SketchUnique{}, key_comp);
-#elif defined(XGBOOST_USE_HIP)
-    size_t n_uniques = dh::SegmentedUnique(
-        thrust::hip::par(alloc), d_column_scan.data(),
-        d_column_scan.data() + d_column_scan.size(), entries.data(),
-        entries.data() + entries.size(), scan_out.DevicePointer(),
-        entries.data(), detail::SketchUnique{}, key_comp);
-#endif
 
     this->columns_ptr_.Copy(scan_out);
     CHECK(!this->columns_ptr_.HostCanRead());
@@ -23,6 +23,12 @@
 #include "xgboost/context.h"  // Context
 #include "xgboost/span.h"  // Span
 
+#ifdef XGBOOST_USE_HIP
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace common {
 namespace detail {
@@ -217,13 +223,8 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
   auto scan_val = dh::MakeTransformIterator<float>(thrust::make_counting_iterator(0ul),
                                                    detail::WeightOp<WIter>{w_begin, d_sorted_idx});
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights,
                                 scan_val, weights_cdf.begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan_by_key(thrust::hip::par(caching), scan_key, scan_key + n_weights,
-                                scan_val, weights_cdf.begin());
-#endif
 
   auto n_segments = std::distance(seg_beg, seg_end) - 1;
   quantiles->SetDevice(ctx->Device());
@@ -17,6 +17,9 @@
 
 #if defined(XGBOOST_USE_HIP)
 namespace cub = hipcub;
+namespace thrust {
+namespace cuda = thrust::hip;
+}
 #endif
 
 namespace xgboost {
@@ -122,13 +125,8 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector<bst_group_t>* p_
   group_ptr_.resize(h_num_runs_out + 1, 0);
   dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::inclusive_scan(thrust::cuda::par(alloc), cnt.begin(),
                          cnt.begin() + h_num_runs_out, cnt.begin());
-#elif defined(XGBOOST_USE_HIP)
-  thrust::inclusive_scan(thrust::hip::par(alloc), cnt.begin(),
-                         cnt.begin() + h_num_runs_out, cnt.begin());
-#endif
 
   thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out,
                group_ptr_.begin() + 1);
@@ -17,6 +17,12 @@
 #include "adapter.h"
 #include "array_interface.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace data {
 
@@ -246,17 +252,10 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offs
   });
 
   dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_CUDA)
   bst_row_t row_stride =
       dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                  thrust::device_pointer_cast(offset.data()) + offset.size(),
                  static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
-#elif defined(XGBOOST_USE_HIP)
-  bst_row_t row_stride =
-      dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
-                 thrust::device_pointer_cast(offset.data()) + offset.size(),
-                 static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
-#endif
   return row_stride;
 }
 
@@ -280,13 +279,8 @@ bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
   // intervals to early stop. But we expect all data to be valid here, using small
   // intervals only decreases performance due to excessive kernel launch and stream
   // synchronization.
-#if defined(XGBOOST_USE_CUDA)
   auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
                           thrust::logical_and<>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
-                          thrust::logical_and<>{});
-#endif
   return valid;
 }
 };  // namespace data
@@ -16,6 +16,12 @@
 #include "simple_batch_iterator.h"
 #include "sparse_page_source.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::data {
 void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
                                     DataIterHandle iter_handle, float missing,
@@ -86,11 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
       return GetRowCounts(value, row_counts_span, get_device(), missing);
     }));
 
-#if defined(XGBOOST_USE_CUDA)
     nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end());
-#elif defined(XGBOOST_USE_HIP)
-    nnz += thrust::reduce(thrust::hip::par(alloc), row_counts.begin(), row_counts.end());
-#endif
 
     batches++;
   } while (iter.Next());
@@ -13,6 +13,12 @@
 #include "../common/error_msg.h"  // for InfInData
 #include "device_adapter.cuh"  // for HasInfInData
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::data {
 
 #if defined(XGBOOST_USE_CUDA)
@@ -69,15 +75,9 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
   });
 
   dh::XGBCachingDeviceAllocator<char> alloc;
-#if defined(XGBOOST_USE_CUDA)
   thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
                          thrust::device_pointer_cast(offset.data() + offset.size()),
                          thrust::device_pointer_cast(offset.data()));
-#elif defined(XGBOOST_USE_HIP)
-  thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
-                         thrust::device_pointer_cast(offset.data() + offset.size()),
-                         thrust::device_pointer_cast(offset.data()));
-#endif
 }
 
 template <typename AdapterBatchT>
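CountRowOffsets finishes by turning per-row element counts into CSR-style row offsets with an in-place exclusive scan. A host-backend miniature of just that step:

    #include <thrust/scan.h>
    #include <cstdio>

    int main() {
      // Per-row element counts, with one trailing slot for the total.
      int offset[] = {2, 0, 3, 1, 0};
      // In-place exclusive scan: each slot becomes the sum of all counts
      // before it, i.e. the row's starting position.
      thrust::exclusive_scan(offset, offset + 5, offset);
      for (int v : offset) std::printf("%d ", v);  // 0 2 2 5 6
      std::printf("\n");
    }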
@@ -25,6 +25,12 @@
 #include "xgboost/data.h"
 #include "xgboost/span.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -127,17 +133,10 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return predts[d_sorted_idx[i]]; });
 
-#if defined(XGBOOST_USE_CUDA)
   auto end_unique = thrust::unique_by_key_copy(
       thrust::cuda::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
       dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
      dh::tbegin(d_unique_idx));
-#elif defined(XGBOOST_USE_HIP)
-  auto end_unique = thrust::unique_by_key_copy(
-      thrust::hip::par(alloc), uni_key, uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx), thrust::make_discard_iterator(),
-      dh::tbegin(d_unique_idx));
-#endif
 
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
 
@@ -179,11 +178,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
 
   Pair last = cache->fptp.back();
 
-#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), in, in + d_unique_idx.size());
-#elif defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), in, in + d_unique_idx.size());
-#endif
 
   return std::make_tuple(last.first, last.second, auc);
 }
@@ -239,15 +234,9 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
   double tp_sum;
   double auc_sum;
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::tie(auc_sum, tp_sum) =
       thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::tie(auc_sum, tp_sum) =
-      thrust::reduce(thrust::hip::par(alloc), reduce_in, reduce_in + n_classes,
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#endif
 
   if (tp_sum != 0 && !std::isnan(auc_sum)) {
     auc_sum /= tp_sum;
@@ -329,15 +318,9 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
     return auc;
   });
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_in,
                         key_in + d_unique_idx.size(), val_in,
                         thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_in,
-                        key_in + d_unique_idx.size(), val_in,
-                        thrust::make_discard_iterator(), dh::tbegin(d_auc));
-#endif
 }
 
 /**
@@ -410,7 +393,6 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_class_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
 
-#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
       dh::tbegin(d_class_ptr),
@@ -421,18 +403,6 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
-      dh::tbegin(d_class_ptr),
-      dh::tend(d_class_ptr),
-      uni_key,
-      uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx),
-      d_unique_class_ptr.data(),
-      dh::tbegin(d_unique_idx),
-      thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#endif
 
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
@@ -553,15 +523,9 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
       thrust::make_counting_iterator(0),
       [=] XGBOOST_DEVICE(size_t i) { return d_group_ptr[i + 1] - d_group_ptr[i]; });
 
-#if defined(XGBOOST_USE_CUDA)
   size_t n_valid = thrust::count_if(
       thrust::cuda::par(alloc), check_it, check_it + group_ptr.size() - 1,
      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#elif defined(XGBOOST_USE_HIP)
-  size_t n_valid = thrust::count_if(
-      thrust::hip::par(alloc), check_it, check_it + group_ptr.size() - 1,
-      [=] XGBOOST_DEVICE(size_t len) { return len >= 3; });
-#endif
 
   if (n_valid < info.group_ptr_.size() - 1) {
     InvalidGroupAUC();
@@ -659,13 +623,8 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
   /**
    * Scale the AUC with number of items in each group.
    */
-#if defined(XGBOOST_USE_CUDA)
   double auc = thrust::reduce(thrust::cuda::par(alloc), dh::tbegin(s_d_auc),
                               dh::tend(s_d_auc), 0.0);
-#elif defined(XGBOOST_USE_HIP)
-  double auc = thrust::reduce(thrust::hip::par(alloc), dh::tbegin(s_d_auc),
-                              dh::tend(s_d_auc), 0.0);
-#endif
 
   return std::make_pair(auc, n_valid);
 }
@@ -694,15 +653,9 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
   dh::XGBCachingDeviceAllocator<char> alloc;
   double total_pos, total_neg;
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::tie(total_pos, total_neg) =
       thrust::reduce(thrust::cuda::par(alloc), it, it + labels.Size(),
                      Pair{0.0, 0.0}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::tie(total_pos, total_neg) =
-      thrust::reduce(thrust::hip::par(alloc), it, it + labels.Size(),
-                     Pair{0.0, 0.0}, PairPlus<double, double>{});
-#endif
 
   if (total_pos <= 0.0 || total_neg <= 0.0) {
     return {0.0f, 0.0f, 0.0f};
@@ -755,17 +708,10 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
   });
   dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
-                        key_it + predts.size(), val_it,
-                        thrust::make_discard_iterator(), totals.begin(),
-                        thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#endif
 
   /**
    * Calculate AUC
@@ -834,7 +780,6 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
   dh::TemporaryArray<uint32_t> unique_class_ptr(d_group_ptr.size());
   auto d_unique_class_ptr = dh::ToSpan(unique_class_ptr);
 
-#if defined(XGBOOST_USE_CUDA)
   auto n_uniques = dh::SegmentedUniqueByKey(
       thrust::cuda::par(alloc),
      dh::tbegin(d_group_ptr),
@@ -845,18 +790,6 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
       d_unique_class_ptr.data(),
       dh::tbegin(d_unique_idx),
       thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#elif defined(XGBOOST_USE_HIP)
-  auto n_uniques = dh::SegmentedUniqueByKey(
-      thrust::hip::par(alloc),
-      dh::tbegin(d_group_ptr),
-      dh::tend(d_group_ptr),
-      uni_key,
-      uni_key + d_sorted_idx.size(),
-      dh::tbegin(d_unique_idx),
-      d_unique_class_ptr.data(),
-      dh::tbegin(d_unique_idx),
-      thrust::equal_to<thrust::pair<uint32_t, float>>{});
-#endif
 
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
@@ -909,15 +842,9 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
       return thrust::make_pair(0.0, static_cast<uint32_t>(1));
     });
 
-#if defined(XGBOOST_USE_CUDA)
     thrust::tie(auc, invalid_groups) = thrust::reduce(
         thrust::cuda::par(alloc), it, it + n_groups,
         thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
-#elif defined(XGBOOST_USE_HIP)
-    thrust::tie(auc, invalid_groups) = thrust::reduce(
-        thrust::hip::par(alloc), it, it + n_groups,
-        thrust::pair<double, uint32_t>(0.0, 0), PairPlus<double, uint32_t>{});
-#endif
   }
   return std::make_pair(auc, n_groups - invalid_groups);
 }
@@ -949,17 +876,10 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
   dh::XGBDeviceAllocator<char> alloc;
   auto labels = info.labels.View(ctx->Device());
 
-#if defined(XGBOOST_USE_CUDA)
   if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()),
                      dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
     InvalidLabels();
   }
-#elif defined(XGBOOST_USE_HIP)
-  if (thrust::any_of(thrust::hip::par(alloc), dh::tbegin(labels.Values()),
-                     dh::tend(labels.Values()), PRAUCLabelInvalid{})) {
-    InvalidLabels();
-  }
-#endif
 
   /**
    * Get total positive/negative for each group.
@@ -981,17 +901,10 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
     return thrust::make_pair(y * w, (1.0 - y) * w);
   });
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::reduce_by_key(thrust::cuda::par(alloc), key_it,
                         key_it + predts.size(), val_it,
                         thrust::make_discard_iterator(), totals.begin(),
                         thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#elif defined(XGBOOST_USE_HIP)
-  thrust::reduce_by_key(thrust::hip::par(alloc), key_it,
-                        key_it + predts.size(), val_it,
-                        thrust::make_discard_iterator(), totals.begin(),
-                        thrust::equal_to<size_t>{}, PairPlus<double, double>{});
-#endif
 
   /**
    * Calculate AUC
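The AUC code repeatedly collapses runs of equal predictions with unique_by_key_copy, keeping one sorted index per distinct threshold while discarding the prediction keys. A host miniature with made-up values:

    #include <thrust/unique.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <cstdio>

    int main() {
      float preds[] = {0.9f, 0.9f, 0.7f, 0.4f, 0.4f};
      int idx[] = {0, 1, 2, 3, 4};
      int uniq[5];
      // Keep the first index of each run of equal predictions; the unique
      // prediction values themselves go to a discard iterator, mirroring
      // the pattern in GPUBinaryAUC.
      auto end = thrust::unique_by_key_copy(preds, preds + 5, idx,
                                            thrust::make_discard_iterator(), uniq);
      int n = static_cast<int>(end.second - uniq);
      for (int i = 0; i < n; ++i) std::printf("%d ", uniq[i]);  // 0 2 3
      std::printf("\n");
    }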
@@ -30,6 +30,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -47,7 +53,7 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   PackedReduceResult result;
   auto labels = info.labels.View(ctx->Device());
   if (ctx->IsCUDA()) {
-#if defined(XGBOOST_USE_CUDA)
+#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
     dh::XGBCachingDeviceAllocator<char> alloc;
     thrust::counting_iterator<size_t> begin(0);
     thrust::counting_iterator<size_t> end = begin + labels.Size();
@@ -62,21 +68,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
           return PackedReduceResult{v, wt};
         },
         PackedReduceResult{}, thrust::plus<PackedReduceResult>());
-#elif defined(XGBOOST_USE_HIP)
-    dh::XGBCachingDeviceAllocator<char> alloc;
-    thrust::counting_iterator<size_t> begin(0);
-    thrust::counting_iterator<size_t> end = begin + labels.Size();
-    result = thrust::transform_reduce(
-        thrust::hip::par(alloc), begin, end,
-        [=] XGBOOST_DEVICE(size_t i) {
-          auto idx = linalg::UnravelIndex(i, labels.Shape());
-          auto sample_id = std::get<0>(idx);
-          auto target_id = std::get<1>(idx);
-          auto res = loss(i, sample_id, target_id);
-          float v{std::get<0>(res)}, wt{std::get<1>(res)};
-          return PackedReduceResult{v, wt};
-        },
-        PackedReduceResult{}, thrust::plus<PackedReduceResult>());
 #else
     common::AssertGPUSupport();
 #endif  // defined(XGBOOST_USE_CUDA)
@@ -24,6 +24,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost {
 namespace metric {
 // tag this file, used by force static link later.
@@ -104,7 +110,6 @@ class MultiClassMetricsReduction {
 
     dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
     PackedReduceResult result = thrust::transform_reduce(
         thrust::cuda::par(alloc),
         begin, end,
@@ -122,25 +127,6 @@ class MultiClassMetricsReduction {
         },
         PackedReduceResult(),
         thrust::plus<PackedReduceResult>());
-#elif defined(XGBOOST_USE_HIP)
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::hip::par(alloc),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          bst_float weight = is_null_weight ? 1.0f : s_weights[idx];
-          bst_float residue = 0;
-          auto label = static_cast<int>(s_labels[idx]);
-          if (label >= 0 && label < static_cast<int32_t>(n_class)) {
-            residue = EvalRowPolicy::EvalRow(
-                label, &s_preds[idx * n_class], n_class) * weight;
-          } else {
-            s_label_error[0] = label;
-          }
-          return PackedReduceResult{ residue, weight };
-        },
-        PackedReduceResult(),
-        thrust::plus<PackedReduceResult>());
-#endif
 
     CheckLabelError(s_label_error[0], n_class);
 
@@ -25,6 +25,12 @@
 #include "../common/device_helpers.cuh"
 #endif  // XGBOOST_USE_CUDA || XGBOOST_USE_HIP
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 using AFTParam = xgboost::common::AFTParam;
 using ProbabilityDistributionType = xgboost::common::ProbabilityDistributionType;
 template <typename Distribution>
@@ -103,7 +109,6 @@ class ElementWiseSurvivalMetricsReduction {
 
     dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
     PackedReduceResult result = thrust::transform_reduce(
         thrust::cuda::par(alloc),
         begin, end,
@@ -118,22 +123,6 @@ class ElementWiseSurvivalMetricsReduction {
         },
         PackedReduceResult(),
         thrust::plus<PackedReduceResult>());
-#elif defined(XGBOOST_USE_HIP)
-    PackedReduceResult result = thrust::transform_reduce(
-        thrust::hip::par(alloc),
-        begin, end,
-        [=] XGBOOST_DEVICE(size_t idx) {
-          double weight = is_null_weight ? 1.0 : static_cast<double>(s_weights[idx]);
-          double residue = d_policy.EvalRow(
-              static_cast<double>(s_label_lower_bound[idx]),
-              static_cast<double>(s_label_upper_bound[idx]),
-              static_cast<double>(s_preds[idx]));
-          residue *= weight;
-          return PackedReduceResult{residue, weight};
-        },
-        PackedReduceResult(),
-        thrust::plus<PackedReduceResult>());
-#endif
 
     return result;
   }
@@ -29,6 +29,12 @@
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::predictor {
 DMLC_REGISTRY_FILE_TAG(gpu_predictor);
 
@@ -512,7 +518,6 @@ void ExtractPaths(
     return PathInfo{static_cast<int64_t>(idx), path_length, tree_idx};
   });
 
-#if defined(XGBOOST_USE_CUDA)
   auto end = thrust::copy_if(
       thrust::cuda::par(alloc), nodes_transform,
       nodes_transform + d_nodes.size(), info.begin(),
@@ -525,20 +530,6 @@ void ExtractPaths(
   thrust::exclusive_scan(thrust::cuda::par(alloc), length_iterator,
                          length_iterator + info.size() + 1,
                          path_segments.begin());
-#elif defined(XGBOOST_USE_HIP)
-  auto end = thrust::copy_if(
-      thrust::hip::par(alloc), nodes_transform,
-      nodes_transform + d_nodes.size(), info.begin(),
-      [=] __device__(const PathInfo& e) { return e.leaf_position != -1; });
-  info.resize(end - info.begin());
-  auto length_iterator = dh::MakeTransformIterator<size_t>(
-      info.begin(),
-      [=] __device__(const PathInfo& info) { return info.length; });
-  dh::caching_device_vector<size_t> path_segments(info.size() + 1);
-  thrust::exclusive_scan(thrust::hip::par(alloc), length_iterator,
-                         length_iterator + info.size() + 1,
-                         path_segments.begin());
-#endif
 
   paths->resize(path_segments.back());
 
@ -21,6 +21,12 @@
 #include "xgboost/logging.h"  // CHECK_EQ
 #include "xgboost/span.h"     // span
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::tree::cuda_impl {
 void FitStump(Context const* ctx, MetaInfo const& info,
               linalg::TensorView<GradientPair const, 2> gpair, linalg::VectorView<float> out) {
@ -45,11 +51,7 @@ void FitStump(Context const* ctx, MetaInfo const& info,
 
   dh::XGBCachingDeviceAllocator<char> alloc;
 
-#if defined(XGBOOST_USE_CUDA)
   auto policy = thrust::cuda::par(alloc);
-#elif defined(XGBOOST_USE_HIP)
-  auto policy = thrust::hip::par(alloc);
-#endif
 
   thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
                         thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));
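The reduce_by_key collapses runs of equal keys into one gradient sum per target. A std::-only sketch of that collapse, assuming a [target][row] layout and a simplified GradientPair (both illustrative, not the real types):

#include <cstdio>
#include <vector>

struct GradientPair {
  double grad = 0.0;
  double hess = 0.0;
};

int main() {
  size_t const n_rows = 3;
  // two targets, three rows each; the key of element i is i / n_rows, which
  // is what a counting-iterator-based key sequence would produce
  std::vector<GradientPair> gpair{{1.0, 2.0}, {0.5, 1.0}, {-1.0, 2.0},
                                  {2.0, 1.0}, {1.0, 1.0}, {0.0, 1.0}};
  std::vector<GradientPair> sums(gpair.size() / n_rows);
  for (size_t i = 0; i < gpair.size(); ++i) {
    sums[i / n_rows].grad += gpair[i].grad;  // equal keys -> same output slot
    sums[i / n_rows].hess += gpair[i].hess;
  }
  for (auto const& s : sums) {
    // a Newton-step stump weight per target is then roughly -grad / hess
    std::printf("grad=%g hess=%g weight=%g\n", s.grad, s.hess, -s.grad / s.hess);
  }
}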
@ -12,6 +12,12 @@
 #include "evaluate_splits.cuh"
 #include "xgboost/data.h"
 
+#if defined(XGBOOST_USE_HIP)
+namespace thrust {
+namespace cuda = thrust::hip;
+}
+#endif
+
 namespace xgboost::tree {
 void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
                              bst_feature_t n_features, TrainParam const &param,
@ -28,7 +34,6 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
   // This condition avoids sort-based split function calls if the users want
   // onehot-encoding-based splits.
   // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x.
-#if defined(XGBOOST_USE_CUDA)
   need_sort_histogram_ =
       thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
         auto idx = i - 1;
@ -39,18 +44,6 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
         }
         return false;
       });
-#elif defined(XGBOOST_USE_HIP)
-  need_sort_histogram_ =
-      thrust::any_of(thrust::hip::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
-        auto idx = i - 1;
-        if (common::IsCat(ft, idx)) {
-          auto n_bins = ptrs[i] - ptrs[idx];
-          bool use_sort = !common::UseOneHot(n_bins, to_onehot);
-          return use_sort;
-        }
-        return false;
-      });
-#endif
 
   node_categorical_storage_size_ =
       common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
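Both branches of the deleted #elif evaluated the same predicate, which is why one copy suffices: the sort-based split path is needed as soon as any categorical feature has too many bins for one-hot encoding. A host sketch of that decision; UseOneHot here is a stand-in with the usual "few categories, use one-hot" shape, not XGBoost's exact rule:

#include <cstdio>
#include <vector>

bool UseOneHot(size_t n_bins, size_t to_onehot) { return n_bins < to_onehot; }

int main() {
  // cut-pointer array: feature f owns bins [ptrs[f], ptrs[f + 1])
  std::vector<size_t> ptrs{0, 4, 36};      // feature 1 has 32 categories
  std::vector<bool> is_cat{false, true};
  size_t const to_onehot = 8;
  bool need_sort_histogram = false;
  for (size_t f = 0; f + 1 < ptrs.size(); ++f) {
    size_t n_bins = ptrs[f + 1] - ptrs[f];
    if (is_cat[f] && !UseOneHot(n_bins, to_onehot)) {
      need_sort_histogram = true;  // one wide categorical feature is enough
    }
  }
  std::printf("need_sort_histogram = %d\n", need_sort_histogram);  // 1
}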
@ -72,19 +65,11 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<Fea
     auto it = thrust::make_counting_iterator(0ul);
     auto values = cuts.cut_values_.ConstDeviceSpan();
 
-#if defined(XGBOOST_USE_CUDA)
     thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
                       [=] XGBOOST_DEVICE(size_t i) {
                         auto fidx = dh::SegmentId(ptrs, i);
                         return fidx;
                       });
-#elif defined(XGBOOST_USE_HIP)
-    thrust::transform(thrust::hip::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
-                      [=] XGBOOST_DEVICE(size_t i) {
-                        auto fidx = dh::SegmentId(ptrs, i);
-                        return fidx;
-                      });
-#endif
   }
   is_column_split_ = is_column_split;
   device_ = device;
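dh::SegmentId in the kept branch maps a global bin index back to the feature that owns it. A plausible host equivalent via upper_bound over the cut-pointer array (the helper's real implementation may differ):

#include <algorithm>
#include <cstdio>
#include <vector>

// the owning segment is the last ptrs entry that is <= i
size_t SegmentId(std::vector<size_t> const& ptrs, size_t i) {
  return static_cast<size_t>(std::upper_bound(ptrs.begin(), ptrs.end(), i) -
                             ptrs.begin()) - 1;
}

int main() {
  std::vector<size_t> ptrs{0, 3, 7, 9};  // feature f owns bins [ptrs[f], ptrs[f+1])
  for (size_t i = 0; i < ptrs.back(); ++i) {
    std::printf("%zu ", SegmentId(ptrs, i));  // prints 0 0 0 1 1 1 1 2 2
  }
  std::printf("\n");
}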
@ -101,7 +86,6 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
   auto d_feature_idx = dh::ToSpan(feature_idx_);
   auto total_bins = shared_inputs.feature_values.size();
 
-#if defined(XGBOOST_USE_CUDA)
   thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data),
                     [=] XGBOOST_DEVICE(uint32_t i) {
                       auto const &input = d_inputs[i / total_bins];
@ -115,27 +99,11 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
                       }
                       return thrust::make_tuple(i, 0.0f);
                     });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::transform(thrust::hip::par(alloc), it, it + data.size(), dh::tbegin(data),
-                    [=] XGBOOST_DEVICE(uint32_t i) {
-                      auto const &input = d_inputs[i / total_bins];
-                      auto j = i % total_bins;
-                      auto fidx = d_feature_idx[j];
-                      if (common::IsCat(shared_inputs.feature_types, fidx)) {
-                        auto grad =
-                            shared_inputs.rounding.ToFloatingPoint(input.gradient_histogram[j]);
-                        auto lw = evaluator.CalcWeightCat(shared_inputs.param, grad);
-                        return thrust::make_tuple(i, lw);
-                      }
-                      return thrust::make_tuple(i, 0.0f);
-                    });
-#endif
 
   // Sort an array segmented according to
   // - nodes
   // - features within each node
   // - gradients within each feature
-#if defined(XGBOOST_USE_CUDA)
   thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data),
                              dh::tbegin(sorted_idx),
                              [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
@ -166,38 +134,6 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
                                }
                                return li < ri;
                              });
-#elif defined(XGBOOST_USE_HIP)
-  thrust::stable_sort_by_key(thrust::hip::par(alloc), dh::tbegin(data), dh::tend(data),
-                             dh::tbegin(sorted_idx),
-                             [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
-                               auto li = thrust::get<0>(l);
-                               auto ri = thrust::get<0>(r);
-
-                               auto l_node = li / total_bins;
-                               auto r_node = ri / total_bins;
-
-                               if (l_node != r_node) {
-                                 return l_node < r_node;  // not the same node
-                               }
-
-                               li = li % total_bins;
-                               ri = ri % total_bins;
-
-                               auto lfidx = d_feature_idx[li];
-                               auto rfidx = d_feature_idx[ri];
-
-                               if (lfidx != rfidx) {
-                                 return lfidx < rfidx;  // not the same feature
-                               }
-
-                               if (common::IsCat(shared_inputs.feature_types, lfidx)) {
-                                 auto lw = thrust::get<1>(l);
-                                 auto rw = thrust::get<1>(r);
-                                 return lw < rw;
-                               }
-                               return li < ri;
-                             });
-#endif
   return dh::ToSpan(cat_sorted_idx_);
 }
 }  // namespace xgboost::tree
|
|||||||
@ -16,6 +16,12 @@
|
|||||||
#include "row_partitioner.cuh"
|
#include "row_partitioner.cuh"
|
||||||
#include "xgboost/base.h"
|
#include "xgboost/base.h"
|
||||||
|
|
||||||
|
#if defined(XGBOOST_USE_HIP)
|
||||||
|
namespace thrust {
|
||||||
|
namespace cuda = thrust::hip;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
namespace tree {
|
namespace tree {
|
||||||
namespace {
|
namespace {
|
||||||
@ -60,13 +66,8 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
 
   thrust::device_ptr<GradientPair const> gpair_beg{gpair.data()};
   auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
-#if defined(XGBOOST_USE_CUDA)
   Pair p =
       dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
-#elif defined(XGBOOST_USE_HIP)
-  Pair p =
-      dh::Reduce(thrust::hip::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
-#endif
 
   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
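In the surviving reduce, the Clip() transform iterator separates each gradient into sign-specific components so a single plus-reduction yields all the sums the quantiser needs at once. A scalar host sketch of that idea (the split into positive and negative halves is illustrative, not the real Clip functor):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> grad{0.5, -1.25, 2.0, -0.25};
  double positive_sum = 0.0;  // the two halves of the reduced "pair"
  double negative_sum = 0.0;
  for (double g : grad) {
    positive_sum += std::max(g, 0.0);  // the Clip-style split: each element
    negative_sum += std::min(g, 0.0);  // contributes to exactly one half
  }
  std::printf("sum+ = %g, sum- = %g\n", positive_sum, negative_sum);  // 2.5, -1.5
}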