sync Jun 1

This commit is contained in:
Your Name
2023-06-01 15:55:06 -07:00
76 changed files with 1424 additions and 595 deletions

View File

@@ -26,6 +26,12 @@
#include "xgboost/logging.h"
#include "xgboost/span.h"
#if defined(XGBOOST_USE_CUDA)
#include "cuda_fp16.h"
#elif defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_fp16.h>
#endif
namespace xgboost {
// Common errors in parsing columnar format.
struct ArrayInterfaceErrors {
@@ -304,12 +310,12 @@ class ArrayInterfaceHandler {
template <typename T, typename E = void>
struct ToDType;
// float
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<__half> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF2;
};
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
template <>
struct ToDType<float> {
static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
@@ -459,11 +465,11 @@ class ArrayInterface {
CHECK(sizeof(long double) == 16) << error::NoF128();
type = T::kF16;
} else if (typestr[1] == 'f' && typestr[2] == '2') {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
type = T::kF2;
#else
LOG(FATAL) << "Half type is not supported.";
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
} else if (typestr[1] == 'f' && typestr[2] == '4') {
type = T::kF4;
} else if (typestr[1] == 'f' && typestr[2] == '8') {
@@ -490,20 +496,17 @@ class ArrayInterface {
}
}
XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; }
XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Shape(size_t i) const { return shape[i]; }
[[nodiscard]] XGBOOST_DEVICE std::size_t Stride(size_t i) const { return strides[i]; }
template <typename Fn>
XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const {
using T = ArrayInterfaceHandler::Type;
switch (type) {
case T::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
return func(reinterpret_cast<__half const *>(data));
#else
SPAN_CHECK(false);
return func(reinterpret_cast<float const *>(data));
#endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#endif // defined(XGBOOST_USE_CUDA) || || defined(__HIP_PLATFORM_AMD__)
}
case T::kF4:
return func(reinterpret_cast<float const *>(data));
@@ -540,23 +543,23 @@ class ArrayInterface {
return func(reinterpret_cast<uint64_t const *>(data));
}
XGBOOST_DEVICE std::size_t ElementSize() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementSize() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return sizeof(std::remove_pointer_t<decltype(typed_data_ptr)>);
});
}
XGBOOST_DEVICE std::size_t ElementAlignment() const {
[[nodiscard]] XGBOOST_DEVICE std::size_t ElementAlignment() const {
return this->DispatchCall([](auto *typed_data_ptr) {
return std::alignment_of<std::remove_pointer_t<decltype(typed_data_ptr)>>::value;
});
}
template <typename T = float, typename... Index>
XGBOOST_DEVICE T operator()(Index &&...index) const {
XGBOOST_HOST_DEV_INLINE T operator()(Index &&...index) const {
static_assert(sizeof...(index) <= D, "Invalid index.");
return this->DispatchCall([=](auto const *p_values) -> T {
std::size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...);
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
// No operator defined for half -> size_t
using Type = std::conditional_t<
std::is_same<__half,
@@ -566,7 +569,7 @@ class ArrayInterface {
return static_cast<T>(static_cast<Type>(p_values[offset]));
#else
return static_cast<T>(p_values[offset]);
#endif
#endif // defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
});
}
@@ -603,7 +606,7 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
};
switch (array.type) {
case ArrayInterfaceHandler::kF2: {
#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) || defined(__HIP_PLATFORM_AMD__)
#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__)
dispatch(__half{});
#endif
break;

View File

@@ -698,6 +698,9 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
this->feature_type_names = that.feature_type_names;
auto &h_feature_types = feature_types.HostVector();
LoadFeatureType(this->feature_type_names, &h_feature_types);
} else if (!that.feature_types.Empty()) {
this->feature_types.Resize(that.feature_types.Size());
this->feature_types.Copy(that.feature_types);
}
if (!that.feature_weights.Empty()) {
this->feature_weights.Resize(that.feature_weights.Size());

View File

@@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
: columns_(columns),
num_rows_(num_rows) {}
size_t Size() const { return num_rows_ * columns_.size(); }
__device__ COOTuple GetElement(size_t idx) const {
__device__ __forceinline__ COOTuple GetElement(size_t idx) const {
size_t column_idx = idx % columns_.size();
size_t row_idx = idx / columns_.size();
auto const& column = columns_[column_idx];
@@ -39,6 +39,14 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
auto const& column = columns_[fidx];
float value = column.valid.Data() == nullptr || column.valid.Check(ridx)
? column(ridx)
: std::numeric_limits<float>::quiet_NaN();
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; }
XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); }
@@ -166,6 +174,10 @@ class CupyAdapterBatch : public detail::NoMetaInfo {
float value = array_interface_(row_idx, column_idx);
return {row_idx, column_idx, value};
}
__device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const {
float value = array_interface_(ridx, fidx);
return value;
}
XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); }
XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); }
@@ -202,40 +214,64 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
// Returns maximum row length
template <typename AdapterBatchT>
size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
int device_idx, float missing) {
#if defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#elif defined(XGBOOST_USE_CUDA)
std::size_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_row_t> offset, int device_idx,
float missing) {
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaSetDevice(device_idx));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipSetDevice(device_idx));
#endif
IsValidFunctor is_valid(missing);
#if defined(XGBOOST_USE_CUDA)
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#elif defined(XGBOOST_USE_HIP)
dh::safe_cuda(hipMemsetAsync(offset.data(), '\0', offset.size_bytes()));
#endif
auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
// Use more than 1 threads for each row in case of dataset being too wide.
bst_feature_t stride{0};
if (n_features < 32) {
stride = std::min(n_features, 4u);
} else if (n_features < 64) {
stride = 8;
} else if (n_features < 128) {
stride = 16;
} else {
stride = 32;
}
// Count elements per row
dh::LaunchN(batch.Size(), [=] __device__(size_t idx) {
auto element = batch.GetElement(idx);
if (is_valid(element)) {
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[element.row_idx]),
static_cast<unsigned long long>(1)); // NOLINT
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
bst_row_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) {
if (is_valid(batch.GetElement(ridx, fidx))) {
cnt++;
}
}
atomicAdd(reinterpret_cast<unsigned long long*>( // NOLINT
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
#if defined(XGBOOST_USE_HIP)
size_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
#elif defined(XGBOOST_USE_CUDA)
size_t row_stride =
#if defined(XGBOOST_USE_CUDA)
bst_row_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<std::size_t>(0), thrust::maximum<size_t>());
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#elif defined(XGBOOST_USE_HIP)
bst_row_t row_stride =
dh::Reduce(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_row_t>(0), thrust::maximum<bst_row_t>());
#endif
return row_stride;
}
@@ -243,13 +279,29 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
* \brief Check there's no inf in data.
*/
template <typename AdapterBatchT>
bool HasInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
bool NoInfInData(AdapterBatchT const& batch, IsValidFunctor is_valid) {
auto counting = thrust::make_counting_iterator(0llu);
auto value_iter = dh::MakeTransformIterator<float>(
counting, [=] XGBOOST_DEVICE(std::size_t idx) { return batch.GetElement(idx).value; });
auto valid =
thrust::none_of(value_iter, value_iter + batch.Size(),
[is_valid] XGBOOST_DEVICE(float v) { return is_valid(v) && std::isinf(v); });
auto value_iter = dh::MakeTransformIterator<bool>(counting, [=] XGBOOST_DEVICE(std::size_t idx) {
auto v = batch.GetElement(idx).value;
if (!is_valid(v)) {
// discard the invalid elements.
return true;
}
// check that there's no inf in data.
return !std::isinf(v);
});
dh::XGBCachingDeviceAllocator<char> alloc;
// The default implementation in thrust optimizes any_of/none_of/all_of by using small
// intervals to early stop. But we expect all data to be valid here, using small
// intervals only decreases performance due to excessive kernel launch and stream
// synchronization.
#if defined(XGBOOST_USE_CUDA)
auto valid = dh::Reduce(thrust::cuda::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#elif defined(XGBOOST_USE_HIP)
auto valid = dh::Reduce(thrust::hip::par(alloc), value_iter, value_iter + batch.Size(), true,
thrust::logical_and<>{});
#endif
return valid;
}
}; // namespace data

View File

@@ -213,7 +213,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span<FeatureType cons
// correct output position
auto counting = thrust::make_counting_iterator(0llu);
data::IsValidFunctor is_valid(missing);
bool valid = data::HasInfInData(batch, is_valid);
bool valid = data::NoInfInData(batch, is_valid);
CHECK(valid) << error::InfInData();
auto key_iter = dh::MakeTransformIterator<size_t>(

View File

@@ -92,7 +92,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
}
auto batch_rows = num_rows();
accumulated_rows += batch_rows;
dh::caching_device_vector<size_t> row_counts(batch_rows + 1, 0);
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);
@@ -163,7 +163,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
#endif
auto rows = num_rows();
dh::caching_device_vector<size_t> row_counts(rows + 1, 0);
dh::device_vector<size_t> row_counts(rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
Dispatch(proxy, [=](auto const& value) {
return GetRowCounts(value, row_counts_span, get_device(), missing);

View File

@@ -92,7 +92,7 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
template <typename AdapterBatchT>
size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing,
SparsePage* page) {
bool valid = HasInfInData(batch, IsValidFunctor{missing});
bool valid = NoInfInData(batch, IsValidFunctor{missing});
CHECK(valid) << error::InfInData();
page->offset.SetDevice(device);