remove logging

Hendrik Groove 2024-10-21 21:52:34 +02:00
parent 9659d0e7bd
commit 20a9c223b6
5 changed files with 1 addition and 111 deletions

View File

@@ -260,7 +260,7 @@ if (USE_HIP)
   set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
-  set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+  #set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
   add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
 endif (USE_HIP)

View File

@@ -579,8 +579,6 @@ xgboost::common::Span<T> LazyResize(xgboost::Context const *ctx,
 template <typename T>
 void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
   CHECK_EQ(dst->size(), src.size());
-  std::cerr << "CopyDeviceSpanToVector: Copying " << src.size() * sizeof(T)
-            << " bytes from device to host" << std::endl;
   dh::safe_cuda(hipMemcpyAsync(dst->data(), src.data(), dst->size() * sizeof(T),
                                hipMemcpyDeviceToHost));
 }
@@ -609,8 +607,6 @@ void CopyToD(HContainer const &h, DContainer *d) {
   using HVT = std::remove_cv_t<typename HContainer::value_type>;
   using DVT = std::remove_cv_t<typename DContainer::value_type>;
   static_assert(std::is_same<HVT, DVT>::value, "Host and device containers must have same value type.");
-  std::cerr << "CopyToD: Copying " << h.size() * sizeof(HVT)
-            << " bytes from host to device" << std::endl;
   dh::safe_cuda(hipMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT),
                                hipMemcpyHostToDevice));
 }
@@ -661,7 +657,6 @@ struct PinnedMemory {
  */
 template <typename T>
 typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
-  std::cerr << "Entering SumReduction, nVals: " << nVals << std::endl;
   using ValueT = typename std::iterator_traits<T>::value_type;
   size_t tmpSize {0};
@@ -669,21 +664,14 @@ typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
   try {
     dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
-    std::cerr << "Temporary storage size: " << tmpSize << std::endl;
     TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
     auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;
     dh::safe_cuda(hipcub::DeviceReduce::Sum(
         reinterpret_cast<void *>(ptr), tmpSize, in, reinterpret_cast<ValueT *>(temp.data().get()), nVals));
     ValueT sum;
     dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT), hipMemcpyDeviceToHost));
-    std::cerr << "SumReduction completed successfully" << std::endl;
     return sum;
   } catch (const std::exception& e) {
-    std::cerr << "Exception in SumReduction: " << e.what() << std::endl;
     throw;
   }
 }
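Note: the SumReduction body above keeps hipCUB's usual two-pass reduction idiom, in which the first DeviceReduce::Sum call receives a null temporary-storage pointer and only reports the scratch size required, and the second call performs the actual reduction into the allocated buffer. A minimal standalone sketch of that idiom (buffer names and sizes are illustrative, not taken from the XGBoost sources):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>
#include <vector>

int main() {
  int n = 1024;
  std::vector<float> h_in(n, 1.0f);
  float *d_in = nullptr, *d_out = nullptr;
  hipMalloc(&d_in, n * sizeof(float));
  hipMalloc(&d_out, sizeof(float));
  hipMemcpy(d_in, h_in.data(), n * sizeof(float), hipMemcpyHostToDevice);

  // Pass 1: null storage pointer -> hipCUB only reports the scratch size it needs.
  void *d_temp = nullptr;
  size_t temp_bytes = 0;
  hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  // Pass 2: run the actual reduction with the allocated scratch buffer.
  hipMalloc(&d_temp, temp_bytes);
  hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);

  float sum = 0.0f;
  hipMemcpy(&sum, d_out, sizeof(float), hipMemcpyDeviceToHost);
  std::printf("sum = %f\n", sum);  // expect 1024

  hipFree(d_temp);
  hipFree(d_out);
  hipFree(d_in);
  return 0;
}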
@@ -971,10 +959,8 @@ size_t SegmentedUniqueByKey(
 template <typename Policy, typename InputIt, typename Init, typename Func>
 auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
-  std::cerr << "Entering Reduce function" << std::endl;
   size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
   size_t size = std::distance(first, second);
-  std::cerr << "Total size for reduction: " << size << std::endl;
   using Ty = std::remove_cv_t<Init>;
   Ty aggregate = init;
@@ -984,35 +970,21 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
     auto end_it = first + std::min(offset + kLimit, size);
     size_t batch_size = std::distance(begin_it, end_it);
     CHECK_LE(batch_size, size);
-    std::cerr << "Processing batch: offset=" << offset << ", batch_size=" << batch_size << std::endl;
     try {
       // Print the iterator types
-      std::cerr << "Iterator types - begin: " << typeid(begin_it).name()
-                << ", end: " << typeid(end_it).name() << std::endl;
       auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
       aggregate = reduce_op(aggregate, ret);
-      std::cerr << "Batch reduction completed successfully" << std::endl;
     } catch (const thrust::system_error& e) {
-      std::cerr << "Thrust system error in reduce: " << e.what() << std::endl;
-      std::cerr << "Error code: " << e.code() << std::endl;
       throw;
     } catch (const std::exception& e) {
-      std::cerr << "Exception in thrust::reduce: " << e.what() << std::endl;
       throw;
     }
     // Check for any HIP errors after the reduction
     hipError_t hip_err = hipGetLastError();
     if (hip_err != hipSuccess) {
-      std::cerr << "HIP error after reduction: " << hipGetErrorString(hip_err) << std::endl;
     }
   }
-  std::cerr << "Exiting Reduce function" << std::endl;
   return aggregate;
 }
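Note: the batching loop in Reduce above caps each thrust::reduce call at kLimit elements (half of the int32_t maximum), presumably to stay clear of 32-bit indexing limits inside Thrust, and folds each partial result into a running aggregate. A rough self-contained sketch of the same chunking idea (kLimit mirrors the value in the hunk; everything else is illustrative):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// Chunked reduction: never hand thrust::reduce more than kLimit elements at once,
// then combine the per-chunk sums on the host.
double ChunkedSum(thrust::device_vector<double> const &data) {
  std::size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
  std::size_t size = data.size();
  double aggregate = 0.0;
  for (std::size_t offset = 0; offset < size; offset += kLimit) {
    auto begin = data.begin() + offset;
    auto end = data.begin() + std::min(offset + kLimit, size);
    aggregate += thrust::reduce(thrust::device, begin, end, 0.0);
  }
  return aggregate;
}

int main() {
  thrust::device_vector<double> v(1 << 20, 1.0);
  std::printf("sum = %f\n", ChunkedSum(v));  // expect 1048576
  return 0;
}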

View File

@@ -62,50 +62,35 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
 }
 bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
-  std::cerr << "Entering IsCudaPtr with ptr: " << ptr << std::endl;
   if (!ptr) {
-    std::cerr << "Pointer is null, returning false" << std::endl;
     return false;
   }
   // Check if the pointer is within the process's address space
   uintptr_t ptr_value = reinterpret_cast<uintptr_t>(ptr);
   uintptr_t process_max_addr = (uintptr_t)-1;
-  std::cerr << "Pointer value: " << ptr_value << ", Max address: " << process_max_addr << std::endl;
 #if defined(XGBOOST_USE_HIP)
   hipPointerAttribute_t attr;
-  std::cerr << "Calling hipPointerGetAttributes" << std::endl;
   auto err = hipPointerGetAttributes(&attr, ptr);
-  std::cerr << "hipPointerGetAttributes returned: " << hipGetErrorString(err) << std::endl;
   if (err == hipErrorInvalidValue) {
-    std::cerr << "Invalid pointer (hipErrorInvalidValue), returning false" << std::endl;
     return false;
   } else if (err == hipSuccess) {
-    std::cerr << "Pointer attributes obtained successfully" << std::endl;
-    std::cerr << "Memory type: " << attr.type << std::endl;
     switch (attr.type) {
       case hipMemoryTypeUnregistered:
-        std::cerr << "Memory type is Unregistered, returning false" << std::endl;
         return false;
       case hipMemoryTypeHost:
-        std::cerr << "Memory type is Host, returning false" << std::endl;
         return false;
       case hipMemoryTypeDevice:
-        std::cerr << "Memory type is Device, returning true" << std::endl;
         return true;
      case hipMemoryTypeManaged:
-        std::cerr << "Memory type is Managed, returning true" << std::endl;
         return true;
       default:
-        std::cerr << "Unknown memory type: " << attr.type << std::endl;
         return false;
     }
   } else {
-    std::cerr << "hipPointerGetAttributes failed with error: "
-              << hipGetErrorString(err) << std::endl;
     return false;
   }
 #elif defined(XGBOOST_USE_CUDA)
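Note: IsCudaPtr above keys off hipPointerGetAttributes to decide whether a raw pointer refers to device or managed memory. A small standalone check along the same lines, assuming a ROCm toolchain; the helper name here is made up for illustration:

#include <hip/hip_runtime.h>
#include <cstdio>

// True only when hipPointerGetAttributes classifies the pointer as device or
// managed memory; host, unregistered, and unrecognized pointers map to false.
bool LooksLikeDevicePtr(void const *ptr) {
  if (!ptr) return false;
  hipPointerAttribute_t attr;
  hipError_t err = hipPointerGetAttributes(&attr, ptr);
  if (err != hipSuccess) {
    (void)hipGetLastError();  // clear the sticky error left by the failed query
    return false;
  }
  return attr.type == hipMemoryTypeDevice || attr.type == hipMemoryTypeManaged;
}

int main() {
  float host_val = 0.0f;
  float *dev_ptr = nullptr;
  hipMalloc(&dev_ptr, sizeof(float));
  std::printf("host: %d, device: %d\n",
              LooksLikeDevicePtr(&host_val), LooksLikeDevicePtr(dev_ptr));
  hipFree(dev_ptr);
  return 0;
}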

View File

@@ -1265,54 +1265,42 @@ class LearnerImpl : public LearnerIO {
   }
   void UpdateOneIter(int iter, std::shared_ptr<DMatrix> train) override {
-    std::cerr << "Entering UpdateOneIter, iteration: " << iter << std::endl;
     monitor_.Start("UpdateOneIter");
     TrainingObserver::Instance().Update(iter);
-    std::cerr << "Configuring..." << std::endl;
     this->Configure();
-    std::cerr << "Initializing base score..." << std::endl;
     this->InitBaseScore(train.get());
     if (ctx_.seed_per_iteration) {
-      std::cerr << "Setting seed for iteration..." << std::endl;
       common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
     }
-    std::cerr << "Validating DMatrix..." << std::endl;
     this->ValidateDMatrix(train.get(), true);
-    std::cerr << "Caching predictions..." << std::endl;
     auto& predt = prediction_container_.Cache(train, ctx_.Device());
     monitor_.Start("PredictRaw");
-    std::cerr << "Predicting raw values..." << std::endl;
     this->PredictRaw(train.get(), &predt, true, 0, 0);
     TrainingObserver::Instance().Observe(predt.predictions, "Predictions");
     monitor_.Stop("PredictRaw");
     monitor_.Start("GetGradient");
-    std::cerr << "Getting gradients..." << std::endl;
     try {
       GetGradient(predt.predictions, train->Info(), iter, &gpair_);
     } catch (const std::exception& e) {
-      std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
       throw;
     }
     monitor_.Stop("GetGradient");
     TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");
-    std::cerr << "Performing boosting..." << std::endl;
     try {
       gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get());
     } catch (const std::exception& e) {
-      std::cerr << "Exception in DoBoost: " << e.what() << std::endl;
       throw;
     }
     monitor_.Stop("UpdateOneIter");
-    std::cerr << "Exiting UpdateOneIter" << std::endl;
   }
   void BoostOneIter(int iter, std::shared_ptr<DMatrix> train,

View File

@@ -68,31 +68,15 @@ class RegLossObj : public FitIntercept {
  public:
   void ValidateLabel(MetaInfo const& info) {
-    std::cerr << "Entering ValidateLabel function" << std::endl;
-    std::cerr << "Number of rows: " << info.num_row_ << std::endl;
-    std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl;
-    // Check GPU memory
-    size_t free, total;
-    if (hipMemGetInfo(&free, &total) == hipSuccess) {
-      std::cerr << "GPU Memory - Free: " << free << ", Total: " << total << std::endl;
-    } else {
-      std::cerr << "Failed to get GPU memory info" << std::endl;
-    }
     auto label = info.labels.View(ctx_->Device());
-    std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl;
-    std::cerr << "Label data pointer: " << label.Values().data() << std::endl;
     bool valid = false;
     try {
       valid = ctx_->DispatchDevice(
           [&] {
-            std::cerr << "Validating labels on CPU" << std::endl;
             return std::all_of(linalg::cbegin(label), linalg::cend(label),
                                [](float y) -> bool {
                                  if (!std::isfinite(y)) {
-                                   std::cerr << "Non-finite label value found: " << y << std::endl;
                                    return false;
                                  }
                                  return Loss::CheckLabel(y);
@@ -100,9 +84,7 @@ void ValidateLabel(MetaInfo const& info) {
           },
           [&] {
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
-            std::cerr << "Validating labels on GPU" << std::endl;
             auto cuctx = ctx_->CUDACtx();
-            std::cerr << "CUDA context pointer: " << cuctx << std::endl;
             auto it = dh::MakeTransformIterator<bool>(
                 thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
@@ -115,12 +97,9 @@ void ValidateLabel(MetaInfo const& info) {
                   return Loss::CheckLabel(y);
                 });
-            std::cerr << "Starting GPU reduction" << std::endl;
             bool result = dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
-            std::cerr << "GPU reduction completed" << std::endl;
             return result;
 #else
-            std::cerr << "GPU support not enabled" << std::endl;
             common::AssertGPUSupport();
             return false;
 #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -130,17 +109,10 @@ void ValidateLabel(MetaInfo const& info) {
       valid = false;
     }
-    std::cerr << "Label validation result: " << (valid ? "Valid" : "Invalid") << std::endl;
     if (!valid) {
-      std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl;
-      // Print GPU error info
       hipError_t error = hipGetLastError();
-      std::cerr << "Last GPU error: " << hipGetErrorString(error) << std::endl;
       LOG(FATAL) << Loss::LabelErrorMsg();
     }
-    std::cerr << "Exiting ValidateLabel function" << std::endl;
   }
   // 0 - scale_pos_weight, 1 - is_null_weight
   RegLossObj(): additional_input_(2) {}
@@ -676,40 +648,18 @@ class MeanAbsoluteError : public ObjFunction {
   void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
                    std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) override {
-    std::cerr << "Entering GetGradient, iteration: " << iter << std::endl;
     try {
-      GPU_CHECK_LAST(); // Check for any previous GPU errors
-      std::cerr << "Checking regression inputs..." << std::endl;
       CheckRegInputs(info, preds);
-      std::cerr << "Setting up labels..." << std::endl;
       auto labels = info.labels.View(ctx_->Device());
-      std::cerr << "Labels shape: " << labels.Shape()[0] << "x" << labels.Shape()[1] << std::endl;
-      std::cerr << "Setting up output gradient pairs..." << std::endl;
       out_gpair->SetDevice(ctx_->Device());
       out_gpair->Reshape(info.num_row_, this->Targets(info));
       auto gpair = out_gpair->View(ctx_->Device());
-      std::cerr << "Gradient pairs shape: " << gpair.Shape()[0] << "x" << gpair.Shape()[1] << std::endl;
-      GPU_CHECK_LAST(); // Check for GPU errors after memory operations
-      std::cerr << "Setting up predictions..." << std::endl;
       preds.SetDevice(ctx_->Device());
       auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
-      std::cerr << "Predictions shape: " << predt.Shape()[0] << "x" << predt.Shape()[1] << std::endl;
-      std::cerr << "Setting up weights..." << std::endl;
       info.weights_.SetDevice(ctx_->Device());
       common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                      : info.weights_.ConstHostSpan()};
-      std::cerr << "Weights size: " << weight.Size() << std::endl;
-      GPU_CHECK_LAST(); // Check for GPU errors before kernel launch
-      std::cerr << "Running ElementWiseKernel..." << std::endl;
       linalg::ElementWiseKernel(
           ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
             auto sign = [](auto x) {
@@ -721,16 +671,11 @@ void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
             gpair(i, j) = GradientPair{grad, hess};
           });
-      GPU_CHECK_LAST(); // Check for GPU errors after kernel execution
-      std::cerr << "ElementWiseKernel completed successfully" << std::endl;
     } catch (const std::exception& e) {
       std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
       GPU_CHECK_LAST(); // Check for GPU errors in case of exception
       throw;
     }
-    std::cerr << "Exiting GetGradient" << std::endl;
   }
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_margin) const override {