remove logging
parent 9659d0e7bd
commit 20a9c223b6
@@ -260,7 +260,7 @@ if (USE_HIP)
  set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wunused-result -w")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${HIP_INCLUDE_DIRS}")
  set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
  #set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
  add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap)
endif (USE_HIP)
@@ -579,8 +579,6 @@ xgboost::common::Span<T> LazyResize(xgboost::Context const *ctx,
template <typename T>
void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
  CHECK_EQ(dst->size(), src.size());
  std::cerr << "CopyDeviceSpanToVector: Copying " << src.size() * sizeof(T)
            << " bytes from device to host" << std::endl;
  dh::safe_cuda(hipMemcpyAsync(dst->data(), src.data(), dst->size() * sizeof(T),
                               hipMemcpyDeviceToHost));
}
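The hunk counts above (-579,8 to +579,6) indicate that only the two std::cerr lines are dropped. For reference, a minimal sketch of how the helper presumably reads after this commit; the CopyToD hunk below follows the same pattern in the host-to-device direction:

template <typename T>
void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<T> src) {
  CHECK_EQ(dst->size(), src.size());
  // Async device-to-host copy; dh::safe_cuda turns a failing hipError_t into an exception.
  dh::safe_cuda(hipMemcpyAsync(dst->data(), src.data(), dst->size() * sizeof(T),
                               hipMemcpyDeviceToHost));
}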
@@ -609,8 +607,6 @@ void CopyToD(HContainer const &h, DContainer *d) {
  using HVT = std::remove_cv_t<typename HContainer::value_type>;
  using DVT = std::remove_cv_t<typename DContainer::value_type>;
  static_assert(std::is_same<HVT, DVT>::value, "Host and device containers must have same value type.");
  std::cerr << "CopyToD: Copying " << h.size() * sizeof(HVT)
            << " bytes from host to device" << std::endl;
  dh::safe_cuda(hipMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT),
                               hipMemcpyHostToDevice));
}
@@ -661,7 +657,6 @@ struct PinnedMemory {
 */
template <typename T>
typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
  std::cerr << "Entering SumReduction, nVals: " << nVals << std::endl;
  using ValueT = typename std::iterator_traits<T>::value_type;

  size_t tmpSize {0};
@@ -669,21 +664,14 @@ typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {

  try {
    dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
    std::cerr << "Temporary storage size: " << tmpSize << std::endl;

    TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
    auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;

    dh::safe_cuda(hipcub::DeviceReduce::Sum(
        reinterpret_cast<void *>(ptr), tmpSize, in, reinterpret_cast<ValueT *>(temp.data().get()), nVals));

    ValueT sum;
    dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT), hipMemcpyDeviceToHost));

    std::cerr << "SumReduction completed successfully" << std::endl;
    return sum;
  } catch (const std::exception& e) {
    std::cerr << "Exception in SumReduction: " << e.what() << std::endl;
    throw;
  }
}
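SumReduction follows CUB's usual two-pass pattern: a first call with a null workspace queries the temporary-storage size, then the real reduction runs into the allocated buffer. A sketch of the function with the logging and the try/catch wrapper stripped (the counts -669,21 to +664,14 suggest seven such lines go away); note that dummy_out is declared outside the hunk and is assumed here to be a null output pointer used only for the size query:

template <typename T>
typename std::iterator_traits<T>::value_type SumReduction(T in, int nVals) {
  using ValueT = typename std::iterator_traits<T>::value_type;
  size_t tmpSize{0};
  ValueT *dummy_out = nullptr;  // assumption: size-query pass only needs a typed null output
  // First pass: query the temporary-storage size (null workspace).
  dh::safe_cuda(hipcub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
  // Allocate the workspace plus one ValueT slot for the result, then run the real reduction.
  TemporaryArray<char> temp(tmpSize + sizeof(ValueT));
  auto ptr = reinterpret_cast<ValueT *>(temp.data().get()) + 1;
  dh::safe_cuda(hipcub::DeviceReduce::Sum(
      reinterpret_cast<void *>(ptr), tmpSize, in,
      reinterpret_cast<ValueT *>(temp.data().get()), nVals));
  ValueT sum;
  dh::safe_cuda(hipMemcpy(&sum, temp.data().get(), sizeof(ValueT), hipMemcpyDeviceToHost));
  return sum;
}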
@@ -971,10 +959,8 @@ size_t SegmentedUniqueByKey(

template <typename Policy, typename InputIt, typename Init, typename Func>
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
  std::cerr << "Entering Reduce function" << std::endl;
  size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
  size_t size = std::distance(first, second);
  std::cerr << "Total size for reduction: " << size << std::endl;

  using Ty = std::remove_cv_t<Init>;
  Ty aggregate = init;
@@ -984,35 +970,21 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
    auto end_it = first + std::min(offset + kLimit, size);
    size_t batch_size = std::distance(begin_it, end_it);
    CHECK_LE(batch_size, size);

    std::cerr << "Processing batch: offset=" << offset << ", batch_size=" << batch_size << std::endl;

    try {
      // Print the iterator types
      std::cerr << "Iterator types - begin: " << typeid(begin_it).name()
                << ", end: " << typeid(end_it).name() << std::endl;

      auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
      aggregate = reduce_op(aggregate, ret);

      std::cerr << "Batch reduction completed successfully" << std::endl;
    } catch (const thrust::system_error& e) {
      std::cerr << "Thrust system error in reduce: " << e.what() << std::endl;
      std::cerr << "Error code: " << e.code() << std::endl;
      throw;
    } catch (const std::exception& e) {
      std::cerr << "Exception in thrust::reduce: " << e.what() << std::endl;
      throw;
    }

    // Check for any HIP errors after the reduction
    hipError_t hip_err = hipGetLastError();
    if (hip_err != hipSuccess) {
      std::cerr << "HIP error after reduction: " << hipGetErrorString(hip_err) << std::endl;
    }
  }

  std::cerr << "Exiting Reduce function" << std::endl;
  return aggregate;
}
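dh::Reduce folds the input in batches of kLimit because thrust::reduce is effectively limited to 32-bit element counts. A sketch of the de-logged loop; the for-loop header and the begin_it line sit above the hunk and are reconstructed here, so treat them as assumptions:

template <typename Policy, typename InputIt, typename Init, typename Func>
auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
  // thrust::reduce indexes with 32-bit offsets, so fold the range in chunks.
  size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
  size_t size = std::distance(first, second);
  using Ty = std::remove_cv_t<Init>;
  Ty aggregate = init;
  for (size_t offset = 0; offset < size; offset += kLimit) {  // assumption: loop header not shown in the hunk
    auto begin_it = first + offset;
    auto end_it = first + std::min(offset + kLimit, size);
    size_t batch_size = std::distance(begin_it, end_it);
    CHECK_LE(batch_size, size);
    // Reduce one chunk on the device, then fold it into the running aggregate.
    auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
    aggregate = reduce_op(aggregate, ret);
  }
  return aggregate;
}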
@@ -62,50 +62,35 @@ void ArrayInterfaceHandler::SyncCudaStream(std::int64_t stream) {
}

bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
  std::cerr << "Entering IsCudaPtr with ptr: " << ptr << std::endl;
  if (!ptr) {
    std::cerr << "Pointer is null, returning false" << std::endl;
    return false;
  }

  // Check if the pointer is within the process's address space
  uintptr_t ptr_value = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t process_max_addr = (uintptr_t)-1;
  std::cerr << "Pointer value: " << ptr_value << ", Max address: " << process_max_addr << std::endl;

#if defined(XGBOOST_USE_HIP)
  hipPointerAttribute_t attr;
  std::cerr << "Calling hipPointerGetAttributes" << std::endl;
  auto err = hipPointerGetAttributes(&attr, ptr);

  std::cerr << "hipPointerGetAttributes returned: " << hipGetErrorString(err) << std::endl;

  if (err == hipErrorInvalidValue) {
    std::cerr << "Invalid pointer (hipErrorInvalidValue), returning false" << std::endl;
    return false;
  } else if (err == hipSuccess) {
    std::cerr << "Pointer attributes obtained successfully" << std::endl;
    std::cerr << "Memory type: " << attr.type << std::endl;
    switch (attr.type) {
      case hipMemoryTypeUnregistered:
        std::cerr << "Memory type is Unregistered, returning false" << std::endl;
        return false;
      case hipMemoryTypeHost:
        std::cerr << "Memory type is Host, returning false" << std::endl;
        return false;
      case hipMemoryTypeDevice:
        std::cerr << "Memory type is Device, returning true" << std::endl;
        return true;
      case hipMemoryTypeManaged:
        std::cerr << "Memory type is Managed, returning true" << std::endl;
        return true;
      default:
        std::cerr << "Unknown memory type: " << attr.type << std::endl;
        return false;
    }
  } else {
    std::cerr << "hipPointerGetAttributes failed with error: "
              << hipGetErrorString(err) << std::endl;
    return false;
  }
#elif defined(XGBOOST_USE_CUDA)
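Here the counts (-62,50 to +62,35) match the fifteen std::cerr lines exactly, so the change appears to be pure logging removal. A condensed sketch of the HIP branch afterwards; the ptr_value and process_max_addr locals that only fed the removed logging are left out, and the CUDA branch lies outside this hunk:

bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) {
  if (!ptr) {
    return false;
  }
#if defined(XGBOOST_USE_HIP)
  // Ask the HIP runtime what kind of allocation backs the pointer.
  hipPointerAttribute_t attr;
  auto err = hipPointerGetAttributes(&attr, ptr);
  if (err == hipErrorInvalidValue) {
    return false;  // not a pointer the runtime knows about
  } else if (err == hipSuccess) {
    switch (attr.type) {
      case hipMemoryTypeDevice:
      case hipMemoryTypeManaged:
        return true;
      default:  // unregistered, host, or unknown
        return false;
    }
  } else {
    return false;
  }
#elif defined(XGBOOST_USE_CUDA)
  // CUDA branch unchanged by this commit (outside the hunk).
#endif
  return false;
}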
@@ -1265,54 +1265,42 @@ class LearnerImpl : public LearnerIO {
  }

  void UpdateOneIter(int iter, std::shared_ptr<DMatrix> train) override {
    std::cerr << "Entering UpdateOneIter, iteration: " << iter << std::endl;
    monitor_.Start("UpdateOneIter");
    TrainingObserver::Instance().Update(iter);

    std::cerr << "Configuring..." << std::endl;
    this->Configure();

    std::cerr << "Initializing base score..." << std::endl;
    this->InitBaseScore(train.get());

    if (ctx_.seed_per_iteration) {
      std::cerr << "Setting seed for iteration..." << std::endl;
      common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
    }

    std::cerr << "Validating DMatrix..." << std::endl;
    this->ValidateDMatrix(train.get(), true);

    std::cerr << "Caching predictions..." << std::endl;
    auto& predt = prediction_container_.Cache(train, ctx_.Device());

    monitor_.Start("PredictRaw");
    std::cerr << "Predicting raw values..." << std::endl;
    this->PredictRaw(train.get(), &predt, true, 0, 0);
    TrainingObserver::Instance().Observe(predt.predictions, "Predictions");
    monitor_.Stop("PredictRaw");

    monitor_.Start("GetGradient");
    std::cerr << "Getting gradients..." << std::endl;
    try {
      GetGradient(predt.predictions, train->Info(), iter, &gpair_);
    } catch (const std::exception& e) {
      std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
      throw;
    }
    monitor_.Stop("GetGradient");
    TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");

    std::cerr << "Performing boosting..." << std::endl;
    try {
      gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get());
    } catch (const std::exception& e) {
      std::cerr << "Exception in DoBoost: " << e.what() << std::endl;
      throw;
    }

    monitor_.Stop("UpdateOneIter");
    std::cerr << "Exiting UpdateOneIter" << std::endl;
  }

  void BoostOneIter(int iter, std::shared_ptr<DMatrix> train,
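Stripped of the tracing, UpdateOneIter reduces to the usual train-step sequence. A sketch for reference; the catch-and-rethrow wrappers around GetGradient and DoBoost are elided here for brevity:

void UpdateOneIter(int iter, std::shared_ptr<DMatrix> train) override {
  monitor_.Start("UpdateOneIter");
  TrainingObserver::Instance().Update(iter);
  this->Configure();
  this->InitBaseScore(train.get());
  if (ctx_.seed_per_iteration) {
    common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
  }
  this->ValidateDMatrix(train.get(), true);
  auto& predt = prediction_container_.Cache(train, ctx_.Device());

  // Raw predictions feed the objective's gradient computation.
  monitor_.Start("PredictRaw");
  this->PredictRaw(train.get(), &predt, true, 0, 0);
  TrainingObserver::Instance().Observe(predt.predictions, "Predictions");
  monitor_.Stop("PredictRaw");

  monitor_.Start("GetGradient");
  GetGradient(predt.predictions, train->Info(), iter, &gpair_);
  monitor_.Stop("GetGradient");
  TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");

  // Grow new trees from the gradient statistics.
  gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get());
  monitor_.Stop("UpdateOneIter");
}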
@@ -68,31 +68,15 @@ class RegLossObj : public FitIntercept {

 public:
  void ValidateLabel(MetaInfo const& info) {
    std::cerr << "Entering ValidateLabel function" << std::endl;
    std::cerr << "Number of rows: " << info.num_row_ << std::endl;
    std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl;

    // Check GPU memory
    size_t free, total;
    if (hipMemGetInfo(&free, &total) == hipSuccess) {
      std::cerr << "GPU Memory - Free: " << free << ", Total: " << total << std::endl;
    } else {
      std::cerr << "Failed to get GPU memory info" << std::endl;
    }

    auto label = info.labels.View(ctx_->Device());
    std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl;
    std::cerr << "Label data pointer: " << label.Values().data() << std::endl;

    bool valid = false;
    try {
      valid = ctx_->DispatchDevice(
          [&] {
            std::cerr << "Validating labels on CPU" << std::endl;
            return std::all_of(linalg::cbegin(label), linalg::cend(label),
                               [](float y) -> bool {
                                 if (!std::isfinite(y)) {
                                   std::cerr << "Non-finite label value found: " << y << std::endl;
                                   return false;
                                 }
                                 return Loss::CheckLabel(y);
@@ -100,9 +84,7 @@ void ValidateLabel(MetaInfo const& info) {
          },
          [&] {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
            std::cerr << "Validating labels on GPU" << std::endl;
            auto cuctx = ctx_->CUDACtx();
            std::cerr << "CUDA context pointer: " << cuctx << std::endl;

            auto it = dh::MakeTransformIterator<bool>(
                thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
@@ -115,12 +97,9 @@ void ValidateLabel(MetaInfo const& info) {
                  return Loss::CheckLabel(y);
                });

            std::cerr << "Starting GPU reduction" << std::endl;
            bool result = dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
            std::cerr << "GPU reduction completed" << std::endl;
            return result;
#else
            std::cerr << "GPU support not enabled" << std::endl;
            common::AssertGPUSupport();
            return false;
#endif  // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
@@ -130,17 +109,10 @@ void ValidateLabel(MetaInfo const& info) {
      valid = false;
    }

    std::cerr << "Label validation result: " << (valid ? "Valid" : "Invalid") << std::endl;

    if (!valid) {
      std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl;
      // Print GPU error info
      hipError_t error = hipGetLastError();
      std::cerr << "Last GPU error: " << hipGetErrorString(error) << std::endl;
      LOG(FATAL) << Loss::LabelErrorMsg();
    }

    std::cerr << "Exiting ValidateLabel function" << std::endl;
  }
  // 0 - scale_pos_weight, 1 - is_null_weight
  RegLossObj(): additional_input_(2) {}
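The GPU path of ValidateLabel builds a boolean transform iterator over the labels and AND-reduces it on the device with dh::Reduce. A minimal sketch with the logging removed; the device lambda's body sits mostly outside these hunks, so the flat label read shown here is an assumption:

// GPU branch of ValidateLabel with the tracing removed: every label index maps to a
// bool via a transform iterator, and a logical AND reduction collapses them on device.
auto cuctx = ctx_->CUDACtx();
auto it = dh::MakeTransformIterator<bool>(
    thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
      auto y = label.Values()[i];  // assumption: flat indexing into the label view
      if (!isfinite(y)) {
        return false;
      }
      return Loss::CheckLabel(y);
    });
return dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});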
@@ -676,40 +648,18 @@ class MeanAbsoluteError : public ObjFunction {

  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
                   std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) override {
    std::cerr << "Entering GetGradient, iteration: " << iter << std::endl;

    try {
      GPU_CHECK_LAST();  // Check for any previous GPU errors

      std::cerr << "Checking regression inputs..." << std::endl;
      CheckRegInputs(info, preds);

      std::cerr << "Setting up labels..." << std::endl;
      auto labels = info.labels.View(ctx_->Device());
      std::cerr << "Labels shape: " << labels.Shape()[0] << "x" << labels.Shape()[1] << std::endl;

      std::cerr << "Setting up output gradient pairs..." << std::endl;
      out_gpair->SetDevice(ctx_->Device());
      out_gpair->Reshape(info.num_row_, this->Targets(info));
      auto gpair = out_gpair->View(ctx_->Device());
      std::cerr << "Gradient pairs shape: " << gpair.Shape()[0] << "x" << gpair.Shape()[1] << std::endl;

      GPU_CHECK_LAST();  // Check for GPU errors after memory operations

      std::cerr << "Setting up predictions..." << std::endl;
      preds.SetDevice(ctx_->Device());
      auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
      std::cerr << "Predictions shape: " << predt.Shape()[0] << "x" << predt.Shape()[1] << std::endl;

      std::cerr << "Setting up weights..." << std::endl;
      info.weights_.SetDevice(ctx_->Device());
      common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                    : info.weights_.ConstHostSpan()};
      std::cerr << "Weights size: " << weight.Size() << std::endl;

      GPU_CHECK_LAST();  // Check for GPU errors before kernel launch

      std::cerr << "Running ElementWiseKernel..." << std::endl;
      linalg::ElementWiseKernel(
          ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
            auto sign = [](auto x) {
@@ -721,16 +671,11 @@ void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
            gpair(i, j) = GradientPair{grad, hess};
          });

      GPU_CHECK_LAST();  // Check for GPU errors after kernel execution

      std::cerr << "ElementWiseKernel completed successfully" << std::endl;
    } catch (const std::exception& e) {
      std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
      GPU_CHECK_LAST();  // Check for GPU errors in case of exception
      throw;
    }

    std::cerr << "Exiting GetGradient" << std::endl;
  }

  void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_margin) const override {
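After the removal, MeanAbsoluteError::GetGradient should reduce to reshaping the output matrix and launching the element-wise kernel. A sketch for orientation; the sign lambda's body and the grad/hess expressions fall outside the shown hunks, so those lines are assumptions about the usual weighted-MAE gradient rather than a transcript of the upstream code:

void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
                 std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) override {
  CheckRegInputs(info, preds);
  auto labels = info.labels.View(ctx_->Device());

  out_gpair->SetDevice(ctx_->Device());
  out_gpair->Reshape(info.num_row_, this->Targets(info));
  auto gpair = out_gpair->View(ctx_->Device());

  preds.SetDevice(ctx_->Device());
  auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
  info.weights_.SetDevice(ctx_->Device());
  common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                : info.weights_.ConstHostSpan()};

  linalg::ElementWiseKernel(
      ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
        auto sign = [](auto x) { return x > static_cast<decltype(x)>(0) ? 1 : -1; };  // assumption
        auto y = labels(i, j);
        auto hess = weight[i];                     // assumption: sample weight as hessian
        auto grad = sign(predt(i, j) - y) * hess;  // assumption: signed residual times weight
        gpair(i, j) = GradientPair{grad, hess};
      });
}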