diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu
index 9ae9b076e..bc6f14c4e 100644
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -42,3 +42,5 @@
 
+#include <iostream>                     // for std::cerr (debug logging below)
+#include "../common/gpu_error_check.h"  // for GPU_CHECK_LAST
 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
 #include "../common/cuda_context.cuh"  // for CUDAContext
@@ -655,21 +657,42 @@ class MeanAbsoluteError : public ObjFunction {
     return std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
   }
 
-  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
-                   std::int32_t /*iter*/, linalg::Matrix<GradientPair>* out_gpair) override {
-    CheckRegInputs(info, preds);
-    auto labels = info.labels.View(ctx_->Device());
+  // NOTE(review): temporary debug instrumentation -- logs each phase to stderr and
+  // polls for pending GPU errors between phases; strip before merging.
+  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
+                   std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) override {
+    std::cerr << "Entering GetGradient, iteration: " << iter << std::endl;
+
+    try {
+    GPU_CHECK_LAST();  // surface any error left pending by earlier GPU calls
+
+    std::cerr << "Checking regression inputs..." << std::endl;
+    CheckRegInputs(info, preds);
+
+    std::cerr << "Setting up labels..." << std::endl;
+    auto labels = info.labels.View(ctx_->Device());
+    std::cerr << "Labels shape: " << labels.Shape()[0] << "x" << labels.Shape()[1] << std::endl;
 
+    std::cerr << "Setting up output gradient pairs..." << std::endl;
     out_gpair->SetDevice(ctx_->Device());
     out_gpair->Reshape(info.num_row_, this->Targets(info));
     auto gpair = out_gpair->View(ctx_->Device());
+    std::cerr << "Gradient pairs shape: " << gpair.Shape()[0] << "x" << gpair.Shape()[1] << std::endl;
+    GPU_CHECK_LAST();  // re-check after device memory operations
 
+    std::cerr << "Setting up predictions..." << std::endl;
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
+    std::cerr << "Predictions shape: " << predt.Shape()[0] << "x" << predt.Shape()[1] << std::endl;
+
+    std::cerr << "Setting up weights..." << std::endl;
     info.weights_.SetDevice(ctx_->Device());
     common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                   : info.weights_.ConstHostSpan()};
+    std::cerr << "Weights size: " << weight.Size() << std::endl;  // NOTE(review): confirm OptionalWeights exposes Size()
+    GPU_CHECK_LAST();  // re-check before the kernel launch
 
+    std::cerr << "Running ElementWiseKernel..." << std::endl;
     linalg::ElementWiseKernel(
         ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
           auto sign = [](auto x) {
@@ -680,8 +703,18 @@ class MeanAbsoluteError : public ObjFunction {
           auto grad = sign(predt(i, j) - y) * hess;
           gpair(i, j) = GradientPair{grad, hess};
         });
+
+    GPU_CHECK_LAST();  // surface asynchronous kernel-execution errors
+
+    std::cerr << "ElementWiseKernel completed successfully" << std::endl;
+    } catch (const std::exception& e) {
+      std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
+      GPU_CHECK_LAST();  // report GPU state even on the failure path
+      throw;  // rethrow -- instrumentation must not swallow the error
+    }
+    std::cerr << "Exiting GetGradient" << std::endl;
   }
 
   void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_margin) const override {
     CheckInitInputs(info);
     base_margin->Reshape(this->Targets(info));