get gradient error logging
This commit is contained in:
parent
ab41cd26a6
commit
f50d5344f3
@ -39,6 +39,7 @@
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
#include <exception>
|
||||
#include "common/gpu_error_check.h"
|
||||
|
||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||
#include "../common/cuda_context.cuh" // for CUDAContext
|
||||
@ -655,21 +656,44 @@ class MeanAbsoluteError : public ObjFunction {
|
||||
return std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
|
||||
}
|
||||
|
||||
void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
|
||||
std::int32_t /*iter*/, linalg::Matrix<GradientPair>* out_gpair) override {
|
||||
CheckRegInputs(info, preds);
|
||||
auto labels = info.labels.View(ctx_->Device());
|
||||
#include "common/gpu_error_check.h"
|
||||
|
||||
void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
|
||||
std::int32_t iter, linalg::Matrix<GradientPair>* out_gpair) override {
|
||||
std::cerr << "Entering GetGradient, iteration: " << iter << std::endl;
|
||||
|
||||
try {
|
||||
GPU_CHECK_LAST(); // Check for any previous GPU errors
|
||||
|
||||
std::cerr << "Checking regression inputs..." << std::endl;
|
||||
CheckRegInputs(info, preds);
|
||||
|
||||
std::cerr << "Setting up labels..." << std::endl;
|
||||
auto labels = info.labels.View(ctx_->Device());
|
||||
std::cerr << "Labels shape: " << labels.Shape()[0] << "x" << labels.Shape()[1] << std::endl;
|
||||
|
||||
std::cerr << "Setting up output gradient pairs..." << std::endl;
|
||||
out_gpair->SetDevice(ctx_->Device());
|
||||
out_gpair->Reshape(info.num_row_, this->Targets(info));
|
||||
auto gpair = out_gpair->View(ctx_->Device());
|
||||
std::cerr << "Gradient pairs shape: " << gpair.Shape()[0] << "x" << gpair.Shape()[1] << std::endl;
|
||||
|
||||
GPU_CHECK_LAST(); // Check for GPU errors after memory operations
|
||||
|
||||
std::cerr << "Setting up predictions..." << std::endl;
|
||||
preds.SetDevice(ctx_->Device());
|
||||
auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
|
||||
std::cerr << "Predictions shape: " << predt.Shape()[0] << "x" << predt.Shape()[1] << std::endl;
|
||||
|
||||
std::cerr << "Setting up weights..." << std::endl;
|
||||
info.weights_.SetDevice(ctx_->Device());
|
||||
common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
|
||||
: info.weights_.ConstHostSpan()};
|
||||
std::cerr << "Weights size: " << weight.Size() << std::endl;
|
||||
|
||||
GPU_CHECK_LAST(); // Check for GPU errors before kernel launch
|
||||
|
||||
std::cerr << "Running ElementWiseKernel..." << std::endl;
|
||||
linalg::ElementWiseKernel(
|
||||
ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
|
||||
auto sign = [](auto x) {
|
||||
@ -680,8 +704,19 @@ class MeanAbsoluteError : public ObjFunction {
|
||||
auto grad = sign(predt(i, j) - y) * hess;
|
||||
gpair(i, j) = GradientPair{grad, hess};
|
||||
});
|
||||
|
||||
GPU_CHECK_LAST(); // Check for GPU errors after kernel execution
|
||||
|
||||
std::cerr << "ElementWiseKernel completed successfully" << std::endl;
|
||||
} catch (const std::exception& e) {
|
||||
std::cerr << "Exception in GetGradient: " << e.what() << std::endl;
|
||||
GPU_CHECK_LAST(); // Check for GPU errors in case of exception
|
||||
throw;
|
||||
}
|
||||
|
||||
std::cerr << "Exiting GetGradient" << std::endl;
|
||||
}
|
||||
|
||||
void InitEstimation(MetaInfo const& info, linalg::Tensor<float, 1>* base_margin) const override {
|
||||
CheckInitInputs(info);
|
||||
base_margin->Reshape(this->Targets(info));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user