diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index c1c122648..c7a33cb02 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -72,8 +72,17 @@ void ValidateLabel(MetaInfo const& info) { std::cerr << "Number of rows: " << info.num_row_ << std::endl; std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl; + // Check GPU memory + size_t free, total; + if (hipMemGetInfo(&free, &total) == hipSuccess) { + std::cerr << "GPU Memory - Free: " << free << ", Total: " << total << std::endl; + } else { + std::cerr << "Failed to get GPU memory info" << std::endl; + } + auto label = info.labels.View(ctx_->Device()); std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl; + std::cerr << "Label data pointer: " << label.Values().data() << std::endl; bool valid = false; try { @@ -93,6 +102,8 @@ void ValidateLabel(MetaInfo const& info) { #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) std::cerr << "Validating labels on GPU" << std::endl; auto cuctx = ctx_->CUDACtx(); + std::cerr << "CUDA context pointer: " << cuctx << std::endl; + auto it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool { auto [m, n] = linalg::UnravelIndex(i, label.Shape()); @@ -103,7 +114,11 @@ void ValidateLabel(MetaInfo const& info) { } return Loss::CheckLabel(y); }); - return dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{}); + + std::cerr << "Starting GPU reduction" << std::endl; + bool result = dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{}); + std::cerr << "GPU reduction completed" << std::endl; + return result; #else std::cerr << "GPU support not enabled" << std::endl; common::AssertGPUSupport(); @@ -119,6 +134,9 @@ void ValidateLabel(MetaInfo const& info) { if (!valid) { std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl; + // Print GPU error info + hipError_t error = hipGetLastError(); + std::cerr << "Last GPU error: " << hipGetErrorString(error) << std::endl; LOG(FATAL) << Loss::LabelErrorMsg(); }