Add debug logging to ValidateLabel (GPU memory, label pointer, reduction progress, last GPU error)

This commit is contained in:
Hendrik Groove 2024-10-20 18:11:03 +02:00
parent f54355f470
commit 4a10135006

View File

@ -72,8 +72,17 @@ void ValidateLabel(MetaInfo const& info) {
std::cerr << "Number of rows: " << info.num_row_ << std::endl; std::cerr << "Number of rows: " << info.num_row_ << std::endl;
std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl; std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl;
// Check GPU memory
size_t free, total;
if (hipMemGetInfo(&free, &total) == hipSuccess) {
std::cerr << "GPU Memory - Free: " << free << ", Total: " << total << std::endl;
} else {
std::cerr << "Failed to get GPU memory info" << std::endl;
}
auto label = info.labels.View(ctx_->Device()); auto label = info.labels.View(ctx_->Device());
std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl; std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl;
std::cerr << "Label data pointer: " << label.Values().data() << std::endl;
bool valid = false; bool valid = false;
try { try {
@ -93,6 +102,8 @@ void ValidateLabel(MetaInfo const& info) {
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
std::cerr << "Validating labels on GPU" << std::endl; std::cerr << "Validating labels on GPU" << std::endl;
auto cuctx = ctx_->CUDACtx(); auto cuctx = ctx_->CUDACtx();
std::cerr << "CUDA context pointer: " << cuctx << std::endl;
auto it = dh::MakeTransformIterator<bool>( auto it = dh::MakeTransformIterator<bool>(
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool { thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
auto [m, n] = linalg::UnravelIndex(i, label.Shape()); auto [m, n] = linalg::UnravelIndex(i, label.Shape());
@ -103,7 +114,11 @@ void ValidateLabel(MetaInfo const& info) {
} }
return Loss::CheckLabel(y); return Loss::CheckLabel(y);
}); });
return dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
std::cerr << "Starting GPU reduction" << std::endl;
bool result = dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
std::cerr << "GPU reduction completed" << std::endl;
return result;
#else #else
std::cerr << "GPU support not enabled" << std::endl; std::cerr << "GPU support not enabled" << std::endl;
common::AssertGPUSupport(); common::AssertGPUSupport();
@ -119,6 +134,9 @@ void ValidateLabel(MetaInfo const& info) {
if (!valid) { if (!valid) {
std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl; std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl;
// Print GPU error info
hipError_t error = hipGetLastError();
std::cerr << "Last GPU error: " << hipGetErrorString(error) << std::endl;
LOG(FATAL) << Loss::LabelErrorMsg(); LOG(FATAL) << Loss::LabelErrorMsg();
} }