validate label debug
This commit is contained in:
parent
f54355f470
commit
4a10135006
@ -72,8 +72,17 @@ void ValidateLabel(MetaInfo const& info) {
|
|||||||
std::cerr << "Number of rows: " << info.num_row_ << std::endl;
|
std::cerr << "Number of rows: " << info.num_row_ << std::endl;
|
||||||
std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl;
|
std::cerr << "Label shape: " << info.labels.Shape()[0] << "x" << info.labels.Shape()[1] << std::endl;
|
||||||
|
|
||||||
|
// Check GPU memory
|
||||||
|
size_t free, total;
|
||||||
|
if (hipMemGetInfo(&free, &total) == hipSuccess) {
|
||||||
|
std::cerr << "GPU Memory - Free: " << free << ", Total: " << total << std::endl;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Failed to get GPU memory info" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
auto label = info.labels.View(ctx_->Device());
|
auto label = info.labels.View(ctx_->Device());
|
||||||
std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl;
|
std::cerr << "Label device: " << (ctx_->Device().IsCUDA() ? "GPU" : "CPU") << std::endl;
|
||||||
|
std::cerr << "Label data pointer: " << label.Values().data() << std::endl;
|
||||||
|
|
||||||
bool valid = false;
|
bool valid = false;
|
||||||
try {
|
try {
|
||||||
@ -93,6 +102,8 @@ void ValidateLabel(MetaInfo const& info) {
|
|||||||
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
|
||||||
std::cerr << "Validating labels on GPU" << std::endl;
|
std::cerr << "Validating labels on GPU" << std::endl;
|
||||||
auto cuctx = ctx_->CUDACtx();
|
auto cuctx = ctx_->CUDACtx();
|
||||||
|
std::cerr << "CUDA context pointer: " << cuctx << std::endl;
|
||||||
|
|
||||||
auto it = dh::MakeTransformIterator<bool>(
|
auto it = dh::MakeTransformIterator<bool>(
|
||||||
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
|
thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool {
|
||||||
auto [m, n] = linalg::UnravelIndex(i, label.Shape());
|
auto [m, n] = linalg::UnravelIndex(i, label.Shape());
|
||||||
@ -103,7 +114,11 @@ void ValidateLabel(MetaInfo const& info) {
|
|||||||
}
|
}
|
||||||
return Loss::CheckLabel(y);
|
return Loss::CheckLabel(y);
|
||||||
});
|
});
|
||||||
return dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
|
|
||||||
|
std::cerr << "Starting GPU reduction" << std::endl;
|
||||||
|
bool result = dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{});
|
||||||
|
std::cerr << "GPU reduction completed" << std::endl;
|
||||||
|
return result;
|
||||||
#else
|
#else
|
||||||
std::cerr << "GPU support not enabled" << std::endl;
|
std::cerr << "GPU support not enabled" << std::endl;
|
||||||
common::AssertGPUSupport();
|
common::AssertGPUSupport();
|
||||||
@ -119,6 +134,9 @@ void ValidateLabel(MetaInfo const& info) {
|
|||||||
|
|
||||||
if (!valid) {
|
if (!valid) {
|
||||||
std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl;
|
std::cerr << "Invalid labels detected. Error message: " << Loss::LabelErrorMsg() << std::endl;
|
||||||
|
// Print GPU error info
|
||||||
|
hipError_t error = hipGetLastError();
|
||||||
|
std::cerr << "Last GPU error: " << hipGetErrorString(error) << std::endl;
|
||||||
LOG(FATAL) << Loss::LabelErrorMsg();
|
LOG(FATAL) << Loss::LabelErrorMsg();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user