Enhance nvtx support. (#5636)
commit eaf2a00b5c
parent 67d267f9da
@@ -158,6 +158,10 @@ else (BUILD_STATIC_LIB)
   add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
 endif (BUILD_STATIC_LIB)
 
+if (USE_NVTX)
+  enable_nvtx(xgboost)
+endif (USE_NVTX)
+
 #-- Hide all C++ symbols
 if (HIDE_CXX_SYMBOLS)
   set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
@@ -178,6 +182,9 @@ endif (JVM_BINDINGS)
 
 #-- CLI for xgboost
 add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
+if (USE_NVTX)
+  enable_nvtx(runxgboost)
+endif (USE_NVTX)
 
 target_include_directories(runxgboost
   PRIVATE
@@ -141,3 +141,10 @@ DESTINATION \"${build_dir}/bak\")")
   install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\"
     \"${build_dir}/R-package/cmake_install.cmake\")")
 endfunction(setup_rpackage_install_target)
+
+macro(enable_nvtx target)
+  find_package(NVTX REQUIRED)
+  target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
+  target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
+  target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
+endmacro()
cmake/modules/FindNVTX.cmake (new file, 26 lines)
@@ -0,0 +1,26 @@
+if (NVTX_LIBRARY)
+  unset(NVTX_LIBRARY CACHE)
+endif (NVTX_LIBRARY)
+
+set(NVTX_LIB_NAME nvToolsExt)
+
+
+find_path(NVTX_INCLUDE_DIR
+  NAMES nvToolsExt.h
+  PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)
+
+
+find_library(NVTX_LIBRARY
+  NAMES nvToolsExt
+  PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)
+
+message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NVTX DEFAULT_MSG
+  NVTX_INCLUDE_DIR NVTX_LIBRARY)
+
+mark_as_advanced(
+  NVTX_INCLUDE_DIR
+  NVTX_LIBRARY
+)
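For reference, the nvToolsExt library this module locates is a small standalone library, so a consumer needs no CUDA runtime at all. A minimal sketch of the push/pop flavour of the NVTX API (illustrative only, not part of the commit; link with -lnvToolsExt; the start/end flavour used by Monitor appears further down):

    #include <nvToolsExt.h>

    // A trivial workload to wrap in a named range.
    static void Work() {
      volatile long sum = 0;
      for (long i = 0; i < 1000000; ++i) sum += i;
    }

    int main() {
      nvtxRangePushA("Work");  // open a range nested on the current thread
      Work();
      nvtxRangePop();          // close the innermost open range
      return 0;
    }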
@@ -25,8 +25,7 @@ if (USE_CUDA)
   endif (USE_NCCL)
 
   if (USE_NVTX)
-    target_include_directories(objxgboost PRIVATE "${NVTX_HEADER_DIR}")
-    target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
+    enable_nvtx(objxgboost)
   endif (USE_NVTX)
 
   target_compile_options(objxgboost PRIVATE
@@ -10,12 +10,21 @@
 #include "timer.h"
 #include "xgboost/json.h"
 
+#if defined(XGBOOST_USE_NVTX)
+#include <nvToolsExt.h>
+#endif  // defined(XGBOOST_USE_NVTX)
+
 namespace xgboost {
 namespace common {
 
 void Monitor::Start(std::string const &name) {
   if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-    statistics_map_[name].timer.Start();
+    auto &stats = statistics_map_[name];
+    stats.timer.Start();
+#if defined(XGBOOST_USE_NVTX)
+    std::string nvtx_name = label_ + "::" + name;
+    stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
+#endif  // defined(XGBOOST_USE_NVTX)
   }
 }
@@ -24,6 +33,9 @@ void Monitor::Stop(const std::string &name) {
     auto &stats = statistics_map_[name];
     stats.timer.Stop();
     stats.count++;
+#if defined(XGBOOST_USE_NVTX)
+    nvtxRangeEnd(stats.nvtx_id);
+#endif  // defined(XGBOOST_USE_NVTX)
   }
 }
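The start/end flavour used here returns an nvtxRangeId_t, so the begin and end calls do not have to be lexically nested; Monitor stores the id in statistics_map_ and closes the range in Stop. A self-contained sketch of just that pattern (illustrative only; link with -lnvToolsExt):

    #include <nvToolsExt.h>

    #include <string>

    int main() {
      std::string name = "example::Step";
      // nvtxRangeStartA returns an id that identifies the range independently
      // of scope, so it can be ended from anywhere later.
      nvtxRangeId_t id = nvtxRangeStartA(name.c_str());
      // ... timed work goes here ...
      nvtxRangeEnd(id);  // end that specific range
      return 0;
    }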
@@ -1,38 +0,0 @@
-/*!
- * Copyright by Contributors 2019
- */
-#if defined(XGBOOST_USE_NVTX)
-#include <nvToolsExt.h>
-#endif  // defined(XGBOOST_USE_NVTX)
-
-#include <string>
-
-#include "xgboost/logging.h"
-#include "device_helpers.cuh"
-#include "timer.h"
-
-namespace xgboost {
-namespace common {
-
-void Monitor::StartCuda(const std::string& name) {
-  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-    auto &stats = statistics_map_[name];
-    stats.timer.Start();
-#if defined(XGBOOST_USE_NVTX)
-    stats.nvtx_id = nvtxRangeStartA(name.c_str());
-#endif  // defined(XGBOOST_USE_NVTX)
-  }
-}
-
-void Monitor::StopCuda(const std::string& name) {
-  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-    auto &stats = statistics_map_[name];
-    stats.timer.Stop();
-    stats.count++;
-#if defined(XGBOOST_USE_NVTX)
-    nvtxRangeEnd(stats.nvtx_id);
-#endif  // defined(XGBOOST_USE_NVTX)
-  }
-}
-}  // namespace common
-}  // namespace xgboost
@@ -82,8 +82,6 @@ struct Monitor {
   void Init(std::string label) { this->label_ = label; }
   void Start(const std::string &name);
   void Stop(const std::string &name);
-  void StartCuda(const std::string &name);
-  void StopCuda(const std::string &name);
 };
 }  // namespace common
 }  // namespace xgboost
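With StartCuda/StopCuda removed, every call site uses the same host-side pair whether or not the work runs on the GPU. A minimal usage sketch against the Monitor API above (DoWork is a placeholder; note that, per the timer.cc change, both the timer and the NVTX range are gated on debug-level verbosity via ConsoleLogger::ShouldLog):

    #include "timer.h"  // xgboost::common::Monitor

    void DoWork();  // placeholder for the step being timed

    void Example() {
      xgboost::common::Monitor monitor;
      monitor.Init("example");  // label; NVTX ranges are named "example::<name>"
      monitor.Start("DoWork");  // starts the timer and, with XGBOOST_USE_NVTX, a range
      DoWork();
      monitor.Stop("DoWork");   // stops the timer and ends the range
    }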
@@ -77,9 +77,9 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
   monitor_.Init("ellpack_page");
   dh::safe_cuda(cudaSetDevice(device));
 
-  monitor_.StartCuda("InitCompressedData");
-  this->InitCompressedData(device);
-  monitor_.StopCuda("InitCompressedData");
+  monitor_.Start("InitCompressedData");
+  InitCompressedData(device);
+  monitor_.Stop("InitCompressedData");
 }
 
 EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
@@ -101,21 +101,21 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
 
   n_rows = dmat->Info().num_row_;
 
-  monitor_.StartCuda("Quantiles");
+  monitor_.Start("Quantiles");
   // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
   row_stride = GetRowStride(dmat);
   cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
-  monitor_.StopCuda("Quantiles");
+  monitor_.Stop("Quantiles");
 
-  monitor_.StartCuda("InitCompressedData");
+  monitor_.Start("InitCompressedData");
   InitCompressedData(param.gpu_id);
-  monitor_.StopCuda("InitCompressedData");
+  monitor_.Stop("InitCompressedData");
 
-  monitor_.StartCuda("BinningCompression");
+  monitor_.Start("BinningCompression");
   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
     CreateHistIndices(param.gpu_id, batch);
   }
-  monitor_.StopCuda("BinningCompression");
+  monitor_.Stop("BinningCompression");
 }
 
 template <typename AdapterBatchT>
@@ -324,7 +324,7 @@ struct CopyPage {
 
 // Copy the data from the given EllpackPage to the current page.
 size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
-  monitor_.StartCuda("Copy");
+  monitor_.Start("Copy");
   size_t num_elements = page->n_rows * page->row_stride;
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
@@ -332,7 +332,7 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
   gidx_buffer.SetDevice(device);
   page->gidx_buffer.SetDevice(device);
   dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
-  monitor_.StopCuda("Copy");
+  monitor_.Stop("Copy");
   return num_elements;
 }
@@ -381,14 +381,14 @@ struct CompactPage {
 // Compacts the data from the given EllpackPage into the current page.
 void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
                               common::Span<size_t> row_indexes) {
-  monitor_.StartCuda("Compact");
+  monitor_.Start("Compact");
   CHECK_EQ(row_stride, page->row_stride);
   CHECK_EQ(NumSymbols(), page->NumSymbols());
   CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
   gidx_buffer.SetDevice(device);
   page->gidx_buffer.SetDevice(device);
   dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
-  monitor_.StopCuda("Compact");
+  monitor_.Stop("Compact");
 }
 
 // Initialize the buffer to stored compressed features.
@@ -29,14 +29,14 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,
   monitor_.Init("ellpack_page_source");
   dh::safe_cuda(cudaSetDevice(param.gpu_id));
 
-  monitor_.StartCuda("Quantiles");
+  monitor_.Start("Quantiles");
   size_t row_stride = GetRowStride(dmat);
   auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
-  monitor_.StopCuda("Quantiles");
+  monitor_.Stop("Quantiles");
 
-  monitor_.StartCuda("WriteEllpackPages");
+  monitor_.Start("WriteEllpackPages");
   WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
-  monitor_.StopCuda("WriteEllpackPages");
+  monitor_.Stop("WriteEllpackPages");
 
   external_prefetcher_.reset(
       new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));
@@ -354,9 +354,9 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
 // Sample a DMatrix based on the given gradient pairs.
 GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
                                                  DMatrix* dmat) {
-  monitor_.StartCuda("Sample");
+  monitor_.Start("Sample");
   GradientBasedSample sample = strategy_->Sample(gpair, dmat);
-  monitor_.StopCuda("Sample");
+  monitor_.Stop("Sample");
   return sample;
 }
@@ -557,7 +557,7 @@ struct GPUHistMakerDevice {
   }
 
   void AllReduceHist(int nidx, dh::AllReducer* reducer) {
-    monitor.StartCuda("AllReduce");
+    monitor.Start("AllReduce");
     auto d_node_hist = hist.GetNodeHistogram(nidx).data();
     reducer->AllReduceSum(
         reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
@@ -565,7 +565,7 @@ struct GPUHistMakerDevice {
         page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
     reducer->Synchronize();
 
-    monitor.StopCuda("AllReduce");
+    monitor.Stop("AllReduce");
   }
 
   /**
@@ -670,13 +670,13 @@ struct GPUHistMakerDevice {
                   RegTree* p_tree, dh::AllReducer* reducer) {
     auto& tree = *p_tree;
 
-    monitor.StartCuda("Reset");
+    monitor.Start("Reset");
     this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
-    monitor.StopCuda("Reset");
+    monitor.Stop("Reset");
 
-    monitor.StartCuda("InitRoot");
+    monitor.Start("InitRoot");
     this->InitRoot(p_tree, reducer);
-    monitor.StopCuda("InitRoot");
+    monitor.Stop("InitRoot");
 
     auto timestamp = qexpand->size();
     auto num_leaves = 1;
@@ -696,19 +696,19 @@ struct GPUHistMakerDevice {
       // Only create child entries if needed
       if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
                                     num_leaves)) {
-        monitor.StartCuda("UpdatePosition");
+        monitor.Start("UpdatePosition");
         this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
-        monitor.StopCuda("UpdatePosition");
+        monitor.Stop("UpdatePosition");
 
-        monitor.StartCuda("BuildHist");
+        monitor.Start("BuildHist");
         this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
-        monitor.StopCuda("BuildHist");
+        monitor.Stop("BuildHist");
 
-        monitor.StartCuda("EvaluateSplits");
+        monitor.Start("EvaluateSplits");
         auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx,
                                                     right_child_nidx,
                                                     *p_tree);
-        monitor.StopCuda("EvaluateSplits");
+        monitor.Stop("EvaluateSplits");
 
         qexpand->push(ExpandEntry(left_child_nidx,
                                   tree.GetDepth(left_child_nidx), splits.at(0),
@@ -719,9 +719,9 @@ struct GPUHistMakerDevice {
       }
     }
 
-    monitor.StartCuda("FinalisePosition");
+    monitor.Start("FinalisePosition");
     this->FinalisePosition(p_tree, p_fmat);
-    monitor.StopCuda("FinalisePosition");
+    monitor.Stop("FinalisePosition");
   }
 };
@@ -744,7 +744,7 @@ class GPUHistMakerSpecialised {
 
   void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
               const std::vector<RegTree*>& trees) {
-    monitor_.StartCuda("Update");
+    monitor_.Start("Update");
 
     // rescale learning rate according to size of trees
     float lr = param_.learning_rate;
@@ -765,7 +765,7 @@ class GPUHistMakerSpecialised {
     }
 
     param_.learning_rate = lr;
-    monitor_.StopCuda("Update");
+    monitor_.Stop("Update");
   }
 
   void InitDataOnce(DMatrix* dmat) {
@@ -800,9 +800,9 @@ class GPUHistMakerSpecialised {
 
   void InitData(DMatrix* dmat) {
     if (!initialised_) {
-      monitor_.StartCuda("InitDataOnce");
+      monitor_.Start("InitDataOnce");
       this->InitDataOnce(dmat);
-      monitor_.StopCuda("InitDataOnce");
+      monitor_.Stop("InitDataOnce");
     }
   }
@@ -823,9 +823,9 @@ class GPUHistMakerSpecialised {
 
   void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
                   RegTree* p_tree) {
-    monitor_.StartCuda("InitData");
+    monitor_.Start("InitData");
     this->InitData(p_fmat);
-    monitor_.StopCuda("InitData");
+    monitor_.Stop("InitData");
 
     gpair->SetDevice(device_);
     maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
@@ -835,10 +835,10 @@ class GPUHistMakerSpecialised {
     if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
      return false;
     }
-    monitor_.StartCuda("UpdatePredictionCache");
+    monitor_.Start("UpdatePredictionCache");
     p_out_preds->SetDevice(device_);
     maker->UpdatePredictionCache(p_out_preds->DevicePointer());
-    monitor_.StopCuda("UpdatePredictionCache");
+    monitor_.Stop("UpdatePredictionCache");
     return true;
   }
@@ -40,9 +40,9 @@ if (USE_CUDA)
   endif (USE_NCCL)
 
   if (USE_NVTX)
-    target_include_directories(testxgboost PRIVATE "${NVTX_HEADER_DIR}")
-    target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
+    enable_nvtx(testxgboost)
   endif (USE_NVTX)
 
   if (MSVC)
     target_compile_options(testxgboost PRIVATE
       $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>
@@ -94,7 +94,7 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
 
   bool exception = false;
   int threads = 1000;
-#pragma omp parallel for
+#pragma omp parallel for num_threads(threads)
   for (auto i = 0; i < threads; i++) {
     try {
       auto iter = dmat->GetBatches<SparsePage>().begin();
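The num_threads clause matters here because without it OpenMP typically sizes the team from the hardware thread count, so the loop would not actually run with 1000 concurrent threads. A standalone sketch of the difference (illustrative only; compile with -fopenmp):

    #include <omp.h>

    #include <cstdio>

    int main() {
      int threads = 1000;
    #pragma omp parallel for num_threads(threads)
      for (int i = 0; i < threads; i++) {
        if (i == 0) {
          // With the clause the team has (up to) 1000 threads; without it,
          // usually only as many as the machine has hardware threads.
          std::printf("team size: %d\n", omp_get_num_threads());
        }
      }
      return 0;
    }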