Enhance nvtx support. (#5636)

This commit is contained in:
Jiaming Yuan 2020-05-06 22:54:24 +08:00 committed by GitHub
parent 67d267f9da
commit eaf2a00b5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 98 additions and 87 deletions

View File

@ -158,6 +158,10 @@ else (BUILD_STATIC_LIB)
add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES}) add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
endif (BUILD_STATIC_LIB) endif (BUILD_STATIC_LIB)
if (USE_NVTX)
enable_nvtx(xgboost)
endif (USE_NVTX)
#-- Hide all C++ symbols #-- Hide all C++ symbols
if (HIDE_CXX_SYMBOLS) if (HIDE_CXX_SYMBOLS)
set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden) set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
@ -178,6 +182,9 @@ endif (JVM_BINDINGS)
#-- CLI for xgboost #-- CLI for xgboost
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES}) add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
if (USE_NVTX)
enable_nvtx(runxgboost)
endif (USE_NVTX)
target_include_directories(runxgboost target_include_directories(runxgboost
PRIVATE PRIVATE

View File

@ -141,3 +141,10 @@ DESTINATION \"${build_dir}/bak\")")
install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\" install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\"
\"${build_dir}/R-package/cmake_install.cmake\")") \"${build_dir}/R-package/cmake_install.cmake\")")
endfunction(setup_rpackage_install_target) endfunction(setup_rpackage_install_target)
macro(enable_nvtx target)
find_package(NVTX REQUIRED)
target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
endmacro()

View File

@ -0,0 +1,26 @@
if (NVTX_LIBRARY)
unset(NVTX_LIBRARY CACHE)
endif (NVTX_LIBRARY)
set(NVTX_LIB_NAME nvToolsExt)
find_path(NVTX_INCLUDE_DIR
NAMES nvToolsExt.h
PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)
find_library(NVTX_LIBRARY
NAMES nvToolsExt
PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)
message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVTX DEFAULT_MSG
NVTX_INCLUDE_DIR NVTX_LIBRARY)
mark_as_advanced(
NVTX_INCLUDE_DIR
NVTX_LIBRARY
)

View File

@ -25,8 +25,7 @@ if (USE_CUDA)
endif (USE_NCCL) endif (USE_NCCL)
if (USE_NVTX) if (USE_NVTX)
target_include_directories(objxgboost PRIVATE "${NVTX_HEADER_DIR}") enable_nvtx(objxgboost)
target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
endif (USE_NVTX) endif (USE_NVTX)
target_compile_options(objxgboost PRIVATE target_compile_options(objxgboost PRIVATE

View File

@ -10,12 +10,21 @@
#include "timer.h" #include "timer.h"
#include "xgboost/json.h" #include "xgboost/json.h"
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)
namespace xgboost { namespace xgboost {
namespace common { namespace common {
void Monitor::Start(std::string const &name) { void Monitor::Start(std::string const &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
statistics_map_[name].timer.Start(); auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
std::string nvtx_name = label_ + "::" + name;
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
#endif // defined(XGBOOST_USE_NVTX)
} }
} }
@ -24,6 +33,9 @@ void Monitor::Stop(const std::string &name) {
auto &stats = statistics_map_[name]; auto &stats = statistics_map_[name];
stats.timer.Stop(); stats.timer.Stop();
stats.count++; stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX)
} }
} }

View File

@ -1,38 +0,0 @@
/*!
* Copyright by Contributors 2019
*/
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)
#include <string>
#include "xgboost/logging.h"
#include "device_helpers.cuh"
#include "timer.h"
namespace xgboost {
namespace common {
void Monitor::StartCuda(const std::string& name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
stats.nvtx_id = nvtxRangeStartA(name.c_str());
#endif // defined(XGBOOST_USE_NVTX)
}
}
void Monitor::StopCuda(const std::string& name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
auto &stats = statistics_map_[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX)
}
}
} // namespace common
} // namespace xgboost

View File

@ -82,8 +82,6 @@ struct Monitor {
void Init(std::string label) { this->label_ = label; } void Init(std::string label) { this->label_ = label; }
void Start(const std::string &name); void Start(const std::string &name);
void Stop(const std::string &name); void Stop(const std::string &name);
void StartCuda(const std::string &name);
void StopCuda(const std::string &name);
}; };
} // namespace common } // namespace common
} // namespace xgboost } // namespace xgboost

View File

@ -77,9 +77,9 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
monitor_.Init("ellpack_page"); monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device)); dh::safe_cuda(cudaSetDevice(device));
monitor_.StartCuda("InitCompressedData"); monitor_.Start("InitCompressedData");
this->InitCompressedData(device); InitCompressedData(device);
monitor_.StopCuda("InitCompressedData"); monitor_.Stop("InitCompressedData");
} }
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
@ -101,21 +101,21 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
n_rows = dmat->Info().num_row_; n_rows = dmat->Info().num_row_;
monitor_.StartCuda("Quantiles"); monitor_.Start("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts. // Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat); row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin); cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles"); monitor_.Stop("Quantiles");
monitor_.StartCuda("InitCompressedData"); monitor_.Start("InitCompressedData");
InitCompressedData(param.gpu_id); InitCompressedData(param.gpu_id);
monitor_.StopCuda("InitCompressedData"); monitor_.Stop("InitCompressedData");
monitor_.StartCuda("BinningCompression"); monitor_.Start("BinningCompression");
for (const auto& batch : dmat->GetBatches<SparsePage>()) { for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(param.gpu_id, batch); CreateHistIndices(param.gpu_id, batch);
} }
monitor_.StopCuda("BinningCompression"); monitor_.Stop("BinningCompression");
} }
template <typename AdapterBatchT> template <typename AdapterBatchT>
@ -324,7 +324,7 @@ struct CopyPage {
// Copy the data from the given EllpackPage to the current page. // Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) { size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
monitor_.StartCuda("Copy"); monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride; size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_EQ(NumSymbols(), page->NumSymbols());
@ -332,7 +332,7 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, num_elements, CopyPage(this, page, offset)); dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
monitor_.StopCuda("Copy"); monitor_.Stop("Copy");
return num_elements; return num_elements;
} }
@ -381,14 +381,14 @@ struct CompactPage {
// Compacts the data from the given EllpackPage into the current page. // Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page, void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
common::Span<size_t> row_indexes) { common::Span<size_t> row_indexes) {
monitor_.StartCuda("Compact"); monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride); CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
gidx_buffer.SetDevice(device); gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device); page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes)); dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
monitor_.StopCuda("Compact"); monitor_.Stop("Compact");
} }
// Initialize the buffer to stored compressed features. // Initialize the buffer to stored compressed features.

View File

@ -29,14 +29,14 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,
monitor_.Init("ellpack_page_source"); monitor_.Init("ellpack_page_source");
dh::safe_cuda(cudaSetDevice(param.gpu_id)); dh::safe_cuda(cudaSetDevice(param.gpu_id));
monitor_.StartCuda("Quantiles"); monitor_.Start("Quantiles");
size_t row_stride = GetRowStride(dmat); size_t row_stride = GetRowStride(dmat);
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin); auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles"); monitor_.Stop("Quantiles");
monitor_.StartCuda("WriteEllpackPages"); monitor_.Start("WriteEllpackPages");
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride); WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
monitor_.StopCuda("WriteEllpackPages"); monitor_.Stop("WriteEllpackPages");
external_prefetcher_.reset( external_prefetcher_.reset(
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_)); new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));

View File

@ -354,9 +354,9 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
// Sample a DMatrix based on the given gradient pairs. // Sample a DMatrix based on the given gradient pairs.
GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair, GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
DMatrix* dmat) { DMatrix* dmat) {
monitor_.StartCuda("Sample"); monitor_.Start("Sample");
GradientBasedSample sample = strategy_->Sample(gpair, dmat); GradientBasedSample sample = strategy_->Sample(gpair, dmat);
monitor_.StopCuda("Sample"); monitor_.Stop("Sample");
return sample; return sample;
} }

View File

@ -557,7 +557,7 @@ struct GPUHistMakerDevice {
} }
void AllReduceHist(int nidx, dh::AllReducer* reducer) { void AllReduceHist(int nidx, dh::AllReducer* reducer) {
monitor.StartCuda("AllReduce"); monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data(); auto d_node_hist = hist.GetNodeHistogram(nidx).data();
reducer->AllReduceSum( reducer->AllReduceSum(
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist), reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
@ -565,7 +565,7 @@ struct GPUHistMakerDevice {
page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
reducer->Synchronize(); reducer->Synchronize();
monitor.StopCuda("AllReduce"); monitor.Stop("AllReduce");
} }
/** /**
@ -670,13 +670,13 @@ struct GPUHistMakerDevice {
RegTree* p_tree, dh::AllReducer* reducer) { RegTree* p_tree, dh::AllReducer* reducer) {
auto& tree = *p_tree; auto& tree = *p_tree;
monitor.StartCuda("Reset"); monitor.Start("Reset");
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
monitor.StopCuda("Reset"); monitor.Stop("Reset");
monitor.StartCuda("InitRoot"); monitor.Start("InitRoot");
this->InitRoot(p_tree, reducer); this->InitRoot(p_tree, reducer);
monitor.StopCuda("InitRoot"); monitor.Stop("InitRoot");
auto timestamp = qexpand->size(); auto timestamp = qexpand->size();
auto num_leaves = 1; auto num_leaves = 1;
@ -696,19 +696,19 @@ struct GPUHistMakerDevice {
// Only create child entries if needed // Only create child entries if needed
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
num_leaves)) { num_leaves)) {
monitor.StartCuda("UpdatePosition"); monitor.Start("UpdatePosition");
this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]); this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
monitor.StopCuda("UpdatePosition"); monitor.Stop("UpdatePosition");
monitor.StartCuda("BuildHist"); monitor.Start("BuildHist");
this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
monitor.StopCuda("BuildHist"); monitor.Stop("BuildHist");
monitor.StartCuda("EvaluateSplits"); monitor.Start("EvaluateSplits");
auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx, auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx,
right_child_nidx, right_child_nidx,
*p_tree); *p_tree);
monitor.StopCuda("EvaluateSplits"); monitor.Stop("EvaluateSplits");
qexpand->push(ExpandEntry(left_child_nidx, qexpand->push(ExpandEntry(left_child_nidx,
tree.GetDepth(left_child_nidx), splits.at(0), tree.GetDepth(left_child_nidx), splits.at(0),
@ -719,9 +719,9 @@ struct GPUHistMakerDevice {
} }
} }
monitor.StartCuda("FinalisePosition"); monitor.Start("FinalisePosition");
this->FinalisePosition(p_tree, p_fmat); this->FinalisePosition(p_tree, p_fmat);
monitor.StopCuda("FinalisePosition"); monitor.Stop("FinalisePosition");
} }
}; };
@ -744,7 +744,7 @@ class GPUHistMakerSpecialised {
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat, void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) { const std::vector<RegTree*>& trees) {
monitor_.StartCuda("Update"); monitor_.Start("Update");
// rescale learning rate according to size of trees // rescale learning rate according to size of trees
float lr = param_.learning_rate; float lr = param_.learning_rate;
@ -765,7 +765,7 @@ class GPUHistMakerSpecialised {
} }
param_.learning_rate = lr; param_.learning_rate = lr;
monitor_.StopCuda("Update"); monitor_.Stop("Update");
} }
void InitDataOnce(DMatrix* dmat) { void InitDataOnce(DMatrix* dmat) {
@ -800,9 +800,9 @@ class GPUHistMakerSpecialised {
void InitData(DMatrix* dmat) { void InitData(DMatrix* dmat) {
if (!initialised_) { if (!initialised_) {
monitor_.StartCuda("InitDataOnce"); monitor_.Start("InitDataOnce");
this->InitDataOnce(dmat); this->InitDataOnce(dmat);
monitor_.StopCuda("InitDataOnce"); monitor_.Stop("InitDataOnce");
} }
} }
@ -823,9 +823,9 @@ class GPUHistMakerSpecialised {
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat, void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
RegTree* p_tree) { RegTree* p_tree) {
monitor_.StartCuda("InitData"); monitor_.Start("InitData");
this->InitData(p_fmat); this->InitData(p_fmat);
monitor_.StopCuda("InitData"); monitor_.Stop("InitData");
gpair->SetDevice(device_); gpair->SetDevice(device_);
maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_); maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
@ -835,10 +835,10 @@ class GPUHistMakerSpecialised {
if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
return false; return false;
} }
monitor_.StartCuda("UpdatePredictionCache"); monitor_.Start("UpdatePredictionCache");
p_out_preds->SetDevice(device_); p_out_preds->SetDevice(device_);
maker->UpdatePredictionCache(p_out_preds->DevicePointer()); maker->UpdatePredictionCache(p_out_preds->DevicePointer());
monitor_.StopCuda("UpdatePredictionCache"); monitor_.Stop("UpdatePredictionCache");
return true; return true;
} }

View File

@ -40,9 +40,9 @@ if (USE_CUDA)
endif (USE_NCCL) endif (USE_NCCL)
if (USE_NVTX) if (USE_NVTX)
target_include_directories(testxgboost PRIVATE "${NVTX_HEADER_DIR}") enable_nvtx(testxgboost)
target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
endif (USE_NVTX) endif (USE_NVTX)
if (MSVC) if (MSVC)
target_compile_options(testxgboost PRIVATE target_compile_options(testxgboost PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8> $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>

View File

@ -94,7 +94,7 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
bool exception = false; bool exception = false;
int threads = 1000; int threads = 1000;
#pragma omp parallel for #pragma omp parallel for num_threads(threads)
for (auto i = 0; i < threads; i++) { for (auto i = 0; i < threads; i++) {
try { try {
auto iter = dmat->GetBatches<SparsePage>().begin(); auto iter = dmat->GetBatches<SparsePage>().begin();