Enhance nvtx support. (#5636)

This commit is contained in:
Jiaming Yuan 2020-05-06 22:54:24 +08:00 committed by GitHub
parent 67d267f9da
commit eaf2a00b5c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 98 additions and 87 deletions

View File

@ -158,6 +158,10 @@ else (BUILD_STATIC_LIB)
add_library(xgboost SHARED ${XGBOOST_OBJ_SOURCES})
endif (BUILD_STATIC_LIB)
if (USE_NVTX)
enable_nvtx(xgboost)
endif (USE_NVTX)
#-- Hide all C++ symbols
if (HIDE_CXX_SYMBOLS)
set_target_properties(objxgboost PROPERTIES CXX_VISIBILITY_PRESET hidden)
@ -178,6 +182,9 @@ endif (JVM_BINDINGS)
#-- CLI for xgboost
add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc ${XGBOOST_OBJ_SOURCES})
if (USE_NVTX)
enable_nvtx(runxgboost)
endif (USE_NVTX)
target_include_directories(runxgboost
PRIVATE

View File

@ -141,3 +141,10 @@ DESTINATION \"${build_dir}/bak\")")
install(CODE "file(RENAME \"${build_dir}/bak/cmake_install.cmake\"
\"${build_dir}/R-package/cmake_install.cmake\")")
endfunction(setup_rpackage_install_target)
# Enable NVTX range annotations on an existing target.
#
# Locates the NVTX library via the FindNVTX module, adds its include
# directory and library to the target, and defines XGBOOST_USE_NVTX=1 so
# the C++ sources compile their nvtxRange* instrumentation.
#
# Arguments:
#   target - name of an existing CMake target to instrument.
#
# Example:
#   enable_nvtx(xgboost)
#
# A function (not a macro) is used so that variables set by find_package
# (e.g. NVTX_FOUND) do not leak into the caller's scope.
function(enable_nvtx target)
  find_package(NVTX REQUIRED)
  target_include_directories(${target} PRIVATE "${NVTX_INCLUDE_DIR}")
  target_link_libraries(${target} PRIVATE "${NVTX_LIBRARY}")
  # No -D prefix: target_compile_definitions adds it automatically.
  target_compile_definitions(${target} PRIVATE XGBOOST_USE_NVTX=1)
endfunction()

View File

@ -0,0 +1,26 @@
# FindNVTX.cmake - locate the NVIDIA Tools Extension (NVTX) library.
#
# Sets on success:
#   NVTX_INCLUDE_DIR - directory containing nvToolsExt.h
#   NVTX_LIBRARY     - path to the nvToolsExt library
#   NVTX_FOUND       - via find_package_handle_standard_args
#
# Clear any cached result so the library is re-discovered on every
# configure (e.g. after CUDA_HOME changes between runs).
if (NVTX_LIBRARY)
unset(NVTX_LIBRARY CACHE)
endif (NVTX_LIBRARY)
# NOTE(review): NVTX_LIB_NAME appears unused below (find_library names
# the library literally) — confirm against the full module and remove if so.
set(NVTX_LIB_NAME nvToolsExt)
# Search standard CUDA toolkit locations for the NVTX header.
find_path(NVTX_INCLUDE_DIR
NAMES nvToolsExt.h
PATHS ${CUDA_HOME}/include ${CUDA_INCLUDE} /usr/local/cuda/include)
# Search standard CUDA toolkit locations for the NVTX library.
find_library(NVTX_LIBRARY
NAMES nvToolsExt
PATHS ${CUDA_HOME}/lib64 /usr/local/cuda/lib64)
message(STATUS "Using nvtx library: ${NVTX_LIBRARY}")
# Sets NVTX_FOUND and errors out (when REQUIRED) if either the include
# directory or the library was not found.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NVTX DEFAULT_MSG
NVTX_INCLUDE_DIR NVTX_LIBRARY)
# Hide the result variables from the basic cmake-gui/ccmake view.
mark_as_advanced(
NVTX_INCLUDE_DIR
NVTX_LIBRARY
)

View File

@ -25,8 +25,7 @@ if (USE_CUDA)
endif (USE_NCCL)
if (USE_NVTX)
target_include_directories(objxgboost PRIVATE "${NVTX_HEADER_DIR}")
target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
enable_nvtx(objxgboost)
endif (USE_NVTX)
target_compile_options(objxgboost PRIVATE

View File

@ -10,12 +10,21 @@
#include "timer.h"
#include "xgboost/json.h"
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)
namespace xgboost {
namespace common {
void Monitor::Start(std::string const &name) {
if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
statistics_map_[name].timer.Start();
auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
std::string nvtx_name = label_ + "::" + name;
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
#endif // defined(XGBOOST_USE_NVTX)
}
}
@ -24,6 +33,9 @@ void Monitor::Stop(const std::string &name) {
auto &stats = statistics_map_[name];
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
#endif // defined(XGBOOST_USE_NVTX)
}
}

View File

@ -1,38 +0,0 @@
/*!
* Copyright by Contributors 2019
*/
#if defined(XGBOOST_USE_NVTX)
#include <nvToolsExt.h>
#endif // defined(XGBOOST_USE_NVTX)
#include <string>
#include "xgboost/logging.h"
#include "device_helpers.cuh"
#include "timer.h"
namespace xgboost {
namespace common {
// Start the timer for the named section. Active only when debug-level
// logging is enabled; when built with NVTX support, also opens an NVTX
// range labelled with the section name and remembers its id so that
// StopCuda can close it.
void Monitor::StartCuda(const std::string& name) {
  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
    return;  // monitoring disabled at this log level
  }
  auto& entry = statistics_map_[name];
  entry.timer.Start();
#if defined(XGBOOST_USE_NVTX)
  entry.nvtx_id = nvtxRangeStartA(name.c_str());
#endif  // defined(XGBOOST_USE_NVTX)
}
// Stop the timer for the named section and bump its invocation count.
// Active only when debug-level logging is enabled; when built with NVTX
// support, also closes the NVTX range opened by StartCuda.
void Monitor::StopCuda(const std::string& name) {
  if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
    return;  // monitoring disabled at this log level
  }
  auto& entry = statistics_map_[name];
  entry.timer.Stop();
  ++entry.count;
#if defined(XGBOOST_USE_NVTX)
  nvtxRangeEnd(entry.nvtx_id);
#endif  // defined(XGBOOST_USE_NVTX)
}
} // namespace common
} // namespace xgboost

View File

@ -82,8 +82,6 @@ struct Monitor {
void Init(std::string label) { this->label_ = label; }
void Start(const std::string &name);
void Stop(const std::string &name);
void StartCuda(const std::string &name);
void StopCuda(const std::string &name);
};
} // namespace common
} // namespace xgboost

View File

@ -77,9 +77,9 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device));
monitor_.StartCuda("InitCompressedData");
this->InitCompressedData(device);
monitor_.StopCuda("InitCompressedData");
monitor_.Start("InitCompressedData");
InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}
EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts,
@ -101,21 +101,21 @@ EllpackPageImpl::EllpackPageImpl(DMatrix* dmat, const BatchParam& param)
n_rows = dmat->Info().num_row_;
monitor_.StartCuda("Quantiles");
monitor_.Start("Quantiles");
// Create the quantile sketches for the dmatrix and initialize HistogramCuts.
row_stride = GetRowStride(dmat);
cuts_ = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");
monitor_.Stop("Quantiles");
monitor_.StartCuda("InitCompressedData");
monitor_.Start("InitCompressedData");
InitCompressedData(param.gpu_id);
monitor_.StopCuda("InitCompressedData");
monitor_.Stop("InitCompressedData");
monitor_.StartCuda("BinningCompression");
monitor_.Start("BinningCompression");
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
CreateHistIndices(param.gpu_id, batch);
}
monitor_.StopCuda("BinningCompression");
monitor_.Stop("BinningCompression");
}
template <typename AdapterBatchT>
@ -324,7 +324,7 @@ struct CopyPage {
// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
monitor_.StartCuda("Copy");
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
@ -332,7 +332,7 @@ size_t EllpackPageImpl::Copy(int device, EllpackPageImpl* page, size_t offset) {
gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, num_elements, CopyPage(this, page, offset));
monitor_.StopCuda("Copy");
monitor_.Stop("Copy");
return num_elements;
}
@ -381,14 +381,14 @@ struct CompactPage {
// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(int device, EllpackPageImpl* page,
common::Span<size_t> row_indexes) {
monitor_.StartCuda("Compact");
monitor_.Start("Compact");
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device);
dh::LaunchN(device, page->n_rows, CompactPage(this, page, row_indexes));
monitor_.StopCuda("Compact");
monitor_.Stop("Compact");
}
// Initialize the buffer to stored compressed features.

View File

@ -29,14 +29,14 @@ EllpackPageSource::EllpackPageSource(DMatrix* dmat,
monitor_.Init("ellpack_page_source");
dh::safe_cuda(cudaSetDevice(param.gpu_id));
monitor_.StartCuda("Quantiles");
monitor_.Start("Quantiles");
size_t row_stride = GetRowStride(dmat);
auto cuts = common::DeviceSketch(param.gpu_id, dmat, param.max_bin);
monitor_.StopCuda("Quantiles");
monitor_.Stop("Quantiles");
monitor_.StartCuda("WriteEllpackPages");
monitor_.Start("WriteEllpackPages");
WriteEllpackPages(param.gpu_id, dmat, cuts, cache_info, row_stride);
monitor_.StopCuda("WriteEllpackPages");
monitor_.Stop("WriteEllpackPages");
external_prefetcher_.reset(
new ExternalMemoryPrefetcher<EllpackPage>(cache_info_));

View File

@ -354,9 +354,9 @@ GradientBasedSampler::GradientBasedSampler(EllpackPageImpl* page,
// Sample a DMatrix based on the given gradient pairs.
GradientBasedSample GradientBasedSampler::Sample(common::Span<GradientPair> gpair,
DMatrix* dmat) {
monitor_.StartCuda("Sample");
monitor_.Start("Sample");
GradientBasedSample sample = strategy_->Sample(gpair, dmat);
monitor_.StopCuda("Sample");
monitor_.Stop("Sample");
return sample;
}

View File

@ -557,7 +557,7 @@ struct GPUHistMakerDevice {
}
void AllReduceHist(int nidx, dh::AllReducer* reducer) {
monitor.StartCuda("AllReduce");
monitor.Start("AllReduce");
auto d_node_hist = hist.GetNodeHistogram(nidx).data();
reducer->AllReduceSum(
reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
@ -565,7 +565,7 @@ struct GPUHistMakerDevice {
page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
reducer->Synchronize();
monitor.StopCuda("AllReduce");
monitor.Stop("AllReduce");
}
/**
@ -670,13 +670,13 @@ struct GPUHistMakerDevice {
RegTree* p_tree, dh::AllReducer* reducer) {
auto& tree = *p_tree;
monitor.StartCuda("Reset");
monitor.Start("Reset");
this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_);
monitor.StopCuda("Reset");
monitor.Stop("Reset");
monitor.StartCuda("InitRoot");
monitor.Start("InitRoot");
this->InitRoot(p_tree, reducer);
monitor.StopCuda("InitRoot");
monitor.Stop("InitRoot");
auto timestamp = qexpand->size();
auto num_leaves = 1;
@ -696,19 +696,19 @@ struct GPUHistMakerDevice {
// Only create child entries if needed
if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
num_leaves)) {
monitor.StartCuda("UpdatePosition");
monitor.Start("UpdatePosition");
this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
monitor.StopCuda("UpdatePosition");
monitor.Stop("UpdatePosition");
monitor.StartCuda("BuildHist");
monitor.Start("BuildHist");
this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer);
monitor.StopCuda("BuildHist");
monitor.Stop("BuildHist");
monitor.StartCuda("EvaluateSplits");
monitor.Start("EvaluateSplits");
auto splits = this->EvaluateLeftRightSplits(candidate, left_child_nidx,
right_child_nidx,
*p_tree);
monitor.StopCuda("EvaluateSplits");
monitor.Stop("EvaluateSplits");
qexpand->push(ExpandEntry(left_child_nidx,
tree.GetDepth(left_child_nidx), splits.at(0),
@ -719,9 +719,9 @@ struct GPUHistMakerDevice {
}
}
monitor.StartCuda("FinalisePosition");
monitor.Start("FinalisePosition");
this->FinalisePosition(p_tree, p_fmat);
monitor.StopCuda("FinalisePosition");
monitor.Stop("FinalisePosition");
}
};
@ -744,7 +744,7 @@ class GPUHistMakerSpecialised {
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
const std::vector<RegTree*>& trees) {
monitor_.StartCuda("Update");
monitor_.Start("Update");
// rescale learning rate according to size of trees
float lr = param_.learning_rate;
@ -765,7 +765,7 @@ class GPUHistMakerSpecialised {
}
param_.learning_rate = lr;
monitor_.StopCuda("Update");
monitor_.Stop("Update");
}
void InitDataOnce(DMatrix* dmat) {
@ -800,9 +800,9 @@ class GPUHistMakerSpecialised {
void InitData(DMatrix* dmat) {
if (!initialised_) {
monitor_.StartCuda("InitDataOnce");
monitor_.Start("InitDataOnce");
this->InitDataOnce(dmat);
monitor_.StopCuda("InitDataOnce");
monitor_.Stop("InitDataOnce");
}
}
@ -823,9 +823,9 @@ class GPUHistMakerSpecialised {
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,
RegTree* p_tree) {
monitor_.StartCuda("InitData");
monitor_.Start("InitData");
this->InitData(p_fmat);
monitor_.StopCuda("InitData");
monitor_.Stop("InitData");
gpair->SetDevice(device_);
maker->UpdateTree(gpair, p_fmat, p_tree, &reducer_);
@ -835,10 +835,10 @@ class GPUHistMakerSpecialised {
if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
return false;
}
monitor_.StartCuda("UpdatePredictionCache");
monitor_.Start("UpdatePredictionCache");
p_out_preds->SetDevice(device_);
maker->UpdatePredictionCache(p_out_preds->DevicePointer());
monitor_.StopCuda("UpdatePredictionCache");
monitor_.Stop("UpdatePredictionCache");
return true;
}

View File

@ -40,9 +40,9 @@ if (USE_CUDA)
endif (USE_NCCL)
if (USE_NVTX)
target_include_directories(testxgboost PRIVATE "${NVTX_HEADER_DIR}")
target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_NVTX=1)
enable_nvtx(testxgboost)
endif (USE_NVTX)
if (MSVC)
target_compile_options(testxgboost PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>

View File

@ -94,7 +94,7 @@ TEST(SparsePageDMatrix, ThreadSafetyException) {
bool exception = false;
int threads = 1000;
#pragma omp parallel for
#pragma omp parallel for num_threads(threads)
for (auto i = 0; i < threads; i++) {
try {
auto iter = dmat->GetBatches<SparsePage>().begin();