Run training with empty DMatrix. (#4990)

This makes GPU Hist robust in distributed environments, as some workers might not be associated with any data in either training or evaluation.

* Disable rabit mock test for now: see #5012.
* Disable dask-cudf test at prediction for now: see #5003.
* Launch dask jobs for all workers, even those that might not have any data.
* Check for 0 rows in elementwise evaluation metrics. Using AUC and AUC-PR still throws an error; see #4663 for a robust fix.
* Add tests for edge cases.
* Add `LaunchKernel` wrapper handling zero-sized grids.
* Move some parts of the allreducer into a .cu file.
* Don't validate feature names when the booster is empty.
* Sync the number of columns in DMatrix, as num_feature is required to be the same across all workers in data split mode.
* Filtering in the dask interface now by default syncs all boosters that are not empty, instead of using rank 0.
* Fix Jenkins' GPU tests.
* Install dask-cuda from source in Jenkins' tests. Now all tests are actually running.
* Restore the GPU Hist tree synchronization test.
* Check the UUID of running devices. The check is only performed on CUDA >= 10.x, as 9.x doesn't have the UUID field.
* Fix CMake policy and project variables: use xgboost_SOURCE_DIR uniformly, add policy for CMake >= 3.13.
* Fix copying data to CPU.
* Fix a race condition in the CPU predictor.
* Fix duplicated DMatrix construction.
* Don't download an extra NCCL in the CI script.
@@ -1,5 +1,5 @@
 file(GLOB_RECURSE CPU_SOURCES *.cc *.h)
-list(REMOVE_ITEM CPU_SOURCES ${PROJECT_SOURCE_DIR}/src/cli_main.cc)
+list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/cli_main.cc)
 
 #-- Object library
 # Object library is necessary for jvm-package, which creates its own shared
@@ -9,7 +9,7 @@ if (USE_CUDA)
   add_library(objxgboost OBJECT ${CPU_SOURCES} ${CUDA_SOURCES} ${PLUGINS_SOURCES})
   target_compile_definitions(objxgboost
     PRIVATE -DXGBOOST_USE_CUDA=1)
-  target_include_directories(objxgboost PRIVATE ${PROJECT_SOURCE_DIR}/cub/)
+  target_include_directories(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/cub/)
   target_compile_options(objxgboost PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
@@ -43,9 +43,9 @@ endif (USE_CUDA)
 
 target_include_directories(objxgboost
   PRIVATE
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/dmlc-core/include
-    ${PROJECT_SOURCE_DIR}/rabit/include)
+    ${xgboost_SOURCE_DIR}/include
+    ${xgboost_SOURCE_DIR}/dmlc-core/include
+    ${xgboost_SOURCE_DIR}/rabit/include)
 target_compile_options(objxgboost
   PRIVATE
     $<$<AND:$<CXX_COMPILER_ID:MSVC>,$<COMPILE_LANGUAGE:CXX>>:/MP>
src/common/device_helpers.cu (new file, 91 lines)
@@ -0,0 +1,91 @@
+/*!
+ * Copyright 2017-2019 XGBoost contributors
+ *
+ * \brief Utilities for CUDA.
+ */
+#ifdef XGBOOST_USE_NCCL
+#include <nccl.h>
+#endif  // #ifdef XGBOOST_USE_NCCL
+#include <sstream>
+
+#include "device_helpers.cuh"
+
+namespace dh {
+
+#if __CUDACC_VER_MAJOR__ > 9
+constexpr std::size_t kUuidLength =
+    sizeof(std::declval<cudaDeviceProp>().uuid) / sizeof(uint64_t);
+
+void GetCudaUUID(int world_size, int rank, int device_ord,
+                 xgboost::common::Span<uint64_t, kUuidLength> uuid) {
+  cudaDeviceProp prob;
+  safe_cuda(cudaGetDeviceProperties(&prob, device_ord));
+  std::memcpy(uuid.data(), static_cast<void*>(&(prob.uuid)), sizeof(prob.uuid));
+}
+
+std::string PrintUUID(xgboost::common::Span<uint64_t, kUuidLength> uuid) {
+  std::stringstream ss;
+  for (auto v : uuid) {
+    ss << std::hex << v;
+  }
+  return ss.str();
+}
+
+#endif  // __CUDACC_VER_MAJOR__ > 9
+
+void AllReducer::Init(int _device_ordinal) {
+#ifdef XGBOOST_USE_NCCL
+  LOG(DEBUG) << "Running nccl init on: " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__;
+
+  device_ordinal = _device_ordinal;
+  int32_t const rank = rabit::GetRank();
+
+#if __CUDACC_VER_MAJOR__ > 9
+  int32_t const world = rabit::GetWorldSize();
+
+  std::vector<uint64_t> uuids(world * kUuidLength, 0);
+  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
+  auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
+  GetCudaUUID(world, rank, device_ordinal, s_this_uuid);
+
+  // No allgather yet.
+  rabit::Allreduce<rabit::op::Sum, uint64_t>(uuids.data(), uuids.size());
+
+  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
+  size_t j = 0;
+  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
+    converted[j] =
+        xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
+    j++;
+  }
+
+  auto iter = std::unique(converted.begin(), converted.end());
+  auto n_uniques = std::distance(converted.begin(), iter);
+  CHECK_EQ(n_uniques, world)
+      << "Multiple processes within communication group running on same CUDA "
+      << "device is not supported";
+#endif  // __CUDACC_VER_MAJOR__ > 9
+
+  id = GetUniqueId();
+  dh::safe_cuda(cudaSetDevice(device_ordinal));
+  dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rank));
+  safe_cuda(cudaStreamCreate(&stream));
+  initialised_ = true;
+#endif  // XGBOOST_USE_NCCL
+}
+
+AllReducer::~AllReducer() {
+#ifdef XGBOOST_USE_NCCL
+  if (initialised_) {
+    dh::safe_cuda(cudaStreamDestroy(stream));
+    ncclCommDestroy(comm);
+  }
+  if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
+    LOG(CONSOLE) << "======== NCCL Statistics ========";
+    LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
+    LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
+  }
+#endif  // XGBOOST_USE_NCCL
+}
+
+}  // namespace dh
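For context, a minimal standalone sketch of the duplicate-device detection above; the rank count and UUID words are made-up values for illustration. Note that std::unique only collapses adjacent duplicates, which is what the code above relies on when two ranks report identical UUID spans:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  constexpr std::size_t kUuidLength = 2;  // assumption for this sketch
  // Flattened per-rank UUIDs, as produced by the Sum allreduce above.
  // Ranks 1 and 2 report the same device.
  std::vector<uint64_t> uuids = {0xA, 0x1, 0xB, 0x2, 0xB, 0x2};
  std::vector<std::vector<uint64_t>> converted;
  for (std::size_t i = 0; i < uuids.size(); i += kUuidLength) {
    converted.emplace_back(uuids.begin() + i, uuids.begin() + i + kUuidLength);
  }
  auto iter = std::unique(converted.begin(), converted.end());
  auto n_uniques = std::distance(converted.begin(), iter);
  // A real run does CHECK_EQ(n_uniques, world) and aborts on mismatch.
  std::cout << "unique devices: " << n_uniques << " of " << converted.size() << "\n";
}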
@@ -7,24 +7,25 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
-#include <xgboost/logging.h>
-
-#include <omp.h>
 #include <rabit/rabit.h>
-#include <cub/cub.cuh>
 #include <cub/util_allocator.cuh>
-
-#include "xgboost/host_device_vector.h"
-#include "xgboost/span.h"
-
-#include "common.h"
 
+#include <algorithm>
+#include <omp.h>
+#include <chrono>
+#include <ctime>
+#include <cub/cub.cuh>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/span.h"
+
+#include "common.h"
+#include "timer.h"
 
 #ifdef XGBOOST_USE_NCCL
@@ -205,24 +206,53 @@ __global__ void LaunchNKernel(size_t begin, size_t end, L lambda) {
 }
 template <typename L>
 __global__ void LaunchNKernel(int device_idx, size_t begin, size_t end,
-                             L lambda) {
+                              L lambda) {
   for (auto i : GridStrideRange(begin, end)) {
     lambda(i, device_idx);
   }
 }
 
+/* \brief A wrapper around kernel launching syntax, used to guard against empty input.
+ *
+ * - nvcc fails to deduce template argument when kernel is a template accepting __device__
+ *   function as argument.  Hence functions like `LaunchN` cannot use this wrapper.
+ *
+ * - With the C++ initialization list `{}` syntax, you are forced to comply with the CUDA type
+ *   specification.
+ */
+class LaunchKernel {
+  size_t shmem_size_;
+  cudaStream_t stream_;
+
+  dim3 grids_;
+  dim3 blocks_;
+
+ public:
+  LaunchKernel(uint32_t _grids, uint32_t _blk, size_t _shmem=0, cudaStream_t _s=0) :
+      grids_{_grids, 1, 1}, blocks_{_blk, 1, 1}, shmem_size_{_shmem}, stream_{_s} {}
+  LaunchKernel(dim3 _grids, dim3 _blk, size_t _shmem=0, cudaStream_t _s=0) :
+      grids_{_grids}, blocks_{_blk}, shmem_size_{_shmem}, stream_{_s} {}
+
+  template <typename K, typename... Args>
+  void operator()(K kernel, Args... args) {
+    if (XGBOOST_EXPECT(grids_.x * grids_.y * grids_.z == 0, false)) {
+      LOG(DEBUG) << "Skipping empty CUDA kernel.";
+      return;
+    }
+    kernel<<<grids_, blocks_, shmem_size_, stream_>>>(args...);  // NOLINT
+  }
+};
+
 template <int ITEMS_PER_THREAD = 8, int BLOCK_THREADS = 256, typename L>
 inline void LaunchN(int device_idx, size_t n, cudaStream_t stream, L lambda) {
   if (n == 0) {
     return;
   }
 
   safe_cuda(cudaSetDevice(device_idx));
 
   const int GRID_SIZE =
       static_cast<int>(xgboost::common::DivRoundUp(n, ITEMS_PER_THREAD * BLOCK_THREADS));
-  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(static_cast<size_t>(0),
-                                                         n, lambda);
+  LaunchNKernel<<<GRID_SIZE, BLOCK_THREADS, 0, stream>>>(  // NOLINT
+      static_cast<size_t>(0), n, lambda);
 }
 
 // Default stream version
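Below, a usage sketch of the wrapper in the style of the call sites changed later in this commit; `SquareKernel` and the launch sizes are illustrative assumptions, not part of the commit:

__global__ void SquareKernel(float* data, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= data[i];
  }
}

void Square(float* d_data, size_t n, cudaStream_t stream) {
  uint32_t constexpr kBlockThreads = 256;
  auto const kGrids = static_cast<uint32_t>((n + kBlockThreads - 1) / kBlockThreads);
  // With n == 0 the grid is zero-sized; the wrapper logs and returns instead of
  // issuing an invalid launch configuration.
  dh::LaunchKernel {kGrids, kBlockThreads, 0, stream} (SquareKernel, d_data, n);
}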
@@ -301,6 +331,16 @@ inline detail::MemoryLogger &GlobalMemoryLogger() {
   return memory_logger;
 }
 
+// dh::DebugSyncDevice(__FILE__, __LINE__);
+inline void DebugSyncDevice(std::string file="", int32_t line = -1) {
+  if (file != "" && line != -1) {
+    auto rank = rabit::GetRank();
+    LOG(DEBUG) << "R:" << rank << ": " << file << ":" << line;
+  }
+  safe_cuda(cudaDeviceSynchronize());
+  safe_cuda(cudaGetLastError());
+}
+
 namespace detail{
 /**
  * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
@@ -763,7 +803,7 @@ void SparseTransformLbs(int device_idx, dh::CubMemory *temp_memory,
                         BLOCK_THREADS, segments, num_segments, count);
 
   LbsKernel<TILE_SIZE, ITEMS_PER_THREAD, BLOCK_THREADS, OffsetT>
-      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,
+      <<<uint32_t(num_tiles), BLOCK_THREADS>>>(tmp_tile_coordinates,  // NOLINT
                                                segments + 1, f, num_segments);
 }
 
@@ -963,7 +1003,6 @@ class SaveCudaContext {
  * streams. Must be initialised before use. If XGBoost is compiled without NCCL
  * this is a dummy class that will error if used with more than one GPU.
  */
-
 class AllReducer {
   bool initialised_;
   size_t allreduce_bytes_;  // Keep statistics of the number of bytes communicated
@@ -986,31 +1025,9 @@ class AllReducer {
   *
   * \param device_ordinal The device ordinal.
   */
-  void Init(int _device_ordinal) {
-#ifdef XGBOOST_USE_NCCL
-    /** \brief this >monitor . init. */
-    device_ordinal = _device_ordinal;
-    id = GetUniqueId();
-    dh::safe_cuda(cudaSetDevice(device_ordinal));
-    dh::safe_nccl(ncclCommInitRank(&comm, rabit::GetWorldSize(), id, rabit::GetRank()));
-    safe_cuda(cudaStreamCreate(&stream));
-    initialised_ = true;
-#endif
-  }
-  ~AllReducer() {
-#ifdef XGBOOST_USE_NCCL
-    if (initialised_) {
-      dh::safe_cuda(cudaStreamDestroy(stream));
-      ncclCommDestroy(comm);
-    }
-    if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
-      LOG(CONSOLE) << "======== NCCL Statistics========";
-      LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_;
-      LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_/1048576;
-    }
-#endif
-  }
+  void Init(int _device_ordinal);
+
+  ~AllReducer();
 
  /**
   * \brief Allreduce. Use in exactly the same way as NCCL but without needing
@@ -293,6 +293,7 @@ void DenseCuts::Build(DMatrix* p_fmat, uint32_t max_num_bins) {
 
 void DenseCuts::Init
 (std::vector<WXQSketch>* in_sketchs, uint32_t max_num_bins) {
+  monitor_.Start(__func__);
   std::vector<WXQSketch>& sketchs = *in_sketchs;
   constexpr int kFactor = 8;
   // gather the histogram data
@@ -332,6 +333,7 @@ void DenseCuts::Init
     CHECK_GT(cut_size, p_cuts_->cut_ptrs_.back());
     p_cuts_->cut_ptrs_.push_back(cut_size);
   }
+  monitor_.Stop(__func__);
 }
 
 void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) {
@@ -252,8 +252,10 @@ class GPUSketcher {
         });
     } else if (n_cuts_cur_[icol] > 0) {
       // if more elements than cuts: use binary search on cumulative weights
-      int block = 256;
-      FindCutsK<<<common::DivRoundUp(n_cuts_cur_[icol], block), block>>>(
+      uint32_t constexpr kBlockThreads = 256;
+      uint32_t const kGrids = common::DivRoundUp(n_cuts_cur_[icol], kBlockThreads);
+      dh::LaunchKernel {kGrids, kBlockThreads} (
+          FindCutsK,
           cuts_d_.data().get() + icol * n_cuts_,
           fvalues_cur_.data().get(),
           weights2_.data().get(),
@@ -403,7 +405,8 @@ class GPUSketcher {
     // NOTE: This will typically support ~ 4M features - 64K*64
     dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                common::DivRoundUp(num_cols_, block3.y), 1);
-    UnpackFeaturesK<<<grid3, block3>>>(
+    dh::LaunchKernel {grid3, block3} (
+        UnpackFeaturesK,
         fvalues_.data().get(),
         has_weights_ ? feature_weights_.data().get() : nullptr,
         row_ptrs_.data().get() + batch_row_begin,
@@ -13,6 +13,20 @@
 namespace xgboost {
 namespace common {
 
+void Monitor::Start(std::string const &name) {
+  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    statistics_map[name].timer.Start();
+  }
+}
+
+void Monitor::Stop(const std::string &name) {
+  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    auto &stats = statistics_map[name];
+    stats.timer.Stop();
+    stats.count++;
+  }
+}
+
 std::vector<Monitor::StatMap> Monitor::CollectFromOtherRanks() const {
   // Since other nodes might have started timers that this one haven't, so
   // we can't simply call all reduce.
src/common/timer.cu (new file, 38 lines)
@@ -0,0 +1,38 @@
+/*!
+ * Copyright by Contributors 2019
+ */
+#if defined(XGBOOST_USE_NVTX)
+#include <nvToolsExt.h>
+#endif  // defined(XGBOOST_USE_NVTX)
+
+#include <string>
+
+#include "xgboost/logging.h"
+#include "device_helpers.cuh"
+#include "timer.h"
+
+namespace xgboost {
+namespace common {
+
+void Monitor::StartCuda(const std::string& name) {
+  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    auto &stats = statistics_map[name];
+    stats.timer.Start();
+#if defined(XGBOOST_USE_NVTX)
+    stats.nvtx_id = nvtxRangeStartA(name.c_str());
+#endif  // defined(XGBOOST_USE_NVTX)
+  }
+}
+
+void Monitor::StopCuda(const std::string& name) {
+  if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
+    auto &stats = statistics_map[name];
+    stats.timer.Stop();
+    stats.count++;
+#if defined(XGBOOST_USE_NVTX)
+    nvtxRangeEnd(stats.nvtx_id);
+#endif  // defined(XGBOOST_USE_NVTX)
+  }
+}
+}  // namespace common
+}  // namespace xgboost
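For readers unfamiliar with NVTX: the StartCuda/StopCuda pair above brackets a named range that shows up on the profiler timeline in nvprof or Nsight. A standalone sketch using the same nvToolsExt calls, where the range name and the work inside it are illustrative:

#include <nvToolsExt.h>

void ProfiledSection() {
  nvtxRangeId_t id = nvtxRangeStartA("BuildHist");  // name appears on the timeline
  // ... timed GPU or CPU work goes here ...
  nvtxRangeEnd(id);  // closes the range opened above
}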
@@ -10,10 +10,6 @@
 #include <utility>
 #include <vector>
 
-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-#include <nvToolsExt.h>
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-
 namespace xgboost {
 namespace common {
 
@@ -84,37 +80,10 @@ struct Monitor {
   void Print() const;
 
   void Init(std::string label) { this->label = label; }
-  void Start(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      statistics_map[name].timer.Start();
-    }
-  }
-  void Stop(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Stop();
-      stats.count++;
-    }
-  }
-  void StartCuda(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Start();
-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-      stats.nvtx_id = nvtxRangeStartA(name.c_str());
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-    }
-  }
-  void StopCuda(const std::string &name) {
-    if (ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
-      auto &stats = statistics_map[name];
-      stats.timer.Stop();
-      stats.count++;
-#if defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-      nvtxRangeEnd(stats.nvtx_id);
-#endif  // defined(XGBOOST_USE_NVTX) && defined(__CUDACC__)
-    }
-  }
+  void Start(const std::string &name);
+  void Stop(const std::string &name);
+  void StartCuda(const std::string &name);
+  void StopCuda(const std::string &name);
 };
 }  // namespace common
 }  // namespace xgboost
@@ -133,9 +133,12 @@ class Transform {
       size_t shard_size = range_size;
       Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
       dh::safe_cuda(cudaSetDevice(device_));
-      const int GRID_SIZE =
+      const int kGrids =
           static_cast<int>(DivRoundUp(*(range_.end()), kBlockThreads));
-      detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
+      if (kGrids == 0) {
+        return;
+      }
+      detail::LaunchCUDAKernel<<<kGrids, kBlockThreads>>>(  // NOLINT
           _func, shard_range, UnpackHDVOnDevice(_vectors)...);
     }
 #else
@@ -320,6 +320,32 @@ void DMatrix::SaveToLocalFile(const std::string& fname) {
 DMatrix* DMatrix::Create(std::unique_ptr<DataSource<SparsePage>>&& source,
                          const std::string& cache_prefix) {
   if (cache_prefix.length() == 0) {
+    // FIXME(trivialfis): Currently distcol is broken, so here we check the number of rows.
+    // If we bring back column split this check will break.
+    bool is_distributed { rabit::IsDistributed() };
+    if (is_distributed) {
+      auto world_size = rabit::GetWorldSize();
+      auto rank = rabit::GetRank();
+      std::vector<uint64_t> ncols(world_size, 0);
+      ncols[rank] = source->info.num_col_;
+      rabit::Allreduce<rabit::op::Sum>(ncols.data(), ncols.size());
+      auto max_cols = std::max_element(ncols.cbegin(), ncols.cend());
+      auto max_ind = std::distance(ncols.cbegin(), max_cols);
+      // FIXME(trivialfis): This is a hack, we should store a reference to global shape if possible.
+      if (source->info.num_col_ == 0 && source->info.num_row_ == 0) {
+        LOG(WARNING) << "DMatrix at rank: " << rank << " worker is empty.";
+        source->info.num_col_ = *max_cols;
+      }
+
+      // validate the number of columns across all workers.
+      for (size_t i = 0; i < ncols.size(); ++i) {
+        auto v = ncols[i];
+        CHECK(v == 0 || v == *max_cols)
+            << "DMatrix at rank: " << i << " worker "
+            << "has a different number of columns than rank: " << max_ind << " worker. "
+            << "(" << v << " vs. " << *max_cols << ")";
+      }
+    }
     return new data::SimpleDMatrix(std::move(source));
   } else {
 #if DMLC_ENABLE_STD_THREAD
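The block above emulates an allgather with a Sum allreduce: every slot of the vector is zero except the one owned by the calling rank, so after the reduction slot i holds rank i's value on every worker. A minimal sketch of the idiom; my_num_cols is a placeholder, and the snippet only does something meaningful inside a rabit-launched job:

#include <rabit/rabit.h>
#include <cstdint>
#include <vector>

std::vector<uint64_t> GatherColumnCounts(uint64_t my_num_cols) {
  std::vector<uint64_t> ncols(rabit::GetWorldSize(), 0);
  ncols[rabit::GetRank()] = my_num_cols;  // only this rank's slot is non-zero
  rabit::Allreduce<rabit::op::Sum>(ncols.data(), ncols.size());
  return ncols;  // ncols[i] == rank i's column count, on every worker
}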
@@ -99,13 +99,13 @@ EllpackInfo::EllpackInfo(int device,
                          bool is_dense,
                          size_t row_stride,
                          const common::HistogramCuts& hmat,
-                         dh::BulkAllocator& ba)
+                         dh::BulkAllocator* ba)
     : is_dense(is_dense), row_stride(row_stride), n_bins(hmat.Ptrs().back()) {
 
-  ba.Allocate(device,
-              &feature_segments, hmat.Ptrs().size(),
-              &gidx_fvalue_map, hmat.Values().size(),
-              &min_fvalue, hmat.MinValues().size());
+  ba->Allocate(device,
+               &feature_segments, hmat.Ptrs().size(),
+               &gidx_fvalue_map, hmat.Values().size(),
+               &min_fvalue, hmat.MinValues().size());
   dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.Values());
   dh::CopyVectorToDeviceSpan(min_fvalue, hmat.MinValues());
   dh::CopyVectorToDeviceSpan(feature_segments, hmat.Ptrs());
@@ -116,7 +116,7 @@ void EllpackPageImpl::InitInfo(int device,
                                bool is_dense,
                                size_t row_stride,
                                const common::HistogramCuts& hmat) {
-  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, ba_);
+  matrix.info = EllpackInfo(device, is_dense, row_stride, hmat, &ba_);
 }
 
 // Initialize the buffer to stored compressed features.
@@ -189,7 +189,8 @@ void EllpackPageImpl::CreateHistIndices(int device,
     const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x),
                      common::DivRoundUp(row_stride, block3.y),
                      1);
-    CompressBinEllpackKernel<<<grid3, block3>>>(
+    dh::LaunchKernel {grid3, block3} (
+        CompressBinEllpackKernel,
         common::CompressedBufferWriter(num_symbols),
         gidx_buffer.data(),
         row_ptrs.data().get(),
@@ -70,7 +70,7 @@ struct EllpackInfo {
               bool is_dense,
               size_t row_stride,
               const common::HistogramCuts& hmat,
-              dh::BulkAllocator& ba);
+              dh::BulkAllocator* ba);
 };
 
 /** \brief Struct for accessing and manipulating an ellpack matrix on the
@@ -85,7 +85,7 @@ EllpackPageSourceImpl::EllpackPageSourceImpl(DMatrix* dmat,
   monitor_.StopCuda("Quantiles");
 
   monitor_.StartCuda("CreateEllpackInfo");
-  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, hmat, ba_);
+  ellpack_info_ = EllpackInfo(device_, dmat->IsDense(), row_stride, hmat, &ba_);
   monitor_.StopCuda("CreateEllpackInfo");
 
   monitor_.StartCuda("WriteEllpackPages");
@@ -101,7 +101,7 @@ void CountValid(std::vector<Json> const& j_columns, uint32_t column_id,
                 HostDeviceVector<size_t>* out_offset,
                 dh::caching_device_vector<int32_t>* out_d_flag,
                 uint32_t* out_n_rows) {
-  int32_t constexpr kThreads = 256;
+  uint32_t constexpr kThreads = 256;
   auto const& j_column = j_columns[column_id];
   auto const& column_obj = get<Object const>(j_column);
   Columnar<T> foreign_column = ArrayInterfaceHandler::ExtractArray<T>(column_obj);
@@ -123,8 +123,9 @@ void CountValid(std::vector<Json> const& j_columns, uint32_t column_id,
 
   common::Span<size_t> s_offsets = out_offset->DeviceSpan();
 
-  int32_t const kBlocks = common::DivRoundUp(n_rows, kThreads);
-  CountValidKernel<T><<<kBlocks, kThreads>>>(
+  uint32_t const kBlocks = common::DivRoundUp(n_rows, kThreads);
+  dh::LaunchKernel {kBlocks, kThreads} (
+      CountValidKernel<T>,
       foreign_column,
       has_missing, missing,
       out_d_flag->data().get(), s_offsets);
@@ -135,13 +136,15 @@ template <typename T>
 void CreateCSR(std::vector<Json> const& j_columns, uint32_t column_id, uint32_t n_rows,
                bool has_missing, float missing,
                dh::device_vector<size_t>* tmp_offset, common::Span<Entry> s_data) {
-  int32_t constexpr kThreads = 256;
+  uint32_t constexpr kThreads = 256;
   auto const& j_column = j_columns[column_id];
   auto const& column_obj = get<Object const>(j_column);
   Columnar<T> foreign_column = ArrayInterfaceHandler::ExtractArray<T>(column_obj);
-  int32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
-  CreateCSRKernel<T><<<kBlocks, kThreads>>>(foreign_column, column_id, has_missing, missing,
-                                            dh::ToSpan(*tmp_offset), s_data);
+  uint32_t kBlocks = common::DivRoundUp(n_rows, kThreads);
+  dh::LaunchKernel {kBlocks, kThreads} (
+      CreateCSRKernel<T>,
+      foreign_column, column_id, has_missing, missing,
+      dh::ToSpan(*tmp_offset), s_data);
 }
 
 void SimpleCSRSource::FromDeviceColumnar(std::vector<Json> const& columns,
@@ -246,6 +246,14 @@ class GBTree : public GradientBooster {
   std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
                                                  DMatrix* f_dmat = nullptr) const {
     CHECK(configured_);
+    auto on_device = f_dmat && (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
+#if defined(XGBOOST_USE_CUDA)
+    // Use GPU Predictor if data is already on device.
+    if (!specified_predictor_ && on_device) {
+      CHECK(gpu_predictor_);
+      return gpu_predictor_;
+    }
+#endif  // defined(XGBOOST_USE_CUDA)
     // GPU_Hist by default has prediction cache calculated from quantile values, so GPU
     // Predictor is not used for training dataset.  But when XGBoost performs continue
     // training with an existing model, the prediction cache is not available and number
@@ -256,7 +264,7 @@ class GBTree : public GradientBooster {
         (model_.param.num_trees != 0) &&
         // FIXME(trivialfis): Implement a better method for testing whether data is on
         // device after DMatrix refactoring is done.
-        (f_dmat && !((*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead()))) {
+        !on_device) {
       return cpu_predictor_;
     }
     if (tparam_.predictor == "cpu_predictor") {
@@ -630,7 +630,7 @@ class LearnerImpl : public Learner {
     CHECK_LE(num_col, static_cast<uint64_t>(std::numeric_limits<unsigned>::max()))
         << "Unfortunately, XGBoost does not support data matrices with "
         << std::numeric_limits<unsigned>::max() << " features or greater";
-    num_feature = std::max(num_feature, static_cast<unsigned>(num_col));
+    num_feature = std::max(num_feature, static_cast<uint32_t>(num_col));
   }
   // run allreduce on num_feature to find the maximum value
   rabit::Allreduce<rabit::op::Max>(&num_feature, 1, nullptr, nullptr, "num_feature");
@@ -3,6 +3,8 @@
 * \file elementwise_metric.cc
 * \brief evaluation metrics for elementwise binary or regression.
 * \author Kailong Chen, Tianqi Chen
+*
+* Expressions like wsum == 0 ? esum : esum / wsum are used to handle empty datasets.
 */
 #include <rabit/rabit.h>
 #include <xgboost/metric.h>
@@ -142,7 +144,7 @@ struct EvalRowRMSE {
     return diff * diff;
   }
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return std::sqrt(esum / wsum);
+    return wsum == 0 ? std::sqrt(esum) : std::sqrt(esum / wsum);
   }
 };
 
@@ -150,12 +152,13 @@ struct EvalRowRMSLE {
   char const* Name() const {
    return "rmsle";
  }
 
  XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const {
    bst_float diff = std::log1p(label) - std::log1p(pred);
    return diff * diff;
  }
  static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return std::sqrt(esum / wsum);
+    return wsum == 0 ? std::sqrt(esum) : std::sqrt(esum / wsum);
  }
 };
 
@@ -168,7 +171,7 @@ struct EvalRowMAE {
     return std::abs(label - pred);
   }
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 };
 
@@ -190,7 +193,7 @@ struct EvalRowLogLoss {
   }
 
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 };
 
@@ -225,7 +228,7 @@ struct EvalError {
   }
 
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 
  private:
@@ -245,7 +248,7 @@ struct EvalPoissonNegLogLik {
   }
 
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 };
 
@@ -278,7 +281,7 @@ struct EvalGammaNLogLik {
     return -((y * theta - b) / a + c);
   }
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 };
 
@@ -304,7 +307,7 @@ struct EvalTweedieNLogLik {
     return -a + b;
   }
   static bst_float GetFinal(bst_float esum, bst_float wsum) {
-    return esum / wsum;
+    return wsum == 0 ? esum : esum / wsum;
   }
 
  protected:
@@ -323,7 +326,9 @@ struct EvalEWiseBase : public Metric {
   bst_float Eval(const HostDeviceVector<bst_float>& preds,
                  const MetaInfo& info,
                  bool distributed) override {
-    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
+    if (info.labels_.Size() == 0) {
+      LOG(WARNING) << "label set is empty";
+    }
     CHECK_EQ(preds.Size(), info.labels_.Size())
         << "label and prediction size not match, "
        << "hint: use merror or mlogloss for multi-class classification";
@@ -333,6 +338,7 @@ struct EvalEWiseBase : public Metric {
     reducer_.Reduce(*tparam_, device, info.weights_, info.labels_, preds);
 
     double dat[2] { result.Residue(), result.Weights() };
+
     if (distributed) {
       rabit::Allreduce<rabit::op::Sum>(dat, 2);
     }
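Why the wsum == 0 guard matters: on a worker holding zero rows, both the residue sum (esum) and the weight sum (wsum) are 0, and 0/0 is NaN, which would then poison the Sum allreduce across workers. A small standalone illustration, where the function simply mirrors the GetFinal pattern above:

#include <iostream>

float GetFinal(float esum, float wsum) {
  return wsum == 0 ? esum : esum / wsum;
}

int main() {
  float esum = 0.0f, wsum = 0.0f;             // what an empty worker accumulates
  std::cout << esum / wsum << "\n";           // nan: unguarded division
  std::cout << GetFinal(esum, wsum) << "\n";  // 0: a safe neutral value for the later Sum
}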
@@ -54,7 +54,9 @@ class RegLossObj : public ObjFunction {
                    const MetaInfo &info,
                    int iter,
                    HostDeviceVector<GradientPair>* out_gpair) override {
-    CHECK_NE(info.labels_.Size(), 0U) << "label set cannot be empty";
+    if (info.labels_.Size() == 0U) {
+      LOG(WARNING) << "Label set is empty.";
+    }
     CHECK_EQ(preds.Size(), info.labels_.Size())
         << "labels are not correctly provided"
         << "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size();
@@ -60,6 +60,9 @@ class CPUPredictor : public Predictor {
     constexpr int kUnroll = 8;
     const auto nsize = static_cast<bst_omp_uint>(batch.Size());
     const bst_omp_uint rest = nsize % kUnroll;
+    // Pull to host before entering omp block, as this is not thread safe.
+    batch.data.HostVector();
+    batch.offset.HostVector();
 #pragma omp parallel for schedule(static)
     for (bst_omp_uint i = 0; i < nsize - rest; i += kUnroll) {
       const int tid = omp_get_thread_num();
@@ -225,12 +225,12 @@ class GPUPredictor : public xgboost::Predictor {
                      HostDeviceVector<bst_float>* predictions,
                      size_t batch_offset) {
    dh::safe_cuda(cudaSetDevice(device_));
-    const int BLOCK_THREADS = 128;
+    const uint32_t BLOCK_THREADS = 128;
    size_t num_rows = batch.Size();
-    const int GRID_SIZE = static_cast<int>(common::DivRoundUp(num_rows, BLOCK_THREADS));
+    auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
 
-    int shared_memory_bytes = static_cast<int>
-        (sizeof(float) * num_features * BLOCK_THREADS);
+    auto shared_memory_bytes =
+        static_cast<size_t>(sizeof(float) * num_features * BLOCK_THREADS);
    bool use_shared = true;
    if (shared_memory_bytes > max_shared_memory_bytes_) {
      shared_memory_bytes = 0;
@@ -238,11 +238,12 @@ class GPUPredictor : public xgboost::Predictor {
    }
    size_t entry_start = 0;
 
-    PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
-        (dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset),
-         dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
-         batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows,
-         entry_start, use_shared, this->num_group_);
+    dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} (
+        PredictKernel<BLOCK_THREADS>,
+        dh::ToSpan(nodes_), predictions->DeviceSpan().subspan(batch_offset),
+        dh::ToSpan(tree_segments_), dh::ToSpan(tree_group_), batch.offset.DeviceSpan(),
+        batch.data.DeviceSpan(), this->tree_begin_, this->tree_end_, num_features, num_rows,
+        entry_start, use_shared, this->num_group_);
   }
 
   void InitModel(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) {
@@ -165,10 +165,11 @@ __global__ void ClearBuffersKernel(
 void FeatureInteractionConstraint::ClearBuffers() {
   CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
   CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
-  int constexpr kBlockThreads = 256;
-  const int n_grids = static_cast<int>(
+  uint32_t constexpr kBlockThreads = 256;
+  auto const n_grids = static_cast<uint32_t>(
       common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads));
-  ClearBuffersKernel<<<n_grids, kBlockThreads>>>(
+  dh::LaunchKernel {n_grids, kBlockThreads} (
+      ClearBuffersKernel,
       output_buffer_bits_, input_buffer_bits_);
 }
 
@@ -222,12 +223,14 @@ common::Span<int32_t> FeatureInteractionConstraint::Query(
   LBitField64 node_constraints = s_node_constraints_[nid];
   CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
 
-  int constexpr kBlockThreads = 256;
-  const int n_grids = static_cast<int>(
+  uint32_t constexpr kBlockThreads = 256;
+  auto n_grids = static_cast<uint32_t>(
       common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
-  SetInputBufferKernel<<<n_grids, kBlockThreads>>>(feature_list, input_buffer_bits_);
-
-  QueryFeatureListKernel<<<n_grids, kBlockThreads>>>(
+  dh::LaunchKernel {n_grids, kBlockThreads} (
+      SetInputBufferKernel,
+      feature_list, input_buffer_bits_);
+  dh::LaunchKernel {n_grids, kBlockThreads} (
+      QueryFeatureListKernel,
       node_constraints, input_buffer_bits_, output_buffer_bits_);
 
   thrust::counting_iterator<int32_t> begin(0);
@@ -327,20 +330,20 @@ void FeatureInteractionConstraint::Split(
   dim3 const block3(16, 64, 1);
   dim3 const grid3(common::DivRoundUp(n_sets_, 16),
                    common::DivRoundUp(s_fconstraints_.size(), 64));
-  RestoreFeatureListFromSetsKernel<<<grid3, block3>>>
-      (feature_buffer_,
-       feature_id,
-       s_fconstraints_,
-       s_fconstraints_ptr_,
-       s_sets_,
-       s_sets_ptr_);
+  dh::LaunchKernel {grid3, block3} (
+      RestoreFeatureListFromSetsKernel,
+      feature_buffer_, feature_id,
+      s_fconstraints_, s_fconstraints_ptr_,
+      s_sets_, s_sets_ptr_);
 
-  int constexpr kBlockThreads = 256;
-  const int n_grids = static_cast<int>(common::DivRoundUp(node.Size(), kBlockThreads));
-  InteractionConstraintSplitKernel<<<n_grids, kBlockThreads>>>
-      (feature_buffer_,
-       feature_id,
-       node, left, right);
+  uint32_t constexpr kBlockThreads = 256;
+  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Size(), kBlockThreads));
+
+  dh::LaunchKernel {n_grids, kBlockThreads} (
+      InteractionConstraintSplitKernel,
+      feature_buffer_,
+      feature_id,
+      node, left, right);
 }
 
 }  // namespace xgboost
@@ -603,12 +603,12 @@ struct GPUHistMakerDevice {
     }
 
     // One block for each feature
-    int constexpr kBlockThreads = 256;
-    EvaluateSplitKernel<kBlockThreads, GradientSumT>
-        <<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
-            hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
-            gpu_param, d_split_candidates, node_value_constraints[nidx],
-            monotone_constraints);
+    uint32_t constexpr kBlockThreads = 256;
+    dh::LaunchKernel {uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]} (
+        EvaluateSplitKernel<kBlockThreads, GradientSumT>,
+        hist.GetNodeHistogram(nidx), d_feature_set, node, page->matrix,
+        gpu_param, d_split_candidates, node_value_constraints[nidx],
+        monotone_constraints);
 
     // Reduce over features to find best feature
     auto d_cub_memory =
@@ -638,14 +638,12 @@ struct GPUHistMakerDevice {
         use_shared_memory_histograms
         ? sizeof(GradientSumT) * page->matrix.BinCount()
         : 0;
-    const int items_per_thread = 8;
-    const int block_threads = 256;
-    const int grid_size = static_cast<int>(
+    uint32_t items_per_thread = 8;
+    uint32_t block_threads = 256;
+    auto grid_size = static_cast<uint32_t>(
        common::DivRoundUp(n_elements, items_per_thread * block_threads));
-    if (grid_size <= 0) {
-      return;
-    }
-    SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>(
+    dh::LaunchKernel {grid_size, block_threads, smem_size} (
+        SharedMemHistKernel<GradientSumT>,
        page->matrix, d_ridx, d_node_hist.data(), d_gpair, n_elements,
        use_shared_memory_histograms);
   }
@@ -886,6 +884,7 @@ struct GPUHistMakerDevice {
     monitor.StartCuda("InitRoot");
     this->InitRoot(p_tree, gpair_all, reducer, p_fmat->Info().num_col_);
     monitor.StopCuda("InitRoot");
+
     auto timestamp = qexpand->size();
     auto num_leaves = 1;
 
@@ -895,7 +894,6 @@ struct GPUHistMakerDevice {
       if (!candidate.IsValid(param, num_leaves)) {
        continue;
      }
-
      this->ApplySplit(candidate, p_tree);
 
      num_leaves++;
@@ -996,18 +994,22 @@ class GPUHistMakerSpecialised {
    try {
      for (xgboost::RegTree* tree : trees) {
        this->UpdateTree(gpair, dmat, tree);
+
+        if (hist_maker_param_.debug_synchronize) {
+          this->CheckTreesSynchronized(tree);
+        }
      }
+      dh::safe_cuda(cudaGetLastError());
    } catch (const std::exception& e) {
      LOG(FATAL) << "Exception in gpu_hist: " << e.what() << std::endl;
    }
 
    param_.learning_rate = lr;
    monitor_.StopCuda("Update");
  }
 
  void InitDataOnce(DMatrix* dmat) {
    info_ = &dmat->Info();
 
    reducer_.Init({device_});
 
    // Synchronise the column sampling seed
@@ -1048,20 +1050,18 @@ class GPUHistMakerSpecialised {
  }
 
  // Only call this method for testing
-  void CheckTreesSynchronized(const std::vector<RegTree>& local_trees) const {
+  void CheckTreesSynchronized(RegTree* local_tree) const {
    std::string s_model;
    common::MemoryBufferStream fs(&s_model);
    int rank = rabit::GetRank();
    if (rank == 0) {
-      local_trees.front().SaveModel(&fs);
+      local_tree->SaveModel(&fs);
    }
    fs.Seek(0);
    rabit::Broadcast(&s_model, 0);
-    RegTree reference_tree{};
+    RegTree reference_tree {};  // rank 0 tree
    reference_tree.LoadModel(&fs);
-    for (const auto& tree : local_trees) {
-      CHECK(tree == reference_tree);
-    }
+    CHECK(*local_tree == reference_tree);
  }
 
  void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* p_fmat,