Implement transform to reduce CPU/GPU code duplication. (#3643)

* Implement Transform class.
* Add tests for softmax.
* Use Transform in regression, softmax and hinge objectives, except for Cox.
* Mark old gpu objective functions deprecated.
* static_assert for softmax.
* Split up multi-gpu tests.
Author: trivialfis
Date: 2018-10-02 15:06:21 +13:00
Committed by: Rory Mitchell
Parent: 87aca8c244
Commit: d594b11f35
31 changed files with 1514 additions and 997 deletions

src/common/common.cc

@@ -1,9 +1,11 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.cc
  * \brief Enable all kinds of global variables in common.
  */
 #include <dmlc/thread_local.h>
 #include "common.h"
 #include "./random.h"
 namespace xgboost {

src/common/common.cu

@@ -11,7 +11,7 @@ int AllVisibleImpl::AllVisible() {
     // When compiled with CUDA but running on CPU only device,
     // cudaGetDeviceCount will fail.
     dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
-  } catch(const std::exception& e) {
+  } catch(const thrust::system::system_error& err) {
     return 0;
   }
   return n_visgpus;

src/common/common.h

@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2018 by Contributors
  * \file common.h
  * \brief Common utilities
  */
@@ -19,6 +19,13 @@
 #if defined(__CUDACC__)
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+
+#define WITH_CUDA() true
+
+#else
+
+#define WITH_CUDA() false
+
 #endif

 namespace dh {
@@ -29,11 +36,11 @@ namespace dh {
 #define safe_cuda(ans) ThrowOnCudaError((ans), __FILE__, __LINE__)

 inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file,
-                                    int line) {
+                                 int line) {
   if (code != cudaSuccess) {
-    throw thrust::system_error(code, thrust::cuda_category(),
-                               std::string{file} + "(" +  // NOLINT
-                               std::to_string(line) + ")");
+    LOG(FATAL) << thrust::system_error(code, thrust::cuda_category(),
+                                       std::string{file} + ": " +  // NOLINT
+                                       std::to_string(line)).what();
   }
   return code;
 }
@@ -70,13 +77,13 @@ inline std::string ToString(const T& data) {
  */
 class Range {
  public:
+  using DifferenceType = int64_t;
+
   class Iterator {
     friend class Range;

    public:
-    using DifferenceType = int64_t;
-    XGBOOST_DEVICE int64_t operator*() const { return i_; }
+    XGBOOST_DEVICE DifferenceType operator*() const { return i_; }
     XGBOOST_DEVICE const Iterator &operator++() {
       i_ += step_;
       return *this;
@@ -97,8 +104,8 @@ class Range {
     XGBOOST_DEVICE void Step(DifferenceType s) { step_ = s; }

    protected:
-    XGBOOST_DEVICE explicit Iterator(int64_t start) : i_(start) {}
-    XGBOOST_DEVICE explicit Iterator(int64_t start, int step) :
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start) : i_(start) {}
+    XGBOOST_DEVICE explicit Iterator(DifferenceType start, DifferenceType step) :
         i_{start}, step_{step} {}

    public:
@@ -109,9 +116,10 @@ class Range {
   XGBOOST_DEVICE Iterator begin() const { return begin_; }  // NOLINT
   XGBOOST_DEVICE Iterator end() const { return end_; }      // NOLINT

-  XGBOOST_DEVICE Range(int64_t begin, int64_t end)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end)
       : begin_(begin), end_(end) {}
-  XGBOOST_DEVICE Range(int64_t begin, int64_t end, Iterator::DifferenceType step)
+  XGBOOST_DEVICE Range(DifferenceType begin, DifferenceType end,
+                       DifferenceType step)
       : begin_(begin, step), end_(end) {}

   XGBOOST_DEVICE bool operator==(const Range& other) const {
@@ -121,9 +129,7 @@ class Range {
     return !(*this == other);
   }

-  XGBOOST_DEVICE void Step(Iterator::DifferenceType s) { begin_.Step(s); }
-
-  XGBOOST_DEVICE Iterator::DifferenceType GetStep() const { return begin_.step_; }
+  XGBOOST_DEVICE void Step(DifferenceType s) { begin_.Step(s); }

  private:
   Iterator begin_;
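With GetStep() removed and DifferenceType hoisted onto Range itself, the refactored interface reads as follows. A minimal sketch (the concrete numbers are illustrative, and it assumes iterator equality compares the current index, so the range length should divide evenly by the step):

    xgboost::common::Range r {0, 8};
    r.Step(2);
    for (auto i : r) {
      // i has type Range::DifferenceType (int64_t); visits 0, 2, 4, 6
    }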

src/common/device_helpers.cuh

@@ -9,6 +9,7 @@
 #include <xgboost/logging.h>

 #include "common.h"
+#include "span.h"

 #include <algorithm>
 #include <chrono>
@@ -955,7 +956,7 @@ class SaveCudaContext {
     // cudaGetDevice will fail.
     try {
       safe_cuda(cudaGetDevice(&saved_device_));
-    } catch (thrust::system::system_error & err) {
+    } catch (const thrust::system::system_error & err) {
       saved_device_ = -1;
     }
     func();
@@ -1035,4 +1036,22 @@ ReduceT ReduceShards(std::vector<ShardT> *shards, FunctionT f) {
   };
   return std::accumulate(sums.begin(), sums.end(), ReduceT());
 }
+
+template <typename T,
+          typename IndexT = typename xgboost::common::Span<T>::index_type>
+xgboost::common::Span<T> ToSpan(
+    thrust::device_vector<T>& vec,
+    IndexT offset = 0,
+    IndexT size = -1) {
+  size = size == -1 ? vec.size() : size;
+  CHECK_LE(offset + size, vec.size());
+  return {vec.data().get() + offset, static_cast<IndexT>(size)};
+}
+
+template <typename T>
+xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
+                                size_t offset, size_t size) {
+  using IndexT = typename xgboost::common::Span<T>::index_type;
+  return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
+}
 }  // namespace dh
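The new ToSpan helpers give a non-owning xgboost::common::Span view over a thrust::device_vector, so kernels can take Span arguments regardless of how the memory is held. A minimal usage sketch (vector name and sizes are illustrative, not from the commit):

    thrust::device_vector<float> data(16);
    auto whole = dh::ToSpan(data);        // view of all 16 elements
    auto tail  = dh::ToSpan(data, 8, 8);  // view of elements [8, 16)

The size_t overload simply casts offset and size to Span's index_type and forwards to the primary template.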

src/common/host_device_vector.cu

@@ -116,6 +116,7 @@ struct HostDeviceVectorImpl {
     int ndevices = vec_->distribution_.devices_.Size();
     start_ = vec_->distribution_.ShardStart(new_size, index_);
     proper_size_ = vec_->distribution_.ShardProperSize(new_size, index_);
+    // The size on this device.
     size_t size_d = vec_->distribution_.ShardSize(new_size, index_);
     SetDevice();
     data_.resize(size_d);
@@ -230,7 +231,7 @@ struct HostDeviceVectorImpl {
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
   }

   common::Span<const T> ConstDeviceSpan(int device) {
@@ -238,7 +239,7 @@ struct HostDeviceVectorImpl {
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
     return {shards_[devices.Index(device)].data_.data().get(),
-        static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
+            static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
   }

   size_t DeviceSize(int device) {
@@ -289,7 +290,6 @@ struct HostDeviceVectorImpl {
                               data_h_.size() * sizeof(T),
                               cudaMemcpyHostToDevice));
     } else {
-      //
       dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
     }
   }
@@ -304,14 +304,20 @@ struct HostDeviceVectorImpl {
   void Copy(HostDeviceVectorImpl<T>* other) {
     CHECK_EQ(Size(), other->Size());
+    // Data is on host.
     if (perm_h_.CanWrite() && other->perm_h_.CanWrite()) {
       std::copy(other->data_h_.begin(), other->data_h_.end(), data_h_.begin());
-    } else {
-      CHECK(distribution_ == other->distribution_);
-      dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
-          shard.Copy(&other->shards_[i]);
-        });
+      return;
     }
+    // Data is on device.
+    if (distribution_ != other->distribution_) {
+      distribution_ = GPUDistribution();
+      Reshard(other->Distribution());
+      size_d_ = other->size_d_;
+    }
+    dh::ExecuteIndexShards(&shards_, [&](int i, DeviceShard& shard) {
+        shard.Copy(&other->shards_[i]);
+      });
   }

   void Copy(const std::vector<T>& other) {
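The practical effect of the rewrite: copying between vectors whose GPU distributions differ no longer trips the old CHECK; the destination is resharded to match the source first. A hedged sketch, assuming the public HostDeviceVector::Copy overload forwards here and the distribution-taking constructor of this era's API:

    // `devices` stands in for a non-empty GPUSet obtained elsewhere.
    xgboost::HostDeviceVector<float> src(8, 1.0f,
                                         xgboost::GPUDistribution::Block(devices));
    xgboost::HostDeviceVector<float> dst(8);  // host-resident, default distribution
    dst.Copy(src);  // dst is resharded to src's distribution, then each
                    // shard copies on its own device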

src/common/host_device_vector.h

@@ -111,8 +111,11 @@ class GPUDistribution {
   }

   friend bool operator==(const GPUDistribution& a, const GPUDistribution& b) {
-    return a.devices_ == b.devices_ && a.granularity_ == b.granularity_ &&
-      a.overlap_ == b.overlap_ && a.offsets_ == b.offsets_;
+    bool const res = a.devices_ == b.devices_ &&
+                     a.granularity_ == b.granularity_ &&
+                     a.overlap_ == b.overlap_ &&
+                     a.offsets_ == b.offsets_;
+    return res;
   }

   friend bool operator!=(const GPUDistribution& a, const GPUDistribution& b) {

src/common/math.h

@@ -11,6 +11,7 @@
 #include <vector>
 #include <cmath>
 #include <algorithm>
+#include <utility>

 #include "avx_helpers.h"

 namespace xgboost {
@@ -29,22 +30,31 @@ inline avx::Float8 Sigmoid(avx::Float8 x) {
 }

 /*!
- * \brief do inplace softmax transformaton on p_rec
- * \param p_rec the input/output vector of the values.
+ * \brief Do inplace softmax transformation from start to end
+ *
+ * \tparam Iterator Input iterator type
+ *
+ * \param start Start iterator of input
+ * \param end   End iterator of input
  */
-inline void Softmax(std::vector<float>* p_rec) {
-  std::vector<float> &rec = *p_rec;
-  float wmax = rec[0];
-  for (size_t i = 1; i < rec.size(); ++i) {
-    wmax = std::max(rec[i], wmax);
+template <typename Iterator>
+XGBOOST_DEVICE inline void Softmax(Iterator start, Iterator end) {
+  static_assert(std::is_same<bst_float,
+                typename std::remove_reference<
+                  decltype(std::declval<Iterator>().operator*())>::type
+                >::value,
+                "Values should be of type bst_float");
+  bst_float wmax = *start;
+  for (Iterator i = start+1; i != end; ++i) {
+    wmax = fmaxf(*i, wmax);
   }
   double wsum = 0.0f;
-  for (float & elem : rec) {
-    elem = std::exp(elem - wmax);
-    wsum += elem;
+  for (Iterator i = start; i != end; ++i) {
+    *i = expf(*i - wmax);
+    wsum += *i;
   }
-  for (float & elem : rec) {
-    elem /= static_cast<float>(wsum);
+  for (Iterator i = start; i != end; ++i) {
+    *i /= static_cast<float>(wsum);
   }
 }
@@ -56,7 +66,7 @@ inline void Softmax(std::vector<float>* p_rec) {
  * \tparam Iterator The type of the iterator.
  */
 template<typename Iterator>
-inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
+XGBOOST_DEVICE inline Iterator FindMaxIndex(Iterator begin, Iterator end) {
   Iterator maxit = begin;
   for (Iterator it = begin; it != end; ++it) {
     if (*it > *maxit) maxit = it;
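With Softmax now an XGBOOST_DEVICE template over iterators, one body serves both the CPU objectives and the CUDA kernels. A host-side usage sketch (values illustrative; note the static_assert calls a member operator*, so it requires class-type iterators over bst_float, not raw pointers):

    std::vector<xgboost::bst_float> rec {1.0f, 2.0f, 3.0f};
    xgboost::common::Softmax(rec.begin(), rec.end());
    // rec now holds expf(x - wmax) / wsum: probabilities summing to 1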

src/common/span.h

@@ -49,7 +49,7 @@
  *
  * https://github.com/Microsoft/GSL/pull/664
  *
- * FIXME: Group these MSVC workarounds into a manageable place.
+ * TODO(trivialfis): Group these MSVC workarounds into a manageable place.
  */
 #if defined(_MSC_VER) && _MSC_VER < 1910
@@ -68,7 +68,7 @@ namespace xgboost {
 namespace common {

 // Usual logging facility is not available inside device code.
-// FIXME: Make dmlc check more generic.
+// TODO(trivialfis): Make dmlc check more generic.
 #define KERNEL_CHECK(cond) \
   do {                     \
     if (!(cond)) {         \
@@ -104,11 +104,11 @@ constexpr detail::ptrdiff_t dynamic_extent = -1;  // NOLINT
 enum class byte : unsigned char {};  // NOLINT

 namespace detail {

-template <class ElementType, detail::ptrdiff_t Extent = dynamic_extent>
+template <class ElementType, detail::ptrdiff_t Extent>
 class Span;

 namespace detail {

 template <typename SpanType, bool IsConst>
 class SpanIterator {
   using ElementType = typename SpanType::element_type;
src/common/transform.h (new file, 204 lines)

@@ -0,0 +1,204 @@
/*!
* Copyright 2018 XGBoost contributors
*/
#ifndef XGBOOST_COMMON_TRANSFORM_H_
#define XGBOOST_COMMON_TRANSFORM_H_
#include <dmlc/omp.h>
#include <xgboost/data.h>
#include <vector>
#include <type_traits> // enable_if
#include "host_device_vector.h"
#include "common.h"
#include "span.h"
#if defined (__CUDACC__)
#include "device_helpers.cuh"
#endif
namespace xgboost {
namespace common {
constexpr size_t kBlockThreads = 256;
namespace detail {
#if defined(__CUDACC__)
template <typename Functor, typename... SpanType>
__global__ void LaunchCUDAKernel(Functor _func, Range _range,
SpanType... _spans) {
for (auto i : dh::GridStrideRange(*_range.begin(), *_range.end())) {
_func(i, _spans...);
}
}
#endif
} // namespace detail
/*! \brief Do Transformation on HostDeviceVectors.
*
* \tparam CompiledWithCuda A bool parameter used to distinguish compilation
* trajectories, users do not need to use it.
*
* Note: Using Transform is a VERY tricky thing to do. Transform uses template
* argument to duplicate itself into two different types, one for CPU,
* another for CUDA. The trick is not without its flaw:
*
* If you use it in a function that can be compiled by both nvcc and host
* compiler, the behaviour is un-defined! Because your function is NOT
* duplicated by `CompiledWithCuda`. At link time, cuda compiler resolution
* will merge functions with same signature.
*/
template <bool CompiledWithCuda = WITH_CUDA()>
class Transform {
private:
template <typename Functor>
struct Evaluator {
public:
Evaluator(Functor func, Range range, GPUSet devices, bool reshard) :
func_(func), range_{std::move(range)},
distribution_{std::move(GPUDistribution::Block(devices))},
reshard_{reshard} {}
Evaluator(Functor func, Range range, GPUDistribution dist,
bool reshard) :
func_(func), range_{std::move(range)}, distribution_{std::move(dist)},
reshard_{reshard} {}
/*!
* \brief Evaluate the functor with input pointers to HostDeviceVector.
*
* \tparam HDV... HostDeviceVectors type.
* \param vectors Pointers to HostDeviceVector.
*/
template <typename... HDV>
void Eval(HDV... vectors) const {
bool on_device = !distribution_.IsEmpty();
if (on_device) {
LaunchCUDA(func_, vectors...);
} else {
LaunchCPU(func_, vectors...);
}
}
private:
// CUDA UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec, int _device) const {
return _vec->DeviceSpan(_device);
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec, int _device) const {
return _vec->ConstDeviceSpan(_device);
}
// CPU UnpackHDV
template <typename T>
Span<T> UnpackHDV(HostDeviceVector<T>* _vec) const {
return Span<T> {_vec->HostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
template <typename T>
Span<T const> UnpackHDV(const HostDeviceVector<T>* _vec) const {
return Span<T const> {_vec->ConstHostPointer(),
static_cast<typename Span<T>::index_type>(_vec->Size())};
}
// Recursive unpack for Reshard.
template <typename T>
void UnpackReshard(GPUDistribution dist, const HostDeviceVector<T>* vector) const {
vector->Reshard(dist);
}
template <typename Head, typename... Rest>
void UnpackReshard(GPUDistribution dist,
const HostDeviceVector<Head>* _vector,
const HostDeviceVector<Rest>*... _vectors) const {
_vector->Reshard(dist);
UnpackReshard(dist, _vectors...);
}
#if defined(__CUDACC__)
template <typename std::enable_if<CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
if (reshard_)
UnpackReshard(distribution_, _vectors...);
GPUSet devices = distribution_.Devices();
size_t range_size = *range_.end() - *range_.begin();
#pragma omp parallel for schedule(static, 1) if (devices.Size() > 1)
for (omp_ulong i = 0; i < devices.Size(); ++i) {
int d = devices.Index(i);
// Ignore other attributes of GPUDistribution for spliting index.
size_t shard_size =
GPUDistribution::Block(devices).ShardSize(range_size, d);
Range shard_range {0, static_cast<Range::DifferenceType>(shard_size)};
dh::safe_cuda(cudaSetDevice(d));
const int GRID_SIZE =
static_cast<int>(dh::DivRoundUp(*(range_.end()), kBlockThreads));
detail::LaunchCUDAKernel<<<GRID_SIZE, kBlockThreads>>>(
_func, shard_range, UnpackHDV(_vectors, d)...);
dh::safe_cuda(cudaGetLastError());
dh::safe_cuda(cudaDeviceSynchronize());
}
}
#else
/*! \brief Dummy funtion defined when compiling for CPU. */
template <typename std::enable_if<!CompiledWithCuda>::type* = nullptr,
typename... HDV>
void LaunchCUDA(Functor _func, HDV*... _vectors) const {
LOG(FATAL) << "Not part of device code. WITH_CUDA: " << WITH_CUDA();
}
#endif
template <typename... HDV>
void LaunchCPU(Functor func, HDV*... vectors) const {
auto end = *(range_.end());
#pragma omp parallel for schedule(static)
for (omp_ulong idx = 0; idx < end; ++idx) {
func(idx, UnpackHDV(vectors)...);
}
}
private:
/*! \brief Callable object. */
Functor func_;
/*! \brief Range object specifying parallel threads index range. */
Range range_;
/*! \brief Whether resharding for vectors is required. */
bool reshard_;
GPUDistribution distribution_;
};
public:
/*!
* \brief Initialize a Transform object.
*
* \tparam Functor A callable object type.
* \return A Evaluator having one method Eval.
*
* \param func A callable object, accepting a size_t thread index,
* followed by a set of Span classes.
* \param range Range object specifying parallel threads index range.
* \param devices GPUSet specifying GPUs to use, when compiling for CPU,
* this should be GPUSet::Empty().
* \param reshard Whether Reshard for HostDeviceVector is needed.
*/
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUSet const devices,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(devices), reshard};
}
template <typename Functor>
static Evaluator<Functor> Init(Functor func, Range const range,
GPUDistribution const dist,
bool const reshard = true) {
return Evaluator<Functor> {func, std::move(range), std::move(dist), reshard};
}
};
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_TRANSFORM_H_
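For orientation, a hedged sketch of how an objective might drive this class; the functor, names, and sizes here are illustrative, not lifted from the commit:

    // Eval unpacks each HostDeviceVector into a Span, so the functor takes
    // a thread index followed by one Span per vector passed to Eval.
    struct Shift {
      XGBOOST_DEVICE void operator()(size_t _idx,
                                     xgboost::common::Span<xgboost::bst_float> _preds) const {
        _preds[_idx] += 0.5f;
      }
    };

    xgboost::HostDeviceVector<xgboost::bst_float> preds(32, 0.0f);
    xgboost::common::Transform<>::Init(
        Shift{},
        xgboost::common::Range{
            0, static_cast<xgboost::common::Range::DifferenceType>(preds.Size())},
        xgboost::GPUSet::Empty())  // empty set selects the CPU branch
        .Eval(&preds);

With an empty GPUSet the block distribution is empty, so Eval takes the LaunchCPU branch and runs the functor under OpenMP; with devices present the same functor is launched through LaunchCUDAKernel on each shard.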