Fix clang-tidy warnings. (#4149)
* Upgrade gtest for clang-tidy.
* Use CMake to install GTest instead of mv.
* Don't enforce clang-tidy to return 0 due to errors in thrust.
* Add a small test for tidy itself.
* Reformat.
parent 259fb809e9
commit 7b9043cf71
@@ -87,7 +87,10 @@ class Booster {
     initialized_ = true;
   }
 
- public:
+  bool IsInitialized() const { return initialized_; }
+  void Intialize() { initialized_ = true; }
+
+ private:
   bool configured_;
   bool initialized_;
   std::unique_ptr<Learner> learner_;
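Note: the hunk above stops exposing Booster's `initialized_` flag directly and routes callers through a small accessor pair (the `Intialize` spelling is the actual identifier in the patch). A minimal sketch of the resulting call pattern, with the class body abbreviated and the free function hypothetical:

    class Booster {
     public:
      bool IsInitialized() const { return initialized_; }
      void Intialize() { initialized_ = true; }

     private:
      bool initialized_ {false};
    };

    void AfterCheckpointLoad(Booster* bst) {
      if (!bst->IsInitialized()) {
        bst->Intialize();  // was: bst->initialized_ = true;
      }
    }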
@@ -1153,7 +1156,7 @@ XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
   auto* bst = static_cast<Booster*>(handle);
   *version = rabit::LoadCheckPoint(bst->learner());
   if (*version != 0) {
-    bst->initialized_ = true;
+    bst->Intialize();
   }
   API_END();
 }

@@ -42,7 +42,9 @@ class Column {
   uint32_t GetBaseIdx() const { return index_base_; }
   ColumnType GetType() const { return type_; }
   size_t GetRowIdx(size_t idx) const {
-    return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx];
+    // clang-tidy worries that row_ind_ might be a nullptr, which is possible,
+    // but low level structure is not safe anyway.
+    return type_ == ColumnType::kDenseColumn ? idx : row_ind_[idx];  // NOLINT
   }
   bool IsMissing(size_t idx) const {
     return index_[idx] == std::numeric_limits<uint32_t>::max();
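Note: `// NOLINT` suppresses clang-tidy for the single line it sits on, and the two new comment lines record why the suppression is acceptable instead of silencing the check wordlessly. A hedged illustration of the mechanism, not taken from this patch:

    // clang-tidy would warn that `p` may be null; we accept that this
    // low-level accessor does no checking, and silence only this line.
    inline int Deref(const int* p, int i) {
      return p[i];  // NOLINT
    }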
@@ -772,7 +772,7 @@ template <typename T>
 typename std::iterator_traits<T>::value_type SumReduction(
     dh::CubMemory &tmp_mem, T in, int nVals) {
   using ValueT = typename std::iterator_traits<T>::value_type;
-  size_t tmpSize;
+  size_t tmpSize {0};
   ValueT *dummy_out = nullptr;
   dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, dummy_out, nVals));
   // Allocate small extra memory for the return value
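Note: CUB reductions use a two-phase protocol: the first call passes a null temporary buffer so the library only writes the required byte count into `tmpSize`, and the second call does the real work. Initializing `tmpSize {0}` satisfies clang-tidy, which cannot see that the first call assigns it. A self-contained sketch of the idiom (allocation helpers elided to raw CUDA calls, names hypothetical):

    #include <cub/cub.cuh>

    float SumOnDevice(const float* d_in, float* d_out, int n) {
      size_t tmp_size {0};  // defined before the sizing call, as in the patch
      cub::DeviceReduce::Sum(nullptr, tmp_size, d_in, d_out, n);  // phase 1: size query
      void* d_temp = nullptr;
      cudaMalloc(&d_temp, tmp_size);
      cub::DeviceReduce::Sum(d_temp, tmp_size, d_in, d_out, n);   // phase 2: reduce
      cudaFree(d_temp);
      float result;
      cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      return result;
    }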
@@ -548,7 +548,7 @@ void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
   const size_t rest = nrows % kUnroll;
 
 #if defined(_OPENMP)
-  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
+  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);  // NOLINT
 #endif  // defined(_OPENMP)
   tree::GradStats* p_hist = hist.data();
 

@@ -594,7 +594,7 @@ void GHistBuilder::SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow pa
   const uint32_t rest = nbins % kUnroll;
 
 #if defined(_OPENMP)
-  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);
+  const auto nthread = static_cast<bst_omp_uint>(this->nthread_);  // NOLINT
 #endif  // defined(_OPENMP)
   tree::GradStats* p_self = self.data();
   tree::GradStats* p_sibling = sibling.data();

@@ -24,13 +24,14 @@ namespace common {
 
 using WXQSketch = HistCutMatrix::WXQSketch;
 
-__global__ void find_cuts_k
+__global__ void FindCutsK
 (WXQSketch::Entry* __restrict__ cuts, const bst_float* __restrict__ data,
  const float* __restrict__ cum_weights, int nsamples, int ncuts) {
   // ncuts < nsamples
   int icut = threadIdx.x + blockIdx.x * blockDim.x;
-  if (icut >= ncuts)
+  if (icut >= ncuts) {
     return;
+  }
   WXQSketch::Entry v;
   int isample = 0;
   if (icut == 0) {
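Note: the `if (icut >= ncuts) { return; }` change applies the readability rule that single-statement bodies still get braces, which guards against a later edit silently falling outside the conditional. Illustration (hypothetical code):

    // Unbraced: a line added later sits outside the if, despite the indentation.
    if (done)
      Cleanup();
      Log();       // always executes

    // Braced: the scope is explicit.
    if (done) {
      Cleanup();
    }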
@@ -55,7 +56,7 @@ struct IsNotNaN {
   __device__ bool operator()(float a) const { return !isnan(a); }
 };
 
-__global__ void unpack_features_k
+__global__ void UnpackFeaturesK
 (float* __restrict__ fvalues, float* __restrict__ feature_weights,
  const size_t* __restrict__ row_ptrs, const float* __restrict__ weights,
  Entry* entries, size_t nrows_array, int ncols, size_t row_begin_ptr,

@@ -75,7 +76,7 @@ __global__ void unpack_features_k
   // if and only if it is also written to features
   if (!isnan(entry.fvalue) && (weights == nullptr || !isnan(weights[irow]))) {
     fvalues[ind] = entry.fvalue;
-    if (feature_weights != nullptr) {
+    if (feature_weights != nullptr && weights != nullptr) {
       feature_weights[ind] = weights[irow];
     }
   }

@@ -84,7 +85,7 @@ __global__ void unpack_features_k
 // finds quantiles on the GPU
 struct GPUSketcher {
   // manage memory for a single GPU
-  struct DeviceShard {
+  class DeviceShard {
     int device_;
    bst_uint row_begin_;  // The row offset for this shard
    bst_uint row_end_;

@@ -110,6 +111,7 @@ struct GPUSketcher {
    thrust::device_vector<size_t> num_elements_;
    thrust::device_vector<char> tmp_storage_;
 
+   public:
    DeviceShard(int device, bst_uint row_begin, bst_uint row_end,
                tree::TrainParam param) :
        device_(device), row_begin_(row_begin), row_end_(row_end),

@@ -268,7 +270,7 @@ struct GPUSketcher {
     } else if (n_cuts_cur_[icol] > 0) {
       // if more elements than cuts: use binary search on cumulative weights
       int block = 256;
-      find_cuts_k<<<dh::DivRoundUp(n_cuts_cur_[icol], block), block>>>
+      FindCutsK<<<dh::DivRoundUp(n_cuts_cur_[icol], block), block>>>
         (cuts_d_.data().get() + icol * n_cuts_, fvalues_cur_.data().get(),
          weights2_.data().get(), n_unique, n_cuts_cur_[icol]);
       dh::safe_cuda(cudaGetLastError());  // NOLINT

@@ -309,7 +311,7 @@ struct GPUSketcher {
     dim3 block3(64, 4, 1);
     dim3 grid3(dh::DivRoundUp(batch_nrows, block3.x),
                dh::DivRoundUp(num_cols_, block3.y), 1);
-    unpack_features_k<<<grid3, block3>>>
+    UnpackFeaturesK<<<grid3, block3>>>
       (fvalues_.data().get(), has_weights_ ? feature_weights_.data().get() : nullptr,
        row_ptrs_.data().get() + batch_row_begin,
        has_weights_ ? weights_.data().get() : nullptr, entries_.data().get(),

@@ -340,6 +342,10 @@ struct GPUSketcher {
         SketchBatch(row_batch, info, gpu_batch);
       }
     }
+
+    void GetSummary(WXQSketch::SummaryContainer *summary, size_t const icol) {
+      sketches_[icol].GetSummary(summary);
+    }
   };
 
   void Sketch(const SparsePage& batch, const MetaInfo& info,
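Note: `DeviceShard` becoming a `class` with private members is what forces the new `GetSummary` method; outside code can no longer reach into `shard->sketches_`. The next hunk shows the call-site effect, reproduced here side by side:

    // Before: GPUSketcher reached into another object's internals.
    shards_[shard]->sketches_[icol].GetSummary(&summary);

    // After: the shard performs the operation on its own private state,
    // and the index loop collapses to a range-for.
    for (auto& shard : shards_) {
      shard->GetSummary(&summary, icol);
    }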
@@ -368,8 +374,8 @@ struct GPUSketcher {
     WXQSketch::SummaryContainer summary;
     for (int icol = 0; icol < num_cols; ++icol) {
       sketches[icol].Init(batch.Size(), 1.0 / (8 * param_.max_bin));
-      for (int shard = 0; shard < shards_.size(); ++shard) {
-        shards_[shard]->sketches_[icol].GetSummary(&summary);
+      for (auto &shard : shards_) {
+        shard->GetSummary(&summary, icol);
         sketches[icol].PushSummary(summary);
       }
     }

@@ -381,6 +387,7 @@ struct GPUSketcher {
     dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus, n_rows));
   }
 
+ private:
  std::vector<std::unique_ptr<DeviceShard>> shards_;
  tree::TrainParam param_;
  GPUDistribution dist_;

@@ -38,6 +38,7 @@ struct HistCutMatrix {
   void Init(std::vector<WXQSketch>* sketchs, uint32_t max_num_bins);
 
   HistCutMatrix();
+  size_t NumBins() const { return row_ptr.back(); }
 
  protected:
   virtual size_t SearchGroupIndFromBaseRow(

@@ -18,6 +18,11 @@ struct HostDeviceVectorImpl {
   explicit HostDeviceVectorImpl(size_t size, T v) : data_h_(size, v), distribution_() {}
   HostDeviceVectorImpl(std::initializer_list<T> init) : data_h_(init), distribution_() {}
   explicit HostDeviceVectorImpl(std::vector<T> init) : data_h_(std::move(init)), distribution_() {}
+
+  std::vector<T>& Vec() { return data_h_; }
+  GPUDistribution& Dist() { return distribution_; }
+
+ private:
   std::vector<T> data_h_;
   GPUDistribution distribution_;
 };
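Note: the CPU-side `HostDeviceVectorImpl` keeps its data behind `Vec()`/`Dist()` accessors, so the out-of-line `HostDeviceVector<T>` methods in the following hunks switch from `impl_->data_h_` to `impl_->Vec()`. A reduced sketch of the pattern with simplified, hypothetical types:

    #include <cstddef>
    #include <vector>

    template <typename T>
    class Impl {
     public:
      std::vector<T>& Vec() { return data_h_; }

     private:
      std::vector<T> data_h_;
    };

    template <typename T>
    class HostVector {
     public:
      size_t Size() const { return impl_->Vec().size(); }  // was impl_->data_h_.size()

     private:
      Impl<T>* impl_ {nullptr};
    };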
@@ -64,14 +69,14 @@ HostDeviceVector<T>& HostDeviceVector<T>::operator=(const HostDeviceVector<T>& o
 }
 
 template <typename T>
-size_t HostDeviceVector<T>::Size() const { return impl_->data_h_.size(); }
+size_t HostDeviceVector<T>::Size() const { return impl_->Vec().size(); }
 
 template <typename T>
 GPUSet HostDeviceVector<T>::Devices() const { return GPUSet::Empty(); }
 
 template <typename T>
 const GPUDistribution& HostDeviceVector<T>::Distribution() const {
-  return impl_->distribution_;
+  return impl_->Dist();
 }
 
 template <typename T>

@@ -93,16 +98,16 @@ common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan(int device) const {
 }
 
 template <typename T>
-std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->data_h_; }
+std::vector<T>& HostDeviceVector<T>::HostVector() { return impl_->Vec(); }
 
 template <typename T>
 const std::vector<T>& HostDeviceVector<T>::ConstHostVector() const {
-  return impl_->data_h_;
+  return impl_->Vec();
 }
 
 template <typename T>
 void HostDeviceVector<T>::Resize(size_t new_size, T v) {
-  impl_->data_h_.resize(new_size, v);
+  impl_->Vec().resize(new_size, v);
 }
 
 template <typename T>

@@ -23,10 +23,10 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) {
 // wrapper over access with useful methods
 class Permissions {
   GPUAccess access_;
-  explicit Permissions(GPUAccess access) : access_(access) {}
+  explicit Permissions(GPUAccess access) : access_{access} {}
 
  public:
-  Permissions() : access_(GPUAccess::kNone) {}
+  Permissions() : access_{GPUAccess::kNone} {}
   explicit Permissions(bool perm)
       : access_(perm ? GPUAccess::kWrite : GPUAccess::kNone) {}
 

@@ -46,8 +46,8 @@ template <typename T>
 struct HostDeviceVectorImpl {
   struct DeviceShard {
     DeviceShard()
-        : proper_size_(0), device_(-1), start_(0), perm_d_(false),
-          cached_size_(~0), vec_(nullptr) {}
+        : proper_size_{0}, device_{-1}, start_{0}, perm_d_{false},
+          cached_size_{static_cast<size_t>(~0)}, vec_{nullptr} {}
 
     void Init(HostDeviceVectorImpl<T>* vec, int device) {
      if (vec_ == nullptr) { vec_ = vec; }
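Note: the switch from parentheses to braces in these initializer lists is not cosmetic: braced initialization rejects implicit narrowing, which is exactly why `~0` (an `int` with value -1) now needs an explicit `static_cast<size_t>` where the old `cached_size_(~0)` converted silently. Illustration:

    #include <cstddef>

    struct S {
      S() : a(~0) {}                          // parenthesized: -1 narrows silently
      // S() : a{~0} {}                       // braced: error, narrowing int -> size_t
      // S() : a{static_cast<size_t>(~0)} {}  // braced + explicit cast: OK
      size_t a;
    };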
@@ -154,6 +154,13 @@ struct HostDeviceVectorImpl {
       }
     }
 
+    T* Raw() { return data_.data().get(); }
+    size_t Start() const { return start_; }
+    size_t DataSize() const { return data_.size(); }
+    Permissions& Perm() { return perm_d_; }
+    Permissions const& Perm() const { return perm_d_; }
+
+   private:
     int device_;
     thrust::device_vector<T> data_;
     // cached vector size

@@ -216,20 +223,20 @@ struct HostDeviceVectorImpl {
   T* DevicePointer(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
-    return shards_.at(distribution_.devices_.Index(device)).data_.data().get();
+    return shards_.at(distribution_.devices_.Index(device)).Raw();
   }
 
   const T* ConstDevicePointer(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).data_.data().get();
+    return shards_.at(distribution_.devices_.Index(device)).Raw();
   }
 
   common::Span<T> DeviceSpan(int device) {
     GPUSet devices = distribution_.devices_;
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kWrite);
-    return {shards_.at(devices.Index(device)).data_.data().get(),
+    return {shards_.at(devices.Index(device)).Raw(),
             static_cast<typename common::Span<T>::index_type>(DeviceSize(device))};
   }
 

@@ -237,20 +244,21 @@ struct HostDeviceVectorImpl {
     GPUSet devices = distribution_.devices_;
     CHECK(devices.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return {shards_.at(devices.Index(device)).data_.data().get(),
-            static_cast<typename common::Span<const T>::index_type>(DeviceSize(device))};
+    using SpanInd = typename common::Span<const T>::index_type;
+    return {shards_.at(devices.Index(device)).Raw(),
+            static_cast<SpanInd>(DeviceSize(device))};
   }
 
   size_t DeviceSize(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).data_.size();
+    return shards_.at(distribution_.devices_.Index(device)).DataSize();
   }
 
   size_t DeviceStart(int device) {
     CHECK(distribution_.devices_.Contains(device));
     LazySyncDevice(device, GPUAccess::kRead);
-    return shards_.at(distribution_.devices_.Index(device)).start_;
+    return shards_.at(distribution_.devices_.Index(device)).Start();
   }
 
   thrust::device_ptr<T> tbegin(int device) {  // NOLINT

@@ -293,7 +301,7 @@ struct HostDeviceVectorImpl {
     }
   }
 
-  void Fill(T v) {
+  void Fill(T v) {  // NOLINT
     if (perm_h_.CanWrite()) {
       std::fill(data_h_.begin(), data_h_.end(), v);
     } else {

@@ -389,7 +397,7 @@ struct HostDeviceVectorImpl {
     if (perm_h_.CanRead()) {
       // data is present, just need to deny access to the device
       dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
-        shard.perm_d_.DenyComplementary(access);
+        shard.Perm().DenyComplementary(access);
       });
       perm_h_.Grant(access);
       return;

@@ -412,9 +420,10 @@ struct HostDeviceVectorImpl {
   bool DeviceCanAccess(int device, GPUAccess access) {
     GPUSet devices = distribution_.Devices();
     if (!devices.Contains(device)) { return false; }
-    return shards_.at(devices.Index(device)).perm_d_.CanAccess(access);
+    return shards_.at(devices.Index(device)).Perm().CanAccess(access);
   }
 
+ private:
   std::vector<T> data_h_;
   Permissions perm_h_;
   // the total size of the data stored on the devices

@@ -7,6 +7,8 @@
 #ifndef XGBOOST_COMMON_MATH_H_
 #define XGBOOST_COMMON_MATH_H_
 
+#include <xgboost/base.h>
+
 #include <utility>
 #include <vector>
 #include <cmath>

@@ -622,8 +622,8 @@ XGBOOST_DEVICE auto as_writable_bytes(Span<T, E> s) __span_noexcept ->  // NOLIN
   return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
 }
 
-}  // namespace common NOLINT
-}  // namespace xgboost NOLINT
+}  // namespace common
+}  // namespace xgboost
 
 #if defined(_MSC_VER) &&_MSC_VER < 1910
 #undef constexpr

@@ -30,8 +30,8 @@ class CoordinateUpdater : public LinearUpdater {
       tparam_.InitAllowUnknown(args)
     };
     cparam_.InitAllowUnknown(rest);
-    selector.reset(FeatureSelector::Create(tparam_.feature_selector));
-    monitor.Init("CoordinateUpdater");
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
+    monitor_.Init("CoordinateUpdater");
   }
   void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
               gbm::GBLinearModel *model, double sum_instance_weight) override {
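Note: the `selector` -> `selector_` and `monitor` -> `monitor_` renames follow the Google-style convention, checked by clang-tidy's identifier-naming rules, that class data members carry a trailing underscore so they cannot be confused with locals or parameters. A minimal illustration:

    class Counter {
     public:
      void Add(int count) { count_ += count; }  // parameter vs. member is unambiguous

     private:
      int count_ {0};  // trailing underscore marks the data member
    };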
@@ -48,20 +48,20 @@ class CoordinateUpdater : public LinearUpdater {
                         dbias, &in_gpair->HostVector(), p_fmat);
     }
     // prepare for updating the weights
-    selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
+    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
                     tparam_.reg_alpha_denorm,
                     tparam_.reg_lambda_denorm, cparam_.top_k);
     // update weights
     for (int group_idx = 0; group_idx < ngroup; ++group_idx) {
       for (unsigned i = 0U; i < model->param.num_feature; i++) {
-        int fidx = selector->NextFeature
+        int fidx = selector_->NextFeature
           (i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
            tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model);
       }
     }
-    monitor.Stop("UpdateFeature");
+    monitor_.Stop("UpdateFeature");
   }
 
   inline void UpdateFeature(int fidx, int group_idx, std::vector<GradientPair> *in_gpair,

@@ -78,11 +78,12 @@ class CoordinateUpdater : public LinearUpdater {
     UpdateResidualParallel(fidx, group_idx, ngroup, dw, in_gpair, p_fmat);
   }
 
+ private:
   CoordinateParam cparam_;
   // training parameter
   LinearTrainParam tparam_;
-  std::unique_ptr<FeatureSelector> selector;
-  common::Monitor monitor;
+  std::unique_ptr<FeatureSelector> selector_;
+  common::Monitor monitor_;
 };
 
 XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent")

@@ -62,7 +62,7 @@ class DeviceShard {
     auto column_end =
         std::lower_bound(col.cbegin(), col.cend(),
                          xgboost::Entry(row_end, 0.0f), cmp);
-    column_segments.push_back(
+    column_segments.emplace_back(
         std::make_pair(column_begin - col.cbegin(), column_end - col.cbegin()));
     row_ptr_.push_back(row_ptr_.back() + (column_end - column_begin));
   }
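Note: `emplace_back` constructs the element in place instead of building a temporary and moving it in, which is what the modernize-use-emplace check suggests. The patch keeps the `std::make_pair` argument; forwarding the two values directly would avoid the temporary entirely, as this hypothetical snippet shows:

    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::pair<long, long>> segments;
      segments.push_back(std::make_pair(1, 2));    // temporary pair, then move
      segments.emplace_back(std::make_pair(3, 4)); // as in the patch: still a temporary
      segments.emplace_back(5, 6);                 // constructed in place
    }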
@@ -154,13 +154,13 @@ class GPUCoordinateUpdater : public LinearUpdater {
   void Init(
       const std::vector<std::pair<std::string, std::string>> &args) override {
     tparam_.InitAllowUnknown(args);
-    selector.reset(FeatureSelector::Create(tparam_.feature_selector));
-    monitor.Init("GPUCoordinateUpdater");
+    selector_.reset(FeatureSelector::Create(tparam_.feature_selector));
+    monitor_.Init("GPUCoordinateUpdater");
   }
 
   void LazyInitShards(DMatrix *p_fmat,
                       const gbm::GBLinearModelParam &model_param) {
-    if (!shards.empty()) return;
+    if (!shards_.empty()) return;
 
     dist_ = GPUDistribution::Block(GPUSet::All(tparam_.gpu_id, tparam_.n_gpus,
                                                p_fmat->Info().num_row_));

@@ -183,9 +183,9 @@ class GPUCoordinateUpdater : public LinearUpdater {
     CHECK(p_fmat->SingleColBlock());
     SparsePage const& batch = *(p_fmat->GetColumnBatches().begin());
 
-    shards.resize(n_devices);
+    shards_.resize(n_devices);
     // Create device shards
-    dh::ExecuteIndexShards(&shards,
+    dh::ExecuteIndexShards(&shards_,
                            [&](int i, std::unique_ptr<DeviceShard>& shard) {
       shard = std::unique_ptr<DeviceShard>(
           new DeviceShard(devices.DeviceId(i), batch, row_segments[i],

@@ -196,38 +196,38 @@ class GPUCoordinateUpdater : public LinearUpdater {
   void Update(HostDeviceVector<GradientPair> *in_gpair, DMatrix *p_fmat,
               gbm::GBLinearModel *model, double sum_instance_weight) override {
     tparam_.DenormalizePenalties(sum_instance_weight);
-    monitor.Start("LazyInitShards");
+    monitor_.Start("LazyInitShards");
     this->LazyInitShards(p_fmat, model->param);
-    monitor.Stop("LazyInitShards");
+    monitor_.Stop("LazyInitShards");
 
-    monitor.Start("UpdateGpair");
+    monitor_.Start("UpdateGpair");
     // Update gpair
-    dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
+    dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
       if (!shard->IsEmpty()) {
         shard->UpdateGpair(in_gpair->ConstHostVector(), model->param);
       }
     });
-    monitor.Stop("UpdateGpair");
+    monitor_.Stop("UpdateGpair");
 
-    monitor.Start("UpdateBias");
+    monitor_.Start("UpdateBias");
     this->UpdateBias(p_fmat, model);
-    monitor.Stop("UpdateBias");
+    monitor_.Stop("UpdateBias");
     // prepare for updating the weights
-    selector->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
+    selector_->Setup(*model, in_gpair->ConstHostVector(), p_fmat,
                     tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm,
                     coord_param_.top_k);
-    monitor.Start("UpdateFeature");
+    monitor_.Start("UpdateFeature");
     for (auto group_idx = 0; group_idx < model->param.num_output_group;
          ++group_idx) {
       for (auto i = 0U; i < model->param.num_feature; i++) {
-        auto fidx = selector->NextFeature(
+        auto fidx = selector_->NextFeature(
             i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat,
             tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm);
         if (fidx < 0) break;
         this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), model);
       }
     }
-    monitor.Stop("UpdateFeature");
+    monitor_.Stop("UpdateFeature");
   }
 
   void UpdateBias(DMatrix *p_fmat, gbm::GBLinearModel *model) {

@@ -235,7 +235,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
        ++group_idx) {
     // Get gradient
     auto grad = dh::ReduceShards<GradientPair>(
-        &shards, [&](std::unique_ptr<DeviceShard> &shard) {
+        &shards_, [&](std::unique_ptr<DeviceShard> &shard) {
          if (!shard->IsEmpty()) {
            GradientPair result =
                shard->GetBiasGradient(group_idx,

@@ -251,7 +251,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
     model->bias()[group_idx] += dbias;
 
     // Update residual
-    dh::ExecuteIndexShards(&shards, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
+    dh::ExecuteIndexShards(&shards_, [&](int idx, std::unique_ptr<DeviceShard>& shard) {
       if (!shard->IsEmpty()) {
         shard->UpdateBiasResidual(dbias, group_idx,
                                   model->param.num_output_group);

@@ -266,7 +266,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
     bst_float &w = (*model)[fidx][group_idx];
     // Get gradient
     auto grad = dh::ReduceShards<GradientPair>(
-        &shards, [&](std::unique_ptr<DeviceShard> &shard) {
+        &shards_, [&](std::unique_ptr<DeviceShard> &shard) {
          if (!shard->IsEmpty()) {
            return shard->GetGradient(group_idx, model->param.num_output_group,
                                      fidx);

@@ -280,7 +280,7 @@ class GPUCoordinateUpdater : public LinearUpdater {
                              tparam_.reg_lambda_denorm));
     w += dw;
 
-    dh::ExecuteIndexShards(&shards, [&](int idx,
+    dh::ExecuteIndexShards(&shards_, [&](int idx,
                                         std::unique_ptr<DeviceShard> &shard) {
       if (!shard->IsEmpty()) {
         shard->UpdateResidual(dw, group_idx, model->param.num_output_group, fidx);

@@ -288,14 +288,15 @@ class GPUCoordinateUpdater : public LinearUpdater {
     });
   }
 
+ private:
   // training parameter
   LinearTrainParam tparam_;
   CoordinateParam coord_param_;
   GPUDistribution dist_;
-  std::unique_ptr<FeatureSelector> selector;
-  common::Monitor monitor;
+  std::unique_ptr<FeatureSelector> selector_;
+  common::Monitor monitor_;
 
-  std::vector<std::unique_ptr<DeviceShard>> shards;
+  std::vector<std::unique_ptr<DeviceShard>> shards_;
 };
 
 XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent")

@@ -27,10 +27,15 @@ namespace metric {
 // tag the this file, used by force static link later.
 DMLC_REGISTRY_FILE_TAG(elementwise_metric);
 
-struct PackedReduceResult {
+template <typename EvalRow>
+class MetricsReduction {
+ public:
+  class PackedReduceResult {
     double residue_sum_;
    double weights_sum_;
+    friend MetricsReduction;
 
+   public:
    XGBOOST_DEVICE PackedReduceResult() : residue_sum_{0}, weights_sum_{0} {}
    XGBOOST_DEVICE PackedReduceResult(double residue, double weight) :
        residue_sum_{residue}, weights_sum_{weight} {}

@@ -40,10 +45,10 @@ struct PackedReduceResult {
       return PackedReduceResult { residue_sum_ + other.residue_sum_,
                                   weights_sum_ + other.weights_sum_ };
     }
+    double Residue() const { return residue_sum_; }
+    double Weights() const { return weights_sum_; }
   };
 
-template <typename EvalRow>
-class MetricsReduction {
  public:
   explicit MetricsReduction(EvalRow policy) :
       policy_(std::move(policy)) {}

@@ -346,10 +351,10 @@ struct EvalEWiseBase : public Metric {
     // Dealing with ndata < n_gpus.
     GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus, ndata);
 
-    PackedReduceResult result =
+    auto result =
         reducer_.Reduce(devices, info.weights_, info.labels_, preds);
 
-    double dat[2] { result.residue_sum_, result.weights_sum_ };
+    double dat[2] { result.Residue(), result.Weights() };
     if (distributed) {
       rabit::Allreduce<rabit::op::Sum>(dat, 2);
     }
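Note: with `PackedReduceResult` nested inside `MetricsReduction` and its sums private, callers read results through `Residue()`/`Weights()`, and `auto result` avoids spelling out the now-nested type name. A reduced sketch of the accessor side (the free function is hypothetical):

    class PackedReduceResult {
     public:
      PackedReduceResult(double residue, double weight)
          : residue_sum_{residue}, weights_sum_{weight} {}
      double Residue() const { return residue_sum_; }
      double Weights() const { return weights_sum_; }

     private:
      double residue_sum_;
      double weights_sum_;
    };

    double Finalize(const PackedReduceResult& r) {
      return r.Residue() / r.Weights();  // was: r.residue_sum_ / r.weights_sum_
    }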
@@ -79,6 +79,8 @@ struct EvalMClassBase : public Metric {
   inline static bst_float GetFinal(bst_float esum, bst_float wsum) {
     return esum / wsum;
   }
+
+ private:
   // used to store error message
   const char *error_msg_;
 };

@@ -48,7 +48,7 @@ void IncrementOffset(IterT begin_itr, IterT end_itr, size_t amount) {
  */
 struct DevicePredictionNode {
   XGBOOST_DEVICE DevicePredictionNode()
-      : fidx(-1), left_child_idx(-1), right_child_idx(-1) {}
+      : fidx{-1}, left_child_idx{-1}, right_child_idx{-1} {}
 
   union NodeValue {
     float leaf_weight;

@@ -238,10 +238,10 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
   struct DeviceShard {
-    DeviceShard() : device_(-1) {}
+    DeviceShard() : device_{-1} {}
     void Init(int device) {
       this->device_ = device;
-      max_shared_memory_bytes = dh::MaxSharedMemory(this->device_);
+      max_shared_memory_bytes_ = dh::MaxSharedMemory(this->device_);
     }
     void PredictInternal
     (const SparsePage& batch, const MetaInfo& info,

@@ -251,18 +251,18 @@ class GPUPredictor : public xgboost::Predictor {
      const thrust::host_vector<DevicePredictionNode>& h_nodes,
      size_t tree_begin, size_t tree_end) {
       dh::safe_cuda(cudaSetDevice(device_));
-      nodes.resize(h_nodes.size());
-      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(nodes), h_nodes.data(),
+      nodes_.resize(h_nodes.size());
+      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(nodes_), h_nodes.data(),
                                     sizeof(DevicePredictionNode) * h_nodes.size(),
                                     cudaMemcpyHostToDevice));
-      tree_segments.resize(h_tree_segments.size());
+      tree_segments_.resize(h_tree_segments.size());
 
-      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_segments), h_tree_segments.data(),
+      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_segments_), h_tree_segments.data(),
                                     sizeof(size_t) * h_tree_segments.size(),
                                     cudaMemcpyHostToDevice));
-      tree_group.resize(model.tree_info.size());
+      tree_group_.resize(model.tree_info.size());
 
-      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_group), model.tree_info.data(),
+      dh::safe_cuda(cudaMemcpyAsync(dh::Raw(tree_group_), model.tree_info.data(),
                                     sizeof(int) * model.tree_info.size(),
                                     cudaMemcpyHostToDevice));
 

@@ -275,7 +275,7 @@ class GPUPredictor : public xgboost::Predictor {
       int shared_memory_bytes = static_cast<int>
         (sizeof(float) * info.num_col_ * BLOCK_THREADS);
       bool use_shared = true;
-      if (shared_memory_bytes > max_shared_memory_bytes) {
+      if (shared_memory_bytes > max_shared_memory_bytes_) {
         shared_memory_bytes = 0;
         use_shared = false;
       }

@@ -284,17 +284,18 @@ class GPUPredictor : public xgboost::Predictor {
                                          data_distr.Devices().Index(device_));
 
       PredictKernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS, shared_memory_bytes>>>
-        (dh::ToSpan(nodes), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments),
-         dh::ToSpan(tree_group), batch.offset.DeviceSpan(device_),
+        (dh::ToSpan(nodes_), predictions->DeviceSpan(device_), dh::ToSpan(tree_segments_),
+         dh::ToSpan(tree_group_), batch.offset.DeviceSpan(device_),
          batch.data.DeviceSpan(device_), tree_begin, tree_end, info.num_col_,
          num_rows, entry_start, use_shared, model.param.num_output_group);
     }
 
+   private:
     int device_;
-    thrust::device_vector<DevicePredictionNode> nodes;
-    thrust::device_vector<size_t> tree_segments;
-    thrust::device_vector<int> tree_group;
-    size_t max_shared_memory_bytes;
+    thrust::device_vector<DevicePredictionNode> nodes_;
+    thrust::device_vector<size_t> tree_segments_;
+    thrust::device_vector<int> tree_group_;
+    size_t max_shared_memory_bytes_;
   };
 
   void DevicePredictInternal(DMatrix* dmat,

@@ -325,13 +326,12 @@ class GPUPredictor : public xgboost::Predictor {
 
     for (const auto &batch : dmat->GetRowBatches()) {
       CHECK_EQ(i_batch, 0) << "External memory not supported";
-      size_t n_rows = batch.offset.Size() - 1;
       // out_preds have been resharded and resized in InitOutPredictions()
       batch.offset.Reshard(GPUDistribution::Overlap(devices_, 1));
       std::vector<size_t> device_offsets;
       DeviceOffsets(batch.offset, &device_offsets);
       batch.data.Reshard(GPUDistribution::Explicit(devices_, device_offsets));
-      dh::ExecuteIndexShards(&shards, [&](int idx, DeviceShard& shard) {
+      dh::ExecuteIndexShards(&shards_, [&](int idx, DeviceShard& shard) {
        shard.PredictInternal(batch, dmat->Info(), out_preds, model,
                              h_tree_segments, h_nodes, tree_begin, tree_end);
      });
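Note: the deleted `size_t n_rows = batch.offset.Size() - 1;` was a dead store: the value was computed but never read again, which clang-tidy flags; removing the line is the entire fix. A generic illustration of the diagnostic:

    int Sum(const int* v, int n) {
      int n_rows = n - 1;  // dead store: never read; clang-tidy flags it
      int s = 0;
      for (int i = 0; i < n; ++i) s += v[i];
      return s;
    }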
@@ -340,13 +340,13 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
  public:
-  GPUPredictor() : cpu_predictor(Predictor::Create("cpu_predictor")) {}
+  GPUPredictor() : cpu_predictor_(Predictor::Create("cpu_predictor")) {}
 
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
     GPUSet devices = GPUSet::All(
-        param.gpu_id, param.n_gpus, dmat->Info().num_row_);
+        param_.gpu_id, param_.n_gpus, dmat->Info().num_row_);
     ConfigureShards(devices);
 
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {

@@ -427,12 +427,12 @@ class GPUPredictor : public xgboost::Predictor {
                        std::vector<bst_float>* out_preds,
                        const gbm::GBTreeModel& model, unsigned ntree_limit,
                        unsigned root_index) override {
-    cpu_predictor->PredictInstance(inst, out_preds, model, root_index);
+    cpu_predictor_->PredictInstance(inst, out_preds, model, root_index);
   }
   void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
                    const gbm::GBTreeModel& model,
                    unsigned ntree_limit) override {
-    cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
+    cpu_predictor_->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
   }
 
   void PredictContribution(DMatrix* p_fmat,

@@ -440,7 +440,7 @@ class GPUPredictor : public xgboost::Predictor {
                            const gbm::GBTreeModel& model, unsigned ntree_limit,
                            bool approximate, int condition,
                            unsigned condition_feature) override {
-    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit,
+    cpu_predictor_->PredictContribution(p_fmat, out_contribs, model, ntree_limit,
                                        approximate, condition,
                                        condition_feature);
   }

@@ -450,17 +450,17 @@ class GPUPredictor : public xgboost::Predictor {
                                        const gbm::GBTreeModel& model,
                                        unsigned ntree_limit,
                                        bool approximate) override {
-    cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model,
+    cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model,
                                                    ntree_limit, approximate);
   }
 
   void Init(const std::vector<std::pair<std::string, std::string>>& cfg,
             const std::vector<std::shared_ptr<DMatrix>>& cache) override {
     Predictor::Init(cfg, cache);
-    cpu_predictor->Init(cfg, cache);
-    param.InitAllowUnknown(cfg);
+    cpu_predictor_->Init(cfg, cache);
+    param_.InitAllowUnknown(cfg);
 
-    GPUSet devices = GPUSet::All(param.gpu_id, param.n_gpus);
+    GPUSet devices = GPUSet::All(param_.gpu_id, param_.n_gpus);
     ConfigureShards(devices);
   }
 

@@ -470,16 +470,16 @@ class GPUPredictor : public xgboost::Predictor {
     if (devices_ == devices) return;
 
     devices_ = devices;
-    shards.clear();
-    shards.resize(devices_.Size());
-    dh::ExecuteIndexShards(&shards, [=](size_t i, DeviceShard& shard){
+    shards_.clear();
+    shards_.resize(devices_.Size());
+    dh::ExecuteIndexShards(&shards_, [=](size_t i, DeviceShard& shard){
       shard.Init(devices_.DeviceId(i));
     });
   }
 
-  GPUPredictionParam param;
-  std::unique_ptr<Predictor> cpu_predictor;
-  std::vector<DeviceShard> shards;
+  GPUPredictionParam param_;
+  std::unique_ptr<Predictor> cpu_predictor_;
+  std::vector<DeviceShard> shards_;
   GPUSet devices_;
 };
 

@@ -77,7 +77,7 @@ class ColMaker: public TreeUpdater {
     /*! \brief current best solution */
     SplitEntry best;
     // constructor
-    NodeEntry() : root_gain(0.0f), weight(0.0f) {}
+    NodeEntry() : root_gain{0.0f}, weight{0.0f} {}
   };
   // actual builder that runs the algorithm
   class Builder {

@@ -596,7 +596,8 @@ class ColMaker: public TreeUpdater {
       // start enumeration
       const auto num_features = static_cast<bst_omp_uint>(feat_set.size());
 #if defined(_OPENMP)
-      const int batch_size = std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
+      const int batch_size =  // NOLINT
+          std::max(static_cast<int>(num_features / this->nthread_ / 32), 1);
 #endif  // defined(_OPENMP)
       int poption = param_.parallel_option;
       if (poption == 2) {

@@ -102,7 +102,7 @@ struct AddByKey {
  * @param instIds instance index buffer
  * @return the expected gradient value
  */
-HOST_DEV_INLINE GradientPair get(int id,
+HOST_DEV_INLINE GradientPair Get(int id,
                                  common::Span<const GradientPair> vals,
                                  common::Span<const int> instIds) {
   id = instIds[id];

@@ -123,13 +123,13 @@ __global__ void CubScanByKeyL1(
   Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
   int myKey;
   GradientPair myValue;
-  typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
+  using BlockScan = cub::BlockScan<Pair, BLKDIM_L1L3>;
   __shared__ typename BlockScan::TempStorage temp_storage;
   Pair threadData;
   int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
   if (tid < size) {
     myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
-    myValue = get(tid, vals, instIds);
+    myValue = Get(tid, vals, instIds);
   } else {
     myKey = kNoneKey;
     myValue = {};

@@ -164,7 +164,7 @@ __global__ void CubScanByKeyL1(
 template <int BLKSIZE>
 __global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
                                common::Span<int> mKeys, int mLength) {
-  typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+  using BlockScan = cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS>;
   Pair threadData;
   __shared__ typename BlockScan::TempStorage temp_storage;
   for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
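Note: the `typedef` -> `using` rewrites come from modernize-use-using; alias declarations read left to right and, unlike `typedef`, can be templated. Illustration:

    #include <vector>

    typedef std::vector<int> IntVecOld;  // old style
    using IntVec = std::vector<int>;     // alias declaration, same meaning

    template <typename T>
    using Vec = std::vector<T>;          // alias template: not expressible with typedef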
@@ -205,19 +205,19 @@ __global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
   int previousKey =
       tid == 0 ? kNoneKey
                : Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
-  GradientPair myValue = scans[tid];
+  GradientPair my_value = scans[tid];
   __syncthreads();
   if (blockIdx.x > 0 && s_mKeys == previousKey) {
-    myValue += s_mScans[0];
+    my_value += s_mScans[0];
   }
   if (tid == size - 1) {
-    sums[previousKey] = myValue + get(tid, vals, instIds);
+    sums[previousKey] = my_value + Get(tid, vals, instIds);
   }
   if ((previousKey != myKey) && (previousKey >= 0)) {
-    sums[previousKey] = myValue;
-    myValue = GradientPair(0.0f, 0.0f);
+    sums[previousKey] = my_value;
+    my_value = GradientPair(0.0f, 0.0f);
   }
-  scans[tid] = myValue;
+  scans[tid] = my_value;
 }
 
 /**

@@ -271,14 +271,14 @@ struct ExactSplitCandidate {
   /** index where to split in the DMatrix */
   int index;
 
-  HOST_DEV_INLINE ExactSplitCandidate() : score(-FLT_MAX), index(INT_MAX) {}
+  HOST_DEV_INLINE ExactSplitCandidate() : score{-FLT_MAX}, index{INT_MAX} {}
 
   /**
    * @brief Whether the split info is valid to be used to create a new child
   * @param minSplitLoss minimum score above which decision to split is made
   * @return true if splittable, else false
   */
-  HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const {
+  HOST_DEV_INLINE bool IsSplittable(float minSplitLoss) const {
     return ((score >= minSplitLoss) && (index != INT_MAX));
   }
 };

@@ -297,7 +297,7 @@ enum ArgMaxByKeyAlgo {
 /** max depth until which to use shared mem based atomics for argmax */
 static const int kMaxAbkLevels = 3;
 
-HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
+HOST_DEV_INLINE ExactSplitCandidate MaxSplit(ExactSplitCandidate a,
                                              ExactSplitCandidate b) {
   ExactSplitCandidate out;
   if (a.score < b.score) {

@@ -315,13 +315,13 @@ HOST_DEV_INLINE ExactSplitCandidate maxSplit(ExactSplitCandidate a,
 
 DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
                              ExactSplitCandidate val) {
-  unsigned long long* intAddress = (unsigned long long*)address;  // NOLINT
+  unsigned long long* intAddress = reinterpret_cast<unsigned long long*>(address);  // NOLINT
   unsigned long long old = *intAddress;  // NOLINT
-  unsigned long long assumed;  // NOLINT
+  unsigned long long assumed = old;  // NOLINT
   do {
     assumed = old;
     ExactSplitCandidate res =
-        maxSplit(val, *reinterpret_cast<ExactSplitCandidate*>(&assumed));
+        MaxSplit(val, *reinterpret_cast<ExactSplitCandidate*>(&assumed));
     old = atomicCAS(intAddress, assumed, *reinterpret_cast<uint64_t*>(&res));
   } while (assumed != old);
 }
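Note: `AtomicArgMax` is the standard CUDA compare-and-swap loop: reinterpret the 8-byte candidate as an integer, compute the max against the value currently in memory, and retry until no other thread has raced in between. `assumed = old` at declaration only quiets the uninitialized-variable warning, since the loop body reassigns it anyway, and the C-style cast becomes `reinterpret_cast` to satisfy the check clang-tidy names. A hedged, self-contained sketch of the same pattern for a plain float maximum:

    __device__ void AtomicMaxFloat(float* address, float val) {
      // atomicCAS works on integers, so view the float's bits as unsigned int.
      unsigned int* addr_as_uint = reinterpret_cast<unsigned int*>(address);
      unsigned int old = *addr_as_uint;
      unsigned int assumed = old;  // initialized up front, as in the patch
      do {
        assumed = old;
        if (__uint_as_float(assumed) >= val) break;  // current value already larger
        old = atomicCAS(addr_as_uint, assumed, __float_as_uint(val));
      } while (assumed != old);  // retry if another thread updated in between
    }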
@@ -399,7 +399,7 @@ __global__ void AtomicArgMaxByKeySmem(
       nUniqKeys * sizeof(ExactSplitCandidate)));
   int tid = threadIdx.x;
   ExactSplitCandidate defVal;
-#pragma unroll 1
   for (int i = tid; i < nUniqKeys; i += blockDim.x) {
     sNodeSplits[i] = defVal;
   }
@@ -465,7 +465,7 @@ void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
   }
 }
 
-__global__ void assignColIds(int* colIds, const int* colOffsets) {
+__global__ void AssignColIds(int* colIds, const int* colOffsets) {
   int myId = blockIdx.x;
   int start = colOffsets[myId];
   int end = colOffsets[myId + 1];
@@ -474,10 +474,10 @@ __global__ void assignColIds(int* colIds, const int* colOffsets) {
   }
 }
 
-__global__ void fillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
-                                   const DeviceNodeStats* nodes, int nRows) {
+__global__ void FillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
+                                   const DeviceNodeStats* nodes, int n_rows) {
   int id = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (id >= nRows) {
+  if (id >= n_rows) {
     return;
   }
   // if this element belongs to none of the currently active node-id's
@@ -497,7 +497,7 @@ __global__ void fillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
   nodeIdsPerInst[id] = result;
 }
 
-__global__ void assignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
+__global__ void AssignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
                               const NodeIdT* nodeIds, const int* instId,
                               const DeviceNodeStats* nodes,
                               const int* colOffsets, const float* vals,
@@ -526,7 +526,7 @@ __global__ void assignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
   }
 }
 
-__global__ void markLeavesKernel(DeviceNodeStats* nodes, int len) {
+__global__ void MarkLeavesKernel(DeviceNodeStats* nodes, int len) {
   int id = (blockIdx.x * blockDim.x) + threadIdx.x;
   if ((id < len) && !nodes[id].IsUnused()) {
     int lid = (id << 1) + 1;
@@ -541,118 +541,117 @@ __global__ void markLeavesKernel(DeviceNodeStats* nodes, int len) {
 
 class GPUMaker : public TreeUpdater {
  protected:
-  TrainParam param;
+  TrainParam param_;
   /** whether we have initialized memory already (so as not to repeat!) */
-  bool allocated;
+  bool allocated_;
   /** feature values stored in column-major compressed format */
-  dh::DVec2<float> vals;
-  dh::DVec<float> vals_cached;
+  dh::DVec2<float> vals_;
+  dh::DVec<float> vals_cached_;
   /** corresponding instance id's of these featutre values */
-  dh::DVec2<int> instIds;
-  dh::DVec<int> instIds_cached;
+  dh::DVec2<int> instIds_;
+  dh::DVec<int> inst_ids_cached_;
   /** column offsets for these feature values */
-  dh::DVec<int> colOffsets;
-  dh::DVec<GradientPair> gradsInst;
-  dh::DVec2<NodeIdT> nodeAssigns;
-  dh::DVec2<int> nodeLocations;
-  dh::DVec<DeviceNodeStats> nodes;
-  dh::DVec<NodeIdT> nodeAssignsPerInst;
-  dh::DVec<GradientPair> gradSums;
-  dh::DVec<GradientPair> gradScans;
-  dh::DVec<ExactSplitCandidate> nodeSplits;
-  int nVals;
-  int nRows;
-  int nCols;
-  int maxNodes;
-  int maxLeaves;
+  dh::DVec<int> colOffsets_;
+  dh::DVec<GradientPair> gradsInst_;
+  dh::DVec2<NodeIdT> nodeAssigns_;
+  dh::DVec2<int> nodeLocations_;
+  dh::DVec<DeviceNodeStats> nodes_;
+  dh::DVec<NodeIdT> node_assigns_per_inst_;
+  dh::DVec<GradientPair> gradsums_;
+  dh::DVec<GradientPair> gradscans_;
+  dh::DVec<ExactSplitCandidate> nodeSplits_;
+  int n_vals_;
+  int n_rows_;
+  int n_cols_;
+  int maxNodes_;
+  int maxLeaves_;
 
   // devices are only used for resharding the HostDeviceVector passed as a parameter;
   // the algorithm works with a single GPU only
   GPUSet devices_;
 
-  dh::CubMemory tmp_mem;
-  dh::DVec<GradientPair> tmpScanGradBuff;
-  dh::DVec<int> tmpScanKeyBuff;
-  dh::DVec<int> colIds;
-  dh::BulkAllocator<dh::MemoryType::kDevice> ba;
+  dh::CubMemory tmp_mem_;
+  dh::DVec<GradientPair> tmpScanGradBuff_;
+  dh::DVec<int> tmp_scan_key_buff_;
+  dh::DVec<int> colIds_;
+  dh::BulkAllocator<dh::MemoryType::kDevice> ba_;
 
  public:
-  GPUMaker() : allocated(false) {}
-  ~GPUMaker() {}
+  GPUMaker() : allocated_{false} {}
+  ~GPUMaker() override = default;
 
-  void Init(
-      const std::vector<std::pair<std::string, std::string>>& args) {
-    param.InitAllowUnknown(args);
-    maxNodes = (1 << (param.max_depth + 1)) - 1;
-    maxLeaves = 1 << param.max_depth;
+  void Init(const std::vector<std::pair<std::string, std::string>> &args) override {
+    param_.InitAllowUnknown(args);
+    maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
+    maxLeaves_ = 1 << param_.max_depth;
 
-    devices_ = GPUSet::All(param.gpu_id, param.n_gpus);
+    devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
   }
 
   void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
-              const std::vector<RegTree*>& trees) {
+              const std::vector<RegTree*>& trees) override {
     // rescale learning rate according to size of trees
-    float lr = param.learning_rate;
-    param.learning_rate = lr / trees.size();
+    float lr = param_.learning_rate;
+    param_.learning_rate = lr / trees.size();
 
     gpair->Reshard(devices_);
 
     try {
       // build tree
-      for (size_t i = 0; i < trees.size(); ++i) {
-        UpdateTree(gpair, dmat, trees[i]);
+      for (auto tree : trees) {
+        UpdateTree(gpair, dmat, tree);
       }
     } catch (const std::exception& e) {
       LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
     }
-    param.learning_rate = lr;
+    param_.learning_rate = lr;
   }
   /// @note: Update should be only after Init!!
   void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
                   RegTree* hTree) {
-    if (!allocated) {
+    if (!allocated_) {
       SetupOneTimeData(dmat);
     }
-    for (int i = 0; i < param.max_depth; ++i) {
+    for (int i = 0; i < param_.max_depth; ++i) {
       if (i == 0) {
         // make sure to start on a fresh tree with sorted values!
-        vals.CurrentDVec() = vals_cached;
-        instIds.CurrentDVec() = instIds_cached;
-        transferGrads(gpair);
+        vals_.CurrentDVec() = vals_cached_;
+        instIds_.CurrentDVec() = inst_ids_cached_;
+        TransferGrads(gpair);
       }
       int nNodes = 1 << i;
       NodeIdT nodeStart = nNodes - 1;
-      initNodeData(i, nodeStart, nNodes);
-      findSplit(i, nodeStart, nNodes);
+      InitNodeData(i, nodeStart, nNodes);
+      FindSplit(i, nodeStart, nNodes);
     }
     // mark all the used nodes with unused children as leaf nodes
-    markLeaves();
-    Dense2SparseTree(hTree, nodes, param);
+    MarkLeaves();
+    Dense2SparseTree(hTree, nodes_, param_);
   }
 
-  void split2node(int nNodes, NodeIdT nodeStart) {
-    auto d_nodes = nodes.GetSpan();
-    auto d_gradScans = gradScans.GetSpan();
-    auto d_gradSums = gradSums.GetSpan();
-    auto d_nodeAssigns = nodeAssigns.CurrentSpan();
-    auto d_colIds = colIds.GetSpan();
-    auto d_vals = vals.Current();
-    auto d_nodeSplits = nodeSplits.Data();
+  void Split2Node(int nNodes, NodeIdT nodeStart) {
+    auto d_nodes = nodes_.GetSpan();
+    auto d_gradScans = gradscans_.GetSpan();
+    auto d_gradsums = gradsums_.GetSpan();
+    auto d_nodeAssigns = nodeAssigns_.CurrentSpan();
+    auto d_colIds = colIds_.GetSpan();
+    auto d_vals = vals_.Current();
+    auto d_nodeSplits = nodeSplits_.Data();
     int nUniqKeys = nNodes;
-    float min_split_loss = param.min_split_loss;
-    auto gpu_param = GPUTrainingParam(param);
+    float min_split_loss = param_.min_split_loss;
+    auto gpu_param = GPUTrainingParam(param_);
 
-    dh::LaunchN(param.gpu_id, nNodes, [=] __device__(int uid) {
+    dh::LaunchN(param_.gpu_id, nNodes, [=] __device__(int uid) {
       int absNodeId = uid + nodeStart;
       ExactSplitCandidate s = d_nodeSplits[uid];
-      if (s.isSplittable(min_split_loss)) {
+      if (s.IsSplittable(min_split_loss)) {
         int idx = s.index;
         int nodeInstId =
             Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
         bool missingLeft = true;
         const DeviceNodeStats& n = d_nodes[absNodeId];
         GradientPair gradScan = d_gradScans[idx];
-        GradientPair gradSum = d_gradSums[nodeInstId];
+        GradientPair gradSum = d_gradsums[nodeInstId];
         float thresh = d_vals[idx];
         int colId = d_colIds[idx];
         // get the default direction for the current node
@@ -679,54 +678,53 @@ class GPUMaker : public TreeUpdater {
     });
   }
 
-  void findSplit(int level, NodeIdT nodeStart, int nNodes) {
-    ReduceScanByKey(gradSums.GetSpan(), gradScans.GetSpan(), gradsInst.GetSpan(),
-                    instIds.CurrentSpan(), nodeAssigns.CurrentSpan(), nVals, nNodes,
-                    nCols, tmpScanGradBuff.GetSpan(), tmpScanKeyBuff.GetSpan(),
-                    colIds.GetSpan(), nodeStart);
-    ArgMaxByKey(nodeSplits.GetSpan(), gradScans.GetSpan(), gradSums.GetSpan(),
-                vals.CurrentSpan(), colIds.GetSpan(), nodeAssigns.CurrentSpan(),
-                nodes.GetSpan(), nNodes, nodeStart, nVals, param,
+  void FindSplit(int level, NodeIdT nodeStart, int nNodes) {
+    ReduceScanByKey(gradsums_.GetSpan(), gradscans_.GetSpan(), gradsInst_.GetSpan(),
+                    instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
+                    n_cols_, tmpScanGradBuff_.GetSpan(), tmp_scan_key_buff_.GetSpan(),
+                    colIds_.GetSpan(), nodeStart);
+    ArgMaxByKey(nodeSplits_.GetSpan(), gradscans_.GetSpan(), gradsums_.GetSpan(),
+                vals_.CurrentSpan(), colIds_.GetSpan(), nodeAssigns_.CurrentSpan(),
+                nodes_.GetSpan(), nNodes, nodeStart, n_vals_, param_,
                 level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
-    split2node(nNodes, nodeStart);
+    Split2Node(nNodes, nodeStart);
   }
 
-  void allocateAllData(int offsetSize) {
-    int tmpBuffSize = ScanTempBufferSize(nVals);
-    ba.Allocate(param.gpu_id, &vals, nVals,
-                &vals_cached, nVals, &instIds, nVals, &instIds_cached, nVals,
-                &colOffsets, offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
-                &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
-                nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
-                &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
-                &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
+  void AllocateAllData(int offsetSize) {
+    int tmpBuffSize = ScanTempBufferSize(n_vals_);
+    ba_.Allocate(param_.gpu_id, &vals_, n_vals_,
+                 &vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_,
+                 &colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_,
+                 &nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_,
+                 n_rows_, &gradsums_, maxLeaves_ * n_cols_, &gradscans_, n_vals_,
+                 &nodeSplits_, maxLeaves_, &tmpScanGradBuff_, tmpBuffSize,
+                 &tmp_scan_key_buff_, tmpBuffSize, &colIds_, n_vals_);
   }
 
   void SetupOneTimeData(DMatrix* dmat) {
-    size_t free_memory = dh::AvailableMemory(param.gpu_id);
     if (!dmat->SingleColBlock()) {
       LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
     }
     std::vector<float> fval;
     std::vector<int> fId;
     std::vector<size_t> offset;
-    convertToCsc(dmat, &fval, &fId, &offset);
-    allocateAllData(static_cast<int>(offset.size()));
-    transferAndSortData(fval, fId, offset);
-    allocated = true;
+    ConvertToCsc(dmat, &fval, &fId, &offset);
+    AllocateAllData(static_cast<int>(offset.size()));
+    TransferAndSortData(fval, fId, offset);
+    allocated_ = true;
   }
 
-  void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
+  void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval,
                     std::vector<int>* fId, std::vector<size_t>* offset) {
     const MetaInfo& info = dmat->Info();
     CHECK(info.num_col_ < std::numeric_limits<int>::max());
     CHECK(info.num_row_ < std::numeric_limits<int>::max());
-    nRows = static_cast<int>(info.num_row_);
-    nCols = static_cast<int>(info.num_col_);
-    offset->reserve(nCols + 1);
+    n_rows_ = static_cast<int>(info.num_row_);
+    n_cols_ = static_cast<int>(info.num_col_);
+    offset->reserve(n_cols_ + 1);
     offset->push_back(0);
-    fval->reserve(nCols * nRows);
-    fId->reserve(nCols * nRows);
+    fval->reserve(n_cols_ * n_rows_);
+    fId->reserve(n_cols_ * n_rows_);
     // in case you end up with a DMatrix having no column access
     // then make sure to enable that before copying the data!
     for (const auto& batch : dmat->GetSortedColumnBatches()) {
@@ -741,59 +739,59 @@ class GPUMaker : public TreeUpdater {
       }
     }
     CHECK(fval->size() < std::numeric_limits<int>::max());
-    nVals = static_cast<int>(fval->size());
+    n_vals_ = static_cast<int>(fval->size());
   }
 
-  void transferAndSortData(const std::vector<float>& fval,
+  void TransferAndSortData(const std::vector<float>& fval,
                            const std::vector<int>& fId,
                            const std::vector<size_t>& offset) {
-    vals.CurrentDVec() = fval;
-    instIds.CurrentDVec() = fId;
-    colOffsets = offset;
-    dh::SegmentedSort<float, int>(&tmp_mem, &vals, &instIds, nVals, nCols,
-                                  colOffsets);
-    vals_cached = vals.CurrentDVec();
-    instIds_cached = instIds.CurrentDVec();
-    assignColIds<<<nCols, 512>>>(colIds.Data(), colOffsets.Data());
+    vals_.CurrentDVec() = fval;
+    instIds_.CurrentDVec() = fId;
+    colOffsets_ = offset;
+    dh::SegmentedSort<float, int>(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_,
+                                  colOffsets_);
+    vals_cached_ = vals_.CurrentDVec();
+    inst_ids_cached_ = instIds_.CurrentDVec();
+    AssignColIds<<<n_cols_, 512>>>(colIds_.Data(), colOffsets_.Data());
   }
 
-  void transferGrads(HostDeviceVector<GradientPair>* gpair) {
-    gpair->GatherTo(gradsInst.tbegin(), gradsInst.tend());
+  void TransferGrads(HostDeviceVector<GradientPair>* gpair) {
+    gpair->GatherTo(gradsInst_.tbegin(), gradsInst_.tend());
     // evaluate the full-grad reduction for the root node
-    dh::SumReduction<GradientPair>(tmp_mem, gradsInst, gradSums, nRows);
+    dh::SumReduction<GradientPair>(tmp_mem_, gradsInst_, gradsums_, n_rows_);
   }
 
-  void initNodeData(int level, NodeIdT nodeStart, int nNodes) {
+  void InitNodeData(int level, NodeIdT nodeStart, int nNodes) {
     // all instances belong to root node at the beginning!
     if (level == 0) {
-      nodes.Fill(DeviceNodeStats());
-      nodeAssigns.CurrentDVec().Fill(0);
-      nodeAssignsPerInst.Fill(0);
+      nodes_.Fill(DeviceNodeStats());
+      nodeAssigns_.CurrentDVec().Fill(0);
+      node_assigns_per_inst_.Fill(0);
       // for root node, just update the gradient/score/weight/id info
      // before splitting it! Currently all data is on GPU, hence this
       // stupid little kernel
-      auto d_nodes = nodes.Data();
-      auto d_sums = gradSums.Data();
-      auto gpu_params = GPUTrainingParam(param);
-      dh::LaunchN(param.gpu_id, 1, [=] __device__(int idx) {
+      auto d_nodes = nodes_.Data();
+      auto d_sums = gradsums_.Data();
+      auto gpu_params = GPUTrainingParam(param_);
+      dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
        d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
      });
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
-      int nBlks = dh::DivRoundUp(nRows, BlkDim);
-      fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.Data(),
-                                            nodes.Data(), nRows);
+      int nBlks = dh::DivRoundUp(n_rows_, BlkDim);
+      FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.Data(),
+                                            nodes_.Data(), n_rows_);
      // evaluate the correct child indices of non-missing values next
-      nBlks = dh::DivRoundUp(nVals, BlkDim * ItemsPerThread);
-      assignNodeIds<<<nBlks, BlkDim>>>(
-          nodeAssignsPerInst.Data(), nodeLocations.Current(),
-          nodeAssigns.Current(), instIds.Current(), nodes.Data(),
-          colOffsets.Data(), vals.Current(), nVals, nCols);
+      nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
+      AssignNodeIds<<<nBlks, BlkDim>>>(
+          node_assigns_per_inst_.Data(), nodeLocations_.Current(),
+          nodeAssigns_.Current(), instIds_.Current(), nodes_.Data(),
+          colOffsets_.Data(), vals_.Current(), n_vals_, n_cols_);
      // gather the node assignments across all other columns too
-      dh::Gather(param.gpu_id, nodeAssigns.Current(),
-                 nodeAssignsPerInst.Data(), instIds.Current(), nVals);
+      dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
+                 node_assigns_per_inst_.Data(), instIds_.Current(), n_vals_);
      SortKeys(level);
    }
  }
@@ -801,19 +799,19 @@ class GPUMaker : public TreeUpdater {
   void SortKeys(int level) {
     // segmented-sort the arrays based on node-id's
     // but we don't need more than level+1 bits for sorting!
-    SegmentedSort(&tmp_mem, &nodeAssigns, &nodeLocations, nVals, nCols,
-                  colOffsets, 0, level + 1);
-    dh::Gather<float, int>(param.gpu_id, vals.other(),
-                           vals.Current(), instIds.other(), instIds.Current(),
-                           nodeLocations.Current(), nVals);
-    vals.buff().selector ^= 1;
-    instIds.buff().selector ^= 1;
+    SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
+                  colOffsets_, 0, level + 1);
+    dh::Gather<float, int>(param_.gpu_id, vals_.other(),
+                           vals_.Current(), instIds_.other(), instIds_.Current(),
+                           nodeLocations_.Current(), n_vals_);
+    vals_.buff().selector ^= 1;
+    instIds_.buff().selector ^= 1;
   }
 
-  void markLeaves() {
+  void MarkLeaves() {
     const int BlkDim = 128;
-    int nBlks = dh::DivRoundUp(maxNodes, BlkDim);
-    markLeavesKernel<<<nBlks, BlkDim>>>(nodes.Data(), maxNodes);
+    int nBlks = dh::DivRoundUp(maxNodes_, BlkDim);
+    MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.Data(), maxNodes_);
   }
 };
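Most of the hunk above is mechanical renaming: data members gain a trailing underscore (param_, allocated_, n_vals_) and methods move to CamelCase (FindSplit, MarkLeaves), matching the Google-style conventions used elsewhere in the code base. clang-tidy can enforce such a scheme with readability-identifier-naming; a sketch of a configuration that would do so (illustrative only, not necessarily what this repository ships in its .clang-tidy):

    Checks: 'readability-identifier-naming'
    CheckOptions:
      - key:   readability-identifier-naming.PrivateMemberSuffix
        value: '_'
      - key:   readability-identifier-naming.FunctionCase
        value: CamelCase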
@@ -14,7 +14,7 @@
 
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
 
-#else
+#else  // In device code and CUDA < 600
 XGBOOST_DEVICE __forceinline__ double atomicAdd(double* address, double val) {
   unsigned long long int* address_as_ull =
       (unsigned long long int*)address;  // NOLINT
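This is the software fallback for double-precision atomicAdd on devices with compute capability below 6.0, which lack the native instruction. The complete emulation, essentially as published in the CUDA C programming guide (only its first lines are visible in the hunk above):

    __device__ double AtomicAddEmulated(double* address, double val) {
      unsigned long long int* address_as_ull =
          reinterpret_cast<unsigned long long int*>(address);
      unsigned long long int old = *address_as_ull, assumed;
      do {
        assumed = old;
        // Reinterpret the bits, add in double precision, try to publish.
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        // Retry if another thread intervened; the comparison is on the
        // integer bit pattern, so NaN payloads cannot cause a livelock.
      } while (assumed != old);
      return __longlong_as_double(old);
    }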
@@ -108,7 +108,7 @@ __device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_h
 }
 
 /*! \brief Find the thread with best gain. */
-template <int BLOCK_THREADS, typename ReduceT, typename scan_t,
+template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
           typename MaxReduceT, typename TempStorageT, typename GradientSumT>
 __device__ void EvaluateFeature(
     int fidx,
@@ -142,7 +142,7 @@ __device__ void EvaluateFeature(
     // Gradient value for current bin.
     GradientSumT bin =
         thread_active ? node_histogram[scan_begin + threadIdx.x] : GradientSumT();
-    scan_t(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
+    ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
 
     // Whether the gradient of missing values is put to the left side.
     bool missing_left = true;
@@ -198,12 +198,12 @@ __global__ void EvaluateSplitKernel(
     ValueConstraint value_constraint,
     common::Span<int> d_monotonic_constraints) {
   // KeyValuePair here used as threadIdx.x -> gain_value
-  typedef cub::KeyValuePair<int, float> ArgMaxT;
-  typedef cub::BlockScan<
-      GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
-  typedef cub::BlockReduce<ArgMaxT, BLOCK_THREADS> MaxReduceT;
+  using ArgMaxT = cub::KeyValuePair<int, float>;
+  using BlockScanT =
+      cub::BlockScan<GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;
+  using MaxReduceT = cub::BlockReduce<ArgMaxT, BLOCK_THREADS>;
 
-  typedef cub::BlockReduce<GradientSumT, BLOCK_THREADS> SumReduceT;
+  using SumReduceT = cub::BlockReduce<GradientSumT, BLOCK_THREADS>;
 
   union TempStorage {
     typename BlockScanT::TempStorage scan;
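The typedef rewrites above come from clang-tidy's modernize-use-using check. Both spellings declare the same alias, but the using form reads left to right and, unlike typedef, extends to alias templates. A small self-contained comparison:

    #include <utility>
    #include <vector>

    // Equivalent plain aliases.
    typedef std::pair<int, float> ArgMaxPairOld;
    using ArgMaxPair = std::pair<int, float>;

    // An alias template has no typedef spelling at all.
    template <typename T>
    using PairVec = std::vector<std::pair<int, T>>;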
@@ -274,51 +274,56 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, GidxIterT data,
  * \date 28/07/2018
  */
 template <typename GradientSumT>
-struct DeviceHistogram {
+class DeviceHistogram {
+ private:
   /*! \brief Map nidx to starting index of its histogram. */
-  std::map<int, size_t> nidx_map;
-  thrust::device_vector<typename GradientSumT::ValueT> data;
-  const size_t kStopGrowingSize = 1 << 26;  // Do not grow beyond this size
-  int n_bins;
+  std::map<int, size_t> nidx_map_;
+  thrust::device_vector<typename GradientSumT::ValueT> data_;
+  static constexpr size_t kStopGrowingSize = 1 << 26;  // Do not grow beyond this size
+  int n_bins_;
   int device_id_;
 
+ public:
   void Init(int device_id, int n_bins) {
-    this->n_bins = n_bins;
+    this->n_bins_ = n_bins;
     this->device_id_ = device_id;
   }
 
   void Reset() {
     dh::safe_cuda(cudaSetDevice(device_id_));
     dh::safe_cuda(cudaMemsetAsync(
-        data.data().get(), 0,
-        data.size() * sizeof(typename decltype(data)::value_type)));
-    nidx_map.clear();
+        data_.data().get(), 0,
+        data_.size() * sizeof(typename decltype(data_)::value_type)));
+    nidx_map_.clear();
+  }
+  bool HistogramExists(int nidx) {
+    return nidx_map_.find(nidx) != nidx_map_.end();
   }
 
-  bool HistogramExists(int nidx) {
-    return nidx_map.find(nidx) != nidx_map.end();
+  thrust::device_vector<typename GradientSumT::ValueT> &Data() {
+    return data_;
   }
 
   void AllocateHistogram(int nidx) {
     if (HistogramExists(nidx)) return;
     size_t current_size =
-        nidx_map.size() * n_bins * 2;  // Number of items currently used in data
+        nidx_map_.size() * n_bins_ * 2;  // Number of items currently used in data
     dh::safe_cuda(cudaSetDevice(device_id_));
-    if (data.size() >= kStopGrowingSize) {
+    if (data_.size() >= kStopGrowingSize) {
       // Recycle histogram memory
-      std::pair<int, size_t> old_entry = *nidx_map.begin();
-      nidx_map.erase(old_entry.first);
-      dh::safe_cuda(cudaMemsetAsync(data.data().get() + old_entry.second, 0,
-                                    n_bins * sizeof(GradientSumT)));
-      nidx_map[nidx] = old_entry.second;
+      std::pair<int, size_t> old_entry = *nidx_map_.begin();
+      nidx_map_.erase(old_entry.first);
+      dh::safe_cuda(cudaMemsetAsync(data_.data().get() + old_entry.second, 0,
+                                    n_bins_ * sizeof(GradientSumT)));
+      nidx_map_[nidx] = old_entry.second;
     } else {
       // Append new node histogram
-      nidx_map[nidx] = current_size;
-      if (data.size() < current_size + n_bins * 2) {
+      nidx_map_[nidx] = current_size;
+      if (data_.size() < current_size + n_bins_ * 2) {
         size_t new_size = current_size * 2;  // Double in size
-        new_size = std::max(static_cast<size_t>(n_bins * 2),
+        new_size = std::max(static_cast<size_t>(n_bins_ * 2),
                             new_size);  // Have at least one histogram
-        data.resize(new_size);
+        data_.resize(new_size);
       }
     }
   }
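AllocateHistogram above caps the backing buffer: below kStopGrowingSize it appends and doubles like a vector, past the cap it evicts the first node tracked in nidx_map_, zeroes that slot, and hands it to the new node. A host-side sketch of the same policy, under assumed simplifications (fixed bins per node, double-valued bins, no CUDA):

    #include <algorithm>
    #include <cstddef>
    #include <map>
    #include <vector>

    class HistogramPool {
      std::map<int, std::size_t> nidx_map_;   // node id -> slot offset
      std::vector<double> data_;
      static constexpr std::size_t kBinsPerNode = 256;
      static constexpr std::size_t kCap = 1 << 20;

     public:
      double* Allocate(int nidx) {
        auto it = nidx_map_.find(nidx);
        if (it != nidx_map_.end()) return data_.data() + it->second;
        if (data_.size() >= kCap) {
          // Recycle: steal the slot of the first tracked node.
          auto old_entry = *nidx_map_.begin();
          nidx_map_.erase(old_entry.first);
          std::fill_n(data_.begin() + old_entry.second, kBinsPerNode, 0.0);
          nidx_map_[nidx] = old_entry.second;
        } else {
          // Append a fresh zero-initialized slot at the end.
          nidx_map_[nidx] = data_.size();
          data_.resize(data_.size() + kBinsPerNode, 0.0);
        }
        return data_.data() + nidx_map_[nidx];
      }
    };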
@@ -330,9 +335,9 @@ struct DeviceHistogram {
    */
   common::Span<GradientSumT> GetNodeHistogram(int nidx) {
     CHECK(this->HistogramExists(nidx));
-    auto ptr = data.data().get() + nidx_map[nidx];
+    auto ptr = data_.data().get() + nidx_map_[nidx];
     return common::Span<GradientSumT>(
-        reinterpret_cast<GradientSumT*>(ptr), n_bins);
+        reinterpret_cast<GradientSumT*>(ptr), n_bins_);
   }
 };
 
@@ -351,7 +356,7 @@ struct CalcWeightTrainParam {
 };
 
 // Bin each input data entry, store the bin indices in compressed form.
-__global__ void compress_bin_ellpack_k(
+__global__ void CompressBinEllpackKernel(
     common::CompressedBufferWriter wr,
     common::CompressedByteT* __restrict__ buffer,  // gidx_buffer
     const size_t* __restrict__ row_ptrs,  // row offset of input data
@@ -366,8 +371,9 @@ __global__ void compress_bin_ellpack_k(
     unsigned int null_gidx_value) {
   size_t irow = threadIdx.x + blockIdx.x * blockDim.x;
   int ifeature = threadIdx.y + blockIdx.y * blockDim.y;
-  if (irow >= n_rows || ifeature >= row_stride)
+  if (irow >= n_rows || ifeature >= row_stride) {
     return;
+  }
   int row_length = static_cast<int>(row_ptrs[irow + 1] - row_ptrs[irow]);
   unsigned int bin = null_gidx_value;
   if (ifeature < row_length) {
@@ -380,8 +386,9 @@ __global__ void compress_bin_ellpack_k(
     // Assigning the bin in current entry.
     // S.t.: fvalue < feature_cuts[bin]
     bin = dh::UpperBound(feature_cuts, ncuts, fvalue);
-    if (bin >= ncuts)
+    if (bin >= ncuts) {
       bin = ncuts - 1;
+    }
     // Add the number of bins in previous features.
     bin += cut_rows[feature];
   }
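The binning rule in the kernel body: a feature value lands in the first cut strictly greater than it, clamped to the last bin, and the result is then offset by the number of bins of preceding features. The same rule expressed with the standard library (dh::UpperBound is the device-side equivalent; cuts are assumed non-empty and sorted):

    #include <algorithm>
    #include <vector>

    int BinIndex(const std::vector<float>& cuts, float fvalue) {
      // First cut greater than fvalue, i.e. fvalue < cuts[bin].
      auto it = std::upper_bound(cuts.begin(), cuts.end(), fvalue);
      int bin = static_cast<int>(it - cuts.begin());
      if (bin >= static_cast<int>(cuts.size())) {
        bin = static_cast<int>(cuts.size()) - 1;  // clamp to the last bin
      }
      return bin;
    }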
@@ -419,7 +426,7 @@ struct Segment {
   size_t begin;
   size_t end;
 
-  Segment() : begin(0), end(0) {}
+  Segment() : begin{0}, end{0} {}
 
   Segment(size_t begin, size_t end) : begin(begin), end(end) {
     CHECK_GE(end, begin);
@@ -487,7 +494,9 @@ struct GPUHistBuilderBase {
 // Manage memory for a single GPU
 template <typename GradientSumT>
 struct DeviceShard {
-  int device_id_;
+  int n_bins;
+  int device_id;
 
   dh::BulkAllocator<dh::MemoryType::kDevice> ba;
 
   /*! \brief HistCutMatrix stored in device. */
@@ -498,14 +507,12 @@ struct DeviceShard {
     dh::DVec<bst_float> min_fvalue;
     /*! \brief Cut. */
     dh::DVec<bst_float> gidx_fvalue_map;
-  } cut_;
+  } d_cut;
 
   /*! \brief Range of rows for each node. */
   std::vector<Segment> ridx_segments;
   DeviceHistogram<GradientSumT> hist;
 
-  /*! \brief global index of histogram, which is stored in ELLPack format. */
-  dh::DVec<common::CompressedByteT> gidx_buffer;
   /*! \brief row length for ELLPack. */
   size_t row_stride;
   common::CompressedIterator<uint32_t> gidx;
@@ -526,6 +533,8 @@ struct DeviceShard {
   /*! \brief Sum gradient for each node. */
   std::vector<GradientPair> node_sum_gradients;
   dh::DVec<GradientPair> node_sum_gradients_d;
+  /*! \brief global index of histogram, which is stored in ELLPack format. */
+  dh::DVec<common::CompressedByteT> gidx_buffer;
   /*! \brief row offset in SparsePage (the input data). */
   thrust::device_vector<size_t> row_ptrs;
   /*! \brief On-device feature set, only actually used on one of the devices */
@@ -534,7 +543,6 @@ struct DeviceShard {
   bst_uint row_begin_idx;
   bst_uint row_end_idx;
   bst_uint n_rows;
-  int n_bins;
 
   TrainParam param;
   bool prediction_cache_initialised;
@@ -544,21 +552,21 @@ struct DeviceShard {
   std::unique_ptr<GPUHistBuilderBase<GradientSumT>> hist_builder;
 
   // TODO(canonizer): do add support multi-batch DMatrix here
-  DeviceShard(int device_id, bst_uint row_begin, bst_uint row_end,
+  DeviceShard(int _device_id, bst_uint row_begin, bst_uint row_end,
               TrainParam _param)
-      : device_id_(device_id),
+      : device_id(_device_id),
         row_begin_idx(row_begin),
         row_end_idx(row_end),
         row_stride(0),
         n_rows(row_end - row_begin),
-        n_bins(0),
+        n_bins{0},
         null_gidx_value(0),
-        param(_param),
+        param(std::move(_param)),
         prediction_cache_initialised(false) {}
 
   /* Init row_ptrs and row_stride */
   void InitRowPtrs(const SparsePage& row_batch) {
-    dh::safe_cuda(cudaSetDevice(device_id_));
+    dh::safe_cuda(cudaSetDevice(device_id));
     const auto& offset_vec = row_batch.offset.HostVector();
     row_ptrs.resize(n_rows + 1);
     thrust::copy(offset_vec.data() + row_begin_idx,
@@ -589,12 +597,11 @@ struct DeviceShard {
 
   void CreateHistIndices(const SparsePage& row_batch);
 
-  ~DeviceShard() {
-  }
+  ~DeviceShard() = default;
 
   // Reset values for each update iteration
   void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
-    dh::safe_cuda(cudaSetDevice(device_id_));
+    dh::safe_cuda(cudaSetDevice(device_id));
     position.CurrentDVec().Fill(0);
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
               GradientPair());
@@ -603,8 +610,8 @@ struct DeviceShard {
 
     std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
     ridx_segments.front() = Segment(0, ridx.Size());
-    this->gpair.copy(dh_gpair->tcbegin(device_id_),
-                     dh_gpair->tcend(device_id_));
+    this->gpair.copy(dh_gpair->tcbegin(device_id),
+                     dh_gpair->tcend(device_id));
     SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
     hist.Reset();
   }
@@ -612,7 +619,7 @@ struct DeviceShard {
   DeviceSplitCandidate EvaluateSplit(int nidx,
                                      const std::vector<int>& feature_set,
                                      ValueConstraint value_constraint) {
-    dh::safe_cuda(cudaSetDevice(device_id_));
+    dh::safe_cuda(cudaSetDevice(device_id));
     auto d_split_candidates = temp_memory.GetSpan<DeviceSplitCandidate>(feature_set.size());
     feature_set_d.resize(feature_set.size());
     auto d_features = common::Span<int>(feature_set_d.data().get(),
@@ -622,14 +629,13 @@ struct DeviceShard {
     DeviceNodeStats node(node_sum_gradients[nidx], nidx, param);
 
     // One block for each feature
-    int constexpr BLOCK_THREADS = 256;
-    EvaluateSplitKernel<BLOCK_THREADS, GradientSumT>
-        <<<uint32_t(feature_set.size()), BLOCK_THREADS, 0>>>(
-            hist.GetNodeHistogram(nidx), d_features, node,
-            cut_.feature_segments.GetSpan(), cut_.min_fvalue.GetSpan(),
-            cut_.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param),
-            d_split_candidates, value_constraint,
-            monotone_constraints.GetSpan());
+    int constexpr kBlockThreads = 256;
+    EvaluateSplitKernel<kBlockThreads, GradientSumT>
+        <<<uint32_t(feature_set.size()), kBlockThreads, 0>>>
+        (hist.GetNodeHistogram(nidx), d_features, node,
+         d_cut.feature_segments.GetSpan(), d_cut.min_fvalue.GetSpan(),
+         d_cut.gidx_fvalue_map.GetSpan(), GPUTrainingParam(param),
+         d_split_candidates, value_constraint, monotone_constraints.GetSpan());
 
     std::vector<DeviceSplitCandidate> split_candidates(feature_set.size());
     dh::safe_cuda(cudaMemcpy(split_candidates.data(), d_split_candidates.data(),
@@ -655,7 +661,7 @@ struct DeviceShard {
     auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram);
     auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction);
 
-    dh::LaunchN(device_id_, hist.n_bins, [=] __device__(size_t idx) {
+    dh::LaunchN(device_id, n_bins, [=] __device__(size_t idx) {
       d_node_hist_subtraction[idx] =
           d_node_hist_parent[idx] - d_node_hist_histogram[idx];
     });
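The kernel launched above implements the subtraction trick: every row of the parent node goes to exactly one of its two children, so once one child's histogram has been built from data, the sibling's follows bin-by-bin as parent minus built child. A host-side statement of the identity, with GradPair standing in for the device-side GradientSumT:

    #include <cstddef>
    #include <vector>

    struct GradPair { double grad = 0.0; double hess = 0.0; };

    // sibling[i] = parent[i] - built_child[i] for every histogram bin.
    void SubtractionTrick(const std::vector<GradPair>& parent,
                          const std::vector<GradPair>& built_child,
                          std::vector<GradPair>* sibling) {
      sibling->resize(parent.size());
      for (std::size_t i = 0; i < parent.size(); ++i) {
        (*sibling)[i].grad = parent[i].grad - built_child[i].grad;
        (*sibling)[i].hess = parent[i].hess - built_child[i].hess;
      }
    }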
@@ -673,7 +679,7 @@ struct DeviceShard {
                       int64_t split_gidx, bool default_dir_left, bool is_dense,
                       int fidx_begin,  // cut.row_ptr[fidx]
                       int fidx_end) {  // cut.row_ptr[fidx + 1]
-    dh::safe_cuda(cudaSetDevice(device_id_));
+    dh::safe_cuda(cudaSetDevice(device_id));
     Segment segment = ridx_segments[nidx];
     bst_uint* d_ridx = ridx.Current();
     int* d_position = position.Current();
@@ -681,7 +687,7 @@ struct DeviceShard {
     size_t row_stride = this->row_stride;
     // Launch 1 thread for each row
     dh::LaunchN<1, 128>(
-        device_id_, segment.Size(), [=] __device__(bst_uint idx) {
+        device_id, segment.Size(), [=] __device__(bst_uint idx) {
           idx += segment.begin;
           bst_uint ridx = d_ridx[idx];
           auto row_begin = row_stride * ridx;
@@ -737,14 +743,14 @@ struct DeviceShard {
     const auto d_position_other = position.other() + segment.begin;
     const auto d_ridx_current = ridx.Current() + segment.begin;
     const auto d_ridx_other = ridx.other() + segment.begin;
-    dh::LaunchN(device_id_, segment.Size(), [=] __device__(size_t idx) {
+    dh::LaunchN(device_id, segment.Size(), [=] __device__(size_t idx) {
       d_position_current[idx] = d_position_other[idx];
       d_ridx_current[idx] = d_ridx_other[idx];
     });
   }
 
   void UpdatePredictionCache(bst_float* out_preds_d) {
-    dh::safe_cuda(cudaSetDevice(device_id_));
+    dh::safe_cuda(cudaSetDevice(device_id));
     if (!prediction_cache_initialised) {
       dh::safe_cuda(cudaMemcpyAsync(
           prediction_cache.Data(), out_preds_d,
@@ -764,7 +770,7 @@ struct DeviceShard {
     auto d_prediction_cache = prediction_cache.Data();
 
     dh::LaunchN(
-        device_id_, prediction_cache.Size(), [=] __device__(int local_idx) {
+        device_id, prediction_cache.Size(), [=] __device__(int local_idx) {
           int pos = d_position[local_idx];
           bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
           d_prediction_cache[d_ridx[local_idx]] +=
@@ -799,7 +805,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     if (grid_size <= 0) {
       return;
     }
-    dh::safe_cuda(cudaSetDevice(shard->device_id_));
+    dh::safe_cuda(cudaSetDevice(shard->device_id));
     SharedMemHistKernel<<<grid_size, block_threads, smem_size>>>
         (shard->row_stride, d_ridx, d_gidx, null_gidx_value, d_node_hist.data(), d_gpair,
          segment_begin, n_elements);
@@ -819,7 +825,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
     size_t const row_stride = shard->row_stride;
     int const null_gidx_value = shard->null_gidx_value;
 
-    dh::LaunchN(shard->device_id_, n_elements, [=] __device__(size_t idx) {
+    dh::LaunchN(shard->device_id, n_elements, [=] __device__(size_t idx) {
       int ridx = d_ridx[(idx / row_stride) + segment.begin];
       // lookup the index (bin) of histogram.
      int gidx = d_gidx[ridx * row_stride + idx % row_stride];
@@ -834,31 +840,31 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
 template <typename GradientSumT>
 inline void DeviceShard<GradientSumT>::InitCompressedData(
     const common::HistCutMatrix& hmat, const SparsePage& row_batch) {
-  n_bins = hmat.row_ptr.back();
-  null_gidx_value = hmat.row_ptr.back();
+  n_bins = hmat.NumBins();
+  null_gidx_value = hmat.NumBins();
 
   int max_nodes =
       param.max_leaves > 0 ? param.max_leaves * 2 : MaxNodesDepth(param.max_depth);
 
-  ba.Allocate(device_id_,
+  ba.Allocate(device_id,
               &gpair, n_rows,
               &ridx, n_rows,
               &position, n_rows,
              &prediction_cache, n_rows,
              &node_sum_gradients_d, max_nodes,
-              &cut_.feature_segments, hmat.row_ptr.size(),
-              &cut_.gidx_fvalue_map, hmat.cut.size(),
-              &cut_.min_fvalue, hmat.min_val.size(),
+              &d_cut.feature_segments, hmat.row_ptr.size(),
+              &d_cut.gidx_fvalue_map, hmat.cut.size(),
+              &d_cut.min_fvalue, hmat.min_val.size(),
              &monotone_constraints, param.monotone_constraints.size());
-  cut_.gidx_fvalue_map = hmat.cut;
-  cut_.min_fvalue = hmat.min_val;
-  cut_.feature_segments = hmat.row_ptr;
+  d_cut.gidx_fvalue_map = hmat.cut;
+  d_cut.min_fvalue = hmat.min_val;
+  d_cut.feature_segments = hmat.row_ptr;
   monotone_constraints = param.monotone_constraints;
 
   node_sum_gradients.resize(max_nodes);
   ridx_segments.resize(max_nodes);
 
-  dh::safe_cuda(cudaSetDevice(device_id_));
+  dh::safe_cuda(cudaSetDevice(device_id));
 
   // allocate compressed bin data
   int num_symbols = n_bins + 1;
@@ -870,7 +876,7 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
   CHECK(!(param.max_leaves == 0 && param.max_depth == 0))
       << "Max leaves and max depth cannot both be unconstrained for "
          "gpu_hist.";
-  ba.Allocate(device_id_, &gidx_buffer, compressed_size_bytes);
+  ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
   gidx_buffer.Fill(0);
 
   int nbits = common::detail::SymbolBits(num_symbols);
@@ -882,7 +888,7 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
   // check if we can use shared memory for building histograms
   // (assuming atleast we need 2 CTAs per SM to maintain decent latency hiding)
   auto histogram_size = sizeof(GradientSumT) * null_gidx_value;
-  auto max_smem = dh::MaxSharedMemory(device_id_);
+  auto max_smem = dh::MaxSharedMemory(device_id);
   if (histogram_size <= max_smem) {
     hist_builder.reset(new SharedMemHistBuilder<GradientSumT>);
   } else {
@@ -890,7 +896,7 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
   }
 
   // Init histogram
-  hist.Init(device_id_, hmat.row_ptr.back());
+  hist.Init(device_id, hmat.NumBins());
 }
 
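On the sizing of the compressed ELLPACK buffer above: with n_bins + 1 symbols (the extra symbol is the null marker for missing entries), each entry needs ceil(log2(num_symbols)) bits, and there are n_rows * row_stride entries. A back-of-the-envelope version of that arithmetic; the real computation lives in common::CompressedBufferWriter and common::detail::SymbolBits and may round differently for alignment:

    #include <cstddef>

    // Smallest bit width that can represent num_symbols distinct values.
    std::size_t SymbolBits(std::size_t num_symbols) {
      std::size_t bits = 1;
      while ((std::size_t{1} << bits) < num_symbols) ++bits;
      return bits;
    }

    std::size_t ApproxCompressedBytes(std::size_t n_rows, std::size_t row_stride,
                                      std::size_t num_symbols) {
      std::size_t total_bits = n_rows * row_stride * SymbolBits(num_symbols);
      return (total_bits + 7) / 8;  // round up to whole bytes
    }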
@@ -900,7 +906,7 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
   // bin and compress entries in batches of rows
   size_t gpu_batch_nrows =
       std::min
-      (dh::TotalMemory(device_id_) / (16 * row_stride * sizeof(Entry)),
+      (dh::TotalMemory(device_id) / (16 * row_stride * sizeof(Entry)),
       static_cast<size_t>(n_rows));
   const std::vector<Entry>& data_vec = row_batch.data.HostVector();
 
@@ -924,12 +930,12 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(const SparsePage& row_b
     const dim3 block3(32, 8, 1);  // 256 threads
     const dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
                      dh::DivRoundUp(row_stride, block3.y), 1);
-    compress_bin_ellpack_k<<<grid3, block3>>>
+    CompressBinEllpackKernel<<<grid3, block3>>>
        (common::CompressedBufferWriter(num_symbols),
         gidx_buffer.Data(),
         row_ptrs.data().get() + batch_row_begin,
         entries_d.data().get(),
-         cut_.gidx_fvalue_map.Data(), cut_.feature_segments.Data(),
+         d_cut.gidx_fvalue_map.Data(), d_cut.feature_segments.Data(),
         batch_row_begin, batch_nrows,
         row_ptrs[batch_row_begin],
         row_stride, null_gidx_value);
@@ -948,7 +954,7 @@ class GPUHistMakerSpecialised{
  public:
   struct ExpandEntry;
 
-  GPUHistMakerSpecialised() : initialised_(false), p_last_fmat_(nullptr) {}
+  GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
   void Init(
       const std::vector<std::pair<std::string, std::string>>& args) {
     param_.InitAllowUnknown(args);
@@ -977,8 +983,8 @@ class GPUHistMakerSpecialised{
     ValueConstraint::Init(&param_, dmat->Info().num_col_);
     // build tree
     try {
-      for (size_t i = 0; i < trees.size(); ++i) {
-        this->UpdateTree(gpair, dmat, trees[i]);
+      for (xgboost::RegTree* tree : trees) {
+        this->UpdateTree(gpair, dmat, tree);
       }
       dh::safe_cuda(cudaGetLastError());
     } catch (const std::exception& e) {
@@ -1056,14 +1062,16 @@ class GPUHistMakerSpecialised{
   }
 
   void AllReduceHist(int nidx) {
-    if (shards_.size() == 1 && !rabit::IsDistributed()) return;
+    if (shards_.size() == 1 && !rabit::IsDistributed()) {
+      return;
+    }
     monitor_.StartCuda("AllReduce");
 
     reducer_.GroupStart();
     for (auto& shard : shards_) {
       auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
       reducer_.AllReduceSum(
-          dist_.Devices().Index(shard->device_id_),
+          dist_.Devices().Index(shard->device_id),
          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
          reinterpret_cast<typename GradientSumT::ValueT*>(d_node_hist),
          n_bins_ * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)));
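The element count in AllReduceSum is n_bins_ times sizeof(GradientSumT) / sizeof(ValueT): a histogram of N gradient pairs is reduced as a flat array of 2N scalar values, which is why the pointer is reinterpret_cast to ValueT*. The same flattening in plain C++, with SumInto standing in for the collective reduction (names here are illustrative, not project API):

    #include <cstddef>

    struct GradientPairD { double grad; double hess; };

    void SumInto(double* acc, const double* src, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i) acc[i] += src[i];
    }

    void ReduceHistograms(GradientPairD* acc, const GradientPairD* src,
                          std::size_t n_bins) {
      // 2 scalars per bin: gradient and hessian.
      SumInto(reinterpret_cast<double*>(acc),
              reinterpret_cast<const double*>(src), n_bins * 2);
    }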
@@ -1141,14 +1149,14 @@ class GPUHistMakerSpecialised{
   }

   void InitRoot(RegTree* p_tree) {
-    constexpr int root_nidx = 0;
+    constexpr int kRootNIdx = 0;
     // Sum gradients
     std::vector<GradientPair> tmp_sums(shards_.size());

     dh::ExecuteIndexShards(
         &shards_,
         [&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          dh::safe_cuda(cudaSetDevice(shard->device_id_));
+          dh::safe_cuda(cudaSetDevice(shard->device_id));
           tmp_sums[i] = dh::SumReduction(
               shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
         });
@@ -1156,35 +1164,36 @@ class GPUHistMakerSpecialised{
     GradientPair sum_gradient =
         std::accumulate(tmp_sums.begin(), tmp_sums.end(), GradientPair());

-    rabit::Allreduce<rabit::op::Sum>((GradientPair::ValueT*)&sum_gradient, 2);
+    rabit::Allreduce<rabit::op::Sum>(
+        reinterpret_cast<GradientPair::ValueT*>(&sum_gradient), 2);

     // Generate root histogram
     dh::ExecuteIndexShards(
         &shards_,
         [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
-          shard->BuildHist(root_nidx);
+          shard->BuildHist(kRootNIdx);
         });

-    this->AllReduceHist(root_nidx);
+    this->AllReduceHist(kRootNIdx);

     // Remember root stats
-    p_tree->Stat(root_nidx).sum_hess = sum_gradient.GetHess();
+    p_tree->Stat(kRootNIdx).sum_hess = sum_gradient.GetHess();
     auto weight = CalcWeight(param_, sum_gradient);
-    p_tree->Stat(root_nidx).base_weight = weight;
-    (*p_tree)[root_nidx].SetLeaf(param_.learning_rate * weight);
+    p_tree->Stat(kRootNIdx).base_weight = weight;
+    (*p_tree)[kRootNIdx].SetLeaf(param_.learning_rate * weight);

     // Store sum gradients
     for (auto& shard : shards_) {
-      shard->node_sum_gradients[root_nidx] = sum_gradient;
+      shard->node_sum_gradients[kRootNIdx] = sum_gradient;
     }

     // Initialise root constraint
     node_value_constraints_.resize(p_tree->GetNodes().size());

     // Generate first split
-    auto split = this->EvaluateSplit(root_nidx, p_tree);
+    auto split = this->EvaluateSplit(kRootNIdx, p_tree);
     qexpand_->push(
-        ExpandEntry(root_nidx, p_tree->GetDepth(root_nidx), split, 0));
+        ExpandEntry(kRootNIdx, p_tree->GetDepth(kRootNIdx), split, 0));
   }

   void UpdatePosition(const ExpandEntry& candidate, RegTree* p_tree) {
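For readers following InitRoot: once sum_gradient holds the gradient sum across all shards and workers, the root weight comes from CalcWeight and the stored leaf value is that weight scaled by the learning rate. Roughly, and ignoring monotone constraints and max_delta_step (an assumption for illustration, not the exact implementation), the weight is the Newton step -G / (H + lambda):

#include <iostream>

// A rough sketch of the weight CalcWeight produces for the root, under the
// simplifying assumptions stated above.
double CalcWeightSketch(double sum_grad, double sum_hess, double lambda) {
  return -sum_grad / (sum_hess + lambda);
}

int main() {
  // Using the gradient sums that appear later in the tests: G=6.4, H=12.8.
  double weight = CalcWeightSketch(6.4, 12.8, 1.0);
  // SetLeaf additionally scales by the learning rate, as in InitRoot above.
  double learning_rate = 0.3;
  std::cout << weight << ' ' << learning_rate * weight << '\n';
}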
@@ -1302,15 +1311,16 @@ class GPUHistMakerSpecialised{

   bool UpdatePredictionCache(
       const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) {
-    if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
-      return false;
     monitor_.StartCuda("UpdatePredictionCache");
+    if (shards_.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
+      return false;
+    }
     p_out_preds->Reshard(dist_.Devices());
     dh::ExecuteIndexShards(
         &shards_,
         [&](int idx, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
           shard->UpdatePredictionCache(
-              p_out_preds->DevicePointer(shard->device_id_));
+              p_out_preds->DevicePointer(shard->device_id));
         });
     monitor_.StopCuda("UpdatePredictionCache");
     return true;
@@ -1321,15 +1331,23 @@ class GPUHistMakerSpecialised{
     int depth;
     DeviceSplitCandidate split;
     uint64_t timestamp;
-    ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split,
-                uint64_t timestamp)
-        : nid(nid), depth(depth), split(split), timestamp(timestamp) {}
+    ExpandEntry(int _nid, int _depth, const DeviceSplitCandidate _split,
+                uint64_t _timestamp) :
+        nid{_nid}, depth{_depth}, split(std::move(_split)),
+        timestamp{_timestamp} {}
     bool IsValid(const TrainParam& param, int num_leaves) const {
-      if (split.loss_chg <= kRtEps) return false;
-      if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0)
+      if (split.loss_chg <= kRtEps) {
         return false;
-      if (param.max_depth > 0 && depth == param.max_depth) return false;
-      if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false;
+      }
+      if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
+        return false;
+      }
+      if (param.max_depth > 0 && depth == param.max_depth) {
+        return false;
+      }
+      if (param.max_leaves > 0 && num_leaves == param.max_leaves) {
+        return false;
+      }
       return true;
     }

@@ -1365,28 +1383,36 @@ class GPUHistMakerSpecialised{
       return lhs.split.loss_chg < rhs.split.loss_chg;  // favor large loss_chg
     }
   }
-  TrainParam param_;
-  GPUHistMakerTrainParam hist_maker_param_;
-  common::HistCutMatrix hmat_;
-  common::GHistIndexMatrix gmat_;
-  MetaInfo* info_;
+  TrainParam param_;            // NOLINT
+  common::HistCutMatrix hmat_;  // NOLINT
+  MetaInfo* info_;              // NOLINT
+  std::vector<std::unique_ptr<DeviceShard<GradientSumT>>> shards_;  // NOLINT
+  common::ColumnSampler column_sampler_;  // NOLINT

+  std::vector<ValueConstraint> node_value_constraints_;  // NOLINT

+ private:
   bool initialised_;

   int n_devices_;
   int n_bins_;

-  std::vector<std::unique_ptr<DeviceShard<GradientSumT>>> shards_;
-  common::ColumnSampler column_sampler_;
+  GPUHistMakerTrainParam hist_maker_param_;
+  common::GHistIndexMatrix gmat_;

   using ExpandQueue = std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
                                           std::function<bool(ExpandEntry, ExpandEntry)>>;
   std::unique_ptr<ExpandQueue> qexpand_;
-  common::Monitor monitor_;
   dh::AllReducer reducer_;
-  std::vector<ValueConstraint> node_value_constraints_;
-  /*! List storing device id. */
-  std::vector<int> device_list_;

   DMatrix* p_last_fmat_;
   GPUDistribution dist_;

+  common::Monitor monitor_;
+  /*! List storing device id. */
+  std::vector<int> device_list_;
 };

 class GPUHistMaker : public TreeUpdater {
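The ExpandQueue alias above pairs std::priority_queue with a std::function comparator; because the comparator orders by "smaller loss_chg", the queue is a max-heap and always pops the split candidate with the largest loss change first. A self-contained sketch of that pattern (Entry is a stand-in for ExpandEntry):

#include <functional>
#include <iostream>
#include <queue>
#include <vector>

struct Entry {
  int nid;
  float loss_chg;
};

int main() {
  // "Less than" on loss_chg makes std::priority_queue a max-heap on
  // loss_chg, i.e. it favors the largest loss change, matching the
  // comparator used by GPUHistMakerSpecialised.
  using Queue = std::priority_queue<Entry, std::vector<Entry>,
                                    std::function<bool(Entry, Entry)>>;
  Queue qexpand{[](Entry lhs, Entry rhs) {
    return lhs.loss_chg < rhs.loss_chg;
  }};
  qexpand.push({1, 0.5f});
  qexpand.push({2, 2.0f});
  qexpand.push({3, 1.0f});
  std::cout << qexpand.top().nid << '\n';  // prints 2: largest loss_chg first
}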
@@ -69,9 +69,9 @@ class HistMaker: public BaseMaker {
     std::vector<GradStats> data;
     /*! \brief */
     inline HistUnit operator[](size_t fid) {
-      return HistUnit(cut + rptr[fid],
+      return {cut + rptr[fid],
                       &data[0] + rptr[fid],
-                      rptr[fid+1] - rptr[fid]);
+                      rptr[fid+1] - rptr[fid]};
     }
   };
   // thread workspace
@@ -95,11 +95,10 @@ void QuantileHistMaker::Builder::SyncHistograms(
   perf_monitor.TickStart();
   this->histred_.Allreduce(hist_[starting_index].data(), hist_builder_.GetNumBins() * sync_count);
   // use Subtraction Trick
-  for (auto local_it = nodes_for_subtraction_trick_.begin();
-       local_it != nodes_for_subtraction_trick_.end(); local_it++) {
-    hist_.AddHistRow(local_it->first);
-    SubtractionTrick(hist_[local_it->first], hist_[local_it->second],
-                     hist_[(*p_tree)[local_it->first].Parent()]);
+  for (auto const& node_pair : nodes_for_subtraction_trick_) {
+    hist_.AddHistRow(node_pair.first);
+    SubtractionTrick(hist_[node_pair.first], hist_[node_pair.second],
+                     hist_[(*p_tree)[node_pair.first].Parent()]);
   }
   perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::BUILD_HIST);
 }
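The subtraction trick used in SyncHistograms relies on a parent's histogram being the bin-wise sum of its two children's: build the histogram for one child, then derive the sibling's by subtraction rather than a second scan over the rows. A minimal sketch with a hypothetical GradStats stand-in:

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for the per-bin accumulator used by the histograms.
struct GradStats {
  double sum_grad {0};
  double sum_hess {0};
};

// sibling[i] = parent[i] - built_child[i] for every bin: one pass over the
// bins replaces a full scan over the sibling's rows.
std::vector<GradStats> SubtractionTrick(std::vector<GradStats> const& parent,
                                        std::vector<GradStats> const& child) {
  assert(parent.size() == child.size());
  std::vector<GradStats> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i].sum_grad = parent[i].sum_grad - child[i].sum_grad;
    sibling[i].sum_hess = parent[i].sum_hess - child[i].sum_hess;
  }
  return sibling;
}

int main() {
  std::vector<GradStats> parent{{1.0, 2.0}, {3.0, 4.0}};
  std::vector<GradStats> left{{0.25, 0.5}, {1.0, 1.0}};
  auto right = SubtractionTrick(parent, left);  // {{0.75, 1.5}, {2.0, 3.0}}
  (void)right;
}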
@@ -112,8 +111,8 @@ void QuantileHistMaker::Builder::BuildLocalHistograms(
     RegTree *p_tree,
     const std::vector<GradientPair> &gpair_h) {
   perf_monitor.TickStart();
-  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
-    int nid = qexpand_depth_wise_[k].nid;
+  for (auto const& entry : qexpand_depth_wise_) {
+    int nid = entry.nid;
     RegTree::Node &node = (*p_tree)[nid];
     if (rabit::IsDistributed()) {
       if (node.IsRoot() || node.IsLeftChild()) {
@@ -160,8 +159,8 @@ void QuantileHistMaker::Builder::BuildNodeStats(
     RegTree *p_tree,
     const std::vector<GradientPair> &gpair_h) {
   perf_monitor.TickStart();
-  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
-    int nid = qexpand_depth_wise_[k].nid;
+  for (auto const& entry : qexpand_depth_wise_) {
+    int nid = entry.nid;
     this->InitNewNode(nid, gmat, gpair_h, *p_fmat, *p_tree);
     // add constraints
     if (!(*p_tree)[nid].IsLeftChild() && !(*p_tree)[nid].IsRoot()) {
@@ -185,8 +184,8 @@ void QuantileHistMaker::Builder::EvaluateSplits(
     int depth,
     unsigned *timestamp,
     std::vector<ExpandEntry> *temp_qexpand_depth) {
-  for (size_t k = 0; k < qexpand_depth_wise_.size(); k++) {
-    int nid = qexpand_depth_wise_[k].nid;
+  for (auto const& entry : qexpand_depth_wise_) {
+    int nid = entry.nid;
     perf_monitor.TickStart();
     this->EvaluateSplit(nid, gmat, hist_, *p_fmat, *p_tree);
     perf_monitor.UpdatePerfTimer(TreeGrowingPerfMonitor::timer_name::EVALUATE_SPLIT);
@@ -221,7 +220,7 @@ void QuantileHistMaker::Builder::ExpandWithDepthWidth(
   int num_leaves = 0;

   // in depth_wise growing, we feed loss_chg with 0.0 since it is not used anyway
-  qexpand_depth_wise_.push_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++));
+  qexpand_depth_wise_.emplace_back(ExpandEntry(0, p_tree->GetDepth(0), 0.0, timestamp++));
   ++num_leaves;
   for (int depth = 0; depth < param_.max_depth + 1; depth++) {
     int starting_index = std::numeric_limits<int>::max();
@@ -1,12 +1,18 @@
 #!/bin/bash

-rm -rf gtest googletest-release-1.7.0
-wget -nc https://github.com/google/googletest/archive/release-1.7.0.zip
-unzip -n release-1.7.0.zip
-mv googletest-release-1.7.0 gtest && cd gtest
-cmake . && make
-mkdir lib && mv libgtest.a lib
-cd ..
-rm -rf release-1.7.0.zip*
-
-python3 tests/ci_build/tidy.py --gtest-path=${PWD}/gtest
+export GTEST_PKG_NAME=release-1.8.1
+export GTEST_DIR_NAME=googletest-${GTEST_PKG_NAME} # uncompressed directory
+export GTEST_ZIP_FILE=${GTEST_PKG_NAME}.zip # downloaded zip ball name
+
+rm -rf gtest googletest-release*
+
+wget -nc https://github.com/google/googletest/archive/${GTEST_ZIP_FILE}
+unzip -n ${GTEST_ZIP_FILE}
+mv ${GTEST_DIR_NAME} gtest && cd gtest
+cmake . -DCMAKE_INSTALL_PREFIX=./ins && make
+make install
+
+cd ..
+rm ${GTEST_ZIP_FILE}
+
+python3 tests/ci_build/tidy.py --gtest-path=${PWD}/gtest/ins
tests/ci_build/test_tidy.cc (new file, +11)
@@ -0,0 +1,11 @@
+#include <iostream>
+#include <vector>
+
+struct Foo {
+  int bar_;
+};
+
+int main() {
+  std::vector<Foo> values;
+  values.push_back(Foo());
+}
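This file is deliberately left non-conformant so the new test_tidy() check can assert that clang-tidy reports at least one warning on it (which check fires depends on the .clang-tidy configuration; the uninitialized member and the push_back of a temporary are the likely triggers). For contrast, a version that a typical check set would accept might look like:

#include <vector>

struct Foo {
  int bar_ {0};  // member initialized at its declaration
};

int main() {
  std::vector<Foo> values;
  values.emplace_back();  // construct in place instead of copying a temporary
}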
@@ -5,23 +5,24 @@ import json
 from multiprocessing import Pool, cpu_count
 import shutil
 import os
+import sys
 import re
 import argparse


 def call(args):
     '''Subprocess run wrapper.'''
-    completed = subprocess.run(args, stdout=subprocess.PIPE,
-                               stderr=subprocess.DEVNULL)
+    completed = subprocess.run(args,
+                               stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE)
     error_msg = completed.stdout.decode('utf-8')
-    matched = re.match('.*xgboost.*warning.*', error_msg,
-                       re.MULTILINE | re.DOTALL)
+    matched = re.search('(src|tests)/.*warning:', error_msg,
+                        re.MULTILINE)
     if matched is None:
         return_code = 0
     else:
-        print(error_msg, '\n')
         return_code = 1
-    return completed.returncode | return_code
+    return (completed.returncode, return_code, error_msg)


 class ClangTidy(object):
@@ -69,8 +70,8 @@ class ClangTidy(object):

     def _configure_flags(self, path, command):
         common_args = ['clang-tidy',
-                       # "-header-filter='(xgboost\\/src|xgboost\\/include)'",
-                       '-config='+str(self.clang_tidy)]
+                       "-header-filter='(xgboost\\/src|xgboost\\/include)'",
+                       '-config='+self.clang_tidy]
         common_args.append(path)
         common_args.append('--')

@@ -112,6 +113,9 @@ class ClangTidy(object):
         def should_lint(path):
             if not self.cpp_lint and path.endswith('.cc'):
                 return False
+            isxgb = path.find('rabit') == -1
+            isxgb = isxgb and path.find('dmlc-core') == -1
+            if isxgb:
                 return True

         cdb_file = os.path.join(self.cdb_path, 'compile_commands.json')
@@ -120,6 +124,7 @@ class ClangTidy(object):
         tidy_file = os.path.join(self.root_path, '.clang-tidy')
         with open(tidy_file) as fd:
             self.clang_tidy = yaml.load(fd)
+            self.clang_tidy = str(self.clang_tidy)
         all_files = []
         for entry in self.compile_commands:
             path = entry['file']
@@ -132,15 +137,59 @@ class ClangTidy(object):
     def run(self):
         '''Run clang-tidy.'''
         all_files = self._configure()
+        passed = True
+        BAR = '-'*32
         with Pool(cpu_count()) as pool:
             results = pool.map(call, all_files)
-            passed = True
-            if 1 in results:
-                print('Please correct clang-tidy warnings.')
+            for (process_status, tidy_status, msg) in results:
+                # Don't enforce clang-tidy to pass for now due to namespace
+                # for cub in thrust is not correct.
+                if tidy_status == 1:
                     passed = False
+                    print(BAR, '\n'
+                          'Process return code:', process_status, ', ',
+                          'Tidy result code:', tidy_status, ', ',
+                          'Message:\n', msg,
+                          BAR, '\n')
+        if not passed:
+            print('Please correct clang-tidy warnings.')
         return passed


+def test_tidy():
+    '''See if clang-tidy and our regex is working correctly. There are
+    many subtleties we need to be careful. For instances:
+
+    * Is the string re-directed to pipe encoded as UTF-8? or is it
+    bytes?
+
+    * On Jenkins there's no 'xgboost' directory, are we catching the
+    right keywords?
+
+    * Should we use re.DOTALL?
+
+    * Should we use re.MULTILINE?
+
+    Tests here are not thorough, at least we want to guarantee tidy is
+    not missing anything on Jenkins.
+
+    '''
+    root_path = os.path.abspath(os.path.curdir)
+    tidy_file = os.path.join(root_path, '.clang-tidy')
+    test_file_path = os.path.join(root_path,
+                                  'tests', 'ci_build', 'test_tidy.cc')
+
+    with open(tidy_file) as fd:
+        tidy_config = fd.read()
+    tidy_config = str(tidy_config)
+    tidy_config = '-config='+tidy_config
+    args = ['clang-tidy', tidy_config, test_file_path]
+    (proc_code, tidy_status, error_msg) = call(args)
+    assert proc_code == 0
+    assert tidy_status == 1
+    print('clang-tidy is working.')
+

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Run clang-tidy.')
     parser.add_argument('--cpp', type=int, default=1)
@@ -148,8 +197,10 @@ if __name__ == '__main__':
     parser.add_argument('--gtest-path', required=True,
                         help='Full path of Google Test library directory')
     args = parser.parse_args()
+
+    test_tidy()

     with ClangTidy(args.gtest_path, args.cpp, args.cuda) as linter:
         passed = linter.run()
-        # Uncomment it once the code base is clang-tidy conformant.
-        # if not passed:
-        #     sys.exit(1)
+        if not passed:
+            sys.exit(1)
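The regex change in call() is the core of the fix: re.match anchors at the beginning of the output, so when the build path contains no 'xgboost' component (as on Jenkins) the old pattern silently matched nothing, while re.search finds 'src/...: warning:' anywhere in the output, and re.MULTILINE alone suffices. The script itself is Python; the same anchored-versus-unanchored distinction, sketched in C++17 with std::regex (the path and patterns below are illustrative only, not the script's exact behaviour):

#include <iostream>
#include <regex>
#include <string>

int main() {
  // A clang-tidy diagnostic as it might appear in the captured output;
  // note there is no "xgboost" component in the path, as on Jenkins.
  std::string const log =
      "/worker/build/src/tree/updater.cc:42:3: warning: ... [readability-x]\n"
      "1 warning generated.\n";

  std::regex const anchored(".*xgboost.*warning.*");
  std::regex const pattern("(src|tests)/.*warning:",
                           std::regex_constants::multiline);

  // regex_match must cover the whole input (even stricter than Python's
  // re.match, which only anchors at the start); here it reports no match.
  std::cout << std::regex_match(log, anchored) << '\n';   // 0
  // regex_search, like re.search, finds the diagnostic anywhere.
  std::cout << std::regex_search(log, pattern) << '\n';   // 1
}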
@@ -172,7 +172,7 @@ struct BaseClass {
   virtual void operator()() {}
 };
 struct DerivedClass : public BaseClass {
-  virtual void operator()() {}
+  void operator()() override {}
 };

 TEST(Span, FromOther) {
@@ -15,6 +15,7 @@ namespace xgboost {
 namespace common {

 struct TestStatus {
+ private:
   int *status_;

  public:
@@ -28,33 +29,35 @@ struct TestStatus {
     dh::safe_cuda(cudaFree(status_));
   }

-  int get() {
+  int Get() {
     int h_status;
     dh::safe_cuda(cudaMemcpy(&h_status, status_,
                              sizeof(int), cudaMemcpyDeviceToHost));
     return h_status;
   }

-  int* data() {
+  int* Data() {
     return status_;
   }
 };

-__global__ void test_from_other_kernel(Span<float> span) {
+__global__ void TestFromOtherKernel(Span<float> span) {
   // don't get optimized out
   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;

-  if (idx >= span.size())
+  if (idx >= span.size()) {
     return;
   }
+}
 // Test converting different T
-__global__ void test_from_other_kernel_const(Span<float const, 16> span) {
+__global__ void TestFromOtherKernelConst(Span<float const, 16> span) {
   // don't get optimized out
   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;

-  if (idx >= span.size())
+  if (idx >= span.size()) {
     return;
   }
+}

 /*!
  * \brief Here we just test whether the code compiles.
@@ -68,42 +71,44 @@ TEST(GPUSpan, FromOther) {
   // dynamic extent
   {
     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_from_other_kernel<<<1, 16>>>(span);
+    TestFromOtherKernel<<<1, 16>>>(span);
   }
   {
     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_from_other_kernel_const<<<1, 16>>>(span);
+    TestFromOtherKernelConst<<<1, 16>>>(span);
   }
   // static extent
   {
     Span<float, 16> span(d_vec.data().get(), d_vec.data().get() + 16);
-    test_from_other_kernel<<<1, 16>>>(span);
+    TestFromOtherKernel<<<1, 16>>>(span);
   }
   {
     Span<float, 16> span(d_vec.data().get(), d_vec.data().get() + 16);
-    test_from_other_kernel_const<<<1, 16>>>(span);
+    TestFromOtherKernelConst<<<1, 16>>>(span);
   }
 }

 TEST(GPUSpan, Assignment) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestAssignment{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestAssignment{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpan, TestStatus) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestTestStatus{status.data()});
-  ASSERT_EQ(status.get(), -1);
+  dh::LaunchN(0, 16, TestTestStatus{status.Data()});
+  ASSERT_EQ(status.Get(), -1);
 }

 template <typename T>
 struct TestEqual {
+ private:
   T *lhs_, *rhs_;
   int *status_;

+ public:
   TestEqual(T* _lhs, T* _rhs, int * _status) :
       lhs_(_lhs), rhs_(_rhs), status_(_status) {}

@@ -140,10 +145,10 @@ TEST(GPUSpan, WithTrust) {

   dh::LaunchN(0, 16, TestEqual<float>{
       thrust::raw_pointer_cast(d_vec1.data()),
-      s.data(), status.data()});
-  ASSERT_EQ(status.get(), 1);
+      s.data(), status.Data()});
+  ASSERT_EQ(status.Get(), 1);

-  // FIXME: memory error!
+  // FIXME(trivialfis): memory error!
   // bool res = thrust::equal(thrust::device,
   //                          d_vec.begin(), d_vec.end(),
   //                          s.begin());
@@ -153,23 +158,23 @@ TEST(GPUSpan, WithTrust) {
 TEST(GPUSpan, BeginEnd) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestBeginEnd{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestBeginEnd{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpan, RBeginREnd) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestRBeginREnd{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestRBeginREnd{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

-__global__ void test_modify_kernel(Span<float> span) {
+__global__ void TestModifyKernel(Span<float> span) {
   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;

-  if (idx >= span.size())
+  if (idx >= span.size()) {
     return;
+  }
   span[idx] = span.size() - idx;
 }

@@ -182,7 +187,7 @@ TEST(GPUSpan, Modify) {

   Span<float> span (d_vec.data().get(), d_vec.size());

-  test_modify_kernel<<<1, 16>>>(span);
+  TestModifyKernel<<<1, 16>>>(span);

   for (size_t i = 0; i < d_vec.size(); ++i) {
     ASSERT_EQ(d_vec[i], d_vec.size() - i);
@@ -192,21 +197,23 @@ TEST(GPUSpan, Modify) {
 TEST(GPUSpan, Observers) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestObservers{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestObservers{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpan, Compare) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestIterCompare{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestIterCompare{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 struct TestElementAccess {
+ private:
   Span<float> span_;

-  XGBOOST_DEVICE TestElementAccess (Span<float> _span) : span_(_span) {}
+ public:
+  XGBOOST_DEVICE explicit TestElementAccess (Span<float> _span) : span_(_span) {}

   XGBOOST_DEVICE float operator()(size_t _idx) {
     float tmp = span_[_idx];
@@ -232,16 +239,16 @@ TEST(GPUSpan, ElementAccess) {
   std::string output = testing::internal::GetCapturedStdout();
 }

-__global__ void test_first_dynamic_kernel(Span<float> _span) {
+__global__ void TestFirstDynamicKernel(Span<float> _span) {
   _span.first<-1>();
 }
-__global__ void test_first_static_kernel(Span<float> _span) {
+__global__ void TestFirstStaticKernel(Span<float> _span) {
   _span.first(-1);
 }
-__global__ void test_last_dynamic_kernel(Span<float> _span) {
+__global__ void TestLastDynamicKernel(Span<float> _span) {
   _span.last<-1>();
 }
-__global__ void test_last_static_kernel(Span<float> _span) {
+__global__ void TestLastStaticKernel(Span<float> _span) {
   _span.last(-1);
 }

@@ -256,7 +263,7 @@ TEST(GPUSpan, FirstLast) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_first_dynamic_kernel<<<1, 1>>>(span);
+    TestFirstDynamicKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_first_dy(), "");
@@ -270,7 +277,7 @@ TEST(GPUSpan, FirstLast) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_first_static_kernel<<<1, 1>>>(span);
+    TestFirstStaticKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_first_static(), "");
@@ -284,7 +291,7 @@ TEST(GPUSpan, FirstLast) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_last_dynamic_kernel<<<1, 1>>>(span);
+    TestLastDynamicKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_last_dy(), "");
@@ -298,7 +305,7 @@ TEST(GPUSpan, FirstLast) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_last_static_kernel<<<1, 1>>>(span);
+    TestLastStaticKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_last_static(), "");
@@ -306,10 +313,10 @@ TEST(GPUSpan, FirstLast) {
 }


-__global__ void test_subspan_dynamic_kernel(Span<float> _span) {
+__global__ void TestSubspanDynamicKernel(Span<float> _span) {
   _span.subspan(16, 0);
 }
-__global__ void test_subspan_static_kernel(Span<float> _span) {
+__global__ void TestSubspanStaticKernel(Span<float> _span) {
   _span.subspan<16>();
 }
 TEST(GPUSpan, Subspan) {
@@ -321,7 +328,7 @@ TEST(GPUSpan, Subspan) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_subspan_dynamic_kernel<<<1, 1>>>(span);
+    TestSubspanDynamicKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_subspan_dynamic(), "");
@@ -335,7 +342,7 @@ TEST(GPUSpan, Subspan) {
     thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

     Span<float> span (d_vec.data().get(), d_vec.size());
-    test_subspan_static_kernel<<<1, 1>>>(span);
+    TestSubspanStaticKernel<<<1, 1>>>(span);
   };
   testing::internal::CaptureStdout();
   EXPECT_DEATH(lambda_subspan_static(), "");
@@ -345,43 +352,43 @@ TEST(GPUSpan, Subspan) {
 TEST(GPUSpanIter, Construct) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestIterConstruct{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestIterConstruct{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpanIter, Ref) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestIterRef{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestIterRef{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpanIter, Calculate) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestIterCalculate{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestIterCalculate{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpanIter, Compare) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestIterCompare{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestIterCompare{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpan, AsBytes) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestAsBytes{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestAsBytes{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 TEST(GPUSpan, AsWritableBytes) {
   dh::safe_cuda(cudaSetDevice(0));
   TestStatus status;
-  dh::LaunchN(0, 16, TestAsWritableBytes{status.data()});
-  ASSERT_EQ(status.get(), 1);
+  dh::LaunchN(0, 16, TestAsWritableBytes{status.Data()});
+  ASSERT_EQ(status.Get(), 1);
 }

 } // namespace common
@@ -13,7 +13,7 @@ TEST(SparsePage, PushCSC) {

   offset = {0, 1, 4};
   for (size_t i = 0; i < offset.back(); ++i) {
-    data.push_back(Entry(i, 0.1f));
+    data.emplace_back(Entry(i, 0.1f));
   }

   SparsePage other;
@@ -52,4 +52,4 @@ TEST(SparsePage, PushCSC) {
     ASSERT_EQ(inst[i].index, indices_sol[i % 3]);
   }
 }
-}
+}  // namespace xgboost
@@ -27,7 +27,7 @@ TEST(SimpleDMatrix, RowAccess) {
   xgboost::DMatrix * dmat = xgboost::DMatrix::Load(tmp_file, false, false);

   // Loop over the batches and count the records
-  long row_count = 0;
+  int64_t row_count = 0;
   for (auto &batch : dmat->GetRowBatches()) {
     row_count += batch.Size();
   }
@@ -54,7 +54,7 @@ TEST(SimpleDMatrix, ColAccessWithoutBatches) {
   ASSERT_TRUE(dmat->SingleColBlock());

   // Loop over the batches and assert the data is as expected
-  long num_col_batch = 0;
+  int64_t num_col_batch = 0;
   for (const auto &batch : dmat->GetSortedColumnBatches()) {
     num_col_batch += 1;
     EXPECT_EQ(batch.Size(), dmat->Info().num_col_)
@@ -1,6 +1,8 @@
 // Copyright by Contributors
 #include <xgboost/data.h>
 #include <dmlc/filesystem.h>
+#include <cinttypes>
+
 #include "../../../src/data/sparse_page_dmatrix.h"

 #include "../helpers.h"
@@ -33,7 +35,7 @@ TEST(SparsePageDMatrix, RowAccess) {
   EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));

   // Loop over the batches and count the records
-  long row_count = 0;
+  int64_t row_count = 0;
   for (auto &batch : dmat->GetRowBatches()) {
     row_count += batch.Size();
   }
@@ -4,13 +4,14 @@
 #include "./helpers.h"
 #include "xgboost/c_api.h"
 #include <random>
+#include <cinttypes>

 bool FileExists(const std::string& filename) {
   struct stat st;
   return stat(filename.c_str(), &st) == 0;
 }

-long GetFileSize(const std::string& filename) {
+int64_t GetFileSize(const std::string& filename) {
   struct stat st;
   stat(filename.c_str(), &st);
   return st.st_size;
@@ -30,7 +31,7 @@ void CreateBigTestData(const std::string& filename, size_t n_entries) {
   }
 }

-void _CheckObjFunction(xgboost::ObjFunction * obj,
+void CheckObjFunctionImpl(xgboost::ObjFunction * obj,
                        std::vector<xgboost::bst_float> preds,
                        std::vector<xgboost::bst_float> labels,
                        std::vector<xgboost::bst_float> weights,
@@ -64,7 +65,7 @@ void CheckObjFunction(xgboost::ObjFunction * obj,
   info.labels_.HostVector() = labels;
   info.weights_.HostVector() = weights;

-  _CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess);
+  CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
 }

 void CheckRankingObjFunction(xgboost::ObjFunction * obj,
@@ -80,7 +81,7 @@ void CheckRankingObjFunction(xgboost::ObjFunction * obj,
   info.weights_.HostVector() = weights;
   info.group_ptr_ = groups;

-  _CheckObjFunction(obj, preds, labels, weights, info, out_grad, out_hess);
+  CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
 }

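The long to int64_t replacements in these tests and helpers address clang-tidy's runtime-int style warnings, and they also matter for portability: long is only guaranteed to be at least 32 bits and is in fact 32-bit on LLP64 platforms such as 64-bit Windows, while int64_t is exactly 64 bits everywhere, which is what a file size or row counter needs. A compile-time illustration:

#include <climits>
#include <cstdint>

// long only has to be at least 32 bits; int64_t is exactly 64 bits on every
// conforming platform.
static_assert(sizeof(long) * CHAR_BIT >= 32, "all the standard promises");
static_assert(sizeof(std::int64_t) * CHAR_BIT == 64, "fixed-width guarantee");

int main() { return 0; }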
@@ -4,11 +4,16 @@
 #include "../helpers.h"

 TEST(Metric, UnknownMetric) {
-  xgboost::Metric * metric;
+  xgboost::Metric * metric = nullptr;
   EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name"));
   EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse"));
-  delete metric;
-  EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name@1"));
-  EXPECT_NO_THROW(metric = xgboost::Metric::Create("error@0.5f"));
+  if (metric) {
     delete metric;
   }
+  metric = nullptr;
+  EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name@1"));
+  EXPECT_NO_THROW(metric = xgboost::Metric::Create("error@0.5f"));
+  if (metric) {
+    delete metric;
+  }
+}
@@ -4,8 +4,10 @@
 #include "../helpers.h"

 TEST(Objective, UnknownFunction) {
-  xgboost::ObjFunction* obj;
+  xgboost::ObjFunction* obj = nullptr;
   EXPECT_ANY_THROW(obj = xgboost::ObjFunction::Create("unknown_name"));
   EXPECT_NO_THROW(obj = xgboost::ObjFunction::Create("reg:linear"));
+  if (obj) {
     delete obj;
   }
+}
@@ -85,7 +85,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) {
 TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) {
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("count:poisson");
   std::vector<std::pair<std::string, std::string> > args;
-  args.push_back(std::make_pair("max_delta_step", "0.1f"));
+  args.emplace_back(std::make_pair("max_delta_step", "0.1f"));
   obj->Configure(args);
   CheckObjFunction(obj,
                    { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1},
@@ -176,7 +176,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) {
 TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) {
   xgboost::ObjFunction * obj = xgboost::ObjFunction::Create("reg:tweedie");
   std::vector<std::pair<std::string, std::string> > args;
-  args.push_back(std::make_pair("tweedie_variance_power", "1.1f"));
+  args.emplace_back(std::make_pair("tweedie_variance_power", "1.1f"));
   obj->Configure(args);
   CheckObjFunction(obj,
                    { 0, 0.1f, 0.9f, 1, 0, 0.1f, 0.9f, 1},
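The push_back to emplace_back swaps above follow tidy's modernize-use-emplace hint. Since the argument is still a std::make_pair temporary, the construction is not yet fully in place; a further step (a style suggestion, not part of this patch) would pass the two strings straight to emplace_back:

#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::string, std::string>> args;
  // Equivalent to the patched line, but constructs the pair in place from
  // the two string literals, with no make_pair temporary at all.
  args.emplace_back("max_delta_step", "0.1f");
}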
|
|||||||
@ -41,21 +41,20 @@ TEST(cpu_predictor, Test) {
|
|||||||
// Test predict leaf
|
// Test predict leaf
|
||||||
std::vector<float> leaf_out_predictions;
|
std::vector<float> leaf_out_predictions;
|
||||||
cpu_predictor->PredictLeaf((*dmat).get(), &leaf_out_predictions, model);
|
cpu_predictor->PredictLeaf((*dmat).get(), &leaf_out_predictions, model);
|
||||||
for (int i = 0; i < leaf_out_predictions.size(); i++) {
|
for (auto v : leaf_out_predictions) {
|
||||||
ASSERT_EQ(leaf_out_predictions[i], 0);
|
ASSERT_EQ(v, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test predict contribution
|
// Test predict contribution
|
||||||
std::vector<float> out_contribution;
|
std::vector<float> out_contribution;
|
||||||
cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model);
|
cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model);
|
||||||
for (int i = 0; i < out_contribution.size(); i++) {
|
for (auto const& contri : out_contribution) {
|
||||||
ASSERT_EQ(out_contribution[i], 1.5);
|
ASSERT_EQ(contri, 1.5);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Test predict contribution (approximate method)
|
// Test predict contribution (approximate method)
|
||||||
cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model, true);
|
cpu_predictor->PredictContribution((*dmat).get(), &out_contribution, model, true);
|
||||||
for (int i = 0; i < out_contribution.size(); i++) {
|
for (auto const& contri : out_contribution) {
|
||||||
ASSERT_EQ(out_contribution[i], 1.5);
|
ASSERT_EQ(contri, 1.5);
|
||||||
}
|
}
|
||||||
|
|
||||||
delete dmat;
|
delete dmat;
|
||||||
|
|||||||
@ -8,7 +8,7 @@
|
|||||||
namespace xgboost {
|
namespace xgboost {
|
||||||
|
|
||||||
TEST(Learner, Basic) {
|
TEST(Learner, Basic) {
|
||||||
typedef std::pair<std::string, std::string> Arg;
|
using Arg = std::pair<std::string, std::string>;
|
||||||
auto args = {Arg("tree_method", "exact")};
|
auto args = {Arg("tree_method", "exact")};
|
||||||
auto mat_ptr = CreateDMatrix(10, 10, 0);
|
auto mat_ptr = CreateDMatrix(10, 10, 0);
|
||||||
std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {*mat_ptr};
|
std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {*mat_ptr};
|
||||||
|
|||||||
@ -20,13 +20,13 @@ TEST(GPUExact, Update) {
|
|||||||
auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu");
|
auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu");
|
||||||
p_gpuexact_maker->Init(args);
|
p_gpuexact_maker->Init(args);
|
||||||
|
|
||||||
size_t constexpr n_rows = 4;
|
size_t constexpr kNRows = 4;
|
||||||
size_t constexpr n_cols = 8;
|
size_t constexpr kNCols = 8;
|
||||||
bst_float constexpr sparsity = 0.0f;
|
bst_float constexpr kSparsity = 0.0f;
|
||||||
|
|
||||||
auto dmat = CreateDMatrix(n_rows, n_cols, sparsity, 3);
|
auto dmat = CreateDMatrix(kNRows, kNCols, kSparsity, 3);
|
||||||
std::vector<GradientPair> h_gpair(n_rows);
|
std::vector<GradientPair> h_gpair(kNRows);
|
||||||
for (size_t i = 0; i < n_rows; ++i) {
|
for (size_t i = 0; i < kNRows; ++i) {
|
||||||
h_gpair[i] = GradientPair(i % 2, 1);
|
h_gpair[i] = GradientPair(i % 2, 1);
|
||||||
}
|
}
|
||||||
HostDeviceVector<GradientPair> gpair (h_gpair);
|
HostDeviceVector<GradientPair> gpair (h_gpair);
|
||||||
|
|||||||
@ -46,20 +46,20 @@ void BuildGidx(DeviceShard<GradientSumT>* shard, int n_rows, int n_cols,
|
|||||||
}
|
}
|
||||||
|
|
||||||
TEST(GpuHist, BuildGidxDense) {
|
TEST(GpuHist, BuildGidxDense) {
|
||||||
int const n_rows = 16, n_cols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
param.max_depth = 1;
|
param.max_depth = 1;
|
||||||
param.n_gpus = 1;
|
param.n_gpus = 1;
|
||||||
param.max_leaves = 0;
|
param.max_leaves = 0;
|
||||||
|
|
||||||
DeviceShard<GradientPairPrecise> shard(0, 0, n_rows, param);
|
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
|
||||||
BuildGidx(&shard, n_rows, n_cols);
|
BuildGidx(&shard, kNRows, kNCols);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||||
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
common::CompressedIterator<uint32_t> gidx(h_gidx_buffer.data(), 25);
|
||||||
|
|
||||||
ASSERT_EQ(shard.row_stride, n_cols);
|
ASSERT_EQ(shard.row_stride, kNCols);
|
||||||
|
|
||||||
std::vector<uint32_t> solution = {
|
std::vector<uint32_t> solution = {
|
||||||
0, 3, 8, 9, 14, 17, 20, 21,
|
0, 3, 8, 9, 14, 17, 20, 21,
|
||||||
@ -79,20 +79,20 @@ TEST(GpuHist, BuildGidxDense) {
|
|||||||
2, 4, 8, 10, 14, 15, 19, 22,
|
2, 4, 8, 10, 14, 15, 19, 22,
|
||||||
1, 4, 7, 10, 14, 16, 19, 21,
|
1, 4, 7, 10, 14, 16, 19, 21,
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < n_rows * n_cols; ++i) {
|
for (size_t i = 0; i < kNRows * kNCols; ++i) {
|
||||||
ASSERT_EQ(solution[i], gidx[i]);
|
ASSERT_EQ(solution[i], gidx[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(GpuHist, BuildGidxSparse) {
|
TEST(GpuHist, BuildGidxSparse) {
|
||||||
int const n_rows = 16, n_cols = 8;
|
int constexpr kNRows = 16, kNCols = 8;
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
param.max_depth = 1;
|
param.max_depth = 1;
|
||||||
param.n_gpus = 1;
|
param.n_gpus = 1;
|
||||||
param.max_leaves = 0;
|
param.max_leaves = 0;
|
||||||
|
|
||||||
DeviceShard<GradientPairPrecise> shard(0, 0, n_rows, param);
|
DeviceShard<GradientPairPrecise> shard(0, 0, kNRows, param);
|
||||||
BuildGidx(&shard, n_rows, n_cols, 0.9f);
|
BuildGidx(&shard, kNRows, kNCols, 0.9f);
|
||||||
|
|
||||||
std::vector<common::CompressedByteT> h_gidx_buffer;
|
std::vector<common::CompressedByteT> h_gidx_buffer;
|
||||||
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
h_gidx_buffer = shard.gidx_buffer.AsVector();
|
||||||
@ -106,7 +106,7 @@ TEST(GpuHist, BuildGidxSparse) {
|
|||||||
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24,
|
||||||
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24
|
||||||
};
|
};
|
||||||
for (size_t i = 0; i < n_rows * shard.row_stride; ++i) {
|
for (size_t i = 0; i < kNRows * shard.row_stride; ++i) {
|
||||||
ASSERT_EQ(solution[i], gidx[i]);
|
ASSERT_EQ(solution[i], gidx[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,27 +128,27 @@ std::vector<GradientPairPrecise> GetHostHistGpair() {
|
|||||||
|
|
||||||
template <typename GradientSumT>
|
template <typename GradientSumT>
|
||||||
void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
|
void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
|
||||||
int const n_rows = 16, n_cols = 8;
|
int const kNRows = 16, kNCols = 8;
|
||||||
|
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
param.max_depth = 6;
|
param.max_depth = 6;
|
||||||
param.n_gpus = 1;
|
param.n_gpus = 1;
|
||||||
param.max_leaves = 0;
|
param.max_leaves = 0;
|
||||||
|
|
||||||
DeviceShard<GradientSumT> shard(0, 0, n_rows, param);
|
DeviceShard<GradientSumT> shard(0, 0, kNRows, param);
|
||||||
|
|
||||||
BuildGidx(&shard, n_rows, n_cols);
|
BuildGidx(&shard, kNRows, kNCols);
|
||||||
|
|
||||||
xgboost::SimpleLCG gen;
|
xgboost::SimpleLCG gen;
|
||||||
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
|
xgboost::SimpleRealUniformDistribution<bst_float> dist(0.0f, 1.0f);
|
||||||
std::vector<GradientPair> h_gpair(n_rows);
|
std::vector<GradientPair> h_gpair(kNRows);
|
||||||
for (size_t i = 0; i < h_gpair.size(); ++i) {
|
for (auto &gpair : h_gpair) {
|
||||||
bst_float grad = dist(&gen);
|
bst_float grad = dist(&gen);
|
||||||
bst_float hess = dist(&gen);
|
bst_float hess = dist(&gen);
|
||||||
h_gpair[i] = GradientPair(grad, hess);
|
gpair = GradientPair(grad, hess);
|
||||||
}
|
}
|
||||||
|
|
||||||
thrust::device_vector<GradientPair> gpair (n_rows);
|
thrust::device_vector<GradientPair> gpair (kNRows);
|
||||||
gpair = h_gpair;
|
gpair = h_gpair;
|
||||||
|
|
||||||
int num_symbols = shard.n_bins + 1;
|
int num_symbols = shard.n_bins + 1;
|
||||||
@ -164,7 +164,7 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
|
|||||||
num_symbols);
|
num_symbols);
|
||||||
|
|
||||||
shard.ridx_segments.resize(1);
|
shard.ridx_segments.resize(1);
|
||||||
shard.ridx_segments[0] = Segment(0, n_rows);
|
shard.ridx_segments[0] = Segment(0, kNRows);
|
||||||
shard.hist.AllocateHistogram(0);
|
shard.hist.AllocateHistogram(0);
|
||||||
shard.gpair.copy(gpair.begin(), gpair.end());
|
shard.gpair.copy(gpair.begin(), gpair.end());
|
||||||
thrust::sequence(shard.ridx.CurrentDVec().tbegin(),
|
thrust::sequence(shard.ridx.CurrentDVec().tbegin(),
|
||||||
@ -175,11 +175,11 @@ void TestBuildHist(GPUHistBuilderBase<GradientSumT>& builder) {
|
|||||||
|
|
||||||
auto node_histogram = d_hist.GetNodeHistogram(0);
|
auto node_histogram = d_hist.GetNodeHistogram(0);
|
||||||
// d_hist.data stored in float, not gradient pair
|
// d_hist.data stored in float, not gradient pair
|
||||||
thrust::host_vector<GradientSumT> h_result (d_hist.data.size()/2);
|
thrust::host_vector<GradientSumT> h_result (d_hist.Data().size() / 2);
|
||||||
size_t data_size =
|
size_t data_size =
|
||||||
sizeof(GradientSumT) /
|
sizeof(GradientSumT) /
|
||||||
(sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT));
|
(sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT));
|
||||||
data_size *= d_hist.data.size();
|
data_size *= d_hist.Data().size();
|
||||||
dh::safe_cuda(cudaMemcpy(h_result.data(), node_histogram.data(), data_size,
|
dh::safe_cuda(cudaMemcpy(h_result.data(), node_histogram.data(), data_size,
|
||||||
cudaMemcpyDeviceToHost));
|
cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
@ -224,8 +224,8 @@ common::HistCutMatrix GetHostCutMatrix () {
|
|||||||
|
|
||||||
// TODO(trivialfis): This test is over simplified.
|
// TODO(trivialfis): This test is over simplified.
|
||||||
TEST(GpuHist, EvaluateSplits) {
|
TEST(GpuHist, EvaluateSplits) {
|
||||||
constexpr int n_rows = 16;
|
constexpr int kNRows = 16;
|
||||||
constexpr int n_cols = 8;
|
constexpr int kNCols = 8;
|
||||||
|
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
param.max_depth = 1;
|
param.max_depth = 1;
|
||||||
@ -240,14 +240,15 @@ TEST(GpuHist, EvaluateSplits) {
|
|||||||
param.reg_lambda = 0;
|
param.reg_lambda = 0;
|
||||||
param.max_delta_step = 0.0;
|
param.max_delta_step = 0.0;
|
||||||
|
|
||||||
for (size_t i = 0; i < n_cols; ++i) {
|
for (size_t i = 0; i < kNCols; ++i) {
|
||||||
param.monotone_constraints.emplace_back(0);
|
param.monotone_constraints.emplace_back(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
int max_bins = 4;
|
int max_bins = 4;
|
||||||
|
|
||||||
// Initialize DeviceShard
|
// Initialize DeviceShard
|
||||||
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {new DeviceShard<GradientPairPrecise>(0, 0, n_rows, param)};
|
std::unique_ptr<DeviceShard<GradientPairPrecise>> shard {
|
||||||
|
new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param)};
|
||||||
// Initialize DeviceShard::node_sum_gradients
|
// Initialize DeviceShard::node_sum_gradients
|
||||||
shard->node_sum_gradients = {{6.4f, 12.8f}};
|
shard->node_sum_gradients = {{6.4f, 12.8f}};
|
||||||
|
|
||||||
@ -257,17 +258,17 @@ TEST(GpuHist, EvaluateSplits) {
|
|||||||
// Copy cut matrix to device.
|
// Copy cut matrix to device.
|
||||||
DeviceShard<GradientPairPrecise>::DeviceHistCutMatrix cut;
|
DeviceShard<GradientPairPrecise>::DeviceHistCutMatrix cut;
|
||||||
shard->ba.Allocate(0,
|
shard->ba.Allocate(0,
|
||||||
&(shard->cut_.feature_segments), cmat.row_ptr.size(),
|
&(shard->d_cut.feature_segments), cmat.row_ptr.size(),
|
||||||
&(shard->cut_.min_fvalue), cmat.min_val.size(),
|
&(shard->d_cut.min_fvalue), cmat.min_val.size(),
|
||||||
&(shard->cut_.gidx_fvalue_map), 24,
|
&(shard->d_cut.gidx_fvalue_map), 24,
|
||||||
&(shard->monotone_constraints), n_cols);
|
&(shard->monotone_constraints), kNCols);
|
||||||
shard->cut_.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
|
shard->d_cut.feature_segments.copy(cmat.row_ptr.begin(), cmat.row_ptr.end());
|
||||||
shard->cut_.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
|
shard->d_cut.gidx_fvalue_map.copy(cmat.cut.begin(), cmat.cut.end());
|
||||||
shard->monotone_constraints.copy(param.monotone_constraints.begin(),
|
shard->monotone_constraints.copy(param.monotone_constraints.begin(),
|
||||||
param.monotone_constraints.end());
|
param.monotone_constraints.end());
|
||||||
|
|
||||||
// Initialize DeviceShard::hist
|
// Initialize DeviceShard::hist
|
||||||
shard->hist.Init(0, (max_bins - 1) * n_cols);
|
shard->hist.Init(0, (max_bins - 1) * kNCols);
|
||||||
shard->hist.AllocateHistogram(0);
|
shard->hist.AllocateHistogram(0);
|
||||||
// Each row of hist_gpair represents gpairs for one feature.
|
// Each row of hist_gpair represents gpairs for one feature.
|
||||||
// Each entry represents a bin.
|
// Each entry represents a bin.
|
||||||
@@ -278,16 +279,16 @@ TEST(GpuHist, EvaluateSplits) {
     hist.push_back(pair.GetHess());
   }
 
-  ASSERT_EQ(shard->hist.data.size(), hist.size());
+  ASSERT_EQ(shard->hist.Data().size(), hist.size());
   thrust::copy(hist.begin(), hist.end(),
-               shard->hist.data.begin());
+               shard->hist.Data().begin());
 
   // Initialize GPUHistMaker
   GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
      GPUHistMakerSpecialised<GradientPairPrecise>();
   hist_maker.param_ = param;
   hist_maker.shards_.push_back(std::move(shard));
-  hist_maker.column_sampler_.Init(n_cols,
+  hist_maker.column_sampler_.Init(kNCols,
                                   param.colsample_bynode,
                                   param.colsample_bylevel,
                                   param.colsample_bytree,
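hist.data becoming hist.Data() reflects the histogram storage moving behind an accessor instead of being a public field; the cudaMemcpy snippet at the top of this section reads d_hist.Data() for the same reason. A minimal sketch of the shape of that change (names illustrative, not the actual DeviceHistogram definition):

#include <vector>

class DeviceHistogram {
 public:
  // Tests now go through Data() instead of touching a public `data` field,
  // so the container stays private and its invariants stay in one place.
  std::vector<float>& Data() { return data_; }

 private:
  std::vector<float> data_;
};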
@@ -295,8 +296,8 @@ TEST(GpuHist, EvaluateSplits) {
 
   RegTree tree;
   MetaInfo info;
-  info.num_row_ = n_rows;
-  info.num_col_ = n_cols;
+  info.num_row_ = kNRows;
+  info.num_col_ = kNCols;
 
   hist_maker.info_ = &info;
   hist_maker.node_value_constraints_.resize(1);
@@ -313,30 +314,30 @@ TEST(GpuHist, EvaluateSplits) {
 TEST(GpuHist, ApplySplit) {
   GPUHistMakerSpecialised<GradientPairPrecise> hist_maker =
      GPUHistMakerSpecialised<GradientPairPrecise>();
-  int constexpr nid = 0;
-  int constexpr n_rows = 16;
-  int constexpr n_cols = 8;
+  int constexpr kNId = 0;
+  int constexpr kNRows = 16;
+  int constexpr kNCols = 8;
 
   TrainParam param;
   std::vector<std::pair<std::string, std::string>> args = {};
   param.InitAllowUnknown(args);
 
   // Initialize shard
-  for (size_t i = 0; i < n_cols; ++i) {
+  for (size_t i = 0; i < kNCols; ++i) {
     param.monotone_constraints.emplace_back(0);
   }
 
   hist_maker.shards_.resize(1);
-  hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, n_rows, param));
+  hist_maker.shards_[0].reset(new DeviceShard<GradientPairPrecise>(0, 0, kNRows, param));
 
   auto& shard = hist_maker.shards_.at(0);
   shard->ridx_segments.resize(3);  // 3 nodes.
   shard->node_sum_gradients.resize(3);
 
-  shard->ridx_segments[0] = Segment(0, n_rows);
-  shard->ba.Allocate(0, &(shard->ridx), n_rows,
-                     &(shard->position), n_rows);
-  shard->row_stride = n_cols;
+  shard->ridx_segments[0] = Segment(0, kNRows);
+  shard->ba.Allocate(0, &(shard->ridx), kNRows,
+                     &(shard->position), kNRows);
+  shard->row_stride = kNCols;
   thrust::sequence(shard->ridx.CurrentDVec().tbegin(),
                    shard->ridx.CurrentDVec().tend());
   // Initialize GPUHistMaker
@@ -349,31 +350,30 @@ TEST(GpuHist, ApplySplit) {
                                  GradientPair(8.2, 2.8), GradientPair(6.3, 3.6),
                                  GPUTrainingParam(param));
   GPUHistMakerSpecialised<GradientPairPrecise>::ExpandEntry candidate_entry {0, 0, candidate, 0};
-  candidate_entry.nid = nid;
+  candidate_entry.nid = kNId;
 
   auto const& nodes = tree.GetNodes();
-  size_t n_nodes = nodes.size();
 
   // Used to get bin_id in update position.
   common::HistCutMatrix cmat = GetHostCutMatrix();
   hist_maker.hmat_ = cmat;
 
   MetaInfo info;
-  info.num_row_ = n_rows;
-  info.num_col_ = n_cols;
-  info.num_nonzero_ = n_rows * n_cols;  // Dense
+  info.num_row_ = kNRows;
+  info.num_col_ = kNCols;
+  info.num_nonzero_ = kNRows * kNCols;  // Dense
 
   // Initialize gidx
   int n_bins = 24;
-  int row_stride = n_cols;
+  int row_stride = kNCols;
   int num_symbols = n_bins + 1;
   size_t compressed_size_bytes =
       common::CompressedBufferWriter::CalculateBufferSize(
-          row_stride * n_rows, num_symbols);
+          row_stride * kNRows, num_symbols);
   shard->ba.Allocate(0, &(shard->gidx_buffer), compressed_size_bytes);
 
   common::CompressedBufferWriter wr(num_symbols);
-  std::vector<int> h_gidx (n_rows * row_stride);
+  std::vector<int> h_gidx (kNRows * row_stride);
   std::iota(h_gidx.begin(), h_gidx.end(), 0);
   std::vector<common::CompressedByteT> h_gidx_compressed (compressed_size_bytes);
 
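The deleted n_nodes line is a different class of fix from the renames: the variable was written but never read afterwards, which clang-tidy reports (e.g. via clang-analyzer-deadcode.DeadStores), and since nodes.size() has no side effects the declaration can simply go. A hedged before/after sketch:

#include <cstddef>
#include <vector>

// Before: n_nodes is stored but never read, so the dead-store check fires.
void Before(const std::vector<int>& nodes) {
  std::size_t n_nodes = nodes.size();  // flagged: dead store
}

// After: the declaration is removed outright, as in the patch.
void After(const std::vector<int>& nodes) {}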
@@ -387,10 +387,10 @@ TEST(GpuHist, ApplySplit) {
   hist_maker.ApplySplit(candidate_entry, &tree);
   hist_maker.UpdatePosition(candidate_entry, &tree);
 
-  ASSERT_FALSE(tree[nid].IsLeaf());
+  ASSERT_FALSE(tree[kNId].IsLeaf());
 
-  int left_nidx = tree[nid].LeftChild();
-  int right_nidx = tree[nid].RightChild();
+  int left_nidx = tree[kNId].LeftChild();
+  int right_nidx = tree[kNId].RightChild();
 
   ASSERT_EQ(shard->ridx_segments[left_nidx].begin, 0);
   ASSERT_EQ(shard->ridx_segments[left_nidx].end, 6);
@@ -13,14 +13,14 @@ namespace xgboost {
 namespace tree {
 
 TEST(Updater, Prune) {
-  int constexpr n_rows = 32, n_cols = 16;
+  int constexpr kNRows = 32, kNCols = 16;
 
   std::vector<std::pair<std::string, std::string>> cfg;
-  cfg.push_back(std::pair<std::string, std::string>(
-      "num_feature", std::to_string(n_cols)));
-  cfg.push_back(std::pair<std::string, std::string>(
-      "min_split_loss", "10"));
-  cfg.push_back(std::pair<std::string, std::string>(
-      "silent", "1"));
+  cfg.emplace_back(std::pair<std::string, std::string>(
+      "num_feature", std::to_string(kNCols)));
+  cfg.emplace_back(std::pair<std::string, std::string>(
+      "min_split_loss", "10"));
+  cfg.emplace_back(std::pair<std::string, std::string>(
+      "silent", "1"));
 
   // These data are just place holders.
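The push_back → emplace_back change satisfies modernize-use-emplace, though the mechanical fix here still constructs a temporary pair and moves it in. The fully idiomatic form forwards the constructor arguments so the pair is built in place inside the vector; a hedged sketch, behaviorally equivalent for this config vector:

#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::string, std::string>> cfg;
  // Arguments are forwarded to std::pair's constructor; no temporary pair.
  cfg.emplace_back("num_feature", std::to_string(16));
  cfg.emplace_back("min_split_loss", "10");
  cfg.emplace_back("silent", "1");
  return 0;
}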
@@ -133,12 +133,12 @@ class QuantileHistMock : public QuantileHistMaker {
     std::vector<GradientPair> row_gpairs =
         { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f},
          {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} };
-    size_t constexpr max_bins = 4;
-    auto dmat = CreateDMatrix(n_rows, n_cols, 0, 3);
+    size_t constexpr kMaxBins = 4;
+    auto dmat = CreateDMatrix(kNRows, kNCols, 0, 3);
     // dense, no missing values
 
     common::GHistIndexMatrix gmat;
-    gmat.Init((*dmat).get(), max_bins);
+    gmat.Init((*dmat).get(), kMaxBins);
 
     RealImpl::InitData(gmat, row_gpairs, *(*dmat), tree);
     hist_.AddHistRow(0);
@@ -167,7 +167,8 @@ class QuantileHistMock : public QuantileHistMaker {
     // 2) no regularization, i.e. set min_child_weight, reg_lambda, reg_alpha,
     // and max_delta_step to 0.
     bst_float best_split_gain = 0.0f;
-    size_t best_split_threshold, best_split_feature;
+    size_t best_split_threshold = std::numeric_limits<size_t>::max();
+    size_t best_split_feature = std::numeric_limits<size_t>::max();
     // Enumerate all features
     for (size_t fid = 0; fid < num_feature; ++fid) {
       const size_t bin_id_min = gmat.cut.row_ptr[fid];
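Initializing the two trackers addresses the uninitialized-variable class of warning (e.g. cppcoreguidelines-init-variables): locals that are declared bare and only assigned inside a later loop are indeterminate on any path that never assigns them. Using the type's max value as a "not found yet" sentinel is a common pattern; a hedged, self-contained sketch:

#include <cstddef>
#include <limits>
#include <vector>

// Returns the index of the smallest element, or SIZE_MAX for an empty vector.
// The sentinel initialization makes the "never assigned" path well defined
// instead of reading an indeterminate value.
std::size_t ArgMin(const std::vector<int>& v) {
  std::size_t best = std::numeric_limits<std::size_t>::max();
  for (std::size_t i = 0; i < v.size(); ++i) {
    if (best == std::numeric_limits<std::size_t>::max() || v[i] < v[best]) {
      best = i;
    }
  }
  return best;
}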
@@ -213,56 +214,56 @@ class QuantileHistMock : public QuantileHistMaker {
     }
   };
 
-  int static constexpr n_rows = 8, n_cols = 16;
-  std::shared_ptr<xgboost::DMatrix> *dmat;
-  const std::vector<std::pair<std::string, std::string> > cfg;
+  int static constexpr kNRows = 8, kNCols = 16;
+  std::shared_ptr<xgboost::DMatrix> *dmat_;
+  const std::vector<std::pair<std::string, std::string> > cfg_;
   std::shared_ptr<BuilderMock> builder_;
 
  public:
   explicit QuantileHistMock(
       const std::vector<std::pair<std::string, std::string> >& args) :
-      cfg{args} {
+      cfg_{args} {
     QuantileHistMaker::Init(args);
     builder_.reset(
         new BuilderMock(
             param_,
             std::move(pruner_),
             std::unique_ptr<SplitEvaluator>(spliteval_->GetHostClone())));
-    dmat = CreateDMatrix(n_rows, n_cols, 0.8, 3);
+    dmat_ = CreateDMatrix(kNRows, kNCols, 0.8, 3);
   }
-  ~QuantileHistMock() { delete dmat; }
+  ~QuantileHistMock() override { delete dmat_; }
 
-  static size_t GetNumColumns() { return n_cols; }
+  static size_t GetNumColumns() { return kNCols; }
 
   void TestInitData() {
-    size_t constexpr max_bins = 4;
+    size_t constexpr kMaxBins = 4;
     common::GHistIndexMatrix gmat;
-    gmat.Init((*dmat).get(), max_bins);
+    gmat.Init((*dmat_).get(), kMaxBins);
 
     RegTree tree = RegTree();
-    tree.param.InitAllowUnknown(cfg);
+    tree.param.InitAllowUnknown(cfg_);
 
     std::vector<GradientPair> gpair =
         { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
          {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
 
-    builder_->TestInitData(gmat, gpair, dmat->get(), tree);
+    builder_->TestInitData(gmat, gpair, dmat_->get(), tree);
   }
 
   void TestBuildHist() {
     RegTree tree = RegTree();
-    tree.param.InitAllowUnknown(cfg);
+    tree.param.InitAllowUnknown(cfg_);
 
-    size_t constexpr max_bins = 4;
+    size_t constexpr kMaxBins = 4;
     common::GHistIndexMatrix gmat;
-    gmat.Init((*dmat).get(), max_bins);
+    gmat.Init((*dmat_).get(), kMaxBins);
 
-    builder_->TestBuildHist(0, gmat, *(*dmat).get(), tree);
+    builder_->TestBuildHist(0, gmat, *(*dmat_).get(), tree);
   }
 
   void TestEvaluateSplit() {
     RegTree tree = RegTree();
-    tree.param.InitAllowUnknown(cfg);
+    tree.param.InitAllowUnknown(cfg_);
 
     builder_->TestEvaluateSplit(gmatb_, tree);
   }
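Two more checks appear in this hunk. First, dmat and cfg gain trailing underscores because they are private data members, the mirror image of the cut_ → d_cut rename above. Second, the destructor gains override per modernize-use-override, which makes the compiler verify that a virtual base function is actually being overridden. A minimal sketch of the destructor rule (classes illustrative):

class Updater {
 public:
  virtual ~Updater() = default;
};

class MockLike : public Updater {
 public:
  // `override` asks the compiler to confirm the base destructor is virtual;
  // a non-virtual or misspelled base function would now be a hard error.
  ~MockLike() override = default;
};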
@@ -13,15 +13,15 @@ namespace xgboost {
 namespace tree {
 
 TEST(Updater, Refresh) {
-  int constexpr n_rows = 8, n_cols = 16;
+  int constexpr kNRows = 8, kNCols = 16;
 
   HostDeviceVector<GradientPair> gpair =
       { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f},
        {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} };
-  auto dmat = CreateDMatrix(n_rows, n_cols, 0.4, 3);
+  auto dmat = CreateDMatrix(kNRows, kNCols, 0.4, 3);
   std::vector<std::pair<std::string, std::string>> cfg {
       {"reg_alpha", "0.0"},
-      {"num_feature", std::to_string(n_cols)},
+      {"num_feature", std::to_string(kNCols)},
       {"reg_lambda", "1"}};
 
   RegTree tree = RegTree();