GPU memory usage fixes + column sampling refactor (#3635)
* Remove thrust copy calls
* Fix histogram memory usage
* Cap extreme histogram memory usage
* More efficient column sampling
* Use column sampler across updaters
* More efficient split evaluation on GPU with column sampling
This commit is contained in:
@@ -402,7 +402,6 @@ void GHistIndexBlockMatrix::Init(const GHistIndexMatrix& gmat,
|
||||
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
data_.resize(nbins_ * nthread_, GHistEntry());
|
||||
std::fill(data_.begin(), data_.end(), GHistEntry());
|
||||
@@ -461,7 +460,6 @@ void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
|
||||
void GHistBuilder::BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist) {
|
||||
constexpr int kUnroll = 8; // loop unrolling factor
|
||||
const size_t nblock = gmatb.GetNumBlock();
|
||||
|
||||
@@ -266,13 +266,11 @@ class GHistBuilder {
|
||||
void BuildHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexMatrix& gmat,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// same, with feature grouping
|
||||
void BuildBlockHist(const std::vector<GradientPair>& gpair,
|
||||
const RowSetCollection::Elem row_indices,
|
||||
const GHistIndexBlockMatrix& gmatb,
|
||||
const std::vector<bst_uint>& feat_set,
|
||||
GHistRow hist);
|
||||
// construct a histogram via subtraction trick
|
||||
void SubtractionTrick(GHistRow self, GHistRow sibling, GHistRow parent);
|
||||
|
||||
@@ -102,6 +102,7 @@ void HostDeviceVector<T>::Reshard(GPUSet devices) { }
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
|
||||
@@ -77,7 +77,9 @@ struct HostDeviceVectorImpl {
|
||||
|
||||
void LazySyncHost() {
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
thrust::copy(data_.begin(), data_.end(), vec_->data_h_.begin() + start_);
|
||||
dh::safe_cuda(
|
||||
cudaMemcpy(vec_->data_h_.data(), data_.data().get() + start_,
|
||||
data_.size() * sizeof(T), cudaMemcpyDeviceToHost));
|
||||
on_d_ = false;
|
||||
}
|
||||
|
||||
@@ -90,8 +92,9 @@ struct HostDeviceVectorImpl {
|
||||
size_t size_d = ShardSize(size_h, ndevices, index_);
|
||||
dh::safe_cuda(cudaSetDevice(device_));
|
||||
data_.resize(size_d);
|
||||
thrust::copy(vec_->data_h_.begin() + start_,
|
||||
vec_->data_h_.begin() + start_ + size_d, data_.begin());
|
||||
dh::safe_cuda(cudaMemcpy(data_.data().get(),
|
||||
vec_->data_h_.data() + start_,
|
||||
size_d * sizeof(T), cudaMemcpyHostToDevice));
|
||||
on_d_ = true;
|
||||
// this may cause a race condition if LazySyncDevice() is called
|
||||
// from multiple threads in parallel;
|
||||
@@ -186,18 +189,22 @@ struct HostDeviceVectorImpl {
|
||||
void ScatterFrom(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
thrust::copy(begin, end, data_h_.begin());
|
||||
dh::safe_cuda(cudaMemcpy(data_h_.data(), begin.get(),
|
||||
(end - begin) * sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) {
|
||||
shard.ScatterFrom(begin.get());
|
||||
});
|
||||
shard.ScatterFrom(begin.get());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void GatherTo(thrust::device_ptr<T> begin, thrust::device_ptr<T> end) {
|
||||
CHECK_EQ(end - begin, Size());
|
||||
if (on_h_) {
|
||||
thrust::copy(data_h_.begin(), data_h_.end(), begin);
|
||||
dh::safe_cuda(cudaMemcpy(begin.get(), data_h_.data(),
|
||||
data_h_.size() * sizeof(T),
|
||||
cudaMemcpyHostToDevice));
|
||||
} else {
|
||||
dh::ExecuteShards(&shards_, [&](DeviceShard& shard) { shard.GatherTo(begin); });
|
||||
}
|
||||
@@ -400,5 +407,6 @@ void HostDeviceVector<T>::Resize(size_t new_size, T v) {
|
||||
template class HostDeviceVector<bst_float>;
|
||||
template class HostDeviceVector<GradientPair>;
|
||||
template class HostDeviceVector<unsigned int>;
|
||||
template class HostDeviceVector<int>;
|
||||
|
||||
} // namespace xgboost
|
||||
|
||||
@@ -7,8 +7,14 @@
|
||||
#ifndef XGBOOST_COMMON_RANDOM_H_
|
||||
#define XGBOOST_COMMON_RANDOM_H_
|
||||
|
||||
#include <random>
|
||||
#include <xgboost/logging.h>
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <random>
|
||||
#include "host_device_vector.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
@@ -66,6 +72,78 @@ using GlobalRandomEngine = RandomEngine;
|
||||
*/
|
||||
GlobalRandomEngine& GlobalRandom(); // NOLINT(*)
|
||||
|
||||
/**
|
||||
* \class ColumnSampler
|
||||
*
|
||||
* \brief Handles selection of columns due to colsample_bytree and
|
||||
* colsample_bylevel parameters. Should be initialised before tree
|
||||
* construction and to reset when tree construction is completed.
|
||||
*/
|
||||
|
||||
class ColumnSampler {
|
||||
HostDeviceVector<int> feature_set_tree_;
|
||||
std::map<int, HostDeviceVector<int>> feature_set_level_;
|
||||
float colsample_bylevel_{1.0f};
|
||||
float colsample_bytree_{1.0f};
|
||||
|
||||
std::vector<int> ColSample(std::vector<int> features, float colsample) const {
|
||||
if (colsample == 1.0f) return features;
|
||||
CHECK_GT(features.size(), 0);
|
||||
int n = std::max(1, static_cast<int>(colsample * features.size()));
|
||||
|
||||
std::shuffle(features.begin(), features.end(), common::GlobalRandom());
|
||||
features.resize(n);
|
||||
std::sort(features.begin(), features.end());
|
||||
|
||||
return features;
|
||||
}
|
||||
|
||||
public:
|
||||
/**
|
||||
* \brief Initialise this object before use.
|
||||
*
|
||||
* \param num_col
|
||||
* \param colsample_bylevel
|
||||
* \param colsample_bytree
|
||||
* \param skip_index_0 (Optional) True to skip index 0.
|
||||
*/
|
||||
void Init(int64_t num_col, float colsample_bylevel, float colsample_bytree,
|
||||
bool skip_index_0 = false) {
|
||||
this->colsample_bylevel_ = colsample_bylevel;
|
||||
this->colsample_bytree_ = colsample_bytree;
|
||||
this->Reset();
|
||||
|
||||
int begin_idx = skip_index_0 ? 1 : 0;
|
||||
auto& feature_set_h = feature_set_tree_.HostVector();
|
||||
feature_set_h.resize(num_col - begin_idx);
|
||||
|
||||
std::iota(feature_set_h.begin(), feature_set_h.end(), begin_idx);
|
||||
feature_set_h = ColSample(feature_set_h, this->colsample_bytree_);
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Resets this object.
|
||||
*/
|
||||
void Reset() {
|
||||
feature_set_tree_.HostVector().clear();
|
||||
feature_set_level_.clear();
|
||||
}
|
||||
|
||||
HostDeviceVector<int>& GetFeatureSet(int depth) {
|
||||
if (this->colsample_bylevel_ == 1.0f) {
|
||||
return feature_set_tree_;
|
||||
}
|
||||
|
||||
if (feature_set_level_.count(depth) == 0) {
|
||||
// Level sampling, level does not yet exist so generate it
|
||||
auto& level = feature_set_level_[depth].HostVector();
|
||||
level = ColSample(feature_set_tree_.HostVector(), this->colsample_bylevel_);
|
||||
}
|
||||
// Level sampling
|
||||
return feature_set_level_[depth];
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
#endif // XGBOOST_COMMON_RANDOM_H_
|
||||
|
||||
Reference in New Issue
Block a user