De-duplicate GPU parameters. (#4454)

* Only define `gpu_id` and `n_gpus` in `LearnerTrainParam`
* Pass LearnerTrainParam through XGBoost via factory method.
* Disable all GPU usage when GPU-related parameters are not specified (fixes XGBoost choosing GPU over-aggressively).
* Test learner train param io.
* Fix gpu pickling.
This commit is contained in:
Jiaming Yuan
2019-05-29 11:55:57 +08:00
committed by GitHub
parent a3fedbeaa8
commit c589eff941
69 changed files with 927 additions and 562 deletions

View File

@@ -72,10 +72,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
bool refresh_leaf;
// auxiliary data structure
std::vector<int> monotone_constraints;
// gpu to use for single gpu algorithms
int gpu_id;
// number of GPUs to use
int n_gpus;
// the criteria to use for ranking splits
std::string split_evaluator;
@@ -191,14 +187,6 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
DMLC_DECLARE_FIELD(monotone_constraints)
.set_default(std::vector<int>())
.describe("Constraint of variable monotonicity");
DMLC_DECLARE_FIELD(gpu_id)
.set_lower_bound(0)
.set_default(0)
.describe("gpu to use for single gpu algorithms");
DMLC_DECLARE_FIELD(n_gpus)
.set_lower_bound(-1)
.set_default(1)
.describe("Number of GPUs to use for multi-gpu algorithms: -1=use all GPUs");
DMLC_DECLARE_FIELD(split_evaluator)
.set_default("elastic_net,monotonic,interaction")
.describe("The criteria to use for ranking splits");

View File

@@ -14,12 +14,14 @@ DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
namespace xgboost {
TreeUpdater* TreeUpdater::Create(const std::string& name) {
TreeUpdater* TreeUpdater::Create(const std::string& name, LearnerTrainParam const* tparam) {
auto *e = ::dmlc::Registry< ::xgboost::TreeUpdaterReg>::Get()->Find(name);
if (e == nullptr) {
LOG(FATAL) << "Unknown tree updater " << name;
}
return (e->body)();
auto p_updater = (e->body)();
p_updater->tparam_ = tparam;
return p_updater;
}
} // namespace xgboost

View File

@@ -769,7 +769,7 @@ class DistColMaker : public ColMaker {
public:
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.InitAllowUnknown(args);
pruner_.reset(TreeUpdater::Create("prune"));
pruner_.reset(TreeUpdater::Create("prune", tparam_));
pruner_->Init(args);
spliteval_.reset(SplitEvaluator::Create(param_.split_evaluator));
spliteval_->Init(args);

View File

@@ -443,9 +443,10 @@ void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
common::Span<const DeviceNodeStats> nodes,
int nUniqKeys,
NodeIdT nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
ArgMaxByKeyAlgo algo,
GPUSet const& devices) {
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
param.gpu_id, nodeSplits.data(), nUniqKeys,
*(devices.begin()), nodeSplits.data(), nUniqKeys,
ExactSplitCandidate());
int nBlks = dh::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
switch (algo) {
@@ -585,7 +586,7 @@ class GPUMaker : public TreeUpdater {
maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
maxLeaves_ = 1 << param_.max_depth;
devices_ = GPUSet::All(param_.gpu_id, param_.n_gpus);
devices_ = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
}
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
@@ -641,7 +642,7 @@ class GPUMaker : public TreeUpdater {
float min_split_loss = param_.min_split_loss;
auto gpu_param = GPUTrainingParam(param_);
dh::LaunchN(param_.gpu_id, nNodes, [=] __device__(int uid) {
dh::LaunchN(*(devices_.begin()), nNodes, [=] __device__(int uid) {
int absNodeId = uid + nodeStart;
ExactSplitCandidate s = d_nodeSplits[uid];
if (s.IsSplittable(min_split_loss)) {
@@ -683,16 +684,18 @@ class GPUMaker : public TreeUpdater {
instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
colIds_, nodeStart);
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
nodes_, nNodes, nodeStart, n_vals_, param_,
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem,
devices);
Split2Node(nNodes, nodeStart);
}
void AllocateAllData(int offsetSize) {
int tmpBuffSize = ScanTempBufferSize(n_vals_);
ba_.Allocate(param_.gpu_id, &vals_, n_vals_,
ba_.Allocate(*(devices_.begin()), &vals_, n_vals_,
&vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_,
&colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_,
&nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_,
@@ -783,7 +786,7 @@ class GPUMaker : public TreeUpdater {
auto d_nodes = nodes_;
auto d_sums = gradsums_;
auto gpu_params = GPUTrainingParam(param_);
dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
dh::LaunchN(*(devices_.begin()), 1, [=] __device__(int idx) {
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
});
} else {
@@ -800,7 +803,7 @@ class GPUMaker : public TreeUpdater {
nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
// gather the node assignments across all other columns too
dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
dh::Gather(*(devices_.begin()), nodeAssigns_.Current(),
node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
SortKeys(level);
}
@@ -811,7 +814,7 @@ class GPUMaker : public TreeUpdater {
// but we don't need more than level+1 bits for sorting!
SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
colOffsets_, 0, level + 1);
dh::Gather<float, int>(param_.gpu_id, vals_.other(),
dh::Gather<float, int>(*(devices_.begin()), vals_.other(),
vals_.Current(), instIds_.other(), instIds_.Current(),
nodeLocations_.Current(), n_vals_);
vals_.buff.selector ^= 1;

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2018 XGBoost contributors
* Copyright 2017-2019 XGBoost contributors
*/
#include <thrust/copy.h>
#include <thrust/functional.h>
@@ -322,7 +322,7 @@ __global__ void EvaluateSplitKernel(
node_histogram, // histogram for gradients
common::Span<const int> feature_set, // Selected features
DeviceNodeStats node,
ELLPackMatrix matrix,
ELLPackMatrix matrix,
GPUTrainingParam gpu_param,
common::Span<DeviceSplitCandidate> split_candidates, // resulting split
ValueConstraint value_constraint,
@@ -1377,13 +1377,16 @@ template <typename GradientSumT>
class GPUHistMakerSpecialised{
public:
GPUHistMakerSpecialised() : initialised_{false}, p_last_fmat_{nullptr} {}
void Init(
const std::vector<std::pair<std::string, std::string>>& args) {
void Init(const std::vector<std::pair<std::string, std::string>>& args,
LearnerTrainParam const* lparam) {
param_.InitAllowUnknown(args);
learner_param_ = lparam;
hist_maker_param_.InitAllowUnknown(args);
CHECK(param_.n_gpus != 0) << "Must have at least one device";
n_devices_ = param_.n_gpus;
dist_ = GPUDistribution::Block(GPUSet::All(param_.gpu_id, param_.n_gpus));
auto devices = GPUSet::All(learner_param_->gpu_id,
learner_param_->n_gpus);
n_devices_ = devices.Size();
CHECK(n_devices_ != 0) << "Must have at least one device";
dist_ = GPUDistribution::Block(devices);
dh::CheckComputeCapability();
@@ -1446,7 +1449,8 @@ class GPUHistMakerSpecialised{
// Find the cuts.
monitor_.StartCuda("Quantiles");
common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows);
common::DeviceSketch(batch, *info_, param_, &hmat_, hist_maker_param_.gpu_batch_nrows,
GPUSet::All(learner_param_->gpu_id, learner_param_->n_gpus));
n_bins_ = hmat_.row_ptr.back();
monitor_.StopCuda("Quantiles");
auto is_dense = info_->num_nonzero_ == info_->num_row_ * info_->num_col_;
@@ -1552,6 +1556,7 @@ class GPUHistMakerSpecialised{
int n_bins_;
GPUHistMakerTrainParam hist_maker_param_;
LearnerTrainParam const* learner_param_;
common::GHistIndexMatrix gmat_;
dh::AllReducer reducer_;
@@ -1573,10 +1578,10 @@ class GPUHistMaker : public TreeUpdater {
double_maker_.reset();
if (hist_maker_param_.single_precision_histogram) {
float_maker_.reset(new GPUHistMakerSpecialised<GradientPair>());
float_maker_->Init(args);
float_maker_->Init(args, tparam_);
} else {
double_maker_.reset(new GPUHistMakerSpecialised<GradientPairPrecise>());
double_maker_->Init(args);
double_maker_->Init(args, tparam_);
}
}

View File

@@ -22,7 +22,7 @@ DMLC_REGISTRY_FILE_TAG(updater_prune);
class TreePruner: public TreeUpdater {
public:
TreePruner() {
syncher_.reset(TreeUpdater::Create("sync"));
syncher_.reset(TreeUpdater::Create("sync", tparam_));
}
// set training parameter
void Init(const std::vector<std::pair<std::string, std::string> >& args) override {

View File

@@ -35,7 +35,7 @@ DMLC_REGISTRY_FILE_TAG(updater_quantile_hist);
void QuantileHistMaker::Init(const std::vector<std::pair<std::string, std::string> >& args) {
// initialize pruner
if (!pruner_) {
pruner_.reset(TreeUpdater::Create("prune"));
pruner_.reset(TreeUpdater::Create("prune", tparam_));
}
pruner_->Init(args);
param_.InitAllowUnknown(args);