Remove internal use of gpu_id. (#9568)

This commit is contained in:
Jiaming Yuan
2023-09-20 23:29:51 +08:00
committed by GitHub
parent 38ac52dd87
commit 8c676c889d
121 changed files with 1012 additions and 1044 deletions

View File

@@ -180,33 +180,30 @@ struct DeviceAdapterLoader {
XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared,
bst_feature_t num_features, bst_row_t num_rows,
size_t entry_start, float missing) :
batch{batch},
columns{num_features},
use_shared{use_shared},
is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
size_t entry_start, float missing)
: batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} {
extern __shared__ float _smem[];
smem = _smem;
if (use_shared) {
uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
size_t shared_elements = blockDim.x * num_features;
dh::BlockFill(smem, shared_elements, nanf(""));
__syncthreads();
if (global_idx < num_rows) {
auto beg = global_idx * columns;
auto end = (global_idx + 1) * columns;
for (size_t i = beg; i < end; ++i) {
auto value = batch.GetElement(i).value;
if (is_valid(value)) {
smem[threadIdx.x * num_features + (i - beg)] = value;
}
}
}
__syncthreads();
}
__syncthreads();
}
XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * columns + fidx];
}
@@ -340,11 +337,11 @@ class DeviceModel {
size_t tree_end_; // NOLINT
int num_group;
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
// Copy decision trees to device
tree_segments = HostDeviceVector<size_t>({}, gpu_id);
tree_segments = HostDeviceVector<size_t>({}, device);
auto& h_tree_segments = tree_segments.HostVector();
h_tree_segments.reserve((tree_end - tree_begin) + 1);
size_t sum = 0;
@@ -354,8 +351,8 @@ class DeviceModel {
h_tree_segments.push_back(sum);
}
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), gpu_id);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), gpu_id);
nodes = HostDeviceVector<RegTree::Node>(h_tree_segments.back(), RegTree::Node(), device);
stats = HostDeviceVector<RTreeNodeStat>(h_tree_segments.back(), RTreeNodeStat(), device);
auto d_nodes = nodes.DevicePointer();
auto d_stats = stats.DevicePointer();
for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
@@ -369,12 +366,12 @@ class DeviceModel {
sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault));
}
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, gpu_id);
tree_group = HostDeviceVector<int>(model.tree_info.size(), 0, device);
auto& h_tree_group = tree_group.HostVector();
std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size());
// Initialize categorical splits.
split_types.SetDevice(gpu_id);
split_types.SetDevice(device);
std::vector<FeatureType>& h_split_types = split_types.HostVector();
h_split_types.resize(h_tree_segments.back());
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -383,8 +380,8 @@ class DeviceModel {
h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]);
}
categories = HostDeviceVector<uint32_t>({}, gpu_id);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, gpu_id);
categories = HostDeviceVector<uint32_t>({}, device);
categories_tree_segments = HostDeviceVector<uint32_t>(1, 0, device);
std::vector<uint32_t> &h_categories = categories.HostVector();
std::vector<uint32_t> &h_split_cat_segments = categories_tree_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -397,7 +394,7 @@ class DeviceModel {
}
categories_node_segments = HostDeviceVector<RegTree::CategoricalSplitMatrix::Segment>(
h_tree_segments.back(), {}, gpu_id);
h_tree_segments.back(), {}, device);
std::vector<RegTree::CategoricalSplitMatrix::Segment>& h_categories_node_segments =
categories_node_segments.HostVector();
for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) {
@@ -485,8 +482,8 @@ struct PathInfo {
void ExtractPaths(
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>> *paths,
DeviceModel *model, dh::device_vector<uint32_t> *path_categories,
int gpu_id) {
dh::safe_cuda(cudaSetDevice(gpu_id));
DeviceOrd device) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
auto& device_model = *model;
dh::caching_device_vector<PathInfo> info(device_model.nodes.Size());
@@ -773,12 +770,12 @@ class ColumnSplitHelper {
template <bool predict_leaf>
void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
bst_feature_t num_features, std::uint32_t num_group) const {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
dh::caching_device_vector<BitType> decision_storage{};
dh::caching_device_vector<BitType> missing_storage{};
auto constexpr kBlockThreads = 128;
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
auto const shared_memory_bytes =
SharedMemoryBytes<kBlockThreads>(num_features, max_shared_memory_bytes);
auto const use_shared = shared_memory_bytes != 0;
@@ -791,8 +788,8 @@ class ColumnSplitHelper {
BitVector decision_bits{dh::ToSpan(decision_storage)};
BitVector missing_bits{dh::ToSpan(missing_storage)};
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
std::size_t entry_start = 0;
SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@@ -823,9 +820,9 @@ class ColumnSplitHelper {
void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
dh::caching_device_vector<BitType>* missing_storage) const {
collective::AllReduce<collective::Operation::kBitwiseOR>(
ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
collective::AllReduce<collective::Operation::kBitwiseAND>(
ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
}
void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
@@ -853,12 +850,12 @@ class GPUPredictor : public xgboost::Predictor {
size_t num_features,
HostDeviceVector<bst_float>* predictions,
size_t batch_offset, bool is_dense) const {
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->Device());
batch.data.SetDevice(ctx_->Device());
const uint32_t BLOCK_THREADS = 128;
size_t num_rows = batch.Size();
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(num_features, max_shared_memory_bytes);
bool use_shared = shared_memory_bytes != 0;
@@ -914,10 +911,10 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_end - tree_begin == 0) {
return;
}
out_preds->SetDevice(ctx_->gpu_id);
out_preds->SetDevice(ctx_->Device());
auto const& info = dmat->Info();
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id);
d_model.Init(model, tree_begin, tree_end, ctx_->Device());
if (info.IsColumnSplit()) {
column_split_helper_.PredictBatch(dmat, out_preds, model, d_model);
@@ -934,10 +931,10 @@ class GPUPredictor : public xgboost::Predictor {
} else {
size_t batch_offset = 0;
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->gpu_id);
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(
page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types),
page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model,
out_preds,
batch_offset);
@@ -951,16 +948,15 @@ class GPUPredictor : public xgboost::Predictor {
: Predictor::Predictor{ctx}, column_split_helper_{ctx} {}
~GPUPredictor() override {
if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) {
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
}
}
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
int device = ctx_->gpu_id;
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data.";
auto* out_preds = &predts->predictions;
if (tree_end == 0) {
tree_end = model.trees.size();
@@ -978,9 +974,9 @@ class GPUPredictor : public xgboost::Predictor {
auto m = std::any_cast<std::shared_ptr<Adapter>>(x);
CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature)
<< "Number of columns in data must equal to trained model.";
CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx())
<< "XGBoost is running on device: " << this->ctx_->gpu_id << ", "
<< "but data is on: " << m->DeviceIdx();
CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal)
<< "XGBoost is running on device: " << this->ctx_->Device().Name() << ", "
<< "but data is on: " << m->Device().Name();
if (p_m) {
p_m->Info().num_row_ = m->NumRows();
this->InitOutPredictions(p_m->Info(), &(out_preds->predictions), model);
@@ -989,16 +985,16 @@ class GPUPredictor : public xgboost::Predictor {
info.num_row_ = m->NumRows();
this->InitOutPredictions(info, &(out_preds->predictions), model);
}
out_preds->predictions.SetDevice(m->DeviceIdx());
out_preds->predictions.SetDevice(m->Device());
const uint32_t BLOCK_THREADS = 128;
auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(m->NumRows(), BLOCK_THREADS));
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx());
auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal);
size_t shared_memory_bytes =
SharedMemoryBytes<BLOCK_THREADS>(m->NumColumns(), max_shared_memory_bytes);
DeviceModel d_model;
d_model.Init(model, tree_begin, tree_end, m->DeviceIdx());
d_model.Init(model, tree_begin, tree_end, m->Device());
bool use_shared = shared_memory_bytes != 0;
size_t entry_start = 0;
@@ -1050,9 +1046,8 @@ class GPUPredictor : public xgboost::Predictor {
}
CHECK(!p_fmat->Info().IsColumnSplit())
<< "Predict contribution support for column-wise data split is not yet implemented.";
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1070,12 +1065,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1084,7 +1079,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1109,8 +1104,8 @@ class GPUPredictor : public xgboost::Predictor {
if (tree_weights != nullptr) {
LOG(FATAL) << "Dart booster feature " << not_implemented;
}
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
out_contribs->SetDevice(ctx_->gpu_id);
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
out_contribs->SetDevice(ctx_->Device());
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
@@ -1129,12 +1124,12 @@ class GPUPredictor : public xgboost::Predictor {
dh::device_vector<gpu_treeshap::PathElement<ShapSplitCondition>>
device_paths;
DeviceModel d_model;
d_model.Init(model, 0, tree_end, ctx_->gpu_id);
d_model.Init(model, 0, tree_end, ctx_->Device());
dh::device_vector<uint32_t> categories;
ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id);
ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature);
auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
@@ -1143,7 +1138,7 @@ class GPUPredictor : public xgboost::Predictor {
dh::tend(phis));
}
// Add the base margin term to last column
p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id);
p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
auto base_score = model.learner_model_param->BaseScore(ctx_);
@@ -1168,24 +1163,24 @@ class GPUPredictor : public xgboost::Predictor {
void PredictLeaf(DMatrix *p_fmat, HostDeviceVector<bst_float> *predictions,
const gbm::GBTreeModel &model,
unsigned tree_end) const override {
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device());
const MetaInfo& info = p_fmat->Info();
bst_row_t num_rows = info.num_row_;
if (tree_end == 0 || tree_end > model.trees.size()) {
tree_end = static_cast<uint32_t>(model.trees.size());
}
predictions->SetDevice(ctx_->gpu_id);
predictions->SetDevice(ctx_->Device());
predictions->Resize(num_rows * tree_end);
DeviceModel d_model;
d_model.Init(model, 0, tree_end, this->ctx_->gpu_id);
d_model.Init(model, 0, tree_end, this->ctx_->Device());
if (info.IsColumnSplit()) {
column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model);
return;
}
auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id);
constexpr uint32_t kBlockThreads = 128;
size_t shared_memory_bytes = SharedMemoryBytes<kBlockThreads>(
info.num_col_, max_shared_memory_bytes);
@@ -1195,8 +1190,8 @@ class GPUPredictor : public xgboost::Predictor {
if (p_fmat->PageExists<SparsePage>()) {
for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
batch.data.SetDevice(ctx_->gpu_id);
batch.offset.SetDevice(ctx_->gpu_id);
batch.data.SetDevice(ctx_->Device());
batch.offset.SetDevice(ctx_->Device());
bst_row_t batch_offset = 0;
SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
model.learner_model_param->num_feature};
@@ -1221,7 +1216,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
bst_row_t batch_offset = 0;
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
size_t num_rows = batch.Size();
auto grid =
static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
@@ -1249,9 +1244,9 @@ class GPUPredictor : public xgboost::Predictor {
private:
/*! \brief Reconfigure the device when GPU is changed. */
static size_t ConfigureDevice(int device) {
if (device >= 0) {
return dh::MaxSharedMemory(device);
static size_t ConfigureDevice(DeviceOrd device) {
if (device.IsCUDA()) {
return dh::MaxSharedMemory(device.ordinal);
}
return 0;
}

View File

@@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_fl
std::size_t n{model.learner_model_param->OutputLength() * info.num_row_};
const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data();
if (ctx_->gpu_id >= 0) {
out_preds->SetDevice(ctx_->gpu_id);
if (ctx_->Device().IsCUDA()) {
out_preds->SetDevice(ctx_->Device());
}
if (!base_margin->Empty()) {
out_preds->Resize(n);