[EM] Support SHAP contribution with QDM. (#10724)

- Add GPU support.
- Add external memory support.
- Update the GPU tree shap.
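What this enables, as a minimal illustrative sketch from the Python API (not part of the commit; assumes a CUDA build of XGBoost):

    import numpy as np
    import xgboost as xgb

    X = np.random.default_rng(0).random((256, 8), dtype=np.float32)
    y = X.sum(axis=1)

    # QuantileDMatrix keeps pre-quantized Ellpack pages instead of a raw CSR
    # SparsePage, which is what previously blocked SHAP on the GPU.
    Xy = xgb.QuantileDMatrix(X, y)
    booster = xgb.train({"tree_method": "hist", "device": "cuda"}, Xy, num_boost_round=8)

    # Before this commit, both calls failed with
    # "SHAP value for QuantileDMatrix is not yet implemented for GPU."
    contribs = booster.predict(Xy, pred_contribs=True)          # (rows, num_feature + 1)
    interactions = booster.predict(Xy, pred_interactions=True)  # adds one more axis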
Jiaming Yuan
2024-08-22 05:25:10 +08:00
committed by GitHub
parent cb54374550
commit 142bdc73ec
13 changed files with 274 additions and 159 deletions


@@ -143,10 +143,9 @@ struct SparsePageLoader {
 };
 struct EllpackLoader {
-  EllpackDeviceAccessor const& matrix;
-  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_idx_t,
-                               float)
-      : matrix{m} {}
+  EllpackDeviceAccessor matrix;
+  XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor m, bool, bst_feature_t, bst_idx_t, float)
+      : matrix{std::move(m)} {}
   [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const {
     auto gidx = matrix.GetBinIndex<false>(ridx, fidx);
     if (gidx == -1) {
@@ -162,6 +161,8 @@ struct EllpackLoader {
     }
     return matrix.gidx_fvalue_map[gidx - 1];
   }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return this->matrix.NumFeatures(); }
+  [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return this->matrix.n_rows; }
 };
 template <typename Batch>
@@ -1031,9 +1032,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     CHECK(!p_fmat->Info().IsColumnSplit())
         << "Predict contribution support for column-wise data split is not yet implemented.";
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
@@ -1047,8 +1045,8 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size = contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
@@ -1058,16 +1056,27 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
-    }
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        gpu_treeshap::GPUTreeShap<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    }
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
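The contributions written here satisfy SHAP local accuracy: per row, the per-feature values plus the bias column sum to the untransformed margin. A quick hedged check from Python, reusing the illustrative booster, Xy, and contribs from the sketch above (not part of this commit):

    margin = booster.predict(Xy, output_margin=True)
    # Per-feature SHAP values plus the bias column reproduce the raw margin.
    assert np.allclose(contribs.sum(axis=1), margin, atol=1e-4)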
@@ -1094,9 +1103,6 @@ class GPUPredictor : public xgboost::Predictor {
     if (tree_weights != nullptr) {
       LOG(FATAL) << "Dart booster feature " << not_implemented;
     }
-    if (!p_fmat->PageExists<SparsePage>()) {
-      LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU.";
-    }
     dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     out_contribs->SetDevice(ctx_->Device());
     if (tree_end == 0 || tree_end > model.trees.size()) {
@@ -1108,9 +1114,9 @@ class GPUPredictor : public xgboost::Predictor {
     // allocate space for (number of features + bias) times the number of rows
     size_t contributions_columns =
         model.learner_model_param->num_feature + 1;  // +1 for bias
-    out_contribs->Resize(p_fmat->Info().num_row_ * contributions_columns *
-                         contributions_columns *
-                         model.learner_model_param->num_output_group);
+    auto dim_size =
+        contributions_columns * contributions_columns * model.learner_model_param->num_output_group;
+    out_contribs->Resize(p_fmat->Info().num_row_ * dim_size);
     out_contribs->Fill(0.0f);
     auto phis = out_contribs->DeviceSpan();
@@ -1120,16 +1126,29 @@ class GPUPredictor : public xgboost::Predictor {
     d_model.Init(model, 0, tree_end, ctx_->Device());
     dh::device_vector<uint32_t> categories;
     ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device());
-    for (auto& batch : p_fmat->GetBatches<SparsePage>()) {
-      batch.data.SetDevice(ctx_->Device());
-      batch.offset.SetDevice(ctx_->Device());
-      SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
-                       model.learner_model_param->num_feature);
-      auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns;
-      gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
-          X, device_paths.begin(), device_paths.end(), ngroup, begin,
-          dh::tend(phis));
-    }
+    if (p_fmat->PageExists<SparsePage>()) {
+      for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
+        batch.data.SetDevice(ctx_->Device());
+        batch.offset.SetDevice(ctx_->Device());
+        SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
+                         model.learner_model_param->num_feature);
+        auto begin = dh::tbegin(phis) + batch.base_rowid * dim_size;
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    } else {
+      for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
+        auto impl = batch.Impl();
+        auto acc =
+            impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
+        auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
+        auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
+                               std::numeric_limits<float>::quiet_NaN()};
+        gpu_treeshap::GPUTreeShapInteractions<dh::XGBDeviceAllocator<int>>(
+            X, device_paths.begin(), device_paths.end(), ngroup, begin, dh::tend(phis));
+      }
+    }
     // Add the base margin term to last column
     p_fmat->Info().base_margin_.SetDevice(ctx_->Device());
     const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan();
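Here dim_size is (num_feature + 1)^2 per output group, so a single-group model yields one symmetric (num_feature + 1) x (num_feature + 1) matrix per row. Another hedged check, again reusing the illustrative objects from the sketches above:

    n = interactions.shape[1]  # num_feature + 1; the last slot is the bias
    assert interactions.shape == (X.shape[0], n, n)
    # Interaction values are symmetric and also sum back to the raw margin.
    assert np.allclose(interactions, interactions.transpose(0, 2, 1), atol=1e-4)
    assert np.allclose(interactions.sum(axis=(1, 2)), margin, atol=1e-4)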
@@ -1180,51 +1199,35 @@ class GPUPredictor : public xgboost::Predictor {
     bool use_shared = shared_memory_bytes != 0;
     bst_feature_t num_features = info.num_col_;
+    auto launch = [&](auto fn, std::uint32_t grid, auto data, bst_idx_t batch_offset) {
+      dh::LaunchKernel{grid, kBlockThreads, shared_memory_bytes}(
+          fn, data, d_model.nodes.ConstDeviceSpan(),
+          predictions->DeviceSpan().subspan(batch_offset), d_model.tree_segments.ConstDeviceSpan(),
+          d_model.split_types.ConstDeviceSpan(), d_model.categories_tree_segments.ConstDeviceSpan(),
+          d_model.categories_node_segments.ConstDeviceSpan(), d_model.categories.ConstDeviceSpan(),
+          d_model.tree_beg_, d_model.tree_end_, num_features, num_rows, use_shared,
+          std::numeric_limits<float>::quiet_NaN());
+    };
     if (p_fmat->PageExists<SparsePage>()) {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<SparsePage>()) {
         batch.data.SetDevice(ctx_->Device());
         batch.offset.SetDevice(ctx_->Device());
-        bst_idx_t batch_offset = 0;
         SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(),
                             model.learner_model_param->num_feature};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes} (
-            PredictLeafKernel<SparsePageLoader, SparsePageView>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<SparsePageLoader, SparsePageView>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     } else {
+      bst_idx_t batch_offset = 0;
       for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
-        bst_idx_t batch_offset = 0;
         EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
-        size_t num_rows = batch.Size();
-        auto grid =
-            static_cast<uint32_t>(common::DivRoundUp(num_rows, kBlockThreads));
-        dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes} (
-            PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, data,
-            d_model.nodes.ConstDeviceSpan(),
-            predictions->DeviceSpan().subspan(batch_offset),
-            d_model.tree_segments.ConstDeviceSpan(),
-            d_model.split_types.ConstDeviceSpan(),
-            d_model.categories_tree_segments.ConstDeviceSpan(),
-            d_model.categories_node_segments.ConstDeviceSpan(),
-            d_model.categories.ConstDeviceSpan(),
-            d_model.tree_beg_, d_model.tree_end_, num_features, num_rows,
-            use_shared, std::numeric_limits<float>::quiet_NaN());
+        auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
+        launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
         batch_offset += batch.Size();
       }
     }
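Leaf prediction already supported Ellpack pages; this hunk only folds the two kernel launches into one launch lambda and hoists batch_offset out of the loops so it accumulates across batches. Behavior is unchanged, e.g. (hedged, reusing the illustrative booster from the first sketch):

    # One leaf index per tree, for SparsePage and EllpackPage inputs alike.
    leaves = booster.predict(Xy, pred_leaf=True)
    assert leaves.shape == (X.shape[0], 8)  # 8 boosting rounds -> 8 trees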