- Update the SparseDMatrix comment.
- Use a pointer in the bitfield. We will replace the `std::vector<bool>` in `ColumnMatrix` with a bitfield.
- Clean up the page source. The timer is removed as it's inaccurate once we swap the mmap pointer into the page.
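As a rough illustration of the pointer-backed bitfield idea, here is a minimal sketch only; the names below are hypothetical and are not the actual `ColumnMatrix` or `LBitField` types:

#include <cstddef>
#include <cstdint>

// A non-owning bit field that views bits through a raw pointer instead of
// owning a std::vector<bool>.  Illustrative sketch, not XGBoost code.
class BitFieldView {
 public:
  BitFieldView(std::uint32_t* bits, std::size_t n_bits) : bits_{bits}, n_bits_{n_bits} {}
  void Set(std::size_t i) { bits_[i / 32] |= (1u << (i % 32)); }     // set bit i
  void Clear(std::size_t i) { bits_[i / 32] &= ~(1u << (i % 32)); }  // clear bit i
  bool Check(std::size_t i) const { return (bits_[i / 32] >> (i % 32)) & 1u; }
  std::size_t Capacity() const { return n_bits_; }                   // number of bits

 private:
  std::uint32_t* bits_;   // storage owned elsewhere, e.g. a memory-mapped page
  std::size_t n_bits_;
};

Because the view does not own its storage, the underlying words can live in an mmap-ed page and the pointer can be swapped without reallocating the bitfield itself.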
/**
 * Copyright 2019-2023, XGBoost contributors
 */
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>

#include <algorithm>
#include <string>
#include <set>

#include "xgboost/logging.h"
#include "xgboost/span.h"
#include "constraints.cuh"
#include "param.h"
#include "../common/device_helpers.cuh"

namespace xgboost {

size_t FeatureInteractionConstraintDevice::Features() const {
  return d_sets_ptr_.size() - 1;
}

void FeatureInteractionConstraintDevice::Configure(
    tree::TrainParam const& param, int32_t const n_features) {
  has_constraint_ = true;
  if (param.interaction_constraints.length() == 0) {
    has_constraint_ = false;
    return;
  }
  // --- Parse interaction constraints
  // Interaction constraints are parsed from the string parameter.  After
  // parsing, they look like {{0, 1, 2}, {2, 3, 4}}.
  std::vector<std::vector<bst_feature_t>> h_feature_constraints;
  try {
    ParseInteractionConstraint(param.interaction_constraints, &h_feature_constraints);
  } catch (dmlc::Error const& e) {
    LOG(FATAL) << "Failed to parse feature interaction constraint:\n"
               << param.interaction_constraints << "\n"
               << "With error:\n" << e.what();
  }
  n_sets_ = h_feature_constraints.size();

  size_t const n_feat_storage = LBitField64::ComputeStorageSize(n_features);
  if (n_feat_storage == 0 && n_features != 0) {
    LOG(FATAL) << "Wrong storage size, n_features: " << n_features;
  }

  // --- Initialize allowed features attached to nodes.
  int32_t n_nodes { param.MaxNodes() };
  node_constraints_.resize(n_nodes);
  node_constraints_storage_.resize(n_nodes);
  for (auto& n : node_constraints_storage_) {
    n.resize(LBitField64::ComputeStorageSize(n_features));
  }
  for (size_t i = 0; i < node_constraints_storage_.size(); ++i) {
    auto span = dh::ToSpan(node_constraints_storage_[i]);
    node_constraints_[i] = LBitField64(span);
  }
  s_node_constraints_ = common::Span<LBitField64>(node_constraints_.data(),
                                                  node_constraints_.size());

  // Represent constraints in CSR format: `flatten` is the value vector and
  // `ptr` is the row_ptr vector of the CSR matrix.
  std::vector<uint32_t> h_feature_constraints_flatten;
  for (auto const& constraints : h_feature_constraints) {
    for (uint32_t c : constraints) {
      h_feature_constraints_flatten.emplace_back(c);
    }
  }
  std::vector<size_t> h_feature_constraints_ptr;
  size_t n_features_in_constraints = 0;
  h_feature_constraints_ptr.emplace_back(n_features_in_constraints);
  for (auto const& v : h_feature_constraints) {
    n_features_in_constraints += v.size();
    h_feature_constraints_ptr.emplace_back(n_features_in_constraints);
  }
  // Copy the CSR to device.
  d_fconstraints_.resize(h_feature_constraints_flatten.size());
  thrust::copy(h_feature_constraints_flatten.cbegin(), h_feature_constraints_flatten.cend(),
               d_fconstraints_.begin());
  s_fconstraints_ = dh::ToSpan(d_fconstraints_);
  d_fconstraints_ptr_.resize(h_feature_constraints_ptr.size());
  thrust::copy(h_feature_constraints_ptr.cbegin(), h_feature_constraints_ptr.cend(),
               d_fconstraints_ptr_.begin());
  s_fconstraints_ptr_ = dh::ToSpan(d_fconstraints_ptr_);
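  // Illustrative example (using the sample constraint {{0, 1, 2}, {2, 3, 4}}
  // from the comment above), the device CSR becomes
  //   s_fconstraints_     = [0, 1, 2, 2, 3, 4]
  //   s_fconstraints_ptr_ = [0, 3, 6]
  // so set `i` spans indices [ptr[i], ptr[i + 1]).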

  // --- Compute interaction sets attached to each feature.
  // Use a set to eliminate duplicated entries.
  std::vector<std::set<int32_t>> h_features_set(n_features);
  int32_t cid = 0;
  for (auto const& constraints : h_feature_constraints) {
    for (auto const& feat : constraints) {
      h_features_set.at(feat).insert(cid);
    }
    cid++;
  }
  // Compute device sets.
  std::vector<int32_t> h_sets;
  int32_t ptr = 0;
  std::vector<int32_t> h_sets_ptr {ptr};
  for (auto const& feature : h_features_set) {
    for (auto constraint_id : feature) {
      h_sets.emplace_back(constraint_id);
    }
    // empty set is well defined here.
    ptr += feature.size();
    h_sets_ptr.emplace_back(ptr);
  }
  d_sets_ = h_sets;
  d_sets_ptr_ = h_sets_ptr;
  s_sets_ = dh::ToSpan(d_sets_);
  s_sets_ptr_ = dh::ToSpan(d_sets_ptr_);
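  // Illustrative example (same sample constraint, n_features = 5): feature 2
  // appears in both sets, so
  //   s_sets_     = [0, 0, 0, 1, 1, 1]
  //   s_sets_ptr_ = [0, 1, 2, 4, 5, 6]
  // i.e. feature `fid` belongs to the sets listed in [ptr[fid], ptr[fid + 1]).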

  d_feature_buffer_storage_.resize(LBitField64::ComputeStorageSize(n_features));
  feature_buffer_ = LBitField64{dh::ToSpan(d_feature_buffer_storage_)};

  // --- Initialize result buffers.
  output_buffer_bits_storage_.resize(LBitField64::ComputeStorageSize(n_features));
  output_buffer_bits_ = LBitField64(dh::ToSpan(output_buffer_bits_storage_));
  input_buffer_bits_storage_.resize(LBitField64::ComputeStorageSize(n_features));
  input_buffer_bits_ = LBitField64(dh::ToSpan(input_buffer_bits_storage_));
  result_buffer_.resize(n_features);
  s_result_buffer_ = dh::ToSpan(result_buffer_);
}

FeatureInteractionConstraintDevice::FeatureInteractionConstraintDevice(
    tree::TrainParam const& param, int32_t const n_features) :
    has_constraint_{true}, n_sets_{0} {
  this->Configure(param, n_features);
}

void FeatureInteractionConstraintDevice::Reset() {
  for (auto& node : node_constraints_storage_) {
    thrust::fill(node.begin(), node.end(), 0);
  }
}

__global__ void ClearBuffersKernel(
    LBitField64 result_buffer_output, LBitField64 result_buffer_input) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < result_buffer_output.Capacity()) {
    result_buffer_output.Clear(tid);
  }
  if (tid < result_buffer_input.Capacity()) {
    result_buffer_input.Clear(tid);
  }
}

void FeatureInteractionConstraintDevice::ClearBuffers() {
  CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity());
  CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity());
  uint32_t constexpr kBlockThreads = 256;
  auto const n_grids = static_cast<uint32_t>(
      common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      ClearBuffersKernel,
      output_buffer_bits_, input_buffer_bits_);
}

common::Span<bst_feature_t> FeatureInteractionConstraintDevice::QueryNode(int32_t node_id) {
  if (!has_constraint_) { return {}; }
  CHECK_LT(node_id, s_node_constraints_.size());

  ClearBuffers();

  thrust::counting_iterator<int32_t> begin(0);
  thrust::counting_iterator<int32_t> end(result_buffer_.size());
  auto p_result_buffer = result_buffer_.data();
  LBitField64 node_constraints = s_node_constraints_[node_id];

  thrust::device_ptr<bst_feature_t> const out_end = thrust::copy_if(
      thrust::device,
      begin, end,
      p_result_buffer,
      [=]__device__(int32_t pos) {
        bool res = node_constraints.Check(pos);
        return res;
      });
  size_t const n_available = std::distance(result_buffer_.data(), out_end);

  return {s_result_buffer_.data(), s_result_buffer_.data() + n_available};
}

__global__ void SetInputBufferKernel(common::Span<bst_feature_t> feature_list_input,
                                     LBitField64 result_buffer_input) {
  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < feature_list_input.size()) {
    result_buffer_input.Set(feature_list_input[tid]);
  }
}

__global__ void QueryFeatureListKernel(LBitField64 node_constraints,
                                       LBitField64 result_buffer_input,
                                       LBitField64 result_buffer_output) {
  result_buffer_output |= node_constraints;
  result_buffer_output &= result_buffer_input;
}

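// Query() narrows a candidate feature list down to the features allowed at
// node `nid`: SetInputBufferKernel marks the candidates in a bit field,
// QueryFeatureListKernel intersects it with the node's constraint bits (the
// output buffer starts cleared, so `|=` followed by `&=` yields the
// intersection), and copy_if compacts the surviving feature indices into
// `result_buffer_`.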
common::Span<bst_feature_t> FeatureInteractionConstraintDevice::Query(
    common::Span<bst_feature_t> feature_list, int32_t nid) {
  if (!has_constraint_ || nid == 0) {
    return feature_list;
  }

  ClearBuffers();

  LBitField64 node_constraints = s_node_constraints_[nid];
  CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity());

  uint32_t constexpr kBlockThreads = 256;
  auto n_grids = static_cast<uint32_t>(
      common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads));
  dh::LaunchKernel {n_grids, kBlockThreads} (
      SetInputBufferKernel,
      feature_list, input_buffer_bits_);
  dh::LaunchKernel {n_grids, kBlockThreads} (
      QueryFeatureListKernel,
      node_constraints, input_buffer_bits_, output_buffer_bits_);

  thrust::counting_iterator<int32_t> begin(0);
  thrust::counting_iterator<int32_t> end(result_buffer_.size());

  LBitField64 local_result_buffer = output_buffer_bits_;

  thrust::device_ptr<bst_feature_t> const out_end = thrust::copy_if(
      thrust::device,
      begin, end,
      result_buffer_.data(),
      [=]__device__(int32_t pos) {
        bool res = local_result_buffer.Check(pos);
        return res;
      });
  size_t const n_available = std::distance(result_buffer_.data(), out_end);

  common::Span<bst_feature_t> result =
      {s_result_buffer_.data(), s_result_buffer_.data() + n_available};
  return result;
}

// Find interaction sets for each feature, then store all features in
// those sets in a buffer.
__global__ void RestoreFeatureListFromSetsKernel(
    LBitField64 feature_buffer,

    bst_feature_t fid,
    common::Span<bst_feature_t> feature_interactions,
    common::Span<size_t> feature_interactions_ptr,  // of size n interaction set + 1

    common::Span<bst_feature_t> interactions_list,
    common::Span<size_t> interactions_list_ptr) {
  auto const tid_x = threadIdx.x + blockIdx.x * blockDim.x;
  auto const tid_y = threadIdx.y + blockIdx.y * blockDim.y;
  // painful mapping: fid -> sets related to it -> features related to sets.
  auto const beg = interactions_list_ptr[fid];
  auto const end = interactions_list_ptr[fid + 1];
  auto const n_sets = end - beg;
  if (tid_x < n_sets) {
    auto const set_id_pos = beg + tid_x;
    auto const set_id = interactions_list[set_id_pos];
    auto const set_beg = feature_interactions_ptr[set_id];
    auto const set_end = feature_interactions_ptr[set_id + 1];
    auto const feature_pos = set_beg + tid_y;
    if (feature_pos < set_end) {
      feature_buffer.Set(feature_interactions[feature_pos]);
    }
  }
}
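
// Illustrative example with the sample constraint {{0, 1, 2}, {2, 3, 4}}:
// splitting on feature 2 touches both sets, so the kernel above ends up
// setting the bits for features {0, 1, 2, 3, 4} in `feature_buffer`.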

__global__ void InteractionConstraintSplitKernel(LBitField64 feature,
                                                 int32_t feature_id,
                                                 LBitField64 node,
                                                 LBitField64 left,
                                                 LBitField64 right) {
  auto tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid > node.Capacity()) {
    return;
  }
  // Enable constraints from the split feature.
  node |= feature;
  // Clear the feature buffer after use.
  if (tid < feature.Capacity()) {
    feature.Clear(tid);
  }

  // Enable constraints inherited from the parent.
  left |= node;
  right |= node;

  if (tid == feature_id) {
    // Enable the split feature itself.  Set it in all three bit fields here,
    // at the end, instead of setting it on the parent before the `|=` above,
    // to avoid a race.
    node.Set(feature_id);
    left.Set(feature_id);
    right.Set(feature_id);
  }
}

void FeatureInteractionConstraintDevice::Split(
    bst_node_t node_id, bst_feature_t feature_id, bst_node_t left_id, bst_node_t right_id) {
  if (!has_constraint_) { return; }
  CHECK_NE(node_id, left_id)
      << " Split node: " << node_id << " and its left child: "
      << left_id << " cannot be the same.";
  CHECK_NE(node_id, right_id)
      << " Split node: " << node_id << " and its right child: "
      << right_id << " cannot be the same.";
  CHECK_LT(right_id, s_node_constraints_.size());
  CHECK_NE(s_node_constraints_.size(), 0);

  LBitField64 node = s_node_constraints_[node_id];
  LBitField64 left = s_node_constraints_[left_id];
  LBitField64 right = s_node_constraints_[right_id];

  dim3 const block3(16, 64, 1);
  dim3 const grid3(common::DivRoundUp(n_sets_, 16),
                   common::DivRoundUp(s_fconstraints_.size(), 64));
  dh::LaunchKernel {grid3, block3} (
      RestoreFeatureListFromSetsKernel,
      feature_buffer_, feature_id,
      s_fconstraints_, s_fconstraints_ptr_,
      s_sets_, s_sets_ptr_);

  uint32_t constexpr kBlockThreads = 256;
  auto n_grids = static_cast<uint32_t>(common::DivRoundUp(node.Capacity(), kBlockThreads));

  dh::LaunchKernel {n_grids, kBlockThreads} (
      InteractionConstraintSplitKernel,
      feature_buffer_,
      feature_id,
      node, left, right);
}

}  // namespace xgboost