Feature interaction for GPU Hist. (#4534)

* GPU hist Interaction Constraints. * Duplicate related parameters. * Add tests for CPU interaction constraint. * Add better error reporting. * Thorough tests.
2019-06-19 18:11:02 +08:00
parent 570374effe
commit ae05948e32
14 changed files with 1201 additions and 76 deletions
--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -0,0 +1,347 @@
+/*!
+ * Copyright 2019 XGBoost contributors
+ */
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <xgboost/logging.h>
+
+#include <algorithm>
+#include <bitset>
+#include <string>
+#include <sstream>
+
+#include "constraints.cuh"
+#include "param.h"
+#include "../common/span.h"
+#include "../common/device_helpers.cuh"
+
+
+namespace xgboost {
+
+BitField::value_type constexpr BitField::kValueSize;
+BitField::value_type constexpr BitField::kOne;
+
+/*!
+ * \brief Number of features the constraint has been configured for.
+ *
+ * `d_sets_ptr_' is a CSR row pointer holding one entry per feature plus the
+ * leading 0.  It stays empty until Configure() has run with a non-empty
+ * constraint string; 0 is returned in that case instead of letting
+ * `size() - 1' wrap around.
+ */
+size_t FeatureInteractionConstraint::Features() const {
+  return d_sets_ptr_.empty() ? 0 : d_sets_ptr_.size() - 1;
+}
+
+/*!
+ * \brief Parse `param.interaction_constraints' and allocate every buffer used
+ *        by Query() and Split().
+ *
+ * \param param      Training parameters; reads interaction_constraints,
+ *                   max_depth and max_leaves.
+ * \param n_features Number of features in the training data.
+ *
+ * An empty constraint string disables the constraint and returns early.
+ * Malformed JSON, a feature index outside [0, n_features) or an unbounded
+ * tree size abort through LOG(FATAL)/CHECK.
+ */
+void FeatureInteractionConstraint::Configure(
+    tree::TrainParam const& param, int32_t const n_features) {
+  has_constraint_ = true;
+  if (param.interaction_constraints.length() == 0) {
+    has_constraint_ = false;
+    return;
+  }
+  // --- Parse interaction constraints
+  std::istringstream iss(param.interaction_constraints);
+  dmlc::JSONReader reader(&iss);
+  // Interaction constraints parsed from string parameter.  After
+  // parsing, this looks like {{0, 1, 2}, {2, 3 ,4}}.
+  std::vector<std::vector<int32_t>> h_feature_constraints;
+  try {
+    reader.Read(&h_feature_constraints);
+  } catch (dmlc::Error const& e) {
+    LOG(FATAL) << "Failed to parse feature interaction constraint:\n"
+               << param.interaction_constraints << "\n"
+               << "With error:\n" << e.what();
+  }
+  // Report invalid feature indices with a readable message instead of the
+  // bare std::out_of_range thrown by `at' further below.
+  for (auto const& constraints : h_feature_constraints) {
+    for (int32_t feat : constraints) {
+      if (feat < 0 || feat >= n_features) {
+        LOG(FATAL) << "Feature index " << feat
+                   << " in feature interaction constraint:\n"
+                   << param.interaction_constraints << "\n"
+                   << "is out of range, number of features: " << n_features;
+      }
+    }
+  }
+  n_sets_ = h_feature_constraints.size();
+
+  size_t const n_feat_storage = BitField::ComputeStorageSize(n_features);
+  if (n_feat_storage == 0 && n_features != 0) {
+    LOG(FATAL) << "Wrong storage size, n_features: " << n_features;
+  }
+
+  // --- Initialize allowed features attached to nodes.
+  if (param.max_depth == 0 && param.max_leaves == 0) {
+    LOG(FATAL) << "Max leaves and max depth cannot both be unconstrained for gpu_hist.";
+  }
+  int32_t n_nodes {0};
+  if (param.max_depth != 0) {
+    // 2^(max_depth + 1) has to be representable as int32_t.  Integer shift
+    // avoids the floating point round trip of std::pow.
+    CHECK_LT(param.max_depth, 30)
+        << "max_depth is too large for feature interaction constraints.";
+    n_nodes = 1 << (param.max_depth + 1);
+  } else {
+    n_nodes = param.max_leaves * 2 - 1;
+  }
+  CHECK_NE(n_nodes, 0);
+  node_constraints_.resize(n_nodes);
+  node_constraints_storage_.resize(n_nodes);
+  for (auto& n : node_constraints_storage_) {
+    n.resize(n_feat_storage);
+  }
+  // Every BitField is a non-owning view over the matching storage vector.
+  for (size_t i = 0; i < node_constraints_storage_.size(); ++i) {
+    auto span = dh::ToSpan(node_constraints_storage_[i]);
+    node_constraints_[i] = BitField(span);
+  }
+  s_node_constraints_ = common::Span<BitField>(node_constraints_.data(),
+                                               node_constraints_.size());
+
+  // Represent constraints as CSR format, flatten is the value vector,
+  // ptr is row_ptr vector in CSR.
+  std::vector<int32_t> h_feature_constraints_flatten;
+  std::vector<int32_t> h_feature_constraints_ptr;
+  size_t n_features_in_constraints = 0;
+  h_feature_constraints_ptr.emplace_back(n_features_in_constraints);
+  for (auto const& constraints : h_feature_constraints) {
+    for (int32_t c : constraints) {
+      h_feature_constraints_flatten.emplace_back(c);
+    }
+    n_features_in_constraints += constraints.size();
+    h_feature_constraints_ptr.emplace_back(n_features_in_constraints);
+  }
+  // Copy the CSR to device.
+  d_fconstraints_.resize(h_feature_constraints_flatten.size());
+  thrust::copy(h_feature_constraints_flatten.cbegin(), h_feature_constraints_flatten.cend(),
+               d_fconstraints_.begin());
+  s_fconstraints_ = dh::ToSpan(d_fconstraints_);
+  d_fconstraints_ptr_.resize(h_feature_constraints_ptr.size());
+  thrust::copy(h_feature_constraints_ptr.cbegin(), h_feature_constraints_ptr.cend(),
+               d_fconstraints_ptr_.begin());
+  s_fconstraints_ptr_ = dh::ToSpan(d_fconstraints_ptr_);
+
+  // --- Compute interaction sets attached to each feature.
+  // Use a set to eliminate duplicated entries.
+  std::vector<std::set<int32_t> > h_features_set(n_features);
+  int32_t cid = 0;
+  for (auto const& constraints : h_feature_constraints) {
+    for (auto const& feat : constraints) {
+      h_features_set.at(feat).insert(cid);
+    }
+    cid++;
+  }
+  // Compute device sets, again as CSR: one row per feature listing the ids of
+  // the interaction sets it belongs to.
+  std::vector<int32_t> h_sets;
+  int32_t ptr = 0;
+  std::vector<int32_t> h_sets_ptr {ptr};
+  for (auto const& feature : h_features_set) {
+    for (auto constraint_id : feature) {
+      h_sets.emplace_back(constraint_id);
+    }
+    // empty set is well defined here.
+    ptr += feature.size();
+    h_sets_ptr.emplace_back(ptr);
+  }
+  d_sets_ = h_sets;
+  d_sets_ptr_ = h_sets_ptr;
+  s_sets_ = dh::ToSpan(d_sets_);
+  s_sets_ptr_ = dh::ToSpan(d_sets_ptr_);
+
+  d_feature_buffer_storage_.resize(n_feat_storage);
+  feature_buffer_ = dh::ToSpan(d_feature_buffer_storage_);
+
+  // --- Initialize result buffers.
+  // These two get one storage word per feature, which is more than the one
+  // bit per feature that is addressed.
+  output_buffer_bits_storage_.resize(n_features);
+  output_buffer_bits_ = BitField(dh::ToSpan(output_buffer_bits_storage_));
+  input_buffer_bits_storage_.resize(n_features);
+  input_buffer_bits_ = BitField(dh::ToSpan(input_buffer_bits_storage_));
+  result_buffer_.resize(n_features);
+  s_result_buffer_ = dh::ToSpan(result_buffer_);
+}
+
+// Construct the constraint and configure it right away from the training
+// parameters and the number of features.
+FeatureInteractionConstraint::FeatureInteractionConstraint(
+    tree::TrainParam const& param, int32_t const n_features)
+    : has_constraint_{true},
+      n_sets_{0} {
+  Configure(param, n_features);
+}
+
+// Zero the allowed-feature bits of every node.
+void FeatureInteractionConstraint::Reset() {
+  for (size_t nidx = 0; nidx < node_constraints_storage_.size(); ++nidx) {
+    auto& storage = node_constraints_storage_[nidx];
+    thrust::fill(storage.begin(), storage.end(), 0);
+  }
+}
+
+// Clear one bit per thread in both result buffers.  Launched on a 1D grid
+// with at least as many threads as the larger buffer has bits.
+// `feature_buffer' is accepted but not touched here.
+__global__ void ClearBuffersKernel(
+    BitField result_buffer_self, BitField result_buffer_input, BitField feature_buffer) {
+  auto const bit = blockIdx.x * blockDim.x + threadIdx.x;
+  bool const in_self = bit < result_buffer_self.Size();
+  bool const in_input = bit < result_buffer_input.Size();
+  if (in_self) {
+    result_buffer_self.Clear(bit);
+  }
+  if (in_input) {
+    result_buffer_input.Clear(bit);
+  }
+}
+
+// Launch ClearBuffersKernel over the input/output bit buffers, one thread per
+// bit.
+void FeatureInteractionConstraint::ClearBuffers() {
+  CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size());
+  CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size());
+
+  int constexpr kBlockThreads = 256;
+  auto const n_bits = input_buffer_bits_.Size();
+  int const n_blocks = static_cast<int>(dh::DivRoundUp(n_bits, kBlockThreads));
+
+  ClearBuffersKernel<<<n_blocks, kBlockThreads>>>(
+      output_buffer_bits_, input_buffer_bits_, feature_buffer_);
+}
+
+// Collect the indices of all features whose bit is set for `node_id' into
+// `result_buffer_' and return a span over them.  Returns an empty span when
+// no constraint is configured.  The temporary bit buffers are cleared as a
+// side effect.
+common::Span<int32_t> FeatureInteractionConstraint::QueryNode(int32_t node_id) {
+  if (!has_constraint_) {
+    return {};
+  }
+  CHECK_LT(node_id, s_node_constraints_.size());
+
+  ClearBuffers();
+
+  BitField allowed = s_node_constraints_[node_id];
+  auto const d_out = result_buffer_.data();
+  thrust::counting_iterator<int32_t> first(0);
+  thrust::counting_iterator<int32_t> last(result_buffer_.size());
+
+  // Keep every feature index whose bit is set in the node's bit field.
+  thrust::device_ptr<int32_t> const d_out_end = thrust::copy_if(
+      thrust::device,
+      first, last,
+      d_out,
+      [=]__device__(int32_t fidx) {
+        return allowed.Check(fidx);
+      });
+  size_t const n_selected = std::distance(result_buffer_.data(), d_out_end);
+
+  auto const result_begin = s_result_buffer_.data();
+  return {result_begin, result_begin + n_selected};
+}
+
+// Set the bit of every feature in `feature_list_input' in
+// `result_buffer_input' and of every feature in `node_feature_list' in
+// `result_buffer_output', one list element per thread (1D launch).
+//
+// The two bit fields are deliberately NOT combined here: the intersection
+// works on whole storage words, and a word may still be receiving bits from
+// threads of other warps or blocks.  It is done by a separate launch instead.
+__global__ void QueryFeatureListKernel(common::Span<int32_t> feature_list_input,
+                                       common::Span<int32_t> node_feature_list,
+                                       BitField result_buffer_input,
+                                       BitField result_buffer_output) {
+  uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < feature_list_input.size()) {
+    result_buffer_input.Set(feature_list_input[tid]);
+  }
+
+  if (tid < node_feature_list.size()) {
+    result_buffer_output.Set(node_feature_list[tid]);
+  }
+}
+
+// result_buffer_output &= result_buffer_input, one storage word per thread.
+// Has to be launched after QueryFeatureListKernel on the same stream so that
+// all bits are in place before they are combined.
+__global__ void IntersectFeatureBitsKernel(BitField result_buffer_input,
+                                           BitField result_buffer_output) {
+  result_buffer_output &= result_buffer_input;
+}
+
+/*!
+ * \brief Filter `feature_list' by the constraints attached to node `nid'.
+ *
+ * \param feature_list Candidate features.
+ * \param nid          Node id.
+ *
+ * \return `feature_list' itself when no constraint is configured or for the
+ *         root node, otherwise a span over `result_buffer_' holding the
+ *         features present in both `feature_list' and the node's allowed set.
+ */
+common::Span<int32_t> FeatureInteractionConstraint::Query(
+    common::Span<int32_t> feature_list, int32_t nid) {
+  if (!has_constraint_ || nid == 0) {
+    return feature_list;
+  }
+  // Also clears the input/output bit buffers.
+  auto selected = this->QueryNode(nid);
+  CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size());
+  int constexpr kBlockThreads = 256;
+  const int n_grids = static_cast<int>(
+      dh::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads));
+
+  QueryFeatureListKernel<<<n_grids, kBlockThreads>>>
+      (feature_list,
+       selected,
+       input_buffer_bits_,
+       output_buffer_bits_);
+  // Separate launch: acts as the grid-wide barrier between setting the bits
+  // and intersecting them.
+  IntersectFeatureBitsKernel<<<n_grids, kBlockThreads>>>
+      (input_buffer_bits_,
+       output_buffer_bits_);
+  dh::safe_cuda(cudaGetLastError());
+
+  thrust::counting_iterator<int32_t> begin(0);
+  thrust::counting_iterator<int32_t> end(result_buffer_.size());
+
+  BitField local_result_buffer = output_buffer_bits_;
+
+  // Turn the intersected bits back into a list of feature indices.
+  thrust::device_ptr<int32_t> const out_end = thrust::copy_if(
+      thrust::device,
+      begin, end,
+      result_buffer_.data(),
+      [=]__device__(int32_t pos) {
+        bool res = local_result_buffer.Check(pos);
+        return res;
+      });
+  size_t const n_available = std::distance(result_buffer_.data(), out_end);
+
+  common::Span<int32_t> result =
+      {s_result_buffer_.data(), s_result_buffer_.data() + n_available};
+  return result;
+}
+
+// For the feature `fid', look up every interaction set it belongs to and mark
+// all members of those sets in `feature_buffer'.
+//
+// 2D launch: x indexes the sets `fid' belongs to, y indexes the members of a
+// set.  The mapping is fid -> sets related to it -> features in those sets.
+__global__ void RestoreFeatureListFromSetsKernel(
+    BitField feature_buffer,
+
+    int32_t fid,
+    common::Span<int32_t> feature_interactions,
+    common::Span<int32_t> feature_interactions_ptr,  // of size n interaction set + 1
+
+    common::Span<int32_t> interactions_list,
+    common::Span<int32_t> interactions_list_ptr) {
+  auto const set_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const member_idx = threadIdx.y + blockIdx.y * blockDim.y;
+
+  // Range of `interactions_list' holding the set ids of `fid'.
+  auto const sets_beg = interactions_list_ptr[fid];
+  auto const sets_end = interactions_list_ptr[fid+1];
+  auto const n_sets = sets_end - sets_beg;
+  if (!(set_idx < n_sets)) {
+    return;
+  }
+
+  auto const set_id = interactions_list[sets_beg + set_idx];
+  // Range of `feature_interactions' holding the members of that set.
+  auto const members_beg = feature_interactions_ptr[set_id];
+  auto const members_end = feature_interactions_ptr[set_id + 1];
+  auto const member_pos = members_beg + member_idx;
+  if (member_pos < members_end) {
+    feature_buffer.Set(feature_interactions[member_pos]);
+  }
+}
+
+// Propagate constraints after splitting a node on `feature_id'.
+//
+//   node  |= feature | {feature_id}
+//   left  |= node
+//   right |= node
+//   feature is zeroed for the next Split.
+//
+// 1D launch with at least one thread per storage word.  Every thread owns
+// exactly one storage word of each bit field and touches nothing else, so no
+// two threads access the same memory and neither atomics nor barriers are
+// needed.
+__global__ void InteractionConstraintSplitKernel(BitField feature,
+                                                 int32_t feature_id,
+                                                 BitField node,
+                                                 BitField left,
+                                                 BitField right) {
+  auto const word = threadIdx.x + blockDim.x * blockIdx.x;
+
+  // Fetch the constraints gathered for the split feature and clear the
+  // buffer after use.
+  BitField::value_type from_feature = 0;
+  if (word < feature.bits_.size()) {
+    from_feature = feature.bits_[word];
+    feature.bits_[word] = 0;
+  }
+  if (word >= node.bits_.size()) {
+    return;
+  }
+
+  BitField::value_type allowed = node.bits_[word] | from_feature;
+  // Enable the split feature itself; same bit layout as BitField::Set.
+  auto const split_pos = BitField::ToBitPos(feature_id);
+  if (word == split_pos.int_pos) {
+    allowed |= BitField::kOne
+               << (BitField::kValueSize - split_pos.bit_pos - BitField::kOne);
+  }
+  node.bits_[word] = allowed;
+
+  // Children inherit everything the parent allows.
+  if (word < left.bits_.size()) {
+    left.bits_[word] |= allowed;
+  }
+  if (word < right.bits_.size()) {
+    right.bits_[word] |= allowed;
+  }
+}
+
+/*!
+ * \brief Record a split of `node_id' on `feature_id' and propagate the allowed
+ *        features to both children.
+ *
+ * \param node_id    Node being split.
+ * \param feature_id Feature used for the split.
+ * \param left_id    Left child of `node_id'.
+ * \param right_id   Right child of `node_id'.
+ *
+ * Does nothing when no constraint is configured.
+ */
+void FeatureInteractionConstraint::Split(
+    int32_t node_id, int32_t feature_id, int32_t left_id, int32_t right_id) {
+  if (!has_constraint_) { return; }
+  CHECK_NE(node_id, left_id)
+      << " Split node: " << node_id << " and its left child: "
+      << left_id << " cannot be the same.";
+  CHECK_NE(node_id, right_id)
+      << " Split node: " << node_id << " and its right child: "
+      << right_id << " cannot be the same.";
+  CHECK_NE(s_node_constraints_.size(), 0);
+  CHECK_LT(node_id, s_node_constraints_.size());
+  CHECK_LT(left_id, s_node_constraints_.size());
+  CHECK_LT(right_id, s_node_constraints_.size());
+  // The kernel below indexes `s_sets_ptr_' with feature_id and feature_id + 1.
+  CHECK_LT(feature_id, this->Features())
+      << " Split feature: " << feature_id << " is out of range.";
+
+  BitField node = s_node_constraints_[node_id];
+  BitField left = s_node_constraints_[left_id];
+  BitField right = s_node_constraints_[right_id];
+
+  // A grid dimension of 0 is an invalid launch configuration; with no sets
+  // or no members there is nothing to gather anyway.
+  if (n_sets_ > 0 && s_fconstraints_.size() > 0) {
+    dim3 const block3(16, 64, 1);
+    dim3 const grid3(dh::DivRoundUp(n_sets_, 16),
+                     dh::DivRoundUp(s_fconstraints_.size(), 64));
+    RestoreFeatureListFromSetsKernel<<<grid3, block3>>>
+        (feature_buffer_,
+         feature_id,
+         s_fconstraints_,
+         s_fconstraints_ptr_,
+         s_sets_,
+         s_sets_ptr_);
+    dh::safe_cuda(cudaGetLastError());
+  }
+
+  int constexpr kBlockThreads = 256;
+  const int n_grids = static_cast<int>(dh::DivRoundUp(node.Size(), kBlockThreads));
+  InteractionConstraintSplitKernel<<<n_grids, kBlockThreads>>>
+      (feature_buffer_,
+       feature_id,
+       node, left, right);
+  dh::safe_cuda(cudaGetLastError());
+}
+
+}  // namespace xgboost
--- a/src/tree/constraints.cuh
+++ b/src/tree/constraints.cuh
@@ -0,0 +1,248 @@
+/*!
+ * Copyright 2019 XGBoost contributors
+ */
+#ifndef XGBOOST_TREE_CONSTRAINTS_H_
+#define XGBOOST_TREE_CONSTRAINTS_H_
+
+#include <dmlc/json.h>
+#include <xgboost/logging.h>
+
+#include <cinttypes>
+#include <iterator>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <set>
+
+#include "param.h"
+#include "../common/span.h"
+#include "../common/device_helpers.cuh"
+
+#include <bitset>
+
+namespace xgboost {
+
+// Atomically OR `val' into `*address' with a compare-and-swap loop.  Returns
+// the value that was stored at `address' right before the update.
+__forceinline__ __device__ unsigned long long AtomicOr(unsigned long long* address,
+                                                       unsigned long long val) {
+  unsigned long long int seen = *address;  // NOLINT
+  unsigned long long int expected;         // NOLINT
+  do {
+    expected = seen;
+    // Only succeeds if nobody changed the word since it was last read.
+    seen = atomicCAS(address, expected, expected | val);
+  } while (seen != expected);
+
+  return seen;
+}
+
+// Atomically AND `val' into `*address' with a compare-and-swap loop.  Returns
+// the value that was stored at `address' right before the update.
+__forceinline__ __device__ unsigned long long AtomicAnd(unsigned long long* address,
+                                                        unsigned long long val) {
+  unsigned long long int seen = *address;  // NOLINT
+  unsigned long long int expected;         // NOLINT
+  do {
+    expected = seen;
+    // Only succeeds if nobody changed the word since it was last read.
+    seen = atomicCAS(address, expected, expected & val);
+  } while (seen != expected);
+
+  return seen;
+}
+
+/*!
+ * \brief A non-owning type with auxiliary methods defined for manipulating bits.
+ *
+ * Bits are stored in 64-bit words; bit 0 of a word is its most significant
+ * bit.  Set/Clear are atomic per bit, while operator|= and operator&= handle
+ * one whole word per thread of a 1D launch without atomics.
+ */
+struct BitField {
+  using value_type = uint64_t;
+
+  static value_type constexpr kValueSize = sizeof(value_type) * 8;
+  static value_type constexpr kOne = 1UL;  // force uint64_t
+  static_assert(kValueSize == 64, "uint64_t should be of 64 bits.");
+
+  // Location of a bit: index of the storage word and offset inside it.
+  struct Pos {
+    value_type int_pos {0};
+    value_type bit_pos {0};
+  };
+
+  common::Span<value_type> bits_;
+
+ public:
+  BitField() = default;
+  XGBOOST_DEVICE BitField(common::Span<value_type> bits) : bits_{bits} {}
+  XGBOOST_DEVICE BitField(BitField const& other) : bits_{other.bits_} {}
+
+  // Number of storage words allocated for `size' bits: a single word below
+  // kValueSize, otherwise size / kValueSize + 1, plus one more word when
+  // `size' is not a multiple of kValueSize.
+  static size_t ComputeStorageSize(size_t size) {
+    if (size < kValueSize) {
+      return 1;
+    }
+    Pos const last = ToBitPos(size);
+    return last.bit_pos == 0 ? last.int_pos + 1 : last.int_pos + 2;
+  }
+  // Split a flat bit index into word index and offset within the word.
+  XGBOOST_DEVICE static Pos ToBitPos(value_type pos) {
+    Pos where;
+    where.int_pos = pos / kValueSize;
+    where.bit_pos = pos % kValueSize;
+    return where;
+  }
+
+  // Word-wise OR with `rhs'; the calling thread handles the word matching
+  // its global x index, limited to the shorter of the two fields.
+  __device__ BitField& operator|=(BitField const& rhs) {
+    auto const word = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t const n_common = min(bits_.size(), rhs.bits_.size());
+    if (word < n_common) {
+      bits_[word] |= rhs.bits_[word];
+    }
+    return *this;
+  }
+  // Word-wise AND with `rhs'; same thread to word mapping as operator|=.
+  __device__ BitField& operator&=(BitField const& rhs) {
+    auto const word = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t const n_common = min(bits_.size(), rhs.bits_.size());
+    if (word < n_common) {
+      bits_[word] &= rhs.bits_[word];
+    }
+    return *this;
+  }
+
+  // Capacity in bits.
+  XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); }
+
+  // Atomically set the bit at `pos'.
+  __device__ void Set(value_type pos) {
+    static_assert(sizeof(unsigned long long int) == sizeof(value_type), "");
+    Pos const where = ToBitPos(pos);
+    value_type& word = bits_[where.int_pos];
+    value_type const mask = kOne << (kValueSize - where.bit_pos - kOne);
+    AtomicOr(reinterpret_cast<unsigned long long*>(&word), mask);
+  }
+  // Atomically clear the bit at `pos'.
+  __device__ void Clear(value_type pos) {
+    static_assert(sizeof(unsigned long long int) == sizeof(value_type), "");
+    Pos const where = ToBitPos(pos);
+    value_type& word = bits_[where.int_pos];
+    value_type const mask = ~(kOne << (kValueSize - where.bit_pos - kOne));
+    AtomicAnd(reinterpret_cast<unsigned long long*>(&word), mask);
+  }
+
+  // Whether the bit at the given location is set.
+  XGBOOST_DEVICE bool Check(Pos pos_v) const {
+    value_type const word = bits_[pos_v.int_pos];
+    value_type const mask = kOne << (kValueSize - pos_v.bit_pos - kOne);
+    return (word & mask) != 0;
+  }
+  // Whether the bit at flat index `pos' is set.
+  XGBOOST_DEVICE bool Check(value_type pos) const {
+    return Check(ToBitPos(pos));
+  }
+
+  // Print the storage size followed by one line of 64 binary digits per word.
+  friend std::ostream& operator<<(std::ostream& os, BitField field) {
+    auto const n_words = field.bits_.size();
+    os << "Bits " << "storage size: " << n_words << "\n";
+    for (size_t i = 0; i < n_words; ++i) {
+      os << std::bitset<BitField::kValueSize>(field.bits_[i]) << "\n";
+    }
+    return os;
+  }
+};
+
+// Debug helper: copy the storage of a device-side bit field to the host and
+// print it to stdout under the given name.
+inline void PrintDeviceBits(std::string name, BitField field) {
+  std::cout << "Bits: " << name << std::endl;
+
+  auto const n_words = field.bits_.size();
+  std::vector<BitField::value_type> h_words(n_words);
+  thrust::device_ptr<BitField::value_type> d_first(field.bits_.data());
+  thrust::device_ptr<BitField::value_type> d_last(field.bits_.data() + n_words);
+  thrust::copy(d_first, d_last, h_words.data());
+
+  // View the host copy through a BitField so operator<< can be reused.
+  BitField h_view;
+  h_view.bits_ = {h_words.data(), h_words.data() + h_words.size()};
+  std::cout << h_view;
+}
+
+// Debug helper: copy a device-side list of int32_t to the host and print its
+// elements to stdout, each followed by ", ".
+inline void PrintDeviceStorage(std::string name, common::Span<int32_t> list) {
+  std::cout << name << std::endl;
+
+  auto const n_items = list.size();
+  std::vector<int32_t> h_items(n_items);
+  thrust::device_ptr<int32_t> d_first(list.data());
+  thrust::device_ptr<int32_t> d_last(list.data() + n_items);
+  thrust::copy(d_first, d_last, h_items.data());
+
+  for (size_t i = 0; i < h_items.size(); ++i) {
+    std::cout << h_items[i] << ", ";
+  }
+  std::cout << std::endl;
+}
+
+// Feature interaction constraints built for GPU Hist updater.
+//
+// Owns the device storage (the `dh::device_vector' members) and keeps
+// non-owning views (the `common::Span' and `BitField' members) onto that
+// storage.
+//
+// NOTE(review): the defaulted copy constructor copies those views as they
+// are, so they presumably keep referring to the source object's storage --
+// confirm before copying a configured instance.
+struct FeatureInteractionConstraint {
+ protected:
+  // Whether interaction constraint is used.
+  bool has_constraint_;
+  // n interaction sets.
+  int32_t n_sets_;
+
+  // The parsed feature interaction constraints as CSR.
+  // Values: members of all sets, concatenated.
+  dh::device_vector<int32_t> d_fconstraints_;
+  common::Span<int32_t> s_fconstraints_;
+  // Row pointer: one entry per set plus the leading 0.
+  dh::device_vector<int32_t> d_fconstraints_ptr_;
+  common::Span<int32_t> s_fconstraints_ptr_;
+  /* Interaction sets for each feature as CSR.  For an input like:
+   * [[0, 1], [1, 2]], this will have values:
+   *
+   * fid:                                |0 | 1  | 2|
+   * sets a feature belongs to(d_sets_): |0 |0, 1| 1|
+   *
+   * d_sets_ptr_:                        |0, 1, 3, 4|
+   */
+  dh::device_vector<int32_t> d_sets_;
+  common::Span<int32_t> s_sets_;
+  dh::device_vector<int32_t> d_sets_ptr_;
+  common::Span<int32_t> s_sets_ptr_;
+
+  // Allowed features attached to each node, have n_nodes bitfields,
+  // each of size n_features.
+  std::vector<dh::device_vector<BitField::value_type>> node_constraints_storage_;
+  // Views onto `node_constraints_storage_', one per node.
+  std::vector<BitField> node_constraints_;
+  // Span over `node_constraints_'; the BitField objects themselves live in
+  // the host vector above.
+  common::Span<BitField> s_node_constraints_;
+
+  // buffer storing return feature list from Query, of size n_features.
+  dh::device_vector<int32_t> result_buffer_;
+  common::Span<int32_t> s_result_buffer_;
+
+  // Temp buffers, one bit for each possible feature.
+  dh::device_vector<BitField::value_type> output_buffer_bits_storage_;
+  BitField output_buffer_bits_;
+  dh::device_vector<BitField::value_type> input_buffer_bits_storage_;
+  BitField input_buffer_bits_;
+  /*
+   * Combined features from all interaction sets that one feature belongs to.
+   * For an input with [[0, 1], [1, 2]], the feature 1 belongs to sets {0, 1}
+   */
+  dh::device_vector<BitField::value_type> d_feature_buffer_storage_;
+  BitField feature_buffer_;  // of Size n features.
+
+  // Clear out all temp buffers except for `feature_buffer_', which is
+  // handled in `Split'.
+  void ClearBuffers();
+
+ public:
+  /*! \brief Size of the per-feature set pointer minus one. */
+  size_t Features() const;
+  FeatureInteractionConstraint() = default;
+  /*!
+   * \brief Parse `param.interaction_constraints' and allocate all buffers.  An
+   *        empty constraint string disables the constraint.
+   */
+  void Configure(tree::TrainParam const& param, int32_t const n_features);
+  /*! \brief Construct and call Configure with the same arguments. */
+  FeatureInteractionConstraint(tree::TrainParam const& param, int32_t const n_features);
+  FeatureInteractionConstraint(FeatureInteractionConstraint const& that) = default;
+  FeatureInteractionConstraint(FeatureInteractionConstraint&& that) = default;
+  /*! \brief Reset before constructing a new tree. */
+  void Reset();
+  /*! \brief Return a list of features given node id */
+  common::Span<int32_t> QueryNode(int32_t nid);
+  /*!
+   * \brief Return a list of selected features from given feature_list and node id.
+   *
+   * \param feature_list A list of features
+   * \param nid node id
+   *
+   * \return A list of features picked from `feature_list' that conform to constraints in
+   * node.
+   */
+  common::Span<int32_t> Query(common::Span<int32_t> feature_list, int32_t nid);
+  /*! \brief Apply split for node_id. */
+  void Split(int32_t node_id, int32_t feature_id, int32_t left_id, int32_t right_id);
+};
+
+}      // namespace xgboost
+#endif  // XGBOOST_TREE_CONSTRAINTS_H_
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -70,8 +70,13 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
  bool cache_opt;
  // whether refresh updater needs to update the leaf values
  bool refresh_leaf;
-  // auxiliary data structure
+
+  // FIXME(trivialfis): Following constraints are used by gpu
+  // algorithm, duplicated with those defined split evaluator due to
+  // their different code paths.
  std::vector<int> monotone_constraints;
+  std::string interaction_constraints;
+
  // the criteria to use for ranking splits
  std::string split_evaluator;

@@ -187,6 +192,13 @@ struct TrainParam : public dmlc::Parameter<TrainParam> {
    DMLC_DECLARE_FIELD(monotone_constraints)
        .set_default(std::vector<int>())
        .describe("Constraint of variable monotonicity");
+    DMLC_DECLARE_FIELD(interaction_constraints)
+        .set_default("")
+        .describe("Constraints for interaction representing permitted interactions. "
+                  "The constraints must be specified in the form of a nested list, "
+                  "e.g. [[0, 1], [2, 3, 4]], where each inner list is a group of "
+                  "indices of features that are allowed to interact with each other. "
+                  "See tutorial for more information.");
    DMLC_DECLARE_FIELD(split_evaluator)
        .set_default("elastic_net,monotonic,interaction")
        .describe("The criteria to use for ranking splits");
--- a/src/tree/split_evaluator.cc
+++ b/src/tree/split_evaluator.cc
@@ -6,6 +6,7 @@
 #include "split_evaluator.h"
 #include <dmlc/json.h>
 #include <dmlc/registry.h>
+#include <xgboost/logging.h>
 #include <algorithm>
 #include <unordered_set>
 #include <vector>
@@ -384,17 +385,23 @@ class InteractionConstraint final : public SplitEvaluator {
    // Read std::vector<std::vector<bst_uint>> first and then
    //   convert to std::vector<std::unordered_set<bst_uint>>
    std::vector<std::vector<bst_uint>> tmp;
-    reader.Read(&tmp);
+    try {
+      reader.Read(&tmp);
+    } catch (dmlc::Error const& e) {
+      LOG(FATAL) << "Failed to parse feature interaction constraint:\n"
+                 << params_.interaction_constraints << "\n"
+                 << "With error:\n" << e.what();
+    }
    for (const auto& e : tmp) {
      interaction_constraints_.emplace_back(e.begin(), e.end());
    }

    // Initialise interaction constraints record with all variables permitted for the first node
-    int_cont_.clear();
-    int_cont_.resize(1, std::unordered_set<bst_uint>());
-    int_cont_[0].reserve(params_.num_feature);
+    node_constraints_.clear();
+    node_constraints_.resize(1, std::unordered_set<bst_uint>());
+    node_constraints_[0].reserve(params_.num_feature);
    for (bst_uint i = 0; i < params_.num_feature; ++i) {
-      int_cont_[0].insert(i);
+      node_constraints_[0].insert(i);
    }

    // Initialise splits record
@@ -463,12 +470,12 @@ class InteractionConstraint final : public SplitEvaluator {
    splits_[rightid] = feature_splits;

    // Resize constraints record, initialise all features to be not permitted for new nodes
-    int_cont_.resize(newsize, std::unordered_set<bst_uint>());
+    node_constraints_.resize(newsize, std::unordered_set<bst_uint>());

    // Permit features used in previous splits
    for (bst_uint fid : feature_splits) {
-      int_cont_[leftid].insert(fid);
-      int_cont_[rightid].insert(fid);
+      node_constraints_[leftid].insert(fid);
+      node_constraints_[rightid].insert(fid);
    }

    // Loop across specified interactions in constraints
@@ -486,8 +493,8 @@ class InteractionConstraint final : public SplitEvaluator {
      // If interaction is still relevant, permit all other features in the interaction
      if (flag == 1) {
        for (bst_uint k : constraint) {
-          int_cont_[leftid].insert(k);
-          int_cont_[rightid].insert(k);
+          node_constraints_[leftid].insert(k);
+          node_constraints_[rightid].insert(k);
        }
      }
    }
@@ -506,7 +513,7 @@ class InteractionConstraint final : public SplitEvaluator {
  std::vector< std::unordered_set<bst_uint> > interaction_constraints_;
  // int_cont_[nid] contains the set of all feature IDs that are allowed to
  //   be used for a split at node nid
-  std::vector< std::unordered_set<bst_uint> > int_cont_;
+  std::vector< std::unordered_set<bst_uint> > node_constraints_;
  // splits_[nid] contains the set of all feature IDs that have been used for
  //   splits in node nid and its parents
  std::vector< std::unordered_set<bst_uint> > splits_;
@@ -516,7 +523,7 @@ class InteractionConstraint final : public SplitEvaluator {
  inline bool CheckInteractionConstraint(bst_uint featureid, bst_uint nodeid) const {
    // short-circuit if no constraint is specified
    return (params_.interaction_constraints.empty()
-            || int_cont_.at(nodeid).count(featureid) > 0);
+            || node_constraints_.at(nodeid).count(featureid) > 0);
  }
 };

--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -24,6 +24,7 @@
 #include "../common/span.h"
 #include "param.h"
 #include "updater_gpu_common.cuh"
+#include "constraints.cuh"

 namespace xgboost {
 namespace tree {
@@ -318,9 +319,8 @@ __device__ void EvaluateFeature(

 template <int BLOCK_THREADS, typename GradientSumT>
 __global__ void EvaluateSplitKernel(
-    common::Span<const GradientSumT>
-        node_histogram,               // histogram for gradients
-    common::Span<const int> feature_set,  // Selected features
+    common::Span<const GradientSumT> node_histogram,  // histogram for gradients
+    common::Span<const int> feature_set,              // Selected features
    DeviceNodeStats node,
    ELLPackMatrix matrix,
    GPUTrainingParam gpu_param,
@@ -354,6 +354,7 @@ __global__ void EvaluateSplitKernel(

  // One block for each feature. Features are sampled, so fidx != blockIdx.x
  int fidx = feature_set[blockIdx.x];
+
  int constraint = d_monotonic_constraints[fidx];
  EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>(
      fidx, node_histogram, matrix, &best_split, node, gpu_param, &temp_storage,
@@ -714,6 +715,7 @@ struct DeviceShard {
  common::Monitor monitor;
  std::vector<ValueConstraint> node_value_constraints;
  common::ColumnSampler column_sampler;
+  FeatureInteractionConstraint interaction_constraints;

  using ExpandQueue =
      std::priority_queue<ExpandEntry, std::vector<ExpandEntry>,
@@ -721,7 +723,8 @@ struct DeviceShard {
  std::unique_ptr<ExpandQueue> qexpand;

  DeviceShard(int _device_id, int shard_idx, bst_uint row_begin,
-              bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed)
+              bst_uint row_end, TrainParam _param, uint32_t column_sampler_seed,
+              uint32_t n_features)
      : device_id(_device_id),
        shard_idx(shard_idx),
        row_begin_idx(row_begin),
@@ -730,7 +733,8 @@ struct DeviceShard {
        n_bins(0),
        param(std::move(_param)),
        prediction_cache_initialised(false),
-        column_sampler(column_sampler_seed) {
+        column_sampler(column_sampler_seed),
+        interaction_constraints(param, n_features) {
    monitor.Init(std::string("DeviceShard") + std::to_string(device_id));
  }

@@ -778,6 +782,8 @@ struct DeviceShard {
    this->column_sampler.Init(num_columns, param.colsample_bynode,
      param.colsample_bylevel, param.colsample_bytree);
    dh::safe_cuda(cudaSetDevice(device_id));
+    this->interaction_constraints.Reset();
+
    thrust::fill(
        thrust::device_pointer_cast(position.Current()),
        thrust::device_pointer_cast(position.Current() + position.Size()), 0);
@@ -806,7 +812,7 @@ struct DeviceShard {
      std::vector<int> nidxs, const RegTree& tree,
      size_t num_columns) {
    dh::safe_cuda(cudaSetDevice(device_id));
-    auto result = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());
+    auto result_all = pinned_memory.GetSpan<DeviceSplitCandidate>(nidxs.size());

    // Work out cub temporary memory requirement
    GPUTrainingParam gpu_param(param);
@@ -840,11 +846,26 @@ struct DeviceShard {
      auto nidx = nidxs[i];
      auto p_feature_set = column_sampler.GetFeatureSet(tree.GetDepth(nidx));
      p_feature_set->Shard(GPUSet(device_id, 1));
-      auto d_feature_set = p_feature_set->DeviceSpan(device_id);
+      auto d_sampled_features = p_feature_set->DeviceSpan(device_id);
+      common::Span<int32_t> d_feature_set =
+          interaction_constraints.Query(d_sampled_features, nidx);
      auto d_split_candidates =
          d_split_candidates_all.subspan(i * num_columns, d_feature_set.size());
+
      DeviceNodeStats node(node_sum_gradients[nidx], nidx, param);

+      auto d_result = d_result_all.subspan(i, 1);
+      if (d_feature_set.size() == 0) {
+        // Acting as a device side constructor for DeviceSplitCandidate.
+        // DeviceSplitCandidate::IsValid is false so that ApplySplit can reject this
+        // candidate.
+        auto worst_candidate = DeviceSplitCandidate();
+        dh::safe_cuda(cudaMemcpyAsync(d_result.data(), &worst_candidate,
+                                      sizeof(DeviceSplitCandidate),
+                                      cudaMemcpyHostToDevice));
+        continue;
+      }
+
      // One block for each feature
      int constexpr kBlockThreads = 256;
      EvaluateSplitKernel<kBlockThreads, GradientSumT>
@@ -854,7 +875,6 @@ struct DeviceShard {
              monotone_constraints);

      // Reduce over features to find best feature
-      auto d_result = d_result_all.subspan(i, 1);
      auto d_cub_memory =
          d_cub_memory_all.subspan(i * cub_memory_size, cub_memory_size);
      size_t cub_bytes = d_cub_memory.size() * sizeof(DeviceSplitCandidate);
@@ -864,11 +884,10 @@ struct DeviceShard {
                                DeviceSplitCandidate(), streams[i]);
    }

-    dh::safe_cuda(cudaMemcpy(result.data(), d_result_all.data(),
+    dh::safe_cuda(cudaMemcpy(result_all.data(), d_result_all.data(),
                             sizeof(DeviceSplitCandidate) * d_result_all.size(),
                             cudaMemcpyDeviceToHost));
-
-    return std::vector<DeviceSplitCandidate>(result.begin(), result.end());
+    return std::vector<DeviceSplitCandidate>(result_all.begin(), result_all.end());
  }

  void BuildHist(int nidx) {
@@ -1137,6 +1156,10 @@ struct DeviceShard {
        candidate.split.left_sum;
    node_sum_gradients[tree[candidate.nid].RightChild()] =
        candidate.split.right_sum;
+
+    interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(),
+                                  tree[candidate.nid].LeftChild(),
+                                  tree[candidate.nid].RightChild());
  }

  void InitRoot(RegTree* p_tree, HostDeviceVector<GradientPair>* gpair_all,
@@ -1202,7 +1225,7 @@ struct DeviceShard {
      int right_child_nidx = tree[candidate.nid].RightChild();
      // Only create child entries if needed
      if (ExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx),
-        num_leaves)) {
+                                    num_leaves)) {
        monitor.StartCuda("UpdatePosition");
        this->UpdatePosition(candidate.nid, (*p_tree)[candidate.nid]);
        monitor.StopCuda("UpdatePosition");
@@ -1487,7 +1510,8 @@ class GPUHistMakerSpecialised {
          shard = std::unique_ptr<DeviceShard<GradientSumT>>(
            new DeviceShard<GradientSumT>(dist_.Devices().DeviceId(idx), idx,
                                          start, start + size, param_,
-                                          column_sampling_seed));
+                                          column_sampling_seed,
+                                          info_->num_col_));
        });

    monitor_.StartCuda("Quantiles");