Export c++ headers in CMake installation. (#4897)

* Move get transpose into cc. * Clean up headers in host device vector, remove thrust dependency. * Move span and host device vector into public. * Install c++ headers. * Short notes for c and c++. Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
2019-10-06 23:53:09 -04:00
parent 4ab1df5fe6
commit 095de3bf5f
55 changed files with 240 additions and 209 deletions
--- a/src/common/bitfield.h
+++ b/src/common/bitfield.h
@@ -13,7 +13,12 @@
 #include <string>
 #include <vector>

-#include "span.h"
+#if defined(__CUDACC__)
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#endif  // defined(__CUDACC__)
+
+#include "xgboost/span.h"

 namespace xgboost {

--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -11,8 +11,10 @@
 #include <rabit/rabit.h>
 #include <cub/util_allocator.cuh>

+#include "xgboost/host_device_vector.h"
+#include "xgboost/span.h"
+
 #include "common.h"
-#include "span.h"

 #include <algorithm>
 #include <omp.h>
@@ -1132,6 +1134,27 @@ xgboost::common::Span<T> ToSpan(thrust::device_vector<T>& vec,
  return ToSpan(vec, static_cast<IndexT>(offset), static_cast<IndexT>(size));
 }

+// thrust begin, similar to std::begin
+template <typename T>
+thrust::device_ptr<T> tbegin(xgboost::HostDeviceVector<T>& vector) {  // NOLINT
+  return thrust::device_ptr<T>(vector.DevicePointer());
+}
+
+template <typename T>
+thrust::device_ptr<T> tend(xgboost::HostDeviceVector<T>& vector) {  // NOLINT
+  return tbegin(vector) + vector.Size();
+}
+
+template <typename T>
+thrust::device_ptr<T const> tcbegin(xgboost::HostDeviceVector<T> const& vector) {
+  return thrust::device_ptr<T const>(vector.ConstDevicePointer());
+}
+
+template <typename T>
+thrust::device_ptr<T const> tcend(xgboost::HostDeviceVector<T> const& vector) {
+  return tcbegin(vector) + vector.Size();
+}
+
 template <typename FunctionT>
 class LauncherItr {
 public:
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -8,6 +8,8 @@
 #include <dmlc/omp.h>
 #include <numeric>
 #include <vector>
+
+#include "../common/common.h"
 #include "./random.h"
 #include "./column_matrix.h"
 #include "./quantile.h"
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -2,7 +2,6 @@
 * Copyright 2018 XGBoost contributors
 */

-#include "./hist_util.h"
 #include <xgboost/logging.h>

 #include <thrust/copy.h>
@@ -17,10 +16,11 @@
 #include <memory>
 #include <mutex>

+#include "hist_util.h"
+#include "xgboost/host_device_vector.h"
+#include "device_helpers.cuh"
+#include "quantile.h"
 #include "../tree/param.h"
-#include "./host_device_vector.h"
-#include "./device_helpers.cuh"
-#include "./quantile.h"

 namespace xgboost {
 namespace common {
--- a/src/common/host_device_vector.cc
+++ b/src/common/host_device_vector.cc
@@ -9,7 +9,7 @@
 #include <xgboost/data.h>
 #include <cstdint>
 #include <utility>
-#include "./host_device_vector.h"
+#include "xgboost/host_device_vector.h"

 namespace xgboost {

--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -2,13 +2,16 @@
 * Copyright 2017 XGBoost contributors
 */

-#include "./host_device_vector.h"
 #include <thrust/fill.h>
-#include <xgboost/data.h>
+#include <thrust/device_ptr.h>
+
 #include <algorithm>
 #include <cstdint>
 #include <mutex>
-#include "./device_helpers.cuh"
+
+#include "xgboost/data.h"
+#include "xgboost/host_device_vector.h"
+#include "device_helpers.cuh"

 namespace xgboost {

@@ -75,22 +78,6 @@ class HostDeviceVectorImpl {
    return {data_d_.data().get(), static_cast<SpanInd>(Size())};
  }

-  thrust::device_ptr<T> tbegin() {  // NOLINT
-    return thrust::device_ptr<T>(DevicePointer());
-  }
-
-  thrust::device_ptr<const T> tcbegin() {  // NOLINT
-    return thrust::device_ptr<const T>(ConstDevicePointer());
-  }
-
-  thrust::device_ptr<T> tend() {  // NOLINT
-    return tbegin() + Size();
-  }
-
-  thrust::device_ptr<const T> tcend() {  // NOLINT
-    return tcbegin() + Size();
-  }
-
  void Fill(T v) {  // NOLINT
    if (HostCanWrite()) {
      std::fill(data_h_.begin(), data_h_.end(), v);
@@ -304,26 +291,6 @@ common::Span<const T> HostDeviceVector<T>::ConstDeviceSpan() const {
  return impl_->ConstDeviceSpan();
 }

-template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tbegin() {  // NOLINT
-  return impl_->tbegin();
-}
-
-template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcbegin() const {  // NOLINT
-  return impl_->tcbegin();
-}
-
-template <typename T>
-thrust::device_ptr<T> HostDeviceVector<T>::tend() {  // NOLINT
-  return impl_->tend();
-}
-
-template <typename T>
-thrust::device_ptr<const T> HostDeviceVector<T>::tcend() const {  // NOLINT
-  return impl_->tcend();
-}
-
 template <typename T>
 void HostDeviceVector<T>::Fill(T v) {
  impl_->Fill(v);
--- a/src/common/host_device_vector.h
+++ b/src/common/host_device_vector.h
@@ -1,158 +0,0 @@
-/*!
- * Copyright 2017-2019 XGBoost contributors
- */
-
-/**
- * @file host_device_vector.h
- * @brief A device-and-host vector abstraction layer.
- *
- * Why HostDeviceVector?<br/>
- * With CUDA, one has to explicitly manage memory through 'cudaMemcpy' calls.
- * This wrapper class hides this management from the users, thereby making it
- * easy to integrate GPU/CPU usage under a single interface.
- *
- * Initialization/Allocation:<br/>
- * One can choose to initialize the vector on CPU or GPU during constructor.
- * (use the 'devices' argument) Or, can choose to use the 'Resize' method to
- * allocate/resize memory explicitly, and use the 'SetDevice' method
- * to specify the device.
- *
- * Accessing underlying data:<br/>
- * Use 'HostVector' method to explicitly query for the underlying std::vector.
- * If you need the raw device pointer, use the 'DevicePointer' method. For perf
- * implications of these calls, see below.
- *
- * Accessing underling data and their perf implications:<br/>
- * There are 4 scenarios to be considered here:
- * HostVector and data on CPU --> no problems, std::vector returned immediately
- * HostVector but data on GPU --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to HostVector, will NOT incur this penalty.
- *                        (assuming 'DevicePointer' is not called in between)
- * DevicePointer but data on CPU  --> this causes a cudaMemcpy to be issued internally.
- *                        subsequent calls to DevicePointer, will NOT incur this penalty.
- *                        (assuming 'HostVector' is not called in between)
- * DevicePointer and data on GPU  --> no problems, the device ptr
- *                        will be returned immediately.
- *
- * What if xgboost is compiled without CUDA?<br/>
- * In that case, there's a special implementation which always falls-back to
- * working with std::vector. This logic can be found in host_device_vector.cc
- *
- * Why not consider CUDA unified memory?<br/>
- * We did consider. However, it poses complications if we need to support both
- * compiling with and without CUDA toolkit. It was easier to have
- * 'HostDeviceVector' with a special-case implementation in host_device_vector.cc
- *
- * @note: Size and Devices methods are thread-safe.
- * DevicePointer, DeviceStart, DeviceSize, tbegin and tend methods are thread-safe
- * if different threads call these methods with different values of the device argument.
- * All other methods are not thread safe.
- */
-
-#ifndef XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
-#define XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
-
-#include <dmlc/logging.h>
-
-#include <algorithm>
-#include <cstdlib>
-#include <initializer_list>
-#include <utility>
-#include <vector>
-
-#include "common.h"
-#include "span.h"
-
-// only include thrust-related files if host_device_vector.h
-// is included from a .cu file
-#ifdef __CUDACC__
-#include <thrust/device_ptr.h>
-#endif  // __CUDACC__
-
-namespace xgboost {
-
-#ifdef __CUDACC__
-// Sets a function to call instead of cudaSetDevice();
-// only added for testing
-void SetCudaSetDeviceHandler(void (*handler)(int));
-#endif  // __CUDACC__
-
-template <typename T> struct HostDeviceVectorImpl;
-
-/*!
- * \brief Controls data access from the GPU.
- *
- * Since a `HostDeviceVector` can have data on both the host and device, access control needs to be
- * maintained to keep the data consistent.
- *
- * There are 3 scenarios supported:
- *   - Data is being manipulated on device. GPU has write access, host doesn't have access.
- *   - Data is read-only on both the host and device.
- *   - Data is being manipulated on the host. Host has write access, device doesn't have access.
- */
-enum GPUAccess {
-  kNone, kRead,
-  // write implies read
-  kWrite
-};
-
-template <typename T>
-class HostDeviceVector {
- public:
-  explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1);
-  HostDeviceVector(std::initializer_list<T> init, int device = -1);
-  explicit HostDeviceVector(const std::vector<T>& init, int device = -1);
-  ~HostDeviceVector();
-  HostDeviceVector(const HostDeviceVector<T>&);
-  HostDeviceVector<T>& operator=(const HostDeviceVector<T>&);
-  size_t Size() const;
-  int DeviceIdx() const;
-  common::Span<T> DeviceSpan();
-  common::Span<const T> ConstDeviceSpan() const;
-  common::Span<const T> DeviceSpan() const { return ConstDeviceSpan(); }
-  T* DevicePointer();
-  const T* ConstDevicePointer() const;
-  const T* DevicePointer() const { return ConstDevicePointer(); }
-
-  T* HostPointer() { return HostVector().data(); }
-  const T* ConstHostPointer() const { return ConstHostVector().data(); }
-  const T* HostPointer() const { return ConstHostPointer(); }
-
-  // only define functions returning device_ptr
-  // if HostDeviceVector.h is included from a .cu file
-#ifdef __CUDACC__
-  thrust::device_ptr<T> tbegin();  // NOLINT
-  thrust::device_ptr<T> tend();  // NOLINT
-  thrust::device_ptr<const T> tcbegin() const;  // NOLINT
-  thrust::device_ptr<const T> tcend() const;  // NOLINT
-  thrust::device_ptr<const T> tbegin() const {  // NOLINT
-    return tcbegin();
-  }
-  thrust::device_ptr<const T> tend() const { return tcend(); }  // NOLINT
-#endif  // __CUDACC__
-
-  void Fill(T v);
-  void Copy(const HostDeviceVector<T>& other);
-  void Copy(const std::vector<T>& other);
-  void Copy(std::initializer_list<T> other);
-
-  std::vector<T>& HostVector();
-  const std::vector<T>& ConstHostVector() const;
-  const std::vector<T>& HostVector() const {return ConstHostVector(); }
-
-  bool HostCanRead() const;
-  bool HostCanWrite() const;
-  bool DeviceCanRead() const;
-  bool DeviceCanWrite() const;
-
-  void SetDevice(int device) const;
-
-  void Resize(size_t new_size, T v = T());
-
- private:
-  HostDeviceVectorImpl<T>* impl_;
-};
-
-}  // namespace xgboost
-
-#endif  // XGBOOST_COMMON_HOST_DEVICE_VECTOR_H_
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -17,8 +17,8 @@
 #include <numeric>
 #include <random>

+#include "xgboost/host_device_vector.h"
 #include "io.h"
-#include "host_device_vector.h"

 namespace xgboost {
 namespace common {
@@ -113,7 +113,7 @@ class ColumnSampler {
  }

 public:
-  /** 
+  /**
   * \brief Column sampler constructor.
   * \note This constructor manually sets the rng seed
   */
@@ -169,7 +169,7 @@ class ColumnSampler {

  /**
   * \brief Samples a feature set.
-   * 
+   *
   * \param depth The tree depth of the node at which to sample.
   * \return The sampled feature set.
   * \note If colsample_bynode_ < 1.0, this method creates a new feature set each time it
--- a/src/common/span.h
+++ b/src/common/span.h
@@ -1,640 +0,0 @@
-/*!
- * Copyright 2018 XGBoost contributors
- * \brief span class based on ISO++20 span
- *
- * About NOLINTs in this file:
- *
- *   If we want Span to work with std interface, like range for loop, the
- *   naming must be consistant with std, not XGBoost. Also, the interface also
- *   conflicts with XGBoost coding style, specifically, the use of `explicit'
- *   keyword.
- *
- *
- * Some of the code is copied from Guidelines Support Library, here is the
- * license:
- *
- * Copyright (c) 2015 Microsoft Corporation. All rights reserved.
- *
- * This code is licensed under the MIT License (MIT).
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-
-#ifndef XGBOOST_COMMON_SPAN_H_
-#define XGBOOST_COMMON_SPAN_H_
-
-#include <xgboost/logging.h>  // CHECK
-
-#include <cinttypes>          // int64_t
-#include <type_traits>
-
-/*!
- * The version number 1910 is picked up from GSL.
- *
- * We might want to use MOODYCAMEL_NOEXCEPT from dmlc/concurrentqueue.h. But
- * there are a lot more definitions in that file would cause warnings/troubles
- * in MSVC 2013. Currently we try to keep the closure of Span as minimal as
- * possible.
- *
- * There are other workarounds for MSVC, like _Unwrapped, _Verify_range ...
- * Some of these are hiden magics of MSVC and I tried to avoid them. Should any
- * of them become needed, please consult the source code of GSL, and possibily
- * some explanations from this thread:
- *
- *   https://github.com/Microsoft/GSL/pull/664
- *
- * TODO(trivialfis): Group these MSVC workarounds into a manageable place.
- */
-#if defined(_MSC_VER) && _MSC_VER < 1910
-
-#define __span_noexcept
-
-#pragma push_macro("constexpr")
-#define constexpr /*constexpr*/
-
-#else
-
-#define __span_noexcept noexcept
-
-#endif  // defined(_MSC_VER) && _MSC_VER < 1910
-
-namespace xgboost {
-namespace common {
-
-// Usual logging facility is not available inside device code.
-// TODO(trivialfis): Make dmlc check more generic.
-// assert is not supported in mac as of CUDA 10.0
-#define KERNEL_CHECK(cond)                                      \
-  do {                                                          \
-    if (!(cond)) {                                              \
-      printf("\nKernel error:\n"                                \
-             "In: %s: %d\n"                                     \
-             "\t%s\n\tExpecting: %s\n"                          \
-             "\tBlock: [%d, %d, %d], Thread: [%d, %d, %d]\n\n", \
-             __FILE__, __LINE__, __PRETTY_FUNCTION__, #cond,    \
-             blockIdx.x, blockIdx.y, blockIdx.z,                \
-             threadIdx.x, threadIdx.y, threadIdx.z);            \
-      asm("trap;");                                             \
-    }                                                           \
-  } while (0);
-
-#ifdef __CUDA_ARCH__
-#define SPAN_CHECK KERNEL_CHECK
-#else
-#define SPAN_CHECK CHECK  // check from dmlc
-#endif  // __CUDA_ARCH__
-
-namespace detail {
-/*!
- * By default, XGBoost uses uint32_t for indexing data. int64_t covers all
- *   values uint32_t can represent. Also, On x86-64 Linux, GCC uses long int to
- *   represent ptrdiff_t, which is just int64_t. So we make it determinstic
- *   here.
- */
-using ptrdiff_t = int64_t;  // NOLINT
-}  // namespace detail
-
-#if defined(_MSC_VER) && _MSC_VER < 1910
-constexpr const detail::ptrdiff_t dynamic_extent = -1;  // NOLINT
-#else
-constexpr detail::ptrdiff_t dynamic_extent = -1;  // NOLINT
-#endif  // defined(_MSC_VER) && _MSC_VER < 1910
-
-enum class byte : unsigned char {};  // NOLINT
-
-template <class ElementType, detail::ptrdiff_t Extent>
-class Span;
-
-namespace detail {
-
-template <typename SpanType, bool IsConst>
-class SpanIterator {
-  using ElementType = typename SpanType::element_type;
-
- public:
-  using iterator_category = std::random_access_iterator_tag;      // NOLINT
-  using value_type = typename std::remove_cv<ElementType>::type;  // NOLINT
-  using difference_type = typename SpanType::index_type;          // NOLINT
-
-  using reference = typename std::conditional<                    // NOLINT
-    IsConst, const ElementType, ElementType>::type&;
-  using pointer = typename std::add_pointer<reference>::type;     // NOLINT
-
-  XGBOOST_DEVICE constexpr SpanIterator() : span_{nullptr}, index_{0} {}
-
-  XGBOOST_DEVICE constexpr SpanIterator(
-      const SpanType* _span,
-      typename SpanType::index_type _idx) __span_noexcept :
-                                           span_(_span), index_(_idx) {}
-
-  friend SpanIterator<SpanType, true>;
-  template <bool B, typename std::enable_if<!B && IsConst>::type* = nullptr>
-  XGBOOST_DEVICE constexpr SpanIterator(                         // NOLINT
-      const SpanIterator<SpanType, B>& other_) __span_noexcept
-      : SpanIterator(other_.span_, other_.index_) {}
-
-  XGBOOST_DEVICE reference operator*() const {
-    SPAN_CHECK(index_ < span_->size());
-    return *(span_->data() + index_);
-  }
-  XGBOOST_DEVICE reference operator[](difference_type n) const {
-    return *(*this + n);
-  }
-
-  XGBOOST_DEVICE pointer operator->() const {
-    SPAN_CHECK(index_ != span_->size());
-    return span_->data() + index_;
-  }
-
-  XGBOOST_DEVICE SpanIterator& operator++() {
-    SPAN_CHECK(0 <= index_ && index_ != span_->size());
-    index_++;
-    return *this;
-  }
-
-  XGBOOST_DEVICE SpanIterator operator++(int) {
-    auto ret = *this;
-    ++(*this);
-    return ret;
-  }
-
-  XGBOOST_DEVICE SpanIterator& operator--() {
-    SPAN_CHECK(index_ != 0 && index_ <= span_->size());
-    index_--;
-    return *this;
-  }
-
-  XGBOOST_DEVICE SpanIterator operator--(int) {
-    auto ret = *this;
-    --(*this);
-    return ret;
-  }
-
-  XGBOOST_DEVICE SpanIterator operator+(difference_type n) const {
-    auto ret = *this;
-    return ret += n;
-  }
-
-  XGBOOST_DEVICE SpanIterator& operator+=(difference_type n) {
-    SPAN_CHECK((index_ + n) >= 0 && (index_ + n) <= span_->size());
-    index_ += n;
-    return *this;
-  }
-
-  XGBOOST_DEVICE difference_type operator-(SpanIterator rhs) const {
-    SPAN_CHECK(span_ == rhs.span_);
-    return index_ - rhs.index_;
-  }
-
-  XGBOOST_DEVICE SpanIterator operator-(difference_type n) const {
-    auto ret = *this;
-    return ret -= n;
-  }
-
-  XGBOOST_DEVICE SpanIterator& operator-=(difference_type n) {
-    return *this += -n;
-  }
-
-  // friends
-  XGBOOST_DEVICE constexpr friend bool operator==(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return _lhs.span_ == _rhs.span_ && _lhs.index_ == _rhs.index_;
-  }
-
-  XGBOOST_DEVICE constexpr friend bool operator!=(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return !(_lhs == _rhs);
-  }
-
-  XGBOOST_DEVICE constexpr friend bool operator<(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return _lhs.index_ < _rhs.index_;
-  }
-
-  XGBOOST_DEVICE constexpr friend bool operator<=(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return !(_rhs < _lhs);
-  }
-
-  XGBOOST_DEVICE constexpr friend bool operator>(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return _rhs < _lhs;
-  }
-
-  XGBOOST_DEVICE constexpr friend bool operator>=(
-      SpanIterator _lhs, SpanIterator _rhs) __span_noexcept {
-    return !(_rhs > _lhs);
-  }
-
- protected:
-  const SpanType *span_;
-  detail::ptrdiff_t index_;
-};
-
-
-// It's tempting to use constexpr instead of structs to do the following meta
-// programming. But remember that we are supporting MSVC 2013 here.
-
-/*!
- * The extent E of the span returned by subspan is determined as follows:
- *
- *   - If Count is not dynamic_extent, Count;
- *   - Otherwise, if Extent is not dynamic_extent, Extent - Offset;
- *   - Otherwise, dynamic_extent.
- */
-template <detail::ptrdiff_t Extent,
-          detail::ptrdiff_t Offset,
-          detail::ptrdiff_t Count>
-struct ExtentValue : public std::integral_constant<
-  detail::ptrdiff_t, Count != dynamic_extent ?
-  Count : (Extent != dynamic_extent ? Extent - Offset : Extent)> {};
-
-/*!
- * If N is dynamic_extent, the extent of the returned span E is also
- * dynamic_extent; otherwise it is detail::ptrdiff_t(sizeof(T)) * N.
- */
-template <typename T, detail::ptrdiff_t Extent>
-struct ExtentAsBytesValue : public std::integral_constant<
-  detail::ptrdiff_t,
-  Extent == dynamic_extent ?
-  Extent : static_cast<detail::ptrdiff_t>(sizeof(T) * Extent)> {};
-
-template <detail::ptrdiff_t From, detail::ptrdiff_t To>
-struct IsAllowedExtentConversion : public std::integral_constant<
-  bool, From == To || From == dynamic_extent || To == dynamic_extent> {};
-
-template <class From, class To>
-struct IsAllowedElementTypeConversion : public std::integral_constant<
-  bool, std::is_convertible<From(*)[], To(*)[]>::value> {};
-
-template <class T>
-struct IsSpanOracle : std::false_type {};
-
-template <class T, detail::ptrdiff_t Extent>
-struct IsSpanOracle<Span<T, Extent>> : std::true_type {};
-
-template <class T>
-struct IsSpan : public IsSpanOracle<typename std::remove_cv<T>::type> {};
-
-// Re-implement std algorithms here to adopt CUDA.
-template <typename T>
-struct Less {
-  XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {
-    return _x < _y;
-  }
-};
-
-template <typename T>
-struct Greater {
-  XGBOOST_DEVICE constexpr bool operator()(const T& _x, const T& _y) const {
-    return _x > _y;
-  }
-};
-
-template <class InputIt1, class InputIt2,
-          class Compare =
-          detail::Less<decltype(std::declval<InputIt1>().operator*())>>
-XGBOOST_DEVICE bool LexicographicalCompare(InputIt1 first1, InputIt1 last1,
-                                            InputIt2 first2, InputIt2 last2) {
-  Compare comp;
-  for (; first1 != last1 && first2 != last2; ++first1, ++first2) {
-    if (comp(*first1, *first2)) {
-      return true;
-    }
-    if (comp(*first2, *first1)) {
-      return false;
-    }
-  }
-  return first1 == last1 && first2 != last2;
-}
-
-}  // namespace detail
-
-
-/*!
- * \brief span class implementation, based on ISO++20 span<T>. The interface
- *      should be the same.
- *
- * What's different from span<T> in Guidelines Support Library (GSL)
- *
- *    Interface might be slightly different, we stick with ISO.
- *
- *    GSL uses C++14/17 features, which are not available here.
- *    GSL uses constexpr extensively, which is not possibile with limitation
- *      of C++11.
- *    GSL doesn't concern about CUDA.
- *
- *    GSL is more thoroughly implemented and tested.
- *    GSL is more optimized, especially for static extent.
- *
- *    GSL uses __buildin_unreachable() when error, Span<T> uses dmlc LOG and
- *      customized CUDA logging.
- *
- *
- * What's different from span<T> in ISO++20 (ISO)
- *
- *    ISO uses functions/structs from std library, which might be not available
- *      in CUDA.
- *    Initializing from std::array is not supported.
- *
- *    ISO uses constexpr extensively, which is not possibile with limitation
- *      of C++11.
- *    ISO uses C++14/17 features, which is not available here.
- *    ISO doesn't concern about CUDA.
- *
- *    ISO uses std::terminate(), Span<T> uses dmlc LOG and customized CUDA
- *      logging.
- *
- *
- * Limitations:
- *    With thrust:
- *       It's not adviced to initialize Span with host_vector directly, since
- *         host_vector::data() is a host function.
- *       It's not possible to initialize Span with device_vector directly, since
- *         device_vector::data() returns a wrapped pointer.
- *       It's unclear that what kind of thrust algorithm can be used without
- *         memory error. See the test case "GPUSpan.WithTrust"
- *
- *    Pass iterator to kernel:
- *       Not possible. Use subspan instead.
- *
- *       The underlying Span in SpanIterator is a pointer, but CUDA pass kernel
- *       parameter by value.  If we were to hold a Span value instead of a
- *       pointer, the following snippet will crash, violating the safety
- *       purpose of Span:
- *
- *       \code{.cpp}
- *       Span<float> span {arr_a};
- *       auto beg = span.begin();
- *
- *       Span<float> span_b = arr_b;
- *       span = span_b;
- *
- *       delete arr_a;
- *       beg++;                 // crash
- *       \endcode
- *
- *       While hoding a pointer or reference should avoid the problem, its a
- *       compromise. Since we have subspan, it's acceptable not to support
- *       passing iterator.
- */
-template <typename T,
-          detail::ptrdiff_t Extent = dynamic_extent>
-class Span {
- public:
-  using element_type = T;                               // NOLINT
-  using value_type = typename std::remove_cv<T>::type;  // NOLINT
-  using index_type = detail::ptrdiff_t;                 // NOLINT
-  using difference_type = detail::ptrdiff_t;            // NOLINT
-  using pointer = T*;                                   // NOLINT
-  using reference = T&;                                 // NOLINT
-
-  using iterator = detail::SpanIterator<Span<T, Extent>, false>;             // NOLINT
-  using const_iterator = const detail::SpanIterator<Span<T, Extent>, true>;  // NOLINT
-  using reverse_iterator = detail::SpanIterator<Span<T, Extent>, false>;     // NOLINT
-  using const_reverse_iterator = const detail::SpanIterator<Span<T, Extent>, true>;  // NOLINT
-
-  // constructors
-
-  XGBOOST_DEVICE constexpr Span() __span_noexcept : size_(0), data_(nullptr) {}
-
-  XGBOOST_DEVICE Span(pointer _ptr, index_type _count) :
-      size_(_count), data_(_ptr) {
-    SPAN_CHECK(_count >= 0);
-    SPAN_CHECK(_ptr || _count == 0);
-  }
-
-  XGBOOST_DEVICE Span(pointer _first, pointer _last) :
-      size_(_last - _first), data_(_first) {
-    SPAN_CHECK(size_ >= 0);
-    SPAN_CHECK(data_ || size_ == 0);
-  }
-
-  template <std::size_t N>
-  XGBOOST_DEVICE constexpr Span(element_type (&arr)[N])  // NOLINT
-      __span_noexcept : size_(N), data_(&arr[0]) {}
-
-  template <class Container,
-            class = typename std::enable_if<
-              !std::is_const<element_type>::value && !detail::IsSpan<Container>::value &&
-              std::is_convertible<typename Container::pointer,
-                                  pointer>::value &&
-              std::is_convertible<
-                typename Container::pointer,
-                decltype(std::declval<Container>().data())>::value>>
-  XGBOOST_DEVICE Span(Container& _cont) :  // NOLINT
-      size_(_cont.size()), data_(_cont.data()) {}
-
-  template <class Container,
-            class = typename std::enable_if<
-              std::is_const<element_type>::value && !detail::IsSpan<Container>::value &&
-              std::is_convertible<typename Container::pointer, pointer>::value &&
-              std::is_convertible<
-                typename Container::pointer,
-                decltype(std::declval<Container>().data())>::value>>
-  XGBOOST_DEVICE Span(const Container& _cont) : size_(_cont.size()),  // NOLINT
-                                                data_(_cont.data()) {}
-
-  template <class U, detail::ptrdiff_t OtherExtent,
-            class = typename std::enable_if<
-              detail::IsAllowedElementTypeConversion<U, T>::value &&
-              detail::IsAllowedExtentConversion<OtherExtent, Extent>::value>>
-  XGBOOST_DEVICE constexpr Span(const Span<U, OtherExtent>& _other)   // NOLINT
-      __span_noexcept : size_(_other.size()), data_(_other.data()) {}
-
-  XGBOOST_DEVICE constexpr Span(const Span& _other)
-      __span_noexcept : size_(_other.size()), data_(_other.data()) {}
-
-  XGBOOST_DEVICE Span& operator=(const Span& _other) __span_noexcept {
-    size_ = _other.size();
-    data_ = _other.data();
-    return *this;
-  }
-
-  XGBOOST_DEVICE ~Span() __span_noexcept {};  // NOLINT
-
-  XGBOOST_DEVICE constexpr iterator begin() const __span_noexcept {  // NOLINT
-    return {this, 0};
-  }
-
-  XGBOOST_DEVICE constexpr iterator end() const __span_noexcept {    // NOLINT
-    return {this, size()};
-  }
-
-  XGBOOST_DEVICE constexpr const_iterator cbegin() const __span_noexcept {  // NOLINT
-    return {this, 0};
-  }
-
-  XGBOOST_DEVICE constexpr const_iterator cend() const __span_noexcept {    // NOLINT
-    return {this, size()};
-  }
-
-  XGBOOST_DEVICE constexpr reverse_iterator rbegin() const __span_noexcept {  // NOLINT
-    return reverse_iterator{end()};
-  }
-
-  XGBOOST_DEVICE constexpr reverse_iterator rend() const __span_noexcept {    // NOLINT
-    return reverse_iterator{begin()};
-  }
-
-  XGBOOST_DEVICE constexpr const_reverse_iterator crbegin() const __span_noexcept {  // NOLINT
-    return const_reverse_iterator{cend()};
-  }
-
-  XGBOOST_DEVICE constexpr const_reverse_iterator crend() const __span_noexcept {    // NOLINT
-    return const_reverse_iterator{cbegin()};
-  }
-
-  XGBOOST_DEVICE reference operator[](index_type _idx) const {
-    SPAN_CHECK(_idx >= 0 && _idx < size());
-    return data()[_idx];
-  }
-
-  XGBOOST_DEVICE reference operator()(index_type _idx) const {
-    return this->operator[](_idx);
-  }
-
-  XGBOOST_DEVICE constexpr pointer data() const __span_noexcept {   // NOLINT
-    return data_;
-  }
-
-  // Observers
-  XGBOOST_DEVICE constexpr index_type size() const __span_noexcept {  // NOLINT
-    return size_;
-  }
-  XGBOOST_DEVICE constexpr index_type size_bytes() const __span_noexcept {  // NOLINT
-    return size() * sizeof(T);
-  }
-
-  XGBOOST_DEVICE constexpr bool empty() const __span_noexcept {  // NOLINT
-    return size() == 0;
-  }
-
-  // Subviews
-  template <detail::ptrdiff_t Count >
-  XGBOOST_DEVICE Span<element_type, Count> first() const {  // NOLINT
-    SPAN_CHECK(Count >= 0 && Count <= size());
-    return {data(), Count};
-  }
-
-  XGBOOST_DEVICE Span<element_type, dynamic_extent> first(  // NOLINT
-      detail::ptrdiff_t _count) const {
-    SPAN_CHECK(_count >= 0 && _count <= size());
-    return {data(), _count};
-  }
-
-  template <detail::ptrdiff_t Count >
-  XGBOOST_DEVICE Span<element_type, Count> last() const {  // NOLINT
-    SPAN_CHECK(Count >=0 && size() - Count >= 0);
-    return {data() + size() - Count, Count};
-  }
-
-  XGBOOST_DEVICE Span<element_type, dynamic_extent> last(  // NOLINT
-      detail::ptrdiff_t _count) const {
-    SPAN_CHECK(_count >= 0 && _count <= size());
-    return subspan(size() - _count, _count);
-  }
-
-  /*!
-   * If Count is std::dynamic_extent, r.size() == this->size() - Offset;
-   * Otherwise r.size() == Count.
-   */
-  template <detail::ptrdiff_t Offset,
-            detail::ptrdiff_t Count = dynamic_extent>
-  XGBOOST_DEVICE auto subspan() const ->                   // NOLINT
-      Span<element_type,
-           detail::ExtentValue<Extent, Offset, Count>::value> {
-    SPAN_CHECK(Offset >= 0 && (Offset < size() || size() == 0));
-    SPAN_CHECK(Count == dynamic_extent ||
-               (Count >= 0 && Offset + Count <= size()));
-
-    return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
-  }
-
-  XGBOOST_DEVICE Span<element_type, dynamic_extent> subspan(  // NOLINT
-      detail::ptrdiff_t _offset,
-      detail::ptrdiff_t _count = dynamic_extent) const {
-    SPAN_CHECK(_offset >= 0 && (_offset < size() || size() == 0));
-    SPAN_CHECK((_count == dynamic_extent) ||
-               (_count >= 0 && _offset + _count <= size()));
-
-    return {data() + _offset, _count ==
-            dynamic_extent ? size() - _offset : _count};
-  }
-
- private:
-  index_type size_;
-  pointer data_;
-};
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE bool operator==(Span<T, X> l, Span<U, Y> r) {
-  if (l.size() != r.size()) {
-    return false;
-  }
-  for (auto l_beg = l.cbegin(), r_beg = r.cbegin(); l_beg != l.cend();
-       ++l_beg, ++r_beg) {
-    if (*l_beg != *r_beg) {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE constexpr bool operator!=(Span<T, X> l, Span<U, Y> r) {
-  return !(l == r);
-}
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE constexpr bool operator<(Span<T, X> l, Span<U, Y> r) {
-  return detail::LexicographicalCompare(l.begin(), l.end(),
-                                         r.begin(), r.end());
-}
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE constexpr bool operator<=(Span<T, X> l, Span<U, Y> r) {
-  return !(l > r);
-}
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE constexpr bool operator>(Span<T, X> l, Span<U, Y> r) {
-  return detail::LexicographicalCompare<
-    typename Span<T, X>::iterator, typename Span<U, Y>::iterator,
-    detail::Greater<typename Span<T, X>::element_type>>(l.begin(), l.end(),
-                                                        r.begin(), r.end());
-}
-
-template <class T, detail::ptrdiff_t X, class U, detail::ptrdiff_t Y>
-XGBOOST_DEVICE constexpr bool operator>=(Span<T, X> l, Span<U, Y> r) {
-  return !(l < r);
-}
-
-template <class T, detail::ptrdiff_t E>
-XGBOOST_DEVICE auto as_bytes(Span<T, E> s) __span_noexcept ->           // NOLINT
-    Span<const byte, detail::ExtentAsBytesValue<T, E>::value> {
-  return {reinterpret_cast<const byte*>(s.data()), s.size_bytes()};
-}
-
-template <class T, detail::ptrdiff_t E>
-XGBOOST_DEVICE auto as_writable_bytes(Span<T, E> s) __span_noexcept ->  // NOLINT
-    Span<byte, detail::ExtentAsBytesValue<T, E>::value> {
-  return {reinterpret_cast<byte*>(s.data()), s.size_bytes()};
-}
-
-}  // namespace common
-}  // namespace xgboost
-
-#if defined(_MSC_VER) &&_MSC_VER < 1910
-#undef constexpr
-#pragma pop_macro("constexpr")
-#undef __span_noexcept
-#endif  // _MSC_VER < 1910
-
-#endif  // XGBOOST_COMMON_SPAN_H_
--- a/src/common/transform.h
+++ b/src/common/transform.h
@@ -10,9 +10,10 @@
 #include <vector>
 #include <type_traits>  // enable_if

-#include "host_device_vector.h"
+#include "xgboost/host_device_vector.h"
+#include "xgboost/span.h"
+
 #include "common.h"
-#include "span.h"

 #if defined (__CUDACC__)
 #include "device_helpers.cuh"
--- a/src/data/columnar.h
+++ b/src/data/columnar.h
@@ -13,7 +13,8 @@
 #include "xgboost/data.h"
 #include "xgboost/json.h"
 #include "xgboost/logging.h"
-#include "../common/span.h"
+#include "xgboost/span.h"
+
 #include "../common/bitfield.h"

 namespace xgboost {
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -11,6 +11,7 @@
 #include "./simple_dmatrix.h"
 #include "./simple_csr_source.h"
 #include "../common/io.h"
+#include "../common/group_data.h"

 #if DMLC_ENABLE_STD_THREAD
 #include "./sparse_page_source.h"
@@ -322,7 +323,35 @@ data::SparsePageFormat::DecideFormat(const std::string& cache_prefix) {
    return std::make_pair(raw, raw);
  }
 }
-
+SparsePage SparsePage::GetTranspose(int num_columns) const {  // regroup entries by entry.index
+  SparsePage transpose;  // result; its offset/data host vectors are handed to builder below
+  common::ParallelGroupBuilder<Entry> builder(&transpose.offset.HostVector(),
+                                              &transpose.data.HostVector());
+  const int nthread = omp_get_max_threads();
+  builder.InitBudget(num_columns, nthread);  // presumably num_columns groups per thread - confirm
+  long batch_size = static_cast<long>(this->Size());  // NOLINT(*)
+#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
+  for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
+    int tid = omp_get_thread_num();
+    auto inst = (*this)[i];
+    for (const auto& entry : inst) {
+      builder.AddBudget(entry.index, tid);  // pass 1: register one entry under entry.index
+    }
+  }
+  builder.InitStorage();  // runs between the two passes; presumably sizes the output - confirm
+#pragma omp parallel for default(none) shared(batch_size, builder) schedule(static)
+  for (long i = 0; i < batch_size; ++i) {  // NOLINT(*)
+    int tid = omp_get_thread_num();
+    auto inst = (*this)[i];
+    for (const auto& entry : inst) {
+      builder.Push(  // pass 2: store (base_rowid + i, fvalue) under entry.index
+          entry.index,
+          Entry(static_cast<bst_uint>(this->base_rowid + i), entry.fvalue),
+          tid);
+    }
+  }
+  return transpose;
+}
 void SparsePage::Push(const SparsePage &batch) {
  auto& data_vec = data.HostVector();
  auto& offset_vec = offset.HostVector();
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -21,6 +21,7 @@
 #include <vector>

 #include "sparse_page_writer.h"
+#include "../common/common.h"

 namespace {

--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -10,10 +10,13 @@
 #include <xgboost/gbm.h>
 #include <xgboost/logging.h>
 #include <xgboost/linear_updater.h>
+
 #include <vector>
 #include <string>
 #include <sstream>
 #include <algorithm>
+
+#include "gblinear_model.h"
 #include "../common/timer.h"

 namespace xgboost {
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -6,11 +6,6 @@
 */
 #include <dmlc/omp.h>
 #include <dmlc/parameter.h>
-#include <dmlc/timer.h>
-#include <xgboost/logging.h>
-#include <xgboost/gbm.h>
-#include <xgboost/predictor.h>
-#include <xgboost/tree_updater.h>

 #include <vector>
 #include <memory>
@@ -19,11 +14,16 @@
 #include <limits>
 #include <algorithm>

-#include "../common/common.h"
-#include "../common/host_device_vector.h"
-#include "../common/random.h"
+#include "xgboost/logging.h"
+#include "xgboost/gbm.h"
+#include "xgboost/predictor.h"
+#include "xgboost/tree_updater.h"
+#include "xgboost/host_device_vector.h"
+
 #include "gbtree.h"
 #include "gbtree_model.h"
+#include "../common/common.h"
+#include "../common/random.h"
 #include "../common/timer.h"


--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -23,8 +23,9 @@
 #include <string>

 #include "gbtree_model.h"
+#include "xgboost/host_device_vector.h"
+
 #include "../common/common.h"
-#include "../common/host_device_vector.h"
 #include "../common/timer.h"

 namespace xgboost {
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -19,11 +19,12 @@
 #include <ios>
 #include <utility>
 #include <vector>
-#include "./common/common.h"
-#include "./common/host_device_vector.h"
-#include "./common/io.h"
-#include "./common/random.h"
-#include "./common/timer.h"
+
+#include "xgboost/host_device_vector.h"
+#include "common/common.h"
+#include "common/io.h"
+#include "common/random.h"
+#include "common/timer.h"

 namespace {

--- a/src/linear/updater_gpu_coordinate.cu
+++ b/src/linear/updater_gpu_coordinate.cu
@@ -7,12 +7,13 @@
 #include <thrust/inner_product.h>
 #include <xgboost/data.h>
 #include <xgboost/linear_updater.h>
+#include "xgboost/span.h"
+
+#include "coordinate_common.h"
 #include "../common/common.h"
-#include "../common/span.h"
 #include "../common/device_helpers.cuh"
 #include "../common/timer.h"
 #include "./param.h"
-#include "coordinate_common.h"

 namespace xgboost {
 namespace linear {
--- a/src/metric/rank_metric.cc
+++ b/src/metric/rank_metric.cc
@@ -11,7 +11,7 @@

 #include <vector>

-#include "../common/host_device_vector.h"
+#include "xgboost/host_device_vector.h"
 #include "../common/math.h"

 namespace {
--- a/src/objective/hinge.cu
+++ b/src/objective/hinge.cu
@@ -4,12 +4,13 @@
 * \brief Provides an implementation of the hinge loss function
 * \author Henry Gouk
 */
-#include <xgboost/objective.h>
+#include "xgboost/objective.h"
+#include "xgboost/span.h"
+#include "xgboost/host_device_vector.h"
+
 #include "../common/math.h"
 #include "../common/transform.h"
 #include "../common/common.h"
-#include "../common/span.h"
-#include "../common/host_device_vector.h"

 namespace xgboost {
 namespace obj {
--- a/src/objective/objective.cc
+++ b/src/objective/objective.cc
@@ -6,7 +6,7 @@
 #include <xgboost/objective.h>
 #include <dmlc/registry.h>

-#include "../common/host_device_vector.h"
+#include "xgboost/host_device_vector.h"

 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::ObjFunctionReg);
--- a/src/objective/regression_obj.cu
+++ b/src/objective/regression_obj.cu
@@ -12,10 +12,11 @@
 #include <memory>
 #include <vector>

-#include "../common/span.h"
+#include "xgboost/span.h"
+#include "xgboost/host_device_vector.h"
+
 #include "../common/transform.h"
 #include "../common/common.h"
-#include "../common/host_device_vector.h"
 #include "./regression_loss.h"


--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -1,11 +1,13 @@
 /*!
 * Copyright by Contributors 2017
 */
-#include <xgboost/predictor.h>
-#include <xgboost/tree_model.h>
-#include <xgboost/tree_updater.h>
-#include "dmlc/logging.h"
-#include "../common/host_device_vector.h"
+#include "xgboost/predictor.h"
+#include "xgboost/tree_model.h"
+#include "xgboost/tree_updater.h"
+#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
+
+#include "../gbm/gbtree_model.h"

 namespace xgboost {
 namespace predictor {
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -6,14 +6,17 @@
 #include <thrust/device_ptr.h>
 #include <thrust/device_vector.h>
 #include <thrust/fill.h>
-#include <xgboost/data.h>
-#include <xgboost/predictor.h>
-#include <xgboost/tree_model.h>
-#include <xgboost/tree_updater.h>
 #include <memory>
+
+#include "xgboost/data.h"
+#include "xgboost/predictor.h"
+#include "xgboost/tree_model.h"
+#include "xgboost/tree_updater.h"
+#include "xgboost/host_device_vector.h"
+
+#include "../gbm/gbtree_model.h"
 #include "../common/common.h"
 #include "../common/device_helpers.cuh"
-#include "../common/host_device_vector.h"

 namespace xgboost {
 namespace predictor {
--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -6,17 +6,16 @@
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>

-#include <xgboost/logging.h>
-
 #include <algorithm>
 #include <bitset>
 #include <string>
 #include <sstream>
 #include <set>

+#include "xgboost/logging.h"
+#include "xgboost/span.h"
 #include "constraints.cuh"
 #include "param.h"
-#include "../common/span.h"
 #include "../common/device_helpers.cuh"


--- a/src/tree/constraints.cuh
+++ b/src/tree/constraints.cuh
@@ -12,7 +12,7 @@
 #include <vector>

 #include "param.h"
-#include "../common/span.h"
+#include "xgboost/span.h"
 #include "../common/bitfield.h"
 #include "../common/device_helpers.cuh"

--- a/src/tree/split_evaluator.cc
+++ b/src/tree/split_evaluator.cc
@@ -3,10 +3,8 @@
 * \file split_evaluator.cc
 * \brief Contains implementations of different split evaluators.
 */
-#include "split_evaluator.h"
 #include <dmlc/json.h>
 #include <dmlc/registry.h>
-#include <xgboost/logging.h>
 #include <algorithm>
 #include <unordered_set>
 #include <vector>
@@ -15,9 +13,12 @@
 #include <string>
 #include <sstream>
 #include <utility>
+
+#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
 #include "param.h"
+#include "split_evaluator.h"
 #include "../common/common.h"
-#include "../common/host_device_vector.h"

 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::tree::SplitEvaluatorReg);
--- a/src/tree/tree_updater.cc
+++ b/src/tree/tree_updater.cc
@@ -3,10 +3,10 @@
 * \file tree_updater.cc
 * \brief Registry of tree updaters.
 */
-#include <xgboost/tree_updater.h>
 #include <dmlc/registry.h>

-#include "../common/host_device_vector.h"
+#include "xgboost/tree_updater.h"
+#include "xgboost/host_device_vector.h"

 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::xgboost::TreeUpdaterReg);
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -14,13 +14,15 @@
 #include <queue>
 #include <utility>
 #include <vector>
+
+#include "xgboost/host_device_vector.h"
+#include "xgboost/span.h"
+
 #include "../common/common.h"
 #include "../common/compressed_iterator.h"
 #include "../common/device_helpers.cuh"
 #include "../common/hist_util.h"
-#include "../common/host_device_vector.h"
 #include "../common/timer.h"
-#include "../common/span.h"
 #include "../data/ellpack_page.cuh"
 #include "param.h"
 #include "updater_gpu_common.cuh"