Remove gpu_exact tree method (#4742)
This commit is contained in:
parent
2a4df8e29f
commit
851b5b3808
1
.gitignore
vendored
1
.gitignore
vendored
@ -90,7 +90,6 @@ lib/
|
||||
# spark
|
||||
metastore_db
|
||||
|
||||
plugin/updater_gpu/test/cpp/data
|
||||
/include/xgboost/build_config.h
|
||||
|
||||
# files from R-package source install
|
||||
|
||||
@ -30,7 +30,7 @@ wl <- list(train = dtrain, test = dtest)
|
||||
# - similar to the 'hist'
|
||||
# - the fastest option for moderately large datasets
|
||||
# - current limitations: max_depth < 16, does not implement guided loss
|
||||
# You can use tree_method = 'gpu_exact' for another GPU accelerated algorithm,
|
||||
# You can use tree_method = 'gpu_hist' for another GPU accelerated algorithm,
|
||||
# which is slower, more memory-hungry, but does not use binning.
|
||||
param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4,
|
||||
max_bin = 64, tree_method = 'gpu_hist')
|
||||
|
||||
@ -13,7 +13,7 @@ Installation Guide
|
||||
# * xgboost-{version}-py2.py3-none-win_amd64.whl
|
||||
pip3 install xgboost
|
||||
|
||||
* The binary wheel will support GPU algorithms (`gpu_exact`, `gpu_hist`) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
|
||||
* The binary wheel will support GPU algorithms (`gpu_hist`) on machines with NVIDIA GPUs. Please note that **training with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`.
|
||||
* Currently, we provide binary wheels for 64-bit Linux and Windows.
|
||||
|
||||
****************************
|
||||
|
||||
@ -26,8 +26,6 @@ Algorithms
|
||||
+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| tree_method | Description |
|
||||
+=======================+=======================================================================================================================================================================+
|
||||
| gpu_exact (deprecated)| The standard XGBoost tree construction algorithm. Performs exact search for splits. Slower and uses considerably more memory than ``gpu_hist``. |
|
||||
+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| gpu_hist | Equivalent to the XGBoost fast histogram algorithm. Much faster and uses considerably less memory. NOTE: Will run very slowly on GPUs older than Pascal architecture. |
|
||||
+-----------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
|
||||
@ -37,31 +35,31 @@ Supported parameters
|
||||
.. |tick| unicode:: U+2714
|
||||
.. |cross| unicode:: U+2718
|
||||
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| parameter | ``gpu_exact`` (deprecated) | ``gpu_hist`` |
|
||||
+================================+============================+==============+
|
||||
| ``subsample`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``colsample_bytree`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``colsample_bylevel`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``max_bin`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``gpu_id`` | |tick| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``n_gpus`` (deprecated) | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``predictor`` | |tick| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``grow_policy`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``monotone_constraints`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``interaction_constraints`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
| ``single_precision_histogram`` | |cross| | |tick| |
|
||||
+--------------------------------+----------------------------+--------------+
|
||||
+--------------------------------+--------------+
|
||||
| parameter | ``gpu_hist`` |
|
||||
+================================+==============+
|
||||
| ``subsample`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``colsample_bytree`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``colsample_bylevel`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``max_bin`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``gpu_id`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``n_gpus`` (deprecated) | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``predictor`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``grow_policy`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``monotone_constraints`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``interaction_constraints`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
| ``single_precision_histogram`` | |tick| |
|
||||
+--------------------------------+--------------+
|
||||
|
||||
GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``.
|
||||
|
||||
@ -194,12 +192,10 @@ Training time time on 1,000,000 rows x 50 columns with 500 boosting iterations a
|
||||
+--------------+----------+
|
||||
| hist | 63.55 |
|
||||
+--------------+----------+
|
||||
| gpu_exact | 161.08 |
|
||||
+--------------+----------+
|
||||
| exact | 1082.20 |
|
||||
+--------------+----------+
|
||||
|
||||
See `GPU Accelerated XGBoost <https://xgboost.ai/2016/12/14/GPU-accelerated-xgboost.html>`_ and `Updates to the XGBoost GPU algorithms <https://xgboost.ai/2018/07/04/gpu-xgboost-update.html>`_ for additional performance benchmarks of the ``gpu_exact`` and ``gpu_hist`` tree methods.
|
||||
See `GPU Accelerated XGBoost <https://xgboost.ai/2016/12/14/GPU-accelerated-xgboost.html>`_ and `Updates to the XGBoost GPU algorithms <https://xgboost.ai/2018/07/04/gpu-xgboost-update.html>`_ for additional performance benchmarks of the ``gpu_hist`` tree method.
|
||||
|
||||
Developer notes
|
||||
===============
|
||||
|
||||
@ -184,7 +184,7 @@ Parameters for Tree Booster
|
||||
- The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
|
||||
|
||||
- ``cpu_predictor``: Multicore CPU prediction algorithm.
|
||||
- ``gpu_predictor``: Prediction using GPU. Default when ``tree_method`` is ``gpu_exact`` or ``gpu_hist``.
|
||||
- ``gpu_predictor``: Prediction using GPU. Default when ``tree_method`` is ``gpu_hist``.
|
||||
|
||||
* ``num_parallel_tree``, [default=1]
|
||||
- Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
|
||||
|
||||
@ -15,7 +15,7 @@ path to a cache file that XGBoost will use for external memory cache.
|
||||
|
||||
.. note:: External memory is not available with GPU algorithms
|
||||
|
||||
External memory is not available when ``tree_method`` is set to ``gpu_exact`` or ``gpu_hist``.
|
||||
External memory is not available when ``tree_method`` is set to ``gpu_hist``.
|
||||
|
||||
The following code was extracted from `demo/guide-python/external_memory.py <https://github.com/dmlc/xgboost/blob/master/demo/guide-python/external_memory.py>`_:
|
||||
|
||||
|
||||
@ -75,9 +75,3 @@ CUB_PATH ?= cub
|
||||
# you can also add your own plugin like this
|
||||
#
|
||||
# XGB_PLUGINS += plugin/example/plugin.mk
|
||||
|
||||
# plugin to build tree on GPUs using CUDA
|
||||
PLUGIN_UPDATER_GPU ?= OFF
|
||||
ifeq ($(PLUGIN_UPDATER_GPU),ON)
|
||||
XGB_PLUGINS += plugin/updater_gpu/plugin.mk
|
||||
endif
|
||||
|
||||
@ -146,13 +146,6 @@ void GBTree::ConfigureUpdaters(const std::map<std::string, std::string>& cfg) {
|
||||
"single updater grow_quantile_histmaker.";
|
||||
tparam_.updater_seq = "grow_quantile_histmaker";
|
||||
break;
|
||||
case TreeMethod::kGPUExact:
|
||||
this->AssertGPUSupport();
|
||||
tparam_.updater_seq = "grow_gpu,prune";
|
||||
if (cfg.find("predictor") == cfg.cend()) {
|
||||
tparam_.predictor = "gpu_predictor";
|
||||
}
|
||||
break;
|
||||
case TreeMethod::kGPUHist:
|
||||
this->AssertGPUSupport();
|
||||
tparam_.updater_seq = "grow_gpu_hist";
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
namespace xgboost {
|
||||
enum class TreeMethod : int {
|
||||
kAuto = 0, kApprox = 1, kExact = 2, kHist = 3,
|
||||
kGPUExact = 4, kGPUHist = 5
|
||||
kGPUHist = 5
|
||||
};
|
||||
|
||||
// boosting process types
|
||||
@ -88,7 +88,6 @@ struct GBTreeTrainParam : public dmlc::Parameter<GBTreeTrainParam> {
|
||||
.add_enum("approx", TreeMethod::kApprox)
|
||||
.add_enum("exact", TreeMethod::kExact)
|
||||
.add_enum("hist", TreeMethod::kHist)
|
||||
.add_enum("gpu_exact", TreeMethod::kGPUExact)
|
||||
.add_enum("gpu_hist", TreeMethod::kGPUHist)
|
||||
.describe("Choice of tree construction method.");
|
||||
}
|
||||
@ -171,8 +170,7 @@ class GBTree : public GradientBooster {
|
||||
bool UseGPU() const override {
|
||||
return
|
||||
tparam_.predictor == "gpu_predictor" ||
|
||||
tparam_.tree_method == TreeMethod::kGPUHist ||
|
||||
tparam_.tree_method == TreeMethod::kGPUExact;
|
||||
tparam_.tree_method == TreeMethod::kGPUHist;
|
||||
}
|
||||
|
||||
void Load(dmlc::Stream* fi) override {
|
||||
|
||||
@ -37,7 +37,6 @@ DMLC_REGISTRY_LINK_TAG(updater_quantile_hist);
|
||||
DMLC_REGISTRY_LINK_TAG(updater_histmaker);
|
||||
DMLC_REGISTRY_LINK_TAG(updater_sync);
|
||||
#ifdef XGBOOST_USE_CUDA
|
||||
DMLC_REGISTRY_LINK_TAG(updater_gpu);
|
||||
DMLC_REGISTRY_LINK_TAG(updater_gpu_hist);
|
||||
#endif // XGBOOST_USE_CUDA
|
||||
} // namespace tree
|
||||
|
||||
@ -1,844 +0,0 @@
|
||||
/*!
|
||||
* Copyright 2017-2018 XGBoost contributors
|
||||
*/
|
||||
#include <xgboost/tree_updater.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
#include "../common/common.h"
|
||||
#include "param.h"
|
||||
#include "updater_gpu_common.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(updater_gpu);
|
||||
|
||||
template <typename GradientPairT>
|
||||
XGBOOST_DEVICE float inline LossChangeMissing(const GradientPairT& scan,
|
||||
const GradientPairT& missing,
|
||||
const GradientPairT& parent_sum,
|
||||
const float& parent_gain,
|
||||
const GPUTrainingParam& param,
|
||||
bool& missing_left_out) { // NOLINT
|
||||
// Put gradients of missing values to left
|
||||
float missing_left_loss =
|
||||
DeviceCalcLossChange(param, scan + missing, parent_sum, parent_gain);
|
||||
float missing_right_loss =
|
||||
DeviceCalcLossChange(param, scan, parent_sum, parent_gain);
|
||||
|
||||
if (missing_left_loss >= missing_right_loss) {
|
||||
missing_left_out = true;
|
||||
return missing_left_loss;
|
||||
} else {
|
||||
missing_left_out = false;
|
||||
return missing_right_loss;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Absolute BFS order IDs to col-wise unique IDs based on user input
|
||||
* @param tid the index of the element that this thread should access
|
||||
* @param abs the array of absolute IDs
|
||||
* @param colIds the array of column IDs for each element
|
||||
* @param nodeStart the start of the node ID at this level
|
||||
* @param nKeys number of nodes at this level.
|
||||
* @return the uniq key
|
||||
*/
|
||||
static HOST_DEV_INLINE NodeIdT Abs2UniqueKey(int tid,
|
||||
common::Span<const NodeIdT> abs,
|
||||
common::Span<const int> colIds,
|
||||
NodeIdT nodeStart, int nKeys) {
|
||||
int a = abs[tid];
|
||||
if (a == kUnusedNode) return a;
|
||||
return ((a - nodeStart) + (colIds[tid] * nKeys));
|
||||
}
|
||||
|
||||
/**
|
||||
* @struct Pair
|
||||
* @brief Pair used for key basd scan operations on GradientPair
|
||||
*/
|
||||
struct Pair {
|
||||
int key;
|
||||
GradientPair value;
|
||||
};
|
||||
|
||||
/** define a key that's not used at all in the entire boosting process */
|
||||
static const int kNoneKey = -100;
|
||||
|
||||
/**
|
||||
* @brief Allocate temporary buffers needed for scan operations
|
||||
* @param tmpScans gradient buffer
|
||||
* @param tmpKeys keys buffer
|
||||
* @param size number of elements that will be scanned
|
||||
*/
|
||||
template <int BLKDIM_L1L3 = 256>
|
||||
int ScanTempBufferSize(int size) {
|
||||
int num_blocks = common::DivRoundUp(size, BLKDIM_L1L3);
|
||||
return num_blocks;
|
||||
}
|
||||
|
||||
struct AddByKey {
|
||||
template <typename T>
|
||||
HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
|
||||
T result;
|
||||
if (first.key == second.key) {
|
||||
result.key = first.key;
|
||||
result.value = first.value + second.value;
|
||||
} else {
|
||||
result.key = second.key;
|
||||
result.value = second.value;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Gradient value getter function
|
||||
* @param id the index into the vals or instIds array to which to fetch
|
||||
* @param vals the gradient value buffer
|
||||
* @param instIds instance index buffer
|
||||
* @return the expected gradient value
|
||||
*/
|
||||
HOST_DEV_INLINE GradientPair Get(int id,
|
||||
common::Span<const GradientPair> vals,
|
||||
common::Span<const int> instIds) {
|
||||
id = instIds[id];
|
||||
return vals[id];
|
||||
}
|
||||
|
||||
template <int BLKDIM_L1L3>
|
||||
__global__ void CubScanByKeyL1(
|
||||
common::Span<GradientPair> scans,
|
||||
common::Span<const GradientPair> vals,
|
||||
common::Span<const int> instIds,
|
||||
common::Span<GradientPair> mScans,
|
||||
common::Span<int> mKeys,
|
||||
common::Span<const NodeIdT> keys,
|
||||
int nUniqKeys,
|
||||
common::Span<const int> colIds, NodeIdT nodeStart,
|
||||
const int size) {
|
||||
Pair rootPair = {kNoneKey, GradientPair(0.f, 0.f)};
|
||||
int myKey;
|
||||
GradientPair myValue;
|
||||
using BlockScan = cub::BlockScan<Pair, BLKDIM_L1L3>;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
Pair threadData;
|
||||
int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
|
||||
if (tid < size) {
|
||||
myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
myValue = Get(tid, vals, instIds);
|
||||
} else {
|
||||
myKey = kNoneKey;
|
||||
myValue = {};
|
||||
}
|
||||
threadData.key = myKey;
|
||||
threadData.value = myValue;
|
||||
// get previous key, especially needed for the last thread in this block
|
||||
// in order to pass on the partial scan values.
|
||||
// this statement MUST appear before the checks below!
|
||||
// else, the result of this shuffle operation will be undefined
|
||||
#if (__CUDACC_VER_MAJOR__ >= 9)
|
||||
int previousKey = __shfl_up_sync(0xFFFFFFFF, myKey, 1);
|
||||
#else
|
||||
int previousKey = __shfl_up(myKey, 1);
|
||||
#endif // (__CUDACC_VER_MAJOR__ >= 9)
|
||||
// Collectively compute the block-wide exclusive prefix sum
|
||||
BlockScan(temp_storage)
|
||||
.ExclusiveScan(threadData, threadData, rootPair, AddByKey());
|
||||
if (tid < size) {
|
||||
scans[tid] = threadData.value;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
if (threadIdx.x == BLKDIM_L1L3 - 1) {
|
||||
threadData.value =
|
||||
(myKey == previousKey) ? threadData.value : GradientPair(0.0f, 0.0f);
|
||||
mKeys[blockIdx.x] = myKey;
|
||||
mScans[blockIdx.x] = threadData.value + myValue;
|
||||
}
|
||||
}
|
||||
|
||||
template <int BLKSIZE>
|
||||
__global__ void CubScanByKeyL2(common::Span<GradientPair> mScans,
|
||||
common::Span<int> mKeys, int mLength) {
|
||||
using BlockScan = cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS>;
|
||||
Pair threadData;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
|
||||
threadData.key = mKeys[i];
|
||||
threadData.value = mScans[i];
|
||||
BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
|
||||
mScans[i] = threadData.value;
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
template <int BLKDIM_L1L3>
|
||||
__global__ void CubScanByKeyL3(common::Span<GradientPair> sums,
|
||||
common::Span<GradientPair> scans,
|
||||
common::Span<const GradientPair> vals,
|
||||
common::Span<const int> instIds,
|
||||
common::Span<const GradientPair> mScans,
|
||||
common::Span<const int> mKeys,
|
||||
common::Span<const NodeIdT> keys,
|
||||
int nUniqKeys,
|
||||
common::Span<const int> colIds, NodeIdT nodeStart,
|
||||
const int size) {
|
||||
int relId = threadIdx.x;
|
||||
int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
|
||||
// to avoid the following warning from nvcc:
|
||||
// __shared__ memory variable with non-empty constructor or destructor
|
||||
// (potential race between threads)
|
||||
__shared__ char gradBuff[sizeof(GradientPair)];
|
||||
__shared__ int s_mKeys;
|
||||
GradientPair* s_mScans = reinterpret_cast<GradientPair*>(gradBuff);
|
||||
if (tid >= size) return;
|
||||
// cache block-wide partial scan info
|
||||
if (relId == 0) {
|
||||
s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : kNoneKey;
|
||||
s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : GradientPair();
|
||||
}
|
||||
int myKey = Abs2UniqueKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
int previousKey =
|
||||
tid == 0 ? kNoneKey
|
||||
: Abs2UniqueKey(tid - 1, keys, colIds, nodeStart, nUniqKeys);
|
||||
GradientPair my_value = scans[tid];
|
||||
__syncthreads();
|
||||
if (blockIdx.x > 0 && s_mKeys == previousKey) {
|
||||
my_value += s_mScans[0];
|
||||
}
|
||||
if (tid == size - 1) {
|
||||
sums[previousKey] = my_value + Get(tid, vals, instIds);
|
||||
}
|
||||
if ((previousKey != myKey) && (previousKey >= 0)) {
|
||||
sums[previousKey] = my_value;
|
||||
my_value = GradientPair(0.0f, 0.0f);
|
||||
}
|
||||
scans[tid] = my_value;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs fused reduce and scan by key functionality. It is assumed
|
||||
* that
|
||||
* the keys occur contiguously!
|
||||
* @param sums the output gradient reductions for each element performed
|
||||
* key-wise
|
||||
* @param scans the output gradient scans for each element performed key-wise
|
||||
* @param vals the gradients evaluated for each observation.
|
||||
* @param instIds instance ids for each element
|
||||
* @param keys keys to be used to segment the reductions. They need not occur
|
||||
* contiguously in contrast to scan_by_key. Currently, we need one key per
|
||||
* value in the 'vals' array.
|
||||
* @param size number of elements in the 'vals' array
|
||||
* @param nUniqKeys max number of uniq keys found per column
|
||||
* @param nCols number of columns
|
||||
* @param tmpScans temporary scan buffer needed for cub-pyramid algo
|
||||
* @param tmpKeys temporary key buffer needed for cub-pyramid algo
|
||||
* @param colIds column indices for each element in the array
|
||||
* @param nodeStart index of the leftmost node in the current level
|
||||
*/
|
||||
template <int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
|
||||
void ReduceScanByKey(common::Span<GradientPair> sums,
|
||||
common::Span<GradientPair> scans,
|
||||
common::Span<GradientPair> vals,
|
||||
common::Span<const int> instIds,
|
||||
common::Span<const NodeIdT> keys,
|
||||
int size, int nUniqKeys, int nCols,
|
||||
common::Span<GradientPair> tmpScans,
|
||||
common::Span<int> tmpKeys,
|
||||
common::Span<const int> colIds, NodeIdT nodeStart) {
|
||||
int nBlks = common::DivRoundUp(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums.data(), 0, nUniqKeys * nCols * sizeof(GradientPair));
|
||||
CubScanByKeyL1<BLKDIM_L1L3>
|
||||
<<<nBlks, BLKDIM_L1L3>>>(scans, vals, instIds, tmpScans, tmpKeys, keys,
|
||||
nUniqKeys, colIds, nodeStart, size);
|
||||
CubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
|
||||
CubScanByKeyL3<BLKDIM_L1L3>
|
||||
<<<nBlks, BLKDIM_L1L3>>>(sums, scans, vals, instIds, tmpScans, tmpKeys,
|
||||
keys, nUniqKeys, colIds, nodeStart, size);
|
||||
}
|
||||
|
||||
/**
|
||||
* @struct ExactSplitCandidate
|
||||
* @brief Abstraction of a possible split in the decision tree
|
||||
*/
|
||||
struct ExactSplitCandidate {
|
||||
/** the optimal gain score for this node */
|
||||
float score;
|
||||
/** index where to split in the DMatrix */
|
||||
int index;
|
||||
|
||||
HOST_DEV_INLINE ExactSplitCandidate() : score{-FLT_MAX}, index{INT_MAX} {}
|
||||
|
||||
/**
|
||||
* @brief Whether the split info is valid to be used to create a new child
|
||||
* @param minSplitLoss minimum score above which decision to split is made
|
||||
* @return true if splittable, else false
|
||||
*/
|
||||
HOST_DEV_INLINE bool IsSplittable(float minSplitLoss) const {
|
||||
return ((score >= minSplitLoss) && (index != INT_MAX));
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @enum ArgMaxByKeyAlgo best_split_evaluation.cuh
|
||||
* @brief Help decide which algorithm to use for multi-argmax operation
|
||||
*/
|
||||
enum ArgMaxByKeyAlgo {
|
||||
/** simplest, use gmem-atomics for all updates */
|
||||
kAbkGmem = 0,
|
||||
/** use smem-atomics for updates (when number of keys are less) */
|
||||
kAbkSmem
|
||||
};
|
||||
|
||||
/** max depth until which to use shared mem based atomics for argmax */
|
||||
static const int kMaxAbkLevels = 3;
|
||||
|
||||
HOST_DEV_INLINE ExactSplitCandidate MaxSplit(ExactSplitCandidate a,
|
||||
ExactSplitCandidate b) {
|
||||
ExactSplitCandidate out;
|
||||
if (a.score < b.score) {
|
||||
out.score = b.score;
|
||||
out.index = b.index;
|
||||
} else if (a.score == b.score) {
|
||||
out.score = a.score;
|
||||
out.index = (a.index < b.index) ? a.index : b.index;
|
||||
} else {
|
||||
out.score = a.score;
|
||||
out.index = a.index;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
DEV_INLINE void AtomicArgMax(ExactSplitCandidate* address,
|
||||
ExactSplitCandidate val) {
|
||||
unsigned long long* intAddress = reinterpret_cast<unsigned long long*>(address); // NOLINT
|
||||
unsigned long long old = *intAddress; // NOLINT
|
||||
unsigned long long assumed = old; // NOLINT
|
||||
do {
|
||||
assumed = old;
|
||||
ExactSplitCandidate res =
|
||||
MaxSplit(val, *reinterpret_cast<ExactSplitCandidate*>(&assumed));
|
||||
old = atomicCAS(intAddress, assumed, *reinterpret_cast<uint64_t*>(&res));
|
||||
} while (assumed != old);
|
||||
}
|
||||
|
||||
DEV_INLINE void ArgMaxWithAtomics(
|
||||
int id,
|
||||
common::Span<ExactSplitCandidate> nodeSplits,
|
||||
common::Span<const GradientPair> gradScans,
|
||||
common::Span<const GradientPair> gradSums,
|
||||
common::Span<const float> vals,
|
||||
common::Span<const int> colIds,
|
||||
common::Span<const NodeIdT> nodeAssigns,
|
||||
common::Span<const DeviceNodeStats> nodes, int nUniqKeys,
|
||||
NodeIdT nodeStart, int len,
|
||||
const GPUTrainingParam& param) {
|
||||
int nodeId = nodeAssigns[id];
|
||||
// @todo: this is really a bad check! but will be fixed when we move
|
||||
// to key-based reduction
|
||||
if ((id == 0) ||
|
||||
!((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
|
||||
(vals[id] == vals[id - 1]))) {
|
||||
if (nodeId != kUnusedNode) {
|
||||
int sumId = Abs2UniqueKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
|
||||
GradientPair colSum = gradSums[sumId];
|
||||
int uid = nodeId - nodeStart;
|
||||
DeviceNodeStats node_stat = nodes[nodeId];
|
||||
GradientPair parentSum = node_stat.sum_gradients;
|
||||
float parentGain = node_stat.root_gain;
|
||||
bool tmp;
|
||||
ExactSplitCandidate s;
|
||||
GradientPair missing = parentSum - colSum;
|
||||
s.score = LossChangeMissing(gradScans[id], missing, parentSum, parentGain,
|
||||
param, tmp);
|
||||
s.index = id;
|
||||
AtomicArgMax(&nodeSplits[uid], s);
|
||||
} // end if nodeId != UNUSED_NODE
|
||||
} // end if id == 0 ...
|
||||
}
|
||||
|
||||
__global__ void AtomicArgMaxByKeyGmem(
|
||||
common::Span<ExactSplitCandidate> nodeSplits,
|
||||
common::Span<const GradientPair> gradScans,
|
||||
common::Span<const GradientPair> gradSums,
|
||||
common::Span<const float> vals,
|
||||
common::Span<const int> colIds,
|
||||
common::Span<const NodeIdT> nodeAssigns,
|
||||
common::Span<const DeviceNodeStats> nodes,
|
||||
int nUniqKeys,
|
||||
NodeIdT nodeStart,
|
||||
int len,
|
||||
const TrainParam param) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
ArgMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
|
||||
nodeAssigns, nodes, nUniqKeys, nodeStart, len,
|
||||
GPUTrainingParam(param));
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void AtomicArgMaxByKeySmem(
|
||||
common::Span<ExactSplitCandidate> nodeSplits,
|
||||
common::Span<const GradientPair> gradScans,
|
||||
common::Span<const GradientPair> gradSums,
|
||||
common::Span<const float> vals,
|
||||
common::Span<const int> colIds,
|
||||
common::Span<const NodeIdT> nodeAssigns,
|
||||
common::Span<const DeviceNodeStats> nodes,
|
||||
int nUniqKeys, NodeIdT nodeStart, int len, const GPUTrainingParam param) {
|
||||
extern __shared__ char sArr[];
|
||||
common::Span<ExactSplitCandidate> sNodeSplits =
|
||||
common::Span<ExactSplitCandidate>(
|
||||
reinterpret_cast<ExactSplitCandidate*>(sArr),
|
||||
static_cast<typename common::Span<ExactSplitCandidate>::index_type>(
|
||||
nUniqKeys * sizeof(ExactSplitCandidate)));
|
||||
int tid = threadIdx.x;
|
||||
ExactSplitCandidate defVal;
|
||||
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
sNodeSplits[i] = defVal;
|
||||
}
|
||||
__syncthreads();
|
||||
int id = tid + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
ArgMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
|
||||
nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
|
||||
}
|
||||
__syncthreads();
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
ExactSplitCandidate s = sNodeSplits[i];
|
||||
AtomicArgMax(&nodeSplits[i], s);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs argmax_by_key functionality but for cases when keys need not
|
||||
* occur contiguously
|
||||
* @param nodeSplits will contain information on best split for each node
|
||||
* @param gradScans exclusive sum on sorted segments for each col
|
||||
* @param gradSums gradient sum for each column in DMatrix based on to node-ids
|
||||
* @param vals feature values
|
||||
* @param colIds column index for each element in the feature values array
|
||||
* @param nodeAssigns node-id assignments to each element in DMatrix
|
||||
* @param nodes pointer to all nodes for this tree in BFS order
|
||||
* @param nUniqKeys number of unique node-ids in this level
|
||||
* @param nodeStart start index of the node-ids in this level
|
||||
* @param len number of elements
|
||||
* @param param training parameters
|
||||
* @param algo which algorithm to use for argmax_by_key
|
||||
*/
|
||||
template <int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
|
||||
void ArgMaxByKey(common::Span<ExactSplitCandidate> nodeSplits,
|
||||
common::Span<const GradientPair> gradScans,
|
||||
common::Span<const GradientPair> gradSums,
|
||||
common::Span<const float> vals,
|
||||
common::Span<const int> colIds,
|
||||
common::Span<const NodeIdT> nodeAssigns,
|
||||
common::Span<const DeviceNodeStats> nodes,
|
||||
int nUniqKeys,
|
||||
NodeIdT nodeStart, int len, const TrainParam param,
|
||||
ArgMaxByKeyAlgo algo,
|
||||
GPUSet const& devices) {
|
||||
dh::FillConst<ExactSplitCandidate, BLKDIM, ITEMS_PER_THREAD>(
|
||||
*(devices.begin()), nodeSplits.data(), nUniqKeys,
|
||||
ExactSplitCandidate());
|
||||
int nBlks = common::DivRoundUp(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
switch (algo) {
|
||||
case kAbkGmem:
|
||||
AtomicArgMaxByKeyGmem<<<nBlks, BLKDIM>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
case kAbkSmem:
|
||||
AtomicArgMaxByKeySmem<<<nBlks, BLKDIM,
|
||||
sizeof(ExactSplitCandidate) * nUniqKeys>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, GPUTrainingParam(param));
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("argMaxByKey: Bad algo passed!");
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void AssignColIds(int* colIds, const int* colOffsets) {
|
||||
int myId = blockIdx.x;
|
||||
int start = colOffsets[myId];
|
||||
int end = colOffsets[myId + 1];
|
||||
for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
|
||||
colIds[id] = myId;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void FillDefaultNodeIds(NodeIdT* nodeIdsPerInst,
|
||||
const DeviceNodeStats* nodes, int n_rows) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
if (id >= n_rows) {
|
||||
return;
|
||||
}
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
NodeIdT nId = nodeIdsPerInst[id];
|
||||
if (nId == kUnusedNode) {
|
||||
return;
|
||||
}
|
||||
const DeviceNodeStats n = nodes[nId];
|
||||
NodeIdT result;
|
||||
if (n.IsLeaf() || n.IsUnused()) {
|
||||
result = kUnusedNode;
|
||||
} else if (n.dir == kLeftDir) {
|
||||
result = (2 * n.idx) + 1;
|
||||
} else {
|
||||
result = (2 * n.idx) + 2;
|
||||
}
|
||||
nodeIdsPerInst[id] = result;
|
||||
}
|
||||
|
||||
__global__ void AssignNodeIds(NodeIdT* nodeIdsPerInst, int* nodeLocations,
|
||||
const NodeIdT* nodeIds, const int* instId,
|
||||
const DeviceNodeStats* nodes,
|
||||
const int* colOffsets, const float* vals,
|
||||
int nVals, int nCols) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < nVals; id += stride) {
|
||||
// fusing generation of indices for node locations
|
||||
nodeLocations[id] = id;
|
||||
// using nodeIds here since the previous kernel would have updated
|
||||
// the nodeIdsPerInst with all default assignments
|
||||
int nId = nodeIds[id];
|
||||
// if this element belongs to none of the currently active node-id's
|
||||
if (nId != kUnusedNode) {
|
||||
const DeviceNodeStats n = nodes[nId];
|
||||
int colId = n.fidx;
|
||||
// printf("nid=%d colId=%d id=%d\n", nId, colId, id);
|
||||
int start = colOffsets[colId];
|
||||
int end = colOffsets[colId + 1];
|
||||
// @todo: too much wasteful threads!!
|
||||
if ((id >= start) && (id < end) && !(n.IsLeaf() || n.IsUnused())) {
|
||||
NodeIdT result = (2 * n.idx) + 1 + (vals[id] >= n.fvalue);
|
||||
nodeIdsPerInst[instId[id]] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void MarkLeavesKernel(DeviceNodeStats* nodes, int len) {
|
||||
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if ((id < len) && !nodes[id].IsUnused()) {
|
||||
int lid = (id << 1) + 1;
|
||||
int rid = (id << 1) + 2;
|
||||
if ((lid >= len) || (rid >= len)) {
|
||||
nodes[id].root_gain = -FLT_MAX; // bottom-most nodes
|
||||
} else if (nodes[lid].IsUnused() && nodes[rid].IsUnused()) {
|
||||
nodes[id].root_gain = -FLT_MAX; // unused child nodes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class GPUMaker : public TreeUpdater {
|
||||
protected:
|
||||
TrainParam param_;
|
||||
/** whether we have initialized memory already (so as not to repeat!) */
|
||||
bool allocated_;
|
||||
/** feature values stored in column-major compressed format */
|
||||
dh::DoubleBuffer<float> vals_;
|
||||
common::Span<float> vals_cached_;
|
||||
/** corresponding instance id's of these featutre values */
|
||||
dh::DoubleBuffer<int> instIds_;
|
||||
common::Span<int> inst_ids_cached_;
|
||||
/** column offsets for these feature values */
|
||||
common::Span<int> colOffsets_;
|
||||
common::Span<GradientPair> gradsInst_;
|
||||
dh::DoubleBuffer<NodeIdT> nodeAssigns_;
|
||||
dh::DoubleBuffer<int> nodeLocations_;
|
||||
common::Span<DeviceNodeStats> nodes_;
|
||||
common::Span<NodeIdT> node_assigns_per_inst_;
|
||||
common::Span<GradientPair> gradsums_;
|
||||
common::Span<GradientPair> gradscans_;
|
||||
common::Span<ExactSplitCandidate> nodeSplits_;
|
||||
int n_vals_;
|
||||
int n_rows_;
|
||||
int n_cols_;
|
||||
int maxNodes_;
|
||||
int maxLeaves_;
|
||||
|
||||
// devices are only used for sharding the HostDeviceVector passed as a parameter;
|
||||
// the algorithm works with a single GPU only
|
||||
GPUSet devices_;
|
||||
|
||||
dh::CubMemory tmp_mem_;
|
||||
common::Span<GradientPair> tmpScanGradBuff_;
|
||||
common::Span<int> tmp_scan_key_buff_;
|
||||
common::Span<int> colIds_;
|
||||
dh::BulkAllocator ba_;
|
||||
|
||||
public:
|
||||
GPUMaker() : allocated_{false} {}
|
||||
~GPUMaker() override = default;
|
||||
|
||||
char const* Name() const override {
|
||||
return "gpu_exact";
|
||||
}
|
||||
|
||||
void Configure(const Args &args) override {
|
||||
param_.InitAllowUnknown(args);
|
||||
maxNodes_ = (1 << (param_.max_depth + 1)) - 1;
|
||||
maxLeaves_ = 1 << param_.max_depth;
|
||||
|
||||
devices_ = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
|
||||
}
|
||||
|
||||
void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
const std::vector<RegTree*>& trees) override {
|
||||
// rescale learning rate according to size of trees
|
||||
float lr = param_.learning_rate;
|
||||
param_.learning_rate = lr / trees.size();
|
||||
|
||||
gpair->Shard(devices_);
|
||||
|
||||
try {
|
||||
// build tree
|
||||
for (auto tree : trees) {
|
||||
UpdateTree(gpair, dmat, tree);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
LOG(FATAL) << "grow_gpu exception: " << e.what() << std::endl;
|
||||
}
|
||||
param_.learning_rate = lr;
|
||||
}
|
||||
/// @note: Update should be only after Init!!
|
||||
void UpdateTree(HostDeviceVector<GradientPair>* gpair, DMatrix* dmat,
|
||||
RegTree* hTree) {
|
||||
if (!allocated_) {
|
||||
SetupOneTimeData(dmat);
|
||||
}
|
||||
for (int i = 0; i < param_.max_depth; ++i) {
|
||||
if (i == 0) {
|
||||
// make sure to start on a fresh tree with sorted values!
|
||||
dh::CopyDeviceSpan(vals_.CurrentSpan(), vals_cached_);
|
||||
dh::CopyDeviceSpan(instIds_.CurrentSpan(), inst_ids_cached_);
|
||||
TransferGrads(gpair);
|
||||
}
|
||||
int nNodes = 1 << i;
|
||||
NodeIdT nodeStart = nNodes - 1;
|
||||
InitNodeData(i, nodeStart, nNodes);
|
||||
FindSplit(i, nodeStart, nNodes);
|
||||
}
|
||||
// mark all the used nodes with unused children as leaf nodes
|
||||
MarkLeaves();
|
||||
Dense2SparseTree(hTree, nodes_, param_);
|
||||
}
|
||||
|
||||
void Split2Node(int nNodes, NodeIdT nodeStart) {
|
||||
auto d_nodes = nodes_;
|
||||
auto d_gradScans = gradscans_;
|
||||
auto d_gradsums = gradsums_;
|
||||
auto d_nodeAssigns = nodeAssigns_.CurrentSpan();
|
||||
auto d_colIds = colIds_;
|
||||
auto d_vals = vals_.Current();
|
||||
auto d_nodeSplits = nodeSplits_.data();
|
||||
int nUniqKeys = nNodes;
|
||||
float min_split_loss = param_.min_split_loss;
|
||||
auto gpu_param = GPUTrainingParam(param_);
|
||||
|
||||
dh::LaunchN(*(devices_.begin()), nNodes, [=] __device__(int uid) {
|
||||
int absNodeId = uid + nodeStart;
|
||||
ExactSplitCandidate s = d_nodeSplits[uid];
|
||||
if (s.IsSplittable(min_split_loss)) {
|
||||
int idx = s.index;
|
||||
int nodeInstId =
|
||||
Abs2UniqueKey(idx, d_nodeAssigns, d_colIds, nodeStart, nUniqKeys);
|
||||
bool missingLeft = true;
|
||||
const DeviceNodeStats& n = d_nodes[absNodeId];
|
||||
GradientPair gradScan = d_gradScans[idx];
|
||||
GradientPair gradSum = d_gradsums[nodeInstId];
|
||||
float thresh = d_vals[idx];
|
||||
int colId = d_colIds[idx];
|
||||
// get the default direction for the current node
|
||||
GradientPair missing = n.sum_gradients - gradSum;
|
||||
LossChangeMissing(gradScan, missing, n.sum_gradients, n.root_gain,
|
||||
gpu_param, missingLeft);
|
||||
// get the score/weight/id/gradSum for left and right child nodes
|
||||
GradientPair lGradSum = missingLeft ? gradScan + missing : gradScan;
|
||||
GradientPair rGradSum = n.sum_gradients - lGradSum;
|
||||
|
||||
// Create children
|
||||
d_nodes[LeftChildNodeIdx(absNodeId)] =
|
||||
DeviceNodeStats(lGradSum, LeftChildNodeIdx(absNodeId), gpu_param);
|
||||
d_nodes[RightChildNodeIdx(absNodeId)] =
|
||||
DeviceNodeStats(rGradSum, RightChildNodeIdx(absNodeId), gpu_param);
|
||||
// Set split for parent
|
||||
d_nodes[absNodeId].SetSplit(thresh, colId,
|
||||
missingLeft ? kLeftDir : kRightDir, lGradSum,
|
||||
rGradSum);
|
||||
} else {
|
||||
// cannot be split further, so this node is a leaf!
|
||||
d_nodes[absNodeId].root_gain = -FLT_MAX;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void FindSplit(int level, NodeIdT nodeStart, int nNodes) {
|
||||
ReduceScanByKey(gradsums_, gradscans_, gradsInst_,
|
||||
instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
|
||||
n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
|
||||
colIds_, nodeStart);
|
||||
auto devices = GPUSet::All(tparam_->gpu_id, tparam_->n_gpus);
|
||||
ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
|
||||
vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
|
||||
nodes_, nNodes, nodeStart, n_vals_, param_,
|
||||
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem,
|
||||
devices);
|
||||
Split2Node(nNodes, nodeStart);
|
||||
}
|
||||
|
||||
void AllocateAllData(int offsetSize) {
|
||||
int tmpBuffSize = ScanTempBufferSize(n_vals_);
|
||||
ba_.Allocate(*(devices_.begin()), &vals_, n_vals_,
|
||||
&vals_cached_, n_vals_, &instIds_, n_vals_, &inst_ids_cached_, n_vals_,
|
||||
&colOffsets_, offsetSize, &gradsInst_, n_rows_, &nodeAssigns_, n_vals_,
|
||||
&nodeLocations_, n_vals_, &nodes_, maxNodes_, &node_assigns_per_inst_,
|
||||
n_rows_, &gradsums_, maxLeaves_ * n_cols_, &gradscans_, n_vals_,
|
||||
&nodeSplits_, maxLeaves_, &tmpScanGradBuff_, tmpBuffSize,
|
||||
&tmp_scan_key_buff_, tmpBuffSize, &colIds_, n_vals_);
|
||||
}
|
||||
|
||||
void SetupOneTimeData(DMatrix* dmat) {
|
||||
if (!dmat->SingleColBlock()) {
|
||||
LOG(FATAL) << "exact::GPUBuilder - must have 1 column block";
|
||||
}
|
||||
std::vector<float> fval;
|
||||
std::vector<int> fId;
|
||||
std::vector<int> offset;
|
||||
ConvertToCsc(dmat, &fval, &fId, &offset);
|
||||
AllocateAllData(static_cast<int>(offset.size()));
|
||||
TransferAndSortData(fval, fId, offset);
|
||||
allocated_ = true;
|
||||
}
|
||||
|
||||
void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval,
|
||||
std::vector<int>* fId, std::vector<int>* offset) {
|
||||
const MetaInfo& info = dmat->Info();
|
||||
CHECK(info.num_col_ < std::numeric_limits<int>::max());
|
||||
CHECK(info.num_row_ < std::numeric_limits<int>::max());
|
||||
n_rows_ = static_cast<int>(info.num_row_);
|
||||
n_cols_ = static_cast<int>(info.num_col_);
|
||||
offset->reserve(n_cols_ + 1);
|
||||
offset->push_back(0);
|
||||
fval->reserve(n_cols_ * n_rows_);
|
||||
fId->reserve(n_cols_ * n_rows_);
|
||||
// in case you end up with a DMatrix having no column access
|
||||
// then make sure to enable that before copying the data!
|
||||
for (const auto& batch : dmat->GetBatches<SortedCSCPage>()) {
|
||||
for (int i = 0; i < batch.Size(); i++) {
|
||||
auto col = batch[i];
|
||||
for (const Entry& e : col) {
|
||||
int inst_id = static_cast<int>(e.index);
|
||||
fval->push_back(e.fvalue);
|
||||
fId->push_back(inst_id);
|
||||
}
|
||||
offset->push_back(static_cast<int>(fval->size()));
|
||||
}
|
||||
}
|
||||
CHECK(fval->size() < std::numeric_limits<int>::max());
|
||||
n_vals_ = static_cast<int>(fval->size());
|
||||
}
|
||||
|
||||
void TransferAndSortData(const std::vector<float>& fval,
|
||||
const std::vector<int>& fId,
|
||||
const std::vector<int>& offset) {
|
||||
dh::CopyVectorToDeviceSpan(vals_.CurrentSpan(), fval);
|
||||
dh::CopyVectorToDeviceSpan(instIds_.CurrentSpan(), fId);
|
||||
dh::CopyVectorToDeviceSpan(colOffsets_, offset);
|
||||
dh::SegmentedSort<float, int>(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_,
|
||||
colOffsets_);
|
||||
dh::CopyDeviceSpan(vals_cached_, vals_.CurrentSpan());
|
||||
dh::CopyDeviceSpan(inst_ids_cached_, instIds_.CurrentSpan());
|
||||
AssignColIds<<<n_cols_, 512>>>(colIds_.data(), colOffsets_.data());
|
||||
}
|
||||
|
||||
void TransferGrads(HostDeviceVector<GradientPair>* gpair) {
|
||||
gpair->GatherTo(
|
||||
thrust::device_pointer_cast(gradsInst_.data()),
|
||||
thrust::device_pointer_cast(gradsInst_.data() + gradsInst_.size()));
|
||||
// evaluate the full-grad reduction for the root node
|
||||
dh::SumReduction<GradientPair>(tmp_mem_, gradsInst_, gradsums_, n_rows_);
|
||||
}
|
||||
|
||||
void InitNodeData(int level, NodeIdT nodeStart, int nNodes) {
|
||||
// all instances belong to root node at the beginning!
|
||||
if (level == 0) {
|
||||
thrust::fill(thrust::device_pointer_cast(nodes_.data()),
|
||||
thrust::device_pointer_cast(nodes_.data() + nodes_.size()),
|
||||
DeviceNodeStats());
|
||||
thrust::fill(thrust::device_pointer_cast(nodeAssigns_.Current()),
|
||||
thrust::device_pointer_cast(nodeAssigns_.Current() +
|
||||
nodeAssigns_.Size()),
|
||||
0);
|
||||
thrust::fill(thrust::device_pointer_cast(node_assigns_per_inst_.data()),
|
||||
thrust::device_pointer_cast(node_assigns_per_inst_.data() +
|
||||
node_assigns_per_inst_.size()),
|
||||
0);
|
||||
// for root node, just update the gradient/score/weight/id info
|
||||
// before splitting it! Currently all data is on GPU, hence this
|
||||
// stupid little kernel
|
||||
auto d_nodes = nodes_;
|
||||
auto d_sums = gradsums_;
|
||||
auto gpu_params = GPUTrainingParam(param_);
|
||||
dh::LaunchN(*(devices_.begin()), 1, [=] __device__(int idx) {
|
||||
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
|
||||
});
|
||||
} else {
|
||||
const int BlkDim = 256;
|
||||
const int ItemsPerThread = 4;
|
||||
// assign default node ids first
|
||||
int nBlks = common::DivRoundUp(n_rows_, BlkDim);
|
||||
FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.data(),
|
||||
nodes_.data(), n_rows_);
|
||||
// evaluate the correct child indices of non-missing values next
|
||||
nBlks = common::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
|
||||
AssignNodeIds<<<nBlks, BlkDim>>>(
|
||||
node_assigns_per_inst_.data(), nodeLocations_.Current(),
|
||||
nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
|
||||
colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
|
||||
// gather the node assignments across all other columns too
|
||||
dh::Gather(*(devices_.begin()), nodeAssigns_.Current(),
|
||||
node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
|
||||
SortKeys(level);
|
||||
}
|
||||
}
|
||||
|
||||
void SortKeys(int level) {
|
||||
// segmented-sort the arrays based on node-id's
|
||||
// but we don't need more than level+1 bits for sorting!
|
||||
SegmentedSort(&tmp_mem_, &nodeAssigns_, &nodeLocations_, n_vals_, n_cols_,
|
||||
colOffsets_, 0, level + 1);
|
||||
dh::Gather<float, int>(*(devices_.begin()), vals_.other(),
|
||||
vals_.Current(), instIds_.other(), instIds_.Current(),
|
||||
nodeLocations_.Current(), n_vals_);
|
||||
vals_.buff.selector ^= 1;
|
||||
instIds_.buff.selector ^= 1;
|
||||
}
|
||||
|
||||
void MarkLeaves() {
|
||||
const int BlkDim = 128;
|
||||
int nBlks = common::DivRoundUp(maxNodes_, BlkDim);
|
||||
MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.data(), maxNodes_);
|
||||
}
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(GPUMaker, "grow_gpu")
|
||||
.describe("Grow tree with GPU.")
|
||||
.set_body([]() {
|
||||
LOG(WARNING) << "The gpu_exact tree method is deprecated and may be "
|
||||
"removed in a future version.";
|
||||
return new GPUMaker();
|
||||
});
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@ -36,10 +36,6 @@ TEST(GBTree, SelectTreeMethod) {
|
||||
ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker");
|
||||
#ifdef XGBOOST_USE_CUDA
|
||||
generic_param.InitAllowUnknown(std::vector<Arg>{Arg{"n_gpus", "1"}});
|
||||
gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_exact"),
|
||||
Arg("num_feature", n_feat)}, p_dmat);
|
||||
ASSERT_EQ(tparam.updater_seq, "grow_gpu,prune");
|
||||
ASSERT_EQ(tparam.predictor, "gpu_predictor");
|
||||
gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)},
|
||||
p_dmat);
|
||||
ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist");
|
||||
|
||||
@ -171,13 +171,6 @@ TEST(Learner, GPUConfiguration) {
|
||||
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
|
||||
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
|
||||
}
|
||||
{
|
||||
std::unique_ptr<Learner> learner {Learner::Create(mat)};
|
||||
learner->SetParams({Arg{"tree_method", "gpu_exact"}});
|
||||
learner->UpdateOneIter(0, p_dmat.get());
|
||||
ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
|
||||
ASSERT_EQ(learner->GetGenericParameter().n_gpus, 1);
|
||||
}
|
||||
{
|
||||
std::unique_ptr<Learner> learner {Learner::Create(mat)};
|
||||
learner->SetParams({Arg{"tree_method", "gpu_hist"}});
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/tree_updater.h>
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "../helpers.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
|
||||
TEST(GPUExact, Update) {
|
||||
using Arg = std::pair<std::string, std::string>;
|
||||
auto lparam = CreateEmptyGenericParam(0, 1);
|
||||
std::vector<Arg> args{{"max_depth", "1"}};
|
||||
|
||||
auto* p_gpuexact_maker = TreeUpdater::Create("grow_gpu", &lparam);
|
||||
p_gpuexact_maker->Configure(args);
|
||||
|
||||
size_t constexpr kNRows = 4;
|
||||
size_t constexpr kNCols = 8;
|
||||
bst_float constexpr kSparsity = 0.0f;
|
||||
|
||||
auto dmat = CreateDMatrix(kNRows, kNCols, kSparsity, 3);
|
||||
std::vector<GradientPair> h_gpair(kNRows);
|
||||
for (size_t i = 0; i < kNRows; ++i) {
|
||||
h_gpair[i] = GradientPair(i % 2, 1);
|
||||
}
|
||||
HostDeviceVector<GradientPair> gpair (h_gpair);
|
||||
RegTree tree;
|
||||
|
||||
p_gpuexact_maker->Update(&gpair, (*dmat).get(), {&tree});
|
||||
auto const& nodes = tree.GetNodes();
|
||||
ASSERT_EQ(nodes.size(), 3);
|
||||
|
||||
float constexpr kRtEps = 1e-6;
|
||||
ASSERT_NEAR(tree.Stat(0).sum_hess, 4, kRtEps);
|
||||
ASSERT_NEAR(tree.Stat(1).sum_hess, 2, kRtEps);
|
||||
ASSERT_NEAR(tree.Stat(2).sum_hess, 2, kRtEps);
|
||||
|
||||
ASSERT_NEAR(tree.Stat(0).loss_chg, 0.8f, kRtEps);
|
||||
|
||||
delete dmat;
|
||||
delete p_gpuexact_maker;
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
@ -20,16 +20,6 @@ datasets = ["Boston", "Cancer", "Digits", "Sparse regression",
|
||||
|
||||
|
||||
class TestGPU(unittest.TestCase):
|
||||
def test_gpu_exact(self):
|
||||
variable_param = {'max_depth': [2, 6, 15], }
|
||||
for param in parameter_combinations(variable_param):
|
||||
param['tree_method'] = 'gpu_exact'
|
||||
gpu_results = run_suite(param, select_datasets=datasets)
|
||||
assert_results_non_increasing(gpu_results, 1e-2)
|
||||
param['tree_method'] = 'exact'
|
||||
cpu_results = run_suite(param, select_datasets=datasets)
|
||||
assert_gpu_results(cpu_results, gpu_results)
|
||||
|
||||
def test_gpu_hist(self):
|
||||
test_param = parameter_combinations({'n_gpus': [1], 'max_depth': [2, 8],
|
||||
'max_leaves': [255, 4],
|
||||
@ -65,7 +55,7 @@ class TestGPU(unittest.TestCase):
|
||||
'max_leaves': [255, 4],
|
||||
'max_bin': [2, 64],
|
||||
'grow_policy': ['lossguide'],
|
||||
'tree_method': ['gpu_hist', 'gpu_exact']}
|
||||
'tree_method': ['gpu_hist']}
|
||||
for param in parameter_combinations(variable_param):
|
||||
gpu_results = run_suite(param, select_datasets=datasets)
|
||||
assert_results_non_increasing(gpu_results, 1e-2)
|
||||
|
||||
@ -444,9 +444,9 @@ def test_sklearn_n_jobs():
|
||||
|
||||
|
||||
def test_kwargs():
|
||||
params = {'updater': 'grow_gpu', 'subsample': .5, 'n_jobs': -1}
|
||||
params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1}
|
||||
clf = xgb.XGBClassifier(n_estimators=1000, **params)
|
||||
assert clf.get_params()['updater'] == 'grow_gpu'
|
||||
assert clf.get_params()['updater'] == 'grow_gpu_hist'
|
||||
assert clf.get_params()['subsample'] == .5
|
||||
assert clf.get_params()['n_estimators'] == 1000
|
||||
|
||||
@ -472,7 +472,7 @@ def test_kwargs_grid_search():
|
||||
|
||||
|
||||
def test_kwargs_error():
|
||||
params = {'updater': 'grow_gpu', 'subsample': .5, 'n_jobs': -1}
|
||||
params = {'updater': 'grow_gpu_hist', 'subsample': .5, 'n_jobs': -1}
|
||||
with pytest.raises(TypeError):
|
||||
clf = xgb.XGBClassifier(n_jobs=1000, **params)
|
||||
assert isinstance(clf, xgb.XGBClassifier)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user