diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 14e16eb63..289a7c835 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -19,7 +19,8 @@ /*! * \brief Whether always log console message with time. * It will display like, with timestamp appended to head of the message. - * "[21:47:50] 6513x126 matrix with 143286 entries loaded from ../data/agaricus.txt.train" + * "[21:47:50] 6513x126 matrix with 143286 entries loaded from + * ../data/agaricus.txt.train" */ #ifndef XGBOOST_LOG_WITH_TIME #define XGBOOST_LOG_WITH_TIME 1 @@ -36,7 +37,7 @@ * \brief Whether to customize global PRNG. */ #ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG -#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG XGBOOST_STRICT_R_MODE +#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG XGBOOST_STRICT_R_MODE #endif /*! @@ -48,16 +49,27 @@ #define XGBOOST_ALIGNAS(X) #endif -#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ >= 8 && !defined(__CUDACC__) +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ >= 8 && \ + !defined(__CUDACC__) #include #define XGBOOST_PARALLEL_SORT(X, Y, Z) __gnu_parallel::sort((X), (Y), (Z)) -#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) __gnu_parallel::stable_sort((X), (Y), (Z)) +#define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) \ + __gnu_parallel::stable_sort((X), (Y), (Z)) #else #define XGBOOST_PARALLEL_SORT(X, Y, Z) std::sort((X), (Y), (Z)) #define XGBOOST_PARALLEL_STABLE_SORT(X, Y, Z) std::stable_sort((X), (Y), (Z)) #endif -/*! \brief namespace of xgboo st*/ +/*! + * \brief Tag function as usable by device + */ +#ifdef __NVCC__ +#define XGBOOST_DEVICE __host__ __device__ +#else +#define XGBOOST_DEVICE +#endif + +/*! \brief namespace of xgboost*/ namespace xgboost { /*! * \brief unsigned integer type used in boost, @@ -76,8 +88,41 @@ struct bst_gpair { bst_float grad; /*! \brief second order gradient statistics */ bst_float hess; - bst_gpair() {} - bst_gpair(bst_float grad, bst_float hess) : grad(grad), hess(hess) {} + + XGBOOST_DEVICE bst_gpair() : grad(0), hess(0) {} + + XGBOOST_DEVICE bst_gpair(bst_float grad, bst_float hess) + : grad(grad), hess(hess) {} + + XGBOOST_DEVICE bst_gpair &operator+=(const bst_gpair &rhs) { + grad += rhs.grad; + hess += rhs.hess; + return *this; + } + + XGBOOST_DEVICE bst_gpair operator+(const bst_gpair &rhs) const { + bst_gpair g; + g.grad = grad + rhs.grad; + g.hess = hess + rhs.hess; + return g; + } + + XGBOOST_DEVICE bst_gpair &operator-=(const bst_gpair &rhs) { + grad -= rhs.grad; + hess -= rhs.hess; + return *this; + } + + XGBOOST_DEVICE bst_gpair operator-(const bst_gpair &rhs) const { + bst_gpair g; + g.grad = grad - rhs.grad; + g.hess = hess - rhs.hess; + return g; + } + + XGBOOST_DEVICE bst_gpair(int value) { + *this = bst_gpair(static_cast(value), static_cast(value)); + } }; /*! \brief small eps gap for minimum split decision. */ diff --git a/plugin/updater_gpu/src/common.cuh b/plugin/updater_gpu/src/common.cuh index 452a0c6a0..a427c11d1 100644 --- a/plugin/updater_gpu/src/common.cuh +++ b/plugin/updater_gpu/src/common.cuh @@ -15,33 +15,29 @@ namespace xgboost { namespace tree { -// When we split on a value which has no left neighbour, define its left -// neighbour as having left_fvalue = current_fvalue - FVALUE_EPS -// This produces a split value slightly lower than the current instance -#define FVALUE_EPS 0.0001 __device__ inline float device_calc_loss_chg(const GPUTrainingParam& param, - const gpu_gpair& scan, - const gpu_gpair& missing, - const gpu_gpair& parent_sum, + const bst_gpair& scan, + const bst_gpair& missing, + const bst_gpair& parent_sum, const float& parent_gain, bool missing_left) { - gpu_gpair left = scan; + bst_gpair left = scan; if (missing_left) { left += missing; } - gpu_gpair right = parent_sum - left; + bst_gpair right = parent_sum - left; - float left_gain = CalcGain(param, left.grad(), left.hess()); - float right_gain = CalcGain(param, right.grad(), right.hess()); + float left_gain = CalcGain(param, left.grad, left.hess); + float right_gain = CalcGain(param, right.grad, right.hess); return left_gain + right_gain - parent_gain; } -__device__ float inline loss_chg_missing(const gpu_gpair& scan, - const gpu_gpair& missing, - const gpu_gpair& parent_sum, +__device__ float inline loss_chg_missing(const bst_gpair& scan, + const bst_gpair& missing, + const bst_gpair& parent_sum, const float& parent_gain, const GPUTrainingParam& param, bool& missing_left_out) { // NOLINT @@ -134,39 +130,39 @@ inline void dense2sparse_tree(RegTree* p_tree, tree[nid].set_split(n.split.findex, n.split.fvalue, n.split.missing_left); tree.stat(nid).loss_chg = n.split.loss_chg; tree.stat(nid).base_weight = n.weight; - tree.stat(nid).sum_hess = n.sum_gradients.hess(); + tree.stat(nid).sum_hess = n.sum_gradients.hess; tree[tree[nid].cleft()].set_leaf(0); tree[tree[nid].cright()].set_leaf(0); nid++; } else if (flag == LEAF) { tree[nid].set_leaf(n.weight * param.learning_rate); - tree.stat(nid).sum_hess = n.sum_gradients.hess(); + tree.stat(nid).sum_hess = n.sum_gradients.hess; nid++; } } } // Set gradient pair to 0 with p = 1 - subsample -inline void subsample_gpair(dh::dvec* p_gpair, float subsample, +inline void subsample_gpair(dh::dvec* p_gpair, float subsample, int offset) { if (subsample == 1.0) { return; } - dh::dvec& gpair = *p_gpair; + dh::dvec& gpair = *p_gpair; auto d_gpair = gpair.data(); dh::BernoulliRng rng(subsample, common::GlobalRandom()()); dh::launch_n(gpair.device_idx(), gpair.size(), [=] __device__(int i) { if (!rng(i + offset)) { - d_gpair[i] = gpu_gpair(); + d_gpair[i] = bst_gpair(); } }); } // Set gradient pair to 0 with p = 1 - subsample -inline void subsample_gpair(dh::dvec* p_gpair, float subsample) { +inline void subsample_gpair(dh::dvec* p_gpair, float subsample) { int offset = 0; subsample_gpair(p_gpair, subsample, offset); } @@ -182,11 +178,11 @@ inline std::vector col_sample(std::vector features, float colsample) { } struct GpairCallbackOp { // Running prefix - gpu_gpair running_total; + bst_gpair running_total; // Constructor - __device__ GpairCallbackOp() : running_total(gpu_gpair()) {} - __device__ gpu_gpair operator()(gpu_gpair block_aggregate) { - gpu_gpair old_prefix = running_total; + __device__ GpairCallbackOp() : running_total(bst_gpair()) {} + __device__ bst_gpair operator()(bst_gpair block_aggregate) { + bst_gpair old_prefix = running_total; running_total += block_aggregate; return old_prefix; } diff --git a/plugin/updater_gpu/src/exact/argmax_by_key.cuh b/plugin/updater_gpu/src/exact/argmax_by_key.cuh index 01935b5fa..f66e9d085 100644 --- a/plugin/updater_gpu/src/exact/argmax_by_key.cuh +++ b/plugin/updater_gpu/src/exact/argmax_by_key.cuh @@ -17,8 +17,8 @@ #include "../../../../src/tree/param.h" #include "../common.cuh" -#include "loss_functions.cuh" #include "node.cuh" +#include "../types.cuh" namespace xgboost { namespace tree { @@ -66,10 +66,10 @@ DEV_INLINE void atomicArgMax(Split* address, Split val) { template DEV_INLINE void argMaxWithAtomics( - int id, Split* nodeSplits, const gpu_gpair* gradScans, - const gpu_gpair* gradSums, const float* vals, const int* colIds, + int id, Split* nodeSplits, const bst_gpair* gradScans, + const bst_gpair* gradSums, const float* vals, const int* colIds, const node_id_t* nodeAssigns, const Node* nodes, int nUniqKeys, - node_id_t nodeStart, int len, const TrainParam& param) { + node_id_t nodeStart, int len, const GPUTrainingParam& param) { int nodeId = nodeAssigns[id]; ///@todo: this is really a bad check! but will be fixed when we move /// to key-based reduction @@ -78,14 +78,14 @@ DEV_INLINE void argMaxWithAtomics( (vals[id] == vals[id - 1]))) { if (nodeId != UNUSED_NODE) { int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys); - gpu_gpair colSum = gradSums[sumId]; + bst_gpair colSum = gradSums[sumId]; int uid = nodeId - nodeStart; Node n = nodes[nodeId]; - gpu_gpair parentSum = n.gradSum; + bst_gpair parentSum = n.gradSum; float parentGain = n.score; bool tmp; Split s; - gpu_gpair missing = parentSum - colSum; + bst_gpair missing = parentSum - colSum; s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain, param, tmp); s.index = id; @@ -96,7 +96,7 @@ DEV_INLINE void argMaxWithAtomics( template __global__ void atomicArgMaxByKeyGmem( - Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums, + Split* nodeSplits, const bst_gpair* gradScans, const bst_gpair* gradSums, const float* vals, const int* colIds, const node_id_t* nodeAssigns, const Node* nodes, int nUniqKeys, node_id_t nodeStart, int len, const TrainParam param) { @@ -104,13 +104,13 @@ __global__ void atomicArgMaxByKeyGmem( const int stride = blockDim.x * gridDim.x; for (; id < len; id += stride) { argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds, - nodeAssigns, nodes, nUniqKeys, nodeStart, len, param); + nodeAssigns, nodes, nUniqKeys, nodeStart, len, GPUTrainingParam(param)); } } template __global__ void atomicArgMaxByKeySmem( - Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums, + Split* nodeSplits, const bst_gpair* gradScans, const bst_gpair* gradSums, const float* vals, const int* colIds, const node_id_t* nodeAssigns, const Node* nodes, int nUniqKeys, node_id_t nodeStart, int len, const TrainParam param) { @@ -153,8 +153,8 @@ __global__ void atomicArgMaxByKeySmem( * @param algo which algorithm to use for argmax_by_key */ template -void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans, - const gpu_gpair* gradSums, const float* vals, +void argMaxByKey(Split* nodeSplits, const bst_gpair* gradScans, + const bst_gpair* gradSums, const float* vals, const int* colIds, const node_id_t* nodeAssigns, const Node* nodes, int nUniqKeys, node_id_t nodeStart, int len, const TrainParam param, diff --git a/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh b/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh index 92b48ad7a..c56275ebc 100644 --- a/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh +++ b/plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh @@ -16,7 +16,6 @@ #pragma once #include "../common.cuh" -#include "gradients.cuh" namespace xgboost { namespace tree { @@ -24,11 +23,11 @@ namespace exact { /** * @struct Pair fused_scan_reduce_by_key.cuh - * @brief Pair used for key basd scan operations on gpu_gpair + * @brief Pair used for key basd scan operations on bst_gpair */ struct Pair { int key; - gpu_gpair value; + bst_gpair value; }; /** define a key that's not used at all in the entire boosting process */ @@ -61,15 +60,27 @@ struct AddByKey { } }; +/** +* @brief Gradient value getter function +* @param id the index into the vals or instIds array to which to fetch +* @param vals the gradient value buffer +* @param instIds instance index buffer +* @return the expected gradient value +*/ +HOST_DEV_INLINE bst_gpair get(int id, const bst_gpair* vals, const int* instIds) { + id = instIds[id]; + return vals[id]; +} + template -__global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals, - const int* instIds, gpu_gpair* mScans, +__global__ void cubScanByKeyL1(bst_gpair* scans, const bst_gpair* vals, + const int* instIds, bst_gpair* mScans, int* mKeys, const node_id_t* keys, int nUniqKeys, const int* colIds, node_id_t nodeStart, const int size) { - Pair rootPair = {NONE_KEY, gpu_gpair(0.f, 0.f)}; + Pair rootPair = {NONE_KEY, bst_gpair(0.f, 0.f)}; int myKey; - gpu_gpair myValue; + bst_gpair myValue; typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; Pair threadData; @@ -98,14 +109,14 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals, } if (threadIdx.x == BLKDIM_L1L3 - 1) { threadData.value = - (myKey == previousKey) ? threadData.value : gpu_gpair(0.0f, 0.0f); + (myKey == previousKey) ? threadData.value : bst_gpair(0.0f, 0.0f); mKeys[blockIdx.x] = myKey; mScans[blockIdx.x] = threadData.value + myValue; } } template -__global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) { +__global__ void cubScanByKeyL2(bst_gpair* mScans, int* mKeys, int mLength) { typedef cub::BlockScan BlockScan; Pair threadData; __shared__ typename BlockScan::TempStorage temp_storage; @@ -119,9 +130,9 @@ __global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) { } template -__global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans, - const gpu_gpair* vals, const int* instIds, - const gpu_gpair* mScans, const int* mKeys, +__global__ void cubScanByKeyL3(bst_gpair* sums, bst_gpair* scans, + const bst_gpair* vals, const int* instIds, + const bst_gpair* mScans, const int* mKeys, const node_id_t* keys, int nUniqKeys, const int* colIds, node_id_t nodeStart, const int size) { @@ -130,19 +141,19 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans, // to avoid the following warning from nvcc: // __shared__ memory variable with non-empty constructor or destructor // (potential race between threads) - __shared__ char gradBuff[sizeof(gpu_gpair)]; + __shared__ char gradBuff[sizeof(bst_gpair)]; __shared__ int s_mKeys; - gpu_gpair* s_mScans = (gpu_gpair*)gradBuff; + bst_gpair* s_mScans = (bst_gpair*)gradBuff; if (tid >= size) return; // cache block-wide partial scan info if (relId == 0) { s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY; - s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : gpu_gpair(); + s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : bst_gpair(); } int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys); int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid - 1, keys, colIds, nodeStart, nUniqKeys); - gpu_gpair myValue = scans[tid]; + bst_gpair myValue = scans[tid]; __syncthreads(); if (blockIdx.x > 0 && s_mKeys == previousKey) { myValue += s_mScans[0]; @@ -152,7 +163,7 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans, } if ((previousKey != myKey) && (previousKey >= 0)) { sums[previousKey] = myValue; - myValue = gpu_gpair(0.0f, 0.0f); + myValue = bst_gpair(0.0f, 0.0f); } scans[tid] = myValue; } @@ -178,12 +189,12 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans, * @param nodeStart index of the leftmost node in the current level */ template -void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals, +void reduceScanByKey(bst_gpair* sums, bst_gpair* scans, const bst_gpair* vals, const int* instIds, const node_id_t* keys, int size, - int nUniqKeys, int nCols, gpu_gpair* tmpScans, + int nUniqKeys, int nCols, bst_gpair* tmpScans, int* tmpKeys, const int* colIds, node_id_t nodeStart) { int nBlks = dh::div_round_up(size, BLKDIM_L1L3); - cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(gpu_gpair)); + cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(bst_gpair)); cubScanByKeyL1<<>>( scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds, nodeStart, size); diff --git a/plugin/updater_gpu/src/exact/gpu_builder.cuh b/plugin/updater_gpu/src/exact/gpu_builder.cuh index 7b9b87cee..e6dfb50a5 100644 --- a/plugin/updater_gpu/src/exact/gpu_builder.cuh +++ b/plugin/updater_gpu/src/exact/gpu_builder.cuh @@ -19,13 +19,11 @@ #include #include "../../../../src/tree/param.h" #include "../common.cuh" -#include "argmax_by_key.cuh" -#include "cub/cub.cuh" -#include "fused_scan_reduce_by_key.cuh" -#include "gradients.cuh" -#include "loss_functions.cuh" +#include #include "node.cuh" #include "split2node.cuh" +#include "argmax_by_key.cuh" +#include "fused_scan_reduce_by_key.cuh" #include "xgboost/tree_updater.h" namespace xgboost { @@ -33,13 +31,13 @@ namespace tree { namespace exact { template -__global__ void initRootNode(Node* nodes, const gpu_gpair* sums, +__global__ void initRootNode(Node* nodes, const bst_gpair* sums, const TrainParam param) { // gradients already evaluated inside transferGrads Node n; n.gradSum = sums[0]; - n.score = CalcGain(param, n.gradSum.g, n.gradSum.h); - n.weight = CalcWeight(param, n.gradSum.g, n.gradSum.h); + n.score = CalcGain(param, n.gradSum.grad , n.gradSum.hess); + n.weight = CalcWeight(param, n.gradSum.grad , n.gradSum.hess); n.id = 0; nodes[0] = n; } @@ -198,13 +196,13 @@ class GPUBuilder { dh::dvec instIds_cached; /** column offsets for these feature values */ dh::dvec colOffsets; - dh::dvec gradsInst; + dh::dvec gradsInst; dh::dvec2 nodeAssigns; dh::dvec2 nodeLocations; dh::dvec> nodes; dh::dvec nodeAssignsPerInst; - dh::dvec gradSums; - dh::dvec gradScans; + dh::dvec gradSums; + dh::dvec gradScans; dh::dvec nodeSplits; int nVals; int nRows; @@ -212,7 +210,7 @@ class GPUBuilder { int maxNodes; int maxLeaves; dh::CubMemory tmp_mem; - dh::dvec tmpScanGradBuff; + dh::dvec tmpScanGradBuff; dh::dvec tmpScanKeyBuff; dh::dvec colIds; dh::bulk_allocator ba; @@ -310,10 +308,10 @@ class GPUBuilder { void transferGrads(const std::vector& gpair) { // HACK dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]), - sizeof(gpu_gpair) * nRows, + sizeof(bst_gpair) * nRows, cudaMemcpyHostToDevice)); // evaluate the full-grad reduction for the root node - sumReduction(tmp_mem, gradsInst, gradSums, nRows); + sumReduction(tmp_mem, gradsInst, gradSums, nRows); } void initNodeData(int level, node_id_t nodeStart, int nNodes) { @@ -371,13 +369,13 @@ class GPUBuilder { const Node& n = hNodes[i]; if ((i != 0) && hNodes[i].isLeaf()) { tree[nodeId].set_leaf(n.weight * param.learning_rate); - tree.stat(nodeId).sum_hess = n.gradSum.h; + tree.stat(nodeId).sum_hess = n.gradSum.hess; ++nodeId; } else if (!hNodes[i].isUnused()) { tree.AddChilds(nodeId); tree[nodeId].set_split(n.colIdx, n.threshold, n.dir == LeftDir); tree.stat(nodeId).loss_chg = n.score; - tree.stat(nodeId).sum_hess = n.gradSum.h; + tree.stat(nodeId).sum_hess = n.gradSum.hess; tree.stat(nodeId).base_weight = n.weight; tree[tree[nodeId].cleft()].set_leaf(0); tree[tree[nodeId].cright()].set_leaf(0); diff --git a/plugin/updater_gpu/src/exact/gradients.cuh b/plugin/updater_gpu/src/exact/gradients.cuh deleted file mode 100644 index 96cc290c5..000000000 --- a/plugin/updater_gpu/src/exact/gradients.cuh +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights - * reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "../common.cuh" - -namespace xgboost { -namespace tree { -namespace exact { - -/** - * @struct gpu_gpair gradients.cuh - * @brief The first/second order gradients for iteratively building the tree - */ -struct gpu_gpair { - /** the 'g_i' as it appears in the xgboost paper */ - float g; - /** the 'h_i' as it appears in the xgboost paper */ - float h; - - HOST_DEV_INLINE gpu_gpair() : g(0.f), h(0.f) {} - HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h) : g(_g), h(_h) {} - HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a) : g(a.g), h(a.h) {} - - /** - * @brief Checks whether the hessian is more than the defined weight - * @param minWeight minimum weight to be compared against - * @return true if the hessian is greater than the minWeight - * @note this is useful in deciding whether to further split to child node - */ - HOST_DEV_INLINE bool isSplittable(float minWeight) const { - return (h > minWeight); - } - - HOST_DEV_INLINE gpu_gpair& operator+=(const gpu_gpair& a) { - g += a.g; - h += a.h; - return *this; - } - - HOST_DEV_INLINE gpu_gpair& operator-=(const gpu_gpair& a) { - g -= a.g; - h -= a.h; - return *this; - } - - HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a, - const gpu_gpair& b) { - return gpu_gpair(a.g + b.g, a.h + b.h); - } - - HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a, - const gpu_gpair& b) { - return gpu_gpair(a.g - b.g, a.h - b.h); - } - - HOST_DEV_INLINE gpu_gpair(int value) { - *this = gpu_gpair((float)value, (float)value); - } -}; - -/** - * @brief Gradient value getter function - * @param id the index into the vals or instIds array to which to fetch - * @param vals the gradient value buffer - * @param instIds instance index buffer - * @return the expected gradient value - */ -HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, - const int* instIds) { - id = instIds[id]; - return vals[id]; -} - -} // namespace exact -} // namespace tree -} // namespace xgboost diff --git a/plugin/updater_gpu/src/exact/loss_functions.cuh b/plugin/updater_gpu/src/exact/loss_functions.cuh deleted file mode 100644 index 870b84d3c..000000000 --- a/plugin/updater_gpu/src/exact/loss_functions.cuh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights - * reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include "../common.cuh" -#include "gradients.cuh" - -namespace xgboost { -namespace tree { -namespace exact { - -HOST_DEV_INLINE float device_calc_loss_chg( - const TrainParam ¶m, const gpu_gpair &scan, const gpu_gpair &missing, - const gpu_gpair &parent_sum, const float &parent_gain, bool missing_left) { - gpu_gpair left = scan; - if (missing_left) { - left += missing; - } - gpu_gpair right = parent_sum - left; - float left_gain = CalcGain(param, left.g, left.h); - float right_gain = CalcGain(param, right.g, right.h); - return left_gain + right_gain - parent_gain; -} - -HOST_DEV_INLINE float loss_chg_missing(const gpu_gpair &scan, - const gpu_gpair &missing, - const gpu_gpair &parent_sum, - const float &parent_gain, - const TrainParam ¶m, - bool &missing_left_out) { - float missing_left_loss = - device_calc_loss_chg(param, scan, missing, parent_sum, parent_gain, true); - float missing_right_loss = device_calc_loss_chg( - param, scan, missing, parent_sum, parent_gain, false); - if (missing_left_loss >= missing_right_loss) { - missing_left_out = true; - return missing_left_loss; - } else { - missing_left_out = false; - return missing_right_loss; - } -} - -} // namespace exact -} // namespace tree -} // namespace xgboost diff --git a/plugin/updater_gpu/src/exact/node.cuh b/plugin/updater_gpu/src/exact/node.cuh index 4c62f26fc..f4c9186e3 100644 --- a/plugin/updater_gpu/src/exact/node.cuh +++ b/plugin/updater_gpu/src/exact/node.cuh @@ -17,7 +17,6 @@ #pragma once #include "../common.cuh" -#include "gradients.cuh" namespace xgboost { namespace tree { @@ -67,7 +66,7 @@ template class Node { public: /** sum of gradients across all training samples part of this node */ - gpu_gpair gradSum; + bst_gpair gradSum; /** the optimal score for this node */ float score; /** weightage for this node */ diff --git a/plugin/updater_gpu/src/exact/split2node.cuh b/plugin/updater_gpu/src/exact/split2node.cuh index dcdc2b4c8..083bd76e1 100644 --- a/plugin/updater_gpu/src/exact/split2node.cuh +++ b/plugin/updater_gpu/src/exact/split2node.cuh @@ -16,8 +16,6 @@ #pragma once #include "../../../../src/tree/param.h" -#include "gradients.cuh" -#include "loss_functions.cuh" #include "node.cuh" namespace xgboost { @@ -37,11 +35,11 @@ namespace exact { */ template DEV_INLINE void updateOneChildNode(Node* nodes, int nid, - const gpu_gpair& grad, + const bst_gpair& grad, const TrainParam& param) { nodes[nid].gradSum = grad; - nodes[nid].score = CalcGain(param, grad.g, grad.h); - nodes[nid].weight = CalcWeight(param, grad.g, grad.h); + nodes[nid].score = CalcGain(param, grad.grad, grad.hess); + nodes[nid].weight = CalcWeight(param, grad.grad, grad.hess); nodes[nid].id = nid; } @@ -56,7 +54,7 @@ DEV_INLINE void updateOneChildNode(Node* nodes, int nid, */ template DEV_INLINE void updateChildNodes(Node* nodes, int pid, - const gpu_gpair& gradL, const gpu_gpair& gradR, + const bst_gpair& gradL, const bst_gpair& gradR, const TrainParam& param) { int childId = (pid * 2) + 1; updateOneChildNode(nodes, childId, gradL, param); @@ -66,15 +64,15 @@ DEV_INLINE void updateChildNodes(Node* nodes, int pid, template DEV_INLINE void updateNodeAndChildren(Node* nodes, const Split& s, const Node& n, int absNodeId, - int colId, const gpu_gpair& gradScan, - const gpu_gpair& colSum, float thresh, + int colId, const bst_gpair& gradScan, + const bst_gpair& colSum, float thresh, const TrainParam& param) { bool missingLeft = true; // get the default direction for the current node - gpu_gpair missing = n.gradSum - colSum; + bst_gpair missing = n.gradSum - colSum; loss_chg_missing(gradScan, missing, n.gradSum, n.score, param, missingLeft); // get the score/weight/id/gradSum for left and right child nodes - gpu_gpair lGradSum, rGradSum; + bst_gpair lGradSum, rGradSum; if (missingLeft) { lGradSum = gradScan + n.gradSum - colSum; } else { @@ -90,8 +88,8 @@ DEV_INLINE void updateNodeAndChildren(Node* nodes, const Split& s, template __global__ void split2nodeKernel( - Node* nodes, const Split* nodeSplits, const gpu_gpair* gradScans, - const gpu_gpair* gradSums, const float* vals, const int* colIds, + Node* nodes, const Split* nodeSplits, const bst_gpair* gradScans, + const bst_gpair* gradSums, const float* vals, const int* colIds, const int* colOffsets, const node_id_t* nodeAssigns, int nUniqKeys, node_id_t nodeStart, int nCols, const TrainParam param) { int uid = (blockIdx.x * blockDim.x) + threadIdx.x; @@ -132,7 +130,7 @@ __global__ void split2nodeKernel( */ template void split2node(Node* nodes, const Split* nodeSplits, - const gpu_gpair* gradScans, const gpu_gpair* gradSums, + const bst_gpair* gradScans, const bst_gpair* gradSums, const float* vals, const int* colIds, const int* colOffsets, const node_id_t* nodeAssigns, int nUniqKeys, node_id_t nodeStart, int nCols, const TrainParam param) { diff --git a/plugin/updater_gpu/src/functions.cuh b/plugin/updater_gpu/src/functions.cuh deleted file mode 100644 index 34426d7d5..000000000 --- a/plugin/updater_gpu/src/functions.cuh +++ /dev/null @@ -1,11 +0,0 @@ -/*! - * Copyright 2016 Rory mitchell -*/ -#pragma once -#include "../../../src/common/random.h" -#include "../../../src/tree/param.h" -#include "types.cuh" - -namespace xgboost { -namespace tree {} // namespace tree -} // namespace xgboost diff --git a/plugin/updater_gpu/src/gpu_hist_builder.cu b/plugin/updater_gpu/src/gpu_hist_builder.cu index dabe4b1cf..3a236e49f 100644 --- a/plugin/updater_gpu/src/gpu_hist_builder.cu +++ b/plugin/updater_gpu/src/gpu_hist_builder.cu @@ -40,10 +40,10 @@ void DeviceHist::Init(int n_bins_in) { void DeviceHist::Reset(int device_idx) { cudaSetDevice(device_idx); - data.fill(gpu_gpair()); + data.fill(bst_gpair()); } -gpu_gpair* DeviceHist::GetLevelPtr(int depth) { +bst_gpair* DeviceHist::GetLevelPtr(int depth) { return data.data() + n_nodes(depth - 1) * n_bins; } @@ -53,20 +53,20 @@ HistBuilder DeviceHist::GetBuilder() { return HistBuilder(data.data(), n_bins); } -HistBuilder::HistBuilder(gpu_gpair* ptr, int n_bins) +HistBuilder::HistBuilder(bst_gpair* ptr, int n_bins) : d_hist(ptr), n_bins(n_bins) {} -__device__ void HistBuilder::Add(gpu_gpair gpair, int gidx, int nidx) const { +__device__ void HistBuilder::Add(bst_gpair gpair, int gidx, int nidx) const { int hist_idx = nidx * n_bins + gidx; - atomicAdd(&(d_hist[hist_idx]._grad), gpair._grad); // OPTMARK: This and below + atomicAdd(&(d_hist[hist_idx].grad), gpair.grad); // OPTMARK: This and below // line lead to about 3X // slowdown due to memory // dependency and access // pattern issues. - atomicAdd(&(d_hist[hist_idx]._hess), gpair._hess); + atomicAdd(&(d_hist[hist_idx].hess), gpair.hess); } -__device__ gpu_gpair HistBuilder::Get(int gidx, int nidx) const { +__device__ bst_gpair HistBuilder::Get(int gidx, int nidx) const { return d_hist[nidx * n_bins + gidx]; } @@ -362,7 +362,7 @@ void GPUHistBuilder::BuildHist(int depth) { if (!is_smallest && depth > 0) return; int gidx = d_gidx[local_idx]; - gpu_gpair gpair = d_gpair[ridx - row_begin]; + bst_gpair gpair = d_gpair[ridx - row_begin]; hist_builder.Add(gpair, gidx, nidx); // OPTMARK: This is slow, could use // shared memory or cache results @@ -382,14 +382,14 @@ void GPUHistBuilder::BuildHist(int depth) { // TODO(JCM): use out of place with pre-allocated buffer, but then have to // copy // back on device - // fprintf(stderr,"sizeof(gpu_gpair)/sizeof(float)=%d\n",sizeof(gpu_gpair)/sizeof(float)); + // fprintf(stderr,"sizeof(bst_gpair)/sizeof(float)=%d\n",sizeof(bst_gpair)/sizeof(float)); for (int d_idx = 0; d_idx < n_devices; d_idx++) { int device_idx = dList[d_idx]; dh::safe_cuda(cudaSetDevice(device_idx)); dh::safe_nccl(ncclAllReduce( reinterpret_cast(hist_vec[d_idx].GetLevelPtr(depth)), reinterpret_cast(hist_vec[d_idx].GetLevelPtr(depth)), - hist_vec[d_idx].LevelSize(depth) * sizeof(gpu_gpair) / sizeof(float), + hist_vec[d_idx].LevelSize(depth) * sizeof(bst_gpair) / sizeof(float), ncclFloat, ncclSum, comms[d_idx], *(streams[d_idx]))); } @@ -423,9 +423,9 @@ void GPUHistBuilder::BuildHist(int depth) { } int gidx = idx % hist_builder.n_bins; - gpu_gpair parent = hist_builder.Get(gidx, parent_nidx(nidx)); + bst_gpair parent = hist_builder.Get(gidx, parent_nidx(nidx)); int other_nidx = left_smallest ? nidx - 1 : nidx + 1; - gpu_gpair other = hist_builder.Get(gidx, other_nidx); + bst_gpair other = hist_builder.Get(gidx, other_nidx); hist_builder.Add(parent - other, gidx, nidx); // OPTMARK: This is slow, could use shared // memory or cache results intead of writing to @@ -438,16 +438,16 @@ void GPUHistBuilder::BuildHist(int depth) { template __global__ void find_split_kernel( - const gpu_gpair* d_level_hist, int* d_feature_segments, int depth, + const bst_gpair* d_level_hist, int* d_feature_segments, int depth, int n_features, int n_bins, Node* d_nodes, Node* d_nodes_temp, Node* d_nodes_child_temp, int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map, GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp, bool colsample, int* d_feature_flags) { typedef cub::KeyValuePair ArgMaxT; - typedef cub::BlockScan + typedef cub::BlockScan BlockScanT; typedef cub::BlockReduce MaxReduceT; - typedef cub::BlockReduce SumReduceT; + typedef cub::BlockReduce SumReduceT; union TempStorage { typename BlockScanT::TempStorage scan; @@ -456,12 +456,12 @@ __global__ void find_split_kernel( }; struct UninitializedSplit : cub::Uninitialized {}; - struct UninitializedGpair : cub::Uninitialized {}; + struct UninitializedGpair : cub::Uninitialized {}; __shared__ UninitializedSplit uninitialized_split; Split& split = uninitialized_split.Alias(); __shared__ UninitializedGpair uninitialized_sum; - gpu_gpair& shared_sum = uninitialized_sum.Alias(); + bst_gpair& shared_sum = uninitialized_sum.Alias(); __shared__ ArgMaxT block_max; __shared__ TempStorage temp_storage; @@ -484,12 +484,12 @@ __global__ void find_split_kernel( int gidx = (begin - (level_node_idx * n_bins)) + threadIdx.x; bool thread_active = threadIdx.x < end - begin; - gpu_gpair feature_sum = gpu_gpair(); + bst_gpair feature_sum = bst_gpair(); for (int reduce_begin = begin; reduce_begin < end; reduce_begin += BLOCK_THREADS) { // Scan histogram - gpu_gpair bin = thread_active ? d_level_hist[reduce_begin + threadIdx.x] - : gpu_gpair(); + bst_gpair bin = thread_active ? d_level_hist[reduce_begin + threadIdx.x] + : bst_gpair(); feature_sum += SumReduceT(temp_storage.sum_reduce).Reduce(bin, cub::Sum()); @@ -503,17 +503,17 @@ __global__ void find_split_kernel( GpairCallbackOp prefix_op = GpairCallbackOp(); for (int scan_begin = begin; scan_begin < end; scan_begin += BLOCK_THREADS) { - gpu_gpair bin = - thread_active ? d_level_hist[scan_begin + threadIdx.x] : gpu_gpair(); + bst_gpair bin = + thread_active ? d_level_hist[scan_begin + threadIdx.x] : bst_gpair(); BlockScanT(temp_storage.scan) .ExclusiveScan(bin, bin, cub::Sum(), prefix_op); // Calculate gain - gpu_gpair parent_sum = d_nodes[node_idx].sum_gradients; + bst_gpair parent_sum = d_nodes[node_idx].sum_gradients; float parent_gain = d_nodes[node_idx].root_gain; - gpu_gpair missing = parent_sum - shared_sum; + bst_gpair missing = parent_sum - shared_sum; bool missing_left; float gain = thread_active @@ -543,8 +543,8 @@ __global__ void find_split_kernel( fvalue = d_gidx_fvalue_map[gidx - 1]; } - gpu_gpair left = missing_left ? bin + missing : bin; - gpu_gpair right = parent_sum - left; + bst_gpair left = missing_left ? bin + missing : bin; + bst_gpair right = parent_sum - left; split.Update(gain, missing_left, fvalue, fidx, left, right, gpu_param); } @@ -581,16 +581,16 @@ __global__ void find_split_kernel( *Nodeleft = Node( split.left_sum, - CalcGain(gpu_param, split.left_sum.grad(), split.left_sum.hess()), - CalcWeight(gpu_param, split.left_sum.grad(), split.left_sum.hess())); + CalcGain(gpu_param, split.left_sum.grad, split.left_sum.hess), + CalcWeight(gpu_param, split.left_sum.grad, split.left_sum.hess)); *Noderight = Node( split.right_sum, - CalcGain(gpu_param, split.right_sum.grad(), split.right_sum.hess()), - CalcWeight(gpu_param, split.right_sum.grad(), split.right_sum.hess())); + CalcGain(gpu_param, split.right_sum.grad, split.right_sum.hess), + CalcWeight(gpu_param, split.right_sum.grad, split.right_sum.hess)); // Record smallest node - if (split.left_sum.hess() <= split.right_sum.hess()) { + if (split.left_sum.hess <= split.right_sum.hess) { *left_child_smallest = true; } else { *left_child_smallest = false; @@ -654,11 +654,11 @@ void GPUHistBuilder::LaunchFindSplit(int depth) { int nodes_offset_device = d_idx * num_nodes_device; find_split_kernel<<>>( - (const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), + (const bst_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), feature_segments[d_idx].data(), depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(), nodes_child_temp[d_idx].data(), nodes_offset_device, - fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(), gpu_param, + fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param), left_child_smallest_temp[d_idx].data(), colsample, feature_flags[d_idx].data()); } @@ -751,11 +751,11 @@ void GPUHistBuilder::LaunchFindSplit(int depth) { int nodes_offset_device = d_idx * num_nodes_device; find_split_kernel<<>>( - (const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), + (const bst_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), feature_segments[d_idx].data(), depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL, nodes_offset_device, fidx_min_map[d_idx].data(), - gidx_fvalue_map[d_idx].data(), gpu_param, + gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param), left_child_smallest[d_idx].data(), colsample, feature_flags[d_idx].data()); @@ -805,11 +805,11 @@ void GPUHistBuilder::LaunchFindSplit(int depth) { int nodes_offset_device = 0; find_split_kernel<<>>( - (const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), + (const bst_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)), feature_segments[d_idx].data(), depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL, nodes_offset_device, fidx_min_map[d_idx].data(), - gidx_fvalue_map[d_idx].data(), gpu_param, + gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param), left_child_smallest[d_idx].data(), colsample, feature_flags[d_idx].data()); } @@ -827,21 +827,21 @@ void GPUHistBuilder::InitFirstNode(const std::vector& gpair) { // and C:/Program Files (x86)/Microsoft Visual Studio // 14.0/VC/bin/../../VC/INCLUDE\future(1888): error : no instance of function // template "std::_Invoke_stored" matches the argument list - std::vector future_results(n_devices); + std::vector future_results(n_devices); for (int d_idx = 0; d_idx < n_devices; d_idx++) { int device_idx = dList[d_idx]; auto begin = device_gpair[d_idx].tbegin(); auto end = device_gpair[d_idx].tend(); - gpu_gpair init = gpu_gpair(); - auto binary_op = thrust::plus(); + bst_gpair init = bst_gpair(); + auto binary_op = thrust::plus(); dh::safe_cuda(cudaSetDevice(device_idx)); future_results[d_idx] = thrust::reduce(begin, end, init, binary_op); } // sum over devices on host (with blocking get()) - gpu_gpair sum = gpu_gpair(); + bst_gpair sum = bst_gpair(); for (int d_idx = 0; d_idx < n_devices; d_idx++) { int device_idx = dList[d_idx]; sum += future_results[d_idx]; @@ -849,7 +849,7 @@ void GPUHistBuilder::InitFirstNode(const std::vector& gpair) { #else // asynch reduce per device - std::vector> future_results(n_devices); + std::vector> future_results(n_devices); for (int d_idx = 0; d_idx < n_devices; d_idx++) { // std::async captures the algorithm parameters by value // use std::launch::async to ensure the creation of a new thread @@ -858,14 +858,14 @@ void GPUHistBuilder::InitFirstNode(const std::vector& gpair) { dh::safe_cuda(cudaSetDevice(device_idx)); auto begin = device_gpair[d_idx].tbegin(); auto end = device_gpair[d_idx].tend(); - gpu_gpair init = gpu_gpair(); - auto binary_op = thrust::plus(); + bst_gpair init = bst_gpair(); + auto binary_op = thrust::plus(); return thrust::reduce(begin, end, init, binary_op); }); } // sum over devices on host (with blocking get()) - gpu_gpair sum = gpu_gpair(); + bst_gpair sum = bst_gpair(); for (int d_idx = 0; d_idx < n_devices; d_idx++) { int device_idx = dList[d_idx]; sum += future_results[d_idx].get(); @@ -879,15 +879,15 @@ void GPUHistBuilder::InitFirstNode(const std::vector& gpair) { int device_idx = dList[d_idx]; auto d_nodes = nodes[d_idx].data(); - auto gpu_param_alias = gpu_param; + auto gpu_param = GPUTrainingParam(param); dh::launch_n(device_idx, 1, [=] __device__(int idx) { - gpu_gpair sum_gradients = sum; + bst_gpair sum_gradients = sum; d_nodes[idx] = Node( sum_gradients, - CalcGain(gpu_param_alias, sum_gradients.grad(), sum_gradients.hess()), - CalcWeight(gpu_param_alias, sum_gradients.grad(), - sum_gradients.hess())); + CalcGain(gpu_param, sum_gradients.grad, sum_gradients.hess), + CalcWeight(gpu_param, sum_gradients.grad, + sum_gradients.hess)); }); } // synch all devices to host before moving on (No, can avoid because BuildHist @@ -916,7 +916,7 @@ void GPUHistBuilder::UpdatePositionDense(int depth) { size_t end = device_row_segments[d_idx + 1]; dh::launch_n(device_idx, end - begin, [=] __device__(int local_idx) { - NodeIdT pos = d_position[local_idx]; + int pos = d_position[local_idx]; if (!is_active(pos, depth)) { return; } @@ -961,7 +961,7 @@ void GPUHistBuilder::UpdatePositionSparse(int depth) { // Update missing direction dh::launch_n(device_idx, row_end - row_begin, [=] __device__(int local_idx) { - NodeIdT pos = d_position[local_idx]; + int pos = d_position[local_idx]; if (!is_active(pos, depth)) { d_position_tmp[local_idx] = pos; return; @@ -985,7 +985,7 @@ void GPUHistBuilder::UpdatePositionSparse(int depth) { dh::launch_n( device_idx, element_end - element_begin, [=] __device__(int local_idx) { int ridx = d_ridx[local_idx]; - NodeIdT pos = d_position[ridx - row_begin]; + int pos = d_position[ridx - row_begin]; if (!is_active(pos, depth)) { return; } diff --git a/plugin/updater_gpu/src/gpu_hist_builder.cuh b/plugin/updater_gpu/src/gpu_hist_builder.cuh index 115d94e54..a5cd57736 100644 --- a/plugin/updater_gpu/src/gpu_hist_builder.cuh +++ b/plugin/updater_gpu/src/gpu_hist_builder.cuh @@ -31,16 +31,16 @@ struct DeviceGMat { }; struct HistBuilder { - gpu_gpair *d_hist; + bst_gpair *d_hist; int n_bins; - __host__ __device__ HistBuilder(gpu_gpair *ptr, int n_bins); - __device__ void Add(gpu_gpair gpair, int gidx, int nidx) const; - __device__ gpu_gpair Get(int gidx, int nidx) const; + __host__ __device__ HistBuilder(bst_gpair *ptr, int n_bins); + __device__ void Add(bst_gpair gpair, int gidx, int nidx) const; + __device__ bst_gpair Get(int gidx, int nidx) const; }; struct DeviceHist { int n_bins; - dh::dvec data; + dh::dvec data; void Init(int max_depth); @@ -48,7 +48,7 @@ struct DeviceHist { HistBuilder GetBuilder(); - gpu_gpair *GetLevelPtr(int depth); + bst_gpair *GetLevelPtr(int depth); int LevelSize(int depth); }; @@ -61,8 +61,6 @@ class GPUHistBuilder { void UpdateParam(const TrainParam ¶m) { this->param = param; - this->gpu_param = GPUTrainingParam(param.min_child_weight, param.reg_lambda, - param.reg_alpha, param.max_delta_step); } void InitData(const std::vector &gpair, DMatrix &fmat, // NOLINT @@ -85,7 +83,6 @@ class GPUHistBuilder { std::vector *p_out_preds); TrainParam param; - GPUTrainingParam gpu_param; common::HistCutMatrix hmat_; common::GHistIndexMatrix gmat_; MetaInfo *info; @@ -124,7 +121,7 @@ class GPUHistBuilder { std::vector> position; std::vector> position_tmp; std::vector device_matrix; - std::vector> device_gpair; + std::vector> device_gpair; std::vector> gidx_feature_map; std::vector> gidx_fvalue_map; diff --git a/plugin/updater_gpu/src/types.cuh b/plugin/updater_gpu/src/types.cuh index bba2bb5d5..f9aef1634 100644 --- a/plugin/updater_gpu/src/types.cuh +++ b/plugin/updater_gpu/src/types.cuh @@ -11,85 +11,6 @@ namespace xgboost { namespace tree { -typedef int16_t NodeIdT; - -// gpair type defined with device accessible functions -struct gpu_gpair { - float _grad; - float _hess; - - __host__ __device__ __forceinline__ float grad() const { return _grad; } - - __host__ __device__ __forceinline__ float hess() const { return _hess; } - - __host__ __device__ gpu_gpair() : _grad(0), _hess(0) {} - - __host__ __device__ gpu_gpair(float g, float h) : _grad(g), _hess(h) {} - - __host__ __device__ gpu_gpair(bst_gpair gpair) - : _grad(gpair.grad), _hess(gpair.hess) {} - - __host__ __device__ bool operator==(const gpu_gpair &rhs) const { - return (_grad == rhs._grad) && (_hess == rhs._hess); - } - - __host__ __device__ bool operator!=(const gpu_gpair &rhs) const { - return !(*this == rhs); - } - - __host__ __device__ gpu_gpair &operator+=(const gpu_gpair &rhs) { - _grad += rhs._grad; - _hess += rhs._hess; - return *this; - } - - __host__ __device__ gpu_gpair operator+(const gpu_gpair &rhs) const { - gpu_gpair g; - g._grad = _grad + rhs._grad; - g._hess = _hess + rhs._hess; - return g; - } - - __host__ __device__ gpu_gpair &operator-=(const gpu_gpair &rhs) { - _grad -= rhs._grad; - _hess -= rhs._hess; - return *this; - } - - __host__ __device__ gpu_gpair operator-(const gpu_gpair &rhs) const { - gpu_gpair g; - g._grad = _grad - rhs._grad; - g._hess = _hess - rhs._hess; - return g; - } - - friend std::ostream &operator<<(std::ostream &os, const gpu_gpair &g) { - os << g.grad() << "/" << g.hess(); - return os; - } - - __host__ __device__ void print() const { - printf("%1.4f/%1.4f\n", grad(), hess()); - } - - __host__ __device__ bool approximate_compare(const gpu_gpair &b, - float g_eps = 0.1, - float h_eps = 0.1) const { - float gdiff = abs(this->grad() - b.grad()); - float hdiff = abs(this->hess() - b.hess()); - - return (gdiff <= g_eps) && (hdiff <= h_eps); - } -}; - -typedef thrust::device_vector::iterator uint_iter; -typedef thrust::device_vector::iterator gpair_iter; -typedef thrust::device_vector::iterator float_iter; -typedef thrust::device_vector::iterator node_id_iter; -typedef thrust::permutation_iterator gpair_perm_iter; -typedef thrust::tuple ItemTuple; -typedef thrust::zip_iterator ItemIter; - struct GPUTrainingParam { // minimum amount of hessian(weight) allowed in a child float min_child_weight; @@ -104,6 +25,12 @@ struct GPUTrainingParam { __host__ __device__ GPUTrainingParam() {} + __host__ __device__ GPUTrainingParam(const TrainParam ¶m) + : min_child_weight(param.min_child_weight), + reg_lambda(param.reg_lambda), + reg_alpha(param.reg_alpha), + max_delta_step(param.max_delta_step) {} + __host__ __device__ GPUTrainingParam(float min_child_weight_in, float reg_lambda_in, float reg_alpha_in, float max_delta_step_in) @@ -118,19 +45,19 @@ struct Split { bool missing_left; float fvalue; int findex; - gpu_gpair left_sum; - gpu_gpair right_sum; + bst_gpair left_sum; + bst_gpair right_sum; __host__ __device__ Split() : loss_chg(-FLT_MAX), missing_left(true), fvalue(0), findex(-1) {} __device__ void Update(float loss_chg_in, bool missing_left_in, - float fvalue_in, int findex_in, gpu_gpair left_sum_in, - gpu_gpair right_sum_in, + float fvalue_in, int findex_in, bst_gpair left_sum_in, + bst_gpair right_sum_in, const GPUTrainingParam ¶m) { if (loss_chg_in > loss_chg && - left_sum_in.hess() >= param.min_child_weight && - right_sum_in.hess() >= param.min_child_weight) { + left_sum_in.hess>= param.min_child_weight && + right_sum_in.hess>= param.min_child_weight) { loss_chg = loss_chg_in; missing_left = missing_left_in; fvalue = fvalue_in; @@ -152,16 +79,16 @@ struct Split { } } - __host__ __device__ void Print() { - printf("Loss: %1.4f\n", loss_chg); - printf("Missing left: %d\n", missing_left); - printf("fvalue: %1.4f\n", fvalue); - printf("Left sum: "); - left_sum.print(); + //__host__ __device__ void Print() { + // printf("Loss: %1.4f\n", loss_chg); + // printf("Missing left: %d\n", missing_left); + // printf("fvalue: %1.4f\n", fvalue); + // printf("Left sum: "); + // left_sum.print(); - printf("Right sum: "); - right_sum.print(); - } + // printf("Right sum: "); + // right_sum.print(); + //} }; struct split_reduce_op { @@ -173,7 +100,7 @@ struct split_reduce_op { }; struct Node { - gpu_gpair sum_gradients; + bst_gpair sum_gradients; float root_gain; float weight; @@ -181,7 +108,7 @@ struct Node { __host__ __device__ Node() : weight(0), root_gain(0) {} - __host__ __device__ Node(gpu_gpair sum_gradients_in, float root_gain_in, + __host__ __device__ Node(bst_gpair sum_gradients_in, float root_gain_in, float weight_in) { sum_gradients = sum_gradients_in; root_gain = root_gain_in; diff --git a/src/tree/param.h b/src/tree/param.h index 283440307..12baa1c1b 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -14,11 +14,6 @@ #include #include -#ifdef __NVCC__ -#define XGB_DEVICE __host__ __device__ -#else -#define XGB_DEVICE -#endif namespace xgboost { namespace tree { @@ -234,7 +229,7 @@ struct TrainParam : public dmlc::Parameter { // functions for L1 cost template -XGB_DEVICE inline static T1 ThresholdL1(T1 w, T2 lambda) { +XGBOOST_DEVICE inline static T1 ThresholdL1(T1 w, T2 lambda) { if (w > +lambda) return w - lambda; if (w < -lambda) @@ -243,18 +238,18 @@ XGB_DEVICE inline static T1 ThresholdL1(T1 w, T2 lambda) { } template -XGB_DEVICE inline static T Sqr(T a) { return a * a; } +XGBOOST_DEVICE inline static T Sqr(T a) { return a * a; } // calculate the cost of loss function template -XGB_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad, +XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad, T sum_hess, T w) { return -(2.0 * sum_grad * w + (sum_hess + p.reg_lambda) * Sqr(w)); } // calculate the cost of loss function template -XGB_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) { +XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) { if (sum_hess < p.min_child_weight) return 0.0; if (p.max_delta_step == 0.0f) { @@ -276,7 +271,7 @@ XGB_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) { } // calculate cost of loss function with four statistics template -XGB_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess, +XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess, T test_grad, T test_hess) { T w = CalcWeight(sum_grad, sum_hess); T ret = test_grad * w + 0.5 * (test_hess + p.reg_lambda) * Sqr(w); @@ -288,7 +283,7 @@ XGB_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess, } // calculate weight given the statistics template -XGB_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad, +XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad, T sum_hess) { if (sum_hess < p.min_child_weight) return 0.0;